diff options
Diffstat (limited to 'youtube_dl/downloader/f4m.py')
-rw-r--r-- | youtube_dl/downloader/f4m.py | 438 |
1 files changed, 438 insertions, 0 deletions
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py new file mode 100644 index 0000000..15e71be --- /dev/null +++ b/youtube_dl/downloader/f4m.py @@ -0,0 +1,438 @@ +from __future__ import division, unicode_literals + +import io +import itertools +import time + +from .fragment import FragmentFD +from ..compat import ( + compat_b64decode, + compat_etree_fromstring, + compat_urlparse, + compat_urllib_error, + compat_urllib_parse_urlparse, + compat_struct_pack, + compat_struct_unpack, +) +from ..utils import ( + fix_xml_ampersands, + xpath_text, +) + + +class DataTruncatedError(Exception): + pass + + +class FlvReader(io.BytesIO): + """ + Reader for Flv files + The file format is documented in https://www.adobe.com/devnet/f4v.html + """ + + def read_bytes(self, n): + data = self.read(n) + if len(data) < n: + raise DataTruncatedError( + 'FlvReader error: need %d bytes while only %d bytes got' % ( + n, len(data))) + return data + + # Utility functions for reading numbers and strings + def read_unsigned_long_long(self): + return compat_struct_unpack('!Q', self.read_bytes(8))[0] + + def read_unsigned_int(self): + return compat_struct_unpack('!I', self.read_bytes(4))[0] + + def read_unsigned_char(self): + return compat_struct_unpack('!B', self.read_bytes(1))[0] + + def read_string(self): + res = b'' + while True: + char = self.read_bytes(1) + if char == b'\x00': + break + res += char + return res + + def read_box_info(self): + """ + Read a box and return the info as a tuple: (box_size, box_type, box_data) + """ + real_size = size = self.read_unsigned_int() + box_type = self.read_bytes(4) + header_end = 8 + if size == 1: + real_size = self.read_unsigned_long_long() + header_end = 16 + return real_size, box_type, self.read_bytes(real_size - header_end) + + def read_asrt(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + quality_entry_count = self.read_unsigned_char() + # QualityEntryCount + for i in range(quality_entry_count): + self.read_string() + + segment_run_count = self.read_unsigned_int() + segments = [] + for i in range(segment_run_count): + first_segment = self.read_unsigned_int() + fragments_per_segment = self.read_unsigned_int() + segments.append((first_segment, fragments_per_segment)) + + return { + 'segment_run': segments, + } + + def read_afrt(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + # time scale + self.read_unsigned_int() + + quality_entry_count = self.read_unsigned_char() + # QualitySegmentUrlModifiers + for i in range(quality_entry_count): + self.read_string() + + fragments_count = self.read_unsigned_int() + fragments = [] + for i in range(fragments_count): + first = self.read_unsigned_int() + first_ts = self.read_unsigned_long_long() + duration = self.read_unsigned_int() + if duration == 0: + discontinuity_indicator = self.read_unsigned_char() + else: + discontinuity_indicator = None + fragments.append({ + 'first': first, + 'ts': first_ts, + 'duration': duration, + 'discontinuity_indicator': discontinuity_indicator, + }) + + return { + 'fragments': fragments, + } + + def read_abst(self): + # version + self.read_unsigned_char() + # flags + self.read_bytes(3) + + self.read_unsigned_int() # BootstrapinfoVersion + # Profile,Live,Update,Reserved + flags = self.read_unsigned_char() + live = flags & 0x20 != 0 + # time scale + self.read_unsigned_int() + # CurrentMediaTime + self.read_unsigned_long_long() + # SmpteTimeCodeOffset + self.read_unsigned_long_long() + + self.read_string() # MovieIdentifier + server_count = self.read_unsigned_char() + # ServerEntryTable + for i in range(server_count): + self.read_string() + quality_count = self.read_unsigned_char() + # QualityEntryTable + for i in range(quality_count): + self.read_string() + # DrmData + self.read_string() + # MetaData + self.read_string() + + segments_count = self.read_unsigned_char() + segments = [] + for i in range(segments_count): + box_size, box_type, box_data = self.read_box_info() + assert box_type == b'asrt' + segment = FlvReader(box_data).read_asrt() + segments.append(segment) + fragments_run_count = self.read_unsigned_char() + fragments = [] + for i in range(fragments_run_count): + box_size, box_type, box_data = self.read_box_info() + assert box_type == b'afrt' + fragments.append(FlvReader(box_data).read_afrt()) + + return { + 'segments': segments, + 'fragments': fragments, + 'live': live, + } + + def read_bootstrap_info(self): + total_size, box_type, box_data = self.read_box_info() + assert box_type == b'abst' + return FlvReader(box_data).read_abst() + + +def read_bootstrap_info(bootstrap_bytes): + return FlvReader(bootstrap_bytes).read_bootstrap_info() + + +def build_fragments_list(boot_info): + """ Return a list of (segment, fragment) for each fragment in the video """ + res = [] + segment_run_table = boot_info['segments'][0] + fragment_run_entry_table = boot_info['fragments'][0]['fragments'] + first_frag_number = fragment_run_entry_table[0]['first'] + fragments_counter = itertools.count(first_frag_number) + for segment, fragments_count in segment_run_table['segment_run']: + # In some live HDS streams (for example Rai), `fragments_count` is + # abnormal and causing out-of-memory errors. It's OK to change the + # number of fragments for live streams as they are updated periodically + if fragments_count == 4294967295 and boot_info['live']: + fragments_count = 2 + for _ in range(fragments_count): + res.append((segment, next(fragments_counter))) + + if boot_info['live']: + res = res[-2:] + + return res + + +def write_unsigned_int(stream, val): + stream.write(compat_struct_pack('!I', val)) + + +def write_unsigned_int_24(stream, val): + stream.write(compat_struct_pack('!I', val)[1:]) + + +def write_flv_header(stream): + """Writes the FLV header to stream""" + # FLV header + stream.write(b'FLV\x01') + stream.write(b'\x05') + stream.write(b'\x00\x00\x00\x09') + stream.write(b'\x00\x00\x00\x00') + + +def write_metadata_tag(stream, metadata): + """Writes optional metadata tag to stream""" + SCRIPT_TAG = b'\x12' + FLV_TAG_HEADER_LEN = 11 + + if metadata: + stream.write(SCRIPT_TAG) + write_unsigned_int_24(stream, len(metadata)) + stream.write(b'\x00\x00\x00\x00\x00\x00\x00') + stream.write(metadata) + write_unsigned_int(stream, FLV_TAG_HEADER_LEN + len(metadata)) + + +def remove_encrypted_media(media): + return list(filter(lambda e: 'drmAdditionalHeaderId' not in e.attrib and + 'drmAdditionalHeaderSetId' not in e.attrib, + media)) + + +def _add_ns(prop, ver=1): + return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) + + +def get_base_url(manifest): + base_url = xpath_text( + manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() + return base_url + + +class F4mFD(FragmentFD): + """ + A downloader for f4m manifests or AdobeHDS. + """ + + FD_NAME = 'f4m' + + def _get_unencrypted_media(self, doc): + media = doc.findall(_add_ns('media')) + if not media: + self.report_error('No media found') + for e in (doc.findall(_add_ns('drmAdditionalHeader')) + + doc.findall(_add_ns('drmAdditionalHeaderSet'))): + # If id attribute is missing it's valid for all media nodes + # without drmAdditionalHeaderId or drmAdditionalHeaderSetId attribute + if 'id' not in e.attrib: + self.report_error('Missing ID in f4m DRM') + media = remove_encrypted_media(media) + if not media: + self.report_error('Unsupported DRM') + return media + + def _get_bootstrap_from_url(self, bootstrap_url): + bootstrap = self.ydl.urlopen(bootstrap_url).read() + return read_bootstrap_info(bootstrap) + + def _update_live_fragments(self, bootstrap_url, latest_fragment): + fragments_list = [] + retries = 30 + while (not fragments_list) and (retries > 0): + boot_info = self._get_bootstrap_from_url(bootstrap_url) + fragments_list = build_fragments_list(boot_info) + fragments_list = [f for f in fragments_list if f[1] > latest_fragment] + if not fragments_list: + # Retry after a while + time.sleep(5.0) + retries -= 1 + + if not fragments_list: + self.report_error('Failed to update fragments') + + return fragments_list + + def _parse_bootstrap_node(self, node, base_url): + # Sometimes non empty inline bootstrap info can be specified along + # with bootstrap url attribute (e.g. dummy inline bootstrap info + # contains whitespace characters in [1]). We will prefer bootstrap + # url over inline bootstrap info when present. + # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m + bootstrap_url = node.get('url') + if bootstrap_url: + bootstrap_url = compat_urlparse.urljoin( + base_url, bootstrap_url) + boot_info = self._get_bootstrap_from_url(bootstrap_url) + else: + bootstrap_url = None + bootstrap = compat_b64decode(node.text) + boot_info = read_bootstrap_info(bootstrap) + return boot_info, bootstrap_url + + def real_download(self, filename, info_dict): + man_url = info_dict['url'] + requested_bitrate = info_dict.get('tbr') + self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME) + + urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url)) + man_url = urlh.geturl() + # Some manifests may be malformed, e.g. prosiebensat1 generated manifests + # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244 + # and https://github.com/rg3/youtube-dl/issues/7823) + manifest = fix_xml_ampersands(urlh.read().decode('utf-8', 'ignore')).strip() + + doc = compat_etree_fromstring(manifest) + formats = [(int(f.attrib.get('bitrate', -1)), f) + for f in self._get_unencrypted_media(doc)] + if requested_bitrate is None or len(formats) == 1: + # get the best format + formats = sorted(formats, key=lambda f: f[0]) + rate, media = formats[-1] + else: + rate, media = list(filter( + lambda f: int(f[0]) == requested_bitrate, formats))[0] + + # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. + man_base_url = get_base_url(doc) or man_url + + base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) + bootstrap_node = doc.find(_add_ns('bootstrapInfo')) + boot_info, bootstrap_url = self._parse_bootstrap_node( + bootstrap_node, man_base_url) + live = boot_info['live'] + metadata_node = media.find(_add_ns('metadata')) + if metadata_node is not None: + metadata = compat_b64decode(metadata_node.text) + else: + metadata = None + + fragments_list = build_fragments_list(boot_info) + test = self.params.get('test', False) + if test: + # We only download the first fragment + fragments_list = fragments_list[:1] + total_frags = len(fragments_list) + # For some akamai manifests we'll need to add a query to the fragment url + akamai_pv = xpath_text(doc, _add_ns('pv-2.0')) + + ctx = { + 'filename': filename, + 'total_frags': total_frags, + 'live': live, + } + + self._prepare_frag_download(ctx) + + dest_stream = ctx['dest_stream'] + + if ctx['complete_frags_downloaded_bytes'] == 0: + write_flv_header(dest_stream) + if not live: + write_metadata_tag(dest_stream, metadata) + + base_url_parsed = compat_urllib_parse_urlparse(base_url) + + self._start_frag_download(ctx) + + frag_index = 0 + while fragments_list: + seg_i, frag_i = fragments_list.pop(0) + frag_index += 1 + if frag_index <= ctx['fragment_index']: + continue + name = 'Seg%d-Frag%d' % (seg_i, frag_i) + query = [] + if base_url_parsed.query: + query.append(base_url_parsed.query) + if akamai_pv: + query.append(akamai_pv.strip(';')) + if info_dict.get('extra_param_to_segment_url'): + query.append(info_dict['extra_param_to_segment_url']) + url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) + try: + success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict) + if not success: + return False + reader = FlvReader(down_data) + while True: + try: + _, box_type, box_data = reader.read_box_info() + except DataTruncatedError: + if test: + # In tests, segments may be truncated, and thus + # FlvReader may not be able to parse the whole + # chunk. If so, write the segment as is + # See https://github.com/rg3/youtube-dl/issues/9214 + dest_stream.write(down_data) + break + raise + if box_type == b'mdat': + self._append_fragment(ctx, box_data) + break + except (compat_urllib_error.HTTPError, ) as err: + if live and (err.code == 404 or err.code == 410): + # We didn't keep up with the live window. Continue + # with the next available fragment. + msg = 'Fragment %d unavailable' % frag_i + self.report_warning(msg) + fragments_list = [] + else: + raise + + if not fragments_list and not test and live and bootstrap_url: + fragments_list = self._update_live_fragments(bootstrap_url, frag_i) + total_frags += len(fragments_list) + if fragments_list and (fragments_list[0][1] > frag_i + 1): + msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) + self.report_warning(msg) + + self._finish_frag_download(ctx) + + return True |