Diffstat (limited to 'hypervideo_dl/extractor/common.py')
-rw-r--r--  hypervideo_dl/extractor/common.py  392
1 file changed, 264 insertions(+), 128 deletions(-)
diff --git a/hypervideo_dl/extractor/common.py b/hypervideo_dl/extractor/common.py
index 4b56307..5a561a2 100644
--- a/hypervideo_dl/extractor/common.py
+++ b/hypervideo_dl/extractor/common.py
@@ -13,6 +13,7 @@ import netrc
import os
import random
import re
+import subprocess
import sys
import time
import types
@@ -21,9 +22,21 @@ import urllib.request
import xml.etree.ElementTree
from ..compat import functools # isort: split
-from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name
+from ..compat import (
+ compat_etree_fromstring,
+ compat_expanduser,
+ compat_os_name,
+ urllib_req_to_req,
+)
from ..cookies import LenientSimpleCookie
from ..downloader.f4m import get_base_url, remove_encrypted_media
+from ..downloader.hls import HlsFD
+from ..networking import HEADRequest, Request
+from ..networking.exceptions import (
+ HTTPError,
+ IncompleteRead,
+ network_exceptions,
+)
from ..utils import (
IDENTITY,
JSON_LD_RE,
@@ -33,6 +46,7 @@ from ..utils import (
GeoRestrictedError,
GeoUtils,
LenientJSONDecoder,
+ Popen,
RegexNotFoundError,
RetryManager,
UnsupportedError,
@@ -55,7 +69,7 @@ from ..utils import (
join_nonempty,
js_to_json,
mimetype2ext,
- network_exceptions,
+ netrc_from_content,
orderedSet,
parse_bitrate,
parse_codecs,
@@ -65,21 +79,20 @@ from ..utils import (
parse_resolution,
sanitize_filename,
sanitize_url,
- sanitized_Request,
smuggle_url,
str_or_none,
str_to_int,
strip_or_none,
traverse_obj,
+ truncate_string,
try_call,
try_get,
unescapeHTML,
unified_strdate,
unified_timestamp,
- update_Request,
- update_url_query,
url_basename,
url_or_none,
+ urlhandle_detect_ext,
urljoin,
variadic,
xpath_element,
@@ -129,6 +142,7 @@ class InfoExtractor:
is parsed from a string (in case of
fragmented media)
for MSS - URL of the ISM manifest.
+ * request_data Data to send in POST request to the URL
* manifest_url
The URL of the manifest file in case of
fragmented media:
@@ -216,7 +230,19 @@ class InfoExtractor:
width : height ratio as float.
* no_resume The server does not support resuming the
(HTTP or RTMP) download. Boolean.
- * has_drm The format has DRM and cannot be downloaded. Boolean
+ * has_drm True if the format has DRM and cannot be downloaded.
+ 'maybe' if the format may have DRM and has to be tested before download.
+ * extra_param_to_segment_url A query string to append to each
+ fragment's URL, or to update each existing query string
+ with. Only applied by the native HLS/DASH downloaders.
+ * hls_aes A dictionary of HLS AES-128 decryption information
+ used by the native HLS downloader to override the
+ values in the media playlist when an '#EXT-X-KEY' tag
+ is present in the playlist:
+ * uri The URI from which the key will be downloaded
+ * key The key (as hex) used to decrypt fragments.
+ If `key` is given, any key URI will be ignored
+ * iv The IV (as hex) used to decrypt fragments
* downloader_options A dictionary of downloader options
(For internal use only)
* http_chunk_size Chunk size for HTTP downloads
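
A minimal sketch of a format dict using the new fields (all values are placeholders; only the semantics of `protocol`, `has_drm`, `extra_param_to_segment_url` and `hls_aes` come from the docstring above):

```python
fmt = {
    'format_id': 'hls-720',
    'url': 'https://example.com/media/index.m3u8',
    'protocol': 'm3u8_native',  # the fields below only apply to the native HLS/DASH downloaders
    'has_drm': 'maybe',  # format may have DRM and must be tested before download
    'extra_param_to_segment_url': 'token=abc123',  # appended to each fragment URL's query
    'hls_aes': {
        # overrides the '#EXT-X-KEY' values from the media playlist
        'key': '0123456789abcdef0123456789abcdef',  # hex; when given, any key URI is ignored
        'iv': '0123456789abcdef0123456789abcdef',   # hex
    },
}
```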
@@ -271,6 +297,7 @@ class InfoExtractor:
channel_id: Id of the channel.
channel_url: Full URL to a channel webpage.
channel_follower_count: Number of followers of the channel.
+ channel_is_verified: Whether the channel is verified on the platform.
location: Physical location where the video was filmed.
subtitles: The available subtitles as a dictionary in the format
{tag: subformats}. "tag" is usually a language code, and
@@ -299,6 +326,11 @@ class InfoExtractor:
* "author" - human-readable name of the comment author
* "author_id" - user ID of the comment author
* "author_thumbnail" - The thumbnail of the comment author
+ * "author_url" - The url to the comment author's page
+ * "author_is_verified" - Whether the author is verified
+ on the platform
+ * "author_is_uploader" - Whether the comment is made by
+ the video uploader
* "id" - Comment ID
* "html" - Comment as HTML
* "text" - Plain text of the comment
@@ -310,8 +342,8 @@ class InfoExtractor:
* "dislike_count" - Number of negative ratings of the comment
* "is_favorited" - Whether the comment is marked as
favorite by the video uploader
- * "author_is_uploader" - Whether the comment is made by
- the video uploader
+ * "is_pinned" - Whether the comment is pinned to
+ the top of the comments
age_limit: Age restriction for the video, as an integer (years)
webpage_url: The URL to the video webpage, if given to hypervideo it
should allow to get the same result again. (It will be set
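
Putting the comment fields together, an entry now looks roughly like this (illustrative values):

```python
comment = {
    'id': 'c42',
    'text': 'First!',
    'author': 'Jane Doe',
    'author_id': 'user123',
    'author_url': 'https://example.com/user/janedoe',  # new
    'author_is_verified': True,                        # new
    'author_is_uploader': False,                       # moved next to the other author_* keys
    'is_pinned': True,                                 # new
    'like_count': 7,
}
```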
@@ -335,6 +367,10 @@ class InfoExtractor:
* "start_time" - The start time of the chapter in seconds
* "end_time" - The end time of the chapter in seconds
* "title" (optional, string)
+ heatmap: A list of dictionaries, with the following entries:
+ * "start_time" - The start time of the data point in seconds
+ * "end_time" - The end time of the data point in seconds
+ * "value" - The normalized value of the data point (float between 0 and 1)
playable_in_embed: Whether this video is allowed to play in embedded
players on other sites. Can be True (=always allowed),
False (=never allowed), None (=unknown), or a string
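
An illustrative heatmap value ("value" is normalized to the 0..1 range):

```python
heatmap = [
    {'start_time': 0.0, 'end_time': 5.0, 'value': 0.23},
    {'start_time': 5.0, 'end_time': 10.0, 'value': 0.91},  # most-replayed span
]
```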
@@ -446,8 +482,8 @@ class InfoExtractor:
Subclasses of this should also be added to the list of extractors and
- should define a _VALID_URL regexp and, re-define the _real_extract() and
- (optionally) _real_initialize() methods.
+ should define _VALID_URL as a regexp or a Sequence of regexps, and
+ re-define the _real_extract() and (optionally) _real_initialize() methods.
Subclasses may also override suitable() if necessary, but ensure the function
signature is preserved and that this function imports everything it needs
@@ -510,7 +546,7 @@ class InfoExtractor:
_EMBED_REGEX = []
def _login_hint(self, method=NO_DEFAULT, netrc=None):
- password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
+ password_hint = f'--username and --password, --netrc-cmd, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials'
return {
None: '',
'any': f'Use --cookies, --cookies-from-browser, {password_hint}',
@@ -537,8 +573,8 @@ class InfoExtractor:
# we have cached the regexp for *this* class, whereas getattr would also
# match the superclass
if '_VALID_URL_RE' not in cls.__dict__:
- cls._VALID_URL_RE = re.compile(cls._VALID_URL)
- return cls._VALID_URL_RE.match(url)
+ cls._VALID_URL_RE = tuple(map(re.compile, variadic(cls._VALID_URL)))
+ return next(filter(None, (regex.match(url) for regex in cls._VALID_URL_RE)), None)
@classmethod
def suitable(cls, url):
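
With this change, _VALID_URL may be a single pattern or a sequence of patterns; a sketch with a hypothetical extractor:

```python
class ExampleIE(InfoExtractor):  # hypothetical
    _VALID_URL = (
        r'https?://(?:www\.)?example\.com/watch/(?P<id>\d+)',
        r'https?://embed\.example\.com/v/(?P<id>\d+)',
    )

# _match_valid_url() compiles every pattern once per class and returns the
# first successful match, or None:
#   ExampleIE._match_valid_url('https://embed.example.com/v/123')  # second pattern wins
```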
@@ -674,7 +710,8 @@ class InfoExtractor:
for _ in range(2):
try:
self.initialize()
- self.write_debug('Extracting URL: %s' % url)
+ self.to_screen('Extracting URL: %s' % (
+ url if self.get_param('verbose') else truncate_string(url, 100, 20)))
ie_result = self._real_extract(url)
if ie_result is None:
return None
@@ -692,11 +729,11 @@ class InfoExtractor:
except UnsupportedError:
raise
except ExtractorError as e:
- e.video_id = e.video_id or self.get_temp_id(url),
+ e.video_id = e.video_id or self.get_temp_id(url)
e.ie = e.ie or self.IE_NAME,
e.traceback = e.traceback or sys.exc_info()[2]
raise
- except http.client.IncompleteRead as e:
+ except IncompleteRead as e:
raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
except (KeyError, StopIteration) as e:
raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url))
@@ -755,20 +792,25 @@ class InfoExtractor:
@staticmethod
def __can_accept_status_code(err, expected_status):
- assert isinstance(err, urllib.error.HTTPError)
+ assert isinstance(err, HTTPError)
if expected_status is None:
return False
elif callable(expected_status):
- return expected_status(err.code) is True
+ return expected_status(err.status) is True
else:
- return err.code in variadic(expected_status)
+ return err.status in variadic(expected_status)
def _create_request(self, url_or_request, data=None, headers=None, query=None):
if isinstance(url_or_request, urllib.request.Request):
- return update_Request(url_or_request, data=data, headers=headers, query=query)
- if query:
- url_or_request = update_url_query(url_or_request, query)
- return sanitized_Request(url_or_request, data, headers or {})
+ self._downloader.deprecation_warning(
+ 'Passing a urllib.request.Request to _create_request() is deprecated. '
+ 'Use hypervideo_dl.networking.common.Request instead.')
+ url_or_request = urllib_req_to_req(url_or_request)
+ elif not isinstance(url_or_request, Request):
+ url_or_request = Request(url_or_request)
+
+ url_or_request.update(data=data, headers=headers, query=query)
+ return url_or_request
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None):
"""
@@ -804,14 +846,9 @@ class InfoExtractor:
try:
return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query))
except network_exceptions as err:
- if isinstance(err, urllib.error.HTTPError):
+ if isinstance(err, HTTPError):
if self.__can_accept_status_code(err, expected_status):
- # Retain reference to error to prevent file object from
- # being closed before it can be read. Works around the
- # effects of <https://bugs.python.org/issue15002>
- # introduced in Python 3.4.1.
- err.fp._error = err
- return err.fp
+ return err.response
if errnote is False:
return False
@@ -943,11 +980,11 @@ class InfoExtractor:
if prefix is not None:
webpage_bytes = prefix + webpage_bytes
if self.get_param('dump_intermediate_pages', False):
- self.to_screen('Dumping request to ' + urlh.geturl())
+ self.to_screen('Dumping request to ' + urlh.url)
dump = base64.b64encode(webpage_bytes).decode('ascii')
self._downloader.to_screen(dump)
if self.get_param('write_pages'):
- filename = self._request_dump_filename(urlh.geturl(), video_id)
+ filename = self._request_dump_filename(urlh.url, video_id)
self.to_screen(f'Saving request to {filename}')
with open(filename, 'wb') as outf:
outf.write(webpage_bytes)
@@ -1005,7 +1042,7 @@ class InfoExtractor:
fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None):
if self.get_param('load_pages'):
url_or_request = self._create_request(url_or_request, data, headers, query)
- filename = self._request_dump_filename(url_or_request.full_url, video_id)
+ filename = self._request_dump_filename(url_or_request.url, video_id)
self.to_screen(f'Loading request from {filename}')
try:
with open(filename, 'rb') as dumpf:
@@ -1079,7 +1116,7 @@ class InfoExtractor:
while True:
try:
return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)
- except http.client.IncompleteRead as e:
+ except IncompleteRead as e:
try_count += 1
if try_count >= tries:
raise e
@@ -1260,51 +1297,53 @@ class InfoExtractor:
Like _search_regex, but strips HTML tags and unescapes entities.
"""
res = self._search_regex(pattern, string, name, default, fatal, flags, group)
- if res:
- return clean_html(res).strip()
- else:
- return res
+ if isinstance(res, tuple):
+ return tuple(map(clean_html, res))
+ return clean_html(res)
def _get_netrc_login_info(self, netrc_machine=None):
- username = None
- password = None
netrc_machine = netrc_machine or self._NETRC_MACHINE
- if self.get_param('usenetrc', False):
- try:
- netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
- if os.path.isdir(netrc_file):
- netrc_file = os.path.join(netrc_file, '.netrc')
- info = netrc.netrc(file=netrc_file).authenticators(netrc_machine)
- if info is not None:
- username = info[0]
- password = info[2]
- else:
- raise netrc.NetrcParseError(
- 'No authenticators for %s' % netrc_machine)
- except (OSError, netrc.NetrcParseError) as err:
- self.report_warning(
- 'parsing .netrc: %s' % error_to_compat_str(err))
+ cmd = self.get_param('netrc_cmd')
+ if cmd:
+ cmd = cmd.replace('{}', netrc_machine)
+ self.to_screen(f'Executing command: {cmd}')
+ stdout, _, ret = Popen.run(cmd, text=True, shell=True, stdout=subprocess.PIPE)
+ if ret != 0:
+ raise OSError(f'Command returned error code {ret}')
+ info = netrc_from_content(stdout).authenticators(netrc_machine)
+
+ elif self.get_param('usenetrc', False):
+ netrc_file = compat_expanduser(self.get_param('netrc_location') or '~')
+ if os.path.isdir(netrc_file):
+ netrc_file = os.path.join(netrc_file, '.netrc')
+ info = netrc.netrc(netrc_file).authenticators(netrc_machine)
- return username, password
+ else:
+ return None, None
+ if not info:
+ raise netrc.NetrcParseError(f'No authenticators for {netrc_machine}')
+ return info[0], info[2]
def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
"""
Get the login info as (username, password)
First look for the manually specified credentials using username_option
and password_option as keys in params dictionary. If no such credentials
- available look in the netrc file using the netrc_machine or _NETRC_MACHINE
- value.
+ are available try the netrc_cmd if it is defined or look in the
+ netrc file using the netrc_machine or _NETRC_MACHINE value.
If there's no info available, return (None, None)
"""
- # Attempt to use provided username and password or .netrc data
username = self.get_param(username_option)
if username is not None:
password = self.get_param(password_option)
else:
- username, password = self._get_netrc_login_info(netrc_machine)
-
+ try:
+ username, password = self._get_netrc_login_info(netrc_machine)
+ except (OSError, netrc.NetrcParseError) as err:
+ self.report_warning(f'Failed to parse .netrc: {err}')
+ return None, None
return username, password
def _get_tfa_info(self, note='two-factor verification code'):
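
The new netrc_cmd branch substitutes the machine name for '{}' and expects the command's stdout in .netrc syntax; a sketch of the parsing step (machine name and credentials are made up):

```python
content = 'machine example.com login alice password s3cret\n'
info = netrc_from_content(content).authenticators('example.com')
username, password = (info[0], info[2]) if info else (None, None)
# -> ('alice', 's3cret')
```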
@@ -1324,7 +1363,7 @@ class InfoExtractor:
# Helper functions for extracting OpenGraph info
@staticmethod
def _og_regexes(prop):
- content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
+ content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?)(?=\s|/?>))'
property_re = (r'(?:name|property)=(?:\'og%(sep)s%(prop)s\'|"og%(sep)s%(prop)s"|\s*og%(sep)s%(prop)s\b)'
% {'prop': re.escape(prop), 'sep': '(?:&#x3A;|[:-])'})
template = r'<meta[^>]+?%s[^>]+?%s'
@@ -1394,10 +1433,16 @@ class InfoExtractor:
# And then there are the jokers who advertise that they use RTA, but actually don't.
AGE_LIMIT_MARKERS = [
r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
+ r'>[^<]*you acknowledge you are at least (\d+) years old',
+ r'>\s*(?:18\s+U(?:\.S\.C\.|SC)\s+)?(?:§+\s*)?2257\b',
]
- if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS):
- return 18
- return 0
+
+ age_limit = 0
+ for marker in AGE_LIMIT_MARKERS:
+ mobj = re.search(marker, html)
+ if mobj:
+ age_limit = max(age_limit, int(traverse_obj(mobj, 1, default=18)))
+ return age_limit
def _media_rating_search(self, html):
# See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
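
The first new marker carries a capture group, so traverse_obj(mobj, 1, default=18) reads an explicit age where the page states one and falls back to 18 otherwise; a sketch with made-up page text:

```python
html = '<p>By entering, you acknowledge you are at least 21 years old.</p>'
mobj = re.search(r'>[^<]*you acknowledge you are at least (\d+) years old', html)
age_limit = int(traverse_obj(mobj, 1, default=18))  # -> 21
```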
@@ -1650,11 +1695,8 @@ class InfoExtractor:
if js is None:
return {}
- args = dict(zip(arg_keys.split(','), arg_vals.split(',')))
-
- for key, val in args.items():
- if val in ('undefined', 'void 0'):
- args[key] = 'null'
+ args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
+ f'[{arg_vals}]', video_id, transform_source=js_to_json, fatal=fatal) or ())))
ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal)
return traverse_obj(ret, traverse) or {}
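
The dropped 'undefined'/'void 0' special-casing is now handled by round-tripping the raw argument list through JSON, since js_to_json already maps those tokens to null; a sketch with illustrative values:

```python
arg_keys, arg_vals = 'id,title,flag', '123,"hello",void 0'
args = dict(zip(arg_keys.split(','), map(json.dumps, self._parse_json(
    f'[{arg_vals}]', video_id, transform_source=js_to_json))))
# args == {'id': '123', 'title': '"hello"', 'flag': 'null'}
```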
@@ -1757,6 +1799,9 @@ class InfoExtractor:
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
fatal=True, m3u8_id=None, data=None, headers={}, query={}):
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
+
res = self._download_xml_handle(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
@@ -1768,7 +1813,7 @@ class InfoExtractor:
return []
manifest, urlh = res
- manifest_url = urlh.geturl()
+ manifest_url = urlh.url
return self._parse_f4m_formats(
manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id,
@@ -1906,6 +1951,17 @@ class InfoExtractor:
errnote=None, fatal=True, live=False, data=None, headers={},
query={}):
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
+
+ if not m3u8_url:
+ if errnote is not False:
+ errnote = errnote or 'Failed to obtain m3u8 URL'
+ if fatal:
+ raise ExtractorError(errnote, video_id=video_id)
+ self.report_warning(f'{errnote}{bug_reports_message()}')
+ return [], {}
+
res = self._download_webpage_handle(
m3u8_url, video_id,
note='Downloading m3u8 information' if note is None else note,
@@ -1916,7 +1972,7 @@ class InfoExtractor:
return [], {}
m3u8_doc, urlh = res
- m3u8_url = urlh.geturl()
+ m3u8_url = urlh.url
return self._parse_m3u8_formats_and_subtitles(
m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
@@ -1930,11 +1986,7 @@ class InfoExtractor:
errnote=None, fatal=True, data=None, headers={}, query={},
video_id=None):
formats, subtitles = [], {}
-
- has_drm = re.search('|'.join([
- r'#EXT-X-FAXS-CM:', # Adobe Flash Access
- r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
- ]), m3u8_doc)
+ has_drm = HlsFD._has_drm(m3u8_doc)
def format_url(url):
return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url)
@@ -2032,6 +2084,7 @@ class InfoExtractor:
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
+ 'has_drm': has_drm,
'vcodec': 'none' if media_type == 'AUDIO' else None,
} for idx in _extract_m3u8_playlist_indices(manifest_url))
@@ -2091,6 +2144,7 @@ class InfoExtractor:
'protocol': entry_protocol,
'preference': preference,
'quality': quality,
+ 'has_drm': has_drm,
}
resolution = last_stream_inf.get('RESOLUTION')
if resolution:
@@ -2157,13 +2211,23 @@ class InfoExtractor:
return self._parse_m3u8_vod_duration(m3u8_vod or '', video_id)
def _parse_m3u8_vod_duration(self, m3u8_vod, video_id):
- if '#EXT-X-PLAYLIST-TYPE:VOD' not in m3u8_vod:
+ if '#EXT-X-ENDLIST' not in m3u8_vod:
return None
return int(sum(
float(line[len('#EXTINF:'):].split(',')[0])
for line in m3u8_vod.splitlines() if line.startswith('#EXTINF:'))) or None
+ def _extract_mpd_vod_duration(
+ self, mpd_url, video_id, note=None, errnote=None, data=None, headers={}, query={}):
+
+ mpd_doc = self._download_xml(
+ mpd_url, video_id,
+ note='Downloading MPD VOD manifest' if note is None else note,
+ errnote='Failed to download VOD manifest' if errnote is None else errnote,
+ fatal=False, data=data, headers=headers, query=query) or {}
+ return int_or_none(parse_duration(mpd_doc.get('mediaPresentationDuration')))
+
@staticmethod
def _xpath_ns(path, namespace=None):
if not namespace:
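
With this change, a VOD duration is computed for any finished playlist (one carrying #EXT-X-ENDLIST) instead of only playlists tagged #EXT-X-PLAYLIST-TYPE:VOD; the new _extract_mpd_vod_duration() likewise reads the MPD's mediaPresentationDuration attribute through parse_duration(). A sketch:

```python
m3u8_vod = '\n'.join((
    '#EXTM3U',
    '#EXTINF:9.6,', 'seg0.ts',
    '#EXTINF:4.0,', 'seg1.ts',
    '#EXT-X-ENDLIST',
))
self._parse_m3u8_vod_duration(m3u8_vod, video_id)  # -> int(9.6 + 4.0) == 13
```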
@@ -2177,22 +2241,17 @@ class InfoExtractor:
return '/'.join(out)
def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
+
res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
if res is False:
assert not fatal
return [], {}
-
smil, urlh = res
- smil_url = urlh.geturl()
- namespace = self._parse_smil_namespace(smil)
-
- fmts = self._parse_smil_formats(
- smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
- subs = self._parse_smil_subtitles(
- smil, namespace=namespace)
-
- return fmts, subs
+ return self._parse_smil_formats_and_subtitles(smil, urlh.url, video_id, f4m_params=f4m_params,
+ namespace=self._parse_smil_namespace(smil))
def _extract_smil_formats(self, *args, **kwargs):
fmts, subs = self._extract_smil_formats_and_subtitles(*args, **kwargs)
@@ -2206,7 +2265,7 @@ class InfoExtractor:
return {}
smil, urlh = res
- smil_url = urlh.geturl()
+ smil_url = urlh.url
return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
@@ -2218,9 +2277,8 @@ class InfoExtractor:
def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
namespace = self._parse_smil_namespace(smil)
- formats = self._parse_smil_formats(
+ formats, subtitles = self._parse_smil_formats_and_subtitles(
smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
- subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
video_id = os.path.splitext(url_basename(smil_url))[0]
title = None
@@ -2259,7 +2317,14 @@ class InfoExtractor:
return self._search_regex(
r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
- def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+ def _parse_smil_formats(self, *args, **kwargs):
+ fmts, subs = self._parse_smil_formats_and_subtitles(*args, **kwargs)
+ if subs:
+ self._report_ignoring_subs('SMIL')
+ return fmts
+
+ def _parse_smil_formats_and_subtitles(
+ self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
base = smil_url
for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
b = meta.get('base') or meta.get('httpBase')
@@ -2267,7 +2332,7 @@ class InfoExtractor:
base = b
break
- formats = []
+ formats, subtitles = [], {}
rtmp_count = 0
http_count = 0
m3u8_count = 0
@@ -2287,7 +2352,8 @@ class InfoExtractor:
height = int_or_none(medium.get('height'))
proto = medium.get('proto')
ext = medium.get('ext')
- src_ext = determine_ext(src)
+ src_ext = determine_ext(src, default_ext=None) or ext or urlhandle_detect_ext(
+ self._request_webpage(HEADRequest(src), video_id, note='Requesting extension info', fatal=False))
streamer = medium.get('streamer') or base
if proto == 'rtmp' or streamer.startswith('rtmp'):
@@ -2314,8 +2380,9 @@ class InfoExtractor:
src_url = src_url.strip()
if proto == 'm3u8' or src_ext == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
+ m3u8_formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(
src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
+ self._merge_subtitles(m3u8_subs, target=subtitles)
if len(m3u8_formats) == 1:
m3u8_count += 1
m3u8_formats[0].update({
@@ -2336,11 +2403,15 @@ class InfoExtractor:
f4m_url += urllib.parse.urlencode(f4m_params)
formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
elif src_ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- src_url, video_id, mpd_id='dash', fatal=False))
+ mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(
+ src_url, video_id, mpd_id='dash', fatal=False)
+ formats.extend(mpd_formats)
+ self._merge_subtitles(mpd_subs, target=subtitles)
elif re.search(r'\.ism/[Mm]anifest', src_url):
- formats.extend(self._extract_ism_formats(
- src_url, video_id, ism_id='mss', fatal=False))
+ ism_formats, ism_subs = self._extract_ism_formats_and_subtitles(
+ src_url, video_id, ism_id='mss', fatal=False)
+ formats.extend(ism_formats)
+ self._merge_subtitles(ism_subs, target=subtitles)
elif src_url.startswith('http') and self._is_valid_url(src, video_id):
http_count += 1
formats.append({
@@ -2371,7 +2442,10 @@ class InfoExtractor:
'format_note': 'SMIL storyboards',
})
- return formats
+ smil_subs = self._parse_smil_subtitles(smil, namespace=namespace)
+ self._merge_subtitles(smil_subs, target=subtitles)
+
+ return formats, subtitles
def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
urls = []
@@ -2397,7 +2471,7 @@ class InfoExtractor:
return []
xspf, urlh = res
- xspf_url = urlh.geturl()
+ xspf_url = urlh.url
return self._parse_xspf(
xspf, playlist_id, xspf_url=xspf_url,
@@ -2452,6 +2526,10 @@ class InfoExtractor:
def _extract_mpd_formats_and_subtitles(
self, mpd_url, video_id, mpd_id=None, note=None, errnote=None,
fatal=True, data=None, headers={}, query={}):
+
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
+
res = self._download_xml_handle(
mpd_url, video_id,
note='Downloading MPD manifest' if note is None else note,
@@ -2464,7 +2542,7 @@ class InfoExtractor:
return [], {}
# We could have been redirected to a new url when we retrieved our mpd file.
- mpd_url = urlh.geturl()
+ mpd_url = urlh.url
mpd_base_url = base_url(mpd_url)
return self._parse_mpd_formats_and_subtitles(
@@ -2821,6 +2899,9 @@ class InfoExtractor:
return fmts
def _extract_ism_formats_and_subtitles(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
+ if self.get_param('ignore_no_formats_error'):
+ fatal = False
+
res = self._download_xml_handle(
ism_url, video_id,
note='Downloading ISM manifest' if note is None else note,
@@ -2832,7 +2913,7 @@ class InfoExtractor:
if ism_doc is None:
return [], {}
- return self._parse_ism_formats_and_subtitles(ism_doc, urlh.geturl(), ism_id)
+ return self._parse_ism_formats_and_subtitles(ism_doc, urlh.url, ism_id)
def _parse_ism_formats_and_subtitles(self, ism_doc, ism_url, ism_id=None):
"""
@@ -2928,6 +3009,8 @@ class InfoExtractor:
'protocol': 'ism',
'fragments': fragments,
'has_drm': ism_doc.find('Protection') is not None,
+ 'language': stream_language,
+ 'audio_channels': int_or_none(track.get('Channels')),
'_download_params': {
'stream_type': stream_type,
'duration': duration,
@@ -3190,7 +3273,7 @@ class InfoExtractor:
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search(
- r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
+ r'''(?s)jwplayer\s*\(\s*(?P<q>'|")(?!(?P=q)).+(?P=q)\s*\)(?!</script>).*?\.\s*setup\s*\(\s*(?P<options>(?:\([^)]*\)|[^)])+)\s*\)''',
webpage)
if mobj:
try:
@@ -3211,19 +3294,20 @@ class InfoExtractor:
def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
- # JWPlayer backward compatibility: flattened playlists
- # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
- if 'playlist' not in jwplayer_data:
- jwplayer_data = {'playlist': [jwplayer_data]}
-
entries = []
+ if not isinstance(jwplayer_data, dict):
+ return entries
- # JWPlayer backward compatibility: single playlist item
+ playlist_items = jwplayer_data.get('playlist')
+ # JWPlayer backward compatibility: single playlist item/flattened playlists
# https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
- if not isinstance(jwplayer_data['playlist'], list):
- jwplayer_data['playlist'] = [jwplayer_data['playlist']]
+ # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
+ if not isinstance(playlist_items, list):
+ playlist_items = (playlist_items or jwplayer_data, )
- for video_data in jwplayer_data['playlist']:
+ for video_data in playlist_items:
+ if not isinstance(video_data, dict):
+ continue
# JWPlayer backward compatibility: flattened sources
# https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
if 'sources' not in video_data:
@@ -3261,6 +3345,13 @@ class InfoExtractor:
'timestamp': int_or_none(video_data.get('pubdate')),
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
'subtitles': subtitles,
+ 'alt_title': clean_html(video_data.get('subtitle')), # attributes used e.g. by Tele5 ...
+ 'genre': clean_html(video_data.get('genre')),
+ 'channel': clean_html(dict_get(video_data, ('category', 'channel'))),
+ 'season_number': int_or_none(video_data.get('season')),
+ 'episode_number': int_or_none(video_data.get('episode')),
+ 'release_year': int_or_none(video_data.get('releasedate')),
+ 'age_limit': int_or_none(video_data.get('age_restriction')),
}
# https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
@@ -3278,7 +3369,7 @@ class InfoExtractor:
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
- urls = []
+ urls = set()
formats = []
for source in jwplayer_sources_data:
if not isinstance(source, dict):
@@ -3287,14 +3378,14 @@ class InfoExtractor:
base_url, self._proto_relative_url(source.get('file')))
if not source_url or source_url in urls:
continue
- urls.append(source_url)
+ urls.add(source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
- if source_type == 'hls' or ext == 'm3u8':
+ if source_type == 'hls' or ext == 'm3u8' or 'format=m3u8-aapl' in source_url:
formats.extend(self._extract_m3u8_formats(
source_url, video_id, 'mp4', entry_protocol='m3u8_native',
m3u8_id=m3u8_id, fatal=False))
- elif source_type == 'dash' or ext == 'mpd':
+ elif source_type == 'dash' or ext == 'mpd' or 'format=mpd-time-csf' in source_url:
formats.extend(self._extract_mpd_formats(
source_url, video_id, mpd_id=mpd_id, fatal=False))
elif ext == 'smil':
@@ -3309,13 +3400,12 @@ class InfoExtractor:
'ext': ext,
})
else:
+ format_id = str_or_none(source.get('label'))
height = int_or_none(source.get('height'))
- if height is None:
+ if height is None and format_id:
# Often no height is provided but there is a label in
# format like "1080p", "720p SD", or 1080.
- height = int_or_none(self._search_regex(
- r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''),
- 'height', default=None))
+ height = parse_resolution(format_id).get('height')
a_format = {
'url': source_url,
'width': int_or_none(source.get('width')),
@@ -3323,6 +3413,7 @@ class InfoExtractor:
'tbr': int_or_none(source.get('bitrate'), scale=1000),
'filesize': int_or_none(source.get('filesize')),
'ext': ext,
+ 'format_id': format_id
}
if source_url.startswith('rtmp'):
a_format['ext'] = 'flv'
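
Height detection now defers to parse_resolution() on the label; roughly, that helper recognizes:

```python
parse_resolution('720p SD')    # -> {'height': 720}
parse_resolution('1920x1080')  # -> {'width': 1920, 'height': 1080}
parse_resolution('HD')         # -> {}  (height stays None)
```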
@@ -3375,7 +3466,7 @@ class InfoExtractor:
def _get_cookies(self, url):
""" Return a http.cookies.SimpleCookie with the cookies for the url """
- return LenientSimpleCookie(self._downloader._calc_cookies(url))
+ return LenientSimpleCookie(self._downloader.cookiejar.get_cookie_header(url))
def _apply_first_set_cookie_header(self, url_handle, cookie):
"""
@@ -3416,13 +3507,17 @@ class InfoExtractor:
continue
t['name'] = cls.ie_key()
yield t
+ if getattr(cls, '__wrapped__', None):
+ yield from cls.__wrapped__.get_testcases(include_onlymatching)
@classmethod
def get_webpage_testcases(cls):
tests = vars(cls).get('_WEBPAGE_TESTS', [])
for t in tests:
t['name'] = cls.ie_key()
- return tests
+ yield t
+ if getattr(cls, '__wrapped__', None):
+ yield from cls.__wrapped__.get_webpage_testcases()
@classproperty(cache=True)
def age_limit(cls):
@@ -3446,8 +3541,8 @@ class InfoExtractor:
@classmethod
def is_single_video(cls, url):
"""Returns whether the URL is of a single video, None if unknown"""
- assert cls.suitable(url), 'The URL must be suitable for the extractor'
- return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
+ if cls.suitable(url):
+ return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE)
@classmethod
def is_suitable(cls, age_limit):
@@ -3460,7 +3555,7 @@ class InfoExtractor:
desc = ''
if cls._NETRC_MACHINE:
if markdown:
- desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]'
+ desc += f' [*{cls._NETRC_MACHINE}*](## "netrc machine")'
else:
desc += f' [{cls._NETRC_MACHINE}]'
if cls.IE_DESC is False:
@@ -3468,7 +3563,7 @@ class InfoExtractor:
elif cls.IE_DESC:
desc += f' {cls.IE_DESC}'
if cls.SEARCH_KEY:
- desc += f'; "{cls.SEARCH_KEY}:" prefix'
+ desc += f'{";" if cls.IE_DESC else ""} "{cls.SEARCH_KEY}:" prefix'
if search_examples:
_COUNTS = ('', '5', '10', 'all')
desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")'
@@ -3582,6 +3677,42 @@ class InfoExtractor:
or urllib.parse.unquote(os.path.splitext(url_basename(url))[0])
or default)
+ def _extract_chapters_helper(self, chapter_list, start_function, title_function, duration, strict=True):
+ if not duration:
+ return
+ chapter_list = [{
+ 'start_time': start_function(chapter),
+ 'title': title_function(chapter),
+ } for chapter in chapter_list or []]
+ if strict:
+ warn = self.report_warning
+ else:
+ warn = self.write_debug
+ chapter_list.sort(key=lambda c: c['start_time'] or 0)
+
+ chapters = [{'start_time': 0}]
+ for idx, chapter in enumerate(chapter_list):
+ if chapter['start_time'] is None:
+ warn(f'Incomplete chapter {idx}')
+ elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration:
+ chapters.append(chapter)
+ elif chapter not in chapters:
+ issue = (f'{chapter["start_time"]} > {duration}' if chapter['start_time'] > duration
+ else f'{chapter["start_time"]} < {chapters[-1]["start_time"]}')
+ warn(f'Invalid start time ({issue}) for chapter "{chapter["title"]}"')
+ return chapters[1:]
+
+ def _extract_chapters_from_description(self, description, duration):
+ duration_re = r'(?:\d+:)?\d{1,2}:\d{2}'
+ sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$'
+ return self._extract_chapters_helper(
+ re.findall(sep_re % (duration_re, r'.+?'), description or ''),
+ start_function=lambda x: parse_duration(x[0]), title_function=lambda x: x[1],
+ duration=duration, strict=False) or self._extract_chapters_helper(
+ re.findall(sep_re % (r'.+?', duration_re), description or ''),
+ start_function=lambda x: parse_duration(x[1]), title_function=lambda x: x[0],
+ duration=duration, strict=False)
+
@staticmethod
def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None):
all_known = all(map(
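
A sketch of the new description-based chapter extraction: both "timestamp title" and "title timestamp" line layouts are tried, and (with strict=False) invalid start times are only logged at debug level:

```python
description = '00:00 Intro\n01:30 Main topic\n12:05 Outro'
self._extract_chapters_from_description(description, duration=900)
# -> [{'start_time': 0.0, 'title': 'Intro'},
#     {'start_time': 90.0, 'title': 'Main topic'},
#     {'start_time': 725.0, 'title': 'Outro'}]
```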
@@ -3684,10 +3815,12 @@ class InfoExtractor:
if plugin_name:
mro = inspect.getmro(cls)
super_class = cls.__wrapped__ = mro[mro.index(cls) + 1]
- cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key
+ cls.PLUGIN_NAME, cls.ie_key = plugin_name, super_class.ie_key
+ cls.IE_NAME = f'{super_class.IE_NAME}+{plugin_name}'
while getattr(super_class, '__wrapped__', None):
super_class = super_class.__wrapped__
setattr(sys.modules[super_class.__module__], super_class.__name__, cls)
+ _PLUGIN_OVERRIDES[super_class].append(cls)
return super().__init_subclass__(**kwargs)
@@ -3744,3 +3877,6 @@ class UnsupportedURLIE(InfoExtractor):
def _real_extract(self, url):
raise UnsupportedError(url)
+
+
+_PLUGIN_OVERRIDES = collections.defaultdict(list)
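
Taken together with the __init_subclass__ change above, a plugin now overrides an extractor simply by subclassing it with a plugin_name; a hedged sketch (all names hypothetical):

```python
class ExamplePluginIE(ExampleIE, plugin_name='myplugin'):
    def _real_extract(self, url):
        info = super()._real_extract(url)
        info['tags'] = [*(info.get('tags') or []), 'via-plugin']  # illustrative tweak
        return info

# After class creation:
#   ExamplePluginIE.IE_NAME     == f'{ExampleIE.IE_NAME}+myplugin'
#   ExamplePluginIE.PLUGIN_NAME == 'myplugin'
#   _PLUGIN_OVERRIDES[ExampleIE] == [ExamplePluginIE]
# and the module attribute that pointed at ExampleIE is re-bound to the plugin
# class, while get_testcases()/get_webpage_testcases() also yield the wrapped
# class's tests via __wrapped__.
```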