about | summary | refs | log | tree | commit | diff | stats
path: root/yt_dlp/extractor/common.py
diff options
context:
space:
mode:
Diffstat (limited to 'yt_dlp/extractor/common.py')
-rw-r--r-- yt_dlp/extractor/common.py 56
1 files changed, 31 insertions, 25 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index aa98c0cc9..fc28bca2e 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import base64
+import collections
import datetime
import hashlib
import itertools
@@ -54,6 +55,7 @@ from ..utils import (
GeoRestrictedError,
GeoUtils,
int_or_none,
+ join_nonempty,
js_to_json,
JSON_LD_RE,
mimetype2ext,
@@ -341,6 +343,7 @@ class InfoExtractor(object):
series, programme or podcast:
series: Title of the series or programme the video episode belongs to.
+ series_id: Id of the series or programme the video episode belongs to, as a unicode string.
season: Title of the season the video episode belongs to.
season_number: Number of the season the video episode belongs to, as an integer.
season_id: Id of the season the video episode belongs to, as a unicode string.
@@ -441,11 +444,11 @@ class InfoExtractor(object):
_WORKING = True
_LOGIN_HINTS = {
- 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials',
+ 'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials',
'cookies': (
'Use --cookies-from-browser or --cookies for the authentication. '
'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'),
- 'password': 'Use --username and --password or --netrc to provide account credentials',
+ 'password': 'Use --username and --password, or --netrc to provide account credentials',
}
def __init__(self, downloader=None):
@@ -1449,6 +1452,9 @@ class InfoExtractor(object):
item_type = e.get('@type')
if expected_type is not None and expected_type != item_type:
continue
+ rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none)
+ if rating is not None:
+ info['average_rating'] = rating
if item_type in ('TVEpisode', 'Episode'):
episode_name = unescapeHTML(e.get('name'))
info.update({
@@ -1495,6 +1501,13 @@ class InfoExtractor(object):
break
return dict((k, v) for k, v in info.items() if v is not None)
+    def _search_nextjs_data(self, webpage, video_id, transform_source=None, fatal=True, **kw):
+        """Extract and parse the JSON payload of the page's __NEXT_DATA__ <script> tag.
+
+        webpage          -- HTML text to search
+        video_id         -- forwarded to _parse_json
+        transform_source -- forwarded to _parse_json only
+        fatal            -- forwarded to both _search_regex and _parse_json
+        **kw             -- remaining keyword arguments, forwarded to
+                            _search_regex only
+
+        Returns whatever _parse_json returns, or None when the regex search
+        produced no value.
+        """
+        # The same **kw used to be handed to both helpers, so a keyword that
+        # only one of them accepts would be rejected by the other.
+        # NOTE(review): assumes _search_regex and _parse_json have different
+        # keyword sets (e.g. default= vs. transform_source=) -- confirm
+        # against their definitions.
+        next_data = self._search_regex(
+            r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>',
+            webpage, 'next.js data', fatal=fatal, **kw)
+        # A non-fatal search that finds nothing is presumed to yield None;
+        # do not pass that on to the JSON parser.
+        if next_data is None:
+            return None
+        return self._parse_json(
+            next_data, video_id, transform_source=transform_source, fatal=fatal)
+
@staticmethod
def _hidden_inputs(html):
html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
@@ -1531,7 +1544,7 @@ class InfoExtractor(object):
'vcodec': {'type': 'ordered', 'regex': True,
'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
'acodec': {'type': 'ordered', 'regex': True,
- 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']},
+ 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
@@ -1911,7 +1924,7 @@ class InfoExtractor(object):
tbr = int_or_none(media_el.attrib.get('bitrate'))
width = int_or_none(media_el.attrib.get('width'))
height = int_or_none(media_el.attrib.get('height'))
- format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
+ format_id = join_nonempty(f4m_id, tbr or i)
# If <bootstrapInfo> is present, the specified f4m is a
# stream-level manifest, and only set-level manifests may refer to
# external resources. See section 11.4 and section 4 of F4M spec
@@ -1973,7 +1986,7 @@ class InfoExtractor(object):
def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None):
return {
- 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
+ 'format_id': join_nonempty(m3u8_id, 'meta'),
'url': m3u8_url,
'ext': ext,
'protocol': 'm3u8',
@@ -2026,10 +2039,10 @@ class InfoExtractor(object):
video_id=None):
formats, subtitles = [], {}
- if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
- return formats, subtitles
-
- has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc)
+ has_drm = re.search('|'.join([
+ r'#EXT-X-FAXS-CM:', # Adobe Flash Access
+ r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay
+ ]), m3u8_doc)
def format_url(url):
return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url)
@@ -2068,7 +2081,7 @@ class InfoExtractor(object):
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
formats = [{
- 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))),
+ 'format_id': join_nonempty(m3u8_id, idx),
'format_index': idx,
'url': m3u8_url,
'ext': ext,
@@ -2117,7 +2130,7 @@ class InfoExtractor(object):
if media_url:
manifest_url = format_url(media_url)
formats.extend({
- 'format_id': '-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))),
+ 'format_id': join_nonempty(m3u8_id, group_id, name, idx),
'format_note': name,
'format_index': idx,
'url': manifest_url,
@@ -2174,9 +2187,9 @@ class InfoExtractor(object):
# format_id intact.
if not live:
stream_name = build_stream_name()
- format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats))
+ format_id[1] = stream_name or '%d' % (tbr or len(formats))
f = {
- 'format_id': '-'.join(map(str, filter(None, format_id))),
+ 'format_id': join_nonempty(*format_id),
'format_index': idx,
'url': manifest_url,
'manifest_url': m3u8_url,
@@ -2640,7 +2653,7 @@ class InfoExtractor(object):
mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
formats, subtitles = [], {}
- stream_numbers = {'audio': 0, 'video': 0}
+ stream_numbers = collections.defaultdict(int)
for period in mpd_doc.findall(_add_ns('Period')):
period_duration = parse_duration(period.get('duration')) or mpd_duration
period_ms_info = extract_multisegment_info(period, {
@@ -2706,10 +2719,8 @@ class InfoExtractor(object):
'format_note': 'DASH %s' % content_type,
'filesize': filesize,
'container': mimetype2ext(mime_type) + '_dash',
- 'manifest_stream_number': stream_numbers[content_type]
}
f.update(parse_codecs(codecs))
- stream_numbers[content_type] += 1
elif content_type == 'text':
f = {
'ext': mimetype2ext(mime_type),
@@ -2876,7 +2887,9 @@ class InfoExtractor(object):
else:
# Assuming direct URL to unfragmented media.
f['url'] = base_url
- if content_type in ('video', 'audio') or mime_type == 'image/jpeg':
+ if content_type in ('video', 'audio', 'image/jpeg'):
+ f['manifest_stream_number'] = stream_numbers[f['url']]
+ stream_numbers[f['url']] += 1
formats.append(f)
elif content_type == 'text':
subtitles.setdefault(lang or 'und', []).append(f)
@@ -2965,13 +2978,6 @@ class InfoExtractor(object):
})
fragment_ctx['time'] += fragment_ctx['duration']
- format_id = []
- if ism_id:
- format_id.append(ism_id)
- if stream_name:
- format_id.append(stream_name)
- format_id.append(compat_str(tbr))
-
if stream_type == 'text':
subtitles.setdefault(stream_language, []).append({
'ext': 'ismt',
@@ -2990,7 +2996,7 @@ class InfoExtractor(object):
})
elif stream_type in ('video', 'audio'):
formats.append({
- 'format_id': '-'.join(format_id),
+ 'format_id': join_nonempty(ism_id, stream_name, tbr),
'url': ism_url,
'manifest_url': ism_url,
'ext': 'ismv' if stream_type == 'video' else 'isma',