about summary refs log tree commit diff stats
path: root/hypervideo_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'hypervideo_dl/extractor/generic.py')
-rw-r--r-- hypervideo_dl/extractor/generic.py 411
1 files changed, 237 insertions, 174 deletions
diff --git a/hypervideo_dl/extractor/generic.py b/hypervideo_dl/extractor/generic.py
index f28a77e..77b6fb3 100644
--- a/hypervideo_dl/extractor/generic.py
+++ b/hypervideo_dl/extractor/generic.py
@@ -14,7 +14,9 @@ from ..utils import (
ExtractorError,
UnsupportedError,
determine_ext,
+ determine_protocol,
dict_get,
+ extract_basic_auth,
format_field,
int_or_none,
is_html,
@@ -31,7 +33,9 @@ from ..utils import (
unescapeHTML,
unified_timestamp,
unsmuggle_url,
+ update_url_query,
url_or_none,
+ urljoin,
variadic,
xpath_attr,
xpath_text,
@@ -864,21 +868,7 @@ class GenericIE(InfoExtractor):
},
},
{
- # JWPlayer config passed as variable
- 'url': 'http://www.txxx.com/videos/3326530/ariele/',
- 'info_dict': {
- 'id': '3326530_hq',
- 'ext': 'mp4',
- 'title': 'ARIELE | Tube Cup',
- 'uploader': 'www.txxx.com',
- 'age_limit': 18,
- },
- 'params': {
- 'skip_download': True,
- }
- },
- {
- # Video.js embed, multiple formats
+ # Youtube embed, formerly: Video.js embed, multiple formats
'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
'info_dict': {
'id': 'yygqldloqIk',
@@ -905,6 +895,7 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': '404 Not Found',
},
# rtl.nl embed
{
@@ -1548,19 +1539,6 @@ class GenericIE(InfoExtractor):
'add_ie': ['WashingtonPost'],
},
{
- # Mediaset embed
- 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
- 'info_dict': {
- 'id': '720642',
- 'ext': 'mp4',
- 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
- },
- 'params': {
- 'skip_download': True,
- },
- 'add_ie': ['Mediaset'],
- },
- {
# JOJ.sk embeds
'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
'info_dict': {
@@ -1864,11 +1842,6 @@ class GenericIE(InfoExtractor):
'title': 'I AM BIO Podcast | BIO',
},
'playlist_mincount': 52,
- },
- {
- # Sibnet embed (https://help.sibnet.ru/?sibnet_video_embed)
- 'url': 'https://phpbb3.x-tk.ru/bbcode-video-sibnet-t24.html',
- 'only_matching': True,
}, {
# WimTv embed player
'url': 'http://www.msmotor.tv/wearefmi-pt-2-2021/',
@@ -1885,11 +1858,13 @@ class GenericIE(InfoExtractor):
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July',
- 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+ 'description': 'Kelis - 4th Of July',
+ 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
'params': {
'skip_download': True,
},
+ 'expected_warnings': ['Untested major version'],
}, {
# KVS Player
'url': 'https://www.kvs-demo.com/embed/105/',
@@ -1898,35 +1873,12 @@ class GenericIE(InfoExtractor):
'display_id': 'kelis-4th-of-july',
'ext': 'mp4',
'title': 'Kelis - 4th Of July / Embed Player',
- 'thumbnail': 'https://kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
+ 'thumbnail': r're:https://(?:www\.)?kvs-demo.com/contents/videos_screenshots/0/105/preview.jpg',
},
'params': {
'skip_download': True,
},
}, {
- # KVS Player
- 'url': 'https://thisvid.com/videos/french-boy-pantsed/',
- 'md5': '3397979512c682f6b85b3b04989df224',
- 'info_dict': {
- 'id': '2400174',
- 'display_id': 'french-boy-pantsed',
- 'ext': 'mp4',
- 'title': 'French Boy Pantsed - ThisVid.com',
- 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
- }
- }, {
- # KVS Player
- 'url': 'https://thisvid.com/embed/2400174/',
- 'md5': '3397979512c682f6b85b3b04989df224',
- 'info_dict': {
- 'id': '2400174',
- 'display_id': 'french-boy-pantsed',
- 'ext': 'mp4',
- 'title': 'French Boy Pantsed - ThisVid.com',
- 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
- }
- }, {
- # KVS Player
'url': 'https://youix.com/video/leningrad-zoj/',
'md5': '94f96ba95706dc3880812b27b7d8a2b8',
'info_dict': {
@@ -1934,8 +1886,8 @@ class GenericIE(InfoExtractor):
'display_id': 'leningrad-zoj',
'ext': 'mp4',
'title': 'Клип: Ленинград - ЗОЖ скачать, смотреть онлайн | Youix.com',
- 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
- }
+ 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+ },
}, {
# KVS Player
'url': 'https://youix.com/embed/18485',
@@ -1945,19 +1897,20 @@ class GenericIE(InfoExtractor):
'display_id': 'leningrad-zoj',
'ext': 'mp4',
'title': 'Ленинград - ЗОЖ',
- 'thumbnail': 'https://youix.com/contents/videos_screenshots/18000/18485/preview_480x320_youix_com.mp4.jpg',
- }
+ 'thumbnail': r're:https://youix.com/contents/videos_screenshots/18000/18485/preview(?:_480x320_youix_com.mp4)?\.jpg',
+ },
}, {
# KVS Player
'url': 'https://bogmedia.org/videos/21217/40-nochey-40-nights-2016/',
'md5': '94166bdb26b4cb1fb9214319a629fc51',
'info_dict': {
'id': '21217',
- 'display_id': '40-nochey-40-nights-2016',
+ 'display_id': '40-nochey-2016',
'ext': 'mp4',
'title': '40 ночей (2016) - BogMedia.org',
+ 'description': 'md5:4e6d7d622636eb7948275432eb256dc3',
'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg',
- }
+ },
},
{
# KVS Player (for sites that serve kt_player.js via non-https urls)
@@ -1967,9 +1920,9 @@ class GenericIE(InfoExtractor):
'id': '389508',
'display_id': 'syren-de-mer-onlyfans-05-07-2020have-a-happy-safe-holiday5f014e68a220979bdb8cd-source',
'ext': 'mp4',
- 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
- 'thumbnail': 'http://www.camhub.world/contents/videos_screenshots/389000/389508/preview.mp4.jpg',
- }
+ 'title': 'Syren De Mer onlyfans_05-07-2020Have_a_happy_safe_holiday5f014e68a220979bdb8cd_source / Embed плеер',
+ 'thumbnail': r're:https?://www\.camhub\.world/contents/videos_screenshots/389000/389508/preview\.mp4\.jpg',
+ },
},
{
# Reddit-hosted video that will redirect and be processed by RedditIE
@@ -2172,7 +2125,79 @@ class GenericIE(InfoExtractor):
'age_limit': 0,
'direct': True,
}
- }
+ },
+ {
+ 'note': 'server returns data in brotli compression by default if `accept-encoding: *` is specified.',
+ 'url': 'https://www.extra.cz/cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
+ 'info_dict': {
+ 'id': 'cauky-lidi-70-dil-babis-predstavil-pohadky-prymulanek-nebo-andrejovy-nove-saty-ac867',
+ 'ext': 'mp4',
+ 'title': 'čauky lidi 70 finall',
+ 'description': 'čauky lidi 70 finall',
+ 'thumbnail': 'h',
+ 'upload_date': '20220606',
+ 'timestamp': 1654513791,
+ 'duration': 318.0,
+ 'direct': True,
+ 'age_limit': 0,
+ },
+ },
+ {
+ 'note': 'JW Player embed with unicode-escape sequences in URL',
+ 'url': 'https://www.medici.tv/en/concerts/lahav-shani-mozart-mahler-israel-philharmonic-abu-dhabi-classics',
+ 'info_dict': {
+ 'id': 'm',
+ 'ext': 'mp4',
+ 'title': 'Lahav Shani conducts the Israel Philharmonic\'s first-ever concert in Abu Dhabi',
+ 'description': 'Mahler\'s ',
+ 'uploader': 'www.medici.tv',
+ 'age_limit': 0,
+ 'thumbnail': r're:^https?://.+\.jpg',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/',
+ 'md5': 'e2f0a4c329f7986280b7328e24036d60',
+ 'info_dict': {
+ 'id': '284002',
+ 'display_id': 'just-out-of-the-shower-joi',
+ 'ext': 'mp4',
+ 'title': 'Just Out Of The Shower JOI - Shooshtime',
+ 'thumbnail': 'https://i.shoosh.co/contents/videos_screenshots/284000/284002/preview.mp4.jpg',
+ 'height': 720,
+ 'age_limit': 18,
+ },
+ },
+ {
+ 'note': 'Live HLS direct link',
+ 'url': 'https://d18j67ugtrocuq.cloudfront.net/out/v1/2767aec339144787926bd0322f72c6e9/index.m3u8',
+ 'info_dict': {
+ 'id': 'index',
+ 'title': r're:index',
+ 'ext': 'mp4',
+ 'live_status': 'is_live',
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ },
+ {
+ 'note': 'Video.js VOD HLS',
+ 'url': 'https://gist.githubusercontent.com/bashonly/2aae0862c50f4a4b84f220c315767208/raw/e3380d413749dabbe804c9c2d8fd9a45142475c7/videojs_hls_test.html',
+ 'info_dict': {
+ 'id': 'videojs_hls_test',
+ 'title': 'video',
+ 'ext': 'mp4',
+ 'age_limit': 0,
+ 'duration': 1800,
+ },
+ 'params': {
+ 'skip_download': 'm3u8',
+ },
+ },
]
def report_following_redirect(self, new_url):
@@ -2189,12 +2214,41 @@ class GenericIE(InfoExtractor):
self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')
- def _fragment_query(self, url):
- if self._configuration_arg('fragment_query'):
- query_string = urllib.parse.urlparse(url).query
- if query_string:
- return {'extra_param_to_segment_url': query_string}
- return {}
+ def _extra_manifest_info(self, info, manifest_url):
+ fragment_query = self._configuration_arg('fragment_query', [None], casesense=True)[0]
+ if fragment_query is not None:
+ info['extra_param_to_segment_url'] = (
+ urllib.parse.urlparse(fragment_query).query or fragment_query
+ or urllib.parse.urlparse(manifest_url).query or None)
+
+ hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None
+ info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key', casesense=True), {
+ 'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}),
+ }) or None
+
+ variant_query = self._configuration_arg('variant_query', [None], casesense=True)[0]
+ if variant_query is not None:
+ query = urllib.parse.parse_qs(
+ urllib.parse.urlparse(variant_query).query or variant_query
+ or urllib.parse.urlparse(manifest_url).query)
+ for fmt in self._downloader._get_formats(info):
+ fmt['url'] = update_url_query(fmt['url'], query)
+
+ # Attempt to detect live HLS or set VOD duration
+ m3u8_format = next((f for f in self._downloader._get_formats(info)
+ if determine_protocol(f) == 'm3u8_native'), None)
+ if m3u8_format:
+ is_live = self._configuration_arg('is_live', [None])[0]
+ if is_live is not None:
+ info['live_status'] = 'not_live' if is_live == 'false' else 'is_live'
+ return
+ headers = m3u8_format.get('http_headers') or info.get('http_headers')
+ duration = self._extract_m3u8_vod_duration(
+ m3u8_format['url'], info.get('id'), note='Checking m3u8 live status',
+ errnote='Failed to download m3u8 media playlist', headers=headers)
+ if not duration:
+ info['live_status'] = 'is_live'
+ info['duration'] = info.get('duration') or duration
def _extract_rss(self, url, video_id, doc):
NS_MAP = {
@@ -2238,43 +2292,87 @@ class GenericIE(InfoExtractor):
'entries': entries,
}
- def _kvs_getrealurl(self, video_url, license_code):
+ @classmethod
+ def _kvs_get_real_url(cls, video_url, license_code):
if not video_url.startswith('function/0/'):
return video_url # not obfuscated
- url_path, _, url_query = video_url.partition('?')
- urlparts = url_path.split('/')[2:]
- license = self._kvs_getlicensetoken(license_code)
- newmagic = urlparts[5][:32]
+ parsed = urllib.parse.urlparse(video_url[len('function/0/'):])
+ license = cls._kvs_get_license_token(license_code)
+ urlparts = parsed.path.split('/')
- for o in range(len(newmagic) - 1, -1, -1):
- new = ''
- l = (o + sum(int(n) for n in license[o:])) % 32
+ HASH_LENGTH = 32
+ hash = urlparts[3][:HASH_LENGTH]
+ indices = list(range(HASH_LENGTH))
- for i in range(0, len(newmagic)):
- if i == o:
- new += newmagic[l]
- elif i == l:
- new += newmagic[o]
- else:
- new += newmagic[i]
- newmagic = new
+ # Swap indices of hash according to the destination calculated from the license token
+ accum = 0
+ for src in reversed(range(HASH_LENGTH)):
+ accum += license[src]
+ dest = (src + accum) % HASH_LENGTH
+ indices[src], indices[dest] = indices[dest], indices[src]
+
+ urlparts[3] = ''.join(hash[index] for index in indices) + urlparts[3][HASH_LENGTH:]
+ return urllib.parse.urlunparse(parsed._replace(path='/'.join(urlparts)))
- urlparts[5] = newmagic + urlparts[5][32:]
- return '/'.join(urlparts) + '?' + url_query
+ @staticmethod
+ def _kvs_get_license_token(license):
+ license = license.replace('$', '')
+ license_values = [int(char) for char in license]
- def _kvs_getlicensetoken(self, license):
- modlicense = license.replace('$', '').replace('0', '1')
- center = int(len(modlicense) / 2)
+ modlicense = license.replace('0', '1')
+ center = len(modlicense) // 2
fronthalf = int(modlicense[:center + 1])
backhalf = int(modlicense[center:])
+ modlicense = str(4 * abs(fronthalf - backhalf))[:center + 1]
+
+ return [
+ (license_values[index + offset] + current) % 10
+ for index, current in enumerate(map(int, modlicense))
+ for offset in range(4)
+ ]
+
+ def _extract_kvs(self, url, webpage, video_id):
+ flashvars = self._search_json(
+ r'(?s:<script\b[^>]*>.*?var\s+flashvars\s*=)',
+ webpage, 'flashvars', video_id, transform_source=js_to_json)
+
+ # extract the part after the last / as the display_id from the
+ # canonical URL.
+ display_id = self._search_regex(
+ r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
+ r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
+ webpage, 'display_id', fatal=False)
+ title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
+
+ thumbnail = flashvars['preview_url']
+ if thumbnail.startswith('//'):
+ protocol, _, _ = url.partition('/')
+ thumbnail = protocol + thumbnail
+
+ url_keys = list(filter(re.compile(r'^video_(?:url|alt_url\d*)$').match, flashvars.keys()))
+ formats = []
+ for key in url_keys:
+ if '/get_file/' not in flashvars[key]:
+ continue
+ format_id = flashvars.get(f'{key}_text', key)
+ formats.append({
+ 'url': urljoin(url, self._kvs_get_real_url(flashvars[key], flashvars['license_code'])),
+ 'format_id': format_id,
+ 'ext': 'mp4',
+ **(parse_resolution(format_id) or parse_resolution(flashvars[key])),
+ 'http_headers': {'Referer': url},
+ })
+ if not formats[-1].get('height'):
+ formats[-1]['quality'] = 1
- modlicense = str(4 * abs(fronthalf - backhalf))
- retval = ''
- for o in range(0, center + 1):
- for i in range(1, 5):
- retval += str((int(license[o + i]) + int(modlicense[o])) % 10)
- return retval
+ return {
+ 'id': flashvars['video_id'],
+ 'display_id': display_id,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
def _real_extract(self, url):
if url.startswith('//'):
@@ -2330,13 +2428,12 @@ class GenericIE(InfoExtractor):
# It may probably better to solve this by checking Content-Type for application/octet-stream
# after a HEAD request, but not sure if we can rely on this.
full_response = self._request_webpage(url, video_id, headers={
- 'Accept-Encoding': '*',
+ 'Accept-Encoding': 'identity',
**smuggled_data.get('http_headers', {})
})
- new_url = full_response.geturl()
- if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl():
- url = new_url
- elif url != new_url:
+ new_url = full_response.url
+ url = urllib.parse.urlparse(url)._replace(scheme=urllib.parse.urlparse(new_url).scheme).geturl()
+ if new_url != extract_basic_auth(url)[0]:
self.report_following_redirect(new_url)
if force_videoid:
new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
@@ -2355,14 +2452,13 @@ class GenericIE(InfoExtractor):
self.report_detected('direct video link')
headers = smuggled_data.get('http_headers', {})
format_id = str(m.group('format_id'))
+ ext = determine_ext(url)
subtitles = {}
- if format_id.endswith('mpegurl'):
+ if format_id.endswith('mpegurl') or ext == 'm3u8':
formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
- info_dict.update(self._fragment_query(url))
- elif format_id.endswith('mpd') or format_id.endswith('dash+xml'):
+ elif format_id.endswith('mpd') or format_id.endswith('dash+xml') or ext == 'mpd':
formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers)
- info_dict.update(self._fragment_query(url))
- elif format_id == 'f4m':
+ elif format_id == 'f4m' or ext == 'f4m':
formats = self._extract_f4m_formats(url, video_id, headers=headers)
else:
formats = [{
@@ -2374,8 +2470,9 @@ class GenericIE(InfoExtractor):
info_dict.update({
'formats': formats,
'subtitles': subtitles,
- 'http_headers': headers,
+ 'http_headers': headers or None,
})
+ self._extra_manifest_info(info_dict, url)
return info_dict
if not self.get_param('test', False) and not is_intentional:
@@ -2388,7 +2485,7 @@ class GenericIE(InfoExtractor):
if first_bytes.startswith(b'#EXTM3U'):
self.report_detected('M3U playlist')
info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
- info_dict.update(self._fragment_query(url))
+ self._extra_manifest_info(info_dict, url)
return info_dict
# Maybe it's a direct link to a video?
@@ -2432,14 +2529,14 @@ class GenericIE(InfoExtractor):
return self.playlist_result(
self._parse_xspf(
doc, video_id, xspf_url=url,
- xspf_base_url=full_response.geturl()),
+ xspf_base_url=full_response.url),
video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
info_dict['formats'], info_dict['subtitles'] = self._parse_mpd_formats_and_subtitles(
doc,
- mpd_base_url=full_response.geturl().rpartition('/')[0],
+ mpd_base_url=full_response.url.rpartition('/')[0],
mpd_url=url)
- info_dict.update(self._fragment_query(url))
+ self._extra_manifest_info(info_dict, url)
self.report_detected('DASH manifest')
return info_dict
elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag):
@@ -2465,7 +2562,7 @@ class GenericIE(InfoExtractor):
self._downloader.write_debug('Looking for embeds')
embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict))
if len(embeds) == 1:
- return {**info_dict, **embeds[0]}
+ return merge_dicts(embeds[0], info_dict)
elif embeds:
return self.playlist_result(embeds, **info_dict)
raise UnsupportedError(url)
@@ -2475,7 +2572,7 @@ class GenericIE(InfoExtractor):
info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation
video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url)
url, smuggled_data = unsmuggle_url(url, {})
- actual_url = urlh.geturl() if urlh else url
+ actual_url = urlh.url if urlh else url
# Sometimes embedded video player is hidden behind percent encoding
# (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)
@@ -2528,8 +2625,7 @@ class GenericIE(InfoExtractor):
varname = mobj.group(1)
sources = variadic(self._parse_json(
mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or [])
- formats = []
- subtitles = {}
+ formats, subtitles, src = [], {}, None
for source in sources:
src = source.get('src')
if not src or not isinstance(src, str):
@@ -2552,8 +2648,6 @@ class GenericIE(InfoExtractor):
m3u8_id='hls', fatal=False)
formats.extend(fmts)
self._merge_subtitles(subs, target=subtitles)
- for fmt in formats:
- fmt.update(self._fragment_query(src))
if not formats:
formats.append({
@@ -2569,11 +2663,11 @@ class GenericIE(InfoExtractor):
for sub_match in re.finditer(rf'(?s){re.escape(varname)}' r'\.addRemoteTextTrack\(({.+?})\s*,\s*(?:true|false)\)', webpage):
sub = self._parse_json(
sub_match.group(1), video_id, transform_source=js_to_json, fatal=False) or {}
- src = str_or_none(sub.get('src'))
- if not src:
+ sub_src = str_or_none(sub.get('src'))
+ if not sub_src:
continue
subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
- 'url': urllib.parse.urljoin(url, src),
+ 'url': urllib.parse.urljoin(url, sub_src),
'name': sub.get('label'),
'http_headers': {
'Referer': actual_url,
@@ -2581,7 +2675,21 @@ class GenericIE(InfoExtractor):
})
if formats or subtitles:
self.report_detected('video.js embed')
- return [{'formats': formats, 'subtitles': subtitles}]
+ info_dict = {'formats': formats, 'subtitles': subtitles}
+ if formats:
+ self._extra_manifest_info(info_dict, src)
+ return [info_dict]
+
+ # Look for generic KVS player (before json-ld bc of some urls that break otherwise)
+ found = self._search_regex((
+ r'<script\b[^>]+?\bsrc\s*=\s*(["\'])https?://(?:(?!\1)[^?#])+/kt_player\.js\?v=(?P<ver>\d+(?:\.\d+)+)\1[^>]*>',
+ r'kt_player\s*\(\s*(["\'])(?:(?!\1)[\w\W])+\1\s*,\s*(["\'])https?://(?:(?!\2)[^?#])+/kt_player\.swf\?v=(?P<ver>\d+(?:\.\d+)+)\2\s*,',
+ ), webpage, 'KVS player', group='ver', default=False)
+ if found:
+ self.report_detected('KVS Player')
+ if found.split('.')[0] not in ('4', '5', '6'):
+ self.report_warning(f'Untested major version ({found}) in player engine - download may fail.')
+ return [self._extract_kvs(url, webpage, video_id)]
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(webpage, video_id, default={})
@@ -2626,52 +2734,6 @@ class GenericIE(InfoExtractor):
if found:
self.report_detected('JW Player embed')
if not found:
- # Look for generic KVS player
- found = re.search(r'<script [^>]*?src="https?://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage)
- if found:
- self.report_detected('KWS Player')
- if found.group('maj_ver') not in ['4', '5']:
- self.report_warning('Untested major version (%s) in player engine--Download may fail.' % found.group('ver'))
- flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage)
- flashvars = self._parse_json(flashvars.group(1), video_id, transform_source=js_to_json)
-
- # extract the part after the last / as the display_id from the
- # canonical URL.
- display_id = self._search_regex(
- r'(?:<link href="https?://[^"]+/(.+?)/?" rel="canonical"\s*/?>'
- r'|<link rel="canonical" href="https?://[^"]+/(.+?)/?"\s*/?>)',
- webpage, 'display_id', fatal=False
- )
- title = self._html_search_regex(r'<(?:h1|title)>(?:Video: )?(.+?)</(?:h1|title)>', webpage, 'title')
-
- thumbnail = flashvars['preview_url']
- if thumbnail.startswith('//'):
- protocol, _, _ = url.partition('/')
- thumbnail = protocol + thumbnail
-
- url_keys = list(filter(re.compile(r'video_url|video_alt_url\d*').fullmatch, flashvars.keys()))
- formats = []
- for key in url_keys:
- if '/get_file/' not in flashvars[key]:
- continue
- format_id = flashvars.get(f'{key}_text', key)
- formats.append({
- 'url': self._kvs_getrealurl(flashvars[key], flashvars['license_code']),
- 'format_id': format_id,
- 'ext': 'mp4',
- **(parse_resolution(format_id) or parse_resolution(flashvars[key]))
- })
- if not formats[-1].get('height'):
- formats[-1]['quality'] = 1
-
- return [{
- 'id': flashvars['video_id'],
- 'display_id': display_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }]
- if not found:
# Broaden the search a little bit
found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
if found:
@@ -2751,6 +2813,7 @@ class GenericIE(InfoExtractor):
entries = []
for video_url in orderedSet(found):
+ video_url = video_url.encode().decode('unicode-escape')
video_url = unescapeHTML(video_url)
video_url = video_url.replace('\\/', '/')
video_url = urllib.parse.urljoin(url, video_url)
@@ -2790,10 +2853,10 @@ class GenericIE(InfoExtractor):
return [self._extract_xspf_playlist(video_url, video_id)]
elif ext == 'm3u8':
entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers)
- entry_info_dict.update(self._fragment_query(video_url))
+ self._extra_manifest_info(entry_info_dict, video_url)
elif ext == 'mpd':
entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers)
- entry_info_dict.update(self._fragment_query(video_url))
+ self._extra_manifest_info(entry_info_dict, video_url)
elif ext == 'f4m':
entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers)
elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: