From 6b34d7cb270b8aed380a60298021c6494dd7964c Mon Sep 17 00:00:00 2001 From: Jesus Date: Mon, 11 Sep 2023 03:01:59 +0800 Subject: update from upstream --- README.md | 2 +- hypervideo_dl/__init__.py | 2 +- hypervideo_dl/extractor/_extractors.py | 5 +- hypervideo_dl/extractor/facebook.py | 41 +++++-- hypervideo_dl/extractor/gofile.py | 2 +- hypervideo_dl/extractor/noodlemagazine.py | 35 +++--- hypervideo_dl/extractor/s4c.py | 57 +++++++-- hypervideo_dl/extractor/twitter.py | 187 +++++++++++++++++++++--------- hypervideo_dl/extractor/wdr.py | 19 ++- 9 files changed, 258 insertions(+), 92 deletions(-) diff --git a/README.md b/README.md index ffa2289..2a97649 100644 --- a/README.md +++ b/README.md @@ -1731,7 +1731,7 @@ The following extractors use this feature: * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` #### twitter -* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed +* `api`: Select one of `graphql` (default), `legacy` or `syndication` as the API for tweet extraction. Has no effect if logged in #### stacommu, wrestleuniverse * `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage diff --git a/hypervideo_dl/__init__.py b/hypervideo_dl/__init__.py index 42f5e8d..275ab85 100644 --- a/hypervideo_dl/__init__.py +++ b/hypervideo_dl/__init__.py @@ -956,7 +956,7 @@ def _real_main(argv=None): FFmpegPostProcessor._ffmpeg_location.set(opts.ffmpeg_location) with YoutubeDL(ydl_opts) as ydl: - pre_process = opts.rm_cachedir + pre_process = opts.update_self or opts.rm_cachedir actual_use = all_urls or opts.load_info_filename if opts.rm_cachedir: diff --git a/hypervideo_dl/extractor/_extractors.py b/hypervideo_dl/extractor/_extractors.py index f11554b..b788737 100644 --- a/hypervideo_dl/extractor/_extractors.py +++ b/hypervideo_dl/extractor/_extractors.py @@ -1710,7 +1710,10 @@ from .ruv import ( RuvIE, RuvSpilaIE ) -from .s4c import S4CIE +from .s4c import ( + S4CIE, + S4CSeriesIE +) from .safari import ( SafariIE, SafariApiIE, diff --git a/hypervideo_dl/extractor/facebook.py b/hypervideo_dl/extractor/facebook.py index 021c3cf..6a61e5f 100644 --- a/hypervideo_dl/extractor/facebook.py +++ b/hypervideo_dl/extractor/facebook.py @@ -74,6 +74,22 @@ class FacebookIE(InfoExtractor): _VIDEO_PAGE_TAHOE_TEMPLATE = 'https://www.facebook.com/video/tahoe/async/%s/?chain=true&isvideo=true&payloadtype=primary' _TESTS = [{ + 'url': 'https://www.facebook.com/radiokicksfm/videos/3676516585958356/', + 'info_dict': { + 'id': '3676516585958356', + 'ext': 'mp4', + 'title': 'dr Adam Przygoda', + 'description': 'md5:34675bda53336b1d16400265c2bb9b3b', + 'uploader': 'RADIO KICKS FM', + 'upload_date': '20230818', + 'timestamp': 1692346159, + 'thumbnail': r're:^https?://.*', + 'uploader_id': '100063551323670', + 'duration': 3132.184, + 'view_count': int, + 'concurrent_view_count': 0, + }, + }, { 'url': 'https://www.facebook.com/video.php?v=637842556329505&fref=nf', 'md5': '6a40d33c0eccbb1af76cf0485a052659', 'info_dict': { @@ -97,7 +113,7 @@ class FacebookIE(InfoExtractor): 'upload_date': '20140506', 'timestamp': 1399398998, 'thumbnail': r're:^https?://.*', - 'uploader_id': 'pfbid04scW44U4P9iTyLZAGy8y8W3pR3i2VugvHCimiRudUAVbN3MPp9eXBaYFcgVworZwl', + 'uploader_id': 'pfbid028wxorhX2ErLFJ578N6P3crHD3PHmXTCqCvfBpsnbSLmbokwSY75p5hWBjHGkG4zxl', 'duration': 131.03, 'concurrent_view_count': int, }, @@ 
-179,7 +195,7 @@ class FacebookIE(InfoExtractor): 'timestamp': 1486648217, 'upload_date': '20170209', 'uploader': 'Yaroslav Korpan', - 'uploader_id': 'pfbid029y8j22EwH3ikeqgH3SEP9G3CAi9kmWKgXJJG9s5geV7mo3J2bvURqHCdgucRgAyhl', + 'uploader_id': 'pfbid06AScABAWcW91qpiuGrLt99Ef9tvwHoXP6t8KeFYEqkSfreMtfa9nTveh8b2ZEVSWl', 'concurrent_view_count': int, 'thumbnail': r're:^https?://.*', 'view_count': int, @@ -274,7 +290,7 @@ class FacebookIE(InfoExtractor): 'title': 'Josef', 'thumbnail': r're:^https?://.*', 'concurrent_view_count': int, - 'uploader_id': 'pfbid02gXHbDwxumkaKJQaTGUf3znYfYzTuidGEWawiramNx4YamSj2afwYSRkpcjtHtMRJl', + 'uploader_id': 'pfbid0cibUN6tV7DYgdbJdsUFN46wc4jKpVSPAvJQhFofGqBGmVn3V3JtAs2tfUwziw2hUl', 'timestamp': 1549275572, 'duration': 3.413, 'uploader': 'Josef Novak', @@ -401,9 +417,9 @@ class FacebookIE(InfoExtractor): def extract_metadata(webpage): post_data = [self._parse_json(j, video_id, fatal=False) for j in re.findall( - r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] + r'data-sjs>({.*?ScheduledServerJS.*?})', webpage)] post = traverse_obj(post_data, ( - ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] + ..., 'require', ..., ..., ..., '__bbox', 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) @@ -493,14 +509,14 @@ class FacebookIE(InfoExtractor): def extract_relay_data(_filter): return self._parse_json(self._search_regex( - r'handleWithCustomApplyEach\([^,]+,\s*({.*?%s.*?})\);' % _filter, + r'data-sjs>({.*?%s.*?})' % _filter, webpage, 'replay data', default='{}'), video_id, fatal=False) or {} def extract_relay_prefetched_data(_filter): - replay_data = extract_relay_data(_filter) - for require in (replay_data.get('require') or []): - if require[0] == 'RelayPrefetchedStreamCache': - return try_get(require, lambda x: x[3][1]['__bbox']['result']['data'], dict) or {} + return traverse_obj(extract_relay_data(_filter), ( + 'require', (None, (..., ..., ..., '__bbox', 'require')), + lambda _, v: 'RelayPrefetchedStreamCache' in v, ..., ..., + '__bbox', 'result', 'data', {dict}), get_all=False) or {} if not video_data: server_js_data = self._parse_json(self._search_regex([ @@ -511,7 +527,7 @@ class FacebookIE(InfoExtractor): if not video_data: data = extract_relay_prefetched_data( - r'"(?:dash_manifest|playable_url(?:_quality_hd)?)"\s*:\s*"[^"]+"') + r'"(?:dash_manifest|playable_url(?:_quality_hd)?)') if data: entries = [] @@ -526,7 +542,8 @@ class FacebookIE(InfoExtractor): formats = [] q = qualities(['sd', 'hd']) for key, format_id in (('playable_url', 'sd'), ('playable_url_quality_hd', 'hd'), - ('playable_url_dash', '')): + ('playable_url_dash', ''), ('browser_native_hd_url', 'hd'), + ('browser_native_sd_url', 'sd')): playable_url = video.get(key) if not playable_url: continue diff --git a/hypervideo_dl/extractor/gofile.py b/hypervideo_dl/extractor/gofile.py index ddbce2e..8983905 100644 --- a/hypervideo_dl/extractor/gofile.py +++ b/hypervideo_dl/extractor/gofile.py @@ -66,7 +66,7 @@ class GofileIE(InfoExtractor): query_params = { 'contentId': file_id, 'token': self._TOKEN, - 'websiteToken': 12345, + 'websiteToken': '7fd94ds12fds4', # From https://gofile.io/dist/js/alljs.js } password = self.get_param('videopassword') if password: diff --git 
a/hypervideo_dl/extractor/noodlemagazine.py b/hypervideo_dl/extractor/noodlemagazine.py index 1cea0db..1c1a763 100644 --- a/hypervideo_dl/extractor/noodlemagazine.py +++ b/hypervideo_dl/extractor/noodlemagazine.py @@ -1,7 +1,5 @@ from .common import InfoExtractor from ..utils import ( - extract_attributes, - get_element_html_by_id, int_or_none, parse_count, parse_duration, @@ -42,27 +40,36 @@ class NoodleMagazineIE(InfoExtractor): like_count = parse_count(self._html_search_meta('ya:ovs:likes', webpage, default=None)) upload_date = unified_strdate(self._html_search_meta('ya:ovs:upload_date', webpage, default='')) - player_path = extract_attributes(get_element_html_by_id('iplayer', webpage) or '')['src'] + def build_url(url_or_path): + return urljoin('https://adult.noodlemagazine.com', url_or_path) + + headers = {'Referer': url} + player_path = self._html_search_regex( + r']+\bid="iplayer"[^>]+\bsrc="([^"]+)"', webpage, 'player path') player_iframe = self._download_webpage( - urljoin('https://adult.noodlemagazine.com', player_path), video_id, 'Downloading iframe page') + build_url(player_path), video_id, 'Downloading iframe page', headers=headers) playlist_url = self._search_regex( r'window\.playlistUrl\s*=\s*["\']([^"\']+)["\']', player_iframe, 'playlist url') - playlist_info = self._download_json( - urljoin('https://adult.noodlemagazine.com', playlist_url), video_id, headers={'Referer': url}) + playlist_info = self._download_json(build_url(playlist_url), video_id, headers=headers) - thumbnail = self._og_search_property('image', webpage, default=None) or playlist_info.get('image') - formats = traverse_obj(playlist_info, ('sources', lambda _, v: v['file'], { - 'url': 'file', - 'format_id': 'label', - 'height': ('label', {int_or_none}), - 'ext': 'type', - })) + formats = [] + for source in traverse_obj(playlist_info, ('sources', lambda _, v: v['file'])): + if source.get('type') == 'hls': + formats.extend(self._extract_m3u8_formats( + build_url(source['file']), video_id, 'mp4', fatal=False, m3u8_id='hls')) + else: + formats.append(traverse_obj(source, { + 'url': ('file', {build_url}), + 'format_id': 'label', + 'height': ('label', {int_or_none}), + 'ext': 'type', + })) return { 'id': video_id, 'formats': formats, 'title': title, - 'thumbnail': thumbnail, + 'thumbnail': self._og_search_property('image', webpage, default=None) or playlist_info.get('image'), 'duration': duration, 'description': description, 'tags': tags, diff --git a/hypervideo_dl/extractor/s4c.py b/hypervideo_dl/extractor/s4c.py index 38a9058..990ea2b 100644 --- a/hypervideo_dl/extractor/s4c.py +++ b/hypervideo_dl/extractor/s4c.py @@ -1,5 +1,5 @@ from .common import InfoExtractor -from ..utils import traverse_obj +from ..utils import traverse_obj, url_or_none class S4CIE(InfoExtractor): @@ -11,7 +11,8 @@ class S4CIE(InfoExtractor): 'ext': 'mp4', 'title': 'Y Swn', 'description': 'md5:f7681a30e4955b250b3224aa9fe70cf0', - 'duration': 5340 + 'duration': 5340, + 'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Y_Swn_2023S4C_099_ii.jpg' }, }, { 'url': 'https://www.s4c.cymru/clic/programme/856636948', @@ -21,6 +22,7 @@ class S4CIE(InfoExtractor): 'title': 'Am Dro', 'duration': 2880, 'description': 'md5:100d8686fc9a632a0cb2db52a3433ffe', + 'thumbnail': 'https://www.s4c.cymru/amg/1920x1080/Am_Dro_2022-23S4C_P6_4005.jpg' }, }] @@ -30,7 +32,7 @@ class S4CIE(InfoExtractor): f'https://www.s4c.cymru/df/full_prog_details?lang=e&programme_id={video_id}', video_id, fatal=False) - filename = self._download_json( + player_config = 
self._download_json( 'https://player-api.s4c-cdn.co.uk/player-configuration/prod', video_id, query={ 'programme_id': video_id, 'signed': '0', @@ -38,7 +40,13 @@ class S4CIE(InfoExtractor): 'mode': 'od', 'appId': 'clic', 'streamName': '', - }, note='Downloading player config JSON')['filename'] + }, note='Downloading player config JSON') + subtitles = {} + for sub in traverse_obj(player_config, ('subtitles', lambda _, v: url_or_none(v['0']))): + subtitles.setdefault(sub.get('3', 'en'), []).append({ + 'url': sub['0'], + 'name': sub.get('1'), + }) m3u8_url = self._download_json( 'https://player-api.s4c-cdn.co.uk/streaming-urls/prod', video_id, query={ 'mode': 'od', @@ -46,17 +54,52 @@ class S4CIE(InfoExtractor): 'region': 'WW', 'extra': 'false', 'thirdParty': 'false', - 'filename': filename, + 'filename': player_config['filename'], }, note='Downloading streaming urls JSON')['hls'] - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') return { 'id': video_id, - 'formats': formats, + 'formats': self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls'), 'subtitles': subtitles, + 'thumbnail': url_or_none(player_config.get('poster')), **traverse_obj(details, ('full_prog_details', 0, { 'title': (('programme_title', 'series_title'), {str}), 'description': ('full_billing', {str.strip}), 'duration': ('duration', {lambda x: int(x) * 60}), }), get_all=False), } + + +class S4CSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?s4c\.cymru/clic/series/(?P\d+)' + _TESTS = [{ + 'url': 'https://www.s4c.cymru/clic/series/864982911', + 'playlist_mincount': 6, + 'info_dict': { + 'id': '864982911', + 'title': 'Iaith ar Daith', + 'description': 'md5:e878ebf660dce89bd2ef521d7ce06397' + }, + }, { + 'url': 'https://www.s4c.cymru/clic/series/866852587', + 'playlist_mincount': 8, + 'info_dict': { + 'id': '866852587', + 'title': 'FFIT Cymru', + 'description': 'md5:abcb3c129cb68dbb6cd304fd33b07e96' + }, + }] + + def _real_extract(self, url): + series_id = self._match_id(url) + series_details = self._download_json( + 'https://www.s4c.cymru/df/series_details', series_id, query={ + 'lang': 'e', + 'series_id': series_id, + 'show_prog_in_series': 'Y' + }, note='Downloading series details JSON') + + return self.playlist_result( + [self.url_result(f'https://www.s4c.cymru/clic/programme/{episode_id}', S4CIE, episode_id) + for episode_id in traverse_obj(series_details, ('other_progs_in_series', ..., 'id'))], + series_id, traverse_obj(series_details, ('full_prog_details', 0, 'series_title', {str}))) diff --git a/hypervideo_dl/extractor/twitter.py b/hypervideo_dl/extractor/twitter.py index 66d1eb8..d0b7cb1 100644 --- a/hypervideo_dl/extractor/twitter.py +++ b/hypervideo_dl/extractor/twitter.py @@ -1,9 +1,10 @@ -import functools import json +import random import re from .common import InfoExtractor from .periscope import PeriscopeBaseIE, PeriscopeIE +from ..compat import functools # isort: split from ..compat import ( compat_parse_qs, compat_urllib_parse_unquote, @@ -147,10 +148,14 @@ class TwitterBaseIE(InfoExtractor): def is_logged_in(self): return bool(self._get_cookies(self._API_BASE).get('auth_token')) + @functools.cached_property + def _selected_api(self): + return self._configuration_arg('api', ['graphql'], ie_key='Twitter')[0] + def _fetch_guest_token(self, display_id): guest_token = traverse_obj(self._download_json( f'{self._API_BASE}guest/activate.json', display_id, 'Downloading guest token', data=b'', - headers=self._set_base_headers(legacy=display_id 
and self._configuration_arg('legacy_api'))), + headers=self._set_base_headers(legacy=display_id and self._selected_api == 'legacy')), ('guest_token', {str})) if not guest_token: raise ExtractorError('Could not retrieve guest token') @@ -295,7 +300,7 @@ class TwitterBaseIE(InfoExtractor): self.report_login() def _call_api(self, path, video_id, query={}, graphql=False): - headers = self._set_base_headers(legacy=not graphql and self._configuration_arg('legacy_api')) + headers = self._set_base_headers(legacy=not graphql and self._selected_api == 'legacy') headers.update({ 'x-twitter-auth-type': 'OAuth2Session', 'x-twitter-client-language': 'en', @@ -707,6 +712,7 @@ class TwitterIE(TwitterBaseIE): 'tags': [], 'age_limit': 0, }, + 'skip': 'This Tweet is unavailable', }, { # not available in Periscope 'url': 'https://twitter.com/ViviEducation/status/1136534865145286656', @@ -721,6 +727,7 @@ class TwitterIE(TwitterBaseIE): 'view_count': int, }, 'add_ie': ['TwitterBroadcast'], + 'skip': 'Broadcast no longer exists', }, { # unified card 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', @@ -773,9 +780,9 @@ class TwitterIE(TwitterBaseIE): 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', 'info_dict': { 'id': '1577719286659006464', - 'title': 'Ultima📛 | #вʟм - Test', + 'title': 'Ultima📛| New Era - Test', 'description': 'Test https://t.co/Y3KEZD7Dad', - 'uploader': 'Ultima📛 | #вʟм', + 'uploader': 'Ultima📛| New Era', 'uploader_id': 'UltimaShadowX', 'uploader_url': 'https://twitter.com/UltimaShadowX', 'upload_date': '20221005', @@ -811,7 +818,7 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 0, }, }, { - # Adult content, fails if not logged in (GraphQL) + # Adult content, fails if not logged in 'url': 'https://twitter.com/Rizdraws/status/1575199173472927762', 'info_dict': { 'id': '1575199163847000068', @@ -831,9 +838,10 @@ class TwitterIE(TwitterBaseIE): 'age_limit': 18, 'tags': [] }, + 'params': {'skip_download': 'The media could not be played'}, 'skip': 'Requires authentication', }, { - # Playlist result only with auth + # Playlist result only with graphql API 'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', 'playlist_mincount': 2, 'info_dict': { @@ -898,7 +906,7 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': 'MoniqueCamarra', 'live_status': 'was_live', 'release_timestamp': 1658417414, - 'description': 'md5:4dc8e972f1d8b3c6580376fabb02a3ad', + 'description': 'md5:acce559345fd49f129c20dbcda3f1201', 'timestamp': 1658407771, 'release_date': '20220721', 'upload_date': '20220721', @@ -1007,10 +1015,10 @@ class TwitterIE(TwitterBaseIE): 'view_count': int, 'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig', 'age_limit': 0, - 'uploader': 'Mün The Friend Of YWAP', + 'uploader': 'Mün', 'repost_count': int, 'upload_date': '20221206', - 'title': 'Mün The Friend Of YWAP - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', + 'title': 'Mün - This is a genius ad by Apple. 
\U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', 'comment_count': int, 'like_count': int, 'tags': [], @@ -1019,7 +1027,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1670306984.0, }, }, { - # url to retweet id w/ legacy api + # retweeted_status (private) 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', 'info_dict': { 'id': '1623274794488659969', @@ -1039,32 +1047,84 @@ class TwitterIE(TwitterBaseIE): 'like_count': int, 'repost_count': int, }, - 'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}}, 'skip': 'Protected tweet', }, { - # orig tweet w/ graphql - 'url': 'https://twitter.com/liberdalau/status/1623739803874349067', + # retweeted_status + 'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009', 'info_dict': { - 'id': '1623274794488659969', - 'display_id': '1623739803874349067', + 'id': '1694928337846538240', 'ext': 'mp4', - 'title': '@selfisekai@hackerspace.pl 🐀 - RT @Johnnybull3ts: Me after going viral to over 30million people: Whoopsie-daisy', - 'description': 'md5:9258bdbb54793bdc124fe1cd47e96c6a', - 'uploader': '@selfisekai@hackerspace.pl 🐀', - 'uploader_id': 'liberdalau', - 'uploader_url': 'https://twitter.com/liberdalau', + 'display_id': '1695424220702888009', + 'title': 'md5:e8daa9527bc2b947121395494f786d9d', + 'description': 'md5:004f2d37fd58737724ec75bc7e679938', + 'uploader': 'Benny Johnson', + 'uploader_id': 'bennyjohnson', + 'uploader_url': 'https://twitter.com/bennyjohnson', 'age_limit': 0, 'tags': [], - 'duration': 8.033, - 'timestamp': 1675964711.0, - 'upload_date': '20230209', - 'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+', + 'duration': 45.001, + 'timestamp': 1692962814.0, + 'upload_date': '20230825', + 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', 'like_count': int, - 'view_count': int, 'repost_count': int, + 'view_count': int, 'comment_count': int, }, - 'skip': 'Protected tweet', + }, { + # retweeted_status w/ legacy API + 'url': 'https://twitter.com/playstrumpcard/status/1695424220702888009', + 'info_dict': { + 'id': '1694928337846538240', + 'ext': 'mp4', + 'display_id': '1695424220702888009', + 'title': 'md5:e8daa9527bc2b947121395494f786d9d', + 'description': 'md5:004f2d37fd58737724ec75bc7e679938', + 'uploader': 'Benny Johnson', + 'uploader_id': 'bennyjohnson', + 'uploader_url': 'https://twitter.com/bennyjohnson', + 'age_limit': 0, + 'tags': [], + 'duration': 45.001, + 'timestamp': 1692962814.0, + 'upload_date': '20230825', + 'thumbnail': r're:https://pbs\.twimg\.com/amplify_video_thumb/.+', + 'like_count': int, + 'repost_count': int, + }, + 'params': {'extractor_args': {'twitter': {'api': ['legacy']}}}, + }, { + # Broadcast embedded in tweet + 'url': 'https://twitter.com/JessicaDobsonWX/status/1693057346933600402', + 'info_dict': { + 'id': '1yNGaNLjEblJj', + 'ext': 'mp4', + 'title': 'Jessica Dobson - WAVE Weather Now - Saturday 8/19/23 Update', + 'uploader': 'Jessica Dobson', + 'uploader_id': '1DZEoDwDovRQa', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, + }, + 'add_ie': ['TwitterBroadcast'], + }, { + # Animated gif and quote tweet video, with syndication API + 'url': 'https://twitter.com/BAKKOOONN/status/1696256659889565950', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '1696256659889565950', + 'title': 'BAKOON - https://t.co/zom968d0a0', + 'description': 'https://t.co/zom968d0a0', + 'tags': [], + 'uploader': 'BAKOON', + 'uploader_id': 'BAKKOOONN', + 'uploader_url': 'https://twitter.com/BAKKOOONN', + 'age_limit': 18, + 'timestamp': 1693254077.0, + 
'upload_date': '20230828', + 'like_count': int, + }, + 'params': {'extractor_args': {'twitter': {'api': ['syndication']}}}, + 'expected_warnings': ['Not all metadata'], }, { # onion route 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1103,6 +1163,14 @@ class TwitterIE(TwitterBaseIE): 'only_matching': True, }] + _MEDIA_ID_RE = re.compile(r'_video/(\d+)/') + + @property + def _GRAPHQL_ENDPOINT(self): + if self.is_logged_in: + return 'zZXycP0V6H7m-2r0mOnFcA/TweetDetail' + return '2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId' + def _graphql_to_legacy(self, data, twid): result = traverse_obj(data, ( 'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries', @@ -1130,9 +1198,14 @@ class TwitterIE(TwitterBaseIE): 'user': ('core', 'user_results', 'result', 'legacy'), 'card': ('card', 'legacy'), 'quoted_status': ('quoted_status_result', 'result', 'legacy'), + 'retweeted_status': ('legacy', 'retweeted_status_result', 'result', 'legacy'), }, expected_type=dict, default={})) - # extra transformation is needed since result does not match legacy format + # extra transformations needed since result does not match legacy format + if status.get('retweeted_status'): + status['retweeted_status']['user'] = traverse_obj(status, ( + 'retweeted_status_result', 'result', 'core', 'user_results', 'result', 'legacy', {dict})) or {} + binding_values = { binding_value.get('key'): binding_value.get('value') for binding_value in traverse_obj(status, ('card', 'binding_values', ..., {dict})) @@ -1208,33 +1281,42 @@ class TwitterIE(TwitterBaseIE): } def _extract_status(self, twid): - if self.is_logged_in: - return self._graphql_to_legacy( - self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) - - try: - if not self._configuration_arg('legacy_api'): - return self._graphql_to_legacy( - self._call_graphql_api('2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId', twid), twid) - return traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { + if self.is_logged_in or self._selected_api == 'graphql': + status = self._graphql_to_legacy(self._call_graphql_api(self._GRAPHQL_ENDPOINT, twid), twid) + + elif self._selected_api == 'legacy': + status = self._call_api(f'statuses/show/{twid}.json', twid, { 'cards_platform': 'Web-12', 'include_cards': 1, 'include_reply_count': 1, 'include_user_entities': 0, 'tweet_mode': 'extended', - }), 'retweeted_status', None) + }) - except ExtractorError as e: - if e.expected: - raise + elif self._selected_api == 'syndication': self.report_warning( - f'{e.orig_msg}. 
Falling back to syndication endpoint; some metadata may be missing', twid) + 'Not all metadata or media is available via syndication endpoint', twid, only_once=True) + status = self._download_json( + 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', + headers={'User-Agent': 'Googlebot'}, query={ + 'id': twid, + # TODO: token = ((Number(twid) / 1e15) * Math.PI).toString(36).replace(/(0+|\.)/g, '') + 'token': ''.join(random.choices('123456789abcdefghijklmnopqrstuvwxyz', k=10)), + }) + if not status: + raise ExtractorError('Syndication endpoint returned empty JSON response') + # Transform the result so its structure matches that of legacy/graphql + media = [] + for detail in traverse_obj(status, ((None, 'quoted_tweet'), 'mediaDetails', ..., {dict})): + detail['id_str'] = traverse_obj(detail, ( + 'video_info', 'variants', ..., 'url', {self._MEDIA_ID_RE.search}, 1), get_all=False) or twid + media.append(detail) + status['extended_entities'] = {'media': media} - status = self._download_json( - 'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', - headers={'User-Agent': 'Googlebot'}, query={'id': twid}) - status['extended_entities'] = {'media': status.get('mediaDetails')} - return status + else: + raise ExtractorError(f'"{self._selected_api}" is not a valid API selection', expected=True) + + return traverse_obj(status, 'retweeted_status', None, expected_type=dict) or {} def _real_extract(self, url): twid, selected_index = self._match_valid_url(url).group('id', 'index') @@ -1266,10 +1348,7 @@ class TwitterIE(TwitterBaseIE): } def extract_from_video_info(media): - media_id = traverse_obj(media, 'id_str', 'id', ( - 'video_info', 'variants', ..., 'url', - {functools.partial(re.search, r'_video/(\d+)/')}, 1 - ), get_all=False, expected_type=str_or_none) or twid + media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) self.write_debug(f'Extracting from video info: {media_id}') formats = [] @@ -1503,6 +1582,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): broadcast = self._call_api( 'broadcasts/show.json', broadcast_id, {'ids': broadcast_id})['broadcasts'][broadcast_id] + if not broadcast: + raise ExtractorError('Broadcast no longer exists', expected=True) info = self._parse_broadcast_data(broadcast, broadcast_id) media_key = broadcast['media_key'] source = self._call_api( @@ -1618,6 +1699,7 @@ class TwitterSpacesIE(TwitterBaseIE): is_live = live_status == 'is_live' formats = [] + headers = {'Referer': 'https://twitter.com/'} if live_status == 'is_upcoming': self.raise_no_formats('Twitter Space not started yet', expected=True) elif not is_live and not metadata.get('is_space_available_for_replay'): @@ -1628,7 +1710,7 @@ class TwitterSpacesIE(TwitterBaseIE): ('source', ('noRedirectPlaybackUrl', 'location'), {url_or_none}), get_all=False) formats = self._extract_m3u8_formats( # XXX: Some Spaces need ffmpeg as downloader source, metadata['media_key'], 'm4a', entry_protocol='m3u8', live=is_live, - headers={'Referer': 'https://twitter.com/'}, fatal=False) if source else [] + headers=headers, fatal=False) if source else [] for fmt in formats: fmt.update({'vcodec': 'none', 'acodec': 'aac'}) if not is_live: @@ -1653,6 +1735,7 @@ class TwitterSpacesIE(TwitterBaseIE): lambda: int_or_none(metadata['scheduled_start'], scale=1000)), 'timestamp': int_or_none(metadata.get('created_at'), scale=1000), 'formats': formats, + 'http_headers': headers, } diff --git a/hypervideo_dl/extractor/wdr.py 
b/hypervideo_dl/extractor/wdr.py index de5dc26..6767f26 100644 --- a/hypervideo_dl/extractor/wdr.py +++ b/hypervideo_dl/extractor/wdr.py @@ -173,6 +173,7 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE 'skip': 'HTTP Error 404: Not Found', }, { + # FIXME: Asset JSON is directly embedded in webpage 'url': 'http://www1.wdr.de/mediathek/video/live/index.html', 'info_dict': { 'id': 'mdb-2296252', @@ -221,6 +222,8 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE 'id': 'mdb-869971', 'ext': 'mp4', 'title': r're:^COSMO Livestream [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'alt_title': 'COSMO Livestream', + 'live_status': 'is_live', 'upload_date': '20160101', }, 'params': { @@ -248,6 +251,16 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE 'url': 'https://kinder.wdr.de/tv/die-sendung-mit-dem-elefanten/av/video-folge---astronaut-100.html', 'only_matching': True, }, + { + 'url': 'https://www1.wdr.de/mediathek/video/sendungen/rockpalast/video-baroness---freak-valley-festival--100.html', + 'info_dict': { + 'id': 'mdb-2741028', + 'ext': 'mp4', + 'title': 'Baroness - Freak Valley Festival 2022', + 'alt_title': 'Rockpalast', + 'upload_date': '20220725', + }, + } ] def _real_extract(self, url): @@ -259,7 +272,7 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE # Article with several videos - # for wdr.de the data-extension is in a tag with the class "mediaLink" + # for wdr.de the data-extension-ard is in a tag with the class "mediaLink" # for wdr.de radio players, in a tag with the class "wdrrPlayerPlayBtn" # for wdrmaus, in a tag with the class "videoButton" (previously a link # to the page in a multiline "videoLink"-tag) @@ -268,7 +281,7 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE (?: (["\'])(?:mediaLink|wdrrPlayerPlayBtn|videoButton)\b.*?\1[^>]+| (["\'])videoLink\b.*?\2[\s]*>\n[^\n]* - )data-extension=(["\'])(?P(?:(?!\3).)+)\3 + )data-extension(?:-ard)?=(["\'])(?P(?:(?!\3).)+)\3 ''', webpage): media_link_obj = self._parse_json( mobj.group('data'), display_id, transform_source=js_to_json, @@ -295,7 +308,7 @@ class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE compat_urlparse.urljoin(url, mobj.group('href')), ie=WDRPageIE.ie_key()) for mobj in re.finditer( - r']+\bhref=(["\'])(?P(?:(?!\1).)+)\1[^>]+\bdata-extension=', + r']+\bhref=(["\'])(?P(?:(?!\1).)+)\1[^>]+\bdata-extension(?:-ard)?=', webpage) if re.match(self._PAGE_REGEX, mobj.group('href')) ] -- cgit v1.2.3
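The Twitter hunks above replace the boolean `legacy_api` extractor argument with a three-way `api` selector (`graphql`, `legacy` or `syndication`, per the README change and `_selected_api`). Below is a minimal sketch of passing the new argument through the embedding API; it reuses the `extractor_args` structure and tweet URL from the patched tests, while the top-level `hypervideo_dl.YoutubeDL` import is an assumption based on the `__init__.py` hunk above, not something this patch adds.

    # Sketch only: select the Twitter tweet-extraction API introduced by this patch.
    # Assumes hypervideo_dl exposes YoutubeDL at package level, as __init__.py above suggests.
    from hypervideo_dl import YoutubeDL

    ydl_opts = {
        # Same structure as the test params in twitter.py:
        # 'params': {'extractor_args': {'twitter': {'api': ['legacy']}}}
        'extractor_args': {'twitter': {'api': ['legacy']}},  # or 'graphql' (default) / 'syndication'
        'skip_download': True,  # probe metadata only, do not download media
    }

    with YoutubeDL(ydl_opts) as ydl:
        # URL taken from the retweeted_status test case added in the twitter.py hunk
        info = ydl.extract_info(
            'https://twitter.com/playstrumpcard/status/1695424220702888009', download=False)
        print(info.get('id'), info.get('title'))

On the command line the equivalent selection would presumably be `--extractor-args "twitter:api=legacy"`, matching the README entry updated at the top of this patch.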