diff options
32 files changed, 355 insertions, 876 deletions
diff --git a/supportedsites.md b/supportedsites.md index 7c4b9bee9..7166dc53a 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -3,7 +3,6 @@ - **17live:clip** - **1tv**: Первый канал - **20min** - - **220.ro** - **23video** - **247sports** - **24video** @@ -11,7 +10,6 @@ - **3sat** - **4tube** - **56.com** - - **5min** - **6play** - **7plus** - **8tracks** @@ -381,7 +379,6 @@ - **FranceTVSite** - **Freesound** - **freespeech.org** - - **FreshLive** - **FrontendMasters** - **FrontendMastersCourse** - **FrontendMastersLesson** @@ -454,7 +451,6 @@ - **hitbox:live** - **HitRecord** - **hketv**: 香港教育局教育電視 (HKETV) Educational Television, Hong Kong Educational Bureau - - **HornBunny** - **HotNewHipHop** - **hotstar** - **hotstar:playlist** @@ -499,7 +495,6 @@ - **iq.com**: International version of iQiyi - **iq.com:album** - **iqiyi**: 爱奇艺 - - **Ir90Tv** - **ITTF** - **ITV** - **ITVBTCC** @@ -516,7 +511,6 @@ - **JWPlatform** - **Kakao** - **Kaltura** - - **Kankan** - **Karaoketv** - **KarriereVideos** - **Katsomo** @@ -989,7 +983,6 @@ - **RoosterTeeth** - **RoosterTeethSeries** - **RottenTomatoes** - - **Roxwel** - **Rozhlas** - **RTBF** - **RTDocumentry** @@ -1181,7 +1174,6 @@ - **TheIntercept** - **ThePlatform** - **ThePlatformFeed** - - **TheScene** - **TheStar** - **TheSun** - **ThetaStream** @@ -1388,7 +1380,6 @@ - **VShare** - **VTM** - **VTXTV** - - **vube**: Vube.com - **VuClip** - **Vupload** - **VVVVID** diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index ac45a5160..4438e40e9 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1841,15 +1841,21 @@ class YoutubeDL(object): '^=': lambda attr, value: attr.startswith(value), '$=': lambda attr, value: attr.endswith(value), '*=': lambda attr, value: value in attr, + '~=': lambda attr, value: value.search(attr) is not None } str_operator_rex = re.compile(r'''(?x)\s* (?P<key>[a-zA-Z0-9._-]+)\s* - (?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* - (?P<value>[a-zA-Z0-9._-]+)\s* + (?P<negation>!\s*)?(?P<op>%s)\s*(?P<none_inclusive>\?\s*)? + (?P<quote>["'])? + (?P<value>(?(quote)(?:(?!(?P=quote))[^\\]|\\.)+|[\w.-]+)) + (?(quote)(?P=quote))\s* ''' % '|'.join(map(re.escape, STR_OPERATORS.keys()))) m = str_operator_rex.fullmatch(filter_spec) if m: - comparison_value = m.group('value') + if m.group('op') == '~=': + comparison_value = re.compile(m.group('value')) + else: + comparison_value = re.sub(r'''\\([\\"'])''', r'\1', m.group('value')) str_op = STR_OPERATORS[m.group('op')] if m.group('negation'): op = lambda attr, value: not str_op(attr, value) @@ -3859,7 +3865,7 @@ class YoutubeDL(object): else: self.to_screen(f'[info] Downloading {thumb_display_id} ...') try: - uf = self.urlopen(t['url']) + uf = self.urlopen(sanitized_Request(t['url'], headers=t.get('http_headers', {}))) self.to_screen(f'[info] Writing {thumb_display_id} to: {thumb_filename}') with open(encodeFilename(thumb_filename), 'wb') as thumbf: shutil.copyfileobj(uf, thumbf) diff --git a/yt_dlp/downloader/__init__.py b/yt_dlp/downloader/__init__.py index acc19f43a..76841993b 100644 --- a/yt_dlp/downloader/__init__.py +++ b/yt_dlp/downloader/__init__.py @@ -117,7 +117,7 @@ def _get_suitable_downloader(info_dict, protocol, params, default): return FFmpegFD elif (external_downloader or '').lower() == 'native': return HlsFD - elif get_suitable_downloader( + elif protocol == 'm3u8_native' and get_suitable_downloader( info_dict, params, None, protocol='m3u8_frag_urls', to_stdout=info_dict['to_stdout']): return HlsFD elif params.get('hls_prefer_native') is True: diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index 80853487e..f25fc47fa 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -416,26 +416,35 @@ class AfreecaTVLiveIE(AfreecaTVIE): def _real_extract(self, url): broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno') + password = self.get_param('videopassword') info = self._download_json(self._LIVE_API_URL, broadcaster_id, fatal=False, data=urlencode_postdata({'bid': broadcaster_id})) or {} channel_info = info.get('CHANNEL') or {} broadcaster_id = channel_info.get('BJID') or broadcaster_id broadcast_no = channel_info.get('BNO') or broadcast_no + password_protected = channel_info.get('BPWD') if not broadcast_no: raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True) + if password_protected == 'Y' and password is None: + raise ExtractorError( + 'This livestream is protected by a password, use the --video-password option', + expected=True) formats = [] quality_key = qualities(self._QUALITIES) for quality_str in self._QUALITIES: + params = { + 'bno': broadcast_no, + 'stream_type': 'common', + 'type': 'aid', + 'quality': quality_str, + } + if password is not None: + params['pwd'] = password aid_response = self._download_json( self._LIVE_API_URL, broadcast_no, fatal=False, - data=urlencode_postdata({ - 'bno': broadcast_no, - 'stream_type': 'common', - 'type': 'aid', - 'quality': quality_str, - }), + data=urlencode_postdata(params), note=f'Downloading access token for {quality_str} stream', errnote=f'Unable to download access token for {quality_str} stream') aid = traverse_obj(aid_response, ('CHANNEL', 'AID')) diff --git a/yt_dlp/extractor/beeg.py b/yt_dlp/extractor/beeg.py index 8fbabe708..717fff3a6 100644 --- a/yt_dlp/extractor/beeg.py +++ b/yt_dlp/extractor/beeg.py @@ -1,32 +1,45 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_str, -) + from ..utils import ( int_or_none, - parse_qs, + traverse_obj, + try_get, unified_timestamp, ) class BeegIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com|porn(?:/video)?)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?beeg\.(?:com(?:/video)?)/-?(?P<id>\d+)' _TESTS = [{ - # api/v6 v1 - 'url': 'http://beeg.com/5416503', - 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820', + 'url': 'https://beeg.com/-0983946056129650', + 'md5': '51d235147c4627cfce884f844293ff88', 'info_dict': { - 'id': '5416503', + 'id': '0983946056129650', 'ext': 'mp4', - 'title': 'Sultry Striptease', - 'description': 'md5:d22219c09da287c14bed3d6c37ce4bc2', - 'timestamp': 1391813355, - 'upload_date': '20140207', - 'duration': 383, + 'title': 'sucked cock and fucked in a private plane', + 'duration': 927, 'tags': list, 'age_limit': 18, + 'upload_date': '20220131', + 'timestamp': 1643656455, + 'display_id': 2540839, + } + }, { + 'url': 'https://beeg.com/-0599050563103750?t=4-861', + 'md5': 'bd8b5ea75134f7f07fad63008db2060e', + 'info_dict': { + 'id': '0599050563103750', + 'ext': 'mp4', + 'title': 'Bad Relatives', + 'duration': 2060, + 'tags': list, + 'age_limit': 18, + 'description': 'md5:b4fc879a58ae6c604f8f259155b7e3b9', + 'timestamp': 1643623200, + 'display_id': 2569965, + 'upload_date': '20220131', } }, { # api/v6 v2 @@ -36,12 +49,6 @@ class BeegIE(InfoExtractor): # api/v6 v2 w/o t 'url': 'https://beeg.com/1277207756', 'only_matching': True, - }, { - 'url': 'https://beeg.porn/video/5416503', - 'only_matching': True, - }, { - 'url': 'https://beeg.porn/5416503', - 'only_matching': True, }] def _real_extract(self, url): @@ -49,68 +56,38 @@ class BeegIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - beeg_version = self._search_regex( - r'beeg_version\s*=\s*([\da-zA-Z_-]+)', webpage, 'beeg version', - default='1546225636701') + video = self._download_json( + 'https://store.externulls.com/facts/file/%s' % video_id, + video_id, 'Downloading JSON for %s' % video_id) - if len(video_id) >= 10: - query = { - 'v': 2, - } - qs = parse_qs(url) - t = qs.get('t', [''])[0].split('-') - if len(t) > 1: - query.update({ - 's': t[0], - 'e': t[1], - }) - else: - query = {'v': 1} + fc_facts = video.get('fc_facts') + first_fact = {} + for fact in fc_facts: + if not first_fact or try_get(fact, lambda x: x['id'] < first_fact['id']): + first_fact = fact - for api_path in ('', 'api.'): - video = self._download_json( - 'https://%sbeeg.com/api/v6/%s/video/%s' - % (api_path, beeg_version, video_id), video_id, - fatal=api_path == 'api.', query=query) - if video: - break + resources = traverse_obj(video, ('file', 'hls_resources')) or first_fact.get('hls_resources') formats = [] - for format_id, video_url in video.items(): - if not video_url: - continue - height = self._search_regex( - r'^(\d+)[pP]$', format_id, 'height', default=None) - if not height: + for format_id, video_uri in resources.items(): + if not video_uri: continue - formats.append({ - 'url': self._proto_relative_url( - video_url.replace('{DATA_MARKERS}', 'data=pc_XX__%s_0' % beeg_version), 'https:'), - 'format_id': format_id, - 'height': int(height), - }) - self._sort_formats(formats) - - title = video['title'] - video_id = compat_str(video.get('id') or video_id) - display_id = video.get('code') - description = video.get('desc') - series = video.get('ps_name') + height = int_or_none(self._search_regex(r'fl_cdn_(\d+)', format_id, 'height', default=None)) + current_formats = self._extract_m3u8_formats(f'https://video.beeg.com/{video_uri}', video_id, ext='mp4', m3u8_id=str(height)) + for f in current_formats: + f['height'] = height + formats.extend(current_formats) - timestamp = unified_timestamp(video.get('date')) - duration = int_or_none(video.get('duration')) - - tags = [tag.strip() for tag in video['tags'].split(',')] if video.get('tags') else None + self._sort_formats(formats) return { 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'series': series, - 'timestamp': timestamp, - 'duration': duration, - 'tags': tags, + 'display_id': first_fact.get('id'), + 'title': traverse_obj(video, ('file', 'stuff', 'sf_name')), + 'description': traverse_obj(video, ('file', 'stuff', 'sf_story')), + 'timestamp': unified_timestamp(first_fact.get('fc_created')), + 'duration': int_or_none(traverse_obj(video, ('file', 'fl_duration'))), + 'tags': traverse_obj(video, ('tags', ..., 'tg_name')), 'formats': formats, 'age_limit': self._rta_search(webpage), } diff --git a/yt_dlp/extractor/bigo.py b/yt_dlp/extractor/bigo.py new file mode 100644 index 000000000..6e38ecc1d --- /dev/null +++ b/yt_dlp/extractor/bigo.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, urlencode_postdata + + +class BigoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bigo\.tv/(?:[a-z]{2,}/)?(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'https://www.bigo.tv/ja/221338632', + 'info_dict': { + 'id': '6576287577575737440', + 'title': '土よ〜💁♂️ 休憩室/REST room', + 'thumbnail': r're:https?://.+', + 'uploader': '✨Shin💫', + 'uploader_id': '221338632', + 'is_live': True, + }, + 'skip': 'livestream', + }, { + 'url': 'https://www.bigo.tv/th/Tarlerm1304', + 'only_matching': True, + }, { + 'url': 'https://bigo.tv/115976881', + 'only_matching': True, + }] + + def _real_extract(self, url): + user_id = self._match_id(url) + + info_raw = self._download_json( + 'https://bigo.tv/studio/getInternalStudioInfo', + user_id, data=urlencode_postdata({'siteId': user_id})) + + if info_raw.get('code'): + raise ExtractorError( + f'{info_raw["msg"]} (code {info_raw["code"]})', expected=True) + info = info_raw.get('data') or {} + + if not info.get('alive'): + raise ExtractorError('This user is offline.', expected=True) + + return { + 'id': info.get('roomId') or user_id, + 'title': info.get('roomTopic'), + 'formats': [{ + 'url': info.get('hls_src'), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'thumbnail': info.get('snapshot'), + 'uploader': info.get('nick_name'), + 'uploader_id': user_id, + 'is_live': True, + } diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index ac9e28560..37c8be5f6 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -239,6 +239,7 @@ class InfoExtractor(object): * "resolution" (optional, string "{width}x{height}", deprecated) * "filesize" (optional, int) + * "http_headers" (dict) - HTTP headers for the request thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. @@ -272,6 +273,8 @@ class InfoExtractor(object): * "url": A URL pointing to the subtitles file It can optionally also have: * "name": Name or description of the subtitles + * http_headers: A dictionary of additional HTTP headers + to add to the request. "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles'; contains automatically generated captions instead of normal subtitles @@ -3108,7 +3111,7 @@ class InfoExtractor(object): }) return formats, subtitles - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None, quality=None): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8_native', mpd_id=None, preference=None, quality=None): def absolute_url(item_url): return urljoin(base_url, item_url) diff --git a/yt_dlp/extractor/engadget.py b/yt_dlp/extractor/engadget.py index 65635c18b..733bf322f 100644 --- a/yt_dlp/extractor/engadget.py +++ b/yt_dlp/extractor/engadget.py @@ -7,16 +7,6 @@ class EngadgetIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?engadget\.com/video/(?P<id>[^/?#]+)' _TESTS = [{ - # video with 5min ID - 'url': 'http://www.engadget.com/video/518153925/', - 'md5': 'c6820d4828a5064447a4d9fc73f312c9', - 'info_dict': { - 'id': '518153925', - 'ext': 'mp4', - 'title': 'Samsung Galaxy Tab Pro 8.4 Review', - }, - 'add_ie': ['FiveMin'], - }, { # video with vidible ID 'url': 'https://www.engadget.com/video/57a28462134aa15a39f0421a/', 'only_matching': True, diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index bd514f958..6af32451f 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -142,6 +142,7 @@ from .bfmtv import ( ) from .bibeltv import BibelTVIE from .bigflix import BigflixIE +from .bigo import BigoIE from .bild import BildIE from .bilibili import ( BiliBiliIE, @@ -481,7 +482,6 @@ from .filmon import ( ) from .filmweb import FilmwebIE from .firsttv import FirstTVIE -from .fivemin import FiveMinIE from .fivetv import FiveTVIE from .flickr import FlickrIE from .folketinget import FolketingetIE @@ -513,7 +513,6 @@ from .francetv import ( ) from .freesound import FreesoundIE from .freespeech import FreespeechIE -from .freshlive import FreshLiveIE from .frontendmasters import ( FrontendMastersIE, FrontendMastersLessonIE, @@ -548,7 +547,10 @@ from .gazeta import GazetaIE from .gdcvault import GDCVaultIE from .gedidigital import GediDigitalIE from .generic import GenericIE -from .gettr import GettrIE +from .gettr import ( + GettrIE, + GettrStreamingIE, +) from .gfycat import GfycatIE from .giantbomb import GiantBombIE from .giga import GigaIE @@ -585,7 +587,6 @@ from .hidive import HiDiveIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hitrecord import HitRecordIE -from .hornbunny import HornBunnyIE from .hotnewhiphop import HotNewHipHopIE from .hotstar import ( HotStarIE, @@ -655,7 +656,6 @@ from .iqiyi import ( IqIE, IqAlbumIE ) -from .ir90tv import Ir90TvIE from .itv import ( ITVIE, ITVBTCCIE, @@ -677,7 +677,6 @@ from .joj import JojIE from .jwplatform import JWPlatformIE from .kakao import KakaoIE from .kaltura import KalturaIE -from .kankan import KankanIE from .karaoketv import KaraoketvIE from .karrierevideos import KarriereVideosIE from .keezmovies import KeezMoviesIE @@ -1320,11 +1319,9 @@ from .reuters import ReutersIE from .reverbnation import ReverbNationIE from .rice import RICEIE from .rmcdecouverte import RMCDecouverteIE -from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE from .rottentomatoes import RottenTomatoesIE -from .roxwel import RoxwelIE from .rozhlas import RozhlasIE from .rtbf import RTBFIE from .rte import RteIE, RteRadioIE @@ -1595,7 +1592,6 @@ from .theplatform import ( ThePlatformIE, ThePlatformFeedIE, ) -from .thescene import TheSceneIE from .thestar import TheStarIE from .thesun import TheSunIE from .theta import ( @@ -1900,7 +1896,6 @@ from .vrv import ( from .vshare import VShareIE from .vtm import VTMIE from .medialaan import MedialaanIE -from .vube import VubeIE from .vuclip import VuClipIE from .vupload import VuploadIE from .vvvvid import ( diff --git a/yt_dlp/extractor/fivemin.py b/yt_dlp/extractor/fivemin.py deleted file mode 100644 index f3f876ecd..000000000 --- a/yt_dlp/extractor/fivemin.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class FiveMinIE(InfoExtractor): - IE_NAME = '5min' - _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com/|delivery\.vidible\.tv/aol)(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P<id>\d+)' - - _TESTS = [ - { - # From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/ - 'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791', - 'md5': '4f7b0b79bf1a470e5004f7112385941d', - 'info_dict': { - 'id': '518013791', - 'ext': 'mp4', - 'title': 'iPad Mini with Retina Display Review', - 'description': 'iPad mini with Retina Display review', - 'duration': 177, - 'uploader': 'engadget', - 'upload_date': '20131115', - 'timestamp': 1384515288, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, - { - # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247 - 'url': '5min:518086247', - 'md5': 'e539a9dd682c288ef5a498898009f69e', - 'info_dict': { - 'id': '518086247', - 'ext': 'mp4', - 'title': 'How to Make a Next-Level Fruit Salad', - 'duration': 184, - }, - 'skip': 'no longer available', - }, - { - 'url': 'http://embed.5min.com/518726732/', - 'only_matching': True, - }, - { - 'url': 'http://delivery.vidible.tv/aol?playList=518013791', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result('aol-video:%s' % video_id) diff --git a/yt_dlp/extractor/freshlive.py b/yt_dlp/extractor/freshlive.py deleted file mode 100644 index ad19b8109..000000000 --- a/yt_dlp/extractor/freshlive.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - try_get, - unified_timestamp, -) - - -class FreshLiveIE(InfoExtractor): - _VALID_URL = r'https?://freshlive\.tv/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'https://freshlive.tv/satotv/74712', - 'md5': '9f0cf5516979c4454ce982df3d97f352', - 'info_dict': { - 'id': '74712', - 'ext': 'mp4', - 'title': 'テスト', - 'description': 'テスト', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1511, - 'timestamp': 1483619655, - 'upload_date': '20170105', - 'uploader': 'サトTV', - 'uploader_id': 'satotv', - 'view_count': int, - 'comment_count': int, - 'is_live': False, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - options = self._parse_json( - self._search_regex( - r'window\.__CONTEXT__\s*=\s*({.+?});\s*</script>', - webpage, 'initial context'), - video_id) - - info = options['context']['dispatcher']['stores']['ProgramStore']['programs'][video_id] - - title = info['title'] - - if info.get('status') == 'upcoming': - raise ExtractorError('Stream %s is upcoming' % video_id, expected=True) - - stream_url = info.get('liveStreamUrl') or info['archiveStreamUrl'] - - is_live = info.get('liveStreamUrl') is not None - - formats = self._extract_m3u8_formats( - stream_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls') - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': info.get('description'), - 'thumbnail': info.get('thumbnailUrl'), - 'duration': int_or_none(info.get('airTime')), - 'timestamp': unified_timestamp(info.get('createdAt')), - 'uploader': try_get( - info, lambda x: x['channel']['title'], compat_str), - 'uploader_id': try_get( - info, lambda x: x['channel']['code'], compat_str), - 'uploader_url': try_get( - info, lambda x: x['channel']['permalink'], compat_str), - 'view_count': int_or_none(info.get('viewCount')), - 'comment_count': int_or_none(info.get('commentCount')), - 'tags': info.get('tags', []), - 'is_live': is_live, - } diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 2b59d076f..baedd7948 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -213,7 +213,7 @@ class GenericIE(InfoExtractor): { 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', 'info_dict': { - 'id': 'http://phihag.de/2014/youtube-dl/rss2.xml', + 'id': 'https://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', 'description': 're:.*groundbreaking video review series.*' }, @@ -258,6 +258,9 @@ class GenericIE(InfoExtractor): 'episode_number': 1, 'season_number': 1, 'age_limit': 0, + 'season': 'Season 1', + 'direct': True, + 'episode': 'Episode 1', }, }], 'params': { @@ -274,6 +277,16 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 100, }, + # RSS feed with guid + { + 'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', + 'info_dict': { + 'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', + 'description': 'md5:be809a44b63b0c56fb485caf68685520', + 'title': 'The Little Red Podcast', + }, + 'playlist_mincount': 76, + }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', @@ -1456,24 +1469,6 @@ class GenericIE(InfoExtractor): 'duration': 45.115, }, }, - # 5min embed - { - 'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/', - 'md5': '4c6f127a30736b59b3e2c19234ee2bf7', - 'info_dict': { - 'id': '518726732', - 'ext': 'mp4', - 'title': 'Facebook Creates "On This Day" | Crunch Report', - 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild', - 'timestamp': 1427237531, - 'uploader': 'Crunch Report', - 'upload_date': '20150324', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, # Crooks and Liars embed { 'url': 'http://crooksandliars.com/2015/04/fox-friends-says-protecting-atheists', @@ -2536,6 +2531,9 @@ class GenericIE(InfoExtractor): if not next_url: continue + if it.find('guid').text is not None: + next_url = smuggle_url(next_url, {'force_videoid': it.find('guid').text}) + def itunes(key): return xpath_text( it, xpath_with_ns('./itunes:%s' % key, NS_MAP), @@ -3337,12 +3335,6 @@ class GenericIE(InfoExtractor): if mobj is not None: return self.url_result(mobj.group('url')) - # Look for 5min embeds - mobj = re.search( - r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage) - if mobj is not None: - return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin') - # Look for Crooks and Liars embeds mobj = re.search( r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage) diff --git a/yt_dlp/extractor/gettr.py b/yt_dlp/extractor/gettr.py index aa50b2f35..179bd7c47 100644 --- a/yt_dlp/extractor/gettr.py +++ b/yt_dlp/extractor/gettr.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( + bool_or_none, ExtractorError, dict_get, float_or_none, @@ -15,10 +16,17 @@ from ..utils import ( ) -class GettrIE(InfoExtractor): - _VALID_URL = r'https?://(www\.)?gettr\.com/post/(?P<id>[a-z0-9]+)' +class GettrBaseIE(InfoExtractor): + _BASE_REGEX = r'https?://(www\.)?gettr\.com/' _MEDIA_BASE_URL = 'https://media.gettr.com/' + def _call_api(self, path, video_id, *args, **kwargs): + return self._download_json(urljoin('https://api.gettr.com/u/', path), video_id, *args, **kwargs)['result'] + + +class GettrIE(GettrBaseIE): + _VALID_URL = GettrBaseIE._BASE_REGEX + r'post/(?P<id>[a-z0-9]+)' + _TESTS = [{ 'url': 'https://www.gettr.com/post/pcf6uv838f', 'info_dict': { @@ -51,11 +59,10 @@ class GettrIE(InfoExtractor): post_id = self._match_id(url) webpage = self._download_webpage(url, post_id) - api_data = self._download_json( - 'https://api.gettr.com/u/post/%s?incl="poststats|userinfo"' % post_id, post_id) + api_data = self._call_api('post/%s?incl="poststats|userinfo"' % post_id, post_id) - post_data = try_get(api_data, lambda x: x['result']['data']) - user_data = try_get(api_data, lambda x: x['result']['aux']['uinf'][post_data['uid']]) or {} + post_data = api_data.get('data') + user_data = try_get(api_data, lambda x: x['aux']['uinf'][post_data['uid']]) or {} if post_data.get('nfound'): raise ExtractorError(post_data.get('txt'), expected=True) @@ -108,3 +115,71 @@ class GettrIE(InfoExtractor): 'duration': float_or_none(post_data.get('vid_dur')), 'tags': post_data.get('htgs'), } + + +class GettrStreamingIE(GettrBaseIE): + _VALID_URL = GettrBaseIE._BASE_REGEX + r'streaming/(?P<id>[a-z0-9]+)' + + _TESTS = [{ + 'url': 'https://gettr.com/streaming/psoiulc122', + 'info_dict': { + 'id': 'psoiulc122', + 'ext': 'mp4', + 'description': 'md5:56bca4b8f48f1743d9fd03d49c723017', + 'view_count': int, + 'uploader': 'Corona Investigative Committee', + 'uploader_id': 'coronacommittee', + 'duration': 5180.184, + 'thumbnail': r're:^https?://.+', + 'title': 'Day 1: Opening Session of the Grand Jury Proceeding', + 'timestamp': 1644080997.164, + 'upload_date': '20220205', + } + }, { + 'url': 'https://gettr.com/streaming/psfmeefcc1', + 'info_dict': { + 'id': 'psfmeefcc1', + 'ext': 'mp4', + 'title': 'Session 90: "The Virus Of Power"', + 'view_count': int, + 'uploader_id': 'coronacommittee', + 'description': 'md5:98986acdf656aa836bf36f9c9704c65b', + 'uploader': 'Corona Investigative Committee', + 'thumbnail': r're:^https?://.+', + 'duration': 21872.507, + 'timestamp': 1643976662.858, + 'upload_date': '20220204', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video_info = self._call_api('live/join/%s' % video_id, video_id, data={}) + + live_info = video_info['broadcast'] + live_url = url_or_none(live_info.get('url')) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + live_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if live_url else ([], {}) + + thumbnails = [{ + 'url': urljoin(self._MEDIA_BASE_URL, thumbnail), + } for thumbnail in try_get(video_info, lambda x: x['postData']['imgs']) or []] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': try_get(video_info, lambda x: x['postData']['ttl']), + 'description': try_get(video_info, lambda x: x['postData']['dsc']), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'uploader': try_get(video_info, lambda x: x['liveHostInfo']['nickname']), + 'uploader_id': try_get(video_info, lambda x: x['liveHostInfo']['_id']), + 'view_count': int_or_none(live_info.get('viewsCount')), + 'timestamp': float_or_none(live_info.get('startAt'), scale=1000), + 'duration': float_or_none(live_info.get('duration'), scale=1000), + 'is_live': bool_or_none(live_info.get('isLive')), + } diff --git a/yt_dlp/extractor/globo.py b/yt_dlp/extractor/globo.py index 8624a160a..f6aaae1e9 100644 --- a/yt_dlp/extractor/globo.py +++ b/yt_dlp/extractor/globo.py @@ -139,11 +139,11 @@ class GloboIE(InfoExtractor): resource_url = source['scheme'] + '://' + source['domain'] + source['path'] signed_url = '%s?h=%s&k=html5&a=%s' % (resource_url, signed_hash, 'F' if video.get('subscriber_only') else 'A') - formats.extend(self._extract_m3u8_formats( - signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + fmts, subtitles = self._extract_m3u8_formats_and_subtitles( + signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + formats.extend(fmts) self._sort_formats(formats) - subtitles = {} for resource in video['resources']: if resource.get('type') == 'subtitle': subtitles.setdefault(resource.get('language') or 'por', []).append({ @@ -186,6 +186,7 @@ class GloboArticleIE(InfoExtractor): r'\bvideosIDs\s*:\s*["\']?(\d{7,})', r'\bdata-id=["\'](\d{7,})', r'<div[^>]+\bid=["\'](\d{7,})', + r'<bs-player[^>]+\bvideoid=["\'](\d{8,})', ] _TESTS = [{ @@ -213,6 +214,14 @@ class GloboArticleIE(InfoExtractor): }, { 'url': 'http://oglobo.globo.com/rio/a-amizade-entre-um-entregador-de-farmacia-um-piano-19946271', 'only_matching': True, + }, { + 'url': 'https://ge.globo.com/video/ta-na-area-como-foi-assistir-ao-jogo-do-palmeiras-que-a-globo-nao-passou-10287094.ghtml', + 'info_dict': { + 'id': 'ta-na-area-como-foi-assistir-ao-jogo-do-palmeiras-que-a-globo-nao-passou-10287094', + 'title': 'Tá na Área: como foi assistir ao jogo do Palmeiras que a Globo não passou', + 'description': 'md5:2d089d036c4c9675117d3a56f8c61739', + }, + 'playlist_count': 1, }] @classmethod @@ -228,6 +237,6 @@ class GloboArticleIE(InfoExtractor): entries = [ self.url_result('globo:%s' % video_id, GloboIE.ie_key()) for video_id in orderedSet(video_ids)] - title = self._og_search_title(webpage, fatal=False) + title = self._og_search_title(webpage) description = self._html_search_meta('description', webpage) return self.playlist_result(entries, display_id, title, description) diff --git a/yt_dlp/extractor/hornbunny.py b/yt_dlp/extractor/hornbunny.py deleted file mode 100644 index c458a959d..000000000 --- a/yt_dlp/extractor/hornbunny.py +++ /dev/null @@ -1,49 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, -) - - -class HornBunnyIE(InfoExtractor): - _VALID_URL = r'http?://(?:www\.)?hornbunny\.com/videos/(?P<title_dash>[a-z-]+)-(?P<id>\d+)\.html' - _TEST = { - 'url': 'http://hornbunny.com/videos/panty-slut-jerk-off-instruction-5227.html', - 'md5': 'e20fd862d1894b67564c96f180f43924', - 'info_dict': { - 'id': '5227', - 'ext': 'mp4', - 'title': 'panty slut jerk off instruction', - 'duration': 550, - 'age_limit': 18, - 'view_count': int, - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) - info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] - - duration = parse_duration(self._search_regex( - r'<strong>Runtime:</strong>\s*([0-9:]+)</div>', - webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - r'<strong>Views:</strong>\s*(\d+)</div>', - webpage, 'view count', fatal=False)) - - info_dict.update({ - 'id': video_id, - 'title': title, - 'duration': duration, - 'view_count': view_count, - 'age_limit': 18, - }) - - return info_dict diff --git a/yt_dlp/extractor/huffpost.py b/yt_dlp/extractor/huffpost.py index 97e36f056..54385bafa 100644 --- a/yt_dlp/extractor/huffpost.py +++ b/yt_dlp/extractor/huffpost.py @@ -80,9 +80,6 @@ class HuffPostIE(InfoExtractor): 'vcodec': 'none' if key.startswith('audio/') else None, }) - if not formats and data.get('fivemin_id'): - return self.url_result('5min:%s' % data['fivemin_id']) - self._sort_formats(formats) return { diff --git a/yt_dlp/extractor/ir90tv.py b/yt_dlp/extractor/ir90tv.py deleted file mode 100644 index d5a3f6fa5..000000000 --- a/yt_dlp/extractor/ir90tv.py +++ /dev/null @@ -1,42 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import remove_start - - -class Ir90TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P<id>[0-9]+)/.*' - _TESTS = [{ - 'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', - 'md5': '411dbd94891381960cb9e13daa47a869', - 'info_dict': { - 'id': '95719', - 'ext': 'mp4', - 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - 'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = remove_start(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), '90tv.ir :: ') - - video_url = self._search_regex( - r'<source[^>]+src="([^"]+)"', webpage, 'video url') - - thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False) - - return { - 'url': video_url, - 'id': video_id, - 'title': title, - 'video_url': video_url, - 'thumbnail': thumbnail, - } diff --git a/yt_dlp/extractor/kaltura.py b/yt_dlp/extractor/kaltura.py index c58216458..f6dfc9caa 100644 --- a/yt_dlp/extractor/kaltura.py +++ b/yt_dlp/extractor/kaltura.py @@ -301,6 +301,7 @@ class KalturaIE(InfoExtractor): data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url) formats = [] + subtitles = {} for f in flavor_assets: # Continue if asset is not ready if f.get('status') != 2: @@ -344,13 +345,14 @@ class KalturaIE(InfoExtractor): if '/playManifest/' in data_url: m3u8_url = sign_url(data_url.replace( 'format/url', 'format/applehttp')) - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( m3u8_url, entry_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) + m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) self._sort_formats(formats) - subtitles = {} if captions: for caption in captions.get('objects', []): # Continue if caption is not ready diff --git a/yt_dlp/extractor/kankan.py b/yt_dlp/extractor/kankan.py deleted file mode 100644 index a677ff447..000000000 --- a/yt_dlp/extractor/kankan.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import unicode_literals - -import re -import hashlib - -from .common import InfoExtractor - -_md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() - - -class KankanIE(InfoExtractor): - _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml' - - _TEST = { - 'url': 'http://yinyue.kankan.com/vod/48/48863.shtml', - 'md5': '29aca1e47ae68fc28804aca89f29507e', - 'info_dict': { - 'id': '48863', - 'ext': 'flv', - 'title': 'Ready To Go', - }, - 'skip': 'Only available from China', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, 'video title') - surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0) - gcids = re.findall(r'http://.+?/.+?/(.+?)/', surls) - gcid = gcids[-1] - - info_url = 'http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid - video_info_page = self._download_webpage( - info_url, video_id, 'Downloading video url info') - ip = self._search_regex(r'ip:"(.+?)"', video_info_page, 'video url ip') - path = self._search_regex(r'path:"(.+?)"', video_info_page, 'video url path') - param1 = self._search_regex(r'param1:(\d+)', video_info_page, 'param1') - param2 = self._search_regex(r'param2:(\d+)', video_info_page, 'param2') - key = _md5('xl_mp43651' + param1 + param2) - video_url = 'http://%s%s?key=%s&key1=%s' % (ip, path, key, param2) - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - } diff --git a/yt_dlp/extractor/openrec.py b/yt_dlp/extractor/openrec.py index 7cdc7d17c..0525b4830 100644 --- a/yt_dlp/extractor/openrec.py +++ b/yt_dlp/extractor/openrec.py @@ -42,8 +42,7 @@ class OpenRecBaseIE(InfoExtractor): if not m3u8_url: continue formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8', - m3u8_id='hls-%s' % name, live=True)) + m3u8_url, video_id, ext='mp4', live=is_live, m3u8_id='hls-%s' % name)) self._sort_formats(formats) diff --git a/yt_dlp/extractor/ro220.py b/yt_dlp/extractor/ro220.py deleted file mode 100644 index 69934ef2b..000000000 --- a/yt_dlp/extractor/ro220.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote - - -class Ro220IE(InfoExtractor): - IE_NAME = '220.ro' - _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<id>[^/]+)' - _TEST = { - 'url': 'http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/', - 'md5': '03af18b73a07b4088753930db7a34add', - 'info_dict': { - 'id': 'LYV6doKo7f', - 'ext': 'mp4', - 'title': 'Luati-le Banii sez 4 ep 1', - 'description': r're:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - url = compat_urllib_parse_unquote(self._search_regex( - r'(?s)clip\s*:\s*{.*?url\s*:\s*\'([^\']+)\'', webpage, 'url')) - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - formats = [{ - 'format_id': 'sd', - 'url': url, - 'ext': 'mp4', - }] - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - } diff --git a/yt_dlp/extractor/roxwel.py b/yt_dlp/extractor/roxwel.py deleted file mode 100644 index 84bb1aa00..000000000 --- a/yt_dlp/extractor/roxwel.py +++ /dev/null @@ -1,52 +0,0 @@ -from __future__ import unicode_literals - - -from .common import InfoExtractor -from ..utils import unified_strdate, determine_ext - - -class RoxwelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' - - _TEST = { - 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', - 'info_dict': { - 'id': 'passionpittakeawalklive', - 'ext': 'flv', - 'title': 'Take A Walk (live)', - 'uploader': 'Passion Pit', - 'uploader_id': 'passionpit', - 'upload_date': '20120928', - 'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - filename = mobj.group('filename') - info_url = 'http://www.roxwel.com/api/videos/%s' % filename - info = self._download_json(info_url, filename) - - rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) - best_rate = rtmp_rates[-1] - url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) - rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url') - ext = determine_ext(rtmp_url) - if ext == 'f4v': - rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) - - return { - 'id': filename, - 'title': info['title'], - 'url': rtmp_url, - 'ext': 'flv', - 'description': info['description'], - 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), - 'uploader': info['artist'], - 'uploader_id': info['artistname'], - 'upload_date': unified_strdate(info['dbdate']), - } diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 6ad01a912..8ca62e370 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -23,23 +23,27 @@ class SVTBaseIE(InfoExtractor): is_live = dict_get(video_info, ('live', 'simulcast'), default=False) m3u8_protocol = 'm3u8' if is_live else 'm3u8_native' formats = [] + subtitles = {} for vr in video_info['videoReferences']: player_type = vr.get('playerType') or vr.get('format') vurl = vr['url'] ext = determine_ext(vurl) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( vurl, video_id, ext='mp4', entry_protocol=m3u8_protocol, - m3u8_id=player_type, fatal=False)) + m3u8_id=player_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( vurl + '?hdcore=3.3.0', video_id, f4m_id=player_type, fatal=False)) elif ext == 'mpd': - if player_type == 'dashhbbtv': - formats.extend(self._extract_mpd_formats( - vurl, video_id, mpd_id=player_type, fatal=False)) + fmts, subs = self._extract_mpd_formats_and_subtitles( + vurl, video_id, mpd_id=player_type, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) else: formats.append({ 'format_id': player_type, @@ -52,18 +56,19 @@ class SVTBaseIE(InfoExtractor): countries=self._GEO_COUNTRIES, metadata_available=True) self._sort_formats(formats) - subtitles = {} subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences')) if isinstance(subtitle_references, list): for sr in subtitle_references: subtitle_url = sr.get('url') subtitle_lang = sr.get('language', 'sv') if subtitle_url: + sub = { + 'url': subtitle_url, + } if determine_ext(subtitle_url) == 'm3u8': - # TODO(yan12125): handle WebVTT in m3u8 manifests - continue - - subtitles.setdefault(subtitle_lang, []).append({'url': subtitle_url}) + # XXX: no way of testing, is it ever hit? + sub['ext'] = 'vtt' + subtitles.setdefault(subtitle_lang, []).append(sub) title = video_info.get('title') diff --git a/yt_dlp/extractor/thescene.py b/yt_dlp/extractor/thescene.py deleted file mode 100644 index cd642355c..000000000 --- a/yt_dlp/extractor/thescene.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - -from ..compat import compat_urlparse - - -class TheSceneIE(InfoExtractor): - _VALID_URL = r'https?://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)' - - _TEST = { - 'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear', - 'info_dict': { - 'id': '520e8faac2b4c00e3c6e5f43', - 'ext': 'mp4', - 'title': 'Narciso Rodriguez: Spring 2013 Ready-to-Wear', - 'display_id': 'narciso-rodriguez-spring-2013-ready-to-wear', - 'duration': 127, - 'series': 'Style.com Fashion Shows', - 'season': 'Ready To Wear Spring 2013', - 'tags': list, - 'categories': list, - 'upload_date': '20120913', - 'timestamp': 1347512400, - 'uploader': 'vogue', - }, - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - player_url = compat_urlparse.urljoin( - url, - self._html_search_regex( - r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) - - return { - '_type': 'url_transparent', - 'display_id': display_id, - 'url': player_url, - 'ie_key': 'CondeNast', - } diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 57391d766..c2dec244f 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -131,6 +131,8 @@ class VimeoBaseInfoExtractor(InfoExtractor): request = config.get('request') or {} formats = [] + subtitles = {} + config_files = video_data.get('files') or request.get('files') or {} for f in (config_files.get('progressive') or []): video_url = f.get('url') @@ -163,21 +165,24 @@ class VimeoBaseInfoExtractor(InfoExtractor): sep_manifest_urls = [(format_id, manifest_url)] for f_id, m_url in sep_manifest_urls: if files_type == 'hls': - formats.extend(self._extract_m3u8_formats( + fmts, subs = self._extract_m3u8_formats_and_subtitles( m_url, video_id, 'mp4', 'm3u8' if is_live else 'm3u8_native', m3u8_id=f_id, note='Downloading %s m3u8 information' % cdn_name, - fatal=False)) + fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) elif files_type == 'dash': if 'json=1' in m_url: real_m_url = (self._download_json(m_url, video_id, fatal=False) or {}).get('url') if real_m_url: m_url = real_m_url - mpd_formats = self._extract_mpd_formats( + fmts, subs = self._extract_mpd_formats_and_subtitles( m_url.replace('/master.json', '/master.mpd'), video_id, f_id, 'Downloading %s MPD information' % cdn_name, fatal=False) - formats.extend(mpd_formats) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) live_archive = live_event.get('archive') or {} live_archive_source_url = live_archive.get('source_url') @@ -188,12 +193,11 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'quality': 10, }) - subtitles = {} for tt in (request.get('text_tracks') or []): - subtitles[tt['lang']] = [{ + subtitles.setdefault(tt['lang'], []).append({ 'ext': 'vtt', 'url': urljoin('https://vimeo.com', tt['url']), - }] + }) thumbnails = [] if not is_live: diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index fab16780f..18eb33b57 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -688,7 +688,7 @@ class VKWallPostIE(VKBaseIE): 'artist': performer, 'track': title, 'ext': 'mp4', - 'protocol': 'm3u8', + 'protocol': 'm3u8_native', }) for video in re.finditer( diff --git a/yt_dlp/extractor/vube.py b/yt_dlp/extractor/vube.py deleted file mode 100644 index 1c8f80ae9..000000000 --- a/yt_dlp/extractor/vube.py +++ /dev/null @@ -1,170 +0,0 @@ -from __future__ import unicode_literals - - -from .common import InfoExtractor -from ..compat import ( - compat_str, -) -from ..utils import ( - int_or_none, -) - - -class VubeIE(InfoExtractor): - IE_NAME = 'vube' - IE_DESC = 'Vube.com' - _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b' - - _TESTS = [ - { - 'url': 'http://vube.com/trending/William+Wei/Y8NUZ69Tf7?t=s', - 'md5': 'e7aabe1f8f1aa826b9e4735e1f9cee42', - 'info_dict': { - 'id': 'Y8NUZ69Tf7', - 'ext': 'mp4', - 'title': 'Best Drummer Ever [HD]', - 'description': 'md5:2d63c4b277b85c2277761c2cf7337d71', - 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'William', - 'timestamp': 1406876915, - 'upload_date': '20140801', - 'duration': 258.051, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'], - }, - 'skip': 'Not accessible from Travis CI server', - }, { - 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon', - 'md5': 'db7aba89d4603dadd627e9d1973946fe', - 'info_dict': { - 'id': 'YL2qNPkqon', - 'ext': 'mp4', - 'title': 'Chiara Grispo - Price Tag by Jessie J', - 'description': 'md5:8ea652a1f36818352428cb5134933313', - 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f\.jpg$', - 'uploader': 'Chiara.Grispo', - 'timestamp': 1388743358, - 'upload_date': '20140103', - 'duration': 170.56, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['pop', 'music', 'cover', 'singing', 'jessie j', 'price tag', 'chiara grispo'], - }, - 'skip': 'Removed due to DMCA', - }, - { - 'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1', - 'md5': '5d4a52492d76f72712117ce6b0d98d08', - 'info_dict': { - 'id': 'UeBhTudbfS', - 'ext': 'mp4', - 'title': 'My 7 year old Sister and I singing "Alive" by Krewella', - 'description': 'md5:40bcacb97796339f1690642c21d56f4a', - 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102265d5a9f-0f17-4f6b-5753-adf08484ee1e\.jpg$', - 'uploader': 'Seraina', - 'timestamp': 1396492438, - 'upload_date': '20140403', - 'duration': 240.107, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['seraina', 'jessica', 'krewella', 'alive'], - }, - 'skip': 'Removed due to DMCA', - }, { - 'url': 'http://vube.com/vote/Siren+Gene/0nmsMY5vEq?n=2&t=s', - 'md5': '0584fc13b50f887127d9d1007589d27f', - 'info_dict': { - 'id': '0nmsMY5vEq', - 'ext': 'mp4', - 'title': 'Frozen - Let It Go Cover by Siren Gene', - 'description': 'My rendition of "Let It Go" originally sung by Idina Menzel.', - 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$', - 'uploader': 'Siren', - 'timestamp': 1395448018, - 'upload_date': '20140322', - 'duration': 221.788, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['let it go', 'cover', 'idina menzel', 'frozen', 'singing', 'disney', 'siren gene'], - }, - 'skip': 'Removed due to DMCA', - } - ] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - video = self._download_json( - 'http://vube.com/t-api/v1/video/%s' % video_id, video_id, 'Downloading video JSON') - - public_id = video['public_id'] - - formats = [] - - for media in video['media'].get('video', []) + video['media'].get('audio', []): - if media['transcoding_status'] != 'processed': - continue - fmt = { - 'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (media['media_resolution_id'], public_id), - 'abr': int(media['audio_bitrate']), - 'format_id': compat_str(media['media_resolution_id']), - } - vbr = int(media['video_bitrate']) - if vbr: - fmt.update({ - 'vbr': vbr, - 'height': int(media['height']), - }) - formats.append(fmt) - - if not formats and video.get('vst') == 'dmca': - self.raise_no_formats( - 'This video has been removed in response to a complaint received under the US Digital Millennium Copyright Act.', - expected=True) - - self._sort_formats(formats) - - title = video['title'] - description = video.get('description') - thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:') - uploader = video.get('user_alias') or video.get('channel') - timestamp = int_or_none(video.get('upload_time')) - duration = video['duration'] - view_count = video.get('raw_view_count') - like_count = video.get('total_likes') - dislike_count = video.get('total_hates') - - comments = video.get('comments') - comment_count = None - if comments is None: - comment_data = self._download_json( - 'http://vube.com/api/video/%s/comment' % video_id, - video_id, 'Downloading video comment JSON', fatal=False) - if comment_data is not None: - comment_count = int_or_none(comment_data.get('total')) - else: - comment_count = len(comments) - - categories = [tag['text'] for tag in video['tags']] - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'comment_count': comment_count, - 'categories': categories, - } diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 61804e2af..5750e75d7 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3006,13 +3006,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): tbr = float_or_none( fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) + language_preference = ( + 10 if audio_track.get('audioIsDefault') and 10 + else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 + else -1) dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': itag, 'format_note': join_nonempty( '%s%s' % (audio_track.get('displayName') or '', - ' (default)' if audio_track.get('audioIsDefault') else ''), + ' (default)' if language_preference > 0 else ''), fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), throttled and 'THROTTLED', delim=', '), 'source_preference': -10 if throttled else -1, @@ -3022,8 +3026,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'tbr': tbr, 'url': fmt_url, 'width': int_or_none(fmt.get('width')), - 'language': audio_track.get('id', '').split('.')[0], - 'language_preference': 1 if audio_track.get('audioIsDefault') else -1, + 'language': join_nonempty(audio_track.get('id', '').split('.')[0], + 'desc' if language_preference < -1 else ''), + 'language_preference': language_preference, } mime_mobj = re.match( r'((?:[^/]+)/(?:[^;]+))(?:;\s*codecs="([^"]+)")?', fmt.get('mimeType') or '') @@ -3408,6 +3413,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): trans_name += format_field(lang_name, template=' from %s') process_language( automatic_captions, base_url, trans_code, trans_name, {'tlang': trans_code}) + if lang_code == f'a-{trans_code}': + process_language( + automatic_captions, base_url, f'{trans_code}-orig', f'{trans_name} (Original)', {'tlang': trans_code}) info['automatic_captions'] = automatic_captions info['subtitles'] = subtitles diff --git a/yt_dlp/extractor/zaq1.py b/yt_dlp/extractor/zaq1.py deleted file mode 100644 index 889aff5d8..000000000 --- a/yt_dlp/extractor/zaq1.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_timestamp, -) - - -class Zaq1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://zaq1.pl/video/xev0e', - 'md5': '24a5eb3f052e604ae597c4d0d19b351e', - 'info_dict': { - 'id': 'xev0e', - 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', - 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', - 'ext': 'mp4', - 'duration': 511, - 'timestamp': 1490896361, - 'uploader': 'Anonim', - 'upload_date': '20170330', - 'view_count': int, - } - }, { - # malformed JSON-LD - 'url': 'http://zaq1.pl/video/x81vn', - 'info_dict': { - 'id': 'x81vn', - 'title': 'SEKRETNE ŻYCIE WALTERA MITTY', - 'ext': 'mp4', - 'duration': 6234, - 'timestamp': 1493494860, - 'uploader': 'Anonim', - 'upload_date': '20170429', - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to parse JSON'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex( - r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'video url', group='url') - - info = self._search_json_ld(webpage, video_id, fatal=False) - - def extract_data(field, name, fatal=False): - return self._search_regex( - r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field, - webpage, field, fatal=fatal, group='field') - - if not info.get('title'): - info['title'] = extract_data('file-name', 'title', fatal=True) - - if not info.get('duration'): - info['duration'] = int_or_none(extract_data('duration', 'duration')) - - if not info.get('thumbnail'): - info['thumbnail'] = extract_data('photo-url', 'thumbnail') - - if not info.get('timestamp'): - info['timestamp'] = unified_timestamp(self._html_search_meta( - 'uploadDate', webpage, 'timestamp')) - - if not info.get('interactionCount'): - info['view_count'] = int_or_none(self._html_search_meta( - 'interactionCount', webpage, 'view count')) - - uploader = self._html_search_regex( - r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader', - fatal=False) - - width = int_or_none(self._html_search_meta( - 'width', webpage, fatal=False)) - height = int_or_none(self._html_search_meta( - 'height', webpage, fatal=False)) - - info.update({ - 'id': video_id, - 'formats': [{ - 'url': video_url, - 'width': width, - 'height': height, - 'http_headers': { - 'Referer': url, - }, - }], - 'uploader': uploader, - }) - - return info diff --git a/yt_dlp/extractor/zoom.py b/yt_dlp/extractor/zoom.py index 25a0902f6..c00548839 100644 --- a/yt_dlp/extractor/zoom.py +++ b/yt_dlp/extractor/zoom.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, + str_or_none, js_to_json, parse_filesize, urlencode_postdata, @@ -23,7 +24,8 @@ class ZoomIE(InfoExtractor): 'id': 'dUk_CNBETmZ5VA2BwEl-jjakPpJ3M1pcfVYAPRsoIbEByGsLjUZtaa4yCATQuOL3der8BlTwxQePl_j0.EImBkXzTIaPvdZO5', 'ext': 'mp4', 'title': 'China\'s "two sessions" and the new five-year plan', - } + }, + 'skip': 'Recording requires email authentication to access', } def _real_extract(self, url): @@ -56,22 +58,46 @@ class ZoomIE(InfoExtractor): webpage, 'data'), play_id, js_to_json) subtitles = {} - for _type in ('transcript', 'cc'): + for _type in ('transcript', 'cc', 'chapter'): if data.get('%sUrl' % _type): subtitles[_type] = [{ 'url': urljoin(base_url, data['%sUrl' % _type]), 'ext': 'vtt', }] + formats = [] + + if data.get('viewMp4Url'): + formats.append({ + 'format_note': 'Camera stream', + 'url': str_or_none(data.get('viewMp4Url')), + 'width': int_or_none(data.get('viewResolvtionsWidth')), + 'height': int_or_none(data.get('viewResolvtionsHeight')), + 'format_id': str_or_none(data.get('recordingId')), + 'ext': 'mp4', + 'filesize_approx': parse_filesize(data.get('fileSize')), + 'preference': 0 + }) + + if data.get('shareMp4Url'): + formats.append({ + 'format_note': 'Screen share stream', + 'url': str_or_none(data.get('shareMp4Url')), + 'width': int_or_none(data.get('shareResolvtionsWidth')), + 'height': int_or_none(data.get('shareResolvtionsHeight')), + 'format_id': str_or_none(data.get('shareVideoId')), + 'ext': 'mp4', + 'preference': -1 + }) + + self._sort_formats(formats) + return { 'id': play_id, - 'title': data['topic'], - 'url': data['viewMp4Url'], + 'title': data.get('topic'), 'subtitles': subtitles, - 'width': int_or_none(data.get('viewResolvtionsWidth')), - 'height': int_or_none(data.get('viewResolvtionsHeight')), + 'formats': formats, 'http_headers': { 'Referer': base_url, }, - 'filesize_approx': parse_filesize(data.get('fileSize')), } diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 0086c3619..eb21a25ac 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -117,6 +117,19 @@ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): return parser, opts, args +class _YoutubeDLOptionParser(optparse.OptionParser): + # optparse is deprecated since python 3.2. So assume a stable interface even for private methods + + def _match_long_opt(self, opt): + """Improve ambigious argument resolution by comparing option objects instead of argument strings""" + try: + return super()._match_long_opt(opt) + except optparse.AmbiguousOptionError as e: + if len(set(self._long_opt[p] for p in e.possibilities)) == 1: + return e.possibilities[0] + raise + + def create_parser(): def _format_option_string(option): ''' ('-o', '--option') -> -o, --format METAVAR''' @@ -215,7 +228,7 @@ def create_parser(): 'conflict_handler': 'resolve', } - parser = optparse.OptionParser(**compat_kwargs(kw)) + parser = _YoutubeDLOptionParser(**compat_kwargs(kw)) general = optparse.OptionGroup(parser, 'General Options') general.add_option( @@ -1191,13 +1204,13 @@ def create_parser(): action='store_false', dest='allow_playlist_files', help='Do not write playlist metadata when using --write-info-json, --write-description etc.') filesystem.add_option( - '--clean-infojson', + '--clean-info-json', '--clean-infojson', action='store_true', dest='clean_infojson', default=None, help=( 'Remove some private fields such as filenames from the infojson. ' 'Note that it could still contain some personal information (default)')) filesystem.add_option( - '--no-clean-infojson', + '--no-clean-info-json', '--no-clean-infojson', action='store_false', dest='clean_infojson', help='Write all fields to the infojson') filesystem.add_option( diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 0c3c6c401..51931f164 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -1723,7 +1723,7 @@ def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): """ Return a datetime object from a string in the format YYYYMMDD or - (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)? + (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)? format: string date format used to return datetime object from precision: round the time portion of a datetime object. @@ -1762,13 +1762,17 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): return datetime_round(datetime.datetime.strptime(date_str, format), precision) -def date_from_str(date_str, format='%Y%m%d'): +def date_from_str(date_str, format='%Y%m%d', strict=False): """ Return a datetime object from a string in the format YYYYMMDD or - (now|today|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)? + (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)? + + If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed format: string date format used to return datetime object from """ + if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str): + raise ValueError(f'Invalid date format {date_str}') return datetime_from_str(date_str, precision='microsecond', format=format).date() @@ -1815,11 +1819,11 @@ class DateRange(object): def __init__(self, start=None, end=None): """start and end must be strings in the format accepted by date""" if start is not None: - self.start = date_from_str(start) + self.start = date_from_str(start, strict=True) else: self.start = datetime.datetime.min.date() if end is not None: - self.end = date_from_str(end) + self.end = date_from_str(end, strict=True) else: self.end = datetime.datetime.max.date() if self.start > self.end: |