diff options
-rw-r--r-- | test/test_postprocessors.py | 4 | ||||
-rw-r--r-- | yt_dlp/extractor/archiveorg.py | 461 | ||||
-rw-r--r-- | yt_dlp/extractor/brightcove.py | 10 | ||||
-rw-r--r-- | yt_dlp/extractor/cbc.py | 38 | ||||
-rw-r--r-- | yt_dlp/extractor/crackle.py | 40 | ||||
-rw-r--r-- | yt_dlp/extractor/extractors.py | 2 | ||||
-rw-r--r-- | yt_dlp/extractor/gofile.py | 83 | ||||
-rw-r--r-- | yt_dlp/extractor/skeb.py | 143 | ||||
-rw-r--r-- | yt_dlp/extractor/zdf.py | 35 | ||||
-rw-r--r-- | yt_dlp/options.py | 25 | ||||
-rw-r--r-- | yt_dlp/postprocessor/modify_chapters.py | 4 | ||||
-rw-r--r-- | yt_dlp/postprocessor/sponsorblock.py | 14 | ||||
-rw-r--r-- | yt_dlp/utils.py | 9 |
13 files changed, 713 insertions, 155 deletions
diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py index 090c7b47b..bbe998993 100644 --- a/test/test_postprocessors.py +++ b/test/test_postprocessors.py @@ -124,11 +124,11 @@ class TestModifyChaptersPP(unittest.TestCase): chapters = self._chapters([70], ['c']) + [ self._sponsor_chapter(10, 20, 'sponsor'), self._sponsor_chapter(30, 40, 'preview'), - self._sponsor_chapter(50, 60, 'sponsor')] + self._sponsor_chapter(50, 60, 'filler')] expected = self._chapters( [10, 20, 30, 40, 50, 60, 70], ['c', '[SponsorBlock]: Sponsor', 'c', '[SponsorBlock]: Preview/Recap', - 'c', '[SponsorBlock]: Sponsor', 'c']) + 'c', '[SponsorBlock]: Filler Tangent', 'c']) self._remove_marked_arrange_sponsors_test_impl(chapters, expected, []) def test_remove_marked_arrange_sponsors_UniqueNamesForOverlappingSponsors(self): diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py index d90fcb13a..467fe4875 100644 --- a/yt_dlp/extractor/archiveorg.py +++ b/yt_dlp/extractor/archiveorg.py @@ -3,33 +3,36 @@ from __future__ import unicode_literals import re import json - from .common import InfoExtractor -from .youtube import YoutubeIE +from .youtube import YoutubeIE, YoutubeBaseInfoExtractor from ..compat import ( compat_urllib_parse_unquote, compat_urllib_parse_unquote_plus, compat_HTTPError ) from ..utils import ( + bug_reports_message, clean_html, - determine_ext, dict_get, extract_attributes, ExtractorError, + get_element_by_id, HEADRequest, int_or_none, KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, + orderedSet, parse_duration, parse_qs, - RegexNotFoundError, str_to_int, str_or_none, + traverse_obj, try_get, unified_strdate, unified_timestamp, + urlhandle_detect_ext, + url_or_none ) @@ -262,12 +265,12 @@ class YoutubeWebArchiveIE(InfoExtractor): _VALID_URL = r"""(?x)^ (?:https?://)?web\.archive\.org/ (?:web/)? - (?:[0-9A-Za-z_*]+/)? # /web and the version index is optional + (?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional (?:https?(?::|%3[Aa])//)? (?: - (?:\w+\.)?youtube\.com/watch(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL - |(wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url + (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL + |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url ) (?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$) """ @@ -278,141 +281,391 @@ class YoutubeWebArchiveIE(InfoExtractor): 'info_dict': { 'id': 'aYAGB11YrSs', 'ext': 'webm', - 'title': 'Team Fortress 2 - Sandviches!' + 'title': 'Team Fortress 2 - Sandviches!', + 'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf', + 'upload_date': '20110926', + 'uploader': 'Zeurel', + 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg', + 'duration': 32, + 'uploader_id': 'Zeurel', + 'uploader_url': 'http://www.youtube.com/user/Zeurel' } - }, - { + }, { # Internal link 'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0', 'info_dict': { 'id': '97t7Xj_iBv0', 'ext': 'mp4', - 'title': 'How Flexible Machines Could Save The World' + 'title': 'Why Machines That Bend Are Better', + 'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c', + 'upload_date': '20190312', + 'uploader': 'Veritasium', + 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA', + 'duration': 771, + 'uploader_id': '1veritasium', + 'uploader_url': 'http://www.youtube.com/user/1veritasium' } - }, - { - # Video from 2012, webm format itag 45. + }, { + # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description. + # Should use the date in the link. Title ends with '- Youtube'. Capture has description in eow-description 'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en', 'info_dict': { 'id': 'AkhihxRKcrs', 'ext': 'webm', - 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)' + 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)', + 'upload_date': '20120712', + 'duration': 398, + 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3', + 'uploader_id': 'machinima', + 'uploader_url': 'http://www.youtube.com/user/machinima' } - }, - { - # Old flash-only video. Webpage title starts with "YouTube - ". + }, { + # FLV video. Video file URL does not provide itag information 'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw', 'info_dict': { 'id': 'jNQXAC9IVRw', - 'ext': 'unknown_video', - 'title': 'Me at the zoo' + 'ext': 'flv', + 'title': 'Me at the zoo', + 'upload_date': '20050423', + 'channel_id': 'UC4QobU6STFB0P71PMvOGN5A', + 'duration': 19, + 'description': 'md5:10436b12e07ac43ff8df65287a56efb4', + 'uploader_id': 'jawed', + 'uploader_url': 'http://www.youtube.com/user/jawed' } - }, - { - # Flash video with .flv extension (itag 34). Title has prefix "YouTube -" - # Title has some weird unicode characters too. + }, { 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', 'info_dict': { 'id': 'lTx3G6h2xyA', 'ext': 'flv', - 'title': 'Madeon - Pop Culture (live mashup)' + 'title': 'Madeon - Pop Culture (live mashup)', + 'upload_date': '20110711', + 'uploader': 'Madeon', + 'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w', + 'duration': 204, + 'description': 'md5:f7535343b6eda34a314eff8b85444680', + 'uploader_id': 'itsmadeon', + 'uploader_url': 'http://www.youtube.com/user/itsmadeon' } - }, - { # Some versions of Youtube have have "YouTube" as page title in html (and later rewritten by js). - 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', + }, { + # First capture is of dead video, second is the oldest from CDX response. + 'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E', 'info_dict': { - 'id': 'kH-G_aIBlFw', + 'id': '1JYutPM8O6E', 'ext': 'mp4', - 'title': 'kH-G_aIBlFw' - }, - 'expected_warnings': [ - 'unable to extract title', - ] - }, - { - # First capture is a 302 redirect intermediary page. - 'url': 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=0altSZ96U4M', + 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News', + 'upload_date': '20160218', + 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', + 'duration': 1236, + 'description': 'md5:21032bae736421e89c2edf36d1936947', + 'uploader_id': 'MachinimaETC', + 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + } + }, { + # First capture of dead video, capture date in link links to dead capture. + 'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E', 'info_dict': { - 'id': '0altSZ96U4M', + 'id': '6FPhZJGvf4E', 'ext': 'mp4', - 'title': '0altSZ96U4M' + 'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.', + 'upload_date': '20160219', + 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', + 'duration': 798, + 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7', + 'uploader_id': 'MachinimaETC', + 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' }, 'expected_warnings': [ - 'unable to extract title', + r'unable to download capture webpage \(it may not be archived\)' ] - }, - { + }, { # Very old YouTube page, has - YouTube in title. + 'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg', + 'info_dict': { + 'id': '-06-KB9XTzg', + 'ext': 'flv', + 'title': 'New Coin Hack!! 100% Safe!!' + } + }, { + 'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8', + 'info_dict': { + 'id': 'dWW7qP423y8', + 'ext': 'mp4', + 'title': 'It\'s Bootleg AirPods Time.', + 'upload_date': '20211021', + 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug', + 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', + 'duration': 810, + 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc', + 'uploader': 'DankPods', + 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug', + 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug' + } + }, { + # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093 + 'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4', + 'info_dict': { + 'id': '6Dh-RL__uN4', + 'ext': 'mp4', + 'title': 'bitch lasagna', + 'upload_date': '20181005', + 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'duration': 135, + 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0', + 'uploader': 'PewDiePie', + 'uploader_id': 'PewDiePie', + 'uploader_url': 'http://www.youtube.com/user/PewDiePie' + } + }, { + 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', + 'only_matching': True + }, { + 'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M', + 'only_matching': True + }, { # Video not archived, only capture is unavailable video page 'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10', - 'only_matching': True, - }, - { # Encoded url + 'only_matching': True + }, { # Encoded url 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den', - 'only_matching': True, - }, - { + 'only_matching': True + }, { 'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den', - 'only_matching': True, + 'only_matching': True + }, { + 'url': 'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&search=soccer', + 'only_matching': True + }, { + 'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg', + 'only_matching': True } ] + _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE + _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE + _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE + + _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers + _YT_ALL_THUMB_SERVERS = orderedSet( + _YT_DEFAULT_THUMB_SERVERS + ['img.youtube.com', *[f'{c}{n or ""}.ytimg.com' for c in ('i', 's') for n in (*range(0, 5), 9)]]) + + _WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/' + _OLDEST_CAPTURE_DATE = 20050214000000 + _NEWEST_CAPTURE_DATE = 20500101000000 + + def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note='Downloading CDX API JSON'): + # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md + query = { + 'url': url, + 'output': 'json', + 'fl': 'original,mimetype,length,timestamp', + 'limit': 500, + 'filter': ['statuscode:200'] + (filters or []), + 'collapse': collapse or [], + **(query or {}) + } + res = self._download_json('https://web.archive.org/cdx/search/cdx', item_id, note, query=query) + if isinstance(res, list) and len(res) >= 2: + # format response to make it easier to use + return list(dict(zip(res[0], v)) for v in res[1:]) + elif not isinstance(res, list) or len(res) != 0: + self.report_warning('Error while parsing CDX API response' + bug_reports_message()) + + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): + return self._parse_json(self._search_regex( + (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), + regex), webpage, name, default='{}'), video_id, fatal=False) + + def _extract_webpage_title(self, webpage): + page_title = self._html_search_regex( + r'<title>([^<]*)</title>', webpage, 'title', default='') + # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. + return self._html_search_regex( + r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', + page_title, 'title', default='') + + def _extract_metadata(self, video_id, webpage): + + search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None)) + player_response = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {} + initial_data = self._extract_yt_initial_variable( + webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {} + + initial_data_video = traverse_obj( + initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'), + expected_type=dict, get_all=False, default={}) + + video_details = traverse_obj( + player_response, 'videoDetails', expected_type=dict, get_all=False, default={}) + + microformats = traverse_obj( + player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={}) + + video_title = ( + video_details.get('title') + or YoutubeBaseInfoExtractor._get_text(microformats, 'title') + or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title') + or self._extract_webpage_title(webpage) + or search_meta(['og:title', 'twitter:title', 'title'])) + + channel_id = str_or_none( + video_details.get('channelId') + or microformats.get('externalChannelId') + or search_meta('channelId') + or self._search_regex( + r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', # @b45a9e6 + webpage, 'channel id', default=None, group='id')) + channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None + + duration = int_or_none( + video_details.get('lengthSeconds') + or microformats.get('lengthSeconds') + or parse_duration(search_meta('duration'))) + description = ( + video_details.get('shortDescription') + or YoutubeBaseInfoExtractor._get_text(microformats, 'description') + or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23 + or search_meta(['description', 'og:description', 'twitter:description'])) + + uploader = video_details.get('author') + + # Uploader ID and URL + uploader_mobj = re.search( + r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', # @fd05024 + webpage) + if uploader_mobj is not None: + uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url') + else: + # @a6211d2 + uploader_url = url_or_none(microformats.get('ownerProfileUrl')) + uploader_id = self._search_regex( + r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None) + + upload_date = unified_strdate( + dict_get(microformats, ('uploadDate', 'publishDate')) + or search_meta(['uploadDate', 'datePublished']) + or self._search_regex( + [r'(?s)id="eow-date.*?>(.*?)</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520 + webpage, 'upload date', default=None)) + + return { + 'title': video_title, + 'description': description, + 'upload_date': upload_date, + 'uploader': uploader, + 'channel_id': channel_id, + 'channel_url': channel_url, + 'duration': duration, + 'uploader_url': uploader_url, + 'uploader_id': uploader_id, + } + + def _extract_thumbnails(self, video_id): + try_all = 'thumbnails' in self._configuration_arg('check_all') + thumbnail_base_urls = ['http://{server}/vi{webp}/{video_id}'.format( + webp='_webp' if ext == 'webp' else '', video_id=video_id, server=server) + for server in (self._YT_ALL_THUMB_SERVERS if try_all else self._YT_DEFAULT_THUMB_SERVERS) for ext in (('jpg', 'webp') if try_all else ('jpg',))] + + thumbnails = [] + for url in thumbnail_base_urls: + response = self._call_cdx_api( + video_id, url, filters=['mimetype:image/(?:webp|jpeg)'], + collapse=['urlkey'], query={'matchType': 'prefix'}) + if not response: + continue + thumbnails.extend( + { + 'url': (self._WAYBACK_BASE_URL % (int_or_none(thumbnail_dict.get('timestamp')) or self._OLDEST_CAPTURE_DATE)) + thumbnail_dict.get('original'), + 'filesize': int_or_none(thumbnail_dict.get('length')), + 'preference': int_or_none(thumbnail_dict.get('length')) + } for thumbnail_dict in response) + if not try_all: + break + + self._remove_duplicate_formats(thumbnails) + return thumbnails + + def _get_capture_dates(self, video_id, url_date): + capture_dates = [] + # Note: CDX API will not find watch pages with extra params in the url. + response = self._call_cdx_api( + video_id, f'https://www.youtube.com/watch?v={video_id}', + filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or [] + all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None]) + + # Prefer the new polymer UI captures as we support extracting more metadata from them + # WBM captures seem to all switch to this layout ~July 2020 + modern_captures = list(filter(lambda x: x >= 20200701000000, all_captures)) + if modern_captures: + capture_dates.append(modern_captures[0]) + capture_dates.append(url_date) + if all_captures: + capture_dates.append(all_captures[0]) + + if 'captures' in self._configuration_arg('check_all'): + capture_dates.extend(modern_captures + all_captures) + + # Fallbacks if any of the above fail + capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE]) + return orderedSet(capture_dates) def _real_extract(self, url): - video_id = self._match_id(url) - title = video_id # if we are not able get a title - - def _extract_title(webpage): - page_title = self._html_search_regex( - r'<title>([^<]*)</title>', webpage, 'title', fatal=False) or '' - # YouTube video pages appear to always have either 'YouTube -' as suffix or '- YouTube' as prefix. - try: - page_title = self._html_search_regex( - r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)', - page_title, 'title', default='') - except RegexNotFoundError: - page_title = None - - if not page_title: - self.report_warning('unable to extract title', video_id=video_id) - return - return page_title - - # If the video is no longer available, the oldest capture may be one before it was removed. - # Setting the capture date in url to early date seems to redirect to earliest capture. - webpage = self._download_webpage( - 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=%s' % video_id, - video_id=video_id, fatal=False, errnote='unable to download video webpage (probably not archived).') - if webpage: - title = _extract_title(webpage) or title - - # Use link translator mentioned in https://github.com/ytdl-org/youtube-dl/issues/13655 - internal_fake_url = 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id + + url_date, video_id = self._match_valid_url(url).groups() + + urlh = None try: - video_file_webpage = self._request_webpage( - HEADRequest(internal_fake_url), video_id, - note='Fetching video file url', expected_status=True) + urlh = self._request_webpage( + HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), + video_id, note='Fetching archived video file url', expected_status=True) except ExtractorError as e: # HTTP Error 404 is expected if the video is not saved. if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - raise ExtractorError( - 'HTTP Error %s. Most likely the video is not archived or issue with web.archive.org.' % e.cause.code, + self.raise_no_formats( + 'The requested video is not archived, indexed, or there is an issue with web.archive.org', expected=True) - raise - video_file_url = compat_urllib_parse_unquote(video_file_webpage.url) - video_file_url_qs = parse_qs(video_file_url) - - # Attempt to recover any ext & format info from playback url - format = {'url': video_file_url} - itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) - if itag and itag in YoutubeIE._formats: # Naughty access but it works - format.update(YoutubeIE._formats[itag]) - format.update({'format_id': itag}) - else: - mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) - ext = mimetype2ext(mime) or determine_ext(video_file_url) - format.update({'ext': ext}) - return { - 'id': video_id, - 'title': title, - 'formats': [format], - 'duration': str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) - } + else: + raise + + capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) + self.write_debug('Captures to try: ' + ', '.join(str(i) for i in capture_dates if i is not None)) + info = {'id': video_id} + for capture in capture_dates: + if not capture: + continue + webpage = self._download_webpage( + (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id), + video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)', + note='Downloading capture webpage') + current_info = self._extract_metadata(video_id, webpage or '') + # Try avoid getting deleted video metadata + if current_info.get('title'): + info = merge_dicts(info, current_info) + if 'captures' not in self._configuration_arg('check_all'): + break + + info['thumbnails'] = self._extract_thumbnails(video_id) + + if urlh: + url = compat_urllib_parse_unquote(urlh.url) + video_file_url_qs = parse_qs(url) + # Attempt to recover any ext & format info from playback url & response headers + format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))} + itag = try_get(video_file_url_qs, lambda x: x['itag'][0]) + if itag and itag in YoutubeIE._formats: + format.update(YoutubeIE._formats[itag]) + format.update({'format_id': itag}) + else: + mime = try_get(video_file_url_qs, lambda x: x['mime'][0]) + ext = (mimetype2ext(mime) + or urlhandle_detect_ext(urlh) + or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type'))) + format.update({'ext': ext}) + info['formats'] = [format] + if not info.get('duration'): + info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0])) + + if not info.get('title'): + info['title'] = video_id + return info diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index cd1c3f01c..171739b46 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -16,6 +16,7 @@ from ..compat import ( ) from ..utils import ( clean_html, + dict_get, extract_attributes, ExtractorError, find_xpath_attr, @@ -577,11 +578,20 @@ class BrightcoveNewIE(AdobePassIE): if duration is not None and duration <= 0: is_live = True + common_res = [(160, 90), (320, 180), (480, 720), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)] + thumb_base_url = dict_get(json_data, ('poster', 'thumbnail')) + thumbnails = [{ + 'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url), + 'width': w, + 'height': h, + } for w, h in common_res] if thumb_base_url else None + return { 'id': video_id, 'title': self._live_title(title) if is_live else title, 'description': clean_html(json_data.get('description')), 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), + 'thumbnials': thumbnails, 'duration': duration, 'timestamp': parse_iso8601(json_data.get('published_at')), 'uploader_id': json_data.get('account_id'), diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 413053499..392c77884 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -11,11 +11,13 @@ from ..compat import ( compat_str, ) from ..utils import ( + int_or_none, + join_nonempty, js_to_json, - smuggle_url, - try_get, orderedSet, + smuggle_url, strip_or_none, + try_get, ExtractorError, ) @@ -313,6 +315,37 @@ class CBCGemIE(InfoExtractor): return self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token') + def _find_secret_formats(self, formats, video_id): + """ Find a valid video url and convert it to the secret variant """ + base_format = next((f for f in formats if f.get('vcodec') != 'none'), None) + if not base_format: + return + + base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url']) + url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url) + + secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False) + if not secret_xml: + return + + for child in secret_xml: + if child.attrib.get('Type') != 'video': + continue + for video_quality in child: + bitrate = int_or_none(video_quality.attrib.get('Bitrate')) + if not bitrate or 'Index' not in video_quality.attrib: + continue + height = int_or_none(video_quality.attrib.get('MaxHeight')) + + yield { + **base_format, + 'format_id': join_nonempty('sec', height), + 'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\<1>{bitrate}\2', base_url), + 'width': int_or_none(video_quality.attrib.get('MaxWidth')), + 'tbr': bitrate / 1000.0, + 'height': height, + } + def _real_extract(self, url): video_id = self._match_id(url) video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id) @@ -335,6 +368,7 @@ class CBCGemIE(InfoExtractor): formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') self._remove_duplicate_formats(formats) + formats.extend(self._find_secret_formats(formats, video_id)) for format in formats: if format.get('vcodec') == 'none': diff --git a/yt_dlp/extractor/crackle.py b/yt_dlp/extractor/crackle.py index 2c9d28d2e..db4962c42 100644 --- a/yt_dlp/extractor/crackle.py +++ b/yt_dlp/extractor/crackle.py @@ -23,32 +23,35 @@ from ..utils import ( class CrackleIE(InfoExtractor): _VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)' _TESTS = [{ - # geo restricted to CA - 'url': 'https://www.crackle.com/andromeda/2502343', + # Crackle is available in the United States and territories + 'url': 'https://www.crackle.com/thanksgiving/2510064', 'info_dict': { - 'id': '2502343', + 'id': '2510064', 'ext': 'mp4', - 'title': 'Under The Night', - 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a', - 'duration': 2583, + 'title': 'Touch Football', + 'description': 'md5:cfbb513cf5de41e8b56d7ab756cff4df', + 'duration': 1398, 'view_count': int, 'average_rating': 0, - 'age_limit': 14, - 'genre': 'Action, Sci-Fi', - 'creator': 'Allan Kroeker', - 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe', - 'release_year': 2000, - 'series': 'Andromeda', - 'episode': 'Under The Night', + 'age_limit': 17, + 'genre': 'Comedy', + 'creator': 'Daniel Powell', + 'artist': 'Chris Elliott, Amy Sedaris', + 'release_year': 2016, + 'series': 'Thanksgiving', + 'episode': 'Touch Football', 'season_number': 1, 'episode_number': 1, }, 'params': { # m3u8 download 'skip_download': True, - } + }, + 'expected_warnings': [ + 'Trying with a list of known countries' + ], }, { - 'url': 'https://www.sonycrackle.com/andromeda/2502343', + 'url': 'https://www.sonycrackle.com/thanksgiving/2510064', 'only_matching': True, }] @@ -129,7 +132,6 @@ class CrackleIE(InfoExtractor): break ignore_no_formats = self.get_param('ignore_no_formats_error') - allow_unplayable_formats = self.get_param('allow_unplayable_formats') if not media or (not media.get('MediaURLs') and not ignore_no_formats): raise ExtractorError( @@ -143,9 +145,9 @@ class CrackleIE(InfoExtractor): for e in media.get('MediaURLs') or []: if e.get('UseDRM'): has_drm = True - if not allow_unplayable_formats: - continue - format_url = url_or_none(e.get('Path')) + format_url = url_or_none(e.get('DRMPath')) + else: + format_url = url_or_none(e.get('Path')) if not format_url: continue ext = determine_ext(format_url) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 572c32751..8d7c54ec4 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -523,6 +523,7 @@ from .globo import ( ) from .go import GoIE from .godtube import GodTubeIE +from .gofile import GofileIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googlepodcasts import ( @@ -1315,6 +1316,7 @@ from .simplecast import ( ) from .sina import SinaIE from .sixplay import SixPlayIE +from .skeb import SkebIE from .skyit import ( SkyItPlayerIE, SkyItVideoIE, diff --git a/yt_dlp/extractor/gofile.py b/yt_dlp/extractor/gofile.py new file mode 100644 index 000000000..62d778cfe --- /dev/null +++ b/yt_dlp/extractor/gofile.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + try_get +) + + +class GofileIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gofile\.io/d/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://gofile.io/d/AMZyDw', + 'info_dict': { + 'id': 'AMZyDw', + }, + 'playlist_mincount': 2, + 'playlist': [{ + 'info_dict': { + 'id': 'de571ac1-5edc-42e2-8ec2-bdac83ad4a31', + 'filesize': 928116, + 'ext': 'mp4', + 'title': 'nuuh' + } + }] + }, { # URL to test mixed file types + 'url': 'https://gofile.io/d/avt34h', + 'info_dict': { + 'id': 'avt34h', + }, + 'playlist_mincount': 1, + }, { # URL to test no video/audio error + 'url': 'https://gofile.io/d/aB03lZ', + 'info_dict': { + 'id': 'aB03lZ', + }, + 'playlist_count': 0, + 'skip': 'No video/audio found at provided URL.', + }] + _TOKEN = None + + def _real_initialize(self): + token = self._get_cookies('https://gofile.io/').get('accountToken') + if token: + self._TOKEN = token.value + return + + account_data = self._download_json( + 'https://api.gofile.io/createAccount', None, note='Getting a new guest account') + self._TOKEN = account_data['data']['token'] + self._set_cookie('gofile.io', 'accountToken', self._TOKEN) + + def _entries(self, file_id): + files = self._download_json( + f'https://api.gofile.io/getContent?contentId={file_id}&token={self._TOKEN}&websiteToken=websiteToken&cache=true', + 'Gofile', note='Getting filelist') + + status = files['status'] + if status != 'ok': + raise ExtractorError(f'{self.IE_NAME} said: status {status}', expected=True) + + found_files = False + for file in (try_get(files, lambda x: x['data']['contents'], dict) or {}).values(): + file_type, file_format = file.get('mimetype').split('/', 1) + if file_type not in ('video', 'audio') and file_format != 'vnd.mts': + continue + + found_files = True + file_url = file.get('directLink') + if file_url: + yield { + 'id': file['id'], + 'title': file['name'].rsplit('.', 1)[0], + 'url': file_url, + 'filesize': file.get('size'), + 'release_timestamp': file.get('createTime') + } + + if not found_files: + raise ExtractorError('No video/audio found at provided URL.', expected=True) + + def _real_extract(self, url): + file_id = self._match_id(url) + return self.playlist_result(self._entries(file_id), playlist_id=file_id) diff --git a/yt_dlp/extractor/skeb.py b/yt_dlp/extractor/skeb.py new file mode 100644 index 000000000..81aecb311 --- /dev/null +++ b/yt_dlp/extractor/skeb.py @@ -0,0 +1,143 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError, determine_ext, parse_qs, traverse_obj + + +class SkebIE(InfoExtractor): + _VALID_URL = r'https?://skeb\.jp/@[^/]+/works/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://skeb.jp/@riiru_wm/works/10', + 'info_dict': { + 'id': '466853', + 'title': '内容はおまかせします! by 姫ノ森りぃる@一周年', + 'descripion': 'md5:1ec50901efc3437cfbfe3790468d532d', + 'uploader': '姫ノ森りぃる@一周年', + 'uploader_id': 'riiru_wm', + 'age_limit': 0, + 'tags': [], + 'url': r're:https://skeb.+', + 'thumbnail': r're:https://skeb.+', + 'subtitles': { + 'jpn': [{ + 'url': r're:https://skeb.+', + 'ext': 'vtt' + }] + }, + 'width': 720, + 'height': 405, + 'duration': 313, + 'fps': 30, + 'ext': 'mp4', + }, + }, { + 'url': 'https://skeb.jp/@furukawa_nob/works/3', + 'info_dict': { + 'id': '489408', + 'title': 'いつもお世話になってお... by 古川ノブ@音楽とVlo...', + 'descripion': 'md5:5adc2e41d06d33b558bf7b1faeb7b9c2', + 'uploader': '古川ノブ@音楽とVlogのVtuber', + 'uploader_id': 'furukawa_nob', + 'age_limit': 0, + 'tags': [ + 'よろしく', '大丈夫', 'お願い', 'でした', + '是非', 'O', 'バー', '遊び', 'おはよう', + 'オーバ', 'ボイス', + ], + 'url': r're:https://skeb.+', + 'thumbnail': r're:https://skeb.+', + 'subtitles': { + 'jpn': [{ + 'url': r're:https://skeb.+', + 'ext': 'vtt' + }] + }, + 'duration': 98, + 'ext': 'mp3', + 'vcodec': 'none', + 'abr': 128, + }, + }, { + 'url': 'https://skeb.jp/@mollowmollow/works/6', + 'info_dict': { + 'id': '6', + 'title': 'ヒロ。\n\n私のキャラク... by 諸々', + 'descripion': 'md5:aa6cbf2ba320b50bce219632de195f07', + '_type': 'playlist', + 'entries': [{ + 'id': '486430', + 'title': 'ヒロ。\n\n私のキャラク... by 諸々', + 'descripion': 'md5:aa6cbf2ba320b50bce219632de195f07', + }, { + 'id': '486431', + 'title': 'ヒロ。\n\n私のキャラク... by 諸々', + }] + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + nuxt_data = self._search_nuxt_data(self._download_webpage(url, video_id), video_id) + + parent = { + 'id': video_id, + 'title': nuxt_data.get('title'), + 'descripion': nuxt_data.get('description'), + 'uploader': traverse_obj(nuxt_data, ('creator', 'name')), + 'uploader_id': traverse_obj(nuxt_data, ('creator', 'screen_name')), + 'age_limit': 18 if nuxt_data.get('nsfw') else 0, + 'tags': nuxt_data.get('tag_list'), + } + + entries = [] + for item in nuxt_data.get('previews') or []: + vid_url = item.get('url') + given_ext = traverse_obj(item, ('information', 'extension')) + preview_ext = determine_ext(vid_url, default_ext=None) + if not preview_ext: + content_disposition = parse_qs(vid_url)['response-content-disposition'][0] + preview_ext = self._search_regex( + r'filename="[^"]+\.([^\.]+?)"', content_disposition, + 'preview file extension', fatal=False, group=1) + if preview_ext not in ('mp4', 'mp3'): + continue + if not vid_url or not item.get('id'): + continue + width, height = traverse_obj(item, ('information', 'width')), traverse_obj(item, ('information', 'height')) + if width is not None and height is not None: + # the longest side is at most 720px for non-client viewers + max_size = max(width, height) + width, height = list(x * 720 // max_size for x in (width, height)) + entries.append({ + **parent, + 'id': str(item['id']), + 'url': vid_url, + 'thumbnail': item.get('poster_url'), + 'subtitles': { + 'jpn': [{ + 'url': item.get('vtt_url'), + 'ext': 'vtt', + }] + } if item.get('vtt_url') else None, + 'width': width, + 'height': height, + 'duration': traverse_obj(item, ('information', 'duration')), + 'fps': traverse_obj(item, ('information', 'frame_rate')), + 'ext': preview_ext or given_ext, + 'vcodec': 'none' if preview_ext == 'mp3' else None, + # you'll always get 128kbps MP3 for non-client viewers + 'abr': 128 if preview_ext == 'mp3' else None, + }) + + if not entries: + raise ExtractorError('No video/audio attachment found in this commission.', expected=True) + elif len(entries) == 1: + return entries[0] + else: + parent.update({ + '_type': 'playlist', + 'entries': entries, + }) + return parent diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index df236c050..6f7f801e1 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -15,6 +15,7 @@ from ..utils import ( orderedSet, parse_codecs, qualities, + traverse_obj, try_get, unified_timestamp, update_url_query, @@ -135,19 +136,6 @@ class ZDFBaseIE(InfoExtractor): class ZDFIE(ZDFBaseIE): _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' _TESTS = [{ - # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html - 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', - 'md5': '34ec321e7eb34231fd88616c65c92db0', - 'info_dict': { - 'id': '210222_phx_nachgehakt_corona_protest', - 'ext': 'mp4', - 'title': 'Wohin führt der Protest in der Pandemie?', - 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd', - 'duration': 1691, - 'timestamp': 1613948400, - 'upload_date': '20210221', - }, - }, { # Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html 'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html', 'md5': '0aff3e7bc72c8813f5e0fae333316a1d', @@ -172,6 +160,18 @@ class ZDFIE(ZDFBaseIE): 'upload_date': '20160604', }, }, { + 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', + 'md5': '3d6f1049e9682178a11c54b91f3dd065', + 'info_dict': { + 'ext': 'mp4', + 'id': 'video_funk_1770473', + 'duration': 1278, + 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.', + 'title': 'Alles ist verzaubert', + 'timestamp': 1635520560, + 'upload_date': '20211029' + }, + }, { # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche 'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html', 'only_matching': True, @@ -192,6 +192,10 @@ class ZDFIE(ZDFBaseIE): }, { 'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html', 'only_matching': True, + }, { + # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html + 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html', + 'only_matching': True }] def _extract_entry(self, url, player, content, video_id): @@ -202,8 +206,9 @@ class ZDFIE(ZDFBaseIE): ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') if not ptmd_path: - ptmd_path = t[ - 'http://zdf.de/rels/streams/ptmd-template'].replace( + ptmd_path = traverse_obj( + t, ('streams', 'default', 'http://zdf.de/rels/streams/ptmd-template'), + 'http://zdf.de/rels/streams/ptmd-template').replace( '{playerId}', 'ngplayer_2_4') info = self._extract_ptmd( diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 120084046..85c7c8cda 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1461,20 +1461,29 @@ def parseOpts(overrideArguments=None): sponsorblock.add_option( '--sponsorblock-mark', metavar='CATS', dest='sponsorblock_mark', default=set(), action='callback', type='str', - callback=_set_from_options_callback, callback_kwargs={'allowed_values': SponsorBlockPP.CATEGORIES.keys()}, - help=( + callback=_set_from_options_callback, callback_kwargs={ + 'allowed_values': SponsorBlockPP.CATEGORIES.keys(), + 'aliases': {'default': ['all']} + }, help=( 'SponsorBlock categories to create chapters for, separated by commas. ' - 'Available categories are all, %s. You can prefix the category with a "-" to exempt it. ' - 'See https://wiki.sponsor.ajay.app/index.php/Segment_Categories for description of the categories. ' - 'Eg: --sponsorblock-mark all,-preview' % ', '.join(SponsorBlockPP.CATEGORIES.keys()))) + f'Available categories are all, default(=all), {", ".join(SponsorBlockPP.CATEGORIES.keys())}. ' + 'You can prefix the category with a "-" to exempt it. See [1] for description of the categories. ' + 'Eg: --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories')) sponsorblock.add_option( '--sponsorblock-remove', metavar='CATS', dest='sponsorblock_remove', default=set(), action='callback', type='str', - callback=_set_from_options_callback, callback_kwargs={'allowed_values': SponsorBlockPP.CATEGORIES.keys()}, - help=( + callback=_set_from_options_callback, callback_kwargs={ + 'allowed_values': set(SponsorBlockPP.CATEGORIES.keys()) - set(SponsorBlockPP.POI_CATEGORIES.keys()), + # Note: From https://wiki.sponsor.ajay.app/w/Types: + # The filler category is very aggressive. + # It is strongly recommended to not use this in a client by default. + 'aliases': {'default': ['all', '-filler']} + }, help=( 'SponsorBlock categories to be removed from the video file, separated by commas. ' 'If a category is present in both mark and remove, remove takes precedence. ' - 'The syntax and available categories are the same as for --sponsorblock-mark')) + 'The syntax and available categories are the same as for --sponsorblock-mark ' + 'except that "default" refers to "all,-filler" ' + f'and {", ".join(SponsorBlockPP.POI_CATEGORIES.keys())} is not available')) sponsorblock.add_option( '--sponsorblock-chapter-title', metavar='TEMPLATE', default=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, dest='sponsorblock_chapter_title', diff --git a/yt_dlp/postprocessor/modify_chapters.py b/yt_dlp/postprocessor/modify_chapters.py index 0728bdcf5..91433c364 100644 --- a/yt_dlp/postprocessor/modify_chapters.py +++ b/yt_dlp/postprocessor/modify_chapters.py @@ -24,7 +24,7 @@ class ModifyChaptersPP(FFmpegPostProcessor): *, sponsorblock_chapter_title=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, force_keyframes=False): FFmpegPostProcessor.__init__(self, downloader) self._remove_chapters_patterns = set(remove_chapters_patterns or []) - self._remove_sponsor_segments = set(remove_sponsor_segments or []) + self._remove_sponsor_segments = set(remove_sponsor_segments or []) - set(SponsorBlockPP.POI_CATEGORIES.keys()) self._ranges_to_remove = set(remove_ranges or []) self._sponsorblock_chapter_title = sponsorblock_chapter_title self._force_keyframes = force_keyframes @@ -302,7 +302,7 @@ class ModifyChaptersPP(FFmpegPostProcessor): 'name': SponsorBlockPP.CATEGORIES[category], 'category_names': [SponsorBlockPP.CATEGORIES[c] for c in cats] }) - c['title'] = self._downloader.evaluate_outtmpl(self._sponsorblock_chapter_title, c) + c['title'] = self._downloader.evaluate_outtmpl(self._sponsorblock_chapter_title, c.copy()) # Merge identically named sponsors. if (new_chapters and 'categories' in new_chapters[-1] and new_chapters[-1]['title'] == c['title']): diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py index 70c5462d1..cd48b15ae 100644 --- a/yt_dlp/postprocessor/sponsorblock.py +++ b/yt_dlp/postprocessor/sponsorblock.py @@ -10,18 +10,23 @@ from ..utils import PostProcessingError, network_exceptions, sanitized_Request class SponsorBlockPP(FFmpegPostProcessor): - + # https://wiki.sponsor.ajay.app/w/Types EXTRACTORS = { 'Youtube': 'YouTube', } + POI_CATEGORIES = { + 'poi_highlight': 'Highlight', + } CATEGORIES = { 'sponsor': 'Sponsor', 'intro': 'Intermission/Intro Animation', 'outro': 'Endcards/Credits', 'selfpromo': 'Unpaid/Self Promotion', - 'interaction': 'Interaction Reminder', 'preview': 'Preview/Recap', - 'music_offtopic': 'Non-Music Section' + 'filler': 'Filler Tangent', + 'interaction': 'Interaction Reminder', + 'music_offtopic': 'Non-Music Section', + **POI_CATEGORIES, } def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'): @@ -47,6 +52,9 @@ class SponsorBlockPP(FFmpegPostProcessor): # Ignore milliseconds difference at the start. if start_end[0] <= 1: start_end[0] = 0 + # Make POI chapters 1 sec so that we can properly mark them + if s['category'] in self.POI_CATEGORIES.keys(): + start_end[1] += 1 # Ignore milliseconds difference at the end. # Never allow the segment to exceed the video. if duration and duration - start_end[1] <= 1: diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 0ca7ed738..10c35cbb9 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -38,6 +38,7 @@ import time import traceback import xml.etree.ElementTree import zlib +import mimetypes from .compat import ( compat_HTMLParseError, @@ -4715,6 +4716,14 @@ def mimetype2ext(mt): return subtype.replace('+', '.') +def ext2mimetype(ext_or_url): + if not ext_or_url: + return None + if '.' not in ext_or_url: + ext_or_url = f'file.{ext_or_url}' + return mimetypes.guess_type(ext_or_url)[0] + + def parse_codecs(codecs_str): # http://tools.ietf.org/html/rfc6381 if not codecs_str: |