-rw-r--r--  test/test_postprocessors.py                 4
-rw-r--r--  yt_dlp/extractor/archiveorg.py            461
-rw-r--r--  yt_dlp/extractor/brightcove.py             10
-rw-r--r--  yt_dlp/extractor/cbc.py                    38
-rw-r--r--  yt_dlp/extractor/crackle.py                40
-rw-r--r--  yt_dlp/extractor/extractors.py              2
-rw-r--r--  yt_dlp/extractor/gofile.py                 83
-rw-r--r--  yt_dlp/extractor/skeb.py                  143
-rw-r--r--  yt_dlp/extractor/zdf.py                    35
-rw-r--r--  yt_dlp/options.py                          25
-rw-r--r--  yt_dlp/postprocessor/modify_chapters.py     4
-rw-r--r--  yt_dlp/postprocessor/sponsorblock.py       14
-rw-r--r--  yt_dlp/utils.py                             9
13 files changed, 713 insertions(+), 155 deletions(-)
diff --git a/test/test_postprocessors.py b/test/test_postprocessors.py
index 090c7b47b..bbe998993 100644
--- a/test/test_postprocessors.py
+++ b/test/test_postprocessors.py
@@ -124,11 +124,11 @@ class TestModifyChaptersPP(unittest.TestCase):
chapters = self._chapters([70], ['c']) + [
self._sponsor_chapter(10, 20, 'sponsor'),
self._sponsor_chapter(30, 40, 'preview'),
- self._sponsor_chapter(50, 60, 'sponsor')]
+ self._sponsor_chapter(50, 60, 'filler')]
expected = self._chapters(
[10, 20, 30, 40, 50, 60, 70],
['c', '[SponsorBlock]: Sponsor', 'c', '[SponsorBlock]: Preview/Recap',
- 'c', '[SponsorBlock]: Sponsor', 'c'])
+ 'c', '[SponsorBlock]: Filler Tangent', 'c'])
self._remove_marked_arrange_sponsors_test_impl(chapters, expected, [])
def test_remove_marked_arrange_sponsors_UniqueNamesForOverlappingSponsors(self):
diff --git a/yt_dlp/extractor/archiveorg.py b/yt_dlp/extractor/archiveorg.py
index d90fcb13a..467fe4875 100644
--- a/yt_dlp/extractor/archiveorg.py
+++ b/yt_dlp/extractor/archiveorg.py
@@ -3,33 +3,36 @@ from __future__ import unicode_literals
import re
import json
-
from .common import InfoExtractor
-from .youtube import YoutubeIE
+from .youtube import YoutubeIE, YoutubeBaseInfoExtractor
from ..compat import (
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
compat_HTTPError
)
from ..utils import (
+ bug_reports_message,
clean_html,
- determine_ext,
dict_get,
extract_attributes,
ExtractorError,
+ get_element_by_id,
HEADRequest,
int_or_none,
KNOWN_EXTENSIONS,
merge_dicts,
mimetype2ext,
+ orderedSet,
parse_duration,
parse_qs,
- RegexNotFoundError,
str_to_int,
str_or_none,
+ traverse_obj,
try_get,
unified_strdate,
unified_timestamp,
+ urlhandle_detect_ext,
+ url_or_none
)
@@ -262,12 +265,12 @@ class YoutubeWebArchiveIE(InfoExtractor):
_VALID_URL = r"""(?x)^
(?:https?://)?web\.archive\.org/
(?:web/)?
- (?:[0-9A-Za-z_*]+/)? # /web and the version index is optional
+ (?:(?P<date>[0-9]{14})?[0-9A-Za-z_*]*/)? # /web and the version index is optional
(?:https?(?::|%3[Aa])//)?
(?:
- (?:\w+\.)?youtube\.com/watch(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
- |(wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
+ (?:\w+\.)?youtube\.com(?::(?:80|443))?/watch(?:\.php)?(?:\?|%3[fF])(?:[^\#]+(?:&|%26))?v(?:=|%3[dD]) # Youtube URL
+ |(?:wayback-fakeurl\.archive\.org/yt/) # Or the internal fake url
)
(?P<id>[0-9A-Za-z_-]{11})(?:%26|\#|&|$)
"""
@@ -278,141 +281,391 @@ class YoutubeWebArchiveIE(InfoExtractor):
'info_dict': {
'id': 'aYAGB11YrSs',
'ext': 'webm',
- 'title': 'Team Fortress 2 - Sandviches!'
+ 'title': 'Team Fortress 2 - Sandviches!',
+ 'description': 'md5:4984c0f9a07f349fc5d8e82ab7af4eaf',
+ 'upload_date': '20110926',
+ 'uploader': 'Zeurel',
+ 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg',
+ 'duration': 32,
+ 'uploader_id': 'Zeurel',
+ 'uploader_url': 'http://www.youtube.com/user/Zeurel'
}
- },
- {
+ }, {
# Internal link
'url': 'https://web.archive.org/web/2oe/http://wayback-fakeurl.archive.org/yt/97t7Xj_iBv0',
'info_dict': {
'id': '97t7Xj_iBv0',
'ext': 'mp4',
- 'title': 'How Flexible Machines Could Save The World'
+ 'title': 'Why Machines That Bend Are Better',
+ 'description': 'md5:00404df2c632d16a674ff8df1ecfbb6c',
+ 'upload_date': '20190312',
+ 'uploader': 'Veritasium',
+ 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA',
+ 'duration': 771,
+ 'uploader_id': '1veritasium',
+ 'uploader_url': 'http://www.youtube.com/user/1veritasium'
}
- },
- {
- # Video from 2012, webm format itag 45.
+ }, {
+ # Video from 2012, webm format itag 45. The newest capture is of a deleted video with an invalid description.
+ # Should use the date in the link. Title ends with '- Youtube'. The capture has its description in eow-description.
'url': 'https://web.archive.org/web/20120712231619/http://www.youtube.com/watch?v=AkhihxRKcrs&gl=US&hl=en',
'info_dict': {
'id': 'AkhihxRKcrs',
'ext': 'webm',
- 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)'
+ 'title': 'Limited Run: Mondo\'s Modern Classic 1 of 3 (SDCC 2012)',
+ 'upload_date': '20120712',
+ 'duration': 398,
+ 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3',
+ 'uploader_id': 'machinima',
+ 'uploader_url': 'http://www.youtube.com/user/machinima'
}
- },
- {
- # Old flash-only video. Webpage title starts with "YouTube - ".
+ }, {
+ # FLV video. Video file URL does not provide itag information
'url': 'https://web.archive.org/web/20081211103536/http://www.youtube.com/watch?v=jNQXAC9IVRw',
'info_dict': {
'id': 'jNQXAC9IVRw',
- 'ext': 'unknown_video',
- 'title': 'Me at the zoo'
+ 'ext': 'flv',
+ 'title': 'Me at the zoo',
+ 'upload_date': '20050423',
+ 'channel_id': 'UC4QobU6STFB0P71PMvOGN5A',
+ 'duration': 19,
+ 'description': 'md5:10436b12e07ac43ff8df65287a56efb4',
+ 'uploader_id': 'jawed',
+ 'uploader_url': 'http://www.youtube.com/user/jawed'
}
- },
- {
- # Flash video with .flv extension (itag 34). Title has prefix "YouTube -"
- # Title has some weird unicode characters too.
+ }, {
'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA',
'info_dict': {
'id': 'lTx3G6h2xyA',
'ext': 'flv',
- 'title': '‪Madeon - Pop Culture (live mashup)‬‏'
+ 'title': 'Madeon - Pop Culture (live mashup)',
+ 'upload_date': '20110711',
+ 'uploader': 'Madeon',
+ 'channel_id': 'UCqMDNf3Pn5L7pcNkuSEeO3w',
+ 'duration': 204,
+ 'description': 'md5:f7535343b6eda34a314eff8b85444680',
+ 'uploader_id': 'itsmadeon',
+ 'uploader_url': 'http://www.youtube.com/user/itsmadeon'
}
- },
- { # Some versions of Youtube have have "YouTube" as page title in html (and later rewritten by js).
- 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
+ }, {
+ # First capture is of dead video, second is the oldest from CDX response.
+ 'url': 'https://web.archive.org/https://www.youtube.com/watch?v=1JYutPM8O6E',
'info_dict': {
- 'id': 'kH-G_aIBlFw',
+ 'id': '1JYutPM8O6E',
'ext': 'mp4',
- 'title': 'kH-G_aIBlFw'
- },
- 'expected_warnings': [
- 'unable to extract title',
- ]
- },
- {
- # First capture is a 302 redirect intermediary page.
- 'url': 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=0altSZ96U4M',
+ 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News',
+ 'upload_date': '20160218',
+ 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
+ 'duration': 1236,
+ 'description': 'md5:21032bae736421e89c2edf36d1936947',
+ 'uploader_id': 'MachinimaETC',
+ 'uploader_url': 'http://www.youtube.com/user/MachinimaETC'
+ }
+ }, {
+ # First capture is of a dead video; the capture date in the link points to a dead capture.
+ 'url': 'https://web.archive.org/web/20180803221945/https://www.youtube.com/watch?v=6FPhZJGvf4E',
'info_dict': {
- 'id': '0altSZ96U4M',
+ 'id': '6FPhZJGvf4E',
'ext': 'mp4',
- 'title': '0altSZ96U4M'
+ 'title': 'WTF: Video Games Still Launch BROKEN?! - T.U.G.S.',
+ 'upload_date': '20160219',
+ 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA',
+ 'duration': 798,
+ 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7',
+ 'uploader_id': 'MachinimaETC',
+ 'uploader_url': 'http://www.youtube.com/user/MachinimaETC'
},
'expected_warnings': [
- 'unable to extract title',
+ r'unable to download capture webpage \(it may not be archived\)'
]
- },
- {
+ }, { # Very old YouTube page, has - YouTube in title.
+ 'url': 'http://web.archive.org/web/20070302011044/http://youtube.com/watch?v=-06-KB9XTzg',
+ 'info_dict': {
+ 'id': '-06-KB9XTzg',
+ 'ext': 'flv',
+ 'title': 'New Coin Hack!! 100% Safe!!'
+ }
+ }, {
+ 'url': 'web.archive.org/https://www.youtube.com/watch?v=dWW7qP423y8',
+ 'info_dict': {
+ 'id': 'dWW7qP423y8',
+ 'ext': 'mp4',
+ 'title': 'It\'s Bootleg AirPods Time.',
+ 'upload_date': '20211021',
+ 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
+ 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug',
+ 'duration': 810,
+ 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc',
+ 'uploader': 'DankPods',
+ 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug',
+ 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug'
+ }
+ }, {
+ # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093
+ 'url': 'https://web.archive.org/web/20200827003909if_/http://www.youtube.com/watch?v=6Dh-RL__uN4',
+ 'info_dict': {
+ 'id': '6Dh-RL__uN4',
+ 'ext': 'mp4',
+ 'title': 'bitch lasagna',
+ 'upload_date': '20181005',
+ 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+ 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw',
+ 'duration': 135,
+ 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0',
+ 'uploader': 'PewDiePie',
+ 'uploader_id': 'PewDiePie',
+ 'uploader_url': 'http://www.youtube.com/user/PewDiePie'
+ }
+ }, {
+ 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw',
+ 'only_matching': True
+ }, {
+ 'url': 'https://web.archive.org/web/20050214000000_if/http://www.youtube.com/watch?v=0altSZ96U4M',
+ 'only_matching': True
+ }, {
# Video not archived, only capture is unavailable video page
'url': 'https://web.archive.org/web/20210530071008/https://www.youtube.com/watch?v=lHJTf93HL1s&spfreload=10',
- 'only_matching': True,
- },
- { # Encoded url
+ 'only_matching': True
+ }, { # Encoded url
'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fgl%3DUS%26v%3DAkhihxRKcrs%26hl%3Den',
- 'only_matching': True,
- },
- {
+ 'only_matching': True
+ }, {
'url': 'https://web.archive.org/web/20120712231619/http%3A//www.youtube.com/watch%3Fv%3DAkhihxRKcrs%26gl%3DUS%26hl%3Den',
- 'only_matching': True,
+ 'only_matching': True
+ }, {
+ 'url': 'https://web.archive.org/web/20060527081937/http://www.youtube.com:80/watch.php?v=ELTFsLT73fA&amp;search=soccer',
+ 'only_matching': True
+ }, {
+ 'url': 'https://web.archive.org/http://www.youtube.com:80/watch?v=-05VVye-ffg',
+ 'only_matching': True
}
]
+ _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE
+ _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE
+ _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE
+
+ _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers
+ _YT_ALL_THUMB_SERVERS = orderedSet(
+ _YT_DEFAULT_THUMB_SERVERS + ['img.youtube.com', *[f'{c}{n or ""}.ytimg.com' for c in ('i', 's') for n in (*range(0, 5), 9)]])
+
+ _WAYBACK_BASE_URL = 'https://web.archive.org/web/%sif_/'
+ _OLDEST_CAPTURE_DATE = 20050214000000
+ _NEWEST_CAPTURE_DATE = 20500101000000
+
+ def _call_cdx_api(self, item_id, url, filters: list = None, collapse: list = None, query: dict = None, note='Downloading CDX API JSON'):
+ # CDX docs: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md
+ query = {
+ 'url': url,
+ 'output': 'json',
+ 'fl': 'original,mimetype,length,timestamp',
+ 'limit': 500,
+ 'filter': ['statuscode:200'] + (filters or []),
+ 'collapse': collapse or [],
+ **(query or {})
+ }
+ res = self._download_json('https://web.archive.org/cdx/search/cdx', item_id, note, query=query)
+ if isinstance(res, list) and len(res) >= 2:
+ # format response to make it easier to use
+ return list(dict(zip(res[0], v)) for v in res[1:])
+ elif not isinstance(res, list) or len(res) != 0:
+ self.report_warning('Error while parsing CDX API response' + bug_reports_message())
+
+ def _extract_yt_initial_variable(self, webpage, regex, video_id, name):
+ return self._parse_json(self._search_regex(
+ (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE),
+ regex), webpage, name, default='{}'), video_id, fatal=False)
+
+ def _extract_webpage_title(self, webpage):
+ page_title = self._html_search_regex(
+ r'<title>([^<]*)</title>', webpage, 'title', default='')
+ # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix.
+ return self._html_search_regex(
+ r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
+ page_title, 'title', default='')
+
+ def _extract_metadata(self, video_id, webpage):
+
+ search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None))
+ player_response = self._extract_yt_initial_variable(
+ webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {}
+ initial_data = self._extract_yt_initial_variable(
+ webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial data') or {}
+
+ initial_data_video = traverse_obj(
+ initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'),
+ expected_type=dict, get_all=False, default={})
+
+ video_details = traverse_obj(
+ player_response, 'videoDetails', expected_type=dict, get_all=False, default={})
+
+ microformats = traverse_obj(
+ player_response, ('microformat', 'playerMicroformatRenderer'), expected_type=dict, get_all=False, default={})
+
+ video_title = (
+ video_details.get('title')
+ or YoutubeBaseInfoExtractor._get_text(microformats, 'title')
+ or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title')
+ or self._extract_webpage_title(webpage)
+ or search_meta(['og:title', 'twitter:title', 'title']))
+
+ channel_id = str_or_none(
+ video_details.get('channelId')
+ or microformats.get('externalChannelId')
+ or search_meta('channelId')
+ or self._search_regex(
+ r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', # @b45a9e6
+ webpage, 'channel id', default=None, group='id'))
+ channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None
+
+ duration = int_or_none(
+ video_details.get('lengthSeconds')
+ or microformats.get('lengthSeconds')
+ or parse_duration(search_meta('duration')))
+ description = (
+ video_details.get('shortDescription')
+ or YoutubeBaseInfoExtractor._get_text(microformats, 'description')
+ or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23
+ or search_meta(['description', 'og:description', 'twitter:description']))
+
+ uploader = video_details.get('author')
+
+ # Uploader ID and URL
+ uploader_mobj = re.search(
+ r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', # @fd05024
+ webpage)
+ if uploader_mobj is not None:
+ uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url')
+ else:
+ # @a6211d2
+ uploader_url = url_or_none(microformats.get('ownerProfileUrl'))
+ uploader_id = self._search_regex(
+ r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None)
+
+ upload_date = unified_strdate(
+ dict_get(microformats, ('uploadDate', 'publishDate'))
+ or search_meta(['uploadDate', 'datePublished'])
+ or self._search_regex(
+ [r'(?s)id="eow-date.*?>(.*?)</span>',
+ r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520
+ webpage, 'upload date', default=None))
+
+ return {
+ 'title': video_title,
+ 'description': description,
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'channel_id': channel_id,
+ 'channel_url': channel_url,
+ 'duration': duration,
+ 'uploader_url': uploader_url,
+ 'uploader_id': uploader_id,
+ }
+
+ def _extract_thumbnails(self, video_id):
+ try_all = 'thumbnails' in self._configuration_arg('check_all')
+ thumbnail_base_urls = ['http://{server}/vi{webp}/{video_id}'.format(
+ webp='_webp' if ext == 'webp' else '', video_id=video_id, server=server)
+ for server in (self._YT_ALL_THUMB_SERVERS if try_all else self._YT_DEFAULT_THUMB_SERVERS) for ext in (('jpg', 'webp') if try_all else ('jpg',))]
+
+ thumbnails = []
+ for url in thumbnail_base_urls:
+ response = self._call_cdx_api(
+ video_id, url, filters=['mimetype:image/(?:webp|jpeg)'],
+ collapse=['urlkey'], query={'matchType': 'prefix'})
+ if not response:
+ continue
+ thumbnails.extend(
+ {
+ 'url': (self._WAYBACK_BASE_URL % (int_or_none(thumbnail_dict.get('timestamp')) or self._OLDEST_CAPTURE_DATE)) + thumbnail_dict.get('original'),
+ 'filesize': int_or_none(thumbnail_dict.get('length')),
+ 'preference': int_or_none(thumbnail_dict.get('length'))
+ } for thumbnail_dict in response)
+ if not try_all:
+ break
+
+ self._remove_duplicate_formats(thumbnails)
+ return thumbnails
+
+ def _get_capture_dates(self, video_id, url_date):
+ capture_dates = []
+ # Note: CDX API will not find watch pages with extra params in the url.
+ response = self._call_cdx_api(
+ video_id, f'https://www.youtube.com/watch?v={video_id}',
+ filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 'prefix'}) or []
+ all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None])
+
+ # Prefer the new polymer UI captures as we support extracting more metadata from them
+ # WBM captures seem to all switch to this layout ~July 2020
+ modern_captures = list(filter(lambda x: x >= 20200701000000, all_captures))
+ if modern_captures:
+ capture_dates.append(modern_captures[0])
+ capture_dates.append(url_date)
+ if all_captures:
+ capture_dates.append(all_captures[0])
+
+ if 'captures' in self._configuration_arg('check_all'):
+ capture_dates.extend(modern_captures + all_captures)
+
+ # Fallbacks if any of the above fail
+ capture_dates.extend([self._OLDEST_CAPTURE_DATE, self._NEWEST_CAPTURE_DATE])
+ return orderedSet(capture_dates)
def _real_extract(self, url):
- video_id = self._match_id(url)
- title = video_id # if we are not able get a title
-
- def _extract_title(webpage):
- page_title = self._html_search_regex(
- r'<title>([^<]*)</title>', webpage, 'title', fatal=False) or ''
- # YouTube video pages appear to always have either 'YouTube -' as suffix or '- YouTube' as prefix.
- try:
- page_title = self._html_search_regex(
- r'(?:YouTube\s*-\s*(.*)$)|(?:(.*)\s*-\s*YouTube$)',
- page_title, 'title', default='')
- except RegexNotFoundError:
- page_title = None
-
- if not page_title:
- self.report_warning('unable to extract title', video_id=video_id)
- return
- return page_title
-
- # If the video is no longer available, the oldest capture may be one before it was removed.
- # Setting the capture date in url to early date seems to redirect to earliest capture.
- webpage = self._download_webpage(
- 'https://web.archive.org/web/20050214000000/http://www.youtube.com/watch?v=%s' % video_id,
- video_id=video_id, fatal=False, errnote='unable to download video webpage (probably not archived).')
- if webpage:
- title = _extract_title(webpage) or title
-
- # Use link translator mentioned in https://github.com/ytdl-org/youtube-dl/issues/13655
- internal_fake_url = 'https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id
+
+ url_date, video_id = self._match_valid_url(url).groups()
+
+ urlh = None
try:
- video_file_webpage = self._request_webpage(
- HEADRequest(internal_fake_url), video_id,
- note='Fetching video file url', expected_status=True)
+ urlh = self._request_webpage(
+ HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id),
+ video_id, note='Fetching archived video file url', expected_status=True)
except ExtractorError as e:
# HTTP Error 404 is expected if the video is not saved.
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404:
- raise ExtractorError(
- 'HTTP Error %s. Most likely the video is not archived or issue with web.archive.org.' % e.cause.code,
+ self.raise_no_formats(
+ 'The requested video is not archived, indexed, or there is an issue with web.archive.org',
expected=True)
- raise
- video_file_url = compat_urllib_parse_unquote(video_file_webpage.url)
- video_file_url_qs = parse_qs(video_file_url)
-
- # Attempt to recover any ext & format info from playback url
- format = {'url': video_file_url}
- itag = try_get(video_file_url_qs, lambda x: x['itag'][0])
- if itag and itag in YoutubeIE._formats: # Naughty access but it works
- format.update(YoutubeIE._formats[itag])
- format.update({'format_id': itag})
- else:
- mime = try_get(video_file_url_qs, lambda x: x['mime'][0])
- ext = mimetype2ext(mime) or determine_ext(video_file_url)
- format.update({'ext': ext})
- return {
- 'id': video_id,
- 'title': title,
- 'formats': [format],
- 'duration': str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0]))
- }
+ else:
+ raise
+
+ capture_dates = self._get_capture_dates(video_id, int_or_none(url_date))
+ self.write_debug('Captures to try: ' + ', '.join(str(i) for i in capture_dates if i is not None))
+ info = {'id': video_id}
+ for capture in capture_dates:
+ if not capture:
+ continue
+ webpage = self._download_webpage(
+ (self._WAYBACK_BASE_URL + 'http://www.youtube.com/watch?v=%s') % (capture, video_id),
+ video_id=video_id, fatal=False, errnote='unable to download capture webpage (it may not be archived)',
+ note='Downloading capture webpage')
+ current_info = self._extract_metadata(video_id, webpage or '')
+ # Try to avoid getting deleted video metadata
+ if current_info.get('title'):
+ info = merge_dicts(info, current_info)
+ if 'captures' not in self._configuration_arg('check_all'):
+ break
+
+ info['thumbnails'] = self._extract_thumbnails(video_id)
+
+ if urlh:
+ url = compat_urllib_parse_unquote(urlh.url)
+ video_file_url_qs = parse_qs(url)
+ # Attempt to recover any ext & format info from playback url & response headers
+ format = {'url': url, 'filesize': int_or_none(urlh.headers.get('x-archive-orig-content-length'))}
+ itag = try_get(video_file_url_qs, lambda x: x['itag'][0])
+ if itag and itag in YoutubeIE._formats:
+ format.update(YoutubeIE._formats[itag])
+ format.update({'format_id': itag})
+ else:
+ mime = try_get(video_file_url_qs, lambda x: x['mime'][0])
+ ext = (mimetype2ext(mime)
+ or urlhandle_detect_ext(urlh)
+ or mimetype2ext(urlh.headers.get('x-archive-guessed-content-type')))
+ format.update({'ext': ext})
+ info['formats'] = [format]
+ if not info.get('duration'):
+ info['duration'] = str_to_int(try_get(video_file_url_qs, lambda x: x['dur'][0]))
+
+ if not info.get('title'):
+ info['title'] = video_id
+ return info
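
For reference, the CDX lookup wrapped by the new _call_cdx_api can be reproduced with the stdlib alone. A minimal sketch, assuming only the query fields used above (the endpoint returns a JSON array whose first row is the field-name header):

import json
import urllib.parse
import urllib.request

def cdx_captures(url):
    query = urllib.parse.urlencode({
        'url': url,
        'output': 'json',
        'fl': 'original,mimetype,length,timestamp',
        'filter': 'statuscode:200',
        'limit': 10,
    })
    with urllib.request.urlopen('https://web.archive.org/cdx/search/cdx?' + query) as resp:
        rows = json.load(resp)
    # First row is the header; zip it against each result row, as the extractor does
    return [dict(zip(rows[0], row)) for row in rows[1:]] if len(rows) >= 2 else []

# e.g. cdx_captures('https://www.youtube.com/watch?v=jNQXAC9IVRw')
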
diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py
index cd1c3f01c..171739b46 100644
--- a/yt_dlp/extractor/brightcove.py
+++ b/yt_dlp/extractor/brightcove.py
@@ -16,6 +16,7 @@ from ..compat import (
)
from ..utils import (
clean_html,
+ dict_get,
extract_attributes,
ExtractorError,
find_xpath_attr,
@@ -577,11 +578,20 @@ class BrightcoveNewIE(AdobePassIE):
if duration is not None and duration <= 0:
is_live = True
+ common_res = [(160, 90), (320, 180), (480, 270), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)]
+ thumb_base_url = dict_get(json_data, ('poster', 'thumbnail'))
+ thumbnails = [{
+ 'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url),
+ 'width': w,
+ 'height': h,
+ } for w, h in common_res] if thumb_base_url else None
+
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
'description': clean_html(json_data.get('description')),
'thumbnail': json_data.get('thumbnail') or json_data.get('poster'),
+ 'thumbnails': thumbnails,
'duration': duration,
'timestamp': parse_iso8601(json_data.get('published_at')),
'uploader_id': json_data.get('account_id'),
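
The thumbnail list in this hunk is built by rewriting the single '<width>x<height>' token that Brightcove embeds in the poster/thumbnail CDN path. A sketch of that substitution, with a hypothetical base URL:

import re

common_res = [(160, 90), (320, 180), (480, 270), (640, 360)]
thumb_base_url = 'https://cf-images.example.com/image/160x90/match/image.jpg'  # hypothetical
thumbnails = [{
    'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url),  # swap the WxH token
    'width': w,
    'height': h,
} for w, h in common_res]
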
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index 413053499..392c77884 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -11,11 +11,13 @@ from ..compat import (
compat_str,
)
from ..utils import (
+ int_or_none,
+ join_nonempty,
js_to_json,
- smuggle_url,
- try_get,
orderedSet,
+ smuggle_url,
strip_or_none,
+ try_get,
ExtractorError,
)
@@ -313,6 +315,37 @@ class CBCGemIE(InfoExtractor):
return
self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token')
+ def _find_secret_formats(self, formats, video_id):
+ """ Find a valid video url and convert it to the secret variant """
+ base_format = next((f for f in formats if f.get('vcodec') != 'none'), None)
+ if not base_format:
+ return
+
+ base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', base_format['url'])
+ url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url)
+
+ secret_xml = self._download_xml(url, video_id, note='Downloading secret XML', fatal=False)
+ if not secret_xml:
+ return
+
+ for child in secret_xml:
+ if child.attrib.get('Type') != 'video':
+ continue
+ for video_quality in child:
+ bitrate = int_or_none(video_quality.attrib.get('Bitrate'))
+ if not bitrate or 'Index' not in video_quality.attrib:
+ continue
+ height = int_or_none(video_quality.attrib.get('MaxHeight'))
+
+ yield {
+ **base_format,
+ 'format_id': join_nonempty('sec', height),
+ 'url': re.sub(r'(QualityLevels\()\d+(\))', fr'\g<1>{bitrate}\2', base_url),
+ 'width': int_or_none(video_quality.attrib.get('MaxWidth')),
+ 'tbr': bitrate / 1000.0,
+ 'height': height,
+ }
+
def _real_extract(self, url):
video_id = self._match_id(url)
video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id)
@@ -335,6 +368,7 @@ class CBCGemIE(InfoExtractor):
formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
self._remove_duplicate_formats(formats)
+ formats.extend(self._find_secret_formats(formats, video_id))
for format in formats:
if format.get('vcodec') == 'none':
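
The two re.sub calls in _find_secret_formats strip the format= and filter= arguments out of a Smooth Streaming Manifest URL before requesting the unrestricted variant. Worked through on a hypothetical URL:

import re

url = 'https://example.cbc.ca/video.ism/Manifest(video,format=m3u8-aapl,filter=x-only)'  # hypothetical
base_url = re.sub(r'(Manifest\(.*?),filter=[\w-]+(.*?\))', r'\1\2', url)
url = re.sub(r'(Manifest\(.*?),format=[\w-]+(.*?\))', r'\1\2', base_url)
assert url == 'https://example.cbc.ca/video.ism/Manifest(video)'
# Note: the bitrate substitution later must use \g<1> rather than \1, since the
# replacement value starts with digits and \1<digits> would be read as one group number
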
diff --git a/yt_dlp/extractor/crackle.py b/yt_dlp/extractor/crackle.py
index 2c9d28d2e..db4962c42 100644
--- a/yt_dlp/extractor/crackle.py
+++ b/yt_dlp/extractor/crackle.py
@@ -23,32 +23,35 @@ from ..utils import (
class CrackleIE(InfoExtractor):
_VALID_URL = r'(?:crackle:|https?://(?:(?:www|m)\.)?(?:sony)?crackle\.com/(?:playlist/\d+/|(?:[^/]+/)+))(?P<id>\d+)'
_TESTS = [{
- # geo restricted to CA
- 'url': 'https://www.crackle.com/andromeda/2502343',
+ # Crackle is available in the United States and territories
+ 'url': 'https://www.crackle.com/thanksgiving/2510064',
'info_dict': {
- 'id': '2502343',
+ 'id': '2510064',
'ext': 'mp4',
- 'title': 'Under The Night',
- 'description': 'md5:d2b8ca816579ae8a7bf28bfff8cefc8a',
- 'duration': 2583,
+ 'title': 'Touch Football',
+ 'description': 'md5:cfbb513cf5de41e8b56d7ab756cff4df',
+ 'duration': 1398,
'view_count': int,
'average_rating': 0,
- 'age_limit': 14,
- 'genre': 'Action, Sci-Fi',
- 'creator': 'Allan Kroeker',
- 'artist': 'Keith Hamilton Cobb, Kevin Sorbo, Lisa Ryder, Lexa Doig, Robert Hewitt Wolfe',
- 'release_year': 2000,
- 'series': 'Andromeda',
- 'episode': 'Under The Night',
+ 'age_limit': 17,
+ 'genre': 'Comedy',
+ 'creator': 'Daniel Powell',
+ 'artist': 'Chris Elliott, Amy Sedaris',
+ 'release_year': 2016,
+ 'series': 'Thanksgiving',
+ 'episode': 'Touch Football',
'season_number': 1,
'episode_number': 1,
},
'params': {
# m3u8 download
'skip_download': True,
- }
+ },
+ 'expected_warnings': [
+ 'Trying with a list of known countries'
+ ],
}, {
- 'url': 'https://www.sonycrackle.com/andromeda/2502343',
+ 'url': 'https://www.sonycrackle.com/thanksgiving/2510064',
'only_matching': True,
}]
@@ -129,7 +132,6 @@ class CrackleIE(InfoExtractor):
break
ignore_no_formats = self.get_param('ignore_no_formats_error')
- allow_unplayable_formats = self.get_param('allow_unplayable_formats')
if not media or (not media.get('MediaURLs') and not ignore_no_formats):
raise ExtractorError(
@@ -143,9 +145,9 @@ class CrackleIE(InfoExtractor):
for e in media.get('MediaURLs') or []:
if e.get('UseDRM'):
has_drm = True
- if not allow_unplayable_formats:
- continue
- format_url = url_or_none(e.get('Path'))
+ format_url = url_or_none(e.get('DRMPath'))
+ else:
+ format_url = url_or_none(e.get('Path'))
if not format_url:
continue
ext = determine_ext(format_url)
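
The DRM handling above no longer skips protected entries; it selects the DRMPath variant instead. The selection in isolation (the entry dict is illustrative):

e = {'UseDRM': True, 'DRMPath': 'https://example.com/drm.m3u8', 'Path': 'https://example.com/clear.m3u8'}  # illustrative
format_url = e.get('DRMPath') if e.get('UseDRM') else e.get('Path')  # picks the DRM variant here
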
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 572c32751..8d7c54ec4 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -523,6 +523,7 @@ from .globo import (
)
from .go import GoIE
from .godtube import GodTubeIE
+from .gofile import GofileIE
from .golem import GolemIE
from .googledrive import GoogleDriveIE
from .googlepodcasts import (
@@ -1315,6 +1316,7 @@ from .simplecast import (
)
from .sina import SinaIE
from .sixplay import SixPlayIE
+from .skeb import SkebIE
from .skyit import (
SkyItPlayerIE,
SkyItVideoIE,
diff --git a/yt_dlp/extractor/gofile.py b/yt_dlp/extractor/gofile.py
new file mode 100644
index 000000000..62d778cfe
--- /dev/null
+++ b/yt_dlp/extractor/gofile.py
@@ -0,0 +1,83 @@
+# coding: utf-8
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ try_get
+)
+
+
+class GofileIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?gofile\.io/d/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://gofile.io/d/AMZyDw',
+ 'info_dict': {
+ 'id': 'AMZyDw',
+ },
+ 'playlist_mincount': 2,
+ 'playlist': [{
+ 'info_dict': {
+ 'id': 'de571ac1-5edc-42e2-8ec2-bdac83ad4a31',
+ 'filesize': 928116,
+ 'ext': 'mp4',
+ 'title': 'nuuh'
+ }
+ }]
+ }, { # URL to test mixed file types
+ 'url': 'https://gofile.io/d/avt34h',
+ 'info_dict': {
+ 'id': 'avt34h',
+ },
+ 'playlist_mincount': 1,
+ }, { # URL to test no video/audio error
+ 'url': 'https://gofile.io/d/aB03lZ',
+ 'info_dict': {
+ 'id': 'aB03lZ',
+ },
+ 'playlist_count': 0,
+ 'skip': 'No video/audio found at provided URL.',
+ }]
+ _TOKEN = None
+
+ def _real_initialize(self):
+ token = self._get_cookies('https://gofile.io/').get('accountToken')
+ if token:
+ self._TOKEN = token.value
+ return
+
+ account_data = self._download_json(
+ 'https://api.gofile.io/createAccount', None, note='Getting a new guest account')
+ self._TOKEN = account_data['data']['token']
+ self._set_cookie('gofile.io', 'accountToken', self._TOKEN)
+
+ def _entries(self, file_id):
+ files = self._download_json(
+ f'https://api.gofile.io/getContent?contentId={file_id}&token={self._TOKEN}&websiteToken=websiteToken&cache=true',
+ file_id, note='Getting filelist')
+
+ status = files['status']
+ if status != 'ok':
+ raise ExtractorError(f'{self.IE_NAME} said: status {status}', expected=True)
+
+ found_files = False
+ for file in (try_get(files, lambda x: x['data']['contents'], dict) or {}).values():
+ file_type, file_format = file.get('mimetype').split('/', 1)
+ if file_type not in ('video', 'audio') and file_format != 'vnd.mts':
+ continue
+
+ found_files = True
+ file_url = file.get('directLink')
+ if file_url:
+ yield {
+ 'id': file['id'],
+ 'title': file['name'].rsplit('.', 1)[0],
+ 'url': file_url,
+ 'filesize': file.get('size'),
+ 'release_timestamp': file.get('createTime')
+ }
+
+ if not found_files:
+ raise ExtractorError('No video/audio found at provided URL.', expected=True)
+
+ def _real_extract(self, url):
+ file_id = self._match_id(url)
+ return self.playlist_result(self._entries(file_id), playlist_id=file_id)
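
A standalone sketch of the guest-account flow the new extractor relies on, using the same two endpoints but without the cookie caching or error handling:

import json
import urllib.request

def gofile_contents(file_id):
    # A fresh guest account supplies the token that getContent requires
    with urllib.request.urlopen('https://api.gofile.io/createAccount') as resp:
        token = json.load(resp)['data']['token']
    url = ('https://api.gofile.io/getContent?contentId=%s&token=%s'
           '&websiteToken=websiteToken&cache=true' % (file_id, token))
    with urllib.request.urlopen(url) as resp:
        return json.load(resp).get('data', {}).get('contents', {})
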
diff --git a/yt_dlp/extractor/skeb.py b/yt_dlp/extractor/skeb.py
new file mode 100644
index 000000000..81aecb311
--- /dev/null
+++ b/yt_dlp/extractor/skeb.py
@@ -0,0 +1,143 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError, determine_ext, parse_qs, traverse_obj
+
+
+class SkebIE(InfoExtractor):
+ _VALID_URL = r'https?://skeb\.jp/@[^/]+/works/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'https://skeb.jp/@riiru_wm/works/10',
+ 'info_dict': {
+ 'id': '466853',
+ 'title': '内容はおまかせします! by 姫ノ森りぃる@一周年',
+ 'description': 'md5:1ec50901efc3437cfbfe3790468d532d',
+ 'uploader': '姫ノ森りぃる@一周年',
+ 'uploader_id': 'riiru_wm',
+ 'age_limit': 0,
+ 'tags': [],
+ 'url': r're:https://skeb.+',
+ 'thumbnail': r're:https://skeb.+',
+ 'subtitles': {
+ 'jpn': [{
+ 'url': r're:https://skeb.+',
+ 'ext': 'vtt'
+ }]
+ },
+ 'width': 720,
+ 'height': 405,
+ 'duration': 313,
+ 'fps': 30,
+ 'ext': 'mp4',
+ },
+ }, {
+ 'url': 'https://skeb.jp/@furukawa_nob/works/3',
+ 'info_dict': {
+ 'id': '489408',
+ 'title': 'いつもお世話になってお... by 古川ノブ@音楽とVlo...',
+ 'description': 'md5:5adc2e41d06d33b558bf7b1faeb7b9c2',
+ 'uploader': '古川ノブ@音楽とVlogのVtuber',
+ 'uploader_id': 'furukawa_nob',
+ 'age_limit': 0,
+ 'tags': [
+ 'よろしく', '大丈夫', 'お願い', 'でした',
+ '是非', 'O', 'バー', '遊び', 'おはよう',
+ 'オーバ', 'ボイス',
+ ],
+ 'url': r're:https://skeb.+',
+ 'thumbnail': r're:https://skeb.+',
+ 'subtitles': {
+ 'jpn': [{
+ 'url': r're:https://skeb.+',
+ 'ext': 'vtt'
+ }]
+ },
+ 'duration': 98,
+ 'ext': 'mp3',
+ 'vcodec': 'none',
+ 'abr': 128,
+ },
+ }, {
+ 'url': 'https://skeb.jp/@mollowmollow/works/6',
+ 'info_dict': {
+ 'id': '6',
+ 'title': 'ヒロ。\n\n私のキャラク... by 諸々',
+ 'description': 'md5:aa6cbf2ba320b50bce219632de195f07',
+ '_type': 'playlist',
+ 'entries': [{
+ 'id': '486430',
+ 'title': 'ヒロ。\n\n私のキャラク... by 諸々',
+ 'description': 'md5:aa6cbf2ba320b50bce219632de195f07',
+ }, {
+ 'id': '486431',
+ 'title': 'ヒロ。\n\n私のキャラク... by 諸々',
+ }]
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ nuxt_data = self._search_nuxt_data(self._download_webpage(url, video_id), video_id)
+
+ parent = {
+ 'id': video_id,
+ 'title': nuxt_data.get('title'),
+ 'description': nuxt_data.get('description'),
+ 'uploader': traverse_obj(nuxt_data, ('creator', 'name')),
+ 'uploader_id': traverse_obj(nuxt_data, ('creator', 'screen_name')),
+ 'age_limit': 18 if nuxt_data.get('nsfw') else 0,
+ 'tags': nuxt_data.get('tag_list'),
+ }
+
+ entries = []
+ for item in nuxt_data.get('previews') or []:
+ vid_url = item.get('url')
+ if not vid_url or not item.get('id'):
+ continue
+ given_ext = traverse_obj(item, ('information', 'extension'))
+ preview_ext = determine_ext(vid_url, default_ext=None)
+ if not preview_ext:
+ content_disposition = parse_qs(vid_url)['response-content-disposition'][0]
+ preview_ext = self._search_regex(
+ r'filename="[^"]+\.([^\.]+?)"', content_disposition,
+ 'preview file extension', fatal=False, group=1)
+ if preview_ext not in ('mp4', 'mp3'):
+ continue
+ width, height = traverse_obj(item, ('information', 'width')), traverse_obj(item, ('information', 'height'))
+ if width is not None and height is not None:
+ # the longest side is at most 720px for non-client viewers
+ max_size = max(width, height)
+ width, height = list(x * 720 // max_size for x in (width, height))
+ entries.append({
+ **parent,
+ 'id': str(item['id']),
+ 'url': vid_url,
+ 'thumbnail': item.get('poster_url'),
+ 'subtitles': {
+ 'jpn': [{
+ 'url': item.get('vtt_url'),
+ 'ext': 'vtt',
+ }]
+ } if item.get('vtt_url') else None,
+ 'width': width,
+ 'height': height,
+ 'duration': traverse_obj(item, ('information', 'duration')),
+ 'fps': traverse_obj(item, ('information', 'frame_rate')),
+ 'ext': preview_ext or given_ext,
+ 'vcodec': 'none' if preview_ext == 'mp3' else None,
+ # you'll always get 128kbps MP3 for non-client viewers
+ 'abr': 128 if preview_ext == 'mp3' else None,
+ })
+
+ if not entries:
+ raise ExtractorError('No video/audio attachment found in this commission.', expected=True)
+ elif len(entries) == 1:
+ return entries[0]
+ else:
+ parent.update({
+ '_type': 'playlist',
+ 'entries': entries,
+ })
+ return parent
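
When a preview URL has no usable path extension, the extractor falls back to the filename carried in the signed response-content-disposition query parameter. That recovery step in isolation, with a hypothetical URL:

import re
import urllib.parse

vid_url = ('https://skeb.example.net/preview?response-content-disposition='
           + urllib.parse.quote('attachment; filename="work.mp4"'))  # hypothetical
qs = urllib.parse.parse_qs(urllib.parse.urlparse(vid_url).query)
content_disposition = qs['response-content-disposition'][0]
preview_ext = re.search(r'filename="[^"]+\.([^\.]+?)"', content_disposition).group(1)
assert preview_ext == 'mp4'
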
diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py
index df236c050..6f7f801e1 100644
--- a/yt_dlp/extractor/zdf.py
+++ b/yt_dlp/extractor/zdf.py
@@ -15,6 +15,7 @@ from ..utils import (
orderedSet,
parse_codecs,
qualities,
+ traverse_obj,
try_get,
unified_timestamp,
update_url_query,
@@ -135,19 +136,6 @@ class ZDFBaseIE(InfoExtractor):
class ZDFIE(ZDFBaseIE):
_VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html'
_TESTS = [{
- # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html
- 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html',
- 'md5': '34ec321e7eb34231fd88616c65c92db0',
- 'info_dict': {
- 'id': '210222_phx_nachgehakt_corona_protest',
- 'ext': 'mp4',
- 'title': 'Wohin führt der Protest in der Pandemie?',
- 'description': 'md5:7d643fe7f565e53a24aac036b2122fbd',
- 'duration': 1691,
- 'timestamp': 1613948400,
- 'upload_date': '20210221',
- },
- }, {
# Same as https://www.3sat.de/film/ab-18/10-wochen-sommer-108.html
'url': 'https://www.zdf.de/dokumentation/ab-18/10-wochen-sommer-102.html',
'md5': '0aff3e7bc72c8813f5e0fae333316a1d',
@@ -172,6 +160,18 @@ class ZDFIE(ZDFBaseIE):
'upload_date': '20160604',
},
}, {
+ 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html',
+ 'md5': '3d6f1049e9682178a11c54b91f3dd065',
+ 'info_dict': {
+ 'ext': 'mp4',
+ 'id': 'video_funk_1770473',
+ 'duration': 1278,
+ 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.',
+ 'title': 'Alles ist verzaubert',
+ 'timestamp': 1635520560,
+ 'upload_date': '20211029'
+ },
+ }, {
# Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche
'url': 'https://www.zdf.de/politik/phoenix-sendungen/die-gesten-der-maechtigen-100.html',
'only_matching': True,
@@ -192,6 +192,10 @@ class ZDFIE(ZDFBaseIE):
}, {
'url': 'https://www.zdf.de/dokumentation/planet-e/planet-e-uebersichtsseite-weitere-dokumentationen-von-planet-e-100.html',
'only_matching': True,
+ }, {
+ # Same as https://www.phoenix.de/sendungen/ereignisse/corona-nachgehakt/wohin-fuehrt-der-protest-in-der-pandemie-a-2050630.html
+ 'url': 'https://www.zdf.de/politik/phoenix-sendungen/wohin-fuehrt-der-protest-in-der-pandemie-100.html',
+ 'only_matching': True
}]
def _extract_entry(self, url, player, content, video_id):
@@ -202,8 +206,9 @@ class ZDFIE(ZDFBaseIE):
ptmd_path = t.get('http://zdf.de/rels/streams/ptmd')
if not ptmd_path:
- ptmd_path = t[
- 'http://zdf.de/rels/streams/ptmd-template'].replace(
+ ptmd_path = traverse_obj(
+ t, ('streams', 'default', 'http://zdf.de/rels/streams/ptmd-template'),
+ 'http://zdf.de/rels/streams/ptmd-template').replace(
'{playerId}', 'ngplayer_2_4')
info = self._extract_ptmd(
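
The traverse_obj call above tries the nested streams/default path first and falls back to the top-level key, replacing the old direct lookup that raised KeyError on the nested layout. Illustrated with a stub dict (runnable from inside the repo):

from yt_dlp.utils import traverse_obj

t = {'streams': {'default': {'http://zdf.de/rels/streams/ptmd-template': '/ptmd/{playerId}/x'}}}  # stub
ptmd_path = traverse_obj(
    t, ('streams', 'default', 'http://zdf.de/rels/streams/ptmd-template'),
    'http://zdf.de/rels/streams/ptmd-template')  # first path that resolves wins
assert ptmd_path == '/ptmd/{playerId}/x'
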
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index 120084046..85c7c8cda 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -1461,20 +1461,29 @@ def parseOpts(overrideArguments=None):
sponsorblock.add_option(
'--sponsorblock-mark', metavar='CATS',
dest='sponsorblock_mark', default=set(), action='callback', type='str',
- callback=_set_from_options_callback, callback_kwargs={'allowed_values': SponsorBlockPP.CATEGORIES.keys()},
- help=(
+ callback=_set_from_options_callback, callback_kwargs={
+ 'allowed_values': SponsorBlockPP.CATEGORIES.keys(),
+ 'aliases': {'default': ['all']}
+ }, help=(
'SponsorBlock categories to create chapters for, separated by commas. '
- 'Available categories are all, %s. You can prefix the category with a "-" to exempt it. '
- 'See https://wiki.sponsor.ajay.app/index.php/Segment_Categories for description of the categories. '
- 'Eg: --sponsorblock-mark all,-preview' % ', '.join(SponsorBlockPP.CATEGORIES.keys())))
+ f'Available categories are all, default(=all), {", ".join(SponsorBlockPP.CATEGORIES.keys())}. '
+ 'You can prefix the category with a "-" to exempt it. See [1] for description of the categories. '
+ 'Eg: --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories'))
sponsorblock.add_option(
'--sponsorblock-remove', metavar='CATS',
dest='sponsorblock_remove', default=set(), action='callback', type='str',
- callback=_set_from_options_callback, callback_kwargs={'allowed_values': SponsorBlockPP.CATEGORIES.keys()},
- help=(
+ callback=_set_from_options_callback, callback_kwargs={
+ 'allowed_values': set(SponsorBlockPP.CATEGORIES.keys()) - set(SponsorBlockPP.POI_CATEGORIES.keys()),
+ # Note: From https://wiki.sponsor.ajay.app/w/Types:
+ # The filler category is very aggressive.
+ # It is strongly recommended to not use this in a client by default.
+ 'aliases': {'default': ['all', '-filler']}
+ }, help=(
'SponsorBlock categories to be removed from the video file, separated by commas. '
'If a category is present in both mark and remove, remove takes precedence. '
- 'The syntax and available categories are the same as for --sponsorblock-mark'))
+ 'The syntax and available categories are the same as for --sponsorblock-mark '
+ 'except that "default" refers to "all,-filler" '
+ f'and {", ".join(SponsorBlockPP.POI_CATEGORIES.keys())} is not available'))
sponsorblock.add_option(
'--sponsorblock-chapter-title', metavar='TEMPLATE',
default=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, dest='sponsorblock_chapter_title',
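
Net effect of the new 'default' aliases: for --sponsorblock-mark it expands to all, while for --sponsorblock-remove it expands to all,-filler, and the POI categories are excluded from removal entirely. As set arithmetic over the category keys:

ALL = {'sponsor', 'intro', 'outro', 'selfpromo', 'preview', 'filler',
       'interaction', 'music_offtopic', 'poi_highlight'}
POI = {'poi_highlight'}
mark_default = set(ALL)                       # --sponsorblock-mark default == all
remove_allowed = ALL - POI                    # poi_highlight can never be removed
remove_default = remove_allowed - {'filler'}  # --sponsorblock-remove default == all,-filler
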
diff --git a/yt_dlp/postprocessor/modify_chapters.py b/yt_dlp/postprocessor/modify_chapters.py
index 0728bdcf5..91433c364 100644
--- a/yt_dlp/postprocessor/modify_chapters.py
+++ b/yt_dlp/postprocessor/modify_chapters.py
@@ -24,7 +24,7 @@ class ModifyChaptersPP(FFmpegPostProcessor):
*, sponsorblock_chapter_title=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, force_keyframes=False):
FFmpegPostProcessor.__init__(self, downloader)
self._remove_chapters_patterns = set(remove_chapters_patterns or [])
- self._remove_sponsor_segments = set(remove_sponsor_segments or [])
+ self._remove_sponsor_segments = set(remove_sponsor_segments or []) - set(SponsorBlockPP.POI_CATEGORIES.keys())
self._ranges_to_remove = set(remove_ranges or [])
self._sponsorblock_chapter_title = sponsorblock_chapter_title
self._force_keyframes = force_keyframes
@@ -302,7 +302,7 @@ class ModifyChaptersPP(FFmpegPostProcessor):
'name': SponsorBlockPP.CATEGORIES[category],
'category_names': [SponsorBlockPP.CATEGORIES[c] for c in cats]
})
- c['title'] = self._downloader.evaluate_outtmpl(self._sponsorblock_chapter_title, c)
+ c['title'] = self._downloader.evaluate_outtmpl(self._sponsorblock_chapter_title, c.copy())
# Merge identically named sponsors.
if (new_chapters and 'categories' in new_chapters[-1]
and new_chapters[-1]['title'] == c['title']):
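
On the c.copy() change: the intent appears to be keeping the chapter dict pristine in case template evaluation mutates the mapping it receives. A sketch of that hazard with a stand-in evaluator (the mutation shown is hypothetical, not the actual evaluate_outtmpl behaviour):

def evaluate_outtmpl(tmpl, info):
    info.setdefault('title', '')  # hypothetical in-place mutation
    return tmpl % info

c = {'category': 'sponsor'}
evaluate_outtmpl('%(category)s', c.copy())
assert 'title' not in c  # the original chapter dict is untouched
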
diff --git a/yt_dlp/postprocessor/sponsorblock.py b/yt_dlp/postprocessor/sponsorblock.py
index 70c5462d1..cd48b15ae 100644
--- a/yt_dlp/postprocessor/sponsorblock.py
+++ b/yt_dlp/postprocessor/sponsorblock.py
@@ -10,18 +10,23 @@ from ..utils import PostProcessingError, network_exceptions, sanitized_Request
class SponsorBlockPP(FFmpegPostProcessor):
-
+ # https://wiki.sponsor.ajay.app/w/Types
EXTRACTORS = {
'Youtube': 'YouTube',
}
+ POI_CATEGORIES = {
+ 'poi_highlight': 'Highlight',
+ }
CATEGORIES = {
'sponsor': 'Sponsor',
'intro': 'Intermission/Intro Animation',
'outro': 'Endcards/Credits',
'selfpromo': 'Unpaid/Self Promotion',
- 'interaction': 'Interaction Reminder',
'preview': 'Preview/Recap',
- 'music_offtopic': 'Non-Music Section'
+ 'filler': 'Filler Tangent',
+ 'interaction': 'Interaction Reminder',
+ 'music_offtopic': 'Non-Music Section',
+ **POI_CATEGORIES,
}
def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'):
@@ -47,6 +52,9 @@ class SponsorBlockPP(FFmpegPostProcessor):
# Ignore milliseconds difference at the start.
if start_end[0] <= 1:
start_end[0] = 0
+ # Make POI chapters 1 sec so that we can properly mark them
+ if s['category'] in self.POI_CATEGORIES.keys():
+ start_end[1] += 1
# Ignore milliseconds difference at the end.
# Never allow the segment to exceed the video.
if duration and duration - start_end[1] <= 1:
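
The POI adjustment in this hunk, in isolation: highlight segments arrive as zero-length points, so the end is pushed out one second to yield a chapter that can actually be marked (values illustrative):

POI_CATEGORIES = {'poi_highlight': 'Highlight'}
s = {'category': 'poi_highlight', 'segment': [42.0, 42.0]}  # illustrative
start_end = s['segment']
if s['category'] in POI_CATEGORIES.keys():
    start_end[1] += 1
assert start_end == [42.0, 43.0]  # now a 1-second chapter
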
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 0ca7ed738..10c35cbb9 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -38,6 +38,7 @@ import time
import traceback
import xml.etree.ElementTree
import zlib
+import mimetypes
from .compat import (
compat_HTMLParseError,
@@ -4715,6 +4716,14 @@ def mimetype2ext(mt):
return subtype.replace('+', '.')
+def ext2mimetype(ext_or_url):
+ if not ext_or_url:
+ return None
+ if '.' not in ext_or_url:
+ ext_or_url = f'file.{ext_or_url}'
+ return mimetypes.guess_type(ext_or_url)[0]
+
+
def parse_codecs(codecs_str):
# http://tools.ietf.org/html/rfc6381
if not codecs_str:
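
Usage of the new ext2mimetype helper: a bare extension is wrapped in a dummy filename so the stdlib mimetypes table can resolve it, while URLs and paths pass through as-is. For example:

import mimetypes

def ext2mimetype(ext_or_url):  # as added above
    if not ext_or_url:
        return None
    if '.' not in ext_or_url:
        ext_or_url = f'file.{ext_or_url}'
    return mimetypes.guess_type(ext_or_url)[0]

assert ext2mimetype('mp4') == 'video/mp4'
assert ext2mimetype('https://example.com/audio.mp3') == 'audio/mpeg'
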