diff options
24 files changed, 844 insertions, 423 deletions
diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 54911fcc5..e1c04d319 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -55,6 +55,7 @@ - **Aparat** - **AppleConnect** - **AppleDaily**: 臺灣蘋果日報 + - **ApplePodcasts** - **appletrailers** - **appletrailers:section** - **archive.org**: archive.org videos @@ -99,6 +100,10 @@ - **BellMedia** - **Bet** - **bfi:player** + - **bfmtv** + - **bfmtv:article** + - **bfmtv:live** + - **BibelTV** - **Bigflix** - **Bild**: Bild.de - **BiliBili** @@ -346,6 +351,8 @@ - **Go** - **GodTube** - **Golem** + - **google:podcasts** + - **google:podcasts:feed** - **GoogleDrive** - **Goshgay** - **GPUTechConf** @@ -381,6 +388,8 @@ - **HungamaSong** - **Hypem** - **ign.com** + - **IHeartRadio** + - **iheartradio:podcast** - **imdb**: Internet Movie Database trailers - **imdb:list**: Internet Movie Database lists - **Imgur** @@ -706,7 +715,6 @@ - **Playwire** - **pluralsight** - **pluralsight:course** - - **plus.google**: Google Plus - **podomatic** - **Pokemon** - **PokemonWatch** @@ -1146,7 +1154,7 @@ - **WWE** - **XBef** - **XboxClips** - - **XFileShare**: XFileShare based sites: ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing + - **XFileShare**: XFileShare based sites: Aparat, ClipWatching, GoUnlimited, GoVid, HolaVid, Streamty, TheVideoBee, Uqload, VidBom, vidlo, VidLocker, VidShare, VUp, XVideoSharing - **XHamster** - **XHamsterEmbed** - **XHamsterUser** diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 86e20cb4b..0014d57b6 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -264,16 +264,24 @@ class TestNRKSubtitles(BaseTestSubtitles): class TestRaiPlaySubtitles(BaseTestSubtitles): - url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html' IE = RaiPlayIE - def test_allsubtitles(self): + def test_subtitles_key(self): + self.url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['it'])) self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') + def test_subtitles_array_key(self): + self.url = 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['it'])) + self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd') + class TestVikiSubtitles(BaseTestSubtitles): url = 'http://www.viki.com/videos/1060846v-punch-episode-18' diff --git a/test/test_utils.py b/test/test_utils.py index bb69b0522..a0f78ebe1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -21,6 +21,7 @@ from youtube_dlc.utils import ( encode_base_n, caesar, clean_html, + clean_podcast_url, date_from_str, DateRange, detect_exe_version, @@ -1497,6 +1498,10 @@ Line 1 iri_to_uri('http://导航.中国/'), 'http://xn--fet810g.xn--fiqs8s/') + def test_clean_podcast_url(self): + self.assertEqual(clean_podcast_url('https://www.podtrac.com/pts/redirect.mp3/chtbl.com/track/5899E/traffic.megaphone.fm/HSW7835899191.mp3'), 'https://traffic.megaphone.fm/HSW7835899191.mp3') + self.assertEqual(clean_podcast_url('https://play.podtrac.com/npr-344098539/edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3'), 'https://edge1.pod.npr.org/anon.npr-podcasts/podcast/npr/waitwait/2020/10/20201003_waitwait_wwdtmpodcast201003-015621a5-f035-4eca-a9a1-7c118d90bc3c.mp3') + if __name__ == '__main__': unittest.main() diff --git a/youtube_dlc/downloader/hls.py b/youtube_dlc/downloader/hls.py index 5e1ff4f6b..7aaebc940 100644 --- a/youtube_dlc/downloader/hls.py +++ b/youtube_dlc/downloader/hls.py @@ -172,8 +172,12 @@ class HlsFD(FragmentFD): iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence) decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen( self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read() - frag_content = AES.new( - decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) + # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block + # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, + # not what it decrypts to. + if not test: + frag_content = AES.new( + decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content) self._append_fragment(ctx, frag_content) # We only download the first fragment during the test if test: diff --git a/youtube_dlc/extractor/acast.py b/youtube_dlc/extractor/acast.py index 60378db1b..b9355a2c8 100644 --- a/youtube_dlc/extractor/acast.py +++ b/youtube_dlc/extractor/acast.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + clean_podcast_url, int_or_none, parse_iso8601, ) @@ -17,7 +18,7 @@ class ACastBaseIE(InfoExtractor): info = { 'id': episode['id'], 'display_id': episode.get('episodeUrl'), - 'url': episode['url'], + 'url': clean_podcast_url(episode['url']), 'title': title, 'description': clean_html(episode.get('description') or episode.get('summary')), 'thumbnail': episode.get('image'), diff --git a/youtube_dlc/extractor/applepodcasts.py b/youtube_dlc/extractor/applepodcasts.py new file mode 100644 index 000000000..95758fece --- /dev/null +++ b/youtube_dlc/extractor/applepodcasts.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + parse_iso8601, + try_get, +) + + +class ApplePodcastsIE(InfoExtractor): + _VALID_URL = r'https?://podcasts\.apple\.com/(?:[^/]+/)?podcast(?:/[^/]+){1,2}.*?\bi=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://podcasts.apple.com/us/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'md5': 'df02e6acb11c10e844946a39e7222b08', + 'info_dict': { + 'id': '1000482637777', + 'ext': 'mp3', + 'title': '207 - Whitney Webb Returns', + 'description': 'md5:13a73bade02d2e43737751e3987e1399', + 'upload_date': '20200705', + 'timestamp': 1593921600, + 'duration': 6425, + 'series': 'The Tim Dillon Show', + } + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns/id1135137367?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/207-whitney-webb-returns?i=1000482637777', + 'only_matching': True, + }, { + 'url': 'https://podcasts.apple.com/podcast/id1135137367?i=1000482637777', + 'only_matching': True, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) + ember_data = self._parse_json(self._search_regex( + r'id="shoebox-ember-data-store"[^>]*>\s*({.+?})\s*<', + webpage, 'ember data'), episode_id) + episode = ember_data['data']['attributes'] + description = episode.get('description') or {} + + series = None + for inc in (ember_data.get('included') or []): + if inc.get('type') == 'media/podcast': + series = try_get(inc, lambda x: x['attributes']['name']) + + return { + 'id': episode_id, + 'title': episode['name'], + 'url': clean_podcast_url(episode['assetUrl']), + 'description': description.get('standard') or description.get('short'), + 'timestamp': parse_iso8601(episode.get('releaseDateTime')), + 'duration': int_or_none(episode.get('durationInMilliseconds'), 1000), + 'series': series, + } diff --git a/youtube_dlc/extractor/bfmtv.py b/youtube_dlc/extractor/bfmtv.py new file mode 100644 index 000000000..501f69d80 --- /dev/null +++ b/youtube_dlc/extractor/bfmtv.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import extract_attributes + + +class BFMTVBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?bfmtv\.com/' + _VALID_URL_TMPL = _VALID_URL_BASE + r'(?:[^/]+/)*[^/?&#]+_%s[A-Z]-(?P<id>\d{12})\.html' + _VIDEO_BLOCK_REGEX = r'(<div[^>]+class="video_block"[^>]*>)' + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + + def _brightcove_url_result(self, video_id, video_block): + account_id = video_block.get('accountid') or '876450612001' + player_id = video_block.get('playerid') or 'I2qBTln4u' + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), + 'BrightcoveNew', video_id) + + +class BFMTVIE(BFMTVBaseIE): + IE_NAME = 'bfmtv' + _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'V' + _TESTS = [{ + 'url': 'https://www.bfmtv.com/politique/emmanuel-macron-l-islam-est-une-religion-qui-vit-une-crise-aujourd-hui-partout-dans-le-monde_VN-202010020146.html', + 'info_dict': { + 'id': '6196747868001', + 'ext': 'mp4', + 'title': 'Emmanuel Macron: "L\'Islam est une religion qui vit une crise aujourd’hui, partout dans le monde"', + 'description': 'Le Président s\'exprime sur la question du séparatisme depuis les Mureaux, dans les Yvelines.', + 'uploader_id': '876450610001', + 'upload_date': '20201002', + 'timestamp': 1601629620, + }, + }] + + def _real_extract(self, url): + bfmtv_id = self._match_id(url) + webpage = self._download_webpage(url, bfmtv_id) + video_block = extract_attributes(self._search_regex( + self._VIDEO_BLOCK_REGEX, webpage, 'video block')) + return self._brightcove_url_result(video_block['videoid'], video_block) + + +class BFMTVLiveIE(BFMTVIE): + IE_NAME = 'bfmtv:live' + _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)' + _TESTS = [{ + 'url': 'https://www.bfmtv.com/en-direct/', + 'info_dict': { + 'id': '5615950982001', + 'ext': 'mp4', + 'title': r're:^le direct BFMTV WEB \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'uploader_id': '876450610001', + 'upload_date': '20171018', + 'timestamp': 1508329950, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.bfmtv.com/economie/en-direct/', + 'only_matching': True, + }] + + +class BFMTVArticleIE(BFMTVBaseIE): + IE_NAME = 'bfmtv:article' + _VALID_URL = BFMTVBaseIE._VALID_URL_TMPL % 'A' + _TESTS = [{ + 'url': 'https://www.bfmtv.com/sante/covid-19-un-responsable-de-l-institut-pasteur-se-demande-quand-la-france-va-se-reconfiner_AV-202101060198.html', + 'info_dict': { + 'id': '202101060198', + 'title': 'Covid-19: un responsable de l\'Institut Pasteur se demande "quand la France va se reconfiner"', + 'description': 'md5:947974089c303d3ac6196670ae262843', + }, + 'playlist_count': 2, + }, { + 'url': 'https://www.bfmtv.com/international/pour-bolsonaro-le-bresil-est-en-faillite-mais-il-ne-peut-rien-faire_AD-202101060232.html', + 'only_matching': True, + }, { + 'url': 'https://www.bfmtv.com/sante/covid-19-oui-le-vaccin-de-pfizer-distribue-en-france-a-bien-ete-teste-sur-des-personnes-agees_AN-202101060275.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + bfmtv_id = self._match_id(url) + webpage = self._download_webpage(url, bfmtv_id) + + entries = [] + for video_block_el in re.findall(self._VIDEO_BLOCK_REGEX, webpage): + video_block = extract_attributes(video_block_el) + video_id = video_block.get('videoid') + if not video_id: + continue + entries.append(self._brightcove_url_result(video_id, video_block)) + + return self.playlist_result( + entries, bfmtv_id, self._og_search_title(webpage, fatal=False), + self._html_search_meta(['og:description', 'description'], webpage)) diff --git a/youtube_dlc/extractor/bibeltv.py b/youtube_dlc/extractor/bibeltv.py new file mode 100644 index 000000000..56c2bfee8 --- /dev/null +++ b/youtube_dlc/extractor/bibeltv.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BibelTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bibeltv\.de/mediathek/videos/(?:crn/)?(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bibeltv.de/mediathek/videos/329703-sprachkurs-in-malaiisch', + 'md5': '252f908192d611de038b8504b08bf97f', + 'info_dict': { + 'id': 'ref:329703', + 'ext': 'mp4', + 'title': 'Sprachkurs in Malaiisch', + 'description': 'md5:3e9f197d29ee164714e67351cf737dfe', + 'timestamp': 1608316701, + 'uploader_id': '5840105145001', + 'upload_date': '20201218', + } + }, { + 'url': 'https://www.bibeltv.de/mediathek/videos/crn/326374', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/5840105145001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + crn_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % crn_id, 'BrightcoveNew') diff --git a/youtube_dlc/extractor/canvas.py b/youtube_dlc/extractor/canvas.py index 8667a0d04..8b76a0200 100644 --- a/youtube_dlc/extractor/canvas.py +++ b/youtube_dlc/extractor/canvas.py @@ -7,12 +7,12 @@ from .common import InfoExtractor from .gigya import GigyaBaseIE from ..compat import compat_HTTPError from ..utils import ( + extract_attributes, ExtractorError, strip_or_none, float_or_none, int_or_none, merge_dicts, - parse_iso8601, str_or_none, url_or_none, ) @@ -37,6 +37,7 @@ class CanvasIE(InfoExtractor): 'url': 'https://mediazone.vrt.be/api/v1/canvas/assets/mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e', 'only_matching': True, }] + _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', 'HLS_AES': 'm3u8', @@ -47,29 +48,34 @@ class CanvasIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) site_id, video_id = mobj.group('site_id'), mobj.group('id') - # Old API endpoint, serves more formats but may fail for some videos - data = self._download_json( - 'https://mediazone.vrt.be/api/v1/%s/assets/%s' - % (site_id, video_id), video_id, 'Downloading asset JSON', - 'Unable to download asset JSON', fatal=False) + data = None + if site_id != 'vrtvideo': + # Old API endpoint, serves more formats but may fail for some videos + data = self._download_json( + 'https://mediazone.vrt.be/api/v1/%s/assets/%s' + % (site_id, video_id), video_id, 'Downloading asset JSON', + 'Unable to download asset JSON', fatal=False) # New API endpoint if not data: + headers = self.geo_verification_headers() + headers.update({'Content-Type': 'application/json'}) token = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 'Downloading token', data=b'', - headers={'Content-Type': 'application/json'})['vrtPlayerToken'] + 'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), - video_id, 'Downloading video JSON', fatal=False, query={ + video_id, 'Downloading video JSON', query={ 'vrtPlayerToken': token, 'client': '%s@PROD' % site_id, }, expected_status=400) - message = data.get('message') - if message and not data.get('title'): - if data.get('code') == 'AUTHENTICATION_REQUIRED': - self.raise_login_required(message) - raise ExtractorError(message, expected=True) + if not data.get('title'): + code = data.get('code') + if code == 'AUTHENTICATION_REQUIRED': + self.raise_login_required() + elif code == 'INVALID_LOCATION': + self.raise_geo_restricted(countries=['BE']) + raise ExtractorError(data.get('message') or code, expected=True) title = data['title'] description = data.get('description') @@ -205,20 +211,24 @@ class CanvasEenIE(InfoExtractor): class VrtNUIE(GigyaBaseIE): IE_DESC = 'VrtNU.be' - _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/vrtnu/a-z/(?:[^/]+/){2}(?P<id>[^/?#&]+)' _TESTS = [{ # Available via old API endpoint - 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1989/postbus-x-s1989a1/', 'info_dict': { - 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'id': 'pbs-pub-e8713dac-899e-41de-9313-81269f4c04ac$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', 'ext': 'mp4', - 'title': 'De zwarte weduwe', - 'description': 'md5:db1227b0f318c849ba5eab1fef895ee4', + 'title': 'Postbus X - Aflevering 1 (Seizoen 1989)', + 'description': 'md5:b704f669eb9262da4c55b33d7c6ed4b7', 'duration': 1457.04, 'thumbnail': r're:^https?://.*\.jpg$', - 'season': 'Season 1', - 'season_number': 1, + 'series': 'Postbus X', + 'season': 'Seizoen 1989', + 'season_number': 1989, + 'episode': 'De zwarte weduwe', 'episode_number': 1, + 'timestamp': 1595822400, + 'upload_date': '20200727', }, 'skip': 'This video is only available for registered users', 'params': { @@ -300,69 +310,25 @@ class VrtNUIE(GigyaBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage, urlh = self._download_webpage_handle(url, display_id) - - info = self._search_json_ld(webpage, display_id, default={}) - - # title is optional here since it may be extracted by extractor - # that is delegated from here - title = strip_or_none(self._html_search_regex( - r'(?ms)<h1 class="content__heading">(.+?)</h1>', - webpage, 'title', default=None)) - - description = self._html_search_regex( - r'(?ms)<div class="content__description">(.+?)</div>', - webpage, 'description', default=None) - - season = self._html_search_regex( - [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s* - <span>seizoen\ (.+?)</span>\s* - </div>''', - r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'], - webpage, 'season', default=None) - - season_number = int_or_none(season) - - episode_number = int_or_none(self._html_search_regex( - r'''(?xms)<div\ class="content__episode">\s* - <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span> - </div>''', - webpage, 'episode_number', default=None)) - - release_date = parse_iso8601(self._html_search_regex( - r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"', - webpage, 'release_date', default=None)) - - # If there's a ? or a # in the URL, remove them and everything after - clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/') - securevideo_url = clean_url + '.mssecurevideo.json' - - try: - video = self._download_json(securevideo_url, display_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - self.raise_login_required() - raise + webpage = self._download_webpage(url, display_id) - # We are dealing with a '../<show>.relevant' URL - redirect_url = video.get('url') - if redirect_url: - return self.url_result(self._proto_relative_url(redirect_url, 'https:')) + attrs = extract_attributes(self._search_regex( + r'(<nui-media[^>]+>)', webpage, 'media element')) + video_id = attrs['videoid'] + publication_id = attrs.get('publicationid') + if publication_id: + video_id = publication_id + '$' + video_id - # There is only one entry, but with an unknown key, so just get - # the first one - video_id = list(video.values())[0].get('videoid') + page = (self._parse_json(self._search_regex( + r'digitalData\s*=\s*({.+?});', webpage, 'digial data', + default='{}'), video_id, fatal=False) or {}).get('page') or {} + info = self._search_json_ld(webpage, display_id, default={}) return merge_dicts(info, { '_type': 'url_transparent', 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, 'ie_key': CanvasIE.ie_key(), 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': description, - 'season': season, - 'season_number': season_number, - 'episode_number': episode_number, - 'release_date': release_date, + 'season_number': int_or_none(page.get('episode_season')), }) diff --git a/youtube_dlc/extractor/dplay.py b/youtube_dlc/extractor/dplay.py index a7b9db568..47501dbe6 100644 --- a/youtube_dlc/extractor/dplay.py +++ b/youtube_dlc/extractor/dplay.py @@ -17,7 +17,12 @@ from ..utils import ( class DPlayIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?P<domain> - (?:www\.)?(?P<host>dplay\.(?P<country>dk|fi|jp|se|no))| + (?:www\.)?(?P<host>d + (?: + play\.(?P<country>dk|fi|jp|se|no)| + iscoveryplus\.(?P<plus_country>dk|es|fi|it|se|no) + ) + )| (?P<subdomain_country>es|it)\.dplay\.com )/[^/]+/(?P<id>[^/]+/[^/?#]+)''' @@ -126,6 +131,24 @@ class DPlayIE(InfoExtractor): }, { 'url': 'https://www.dplay.jp/video/gold-rush/24086', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.no/videoer/i-kongens-klr/sesong-1-episode-7', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.it/videos/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.es/videos/la-fiebre-del-oro/temporada-8-episodio-1', + 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.fi/videot/shifting-gears-with-aaron-kaufman/episode-16', + 'only_matching': True, }] def _get_disco_api_info(self, url, display_id, disco_host, realm, country): @@ -241,7 +264,7 @@ class DPlayIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) display_id = mobj.group('id') domain = mobj.group('domain').lstrip('www.') - country = mobj.group('country') or mobj.group('subdomain_country') - host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com' + country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') + host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( url, display_id, host, 'dplay' + country, country) diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 200cf1395..65effed8e 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -59,6 +59,7 @@ from .appletrailers import ( AppleTrailersIE, AppleTrailersSectionIE, ) +from .applepodcasts import ApplePodcastsIE from .archiveorg import ArchiveOrgIE from .arcpublishing import ArcPublishingIE from .arkena import ArkenaIE @@ -104,6 +105,12 @@ from .bellmedia import BellMediaIE from .beatport import BeatportIE from .bet import BetIE from .bfi import BFIPlayerIE +from .bfmtv import ( + BFMTVIE, + BFMTVLiveIE, + BFMTVArticleIE, +) +from .bibeltv import BibelTVIE from .bigflix import BigflixIE from .bild import BildIE from .bilibili import ( @@ -442,7 +449,10 @@ from .go import GoIE from .godtube import GodTubeIE from .golem import GolemIE from .googledrive import GoogleDriveIE -from .googleplus import GooglePlusIE +from .googlepodcasts import ( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) from .googlesearch import GoogleSearchIE from .goshgay import GoshgayIE from .gputechconf import GPUTechConfIE @@ -484,6 +494,10 @@ from .ign import ( OneUPIE, PCMagIE, ) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) from .imdb import ( ImdbIE, ImdbListIE diff --git a/youtube_dlc/extractor/googleplus.py b/youtube_dlc/extractor/googleplus.py deleted file mode 100644 index 6b927bb44..000000000 --- a/youtube_dlc/extractor/googleplus.py +++ /dev/null @@ -1,73 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import codecs - -from .common import InfoExtractor -from ..utils import unified_strdate - - -class GooglePlusIE(InfoExtractor): - IE_DESC = 'Google Plus' - _VALID_URL = r'https?://plus\.google\.com/(?:[^/]+/)*?posts/(?P<id>\w+)' - IE_NAME = 'plus.google' - _TEST = { - 'url': 'https://plus.google.com/u/0/108897254135232129896/posts/ZButuJc6CtH', - 'info_dict': { - 'id': 'ZButuJc6CtH', - 'ext': 'flv', - 'title': '嘆きの天使 降臨', - 'upload_date': '20120613', - 'uploader': '井上ヨシマサ', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - # Step 1, Retrieve post webpage to extract further information - webpage = self._download_webpage(url, video_id, 'Downloading entry webpage') - - title = self._og_search_description(webpage).splitlines()[0] - upload_date = unified_strdate(self._html_search_regex( - r'''(?x)<a.+?class="o-U-s\s[^"]+"\s+style="display:\s*none"\s*> - ([0-9]{4}-[0-9]{2}-[0-9]{2})</a>''', - webpage, 'upload date', fatal=False, flags=re.VERBOSE)) - uploader = self._html_search_regex( - r'rel="author".*?>(.*?)</a>', webpage, 'uploader', fatal=False) - - # Step 2, Simulate clicking the image box to launch video - DOMAIN = 'https://plus.google.com/' - video_page = self._search_regex( - r'<a href="((?:%s)?photos/.*?)"' % re.escape(DOMAIN), - webpage, 'video page URL') - if not video_page.startswith(DOMAIN): - video_page = DOMAIN + video_page - - webpage = self._download_webpage(video_page, video_id, 'Downloading video page') - - def unicode_escape(s): - decoder = codecs.getdecoder('unicode_escape') - return re.sub( - r'\\u[0-9a-fA-F]{4,}', - lambda m: decoder(m.group(0))[0], - s) - - # Extract video links all sizes - formats = [{ - 'url': unicode_escape(video_url), - 'ext': 'flv', - 'width': int(width), - 'height': int(height), - } for width, height, video_url in re.findall( - r'\d+,(\d+),(\d+),"(https?://[^.]+\.googleusercontent\.com.*?)"', webpage)] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'upload_date': upload_date, - 'formats': formats, - } diff --git a/youtube_dlc/extractor/googlepodcasts.py b/youtube_dlc/extractor/googlepodcasts.py new file mode 100644 index 000000000..31ad79907 --- /dev/null +++ b/youtube_dlc/extractor/googlepodcasts.py @@ -0,0 +1,88 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( + clean_podcast_url, + int_or_none, + try_get, + urlencode_postdata, +) + + +class GooglePodcastsBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://podcasts\.google\.com/feed/' + + def _batch_execute(self, func_id, video_id, params): + return json.loads(self._download_json( + 'https://podcasts.google.com/_/PodcastsUi/data/batchexecute', + video_id, data=urlencode_postdata({ + 'f.req': json.dumps([[[func_id, json.dumps(params), None, '1']]]), + }), transform_source=lambda x: self._search_regex(r'(?s)(\[.+\])', x, 'data'))[0][2]) + + def _extract_episode(self, episode): + return { + 'id': episode[4][3], + 'title': episode[8], + 'url': clean_podcast_url(episode[13]), + 'thumbnail': episode[2], + 'description': episode[9], + 'creator': try_get(episode, lambda x: x[14]), + 'timestamp': int_or_none(episode[11]), + 'duration': int_or_none(episode[12]), + 'series': episode[1], + } + + +class GooglePodcastsIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<feed_url>[^/]+)/episode/(?P<id>[^/?&#]+)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA/episode/MzBlNWRlN2UtOWE4Yy00ODcwLTk2M2MtM2JlMmUyNmViOTRh', + 'md5': 'fa56b2ee8bd0703e27e42d4b104c4766', + 'info_dict': { + 'id': '30e5de7e-9a8c-4870-963c-3be2e26eb94a', + 'ext': 'mp3', + 'title': 'WWDTM New Year 2021', + 'description': 'We say goodbye to 2020 with Christine Baranksi, Doug Jones, Jonna Mendez, and Kellee Edwards.', + 'upload_date': '20210102', + 'timestamp': 1609606800, + 'duration': 2901, + 'series': "Wait Wait... Don't Tell Me!", + } + } + + def _real_extract(self, url): + b64_feed_url, b64_guid = re.match(self._VALID_URL, url).groups() + episode = self._batch_execute( + 'oNjqVe', b64_guid, [b64_feed_url, b64_guid])[1] + return self._extract_episode(episode) + + +class GooglePodcastsFeedIE(GooglePodcastsBaseIE): + IE_NAME = 'google:podcasts:feed' + _VALID_URL = GooglePodcastsBaseIE._VALID_URL_BASE + r'(?P<id>[^/?&#]+)/?(?:[?#&]|$)' + _TEST = { + 'url': 'https://podcasts.google.com/feed/aHR0cHM6Ly9mZWVkcy5ucHIub3JnLzM0NDA5ODUzOS9wb2RjYXN0LnhtbA', + 'info_dict': { + 'title': "Wait Wait... Don't Tell Me!", + 'description': "NPR's weekly current events quiz. Have a laugh and test your news knowledge while figuring out what's real and what we've made up.", + }, + 'playlist_mincount': 20, + } + + def _real_extract(self, url): + b64_feed_url = self._match_id(url) + data = self._batch_execute('ncqJEe', b64_feed_url, [b64_feed_url]) + + entries = [] + for episode in (try_get(data, lambda x: x[1][0]) or []): + entries.append(self._extract_episode(episode)) + + feed = try_get(data, lambda x: x[3]) or [] + return self.playlist_result( + entries, playlist_title=try_get(feed, lambda x: x[0]), + playlist_description=try_get(feed, lambda x: x[2])) diff --git a/youtube_dlc/extractor/iheart.py b/youtube_dlc/extractor/iheart.py new file mode 100644 index 000000000..b54c05eeb --- /dev/null +++ b/youtube_dlc/extractor/iheart.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + clean_podcast_url, + int_or_none, + str_or_none, +) + + +class IHeartRadioBaseIE(InfoExtractor): + def _call_api(self, path, video_id, fatal=True, query=None): + return self._download_json( + 'https://api.iheart.com/api/v3/podcast/' + path, + video_id, fatal=fatal, query=query) + + def _extract_episode(self, episode): + return { + 'thumbnail': episode.get('imageUrl'), + 'description': clean_html(episode.get('description')), + 'timestamp': int_or_none(episode.get('startDate'), 1000), + 'duration': int_or_none(episode.get('duration')), + } + + +class IHeartRadioIE(IHeartRadioBaseIE): + IENAME = 'iheartradio' + _VALID_URL = r'(?:https?://(?:www\.)?iheart\.com/podcast/[^/]+/episode/(?P<display_id>[^/?&#]+)-|iheartradio:)(?P<id>\d+)' + _TEST = { + 'url': 'https://www.iheart.com/podcast/105-behind-the-bastards-29236323/episode/part-one-alexander-lukashenko-the-dictator-70346499/?embed=true', + 'md5': 'c8609c92c8688dcb69d8541042b8abca', + 'info_dict': { + 'id': '70346499', + 'ext': 'mp3', + 'title': 'Part One: Alexander Lukashenko: The Dictator of Belarus', + 'description': 'md5:96cc7297b3a5a9ebae28643801c96fae', + 'timestamp': 1597741200, + 'upload_date': '20200818', + } + } + + def _real_extract(self, url): + episode_id = self._match_id(url) + episode = self._call_api( + 'episodes/' + episode_id, episode_id)['episode'] + info = self._extract_episode(episode) + info.update({ + 'id': episode_id, + 'title': episode['title'], + 'url': clean_podcast_url(episode['mediaUrl']), + }) + return info + + +class IHeartRadioPodcastIE(IHeartRadioBaseIE): + IE_NAME = 'iheartradio:podcast' + _VALID_URL = r'https?://(?:www\.)?iheart(?:podcastnetwork)?\.com/podcast/[^/?&#]+-(?P<id>\d+)/?(?:[?#&]|$)' + _TESTS = [{ + 'url': 'https://www.iheart.com/podcast/1119-it-could-happen-here-30717896/', + 'info_dict': { + 'id': '30717896', + 'title': 'It Could Happen Here', + 'description': 'md5:5842117412a967eb0b01f8088eb663e2', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://www.iheartpodcastnetwork.com/podcast/105-stuff-you-should-know-26940277', + 'only_matching': True, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + path = 'podcasts/' + podcast_id + episodes = self._call_api( + path + '/episodes', podcast_id, query={'limit': 1000000000})['data'] + + entries = [] + for episode in episodes: + episode_id = str_or_none(episode.get('id')) + if not episode_id: + continue + info = self._extract_episode(episode) + info.update({ + '_type': 'url', + 'id': episode_id, + 'title': episode.get('title'), + 'url': 'iheartradio:' + episode_id, + 'ie_key': IHeartRadioIE.ie_key(), + }) + entries.append(info) + + podcast = self._call_api(path, podcast_id, False) or {} + + return self.playlist_result( + entries, podcast_id, podcast.get('title'), podcast.get('description')) diff --git a/youtube_dlc/extractor/ketnet.py b/youtube_dlc/extractor/ketnet.py index 93a98e1e0..e0599d02f 100644 --- a/youtube_dlc/extractor/ketnet.py +++ b/youtube_dlc/extractor/ketnet.py @@ -2,92 +2,71 @@ from __future__ import unicode_literals from .canvas import CanvasIE from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote +from ..utils import ( + int_or_none, + parse_iso8601, +) class KetnetIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?ketnet\.be/(?P<id>(?:[^/]+/)*[^/?#&]+)' _TESTS = [{ - 'url': 'https://www.ketnet.be/kijken/zomerse-filmpjes', - 'md5': '6bdeb65998930251bbd1c510750edba9', + 'url': 'https://www.ketnet.be/kijken/n/nachtwacht/3/nachtwacht-s3a1-de-greystook', + 'md5': '37b2b7bb9b3dcaa05b67058dc3a714a9', 'info_dict': { - 'id': 'zomerse-filmpjes', + 'id': 'pbs-pub-aef8b526-115e-4006-aa24-e59ff6c6ef6f$vid-ddb815bf-c8e7-467b-8879-6bad7a32cebd', 'ext': 'mp4', - 'title': 'Gluur mee op de filmset en op Pennenzakkenrock', - 'description': 'Gluur mee met Ghost Rockers op de filmset', + 'title': 'Nachtwacht - Reeks 3: Aflevering 1', + 'description': 'De Nachtwacht krijgt te maken met een parasiet', 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - # mzid in playerConfig instead of sources - 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-greystook', - 'md5': '90139b746a0a9bd7bb631283f6e2a64e', - 'info_dict': { - 'id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'display_id': 'md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', - 'ext': 'flv', - 'title': 'Nachtwacht: De Greystook', - 'description': 'md5:1db3f5dc4c7109c821261e7512975be7', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1468.03, + 'duration': 1468.02, + 'timestamp': 1609225200, + 'upload_date': '20201229', + 'series': 'Nachtwacht', + 'season': 'Reeks 3', + 'episode': 'De Greystook', + 'episode_number': 1, }, 'expected_warnings': ['is not a supported codec', 'Unknown MIME type'], }, { - 'url': 'https://www.ketnet.be/kijken/karrewiet/uitzending-8-september-2016', - 'only_matching': True, - }, { - 'url': 'https://www.ketnet.be/achter-de-schermen/sien-repeteert-voor-stars-for-life', - 'only_matching': True, - }, { - # mzsource, geo restricted to Belgium - 'url': 'https://www.ketnet.be/kijken/nachtwacht/de-bermadoe', + 'url': 'https://www.ketnet.be/themas/karrewiet/jaaroverzicht-20200/karrewiet-het-jaar-van-black-mamba', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - config = self._parse_json( - self._search_regex( - r'(?s)playerConfig\s*=\s*({.+?})\s*;', webpage, - 'player config'), - video_id) - - mzid = config.get('mzid') - if mzid: - return self.url_result( - 'https://mediazone.vrt.be/api/v1/ketnet/assets/%s' % mzid, - CanvasIE.ie_key(), video_id=mzid) + display_id = self._match_id(url) - title = config['title'] + video = self._download_json( + 'https://senior-bff.ketnet.be/graphql', display_id, query={ + 'query': '''{ + video(id: "content/ketnet/nl/%s.model.json") { + description + episodeNr + imageUrl + mediaReference + programTitle + publicationDate + seasonTitle + subtitleVideodetail + titleVideodetail + } +}''' % display_id, + })['data']['video'] - formats = [] - for source_key in ('', 'mz'): - source = config.get('%ssource' % source_key) - if not isinstance(source, dict): - continue - for format_id, format_url in source.items(): - if format_id == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, - fatal=False)) - elif format_id == 'hds': - formats.extend(self._extract_f4m_formats( - format_url, video_id, f4m_id=format_id, fatal=False)) - else: - formats.append({ - 'url': format_url, - 'format_id': format_id, - }) - self._sort_formats(formats) + mz_id = compat_urllib_parse_unquote(video['mediaReference']) return { - 'id': video_id, - 'title': title, - 'description': config.get('description'), - 'thumbnail': config.get('image'), - 'series': config.get('program'), - 'episode': config.get('episode'), - 'formats': formats, + '_type': 'url_transparent', + 'id': mz_id, + 'title': video['titleVideodetail'], + 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/' + mz_id, + 'thumbnail': video.get('imageUrl'), + 'description': video.get('description'), + 'timestamp': parse_iso8601(video.get('publicationDate')), + 'series': video.get('programTitle'), + 'season': video.get('seasonTitle'), + 'episode': video.get('subtitleVideodetail'), + 'episode_number': int_or_none(video.get('episodeNr')), + 'ie_key': CanvasIE.ie_key(), } diff --git a/youtube_dlc/extractor/motherless.py b/youtube_dlc/extractor/motherless.py index b1615b4d8..ef1e081f2 100644 --- a/youtube_dlc/extractor/motherless.py +++ b/youtube_dlc/extractor/motherless.py @@ -61,6 +61,23 @@ class MotherlessIE(InfoExtractor): # no keywords 'url': 'http://motherless.com/8B4BBC1', 'only_matching': True, + }, { + # see https://motherless.com/videos/recent for recent videos with + # uploaded date in "ago" format + 'url': 'https://motherless.com/3C3E2CF', + 'info_dict': { + 'id': '3C3E2CF', + 'ext': 'mp4', + 'title': 'a/ Hot Teens', + 'categories': list, + 'upload_date': '20210104', + 'uploader_id': 'yonbiw', + 'thumbnail': r're:https?://.*\.jpg', + 'age_limit': 18, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -85,20 +102,28 @@ class MotherlessIE(InfoExtractor): or 'http://cdn4.videos.motherlessmedia.com/videos/%s.mp4?fs=opencloud' % video_id) age_limit = self._rta_search(webpage) view_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'), + (r'>([\d,.]+)\s+Views<', r'<strong>Views</strong>\s+([^<]+)<'), webpage, 'view count', fatal=False)) like_count = str_to_int(self._html_search_regex( - (r'>(\d+)\s+Favorites<', r'<strong>Favorited</strong>\s+([^<]+)<'), + (r'>([\d,.]+)\s+Favorites<', + r'<strong>Favorited</strong>\s+([^<]+)<'), webpage, 'like count', fatal=False)) - upload_date = self._html_search_regex( - (r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', - r'<strong>Uploaded</strong>\s+([^<]+)<'), webpage, 'upload date') - if 'Ago' in upload_date: - days = int(re.search(r'([0-9]+)', upload_date).group(1)) - upload_date = (datetime.datetime.now() - datetime.timedelta(days=days)).strftime('%Y%m%d') - else: - upload_date = unified_strdate(upload_date) + upload_date = unified_strdate(self._search_regex( + r'class=["\']count[^>]+>(\d+\s+[a-zA-Z]{3}\s+\d{4})<', webpage, + 'upload date', default=None)) + if not upload_date: + uploaded_ago = self._search_regex( + r'>\s*(\d+[hd])\s+[aA]go\b', webpage, 'uploaded ago', + default=None) + if uploaded_ago: + delta = int(uploaded_ago[:-1]) + _AGO_UNITS = { + 'h': 'hours', + 'd': 'days', + } + kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} + upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') comment_count = webpage.count('class="media-comment-contents"') uploader_id = self._html_search_regex( diff --git a/youtube_dlc/extractor/nrk.py b/youtube_dlc/extractor/nrk.py index 69178e157..40dee2162 100644 --- a/youtube_dlc/extractor/nrk.py +++ b/youtube_dlc/extractor/nrk.py @@ -223,12 +223,12 @@ class NRKIE(NRKBaseIE): legal_age = try_get( data, lambda x: x['legalAge']['body']['rating']['code'], compat_str) # https://en.wikipedia.org/wiki/Norwegian_Media_Authority - if legal_age == 'A': - age_limit = 0 - elif legal_age.isdigit(): - age_limit = int_or_none(legal_age) - else: - age_limit = None + age_limit = None + if legal_age: + if legal_age == 'A': + age_limit = 0 + elif legal_age.isdigit(): + age_limit = int_or_none(legal_age) is_series = try_get(data, lambda x: x['_links']['series']['name']) == 'series' @@ -298,6 +298,14 @@ class NRKTVIE(InfoExtractor): 'description': 'md5:46923a6e6510eefcce23d5ef2a58f2ce', 'duration': 2223.44, 'age_limit': 6, + 'subtitles': { + 'nb-nor': [{ + 'ext': 'vtt', + }], + 'nb-ttv': [{ + 'ext': 'vtt', + }] + }, }, }, { 'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', diff --git a/youtube_dlc/extractor/rai.py b/youtube_dlc/extractor/rai.py index 5eef7c633..c78580d95 100644 --- a/youtube_dlc/extractor/rai.py +++ b/youtube_dlc/extractor/rai.py @@ -103,22 +103,28 @@ class RaiBaseIE(InfoExtractor): }.items() if v is not None) @staticmethod - def _extract_subtitles(url, subtitle_url): + def _extract_subtitles(url, video_data): + STL_EXT = 'stl' + SRT_EXT = 'srt' subtitles = {} - if subtitle_url and isinstance(subtitle_url, compat_str): - subtitle_url = urljoin(url, subtitle_url) - STL_EXT = '.stl' - SRT_EXT = '.srt' - subtitles['it'] = [{ - 'ext': 'stl', - 'url': subtitle_url, - }] - if subtitle_url.endswith(STL_EXT): - srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT - subtitles['it'].append({ - 'ext': 'srt', - 'url': srt_url, + subtitles_array = video_data.get('subtitlesArray') or [] + for k in ('subtitles', 'subtitlesUrl'): + subtitles_array.append({'url': video_data.get(k)}) + for subtitle in subtitles_array: + sub_url = subtitle.get('url') + if sub_url and isinstance(sub_url, compat_str): + sub_lang = subtitle.get('language') or 'it' + sub_url = urljoin(url, sub_url) + sub_ext = determine_ext(sub_url, SRT_EXT) + subtitles.setdefault(sub_lang, []).append({ + 'ext': sub_ext, + 'url': sub_url, }) + if STL_EXT == sub_ext: + subtitles[sub_lang].append({ + 'ext': SRT_EXT, + 'url': sub_url[:-len(STL_EXT)] + SRT_EXT, + }) return subtitles @@ -138,6 +144,9 @@ class RaiPlayIE(RaiBaseIE): 'duration': 6160, 'series': 'Report', 'season': '2013/14', + 'subtitles': { + 'it': 'count:2', + }, }, 'params': { 'skip_download': True, @@ -145,6 +154,10 @@ class RaiPlayIE(RaiBaseIE): }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, + }, { + # subtitles at 'subtitlesArray' key (see #27698) + 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -171,7 +184,7 @@ class RaiPlayIE(RaiBaseIE): if date_published and time_published: date_published += ' ' + time_published - subtitles = self._extract_subtitles(url, video.get('subtitles')) + subtitles = self._extract_subtitles(url, video) program_info = media.get('program_info') or {} season = media.get('season') @@ -326,6 +339,22 @@ class RaiIE(RaiBaseIE): 'skip_download': True, }, }, { + # ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key + 'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html', + 'info_dict': { + 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd', + 'ext': 'mp4', + 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015', + 'description': 'md5:d291b03407ec505f95f27970c0b025f4', + 'upload_date': '20150913', + 'subtitles': { + 'it': 'count:2', + }, + }, + 'params': { + 'skip_download': True, + }, + }, { # Direct MMS URL 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', 'only_matching': True, @@ -365,7 +394,7 @@ class RaiIE(RaiBaseIE): 'url': compat_urlparse.urljoin(url, thumbnail_url), }) - subtitles = self._extract_subtitles(url, media.get('subtitlesUrl')) + subtitles = self._extract_subtitles(url, media) info = { 'id': content_id, @@ -402,7 +431,8 @@ class RaiIE(RaiBaseIE): r'''(?x) (?: (?:initEdizione|drawMediaRaiTV)\(| - <(?:[^>]+\bdata-id|var\s+uniquename)= + <(?:[^>]+\bdata-id|var\s+uniquename)=| + <iframe[^>]+\bsrc= ) (["\']) (?:(?!\1).)*\bContentItem-(?P<id>%s) diff --git a/youtube_dlc/extractor/sbs.py b/youtube_dlc/extractor/sbs.py index 0e623ff7b..f722528cd 100644 --- a/youtube_dlc/extractor/sbs.py +++ b/youtube_dlc/extractor/sbs.py @@ -10,7 +10,7 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand|news)/video/(?:single/)?(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=)|news/(?:embeds/)?video/)(?P<id>[0-9]+)' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -18,7 +18,7 @@ class SBSIE(InfoExtractor): 'url': 'http://www.sbs.com.au/ondemand/video/single/320403011771/?source=drupal&vertical=thefeed', 'md5': '3150cf278965eeabb5b4cea1c963fe0a', 'info_dict': { - 'id': '320403011771', + 'id': '_rFBPRPO4pMR', 'ext': 'mp4', 'title': 'Dingo Conservation (The Feed)', 'description': 'md5:f250a9856fca50d22dec0b5b8015f8a5', @@ -34,6 +34,15 @@ class SBSIE(InfoExtractor): }, { 'url': 'http://www.sbs.com.au/news/video/471395907773/The-Feed-July-9', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/?play=1836638787723', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/program/inside-windsor-castle?play=1283505731842', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/news/embeds/video/1840778819866', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dlc/extractor/stv.py b/youtube_dlc/extractor/stv.py index bae8b71f4..539220a94 100644 --- a/youtube_dlc/extractor/stv.py +++ b/youtube_dlc/extractor/stv.py @@ -8,13 +8,17 @@ from ..utils import ( compat_str, float_or_none, int_or_none, + smuggle_url, + str_or_none, + try_get, ) class STVPlayerIE(InfoExtractor): IE_NAME = 'stv:player' _VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})' - _TEST = { + _TESTS = [{ + # shortform 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/', 'md5': '5adf9439c31d554f8be0707c7abe7e0a', 'info_dict': { @@ -27,7 +31,11 @@ class STVPlayerIE(InfoExtractor): 'uploader_id': '1486976045', }, 'skip': 'this resource is unavailable outside of the UK', - } + }, { + # episodes + 'url': 'https://player.stv.tv/episode/4125/jennifer-saunders-memory-lane', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s' _PTYPE_MAP = { 'episode': 'episodes', @@ -36,11 +44,31 @@ class STVPlayerIE(InfoExtractor): def _real_extract(self, url): ptype, video_id = re.match(self._VALID_URL, url).groups() - resp = self._download_json( - 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id), - video_id) - result = resp['results'] + webpage = self._download_webpage(url, video_id, fatal=False) or '' + props = (self._parse_json(self._search_regex( + r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', + webpage, 'next data', default='{}'), video_id, + fatal=False) or {}).get('props') or {} + player_api_cache = try_get( + props, lambda x: x['initialReduxState']['playerApiCache']) or {} + + api_path, resp = None, {} + for k, v in player_api_cache.items(): + if k.startswith('/episodes/') or k.startswith('/shortform/'): + api_path, resp = k, v + break + else: + episode_id = str_or_none(try_get( + props, lambda x: x['pageProps']['episodeId'])) + api_path = '/%s/%s' % (self._PTYPE_MAP[ptype], episode_id or video_id) + + result = resp.get('results') + if not result: + resp = self._download_json( + 'https://player.api.stv.tv/v1' + api_path, video_id) + result = resp['results'] + video = result['video'] video_id = compat_str(video['id']) @@ -57,7 +85,7 @@ class STVPlayerIE(InfoExtractor): return { '_type': 'url_transparent', 'id': video_id, - 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id, + 'url': smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {'geo_countries': ['GB']}), 'description': result.get('summary'), 'duration': float_or_none(video.get('length'), 1000), 'subtitles': subtitles, diff --git a/youtube_dlc/extractor/twitch.py b/youtube_dlc/extractor/twitch.py index ab131a07d..503d019de 100644 --- a/youtube_dlc/extractor/twitch.py +++ b/youtube_dlc/extractor/twitch.py @@ -9,7 +9,6 @@ import re from .common import InfoExtractor from ..compat import ( - compat_kwargs, compat_parse_qs, compat_str, compat_urlparse, @@ -42,30 +41,16 @@ class TwitchBaseIE(InfoExtractor): _CLIENT_ID = 'kimne78kx3ncx6brgo4mv6wki5h1ko' _NETRC_MACHINE = 'twitch' - def _handle_error(self, response): - if not isinstance(response, dict): - return - error = response.get('error') - if error: - raise ExtractorError( - '%s returned error: %s - %s' % (self.IE_NAME, error, response.get('message')), - expected=True) - - def _call_api(self, path, item_id, *args, **kwargs): - headers = kwargs.get('headers', {}).copy() - headers.update({ - 'Accept': 'application/vnd.twitchtv.v5+json; charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - }) - kwargs.update({ - 'headers': headers, - 'expected_status': (400, 410), - }) - response = self._download_json( - '%s/%s' % (self._API_BASE, path), item_id, - *args, **compat_kwargs(kwargs)) - self._handle_error(response) - return response + _OPERATION_HASHES = { + 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', + 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', + 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', + 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', + 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', + 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', + 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', + 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687', + } def _real_initialize(self): self._login() @@ -151,13 +136,46 @@ class TwitchBaseIE(InfoExtractor): }) self._sort_formats(formats) - def _download_access_token(self, channel_name): - return self._call_api( - 'api/channels/%s/access_token' % channel_name, channel_name, - 'Downloading access token JSON') + def _download_base_gql(self, video_id, ops, note, fatal=True): + return self._download_json( + 'https://gql.twitch.tv/gql', video_id, note, + data=json.dumps(ops).encode(), + headers={ + 'Content-Type': 'text/plain;charset=UTF-8', + 'Client-ID': self._CLIENT_ID, + }, fatal=fatal) - def _extract_channel_id(self, token, channel_name): - return compat_str(self._parse_json(token, channel_name)['channel_id']) + def _download_gql(self, video_id, ops, note, fatal=True): + for op in ops: + op['extensions'] = { + 'persistedQuery': { + 'version': 1, + 'sha256Hash': self._OPERATION_HASHES[op['operationName']], + } + } + return self._download_base_gql(video_id, ops, note) + + def _download_access_token(self, video_id, token_kind, param_name): + method = '%sPlaybackAccessToken' % token_kind + ops = { + 'query': '''{ + %s( + %s: "%s", + params: { + platform: "web", + playerBackend: "mediaplayer", + playerType: "site" + } + ) + { + value + signature + } + }''' % (method, param_name, video_id), + } + return self._download_base_gql( + video_id, ops, + 'Downloading %s access token GraphQL' % token_kind)['data'][method] class TwitchVodIE(TwitchBaseIE): @@ -170,8 +188,6 @@ class TwitchVodIE(TwitchBaseIE): ) (?P<id>\d+) ''' - _ITEM_TYPE = 'vod' - _ITEM_SHORTCUT = 'v' _TESTS = [{ 'url': 'http://www.twitch.tv/riotgames/v/6528877?t=5m10s', @@ -181,7 +197,7 @@ class TwitchVodIE(TwitchBaseIE): 'title': 'LCK Summer Split - Week 6 Day 1', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 17208, - 'timestamp': 1435131709, + 'timestamp': 1435131734, 'upload_date': '20150624', 'uploader': 'Riot Games', 'uploader_id': 'riotgames', @@ -230,10 +246,20 @@ class TwitchVodIE(TwitchBaseIE): }] def _download_info(self, item_id): - return self._extract_info( - self._call_api( - 'kraken/videos/%s' % item_id, item_id, - 'Downloading video info JSON')) + data = self._download_gql( + item_id, [{ + 'operationName': 'VideoMetadata', + 'variables': { + 'channelLogin': '', + 'videoID': item_id, + }, + }], + 'Downloading stream metadata GraphQL')[0]['data'] + video = data.get('video') + if video is None: + raise ExtractorError( + 'Video %s does not exist' % item_id, expected=True) + return self._extract_info_gql(video, item_id) @staticmethod def _extract_info(info): @@ -272,13 +298,33 @@ class TwitchVodIE(TwitchBaseIE): 'is_live': is_live, } + @staticmethod + def _extract_info_gql(info, item_id): + vod_id = info.get('id') or item_id + # id backward compatibility for download archives + if vod_id[0] != 'v': + vod_id = 'v%s' % vod_id + thumbnail = url_or_none(info.get('previewThumbnailURL')) + if thumbnail: + for p in ('width', 'height'): + thumbnail = thumbnail.replace('{%s}' % p, '0') + return { + 'id': vod_id, + 'title': info.get('title') or 'Untitled Broadcast', + 'description': info.get('description'), + 'duration': int_or_none(info.get('lengthSeconds')), + 'thumbnail': thumbnail, + 'uploader': try_get(info, lambda x: x['owner']['displayName'], compat_str), + 'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str), + 'timestamp': unified_timestamp(info.get('publishedAt')), + 'view_count': int_or_none(info.get('viewCount')), + } + def _real_extract(self, url): vod_id = self._match_id(url) info = self._download_info(vod_id) - access_token = self._call_api( - 'api/vods/%s/access_token' % vod_id, vod_id, - 'Downloading %s access token' % self._ITEM_TYPE) + access_token = self._download_access_token(vod_id, 'video', 'id') formats = self._extract_m3u8_formats( '%s/vod/%s.m3u8?%s' % ( @@ -289,8 +335,8 @@ class TwitchVodIE(TwitchBaseIE): 'allow_spectre': 'true', 'player': 'twitchweb', 'playlist_include_framerate': 'true', - 'nauth': access_token['token'], - 'nauthsig': access_token['sig'], + 'nauth': access_token['value'], + 'nauthsig': access_token['signature'], })), vod_id, 'mp4', entry_protocol='m3u8_native') @@ -333,37 +379,7 @@ def _make_video_result(node): } -class TwitchGraphQLBaseIE(TwitchBaseIE): - _PAGE_LIMIT = 100 - - _OPERATION_HASHES = { - 'CollectionSideBar': '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14', - 'FilterableVideoTower_Videos': 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb', - 'ClipsCards__User': 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777', - 'ChannelCollectionsContent': '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84', - 'StreamMetadata': '1c719a40e481453e5c48d9bb585d971b8b372f8ebb105b17076722264dfa5b3e', - 'ComscoreStreamingQuery': 'e1edae8122517d013405f237ffcc124515dc6ded82480a88daef69c83b53ac01', - 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', - } - - def _download_gql(self, video_id, ops, note, fatal=True): - for op in ops: - op['extensions'] = { - 'persistedQuery': { - 'version': 1, - 'sha256Hash': self._OPERATION_HASHES[op['operationName']], - } - } - return self._download_json( - 'https://gql.twitch.tv/gql', video_id, note, - data=json.dumps(ops).encode(), - headers={ - 'Content-Type': 'text/plain;charset=UTF-8', - 'Client-ID': self._CLIENT_ID, - }, fatal=fatal) - - -class TwitchCollectionIE(TwitchGraphQLBaseIE): +class TwitchCollectionIE(TwitchBaseIE): _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/collections/(?P<id>[^/]+)' _TESTS = [{ @@ -400,7 +416,9 @@ class TwitchCollectionIE(TwitchGraphQLBaseIE): entries, playlist_id=collection_id, playlist_title=title) -class TwitchPlaylistBaseIE(TwitchGraphQLBaseIE): +class TwitchPlaylistBaseIE(TwitchBaseIE): + _PAGE_LIMIT = 100 + def _entries(self, channel_name, *args): cursor = None variables_common = self._make_variables(channel_name, *args) @@ -440,49 +458,6 @@ class TwitchPlaylistBaseIE(TwitchGraphQLBaseIE): if not cursor or not isinstance(cursor, compat_str): break - # Deprecated kraken v5 API - def _entries_kraken(self, channel_name, broadcast_type, sort): - access_token = self._download_access_token(channel_name) - channel_id = self._extract_channel_id(access_token['token'], channel_name) - offset = 0 - counter_override = None - for counter in itertools.count(1): - response = self._call_api( - 'kraken/channels/%s/videos/' % channel_id, - channel_id, - 'Downloading video JSON page %s' % (counter_override or counter), - query={ - 'offset': offset, - 'limit': self._PAGE_LIMIT, - 'broadcast_type': broadcast_type, - 'sort': sort, - }) - videos = response.get('videos') - if not isinstance(videos, list): - break - for video in videos: - if not isinstance(video, dict): - continue - video_url = url_or_none(video.get('url')) - if not video_url: - continue - yield { - '_type': 'url_transparent', - 'ie_key': TwitchVodIE.ie_key(), - 'id': video.get('_id'), - 'url': video_url, - 'title': video.get('title'), - 'description': video.get('description'), - 'timestamp': unified_timestamp(video.get('published_at')), - 'duration': float_or_none(video.get('length')), - 'view_count': int_or_none(video.get('views')), - 'language': video.get('language'), - } - offset += self._PAGE_LIMIT - total = int_or_none(response.get('_total')) - if total and offset >= total: - break - class TwitchVideosIE(TwitchPlaylistBaseIE): _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/(?:videos|profile)' @@ -724,7 +699,7 @@ class TwitchVideosCollectionsIE(TwitchPlaylistBaseIE): playlist_title='%s - Collections' % channel_name) -class TwitchStreamIE(TwitchGraphQLBaseIE): +class TwitchStreamIE(TwitchBaseIE): IE_NAME = 'twitch:stream' _VALID_URL = r'''(?x) https?:// @@ -814,8 +789,9 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): if not stream: raise ExtractorError('%s is offline' % channel_name, expected=True) - access_token = self._download_access_token(channel_name) - token = access_token['token'] + access_token = self._download_access_token( + channel_name, 'stream', 'channelName') + token = access_token['value'] stream_id = stream.get('id') or channel_name query = { @@ -826,7 +802,7 @@ class TwitchStreamIE(TwitchGraphQLBaseIE): 'player': 'twitchweb', 'playlist_include_framerate': 'true', 'segment_preference': '4', - 'sig': access_token['sig'].encode('utf-8'), + 'sig': access_token['signature'].encode('utf-8'), 'token': token.encode('utf-8'), } formats = self._extract_m3u8_formats( @@ -912,8 +888,8 @@ class TwitchClipsIE(TwitchBaseIE): def _real_extract(self, url): video_id = self._match_id(url) - clip = self._download_json( - 'https://gql.twitch.tv/gql', video_id, data=json.dumps({ + clip = self._download_base_gql( + video_id, { 'query': '''{ clip(slug: "%s") { broadcaster { @@ -937,10 +913,7 @@ class TwitchClipsIE(TwitchBaseIE): } viewCount } -}''' % video_id, - }).encode(), headers={ - 'Client-ID': self._CLIENT_ID, - })['data']['clip'] +}''' % video_id}, 'Downloading clip GraphQL')['data']['clip'] if not clip: raise ExtractorError( diff --git a/youtube_dlc/extractor/twitter.py b/youtube_dlc/extractor/twitter.py index ca5e040c6..4602c0984 100644 --- a/youtube_dlc/extractor/twitter.py +++ b/youtube_dlc/extractor/twitter.py @@ -251,10 +251,10 @@ class TwitterIE(TwitterBaseIE): 'info_dict': { 'id': '700207533655363584', 'ext': 'mp4', - 'title': 'simon vetugo - BEAT PROD: @suhmeduh #Damndaniel', + 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel', 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'simon vetugo', + 'uploader': 'simon vertugo', 'uploader_id': 'simonvertugo', 'duration': 30.0, 'timestamp': 1455777459, @@ -312,6 +312,7 @@ class TwitterIE(TwitterBaseIE): 'timestamp': 1492000653, 'upload_date': '20170412', }, + 'skip': 'Account suspended', }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { @@ -380,6 +381,14 @@ class TwitterIE(TwitterBaseIE): # promo_video_website card 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', 'only_matching': True, + }, { + # promo_video_convo card + 'url': 'https://twitter.com/poco_dandy/status/1047395834013384704', + 'only_matching': True, + }, { + # appplayer card + 'url': 'https://twitter.com/poco_dandy/status/1150646424461176832', + 'only_matching': True, }] def _real_extract(self, url): @@ -462,7 +471,30 @@ class TwitterIE(TwitterBaseIE): return try_get(o, lambda x: x[x['type'].lower() + '_value']) card_name = card['name'].split(':')[-1] - if card_name in ('amplify', 'promo_video_website'): + if card_name == 'player': + info.update({ + '_type': 'url', + 'url': get_binding_value('player_url'), + }) + elif card_name == 'periscope_broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + }) + elif card_name == 'broadcast': + info.update({ + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + }) + elif card_name == 'summary': + info.update({ + '_type': 'url', + 'url': get_binding_value('card_url'), + }) + # amplify, promo_video_website, promo_video_convo, appplayer, ... + else: is_amplify = card_name == 'amplify' vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) @@ -488,25 +520,6 @@ class TwitterIE(TwitterBaseIE): 'duration': int_or_none(get_binding_value( 'content_duration_seconds')), }) - elif card_name == 'player': - info.update({ - '_type': 'url', - 'url': get_binding_value('player_url'), - }) - elif card_name == 'periscope_broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('url') or get_binding_value('player_url'), - 'ie_key': PeriscopeIE.ie_key(), - }) - elif card_name == 'broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('broadcast_url'), - 'ie_key': TwitterBroadcastIE.ie_key(), - }) - else: - raise ExtractorError('Unsupported Twitter Card.') else: expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) if not expanded_url: diff --git a/youtube_dlc/extractor/xfileshare.py b/youtube_dlc/extractor/xfileshare.py index 48ef07ed1..cbd5d1cbb 100644 --- a/youtube_dlc/extractor/xfileshare.py +++ b/youtube_dlc/extractor/xfileshare.py @@ -45,6 +45,7 @@ def aa_decode(aa_code): class XFileShareIE(InfoExtractor): _SITES = ( + (r'aparat\.cam', 'Aparat'), (r'clipwatching\.com', 'ClipWatching'), (r'gounlimited\.to', 'GoUnlimited'), (r'govid\.me', 'GoVid'), @@ -78,6 +79,9 @@ class XFileShareIE(InfoExtractor): 'title': 'sample', 'thumbnail': r're:http://.*\.jpg', }, + }, { + 'url': 'https://aparat.cam/n4d6dh0wvlpr', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index 6a04b710e..586ad4150 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -5819,3 +5819,20 @@ def format_field(obj, field, template='%s', ignore=(None, ''), default='', func= if func and val not in ignore: val = func(val) return template % val if val not in ignore else default + + +def clean_podcast_url(url): + return re.sub(r'''(?x) + (?: + (?: + chtbl\.com/track| + media\.blubrry\.com| # https://create.blubrry.com/resources/podcast-media-download-statistics/getting-started/ + play\.podtrac\.com + )/[^/]+| + (?:dts|www)\.podtrac\.com/(?:pts/)?redirect\.[0-9a-z]{3,4}| # http://analytics.podtrac.com/how-to-measure + flex\.acast\.com| + pd(?: + cn\.co| # https://podcorn.com/analytics-prefix/ + st\.fm # https://podsights.com/docs/ + )/e + )/''', '', url) |