diff options
151 files changed, 6361 insertions, 2722 deletions
diff --git a/.gitignore b/.gitignore index e7cca0525..1b79afe62 100644 --- a/.gitignore +++ b/.gitignore @@ -41,8 +41,11 @@ cookies *.webp *.annotations.xml *.description +.cache/ + # Allow config/media files in testdata !test/** + # Python *.pyc *.pyo diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 2bf96affe..f035ce10d 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -129,3 +129,13 @@ Bojidarist nixklai smplayer-dev Zirro +CrypticSignal +flashdagger +fractalf +frafra +kaz-us +ozburo +rhendric +sdomi +selfisekai +stanoarn diff --git a/Changelog.md b/Changelog.md index b46199168..072dc336d 100644 --- a/Changelog.md +++ b/Changelog.md @@ -14,6 +14,95 @@ --> +### 2021.11.10.1 + +* Temporarily disable MacOS Build + +### 2021.11.10 + +* [youtube] **Fix throttling by decrypting n-sig** +* Merging extractors from [haruhi-dl](https://git.sakamoto.pl/laudom/haruhi-dl) by [selfisekai](https://github.com/selfisekai) + * [extractor] Add `_search_nextjs_data` + * [tvp] Fix extractors + * [tvp] Add TVPStreamIE + * [wppilot] Add extractors + * [polskieradio] Add extractors + * [radiokapital] Add extractors + * [polsatgo] Add extractor by [selfisekai](https://github.com/selfisekai), [sdomi](https://github.com/sdomi) +* Separate `--check-all-formats` from `--check-formats` +* Approximate filesize from bitrate +* Don't create console in `windows_enable_vt_mode` +* Fix bug in `--load-infojson` of playlists +* [minicurses] Add colors to `-F` and standardize color-printing code +* [outtmpl] Add type `link` for internet shortcut files +* [outtmpl] Add alternate forms for `q` and `j` +* [outtmpl] Do not traverse `None` +* [fragment] Fix progress display in fragmented downloads +* [downloader/ffmpeg] Fix vtt download with ffmpeg +* [ffmpeg] Detect presence of setts and libavformat version +* [ExtractAudio] Rescale `--audio-quality` correctly by [CrypticSignal](https://github.com/CrypticSignal), [pukkandan](https://github.com/pukkandan) +* [ExtractAudio] Use `libfdk_aac` if available by 
[CrypticSignal](https://github.com/CrypticSignal) +* [FormatSort] `eac3` is better than `ac3` +* [FormatSort] Fix some fields' defaults +* [generic] Detect more json_ld +* [generic] parse jwplayer with only the json URL +* [extractor] Add keyword automatically to SearchIE descriptions +* [extractor] Fix some errors being converted to `ExtractorError` +* [utils] Add `join_nonempty` +* [utils] Add `jwt_decode_hs256` by [Ashish0804](https://github.com/Ashish0804) +* [utils] Create `DownloadCancelled` exception +* [utils] Parse `vp09` as vp9 +* [utils] Sanitize URL when determining protocol +* [test/download] Fallback test to `bv` +* [docs] Minor documentation improvements +* [cleanup] Improvements to error and debug messages +* [cleanup] Minor fixes and cleanup +* [3speak] Add extractors by [Ashish0804](https://github.com/Ashish0804) +* [AmazonStore] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [Gab] Add extractor by [u-spec-png](https://github.com/u-spec-png) +* [mediaset] Add playlist support by [nixxo](https://github.com/nixxo) +* [MLSSoccer] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [N1] Add support for nova.rs by [u-spec-png](https://github.com/u-spec-png) +* [PlanetMarathi] Add extractor by [Ashish0804](https://github.com/Ashish0804) +* [RaiplayRadio] Add extractors by [frafra](https://github.com/frafra) +* [roosterteeth] Add series extractor +* [sky] Add `SkyNewsStoryIE` by [ajj8](https://github.com/ajj8) +* [youtube] Fix sorting for some videos +* [youtube] Populate `thumbnail` with the best "known" thumbnail +* [youtube] Refactor itag processing +* [youtube] Remove unnecessary no-playlist warning +* [youtube:tab] Add Invidious list for playlists/channels by [rhendric](https://github.com/rhendric) +* [Bilibili:comments] Fix infinite loop by [u-spec-png](https://github.com/u-spec-png) +* [ceskatelevize] Fix extractor by [flashdagger](https://github.com/flashdagger) +* [Coub] Fix media format identification by 
[wlritchi](https://github.com/wlritchi) +* [crunchyroll] Add extractor-args `language` and `hardsub` +* [DiscoveryPlus] Allow language codes in URL +* [imdb] Fix thumbnail by [ozburo](https://github.com/ozburo) +* [instagram] Add IOS URL support by [u-spec-png](https://github.com/u-spec-png) +* [instagram] Improve login code by [u-spec-png](https://github.com/u-spec-png) +* [Instagram] Improve metadata extraction by [u-spec-png](https://github.com/u-spec-png) +* [iPrima] Fix extractor by [stanoarn](https://github.com/stanoarn) +* [itv] Add support for ITV News by [ajj8](https://github.com/ajj8) +* [la7] Fix extractor by [nixxo](https://github.com/nixxo) +* [linkedin] Don't login multiple times +* [mtv] Fix some videos by [Sipherdrakon](https://github.com/Sipherdrakon) +* [Newgrounds] Fix description by [u-spec-png](https://github.com/u-spec-png) +* [Nrk] Minor fixes by [fractalf](https://github.com/fractalf) +* [Olympics] Fix extractor by [u-spec-png](https://github.com/u-spec-png) +* [piksel] Fix sorting +* [twitter] Do not sort by codec +* [viewlift] Add cookie-based login and series support by [Ashish0804](https://github.com/Ashish0804), [pukkandan](https://github.com/pukkandan) +* [vimeo] Detect source extension and misc cleanup by [flashdagger](https://github.com/flashdagger) +* [vimeo] Fix ondemand videos and direct URLs with hash +* [vk] Fix login and add subtitles by [kaz-us](https://github.com/kaz-us) +* [VLive] Add upload_date and thumbnail by [Ashish0804](https://github.com/Ashish0804) +* [VRT] Fix login by [pgaig](https://github.com/pgaig) +* [Vupload] Fix extractor by [u-spec-png](https://github.com/u-spec-png) +* [wakanim] Add support for MPD manifests by [nyuszika7h](https://github.com/nyuszika7h) +* [wakanim] Detect geo-restriction by [nyuszika7h](https://github.com/nyuszika7h) +* [ZenYandex] Fix extractor by [u-spec-png](https://github.com/u-spec-png) + + ### 2021.10.22 * [build] Improvements diff --git a/supportedsites.md b/supportedsites.md index 
01c3f43a9..50fa7f9f1 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -48,6 +48,7 @@ - **Alura** - **AluraCourse** - **Amara** + - **AmazonStore** - **AMCNetworks** - **AmericasTestKitchen** - **AmericasTestKitchenSeason** @@ -184,7 +185,6 @@ - **CCTV**: 央视网 - **CDA** - **CeskaTelevize** - - **CeskaTelevizePorady** - **CGTN** - **channel9**: Channel 9 - **CharlieRose** @@ -366,6 +366,7 @@ - **Funk** - **Fusion** - **Fux** + - **Gab** - **GabTV** - **Gaia** - **GameInformer** @@ -449,9 +450,11 @@ - **Instagram** - **instagram:tag**: Instagram hashtag search - **instagram:user**: Instagram user profile + - **InstagramIOS**: IOS instagram:// URL - **Internazionale** - **InternetVideoArchive** - **IPrima** + - **IPrimaCNN** - **iqiyi**: 爱奇艺 - **Ir90Tv** - **ITTF** @@ -560,6 +563,7 @@ - **MediaKlikk** - **Medialaan** - **Mediaset** + - **MediasetShow** - **Mediasite** - **MediasiteCatalog** - **MediasiteNamedCatalog** @@ -592,6 +596,7 @@ - **mixcloud:user** - **MLB** - **MLBVideo** + - **MLSSoccer** - **Mnet** - **MNetTV** - **MoeVideo**: LetitBit video services: moevideo.net, playreplay.net and videochart.net @@ -801,6 +806,7 @@ - **Pinterest** - **PinterestCollection** - **Pladform** + - **PlanetMarathi** - **Platzi** - **PlatziCourse** - **play.fm** @@ -817,7 +823,12 @@ - **podomatic** - **Pokemon** - **PokemonWatch** + - **PolsatGo** - **PolskieRadio** + - **polskieradio:kierowcow** + - **polskieradio:player** + - **polskieradio:podcast** + - **polskieradio:podcast:list** - **PolskieRadioCategory** - **Popcorntimes** - **PopcornTV** @@ -860,6 +871,8 @@ - **radiocanada:audiovideo** - **radiofrance** - **RadioJavan** + - **radiokapital** + - **radiokapital:show** - **radlive** - **radlive:channel** - **radlive:season** @@ -867,6 +880,8 @@ - **RaiPlay** - **RaiPlayLive** - **RaiPlayPlaylist** + - **RaiPlayRadio** + - **RaiPlayRadioPlaylist** - **RayWenderlich** - **RayWenderlichCourse** - **RBMARadio** @@ -894,6 +909,7 @@ - **RMCDecouverte** - 
**RockstarGames** - **RoosterTeeth** + - **RoosterTeethSeries** - **RottenTomatoes** - **Roxwel** - **Rozhlas** @@ -961,6 +977,7 @@ - **Sina** - **sky.it** - **sky:news** + - **sky:news:story** - **sky:sports** - **sky:sports:news** - **skyacademy.it** @@ -1079,6 +1096,8 @@ - **ThisAmericanLife** - **ThisAV** - **ThisOldHouse** + - **ThreeSpeak** + - **ThreeSpeakUser** - **TikTok** - **tiktok:user** - **tinypic**: tinypic.com videos @@ -1142,6 +1161,7 @@ - **tvp**: Telewizja Polska - **tvp:embed**: Telewizja Polska - **tvp:series** + - **tvp:stream** - **TVPlayer** - **TVPlayHome** - **Tweakers** @@ -1296,6 +1316,8 @@ - **WistiaPlaylist** - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **WorldStarHipHop** + - **wppilot** + - **wppilot:channels** - **WSJ**: Wall Street Journal - **WSJArticle** - **WWE** diff --git a/test/parameters.json b/test/parameters.json index 9ca7d2ca9..06fe3e31b 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -9,7 +9,7 @@ "forcetitle": false, "forceurl": false, "force_write_download_archive": false, - "format": "best", + "format": "b/bv", "ignoreerrors": false, "listformats": null, "logtostderr": false, diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index bd2d752e2..63ef50e1a 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -137,7 +137,7 @@ class TestFormatSelection(unittest.TestCase): test('webm/mp4', '47') test('3gp/40/mp4', '35') test('example-with-dashes', 'example-with-dashes') - test('all', '35', 'example-with-dashes', '45', '47', '2') # Order doesn't actually matter for this + test('all', '2', '47', '45', 'example-with-dashes', '35') test('mergeall', '2+47+45+example-with-dashes+35', multi=True) def test_format_selection_audio(self): @@ -520,7 +520,7 @@ class TestFormatSelection(unittest.TestCase): ydl = YDL({'format': 'all[width>=400][width<=600]'}) ydl.process_ie_result(info_dict) downloaded_ids = [info['format_id'] for info in ydl.downloaded_info_dicts] - 
self.assertEqual(downloaded_ids, ['B', 'C', 'D']) + self.assertEqual(downloaded_ids, ['D', 'C', 'B']) ydl = YDL({'format': 'best[height<40]'}) try: @@ -656,7 +656,7 @@ class TestYoutubeDL(unittest.TestCase): 'playlist_autonumber': 2, '_last_playlist_index': 100, 'n_entries': 10, - 'formats': [{'id': 'id1'}, {'id': 'id2'}, {'id': 'id3'}] + 'formats': [{'id': 'id 1'}, {'id': 'id 2'}, {'id': 'id 3'}] } def test_prepare_outtmpl_and_filename(self): @@ -737,6 +737,7 @@ class TestYoutubeDL(unittest.TestCase): test(NA_TEST_OUTTMPL, 'NA-NA-def-1234.mp4') test(NA_TEST_OUTTMPL, 'none-none-def-1234.mp4', outtmpl_na_placeholder='none') test(NA_TEST_OUTTMPL, '--def-1234.mp4', outtmpl_na_placeholder='') + test('%(non_existent.0)s', 'NA') # String formatting FMT_TEST_OUTTMPL = '%%(height)%s.%%(ext)s' @@ -762,14 +763,15 @@ class TestYoutubeDL(unittest.TestCase): test('a%(width|)d', 'a', outtmpl_na_placeholder='none') FORMATS = self.outtmpl_info['formats'] - sanitize = lambda x: x.replace(':', ' -').replace('"', "'") + sanitize = lambda x: x.replace(':', ' -').replace('"', "'").replace('\n', ' ') # Custom type casting - test('%(formats.:.id)l', 'id1, id2, id3') - test('%(formats.:.id)#l', ('id1\nid2\nid3', 'id1 id2 id3')) + test('%(formats.:.id)l', 'id 1, id 2, id 3') + test('%(formats.:.id)#l', ('id 1\nid 2\nid 3', 'id 1 id 2 id 3')) test('%(ext)l', 'mp4') - test('%(formats.:.id) 15l', ' id1, id2, id3') + test('%(formats.:.id) 18l', ' id 1, id 2, id 3') test('%(formats)j', (json.dumps(FORMATS), sanitize(json.dumps(FORMATS)))) + test('%(formats)#j', (json.dumps(FORMATS, indent=4), sanitize(json.dumps(FORMATS, indent=4)))) test('%(title5).3B', 'á') test('%(title5)U', 'áéí 𝐀') test('%(title5)#U', 'a\u0301e\u0301i\u0301 𝐀') @@ -777,8 +779,12 @@ class TestYoutubeDL(unittest.TestCase): test('%(title5)+#U', 'a\u0301e\u0301i\u0301 A') if compat_os_name == 'nt': test('%(title4)q', ('"foo \\"bar\\" test"', "'foo _'bar_' test'")) + test('%(formats.:.id)#q', ('"id 1" "id 2" "id 3"', "'id 1' 
'id 2' 'id 3'")) + test('%(formats.0.id)#q', ('"id 1"', "'id 1'")) else: test('%(title4)q', ('\'foo "bar" test\'', "'foo 'bar' test'")) + test('%(formats.:.id)#q', "'id 1' 'id 2' 'id 3'") + test('%(formats.0.id)#q', "'id 1'") # Internal formatting test('%(timestamp-1000>%H-%M-%S)s', '11-43-20') diff --git a/test/test_aes.py b/test/test_aes.py index 46db59e57..5c9273f8a 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -10,6 +10,8 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from yt_dlp.aes import ( aes_decrypt, aes_encrypt, + aes_ecb_encrypt, + aes_ecb_decrypt, aes_cbc_decrypt, aes_cbc_decrypt_bytes, aes_cbc_encrypt, @@ -17,7 +19,8 @@ from yt_dlp.aes import ( aes_ctr_encrypt, aes_gcm_decrypt_and_verify, aes_gcm_decrypt_and_verify_bytes, - aes_decrypt_text + aes_decrypt_text, + BLOCK_SIZE_BYTES, ) from yt_dlp.compat import compat_pycrypto_AES from yt_dlp.utils import bytes_to_intlist, intlist_to_bytes @@ -94,6 +97,19 @@ class TestAES(unittest.TestCase): decrypted = (aes_decrypt_text(encrypted, password, 32)) self.assertEqual(decrypted, self.secret_msg) + def test_ecb_encrypt(self): + data = bytes_to_intlist(self.secret_msg) + data += [0x08] * (BLOCK_SIZE_BYTES - len(data) % BLOCK_SIZE_BYTES) + encrypted = intlist_to_bytes(aes_ecb_encrypt(data, self.key, self.iv)) + self.assertEqual( + encrypted, + b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') + + def test_ecb_decrypt(self): + data = bytes_to_intlist(b'\xaa\x86]\x81\x97>\x02\x92\x9d\x1bR[[L/u\xd3&\xd1(h\xde{\x81\x94\xba\x02\xae\xbd\xa6\xd0:') + decrypted = intlist_to_bytes(aes_ecb_decrypt(data, self.key, self.iv)) + self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + if __name__ == '__main__': unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 68c1c68d3..2d89366d4 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -38,7 +38,6 @@ class 
TestAllURLsMatching(unittest.TestCase): assertTab('https://www.youtube.com/AsapSCIENCE') assertTab('https://www.youtube.com/embedded') assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') - assertTab('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668 self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M')) diff --git a/test/test_jsinterp.py b/test/test_jsinterp.py index 8b2b60403..e230b045f 100644 --- a/test/test_jsinterp.py +++ b/test/test_jsinterp.py @@ -112,6 +112,71 @@ class TestJSInterpreter(unittest.TestCase): ''') self.assertEqual(jsi.call_function('z'), 5) + def test_for_loop(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) {a++} a } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_switch(self): + jsi = JSInterpreter(''' + function x(f) { switch(f){ + case 1:f+=1; + case 2:f+=2; + case 3:f+=3;break; + case 4:f+=4; + default:f=0; + } return f } + ''') + self.assertEqual(jsi.call_function('x', 1), 7) + self.assertEqual(jsi.call_function('x', 3), 6) + self.assertEqual(jsi.call_function('x', 5), 0) + + def test_switch_default(self): + jsi = JSInterpreter(''' + function x(f) { switch(f){ + case 2: f+=2; + default: f-=1; + case 5: + case 6: f+=6; + case 0: break; + case 1: f+=1; + } return f } + ''') + self.assertEqual(jsi.call_function('x', 1), 2) + self.assertEqual(jsi.call_function('x', 5), 11) + self.assertEqual(jsi.call_function('x', 9), 14) + + def test_try(self): + jsi = JSInterpreter(''' + function x() { try{return 10} catch(e){return 5} } + ''') + self.assertEqual(jsi.call_function('x'), 10) + + def test_for_loop_continue(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) { continue; a++ } a } + ''') + self.assertEqual(jsi.call_function('x'), 
0) + + def test_for_loop_break(self): + jsi = JSInterpreter(''' + function x() { a=0; for (i=0; i-10; i++) { break; a++ } a } + ''') + self.assertEqual(jsi.call_function('x'), 0) + + def test_literal_list(self): + jsi = JSInterpreter(''' + function x() { [1, 2, "asdf", [5, 6, 7]][3] } + ''') + self.assertEqual(jsi.call_function('x'), [5, 6, 7]) + + def test_comma(self): + jsi = JSInterpreter(''' + function x() { a=5; a -= 1, a+=3; return a } + ''') + self.assertEqual(jsi.call_function('x'), 7) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index 810ed3de4..22dda4f37 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1222,12 +1222,49 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4') def test_render_table(self): self.assertEqual( render_table( + ['a', 'empty', 'bcd'], + [[123, '', 4], [9999, '', 51]]), + 'a empty bcd\n' + '123 4\n' + '9999 51') + + self.assertEqual( + render_table( + ['a', 'empty', 'bcd'], + [[123, '', 4], [9999, '', 51]], + hide_empty=True), + 'a bcd\n' + '123 4\n' + '9999 51') + + self.assertEqual( + render_table( + ['\ta', 'bcd'], + [['1\t23', 4], ['\t9999', 51]]), + ' a bcd\n' + '1 23 4\n' + '9999 51') + + self.assertEqual( + render_table( ['a', 'bcd'], - [[123, 4], [9999, 51]]), + [[123, 4], [9999, 51]], + delim='-'), 'a bcd\n' + '--------\n' '123 4\n' '9999 51') + self.assertEqual( + render_table( + ['a', 'bcd'], + [[123, 4], [9999, 51]], + delim='-', extra_gap=2), + 'a bcd\n' + '----------\n' + '123 4\n' + '9999 51') + def test_match_str(self): # Unary self.assertFalse(match_str('xy', {'x': 1200})) @@ -1620,9 +1657,9 @@ Line 1 self.assertEqual(repr(LazyList(it)), repr(it)) self.assertEqual(str(LazyList(it)), str(it)) - self.assertEqual(list(LazyList(it).reverse()), it[::-1]) - self.assertEqual(list(LazyList(it).reverse()[1:3:7]), it[::-1][1:3:7]) - self.assertEqual(list(LazyList(it).reverse()[::-1]), it) + self.assertEqual(list(LazyList(it, reverse=True)), 
it[::-1]) + self.assertEqual(list(reversed(LazyList(it))[::-1]), it) + self.assertEqual(list(reversed(LazyList(it))[1:3:7]), it[::-1][1:3:7]) def test_LazyList_laziness(self): @@ -1635,13 +1672,13 @@ Line 1 test(ll, 5, 5, range(6)) test(ll, -3, 7, range(10)) - ll = LazyList(range(10)).reverse() + ll = LazyList(range(10), reverse=True) test(ll, -1, 0, range(1)) test(ll, 3, 6, range(10)) ll = LazyList(itertools.count()) test(ll, 10, 10, range(11)) - ll.reverse() + ll = reversed(ll) test(ll, -15, 14, range(15)) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index e831393e4..d9638658d 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -26,29 +26,31 @@ class TestYoutubeLists(unittest.TestCase): def test_youtube_playlist_noplaylist(self): dl = FakeYDL() dl.params['noplaylist'] = True - ie = YoutubePlaylistIE(dl) + ie = YoutubeTabIE(dl) result = ie.extract('https://www.youtube.com/watch?v=FXxLjLQi3Fg&list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') self.assertEqual(result['_type'], 'url') - self.assertEqual(YoutubeIE().extract_id(result['url']), 'FXxLjLQi3Fg') + self.assertEqual(YoutubeIE.extract_id(result['url']), 'FXxLjLQi3Fg') def test_youtube_course(self): + print('Skipping: Course URLs no longer exists') + return dl = FakeYDL() ie = YoutubePlaylistIE(dl) # TODO find a > 100 (paginating?) 
videos course result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') entries = list(result['entries']) - self.assertEqual(YoutubeIE().extract_id(entries[0]['url']), 'j9WZyLZCBzs') + self.assertEqual(YoutubeIE.extract_id(entries[0]['url']), 'j9WZyLZCBzs') self.assertEqual(len(entries), 25) - self.assertEqual(YoutubeIE().extract_id(entries[-1]['url']), 'rYefUsYuEp0') + self.assertEqual(YoutubeIE.extract_id(entries[-1]['url']), 'rYefUsYuEp0') def test_youtube_mix(self): dl = FakeYDL() - ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/watch?v=W01L70IGBgE&index=2&list=RDOQpdSVF_k_w') - entries = result['entries'] + ie = YoutubeTabIE(dl) + result = ie.extract('https://www.youtube.com/watch?v=tyITL_exICo&list=RDCLAK5uy_kLWIr9gv1XLlPbaDS965-Db4TrBoUTxQ8') + entries = list(result['entries']) self.assertTrue(len(entries) >= 50) original_video = entries[0] - self.assertEqual(original_video['id'], 'OQpdSVF_k_w') + self.assertEqual(original_video['id'], 'tyITL_exICo') def test_youtube_toptracks(self): print('Skipping: The playlist page gives error 500') @@ -68,10 +70,10 @@ class TestYoutubeLists(unittest.TestCase): entries = list(result['entries']) self.assertTrue(len(entries) == 1) video = entries[0] - self.assertEqual(video['_type'], 'url_transparent') + self.assertEqual(video['_type'], 'url') self.assertEqual(video['ie_key'], 'Youtube') self.assertEqual(video['id'], 'BaW_jenozKc') - self.assertEqual(video['url'], 'BaW_jenozKc') + self.assertEqual(video['url'], 'https://www.youtube.com/watch?v=BaW_jenozKc') self.assertEqual(video['title'], 'youtube-dl test video "\'/\\ä↭𝕐') self.assertEqual(video['duration'], 10) self.assertEqual(video['uploader'], 'Philipp Hagemeister') diff --git a/test/test_youtube_signature.py b/test/test_youtube_signature.py index dcf6ab60d..3359ac457 100644 --- a/test/test_youtube_signature.py +++ b/test/test_youtube_signature.py @@ -14,9 +14,10 @@ import string from test.helper import 
FakeYDL, is_download_test from yt_dlp.extractor import YoutubeIE +from yt_dlp.jsinterp import JSInterpreter from yt_dlp.compat import compat_str, compat_urlretrieve -_TESTS = [ +_SIG_TESTS = [ ( 'https://s.ytimg.com/yts/jsbin/html5player-vflHOr_nV.js', 86, @@ -64,6 +65,25 @@ _TESTS = [ ) ] +_NSIG_TESTS = [ + ( + 'https://www.youtube.com/s/player/9216d1f7/player_ias.vflset/en_US/base.js', + 'SLp9F5bwjAdhE9F-', 'gWnb9IK2DJ8Q1w', + ), + ( + 'https://www.youtube.com/s/player/f8cb7a3b/player_ias.vflset/en_US/base.js', + 'oBo2h5euWy6osrUt', 'ivXHpm7qJjJN', + ), + ( + 'https://www.youtube.com/s/player/2dfe380c/player_ias.vflset/en_US/base.js', + 'oBo2h5euWy6osrUt', '3DIBbn3qdQ', + ), + ( + 'https://www.youtube.com/s/player/f1ca6900/player_ias.vflset/en_US/base.js', + 'cu3wyu6LQn2hse', 'jvxetvmlI9AN9Q', + ), +] + @is_download_test class TestPlayerInfo(unittest.TestCase): @@ -97,35 +117,49 @@ class TestSignature(unittest.TestCase): os.mkdir(self.TESTDATA_DIR) -def make_tfunc(url, sig_input, expected_sig): - m = re.match(r'.*-([a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$', url) - assert m, '%r should follow URL format' % url - test_id = m.group(1) +def t_factory(name, sig_func, url_pattern): + def make_tfunc(url, sig_input, expected_sig): + m = url_pattern.match(url) + assert m, '%r should follow URL format' % url + test_id = m.group('id') + + def test_func(self): + basename = f'player-{name}-{test_id}.js' + fn = os.path.join(self.TESTDATA_DIR, basename) + + if not os.path.exists(fn): + compat_urlretrieve(url, fn) + with io.open(fn, encoding='utf-8') as testf: + jscode = testf.read() + self.assertEqual(sig_func(jscode, sig_input), expected_sig) + + test_func.__name__ = f'test_{name}_js_{test_id}' + setattr(TestSignature, test_func.__name__, test_func) + return make_tfunc + - def test_func(self): - basename = 'player-%s.js' % test_id - fn = os.path.join(self.TESTDATA_DIR, basename) +def signature(jscode, sig_input): + func = YoutubeIE(FakeYDL())._parse_sig_js(jscode) 
+ src_sig = ( + compat_str(string.printable[:sig_input]) + if isinstance(sig_input, int) else sig_input) + return func(src_sig) - if not os.path.exists(fn): - compat_urlretrieve(url, fn) - ydl = FakeYDL() - ie = YoutubeIE(ydl) - with io.open(fn, encoding='utf-8') as testf: - jscode = testf.read() - func = ie._parse_sig_js(jscode) - src_sig = ( - compat_str(string.printable[:sig_input]) - if isinstance(sig_input, int) else sig_input) - got_sig = func(src_sig) - self.assertEqual(got_sig, expected_sig) +def n_sig(jscode, sig_input): + funcname = YoutubeIE(FakeYDL())._extract_n_function_name(jscode) + return JSInterpreter(jscode).call_function(funcname, sig_input) - test_func.__name__ = str('test_signature_js_' + test_id) - setattr(TestSignature, test_func.__name__, test_func) +make_sig_test = t_factory( + 'signature', signature, re.compile(r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.[a-z]+$')) +for test_spec in _SIG_TESTS: + make_sig_test(*test_spec) -for test_spec in _TESTS: - make_tfunc(*test_spec) +make_nsig_test = t_factory( + 'nsig', n_sig, re.compile(r'.+/player/(?P<id>[a-zA-Z0-9_-]+)/.+.js$')) +for test_spec in _NSIG_TESTS: + make_nsig_test(*test_spec) if __name__ == '__main__': diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index b10e56fa1..15995bd3d 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -74,6 +74,7 @@ from .utils import ( int_or_none, iri_to_uri, ISO3166Utils, + join_nonempty, LazyList, LINK_TEMPLATES, locked_file, @@ -209,12 +210,15 @@ class YoutubeDL(object): simulate: Do not download the video files. If unset (or None), simulate only if listsubtitles, listformats or list_thumbnails is used format: Video format code. see "FORMAT SELECTION" for more details. + You can also pass a function. The function takes 'ctx' as + argument and returns the formats to download. + See "build_format_selector" for an implementation allow_unplayable_formats: Allow unplayable formats to be extracted and downloaded. 
ignore_no_formats_error: Ignore "No video formats" error. Useful for extracting metadata even if the video is not actually available for download (experimental) - format_sort: How to sort the video formats. see "Sorting Formats" - for more details. + format_sort: A list of fields by which to sort the video formats. + See "Sorting Formats" for more details. format_sort_force: Force the given format_sort. see "Sorting Formats" for more details. allow_multiple_video_streams: Allow multiple video streams to be merged @@ -372,8 +376,7 @@ class YoutubeDL(object): (with status "started" and "finished") if the processing is successful. merge_output_format: Extension to use when merging formats. final_ext: Expected final extension; used to detect when the file was - already downloaded and converted. "merge_output_format" is - replaced by this extension when given + already downloaded and converted fixup: Automatically correct known faults of the file. One of: - "never": do nothing @@ -427,7 +430,7 @@ class YoutubeDL(object): compat_opts: Compatibility options. See "Differences in default behavior". The following options do not work when used through the API: filename, abort-on-error, multistreams, no-live-chat, format-sort - no-clean-infojson, no-playlist-metafiles, no-keep-subs. + no-clean-infojson, no-playlist-metafiles, no-keep-subs, no-attach-info-json. Refer __init__.py for their implementation progress_template: Dictionary of templates for progress outputs. Allowed keys are 'download', 'postprocess', @@ -439,7 +442,7 @@ class YoutubeDL(object): nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, fragment_retries, continuedl, noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size, - external_downloader_args. + external_downloader_args, concurrent_fragment_downloads. 
The following options are used by the post processors: prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, @@ -524,7 +527,6 @@ class YoutubeDL(object): self.cache = Cache(self) windows_enable_vt_mode() - # FIXME: This will break if we ever print color to stdout self._allow_colors = { 'screen': not self.params.get('no_color') and supports_terminal_sequences(self._screen_file), 'err': not self.params.get('no_color') and supports_terminal_sequences(self._err_file), @@ -612,6 +614,7 @@ class YoutubeDL(object): # Creating format selector here allows us to catch syntax errors before the extraction self.format_selector = ( None if self.params.get('format') is None + else self.params['format'] if callable(self.params['format']) else self.build_format_selector(self.params['format'])) self._setup_opener() @@ -628,11 +631,14 @@ class YoutubeDL(object): pp = pp_class(self, **compat_kwargs(pp_def)) self.add_post_processor(pp, when=when) - for ph in self.params.get('post_hooks', []): - self.add_post_hook(ph) - - for ph in self.params.get('progress_hooks', []): - self.add_progress_hook(ph) + hooks = { + 'post_hooks': self.add_post_hook, + 'progress_hooks': self.add_progress_hook, + 'postprocessor_hooks': self.add_postprocessor_hook, + } + for opt, fn in hooks.items(): + for ph in self.params.get(opt, []): + fn(ph) register_socks_protocols() @@ -840,6 +846,7 @@ class YoutubeDL(object): DELIM = 'blue' ERROR = 'red' WARNING = 'yellow' + SUPPRESS = 'light black' def __format_text(self, out, text, f, fallback=None, *, test_encoding=False): assert out in ('screen', 'err') @@ -1102,22 +1109,23 @@ class YoutubeDL(object): value = default if value is None else value + flags = outer_mobj.group('conversion') or '' str_fmt = f'{fmt[:-1]}s' if fmt[-1] == 'l': # list - delim = '\n' if '#' in (outer_mobj.group('conversion') or '') else ', ' + delim = '\n' if '#' in flags else ', ' value, fmt = delim.join(variadic(value)), str_fmt elif fmt[-1] == 'j': # json - value, fmt 
= json.dumps(value, default=_dumpjson_default), str_fmt + value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt elif fmt[-1] == 'q': # quoted - value, fmt = compat_shlex_quote(str(value)), str_fmt + value = map(str, variadic(value) if '#' in flags else [value]) + value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt elif fmt[-1] == 'B': # bytes value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8') value, fmt = value.decode('utf-8', 'ignore'), 's' elif fmt[-1] == 'U': # unicode normalized - opts = outer_mobj.group('conversion') or '' value, fmt = unicodedata.normalize( # "+" = compatibility equivalence, "#" = NFD - 'NF%s%s' % ('K' if '+' in opts else '', 'D' if '#' in opts else 'C'), + 'NF%s%s' % ('K' if '+' in flags else '', 'D' if '#' in flags else 'C'), value), str_fmt elif fmt[-1] == 'c': if value: @@ -1168,7 +1176,7 @@ class YoutubeDL(object): sub_ext = '' if len(fn_groups) > 2: sub_ext = fn_groups[-2] - filename = '.'.join(filter(None, [fn_groups[0][:trim_file_name], sub_ext, ext])) + filename = join_nonempty(fn_groups[0][:trim_file_name], sub_ext, ext, delim='.') return filename except ValueError as err: @@ -1316,11 +1324,11 @@ class YoutubeDL(object): self.report_error(msg) except ExtractorError as e: # An error we somewhat expected self.report_error(compat_str(e), e.format_traceback()) - except ThrottledDownload: + except ThrottledDownload as e: self.to_stderr('\r') - self.report_warning('The download speed is below throttle limit. 
Re-extracting data') + self.report_warning(f'{e}; Re-extracting data') return wrapper(self, *args, **kwargs) - except (DownloadCancelled, LazyList.IndexError): + except (DownloadCancelled, LazyList.IndexError, PagedList.IndexError): raise except Exception as e: if self.params.get('ignoreerrors'): @@ -1497,12 +1505,14 @@ class YoutubeDL(object): self.to_screen('[download] Downloading playlist: %s' % playlist) if 'entries' not in ie_result: - raise EntryNotInPlaylist() + raise EntryNotInPlaylist('There are no entries') + + MissingEntry = object() incomplete_entries = bool(ie_result.get('requested_entries')) if incomplete_entries: - def fill_missing_entries(entries, indexes): - ret = [None] * max(*indexes) - for i, entry in zip(indexes, entries): + def fill_missing_entries(entries, indices): + ret = [MissingEntry] * max(indices) + for i, entry in zip(indices, entries): ret[i - 1] = entry return ret ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries']) @@ -1537,7 +1547,7 @@ class YoutubeDL(object): def get_entry(i): return ie_entries[i - 1] else: - if not isinstance(ie_entries, PagedList): + if not isinstance(ie_entries, (PagedList, LazyList)): ie_entries = LazyList(ie_entries) def get_entry(i): @@ -1555,11 +1565,11 @@ class YoutubeDL(object): entry = None try: entry = get_entry(i) - if entry is None: + if entry is MissingEntry: raise EntryNotInPlaylist() except (IndexError, EntryNotInPlaylist): if incomplete_entries: - raise EntryNotInPlaylist() + raise EntryNotInPlaylist(f'Entry {i} cannot be found') elif not playlistitems: break entries.append(entry) @@ -1577,10 +1587,11 @@ class YoutubeDL(object): if entry is not None] n_entries = len(entries) - if not playlistitems and (playliststart or playlistend): + if not playlistitems and (playliststart != 1 or playlistend): playlistitems = list(range(playliststart, playliststart + n_entries)) ie_result['requested_entries'] = playlistitems + _infojson_written = False if not 
self.params.get('simulate') and self.params.get('allow_playlist_files', True): ie_copy = { 'playlist': playlist, @@ -1593,8 +1604,9 @@ class YoutubeDL(object): } ie_copy.update(dict(ie_result)) - if self._write_info_json('playlist', ie_result, - self.prepare_filename(ie_copy, 'pl_infojson')) is None: + _infojson_written = self._write_info_json( + 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson')) + if _infojson_written is None: return if self._write_description('playlist', ie_result, self.prepare_filename(ie_copy, 'pl_description')) is None: @@ -1647,9 +1659,14 @@ class YoutubeDL(object): self.report_error( 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures)) break - # TODO: skip failed (empty) entries? playlist_results.append(entry_result) ie_result['entries'] = playlist_results + + # Write the updated info to json + if _infojson_written and self._write_info_json( + 'updated playlist', ie_result, + self.prepare_filename(ie_copy, 'pl_infojson'), overwrite=True) is None: + return self.to_screen('[download] Finished downloading playlist: %s' % playlist) return ie_result @@ -1722,9 +1739,10 @@ class YoutubeDL(object): def _check_formats(self, formats): for f in formats: self.to_screen('[info] Testing format %s' % f['format_id']) - temp_file = tempfile.NamedTemporaryFile( - suffix='.tmp', delete=False, - dir=self.get_output_path('temp') or None) + path = self.get_output_path('temp') + if not self._ensure_dir_exists(f'{path}/'): + continue + temp_file = tempfile.NamedTemporaryFile(suffix='.tmp', delete=False, dir=path or None) temp_file.close() try: success, _ = self.dl(temp_file.name, f, test=True) @@ -1925,9 +1943,9 @@ class YoutubeDL(object): 'format_id': '+'.join(filtered('format_id')), 'ext': output_ext, 'protocol': '+'.join(map(determine_protocol, formats_info)), - 'language': '+'.join(orderedSet(filtered('language'))), - 'format_note': '+'.join(orderedSet(filtered('format_note'))), - 
'filesize_approx': sum(filtered('filesize', 'filesize_approx')), + 'language': '+'.join(orderedSet(filtered('language'))) or None, + 'format_note': '+'.join(orderedSet(filtered('format_note'))) or None, + 'filesize_approx': sum(filtered('filesize', 'filesize_approx')) or None, 'tbr': sum(filtered('tbr', 'vbr', 'abr')), } @@ -1994,7 +2012,7 @@ class YoutubeDL(object): # TODO: Add allvideo, allaudio etc by generalizing the code with best/worst selector if format_spec == 'all': def selector_function(ctx): - yield from _check_formats(ctx['formats']) + yield from _check_formats(ctx['formats'][::-1]) elif format_spec == 'mergeall': def selector_function(ctx): formats = list(_check_formats(ctx['formats'])) @@ -2149,7 +2167,7 @@ class YoutubeDL(object): t['url'] = sanitize_url(t['url']) if self.params.get('check_formats') is True: - info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1])).reverse() + info_dict['thumbnails'] = LazyList(check_thumbnails(thumbnails[::-1]), reverse=True) else: info_dict['thumbnails'] = thumbnails @@ -2344,7 +2362,7 @@ class YoutubeDL(object): # TODO Central sorting goes here if self.params.get('check_formats') is True: - formats = LazyList(self._check_formats(formats[::-1])).reverse() + formats = LazyList(self._check_formats(formats[::-1]), reverse=True) if not formats or formats[0] is not info_dict: # only set the 'formats' fields if the original info_dict list them @@ -2355,6 +2373,9 @@ class YoutubeDL(object): info_dict, _ = self.pre_process(info_dict) + # The pre-processors may have modified the formats + formats = info_dict.get('formats', [info_dict]) + if self.params.get('list_thumbnails'): self.list_thumbnails(info_dict) if self.params.get('listformats'): @@ -2636,6 +2657,8 @@ class YoutubeDL(object): infofn = self.prepare_filename(info_dict, 'infojson') _infojson_written = self._write_info_json('video', info_dict, infofn) if _infojson_written: + info_dict['infojson_filename'] = infofn + # For backward compatability, even 
though it was a private field info_dict['__infojson_filename'] = infofn elif _infojson_written is None: return @@ -2668,6 +2691,8 @@ class YoutubeDL(object): self.report_error('Cannot write internet shortcut file because the "webpage_url" field is missing in the media information') return False linkfn = replace_extension(self.prepare_filename(info_dict, 'link'), link_type, info_dict.get('ext')) + if not self._ensure_dir_exists(encodeFilename(linkfn)): + return False if self.params.get('overwrites', True) and os.path.exists(encodeFilename(linkfn)): self.to_screen(f'[info] Internet shortcut (.{link_type}) is already present') return True @@ -2909,9 +2934,10 @@ class YoutubeDL(object): downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None downloader = downloader.__name__ if downloader else None ffmpeg_fixup(info_dict.get('requested_formats') is None and downloader == 'HlsFD', - 'malformed AAC bitstream detected', FFmpegFixupM3u8PP) - ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed timestamps detected', FFmpegFixupTimestampPP) - ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'malformed duration detected', FFmpegFixupDurationPP) + 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', + FFmpegFixupM3u8PP) + ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed timestamps detected', FFmpegFixupTimestampPP) + ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed duration detected', FFmpegFixupDurationPP) fixup() try: @@ -2933,8 +2959,25 @@ class YoutubeDL(object): if max_downloads is not None and self._num_downloads >= int(max_downloads): raise MaxDownloadsReached() + def __download_wrapper(self, func): + @functools.wraps(func) + def wrapper(*args, **kwargs): + try: + res = func(*args, **kwargs) + except UnavailableVideoError as e: + self.report_error(e) + except DownloadCancelled as e: + self.to_screen(f'[info] {e}') + raise + else: + if self.params.get('dump_single_json', False): + 
self.post_extract(res) + self.to_stdout(json.dumps(self.sanitize_info(res))) + return wrapper + def download(self, url_list): """Download a given list of URLs.""" + url_list = variadic(url_list) # Passing a single URL is a common mistake outtmpl = self.outtmpl_dict['default'] if (len(url_list) > 1 and outtmpl != '-' @@ -2943,19 +2986,8 @@ class YoutubeDL(object): raise SameFileError(outtmpl) for url in url_list: - try: - # It also downloads the videos - res = self.extract_info( - url, force_generic_extractor=self.params.get('force_generic_extractor', False)) - except UnavailableVideoError: - self.report_error('unable to download video') - except DownloadCancelled as e: - self.to_screen(f'[info] {e.msg}') - raise - else: - if self.params.get('dump_single_json', False): - self.post_extract(res) - self.to_stdout(json.dumps(self.sanitize_info(res))) + self.__download_wrapper(self.extract_info)( + url, force_generic_extractor=self.params.get('force_generic_extractor', False)) return self._download_retcode @@ -2966,11 +2998,13 @@ class YoutubeDL(object): # FileInput doesn't have a read method, we can't call json.load info = self.sanitize_info(json.loads('\n'.join(f)), self.params.get('clean_infojson', True)) try: - self.process_ie_result(info, download=True) - except (DownloadError, EntryNotInPlaylist, ThrottledDownload): + self.__download_wrapper(self.process_ie_result)(info, download=True) + except (DownloadError, EntryNotInPlaylist, ThrottledDownload) as e: + if not isinstance(e, EntryNotInPlaylist): + self.to_stderr('\r') webpage_url = info.get('webpage_url') if webpage_url is not None: - self.report_warning('The info failed to download, trying with "%s"' % webpage_url) + self.report_warning(f'The info failed to download: {e}; trying with URL {webpage_url}') return self.download([webpage_url]) else: raise @@ -2983,11 +3017,11 @@ class YoutubeDL(object): return info_dict info_dict.setdefault('epoch', int(time.time())) remove_keys = {'__original_infodict'} # Always 
remove this since this may contain a copy of the entire dict - keep_keys = ['_type'], # Always keep this to facilitate load-info-json + keep_keys = ['_type'] # Always keep this to facilitate load-info-json if remove_private_keys: remove_keys |= { - 'requested_formats', 'requested_subtitles', 'requested_entries', - 'filepath', 'entries', 'original_url', 'playlist_autonumber', + 'requested_formats', 'requested_subtitles', 'requested_entries', 'entries', + 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber', } empty_values = (None, {}, [], set(), tuple()) reject = lambda k, v: k not in keep_keys and ( @@ -3117,22 +3151,17 @@ class YoutubeDL(object): @staticmethod def format_resolution(format, default='unknown'): - is_images = format.get('vcodec') == 'none' and format.get('acodec') == 'none' if format.get('vcodec') == 'none' and format.get('acodec') != 'none': return 'audio only' if format.get('resolution') is not None: return format['resolution'] if format.get('width') and format.get('height'): - res = '%dx%d' % (format['width'], format['height']) + return '%dx%d' % (format['width'], format['height']) elif format.get('height'): - res = '%sp' % format['height'] + return '%sp' % format['height'] elif format.get('width'): - res = '%dx?' % format['width'] - elif is_images: - return 'images' - else: - return default - return f'{res} images' if is_images else res + return '%dx?' 
% format['width'] + return default def _format_note(self, fdict): res = '' @@ -3199,37 +3228,42 @@ class YoutubeDL(object): formats = info_dict.get('formats', [info_dict]) new_format = self.params.get('listformats_table', True) is not False if new_format: - tbr_digits = number_of_digits(max(f.get('tbr') or 0 for f in formats)) - vbr_digits = number_of_digits(max(f.get('vbr') or 0 for f in formats)) - abr_digits = number_of_digits(max(f.get('abr') or 0 for f in formats)) delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True) table = [ [ self._format_screen(format_field(f, 'format_id'), self.Styles.ID), format_field(f, 'ext'), - self.format_resolution(f), - format_field(f, 'fps', '%d'), + format_field(f, func=self.format_resolution, ignore=('audio only', 'images')), + format_field(f, 'fps', '\t%d'), format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), delim, - format_field(f, 'filesize', ' %s', func=format_bytes) + format_field(f, 'filesize_approx', '~%s', func=format_bytes), - format_field(f, 'tbr', f'%{tbr_digits}dk'), - shorten_protocol_name(f.get('protocol', '').replace("native", "n")), + format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes), + format_field(f, 'tbr', '\t%dk'), + shorten_protocol_name(f.get('protocol', '').replace('native', 'n')), delim, - format_field(f, 'vcodec', default='unknown').replace('none', ''), - format_field(f, 'vbr', f'%{vbr_digits}dk'), - format_field(f, 'acodec', default='unknown').replace('none', ''), - format_field(f, 'abr', f'%{abr_digits}dk'), - format_field(f, 'asr', '%5dHz'), - ', '.join(filter(None, ( - self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else '', + format_field(f, 'vcodec', default='unknown').replace( + 'none', + 'images' if f.get('acodec') == 'none' + else self._format_screen('audio only', self.Styles.SUPPRESS)), + format_field(f, 'vbr', '\t%dk'), + 
format_field(f, 'acodec', default='unknown').replace( + 'none', + '' if f.get('vcodec') == 'none' + else self._format_screen('video only', self.Styles.SUPPRESS)), + format_field(f, 'abr', '\t%dk'), + format_field(f, 'asr', '\t%dHz'), + join_nonempty( + self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, format_field(f, 'language', '[%s]'), - format_field(f, 'format_note'), - format_field(f, 'container', ignore=(None, f.get('ext'))), - ))), + join_nonempty( + format_field(f, 'format_note'), + format_field(f, 'container', ignore=(None, f.get('ext'))), + delim=', '), + delim=' '), ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] header_line = self._list_format_headers( - 'ID', 'EXT', 'RESOLUTION', 'FPS', 'HDR', delim, ' FILESIZE', ' TBR', 'PROTO', - delim, 'VCODEC', ' VBR', 'ACODEC', ' ABR', ' ASR', 'MORE INFO') + 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO', + delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO') else: table = [ [ @@ -3245,8 +3279,8 @@ class YoutubeDL(object): '[info] Available formats for %s:' % info_dict['id']) self.to_stdout(render_table( header_line, table, - extraGap=(0 if new_format else 1), - hideEmpty=new_format, + extra_gap=(0 if new_format else 1), + hide_empty=new_format, delim=new_format and self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True))) def list_thumbnails(self, info_dict): @@ -3277,7 +3311,7 @@ class YoutubeDL(object): self.to_stdout(render_table( self._list_format_headers('Language', 'Name', 'Formats'), [_row(lang, formats) for lang, formats in subtitles.items()], - hideEmpty=True)) + hide_empty=True)) def urlopen(self, req): """ Start an HTTP download """ @@ -3349,7 +3383,11 @@ class YoutubeDL(object): platform.architecture()[0], platform_name())) - exe_versions = FFmpegPostProcessor.get_versions(self) + exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self) + 
ffmpeg_features = {key for key, val in ffmpeg_features.items() if val} + if ffmpeg_features: + exe_versions['ffmpeg'] += ' (%s)' % ','.join(ffmpeg_features) + exe_versions['rtmpdump'] = rtmpdump_version() exe_versions['phantomjs'] = PhantomJSwrapper._version() exe_str = ', '.join( @@ -3361,13 +3399,13 @@ class YoutubeDL(object): from .postprocessor.embedthumbnail import has_mutagen from .cookies import SQLITE_AVAILABLE, KEYRING_AVAILABLE - lib_str = ', '.join(sorted(filter(None, ( + lib_str = join_nonempty( compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0], - has_websockets and 'websockets', + KEYRING_AVAILABLE and 'keyring', has_mutagen and 'mutagen', SQLITE_AVAILABLE and 'sqlite', - KEYRING_AVAILABLE and 'keyring', - )))) or 'none' + has_websockets and 'websockets', + delim=', ') or 'none' write_debug('Optional libraries: %s' % lib_str) proxy_map = {} @@ -3452,8 +3490,10 @@ class YoutubeDL(object): encoding = preferredencoding() return encoding - def _write_info_json(self, label, ie_result, infofn): + def _write_info_json(self, label, ie_result, infofn, overwrite=None): ''' Write infojson and returns True = written, False = skip, None = error ''' + if overwrite is None: + overwrite = self.params.get('overwrites', True) if not self.params.get('writeinfojson'): return False elif not infofn: @@ -3461,7 +3501,7 @@ class YoutubeDL(object): return False elif not self._ensure_dir_exists(infofn): return None - elif not self.params.get('overwrites', True) and os.path.exists(infofn): + elif not overwrite and os.path.exists(infofn): self.to_screen(f'[info] {label.title()} metadata is already present') else: self.to_screen(f'[info] Writing {label} metadata as JSON to: {infofn}') @@ -3560,14 +3600,15 @@ class YoutubeDL(object): for t in thumbnails[::-1]: thumb_ext = (f'{t["id"]}.' 
if multiple else '') + determine_ext(t['url'], 'jpg') - thumb_display_id = f'{label} thumbnail' + (f' {t["id"]}' if multiple else '') + thumb_display_id = f'{label} thumbnail {t["id"]}' thumb_filename = replace_extension(filename, thumb_ext, info_dict.get('ext')) thumb_filename_final = replace_extension(thumb_filename_base, thumb_ext, info_dict.get('ext')) if not self.params.get('overwrites', True) and os.path.exists(thumb_filename): ret.append((thumb_filename, thumb_filename_final)) t['filepath'] = thumb_filename - self.to_screen(f'[info] {thumb_display_id.title()} is already present') + self.to_screen('[info] %s is already present' % ( + thumb_display_id if multiple else f'{label} thumbnail').capitalize()) else: self.to_screen(f'[info] Downloading {thumb_display_id} ...') try: diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 3a4b81efd..88f5bbae2 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -27,6 +27,8 @@ from .utils import ( error_to_compat_str, ExistingVideoReached, expand_path, + float_or_none, + int_or_none, match_filter_func, MaxDownloadsReached, parse_duration, @@ -222,11 +224,13 @@ def _real_main(argv=None): if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: raise ValueError('Playlist end must be greater than playlist start') if opts.extractaudio: + opts.audioformat = opts.audioformat.lower() if opts.audioformat not in ['best'] + list(FFmpegExtractAudioPP.SUPPORTED_EXTS): parser.error('invalid audio format specified') if opts.audioquality: opts.audioquality = opts.audioquality.strip('k').strip('K') - if not opts.audioquality.isdigit(): + audioquality = int_or_none(float_or_none(opts.audioquality)) # int_or_none prevents inf, nan + if audioquality is None or audioquality < 0: parser.error('invalid audio quality specified') if opts.recodevideo is not None: opts.recodevideo = opts.recodevideo.replace(' ', '') @@ -283,6 +287,11 @@ def _real_main(argv=None): set_default_compat('abort-on-error', 
'ignoreerrors', 'only_download') set_default_compat('no-playlist-metafiles', 'allow_playlist_files') set_default_compat('no-clean-infojson', 'clean_infojson') + if 'no-attach-info-json' in compat_opts: + if opts.embed_infojson: + _unused_compat_opt('no-attach-info-json') + else: + opts.embed_infojson = False if 'format-sort' in compat_opts: opts.format_sort.extend(InfoExtractor.FormatSort.ytdl_default) _video_multistreams_set = set_default_compat('multistreams', 'allow_multiple_video_streams', False, remove_compat=False) @@ -366,8 +375,6 @@ def _real_main(argv=None): opts.sponsorblock_remove = set() sponsorblock_query = opts.sponsorblock_mark | opts.sponsorblock_remove - if (opts.addmetadata or opts.sponsorblock_mark) and opts.addchapters is None: - opts.addchapters = True opts.remove_chapters = opts.remove_chapters or [] if (opts.remove_chapters or sponsorblock_query) and opts.sponskrub is not False: @@ -388,40 +395,32 @@ def _real_main(argv=None): opts.remuxvideo = False if opts.allow_unplayable_formats: - if opts.extractaudio: - report_conflict('--allow-unplayable-formats', '--extract-audio') - opts.extractaudio = False - if opts.remuxvideo: - report_conflict('--allow-unplayable-formats', '--remux-video') - opts.remuxvideo = False - if opts.recodevideo: - report_conflict('--allow-unplayable-formats', '--recode-video') - opts.recodevideo = False - if opts.addmetadata: - report_conflict('--allow-unplayable-formats', '--add-metadata') - opts.addmetadata = False - if opts.embedsubtitles: - report_conflict('--allow-unplayable-formats', '--embed-subs') - opts.embedsubtitles = False - if opts.embedthumbnail: - report_conflict('--allow-unplayable-formats', '--embed-thumbnail') - opts.embedthumbnail = False - if opts.xattrs: - report_conflict('--allow-unplayable-formats', '--xattrs') - opts.xattrs = False - if opts.fixup and opts.fixup.lower() not in ('never', 'ignore'): - report_conflict('--allow-unplayable-formats', '--fixup') + def report_unplayable_conflict(opt_name, 
arg, default=False, allowed=None): + val = getattr(opts, opt_name) + if (not allowed and val) or (allowed and not allowed(val)): + report_conflict('--allow-unplayable-formats', arg) + setattr(opts, opt_name, default) + + report_unplayable_conflict('extractaudio', '--extract-audio') + report_unplayable_conflict('remuxvideo', '--remux-video') + report_unplayable_conflict('recodevideo', '--recode-video') + report_unplayable_conflict('addmetadata', '--embed-metadata') + report_unplayable_conflict('addchapters', '--embed-chapters') + report_unplayable_conflict('embed_infojson', '--embed-info-json') + opts.embed_infojson = False + report_unplayable_conflict('embedsubtitles', '--embed-subs') + report_unplayable_conflict('embedthumbnail', '--embed-thumbnail') + report_unplayable_conflict('xattrs', '--xattrs') + report_unplayable_conflict('fixup', '--fixup', default='never', allowed=lambda x: x in (None, 'never', 'ignore')) opts.fixup = 'never' - if opts.remove_chapters: - report_conflict('--allow-unplayable-formats', '--remove-chapters') - opts.remove_chapters = [] - if opts.sponsorblock_remove: - report_conflict('--allow-unplayable-formats', '--sponsorblock-remove') - opts.sponsorblock_remove = set() - if opts.sponskrub: - report_conflict('--allow-unplayable-formats', '--sponskrub') + report_unplayable_conflict('remove_chapters', '--remove-chapters', default=[]) + report_unplayable_conflict('sponsorblock_remove', '--sponsorblock-remove', default=set()) + report_unplayable_conflict('sponskrub', '--sponskrub', default=set()) opts.sponskrub = False + if (opts.addmetadata or opts.sponsorblock_mark) and opts.addchapters is None: + opts.addchapters = True + # PostProcessors postprocessors = list(opts.add_postprocessors) if sponsorblock_query: @@ -519,11 +518,14 @@ def _real_main(argv=None): # By default ffmpeg preserves metadata applicable for both # source and target containers. From this point the container won't change, # so metadata can be added here. 
- if opts.addmetadata or opts.addchapters: + if opts.addmetadata or opts.addchapters or opts.embed_infojson: + if opts.embed_infojson is None: + opts.embed_infojson = 'if_exists' postprocessors.append({ 'key': 'FFmpegMetadata', 'add_chapters': opts.addchapters, 'add_metadata': opts.addmetadata, + 'add_infojson': opts.embed_infojson, }) # Note: Deprecated # This should be above EmbedThumbnail since sponskrub removes the thumbnail attachment @@ -777,15 +779,15 @@ def main(argv=None): _real_main(argv) except DownloadError: sys.exit(1) - except SameFileError: - sys.exit('ERROR: fixed output name but more than one file to download') + except SameFileError as e: + sys.exit(f'ERROR: {e}') except KeyboardInterrupt: sys.exit('\nERROR: Interrupted by user') - except BrokenPipeError: + except BrokenPipeError as e: # https://docs.python.org/3/library/signal.html#note-on-sigpipe devnull = os.open(os.devnull, os.O_WRONLY) os.dup2(devnull, sys.stdout.fileno()) - sys.exit(r'\nERROR: {err}') + sys.exit(f'\nERROR: {e}') __all__ = ['main', 'YoutubeDL', 'gen_extractors', 'list_extractors'] diff --git a/yt_dlp/aes.py b/yt_dlp/aes.py index 60cdeb74e..8503e3dfd 100644 --- a/yt_dlp/aes.py +++ b/yt_dlp/aes.py @@ -28,6 +28,48 @@ else: BLOCK_SIZE_BYTES = 16 +def aes_ecb_encrypt(data, key, iv=None): + """ + Encrypt with aes in ECB mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv Unused for this mode + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + encrypted_data += aes_encrypt(block, expanded_key) + encrypted_data = encrypted_data[:len(data)] + + return encrypted_data + + +def aes_ecb_decrypt(data, key, iv=None): + """ + Decrypt with aes in ECB mode + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + 
@param {int[]} iv Unused for this mode + @returns {int[]} decrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + encrypted_data += aes_decrypt(block, expanded_key) + encrypted_data = encrypted_data[:len(data)] + + return encrypted_data + + def aes_ctr_decrypt(data, key, iv): """ Decrypt with aes in counter mode diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index c9ae9b6db..ec68a809d 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -117,7 +117,7 @@ def _extract_firefox_cookies(profile, logger): raise FileNotFoundError('could not find firefox cookies database in {}'.format(search_root)) logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) - with tempfile.TemporaryDirectory(prefix='youtube_dl') as tmpdir: + with tempfile.TemporaryDirectory(prefix='yt_dlp') as tmpdir: cursor = None try: cursor = _open_database_copy(cookie_database_path, tmpdir) @@ -236,7 +236,7 @@ def _extract_chrome_cookies(browser_name, profile, logger): decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger) - with tempfile.TemporaryDirectory(prefix='youtube_dl') as tmpdir: + with tempfile.TemporaryDirectory(prefix='yt_dlp') as tmpdir: cursor = None try: cursor = _open_database_copy(cookie_database_path, tmpdir) diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index 6cfbb6657..4528f3be5 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -319,6 +319,10 @@ class FileDownloader(object): msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s' else: msg_template = '%(_percent_str)s % at %(_speed_str)s ETA %(_eta_str)s' + if s.get('fragment_index') and s.get('fragment_count'): + msg_template += ' (frag %(fragment_index)s/%(fragment_count)s)' + elif s.get('fragment_index'): + msg_template += ' (frag 
%(fragment_index)s)' s['_default_template'] = msg_template % s self._report_progress_status(s) diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index ce3370fb7..da69423f7 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -21,7 +21,6 @@ from ..utils import ( encodeArgument, handle_youtubedl_headers, check_executable, - is_outdated_version, Popen, sanitize_open, ) @@ -444,8 +443,7 @@ class FFmpegFD(ExternalFD): if info_dict.get('requested_formats') or protocol == 'http_dash_segments': for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]): stream_number = fmt.get('manifest_stream_number', 0) - a_or_v = 'a' if fmt.get('acodec') != 'none' else 'v' - args.extend(['-map', f'{i}:{a_or_v}:{stream_number}']) + args.extend(['-map', f'{i}:{stream_number}']) if self.params.get('test', False): args += ['-fs', compat_str(self._TEST_FILE_SIZE)] @@ -459,7 +457,7 @@ class FFmpegFD(ExternalFD): args += ['-f', 'mpegts'] else: args += ['-f', 'mp4'] - if (ffpp.basename == 'ffmpeg' and is_outdated_version(ffpp._versions['ffmpeg'], '3.2', False)) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): + if (ffpp.basename == 'ffmpeg' and ffpp._features.get('needs_adtstoasc')) and (not info_dict.get('acodec') or info_dict['acodec'].split('.')[0] in ('aac', 'mp4a')): args += ['-bsf:a', 'aac_adtstoasc'] elif protocol == 'rtmp': args += ['-f', 'flv'] diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index a9d1471f8..04b0f68c0 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -31,6 +31,10 @@ class HttpQuietDownloader(HttpFD): def to_screen(self, *args, **kargs): pass + def report_retry(self, err, count, retries): + super().to_screen( + f'[download] Got server HTTP error: {err}. 
Retrying (attempt {count} of {self.format_retries(retries)}) ...') + class FragmentFD(FileDownloader): """ @@ -44,6 +48,7 @@ class FragmentFD(FileDownloader): Skip unavailable fragments (DASH and hlsnative only) keep_fragments: Keep downloaded fragments on disk after downloading is finished + concurrent_fragment_downloads: The number of threads to use for native hls and dash downloads _no_ytdl_file: Don't use .ytdl file For each incomplete fragment download yt-dlp keeps on disk a special @@ -167,7 +172,7 @@ class FragmentFD(FileDownloader): self.ydl, { 'continuedl': True, - 'quiet': True, + 'quiet': self.params.get('quiet'), 'noprogress': True, 'ratelimit': self.params.get('ratelimit'), 'retries': self.params.get('retries', 0), @@ -237,6 +242,7 @@ class FragmentFD(FileDownloader): start = time.time() ctx.update({ 'started': start, + 'fragment_started': start, # Amount of fragment's bytes downloaded by the time of the previous # frag progress hook invocation 'prev_frag_downloaded_bytes': 0, @@ -267,6 +273,9 @@ class FragmentFD(FileDownloader): ctx['fragment_index'] = state['fragment_index'] state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes'] ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_total_bytes) + ctx['fragment_started'] = time.time() ctx['prev_frag_downloaded_bytes'] = 0 else: frag_downloaded_bytes = s['downloaded_bytes'] @@ -275,8 +284,8 @@ class FragmentFD(FileDownloader): state['eta'] = self.calc_eta( start, time_now, estimated_size - resume_len, state['downloaded_bytes'] - resume_len) - state['speed'] = s.get('speed') or ctx.get('speed') - ctx['speed'] = state['speed'] + ctx['speed'] = state['speed'] = self.calc_speed( + ctx['fragment_started'], time_now, frag_downloaded_bytes) ctx['prev_frag_downloaded_bytes'] = frag_downloaded_bytes self._hook_progress(state, info_dict) diff --git a/yt_dlp/downloader/hls.py 
b/yt_dlp/downloader/hls.py index ef8a81b01..f3f32b514 100644 --- a/yt_dlp/downloader/hls.py +++ b/yt_dlp/downloader/hls.py @@ -77,6 +77,15 @@ class HlsFD(FragmentFD): message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodome are available; ' 'Decryption will be performed natively, but will be extremely slow') if not can_download: + has_drm = re.search('|'.join([ + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + ]), s) + if has_drm and not self.params.get('allow_unplayable_formats'): + self.report_error( + 'This video is DRM protected; Try selecting another format with --format or ' + 'add --check-formats to automatically fallback to the next best format') + return False message = message or 'Unsupported features have been detected' fd = FFmpegFD(self.ydl, self.params) self.report_warning(f'{message}; extraction will be delegated to {fd.get_basename()}') diff --git a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py index b75db18a8..1477f65a6 100644 --- a/yt_dlp/downloader/mhtml.py +++ b/yt_dlp/downloader/mhtml.py @@ -114,8 +114,8 @@ body > figure > img { fragment_base_url = info_dict.get('fragment_base_url') fragments = info_dict['fragments'][:1] if self.params.get( 'test', False) else info_dict['fragments'] - title = info_dict['title'] - origin = info_dict['webpage_url'] + title = info_dict.get('title', info_dict['format_id']) + origin = info_dict.get('webpage_url', info_dict['url']) ctx = { 'filename': filename, diff --git a/yt_dlp/extractor/adobetv.py b/yt_dlp/extractor/adobetv.py index 12b819206..3cfa1ff55 100644 --- a/yt_dlp/extractor/adobetv.py +++ b/yt_dlp/extractor/adobetv.py @@ -9,6 +9,7 @@ from ..utils import ( float_or_none, int_or_none, ISO639Utils, + join_nonempty, OnDemandPagedList, parse_duration, str_or_none, @@ -263,7 +264,7 @@ class AdobeTVVideoIE(AdobeTVBaseIE): continue formats.append({ 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000), - 
'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])), + 'format_id': join_nonempty(source.get('format'), source.get('label')), 'height': int_or_none(source.get('height') or None), 'tbr': int_or_none(source.get('bitrate') or None), 'width': int_or_none(source.get('width') or None), diff --git a/yt_dlp/extractor/aljazeera.py b/yt_dlp/extractor/aljazeera.py index e829b45e4..7bcdb7afb 100644 --- a/yt_dlp/extractor/aljazeera.py +++ b/yt_dlp/extractor/aljazeera.py @@ -1,55 +1,86 @@ +# coding: utf-8 from __future__ import unicode_literals import json from .common import InfoExtractor +from ..utils import ( + try_get, +) class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?P<type>program/[^/]+|(?:feature|video)s)/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' + _VALID_URL = r'https?://(?P<base>\w+\.aljazeera\.\w+)/(?P<type>programs?/[^/]+|(?:feature|video|new)s)?/\d{4}/\d{1,2}/\d{1,2}/(?P<id>[^/?&#]+)' _TESTS = [{ - 'url': 'https://www.aljazeera.com/program/episode/2014/9/19/deliverance', + 'url': 'https://balkans.aljazeera.net/videos/2021/11/6/pojedini-domovi-u-sarajevu-jos-pod-vodom-mjestanima-se-dostavlja-hrana', 'info_dict': { - 'id': '3792260579001', + 'id': '6280641530001', 'ext': 'mp4', - 'title': 'The Slum - Episode 1: Deliverance', - 'description': 'As a birth attendant advocating for family planning, Remy is on the frontline of Tondo\'s battle with overcrowding.', - 'uploader_id': '665003303001', - 'timestamp': 1411116829, - 'upload_date': '20140919', - }, - 'add_ie': ['BrightcoveNew'], - 'skip': 'Not accessible from Travis CI server', - }, { - 'url': 'https://www.aljazeera.com/videos/2017/5/11/sierra-leone-709-carat-diamond-to-be-auctioned-off', - 'only_matching': True, + 'title': 'Pojedini domovi u Sarajevu još pod vodom, mještanima se dostavlja hrana', + 'timestamp': 1636219149, + 'description': 'U sarajevskim naseljima Rajlovac i Reljevo stambeni objekti, ali i industrijska postrojenja i dalje su pod 
vodom.', + 'upload_date': '20211106', + } }, { - 'url': 'https://www.aljazeera.com/features/2017/8/21/transforming-pakistans-buses-into-art', - 'only_matching': True, + 'url': 'https://balkans.aljazeera.net/videos/2021/11/6/djokovic-usao-u-finale-mastersa-u-parizu', + 'info_dict': { + 'id': '6280654936001', + 'ext': 'mp4', + 'title': 'Đoković ušao u finale Mastersa u Parizu', + 'timestamp': 1636221686, + 'description': 'Novak Đoković je u polufinalu Mastersa u Parizu nakon preokreta pobijedio Poljaka Huberta Hurkacza.', + 'upload_date': '20211106', + }, }] - BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s' + BRIGHTCOVE_URL_RE = r'https?://players.brightcove.net/(?P<account>\d+)/(?P<player_id>[a-zA-Z0-9]+)_(?P<embed>[^/]+)/index.html\?videoId=(?P<id>\d+)' def _real_extract(self, url): - post_type, name = self._match_valid_url(url).groups() + base, post_type, id = self._match_valid_url(url).groups() + wp = { + 'balkans.aljazeera.net': 'ajb', + 'chinese.aljazeera.net': 'chinese', + 'mubasher.aljazeera.net': 'ajm', + }.get(base) or 'aje' post_type = { 'features': 'post', 'program': 'episode', + 'programs': 'episode', 'videos': 'video', + 'news': 'news', }[post_type.split('/')[0]] video = self._download_json( - 'https://www.aljazeera.com/graphql', name, query={ + f'https://{base}/graphql', id, query={ + 'wp-site': wp, 'operationName': 'ArchipelagoSingleArticleQuery', 'variables': json.dumps({ - 'name': name, + 'name': id, 'postType': post_type, }), }, headers={ - 'wp-site': 'aje', - })['data']['article']['video'] - video_id = video['id'] - account_id = video.get('accountId') or '665003303001' - player_id = video.get('playerId') or 'BkeSH5BDb' - return self.url_result( - self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), - 'BrightcoveNew', video_id) + 'wp-site': wp, + }) + video = try_get(video, lambda x: x['data']['article']['video']) or {} + video_id = video.get('id') + account = video.get('accountId') or 
'911432371001' + player_id = video.get('playerId') or 'csvTfAlKW' + embed = 'default' + + if video_id is None: + webpage = self._download_webpage(url, id) + + account, player_id, embed, video_id = self._search_regex(self.BRIGHTCOVE_URL_RE, webpage, 'video id', + group=(1, 2, 3, 4), default=(None, None, None, None)) + + if video_id is None: + return { + '_type': 'url_transparent', + 'url': url, + 'ie_key': 'Generic' + } + + return { + '_type': 'url_transparent', + 'url': f'https://players.brightcove.net/{account}/{player_id}_{embed}/index.html?videoId={video_id}', + 'ie_key': 'BrightcoveNew' + } diff --git a/yt_dlp/extractor/amazon.py b/yt_dlp/extractor/amazon.py new file mode 100644 index 000000000..7c5d35f47 --- /dev/null +++ b/yt_dlp/extractor/amazon.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import int_or_none + + +class AmazonStoreIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?amazon\.(?:[a-z]{2,3})(?:\.[a-z]{2})?/(?:[^/]+/)?(?:dp|gp/product)/(?P<id>[^/&#$?]+)' + + _TESTS = [{ + 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', + 'info_dict': { + 'id': 'B098XNCHLD', + 'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed', + }, + 'playlist_mincount': 1, + 'playlist': [{ + 'info_dict': { + 'id': 'A1F83G8C2ARO7P', + 'ext': 'mp4', + 'title': 'mcdodo usb c cable 100W 5a', + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }] + }, { + 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', + 'info_dict': { + 'id': 'B0863TXGM3', + 'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff', + }, + 'playlist_mincount': 4, + }, { + 'url': 'https://www.amazon.com/dp/B0845NXCXF/', + 'info_dict': { + 'id': 'B0845NXCXF', + 'title': 'md5:2145cd4e3c7782f1ee73649a3cff1171', + }, + 'playlist-mincount': 1, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = 
self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) + entries = [{ + 'id': video['marketPlaceID'], + 'url': video['url'], + 'title': video.get('title'), + 'thumbnail': video.get('thumbUrl') or video.get('thumb'), + 'duration': video.get('durationSeconds'), + 'height': int_or_none(video.get('videoHeight')), + 'width': int_or_none(video.get('videoWidth')), + } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] + return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title']) diff --git a/yt_dlp/extractor/animeondemand.py b/yt_dlp/extractor/animeondemand.py index 54e097d2f..5694f7240 100644 --- a/yt_dlp/extractor/animeondemand.py +++ b/yt_dlp/extractor/animeondemand.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, extract_attributes, ExtractorError, + join_nonempty, url_or_none, urlencode_postdata, urljoin, @@ -140,15 +141,8 @@ class AnimeOnDemandIE(InfoExtractor): kind = self._search_regex( r'videomaterialurl/\d+/([^/]+)/', playlist_url, 'media kind', default=None) - format_id_list = [] - if lang: - format_id_list.append(lang) - if kind: - format_id_list.append(kind) - if not format_id_list and num is not None: - format_id_list.append(compat_str(num)) - format_id = '-'.join(format_id_list) - format_note = ', '.join(filter(None, (kind, lang_note))) + format_id = join_nonempty(lang, kind) if lang or kind else str(num) + format_note = join_nonempty(kind, lang_note, delim=', ') item_id_list = [] if format_id: item_id_list.append(format_id) @@ -195,12 +189,10 @@ class AnimeOnDemandIE(InfoExtractor): if not file_: continue ext = determine_ext(file_) - format_id_list = [lang, kind] - if ext == 'm3u8': - format_id_list.append('hls') - elif source.get('type') == 'video/dash' or ext == 'mpd': - format_id_list.append('dash') - format_id = '-'.join(filter(None, format_id_list)) + format_id = join_nonempty( + lang, kind, + 'hls' if ext == 'm3u8' else 
None, + 'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None) if ext == 'm3u8': file_formats = self._extract_m3u8_formats( file_, video_id, 'mp4', diff --git a/yt_dlp/extractor/anvato.py b/yt_dlp/extractor/anvato.py index d688e2c5b..0d444fc33 100644 --- a/yt_dlp/extractor/anvato.py +++ b/yt_dlp/extractor/anvato.py @@ -16,6 +16,7 @@ from ..utils import ( determine_ext, intlist_to_bytes, int_or_none, + join_nonempty, strip_jsonp, unescapeHTML, unsmuggle_url, @@ -303,13 +304,13 @@ class AnvatoIE(InfoExtractor): tbr = int_or_none(published_url.get('kbps')) a_format = { 'url': video_url, - 'format_id': ('-'.join(filter(None, ['http', published_url.get('cdn_name')]))).lower(), - 'tbr': tbr if tbr != 0 else None, + 'format_id': join_nonempty('http', published_url.get('cdn_name')).lower(), + 'tbr': tbr or None, } if media_format == 'm3u8' and tbr is not None: a_format.update({ - 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), + 'format_id': join_nonempty('hls', tbr), 'ext': 'mp4', }) elif media_format == 'm3u8-variant' or ext == 'm3u8': diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 048d30f27..f8d57109e 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -388,7 +388,13 @@ class ARDIE(InfoExtractor): class ARDBetaMediathekIE(ARDMediathekBaseIE): - _VALID_URL = r'https://(?:(?:beta|www)\.)?ardmediathek\.de/(?P<client>[^/]+)/(?P<mode>player|live|video|sendung|sammlung)/(?P<display_id>(?:[^/]+/)*)(?P<video_id>[a-zA-Z0-9]+)' + _VALID_URL = r'''(?x)https:// + (?:(?:beta|www)\.)?ardmediathek\.de/ + (?:(?P<client>[^/]+)/)? + (?:player|live|video|(?P<playlist>sendung|sammlung))/ + (?:(?P<display_id>[^?#]+)/)? 
+ (?P<id>(?(playlist)|Y3JpZDovL)[a-zA-Z0-9]+)''' + _TESTS = [{ 'url': 'https://www.ardmediathek.de/mdr/video/die-robuste-roswita/Y3JpZDovL21kci5kZS9iZWl0cmFnL2Ntcy84MWMxN2MzZC0wMjkxLTRmMzUtODk4ZS0wYzhlOWQxODE2NGI/', 'md5': 'a1dc75a39c61601b980648f7c9f9f71d', @@ -403,6 +409,18 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): 'upload_date': '20200805', 'ext': 'mp4', }, + 'skip': 'Error', + }, { + 'url': 'https://www.ardmediathek.de/video/tagesschau-oder-tagesschau-20-00-uhr/das-erste/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhZ2Vzc2NoYXUvZmM4ZDUxMjgtOTE0ZC00Y2MzLTgzNzAtNDZkNGNiZWJkOTll', + 'md5': 'f1837e563323b8a642a8ddeff0131f51', + 'info_dict': { + 'id': '10049223', + 'ext': 'mp4', + 'title': 'tagesschau, 20:00 Uhr', + 'timestamp': 1636398000, + 'description': 'md5:39578c7b96c9fe50afdf5674ad985e6b', + 'upload_date': '20211108', + }, }, { 'url': 'https://beta.ardmediathek.de/ard/video/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', 'only_matching': True, @@ -426,6 +444,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): # playlist of type 'sammlung' 'url': 'https://www.ardmediathek.de/ard/sammlung/team-muenster/5JpTzLSbWUAK8184IOvEir/', 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/video/coronavirus-update-ndr-info/astrazeneca-kurz-lockdown-und-pims-syndrom-81/ndr/Y3JpZDovL25kci5kZS84NzE0M2FjNi0wMWEwLTQ5ODEtOTE5NS1mOGZhNzdhOTFmOTI/', + 'only_matching': True, + }, { + 'url': 'https://www.ardmediathek.de/ard/player/Y3JpZDovL3dkci5kZS9CZWl0cmFnLWQ2NDJjYWEzLTMwZWYtNGI4NS1iMTI2LTU1N2UxYTcxOGIzOQ/tatort-duo-koeln-leipzig-ihr-kinderlein-kommet', + 'only_matching': True, }] def _ARD_load_playlist_snipped(self, playlist_id, display_id, client, mode, pageNumber): @@ -525,20 +549,12 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): return self.playlist_result(entries, playlist_title=display_id) def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('video_id') - display_id = mobj.group('display_id') 
- if display_id: - display_id = display_id.rstrip('/') - if not display_id: - display_id = video_id - - if mobj.group('mode') in ('sendung', 'sammlung'): - # this is a playlist-URL - return self._ARD_extract_playlist( - url, video_id, display_id, - mobj.group('client'), - mobj.group('mode')) + video_id, display_id, playlist_type, client = self._match_valid_url(url).group( + 'id', 'display_id', 'playlist', 'client') + display_id, client = display_id or video_id, client or 'ard' + + if playlist_type: + return self._ARD_extract_playlist(url, video_id, display_id, client, playlist_type) player_page = self._download_json( 'https://api.ardmediathek.de/public-gateway', @@ -574,7 +590,7 @@ class ARDBetaMediathekIE(ARDMediathekBaseIE): } } } -}''' % (mobj.group('client'), video_id), +}''' % (client, video_id), }).encode(), headers={ 'Content-Type': 'application/json' })['data']['playerPage'] diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index 8143eb4d7..6d843966a 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -24,9 +24,6 @@ class AtresPlayerIE(InfoExtractor): 'description': 'md5:7634cdcb4d50d5381bedf93efb537fbc', 'duration': 3413, }, - 'params': { - 'format': 'bestvideo', - }, 'skip': 'This video is only available for registered users' }, { diff --git a/yt_dlp/extractor/bandaichannel.py b/yt_dlp/extractor/bandaichannel.py index d67285913..f1bcdef7a 100644 --- a/yt_dlp/extractor/bandaichannel.py +++ b/yt_dlp/extractor/bandaichannel.py @@ -21,7 +21,6 @@ class BandaiChannelIE(BrightcoveNewIE): 'duration': 1387.733, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }] diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 4e2dcd76b..672ed1ffe 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -451,9 +451,10 @@ class BBCCoUkIE(InfoExtractor): playlist = self._download_json( 'http://www.bbc.co.uk/programmes/%s/playlist.json' % playlist_id, playlist_id, 
'Downloading playlist JSON') + formats = [] + subtitles = {} - version = playlist.get('defaultAvailableVersion') - if version: + for version in playlist.get('allAvailableVersions', []): smp_config = version['smpConfig'] title = smp_config['title'] description = smp_config['summary'] @@ -463,8 +464,18 @@ class BBCCoUkIE(InfoExtractor): continue programme_id = item.get('vpid') duration = int_or_none(item.get('duration')) - formats, subtitles = self._download_media_selector(programme_id) - return programme_id, title, description, duration, formats, subtitles + version_formats, version_subtitles = self._download_media_selector(programme_id) + types = version['types'] + for f in version_formats: + f['format_note'] = ', '.join(types) + if any('AudioDescribed' in x for x in types): + f['language_preference'] = -10 + formats += version_formats + for tag, subformats in (version_subtitles or {}).items(): + subtitles.setdefault(tag, []) + subtitles[tag] += subformats + + return programme_id, title, description, duration, formats, subtitles except ExtractorError as ee: if not (isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404): raise diff --git a/yt_dlp/extractor/blogger.py b/yt_dlp/extractor/blogger.py new file mode 100644 index 000000000..dba131cb0 --- /dev/null +++ b/yt_dlp/extractor/blogger.py @@ -0,0 +1,54 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from ..utils import ( + mimetype2ext, + parse_duration, + parse_qs, + str_or_none, + traverse_obj, +) +from .common import InfoExtractor + + +class BloggerIE(InfoExtractor): + IE_NAME = 'blogger.com' + _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)' + _VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''' + _TESTS = [{ + 'url': 
'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw', + 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', + 'info_dict': { + 'id': 'BLOGGER-video-3c740e3a49197e16-796', + 'title': 'BLOGGER-video-3c740e3a49197e16-796', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*', + 'duration': 76.068, + } + }] + + @staticmethod + def _extract_urls(webpage): + return re.findall(BloggerIE._VALID_EMBED, webpage) + + def _real_extract(self, url): + token_id = self._match_id(url) + webpage = self._download_webpage(url, token_id) + data_json = self._search_regex(r'var\s+VIDEO_CONFIG\s*=\s*(\{.*)', webpage, 'JSON data') + data = self._parse_json(data_json.encode('utf-8').decode('unicode_escape'), token_id) + streams = data['streams'] + formats = [{ + 'ext': mimetype2ext(traverse_obj(parse_qs(stream['play_url']), ('mime', 0))), + 'url': stream['play_url'], + 'format_id': str_or_none(stream.get('format_id')), + } for stream in streams] + + return { + 'id': data.get('iframe_id', token_id), + 'title': data.get('iframe_id', token_id), + 'formats': formats, + 'thumbnail': data.get('thumbnail'), + 'duration': parse_duration(traverse_obj(parse_qs(streams[0]['play_url']), ('dur', 0))), + } diff --git a/yt_dlp/extractor/breitbart.py b/yt_dlp/extractor/breitbart.py new file mode 100644 index 000000000..f50f719dc --- /dev/null +++ b/yt_dlp/extractor/breitbart.py @@ -0,0 +1,39 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class BreitBartIE(InfoExtractor): + _VALID_URL = r'https?:\/\/(?:www\.)breitbart.com/videos/v/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.breitbart.com/videos/v/5cOz1yup/?pl=Ij6NDOji', + 'md5': '0aa6d1d6e183ac5ca09207fe49f17ade', + 'info_dict': { + 'id': '5cOz1yup', + 'ext': 'mp4', + 'title': 'Watch \u2013 Clyburn: Statues in Congress Have to Go Because they Are Honoring Slavery', + 'description': 
'md5:bac35eb0256d1cb17f517f54c79404d5', + 'thumbnail': 'https://cdn.jwplayer.com/thumbs/5cOz1yup-1920.jpg', + 'age_limit': 0, + } + }, { + 'url': 'https://www.breitbart.com/videos/v/eaiZjVOn/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + formats = self._extract_m3u8_formats(f'https://cdn.jwplayer.com/manifests/{video_id}.m3u8', video_id, ext='mp4') + self._sort_formats(formats) + return { + 'id': video_id, + 'title': self._og_search_title( + webpage, default=None) or self._html_search_regex( + r'(?s)<title>(.*?)</title>', webpage, 'video title'), + 'description': self._og_search_description(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'age_limit': self._rta_search(webpage), + 'formats': formats + } diff --git a/yt_dlp/extractor/canalalpha.py b/yt_dlp/extractor/canalalpha.py new file mode 100644 index 000000000..7287677c1 --- /dev/null +++ b/yt_dlp/extractor/canalalpha.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + dict_get, + try_get, + unified_strdate, +) + + +class CanalAlphaIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?canalalpha\.ch/play/[^/]+/[^/]+/(?P<id>\d+)/?.*' + + _TESTS = [{ + 'url': 'https://www.canalalpha.ch/play/le-journal/episode/24520/jeudi-28-octobre-2021', + 'info_dict': { + 'id': '24520', + 'ext': 'mp4', + 'title': 'Jeudi 28 octobre 2021', + 'description': 'md5:d30c6c3e53f8ad40d405379601973b30', + 'thumbnail': 'https://static.canalalpha.ch/poster/journal/journal_20211028.jpg', + 'upload_date': '20211028', + 'duration': 1125, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/le-journal/topic/24512/la-poste-fait-de-neuchatel-un-pole-cryptographique', + 'info_dict': { + 'id': '24512', + 'ext': 'mp4', + 'title': 'La Poste fait de Neuchâtel un pôle cryptographique', + 
'description': 'md5:4ba63ae78a0974d1a53d6703b6e1dedf', + 'thumbnail': 'https://static.canalalpha.ch/poster/news/news_39712.jpg', + 'upload_date': '20211028', + 'duration': 138, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/eureka/episode/24484/ces-innovations-qui-veulent-rendre-lagriculture-plus-durable', + 'info_dict': { + 'id': '24484', + 'ext': 'mp4', + 'title': 'Ces innovations qui veulent rendre l’agriculture plus durable', + 'description': 'md5:3de3f151180684621e85be7c10e4e613', + 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_10236.jpg', + 'upload_date': '20211026', + 'duration': 360, + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://www.canalalpha.ch/play/avec-le-temps/episode/23516/redonner-de-leclat-grace-au-polissage', + 'info_dict': { + 'id': '23516', + 'ext': 'mp4', + 'title': 'Redonner de l\'éclat grâce au polissage', + 'description': 'md5:0d8fbcda1a5a4d6f6daa3165402177e1', + 'thumbnail': 'https://static.canalalpha.ch/poster/magazine/magazine_9990.png', + 'upload_date': '20210726', + 'duration': 360, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._parse_json(self._search_regex( + r'window\.__SERVER_STATE__\s?=\s?({(?:(?!};)[^"]|"([^"]|\\")*")+})\s?;', + webpage, 'data_json'), id)['1']['data']['data'] + manifests = try_get(data_json, lambda x: x['video']['manifests'], expected_type=dict) or {} + subtitles = {} + formats = [{ + 'url': video['$url'], + 'ext': 'mp4', + 'width': try_get(video, lambda x: x['res']['width'], expected_type=int), + 'height': try_get(video, lambda x: x['res']['height'], expected_type=int), + } for video in try_get(data_json, lambda x: x['video']['mp4'], expected_type=list) or [] if video.get('$url')] + if manifests.get('hls'): + m3u8_frmts, m3u8_subs = self._parse_m3u8_formats_and_subtitles(manifests['hls'], id) + 
formats.extend(m3u8_frmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + if manifests.get('dash'): + dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash'], id) + formats.extend(dash_frmts) + subtitles = self._merge_subtitles(subtitles, dash_subs) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title').strip(), + 'description': clean_html(dict_get(data_json, ('longDesc', 'shortDesc'))), + 'thumbnail': data_json.get('poster'), + 'upload_date': unified_strdate(dict_get(data_json, ('webPublishAt', 'featuredAt', 'diffusionDate'))), + 'duration': try_get(data_json, lambda x: x['video']['duration'], expected_type=int), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py index 49e7e4e39..e97c91929 100644 --- a/yt_dlp/extractor/canvas.py +++ b/yt_dlp/extractor/canvas.py @@ -1,4 +1,5 @@ from __future__ import unicode_literals +import json from .common import InfoExtractor @@ -41,9 +42,9 @@ class CanvasIE(InfoExtractor): _GEO_BYPASS = False _HLS_ENTRY_PROTOCOLS_MAP = { 'HLS': 'm3u8_native', - 'HLS_AES': 'm3u8', + 'HLS_AES': 'm3u8_native', } - _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v1' + _REST_API_BASE = 'https://media-services-public.vrt.be/vualto-video-aggregator-web/rest/external/v2' def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -59,16 +60,21 @@ class CanvasIE(InfoExtractor): # New API endpoint if not data: + vrtnutoken = self._download_json('https://token.vrt.be/refreshtoken', + video_id, note='refreshtoken: Retrieve vrtnutoken', + errnote='refreshtoken failed')['vrtnutoken'] headers = self.geo_verification_headers() - headers.update({'Content-Type': 'application/json'}) - token = self._download_json( + headers.update({'Content-Type': 'application/json; charset=utf-8'}) + vrtPlayerToken = self._download_json( '%s/tokens' % self._REST_API_BASE, video_id, - 
'Downloading token', data=b'', headers=headers)['vrtPlayerToken'] + 'Downloading token', headers=headers, data=json.dumps({ + 'identityToken': vrtnutoken + }).encode('utf-8'))['vrtPlayerToken'] data = self._download_json( '%s/videos/%s' % (self._REST_API_BASE, video_id), video_id, 'Downloading video JSON', query={ - 'vrtPlayerToken': token, - 'client': '%s@PROD' % site_id, + 'vrtPlayerToken': vrtPlayerToken, + 'client': 'null', }, expected_status=400) if not data.get('title'): code = data.get('code') @@ -264,7 +270,7 @@ class VrtNUIE(GigyaBaseIE): 'expected_warnings': ['Unable to download asset JSON', 'is not a supported codec', 'Unknown MIME type'], }] _NETRC_MACHINE = 'vrtnu' - _APIKEY = '3_qhEcPa5JGFROVwu5SWKqJ4mVOIkwlFNMSKwzPDAh8QZOtHqu6L4nD5Q7lk0eXOOG' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' _CONTEXT_ID = 'R3595707040' def _real_initialize(self): @@ -275,16 +281,13 @@ class VrtNUIE(GigyaBaseIE): if username is None: return - auth_info = self._download_json( - 'https://accounts.vrt.be/accounts.login', None, - note='Login data', errnote='Could not get Login data', - headers={}, data=urlencode_postdata({ - 'loginID': username, - 'password': password, - 'sessionExpiration': '-2', - 'APIKey': self._APIKEY, - 'targetEnv': 'jssdk', - })) + auth_info = self._gigya_login({ + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + }) if auth_info.get('errorDetails'): raise ExtractorError('Unable to login: VrtNU said: ' + auth_info.get('errorDetails'), expected=True) @@ -301,14 +304,15 @@ class VrtNUIE(GigyaBaseIE): 'UID': auth_info['UID'], 'UIDSignature': auth_info['UIDSignature'], 'signatureTimestamp': auth_info['signatureTimestamp'], - 'client_id': 'vrtnu-site', '_csrf': self._get_cookies('https://login.vrt.be').get('OIDCXSRF').value, } self._request_webpage( 'https://login.vrt.be/perform_login', - None, note='Requesting a token', errnote='Could not get a token', 
- headers={}, data=urlencode_postdata(post_data)) + None, note='Performing login', errnote='perform login failed', + headers={}, query={ + 'client_id': 'vrtnu-site' + }, data=urlencode_postdata(post_data)) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index 4fcf2a9c1..413053499 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -390,7 +390,8 @@ class CBCGemPlaylistIE(InfoExtractor): show = match.group('show') show_info = self._download_json(self._API_BASE + show, season_id) season = int(match.group('season')) - season_info = try_get(show_info, lambda x: x['seasons'][season - 1]) + + season_info = next((s for s in show_info['seasons'] if s.get('season') == season), None) if season_info is None: raise ExtractorError(f'Couldn\'t find season {season} of {show}') diff --git a/yt_dlp/extractor/ceskatelevize.py b/yt_dlp/extractor/ceskatelevize.py index 5e04d38a2..f766dfbb7 100644 --- a/yt_dlp/extractor/ceskatelevize.py +++ b/yt_dlp/extractor/ceskatelevize.py @@ -20,22 +20,8 @@ from ..utils import ( class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' _TESTS = [{ - 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', - 'info_dict': { - 'id': '61924494877246241', - 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Život v Grónsku', - 'description': 'md5:3fec8f6bb497be5cdb0c9e8781076626', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 3350, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { 'id': '61924494877028507', @@ -66,12 +52,58 @@ class 
CeskaTelevizeIE(InfoExtractor): }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', 'only_matching': True, + }, { + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti. Připravil Peter Serge Butko', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494877311053', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 11.9, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + # iframe embed + 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', + 'only_matching': True, }] def _real_extract(self, url): playlist_id = self._match_id(url) - + parsed_url = compat_urllib_parse_urlparse(url) webpage = self._download_webpage(url, playlist_id) + site_name = self._og_search_property('site_name', webpage, fatal=False, default=None) + playlist_title = self._og_search_title(webpage, default=None) + if site_name and playlist_title: + playlist_title = playlist_title.replace(f' — {site_name}', '', 1) + playlist_description = self._og_search_description(webpage, default=None) + if playlist_description: + playlist_description = playlist_description.replace('\xa0', ' ') + + if parsed_url.path.startswith('/porady/'): + refer_url = update_url_query(unescapeHTML(self._search_regex( + (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + 
r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), + webpage, 'iframe player url', group='url')), query={'autoStart': 'true'}) + webpage = self._download_webpage(refer_url, playlist_id) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' if '%s</p>' % NOT_AVAILABLE_STRING in webpage: @@ -100,7 +132,7 @@ class CeskaTelevizeIE(InfoExtractor): data = { 'playlist[0][type]': type_, 'playlist[0][id]': episode_id, - 'requestUrl': compat_urllib_parse_urlparse(url).path, + 'requestUrl': parsed_url.path, 'requestSource': 'iVysilani', } @@ -108,7 +140,7 @@ class CeskaTelevizeIE(InfoExtractor): for user_agent in (None, USER_AGENTS['Safari']): req = sanitized_Request( - 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist', + 'https://www.ceskatelevize.cz/ivysilani/ajax/get-client-playlist/', data=urlencode_postdata(data)) req.add_header('Content-type', 'application/x-www-form-urlencoded') @@ -130,9 +162,6 @@ class CeskaTelevizeIE(InfoExtractor): req = sanitized_Request(compat_urllib_parse_unquote(playlist_url)) req.add_header('Referer', url) - playlist_title = self._og_search_title(webpage, default=None) - playlist_description = self._og_search_description(webpage, default=None) - playlist = self._download_json(req, playlist_id, fatal=False) if not playlist: continue @@ -237,54 +266,3 @@ class CeskaTelevizeIE(InfoExtractor): yield line return '\r\n'.join(_fix_subtitle(subtitles)) - - -class CeskaTelevizePoradyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' - _TESTS = [{ - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', - 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', - }, - 'playlist': [{ - 'info_dict': { - 'id': 
'61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - # iframe embed - 'url': 'http://www.ceskatelevize.cz/porady/10614999031-neviditelni/21251212048/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - data_url = update_url_query(unescapeHTML(self._search_regex( - (r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?ceskatelevize\.cz/ivysilani/embed/iFramePlayer\.php.*?)\1'), - webpage, 'iframe player url', group='url')), query={ - 'autoStart': 'true', - }) - - return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index aa98c0cc9..fc28bca2e 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import base64 +import collections import datetime import hashlib import itertools @@ -54,6 +55,7 @@ from ..utils import ( GeoRestrictedError, GeoUtils, int_or_none, + join_nonempty, js_to_json, JSON_LD_RE, mimetype2ext, @@ -341,6 +343,7 @@ class InfoExtractor(object): series, programme or podcast: series: Title of the series or programme the video episode belongs to. + series_id: Id of the series or programme the video episode belongs to, as a unicode string. season: Title of the season the video episode belongs to. season_number: Number of the season the video episode belongs to, as an integer. season_id: Id of the season the video episode belongs to, as a unicode string. 
@@ -441,11 +444,11 @@ class InfoExtractor(object): _WORKING = True _LOGIN_HINTS = { - 'any': 'Use --cookies, --username and --password or --netrc to provide account credentials', + 'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials', 'cookies': ( 'Use --cookies-from-browser or --cookies for the authentication. ' 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'), - 'password': 'Use --username and --password or --netrc to provide account credentials', + 'password': 'Use --username and --password, or --netrc to provide account credentials', } def __init__(self, downloader=None): @@ -1449,6 +1452,9 @@ class InfoExtractor(object): item_type = e.get('@type') if expected_type is not None and expected_type != item_type: continue + rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) + if rating is not None: + info['average_rating'] = rating if item_type in ('TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ @@ -1495,6 +1501,13 @@ class InfoExtractor(object): break return dict((k, v) for k, v in info.items() if v is not None) + def _search_nextjs_data(self, webpage, video_id, **kw): + return self._parse_json( + self._search_regex( + r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', + webpage, 'next.js data', **kw), + video_id, **kw) + @staticmethod def _hidden_inputs(html): html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) @@ -1531,7 +1544,7 @@ class InfoExtractor(object): 'vcodec': {'type': 'ordered', 'regex': True, 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, 'acodec': {'type': 'ordered', 'regex': True, - 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e?a?c-?3', 'dts', '', None, 'none']}, + 'order': ['opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, 'hdr': 
{'type': 'ordered', 'regex': True, 'field': 'dynamic_range', 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', @@ -1911,7 +1924,7 @@ class InfoExtractor(object): tbr = int_or_none(media_el.attrib.get('bitrate')) width = int_or_none(media_el.attrib.get('width')) height = int_or_none(media_el.attrib.get('height')) - format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) + format_id = join_nonempty(f4m_id, tbr or i) # If <bootstrapInfo> is present, the specified f4m is a # stream-level manifest, and only set-level manifests may refer to # external resources. See section 11.4 and section 4 of F4M spec @@ -1973,7 +1986,7 @@ class InfoExtractor(object): def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, quality=None, m3u8_id=None): return { - 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), + 'format_id': join_nonempty(m3u8_id, 'meta'), 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', @@ -2026,10 +2039,10 @@ class InfoExtractor(object): video_id=None): formats, subtitles = [], {} - if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access - return formats, subtitles - - has_drm = re.search(r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', m3u8_doc) + has_drm = re.search('|'.join([ + r'#EXT-X-FAXS-CM:', # Adobe Flash Access + r'#EXT-X-(?:SESSION-)?KEY:.*?URI="skd://', # Apple FairPlay + ]), m3u8_doc) def format_url(url): return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) @@ -2068,7 +2081,7 @@ class InfoExtractor(object): if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is formats = [{ - 'format_id': '-'.join(map(str, filter(None, [m3u8_id, idx]))), + 'format_id': join_nonempty(m3u8_id, idx), 'format_index': idx, 'url': m3u8_url, 'ext': ext, @@ -2117,7 +2130,7 @@ class InfoExtractor(object): if media_url: manifest_url = format_url(media_url) formats.extend({ - 'format_id': 
'-'.join(map(str, filter(None, (m3u8_id, group_id, name, idx)))), + 'format_id': join_nonempty(m3u8_id, group_id, name, idx), 'format_note': name, 'format_index': idx, 'url': manifest_url, @@ -2174,9 +2187,9 @@ class InfoExtractor(object): # format_id intact. if not live: stream_name = build_stream_name() - format_id[1] = stream_name if stream_name else '%d' % (tbr if tbr else len(formats)) + format_id[1] = stream_name or '%d' % (tbr or len(formats)) f = { - 'format_id': '-'.join(map(str, filter(None, format_id))), + 'format_id': join_nonempty(*format_id), 'format_index': idx, 'url': manifest_url, 'manifest_url': m3u8_url, @@ -2640,7 +2653,7 @@ class InfoExtractor(object): mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) formats, subtitles = [], {} - stream_numbers = {'audio': 0, 'video': 0} + stream_numbers = collections.defaultdict(int) for period in mpd_doc.findall(_add_ns('Period')): period_duration = parse_duration(period.get('duration')) or mpd_duration period_ms_info = extract_multisegment_info(period, { @@ -2706,10 +2719,8 @@ class InfoExtractor(object): 'format_note': 'DASH %s' % content_type, 'filesize': filesize, 'container': mimetype2ext(mime_type) + '_dash', - 'manifest_stream_number': stream_numbers[content_type] } f.update(parse_codecs(codecs)) - stream_numbers[content_type] += 1 elif content_type == 'text': f = { 'ext': mimetype2ext(mime_type), @@ -2876,7 +2887,9 @@ class InfoExtractor(object): else: # Assuming direct URL to unfragmented media. 
f['url'] = base_url - if content_type in ('video', 'audio') or mime_type == 'image/jpeg': + if content_type in ('video', 'audio', 'image/jpeg'): + f['manifest_stream_number'] = stream_numbers[f['url']] + stream_numbers[f['url']] += 1 formats.append(f) elif content_type == 'text': subtitles.setdefault(lang or 'und', []).append(f) @@ -2965,13 +2978,6 @@ class InfoExtractor(object): }) fragment_ctx['time'] += fragment_ctx['duration'] - format_id = [] - if ism_id: - format_id.append(ism_id) - if stream_name: - format_id.append(stream_name) - format_id.append(compat_str(tbr)) - if stream_type == 'text': subtitles.setdefault(stream_language, []).append({ 'ext': 'ismt', @@ -2990,7 +2996,7 @@ class InfoExtractor(object): }) elif stream_type in ('video', 'audio'): formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(ism_id, stream_name, tbr), 'url': ism_url, 'manifest_url': ism_url, 'ext': 'ismv' if stream_type == 'video' else 'isma', diff --git a/yt_dlp/extractor/corus.py b/yt_dlp/extractor/corus.py index 352951e20..119461375 100644 --- a/yt_dlp/extractor/corus.py +++ b/yt_dlp/extractor/corus.py @@ -55,7 +55,6 @@ class CorusIE(ThePlatformFeedIE): 'timestamp': 1486392197, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to parse JSON'], diff --git a/yt_dlp/extractor/cozytv.py b/yt_dlp/extractor/cozytv.py new file mode 100644 index 000000000..868d8d27d --- /dev/null +++ b/yt_dlp/extractor/cozytv.py @@ -0,0 +1,40 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unified_strdate + + +class CozyTVIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?cozy\.tv/(?P<uploader>[^/]+)/replays/(?P<id>[^/$#&?]+)' + + _TESTS = [{ + 'url': 'https://cozy.tv/beardson/replays/2021-11-19_1', + 'info_dict': { + 'id': 'beardson-2021-11-19_1', + 'ext': 'mp4', + 'title': 'pokemon pt2', + 'uploader': 'beardson', + 'upload_date': '20211119', + 
'was_live': True, + 'duration': 7981, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + uploader, date = self._match_valid_url(url).groups() + id = f'{uploader}-{date}' + data_json = self._download_json(f'https://api.cozy.tv/cache/{uploader}/replay/{date}', id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://cozycdn.foxtrotstream.xyz/replays/{uploader}/{date}/index.m3u8', id, ext='mp4') + return { + 'id': id, + 'title': data_json.get('title'), + 'uploader': data_json.get('user') or uploader, + 'upload_date': unified_strdate(data_json.get('date')), + 'was_live': True, + 'duration': data_json.get('duration'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index 511ac1b2c..cd35728e5 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -27,6 +27,7 @@ from ..utils import ( int_or_none, lowercase_escape, merge_dicts, + qualities, remove_end, sanitized_Request, try_get, @@ -478,19 +479,24 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], webpage, 'video_uploader', default=False) + requested_languages = self._configuration_arg('language') + requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')] + language_preference = qualities((requested_languages or [language or ''])[::-1]) + hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1]) + formats = [] for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') - hardsub_lang = stream.get('hardsub_lang') + audio_lang = stream.get('audio_lang') or '' + hardsub_lang = stream.get('hardsub_lang') or '' + if (requested_languages and audio_lang.lower() not in requested_languages + or requested_hardsubs and 
hardsub_lang.lower() not in requested_hardsubs): + continue vrv_formats = self._extract_vrv_formats( stream.get('url'), video_id, stream.get('format'), audio_lang, hardsub_lang) for f in vrv_formats: - f['language_preference'] = 1 if audio_lang == language else 0 - f['quality'] = ( - 1 if not hardsub_lang - else 0 if hardsub_lang == language - else -1) + f['language_preference'] = language_preference(audio_lang) + f['quality'] = hardsub_preference(hardsub_lang) formats.extend(vrv_formats) if not formats: available_fmts = [] diff --git a/yt_dlp/extractor/cspan.py b/yt_dlp/extractor/cspan.py index 2e01aff48..c717aec3a 100644 --- a/yt_dlp/extractor/cspan.py +++ b/yt_dlp/extractor/cspan.py @@ -18,7 +18,7 @@ from ..utils import ( str_to_int, unescapeHTML, ) -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE from .ustream import UstreamIE diff --git a/yt_dlp/extractor/curiositystream.py b/yt_dlp/extractor/curiositystream.py index 034a5c92a..485b6031f 100644 --- a/yt_dlp/extractor/curiositystream.py +++ b/yt_dlp/extractor/curiositystream.py @@ -15,7 +15,6 @@ from ..utils import ( class CuriosityStreamBaseIE(InfoExtractor): _NETRC_MACHINE = 'curiositystream' _auth_token = None - _API_BASE_URL = 'https://api.curiositystream.com/v1/' def _handle_errors(self, result): error = result.get('error', {}).get('message') @@ -39,38 +38,44 @@ class CuriosityStreamBaseIE(InfoExtractor): if email is None: return result = self._download_json( - self._API_BASE_URL + 'login', None, data=urlencode_postdata({ + 'https://api.curiositystream.com/v1/login', None, + note='Logging in', data=urlencode_postdata({ 'email': email, 'password': password, })) self._handle_errors(result) - self._auth_token = result['message']['auth_token'] + CuriosityStreamBaseIE._auth_token = result['message']['auth_token'] class CuriosityStreamIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream' _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' - _TEST = { + _TESTS = [{ 
'url': 'https://app.curiositystream.com/video/2', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', + 'channel': 'Curiosity Stream', + 'categories': ['Technology', 'Interview'], + 'average_rating': 96.79, + 'series_id': '2', }, 'params': { - 'format': 'bestvideo', # m3u8 download 'skip_download': True, }, - } + }] + + _API_BASE_URL = 'https://api.curiositystream.com/v1/media/' def _real_extract(self, url): video_id = self._match_id(url) formats = [] for encoding_format in ('m3u8', 'mpd'): - media = self._call_api('media/' + video_id, video_id, query={ + media = self._call_api(video_id, video_id, query={ 'encodingsNew': 'true', 'encodingsFormat': encoding_format, }) @@ -140,12 +145,33 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'duration': int_or_none(media.get('duration')), 'tags': media.get('tags'), 'subtitles': subtitles, + 'channel': media.get('producer'), + 'categories': [media.get('primary_category'), media.get('type')], + 'average_rating': media.get('rating_percentage'), + 'series_id': str(media.get('collection_id') or '') or None, } -class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): - IE_NAME = 'curiositystream:collection' - _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:collections?|series)/(?P<id>\d+)' +class CuriosityStreamCollectionBaseIE(CuriosityStreamBaseIE): + + def _real_extract(self, url): + collection_id = self._match_id(url) + collection = self._call_api(collection_id, collection_id) + entries = [] + for media in collection.get('media', []): + media_id = compat_str(media.get('id')) + media_type, ie = ('series', CuriosityStreamSeriesIE) if media.get('is_collection') else ('video', CuriosityStreamIE) + entries.append(self.url_result( + 'https://curiositystream.com/%s/%s' % (media_type, media_id), + ie=ie.ie_key(), video_id=media_id)) + return self.playlist_result( + 
entries, collection_id, + collection.get('title'), collection.get('description')) + + +class CuriosityStreamCollectionsIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:collections' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/collections/(?P<id>\d+)' _API_BASE_URL = 'https://api.curiositystream.com/v2/collections/' _TESTS = [{ 'url': 'https://curiositystream.com/collections/86', @@ -156,7 +182,17 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 7, }, { - 'url': 'https://app.curiositystream.com/collection/2', + 'url': 'https://curiositystream.com/collections/36', + 'only_matching': True, + }] + + +class CuriosityStreamSeriesIE(CuriosityStreamCollectionBaseIE): + IE_NAME = 'curiositystream:series' + _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/(?:series|collection)/(?P<id>\d+)' + _API_BASE_URL = 'https://api.curiositystream.com/v2/series/' + _TESTS = [{ + 'url': 'https://curiositystream.com/series/2', 'info_dict': { 'id': '2', 'title': 'Curious Minds: The Internet', @@ -164,23 +200,6 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): }, 'playlist_mincount': 16, }, { - 'url': 'https://curiositystream.com/series/2', - 'only_matching': True, - }, { - 'url': 'https://curiositystream.com/collections/36', + 'url': 'https://curiositystream.com/collection/2', 'only_matching': True, }] - - def _real_extract(self, url): - collection_id = self._match_id(url) - collection = self._call_api(collection_id, collection_id) - entries = [] - for media in collection.get('media', []): - media_id = compat_str(media.get('id')) - media_type, ie = ('series', CuriosityStreamCollectionIE) if media.get('is_collection') else ('video', CuriosityStreamIE) - entries.append(self.url_result( - 'https://curiositystream.com/%s/%s' % (media_type, media_id), - ie=ie.ie_key(), video_id=media_id)) - return self.playlist_result( - entries, collection_id, - collection.get('title'), collection.get('description')) diff --git 
a/yt_dlp/extractor/discoverynetworks.py b/yt_dlp/extractor/discoverynetworks.py index f43c87160..4f8bdf0b9 100644 --- a/yt_dlp/extractor/discoverynetworks.py +++ b/yt_dlp/extractor/discoverynetworks.py @@ -19,7 +19,6 @@ class DiscoveryNetworksDeIE(DPlayIE): 'upload_date': '20190331', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { diff --git a/yt_dlp/extractor/discoveryplusindia.py b/yt_dlp/extractor/discoveryplusindia.py index 51801402c..8ec418a97 100644 --- a/yt_dlp/extractor/discoveryplusindia.py +++ b/yt_dlp/extractor/discoveryplusindia.py @@ -28,7 +28,6 @@ class DiscoveryPlusIndiaIE(DPlayIE): 'creator': 'Discovery Channel', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'skip': 'Cookies (not necessarily logged in) are needed' diff --git a/yt_dlp/extractor/disney.py b/yt_dlp/extractor/disney.py index f018cbe9d..0ad7b1f46 100644 --- a/yt_dlp/extractor/disney.py +++ b/yt_dlp/extractor/disney.py @@ -7,8 +7,8 @@ from .common import InfoExtractor from ..utils import ( int_or_none, unified_strdate, - compat_str, determine_ext, + join_nonempty, update_url_query, ) @@ -119,18 +119,13 @@ class DisneyIE(InfoExtractor): continue formats.append(f) continue - format_id = [] - if flavor_format: - format_id.append(flavor_format) - if tbr: - format_id.append(compat_str(tbr)) ext = determine_ext(flavor_url) if flavor_format == 'applehttp' or ext == 'm3u8': ext = 'mp4' width = int_or_none(flavor.get('width')) height = int_or_none(flavor.get('height')) formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(flavor_format, tbr), 'url': flavor_url, 'width': width, 'height': height, diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py index d62480810..525c8e243 100644 --- a/yt_dlp/extractor/dplay.py +++ b/yt_dlp/extractor/dplay.py @@ -46,7 +46,6 @@ class DPlayIE(InfoExtractor): 'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -67,7 +66,6 @@ class 
DPlayIE(InfoExtractor): 'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -87,7 +85,6 @@ class DPlayIE(InfoExtractor): 'episode_number': 7, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'skip': 'Available for Premium users', @@ -313,9 +310,6 @@ class HGTVDeIE(DPlayIE): 'season_number': 3, 'episode_number': 3, }, - 'params': { - 'format': 'bestvideo', - }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/dvtv.py b/yt_dlp/extractor/dvtv.py index de7f6d670..08663cffb 100644 --- a/yt_dlp/extractor/dvtv.py +++ b/yt_dlp/extractor/dvtv.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, + join_nonempty, js_to_json, mimetype2ext, try_get, @@ -139,13 +140,9 @@ class DVTVIE(InfoExtractor): label = video.get('label') height = self._search_regex( r'^(\d+)[pP]', label or '', 'height', default=None) - format_id = ['http'] - for f in (ext, label): - if f: - format_id.append(f) formats.append({ 'url': video_url, - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty('http', ext, label), 'height': int_or_none(height), }) self._sort_formats(formats) diff --git a/yt_dlp/extractor/egghead.py b/yt_dlp/extractor/egghead.py index f6b50e7c2..b6b86768c 100644 --- a/yt_dlp/extractor/egghead.py +++ b/yt_dlp/extractor/egghead.py @@ -86,7 +86,6 @@ class EggheadLessonIE(EggheadBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, }, { 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', diff --git a/yt_dlp/extractor/espn.py b/yt_dlp/extractor/espn.py index d4a66c29f..dc50f3b8b 100644 --- a/yt_dlp/extractor/espn.py +++ b/yt_dlp/extractor/espn.py @@ -7,7 +7,9 @@ from .once import OnceIE from ..compat import compat_str from ..utils import ( determine_ext, + dict_get, int_or_none, + unified_strdate, unified_timestamp, ) @@ -236,3 +238,44 @@ class FiveThirtyEightIE(InfoExtractor): webpage, 'embed url') return 
self.url_result(embed_url, 'AbcNewsVideo') + + +class ESPNCricInfoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?espncricinfo\.com/video/[^#$&?/]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.espncricinfo.com/video/finch-chasing-comes-with-risks-despite-world-cup-trend-1289135', + 'info_dict': { + 'id': '1289135', + 'ext': 'mp4', + 'title': 'Finch: Chasing comes with \'risks\' despite World Cup trend', + 'description': 'md5:ea32373303e25efbb146efdfc8a37829', + 'upload_date': '20211113', + 'duration': 96, + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + data_json = self._download_json(f'https://hs-consumer-api.espncricinfo.com/v1/pages/video/video-details?videoId={id}', id)['video'] + formats, subtitles = [], {} + for item in data_json.get('playbacks') or []: + if item.get('type') == 'HLS' and item.get('url'): + m3u8_frmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles(item['url'], id) + formats.extend(m3u8_frmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + elif item.get('type') == 'AUDIO' and item.get('url'): + formats.append({ + 'url': item['url'], + 'vcodec': 'none', + }) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title'), + 'description': data_json.get('summary'), + 'upload_date': unified_strdate(dict_get(data_json, ('publishedAt', 'recordedAt'))), + 'duration': data_json.get('duration'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 9d963ee46..a4baad2da 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -50,6 +50,7 @@ from .animelab import ( AnimeLabIE, AnimeLabShowsIE, ) +from .amazon import AmazonStoreIE from .americastestkitchen import ( AmericasTestKitchenIE, AmericasTestKitchenSeasonIE, @@ -165,6 +166,7 @@ from .bleacherreport import ( BleacherReportIE, BleacherReportCMSIE, ) +from .blogger import BloggerIE 
from .bloomberg import BloombergIE from .bokecc import BokeCCIE from .bongacams import BongaCamsIE @@ -177,6 +179,7 @@ from .br import ( ) from .bravotv import BravoTVIE from .breakcom import BreakIE +from .breitbart import BreitBartIE from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, @@ -192,6 +195,7 @@ from .camdemy import ( ) from .cammodels import CamModelsIE from .camwithher import CamWithHerIE +from .canalalpha import CanalAlphaIE from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import ( @@ -235,10 +239,7 @@ from .ccc import ( from .ccma import CCMAIE from .cctv import CCTVIE from .cda import CDAIE -from .ceskatelevize import ( - CeskaTelevizeIE, - CeskaTelevizePoradyIE, -) +from .ceskatelevize import CeskaTelevizeIE from .cgtn import CGTNIE from .channel9 import Channel9IE from .charlierose import CharlieRoseIE @@ -293,6 +294,7 @@ from .commonprotocols import ( from .condenast import CondeNastIE from .contv import CONtvIE from .corus import CorusIE +from .cozytv import CozyTVIE from .cracked import CrackedIE from .crackle import CrackleIE from .crooksandliars import CrooksAndLiarsIE @@ -309,7 +311,8 @@ from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .curiositystream import ( CuriosityStreamIE, - CuriosityStreamCollectionIE, + CuriosityStreamCollectionsIE, + CuriosityStreamSeriesIE, ) from .cwtv import CWTVIE from .dailymail import DailyMailIE @@ -419,6 +422,7 @@ from .espn import ( ESPNIE, ESPNArticleIE, FiveThirtyEightIE, + ESPNCricInfoIE, ) from .esri import EsriVideoIE from .europa import EuropaIE @@ -495,7 +499,10 @@ from .funimation import ( ) from .funk import FunkIE from .fusion import FusionIE -from .gab import GabTVIE +from .gab import ( + GabTVIE, + GabIE, +) from .gaia import GaiaIE from .gameinformer import GameInformerIE from .gamespot import GameSpotIE @@ -591,12 +598,16 @@ from .indavideo import IndavideoEmbedIE from .infoq import InfoQIE from .instagram import ( 
InstagramIE, + InstagramIOSIE, InstagramUserIE, InstagramTagIE, ) from .internazionale import InternazionaleIE from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import IPrimaIE +from .iprima import ( + IPrimaIE, + IPrimaCNNIE +) from .iqiyi import IqiyiIE from .ir90tv import Ir90TvIE from .itv import ( @@ -696,6 +707,7 @@ from .line import ( LineLiveChannelIE, ) from .linkedin import ( + LinkedInIE, LinkedInLearningIE, LinkedInLearningCourseIE, ) @@ -787,6 +799,7 @@ from .mirrativ import ( ) from .mit import TechTVMITIE, OCWMITIE from .mitele import MiTeleIE +from .mixch import MixchIE from .mixcloud import ( MixcloudIE, MixcloudUserIE, @@ -839,7 +852,10 @@ from .myvi import ( ) from .myvideoge import MyVideoGeIE from .myvidster import MyVidsterIE -from .n1 import N1InfoIIE, N1InfoAssetIE +from .n1 import ( + N1InfoAssetIE, + N1InfoIIE, +) from .nationalgeographic import ( NationalGeographicVideoIE, NationalGeographicTVIE, @@ -873,7 +889,10 @@ from .ndr import ( NJoyEmbedIE, ) from .ndtv import NDTVIE -from .nebula import NebulaIE +from .nebula import ( + NebulaIE, + NebulaCollectionIE, +) from .nerdcubed import NerdCubedFeedIE from .netzkino import NetzkinoIE from .neteasemusic import ( @@ -927,7 +946,10 @@ from .niconico import ( NicovideoSearchIE, NicovideoSearchURLIE, ) -from .ninecninemedia import NineCNineMediaIE +from .ninecninemedia import ( + NineCNineMediaIE, + CPTwentyFourIE, +) from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE @@ -991,6 +1013,7 @@ from .oktoberfesttv import OktoberfestTVIE from .olympics import OlympicsReplayIE from .on24 import On24IE from .ondemandkorea import OnDemandKoreaIE +from .onefootball import OneFootballIE from .onet import ( OnetIE, OnetChannelIE, @@ -1051,6 +1074,7 @@ from .peertube import ( PeerTubeIE, PeerTubePlaylistIE, ) +from .peertv import PeerTVIE from .peloton import ( PelotonIE, PelotonLiveIE @@ -1075,6 +1099,7 @@ from .pinterest import ( 
PinterestCollectionIE, ) from .pladform import PladformIE +from .planetmarathi import PlanetMarathiIE from .platzi import ( PlatziIE, PlatziCourseIE, @@ -1096,9 +1121,14 @@ from .pokemon import ( PokemonIE, PokemonWatchIE, ) +from .polsatgo import PolsatGoIE from .polskieradio import ( PolskieRadioIE, PolskieRadioCategoryIE, + PolskieRadioPlayerIE, + PolskieRadioPodcastIE, + PolskieRadioPodcastListIE, + PolskieRadioRadioKierowcowIE, ) from .popcorntimes import PopcorntimesIE from .popcorntv import PopcornTVIE @@ -1145,6 +1175,11 @@ from .radiode import RadioDeIE from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE +from .radiozet import RadioZetPodcastIE +from .radiokapital import ( + RadioKapitalIE, + RadioKapitalShowIE, +) from .radlive import ( RadLiveIE, RadLiveChannelIE, @@ -1155,6 +1190,8 @@ from .rai import ( RaiPlayLiveIE, RaiPlayPlaylistIE, RaiIE, + RaiPlayRadioIE, + RaiPlayRadioPlaylistIE, ) from .raywenderlich import ( RayWenderlichIE, @@ -1178,10 +1215,8 @@ from .redbulltv import ( RedBullTVRrnContentIE, RedBullIE, ) -from .reddit import ( - RedditIE, - RedditRIE, -) +from .reddit import RedditIE +from .redgifs import RedGifsIE from .redtube import RedTubeIE from .regiotv import RegioTVIE from .rentv import ( @@ -1195,7 +1230,7 @@ from .rice import RICEIE from .rmcdecouverte import RMCDecouverteIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE -from .roosterteeth import RoosterTeethIE +from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE from .rozhlas import RozhlasIE @@ -1208,6 +1243,7 @@ from .rtl2 import ( RTL2YouSeriesIE, ) from .rtp import RTPIE +from .rtrfm import RTRFMIE from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE from .rtvnh import RTVNHIE @@ -1249,7 +1285,7 @@ from .scte import ( SCTECourseIE, ) from .seeker 
import SeekerIE -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE, SenateGovIE from .sendtonews import SendtoNewsIE from .servus import ServusIE from .sevenplus import SevenPlusIE @@ -1376,8 +1412,10 @@ from .streamable import StreamableIE from .streamanity import StreamanityIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .streamff import StreamFFIE from .streetvoice import StreetVoiceIE from .stretchinternet import StretchInternetIE +from .stripchat import StripchatIE from .stv import STVPlayerIE from .sunporno import SunPornoIE from .sverigesradio import ( @@ -1551,6 +1589,7 @@ from .tvnow import ( from .tvp import ( TVPEmbedIE, TVPIE, + TVPStreamIE, TVPWebsiteIE, ) from .tvplay import ( @@ -1761,6 +1800,7 @@ from .weibo import ( WeiboMobileIE ) from .weiqitv import WeiqiTVIE +from .willow import WillowIE from .wimtv import WimTVIE from .whowatch import WhoWatchIE from .wistia import ( @@ -1768,6 +1808,10 @@ from .wistia import ( WistiaPlaylistIE, ) from .worldstarhiphop import WorldStarHipHopIE +from .wppilot import ( + WPPilotIE, + WPPilotChannelsIE, +) from .wsj import ( WSJIE, WSJArticleIE, diff --git a/yt_dlp/extractor/fancode.py b/yt_dlp/extractor/fancode.py index 912feb702..f6733b124 100644 --- a/yt_dlp/extractor/fancode.py +++ b/yt_dlp/extractor/fancode.py @@ -21,7 +21,6 @@ class FancodeVodIE(InfoExtractor): 'url': 'https://fancode.com/video/15043/match-preview-pbks-vs-mi', 'params': { 'skip_download': True, - 'format': 'bestvideo' }, 'info_dict': { 'id': '6249806281001', diff --git a/yt_dlp/extractor/francetv.py b/yt_dlp/extractor/francetv.py index 3bbab69e6..bc5ef4df9 100644 --- a/yt_dlp/extractor/francetv.py +++ b/yt_dlp/extractor/francetv.py @@ -185,7 +185,7 @@ class FranceTVIE(InfoExtractor): 'vcodec': 'none', 'ext': 'mhtml', 'protocol': 'mhtml', - 'url': 'about:dummy', + 'url': 'about:invalid', 'fragments': [{ 'path': sheet, # XXX: not entirely accurate; each spritesheet seems to be diff 
--git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py index 382cbe159..96dad2ca3 100644 --- a/yt_dlp/extractor/funimation.py +++ b/yt_dlp/extractor/funimation.py @@ -10,6 +10,7 @@ from ..compat import compat_HTTPError from ..utils import ( determine_ext, int_or_none, + join_nonempty, js_to_json, orderedSet, qualities, @@ -275,7 +276,7 @@ class FunimationIE(FunimationBaseIE): def _get_subtitles(self, subtitles, experience_id, episode, display_id, format_name): if isinstance(episode, str): webpage = self._download_webpage( - f'https://www.funimation.com/player/{experience_id}', display_id, + f'https://www.funimation.com/player/{experience_id}/', display_id, fatal=False, note=f'Downloading player webpage for {format_name}') episode, _, _ = self._get_episode(webpage, episode_id=episode, fatal=False) @@ -288,10 +289,11 @@ class FunimationIE(FunimationBaseIE): sub_type = sub_type if sub_type != 'FULL' else None current_sub = { 'url': text_track['src'], - 'name': ' '.join(filter(None, (version, text_track.get('label'), sub_type))) + 'name': join_nonempty(version, text_track.get('label'), sub_type, delim=' ') } - lang = '_'.join(filter(None, ( - text_track.get('language', 'und'), version if version != 'Simulcast' else None, sub_type))) + lang = join_nonempty(text_track.get('language', 'und'), + version if version != 'Simulcast' else None, + sub_type, delim='_') if current_sub not in subtitles.get(lang, []): subtitles.setdefault(lang, []).append(current_sub) return subtitles diff --git a/yt_dlp/extractor/gab.py b/yt_dlp/extractor/gab.py index 25b5cb066..bde6e8624 100644 --- a/yt_dlp/extractor/gab.py +++ b/yt_dlp/extractor/gab.py @@ -6,7 +6,11 @@ import re from .common import InfoExtractor from ..utils import ( clean_html, + int_or_none, + parse_codecs, + parse_duration, str_to_int, + unified_timestamp ) @@ -32,8 +36,10 @@ class GabTVIE(InfoExtractor): channel_name = self._search_regex(r'data-channel-name=\"(?P<channel_id>[^\"]+)', webpage, 'channel_name') 
title = self._search_regex(r'data-episode-title=\"(?P<channel_id>[^\"]+)', webpage, 'title') view_key = self._search_regex(r'data-view-key=\"(?P<channel_id>[^\"]+)', webpage, 'view_key') - description = clean_html(self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None - available_resolutions = re.findall(r'<a\ data-episode-id=\"%s\"\ data-resolution=\"(?P<resolution>[^\"]+)' % id, webpage) + description = clean_html( + self._html_search_regex(self._meta_regex('description'), webpage, 'description', group='content')) or None + available_resolutions = re.findall(r'<a\ data-episode-id=\"%s\"\ data-resolution=\"(?P<resolution>[^\"]+)' % id, + webpage) formats = [] for resolution in available_resolutions: @@ -62,3 +68,80 @@ class GabTVIE(InfoExtractor): 'uploader_id': channel_id, 'thumbnail': f'https://tv.gab.com/image/{id}', } + + +class GabIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gab\.com/[^/]+/posts/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://gab.com/SomeBitchIKnow/posts/107163961867310434', + 'md5': '8ca34fb00f1e1033b5c5988d79ec531d', + 'info_dict': { + 'id': '107163961867310434-0', + 'ext': 'mp4', + 'title': 'L on Gab', + 'uploader_id': '946600', + 'uploader': 'SomeBitchIKnow', + 'description': 'md5:204055fafd5e1a519f5d6db953567ca3', + 'timestamp': 1635192289, + 'upload_date': '20211025', + } + }, { + 'url': 'https://gab.com/TheLonelyProud/posts/107045884469287653', + 'md5': 'f9cefcfdff6418e392611a828d47839d', + 'info_dict': { + 'id': '107045884469287653-0', + 'ext': 'mp4', + 'title': 'Jody Sadowski on Gab', + 'uploader_id': '1390705', + 'timestamp': 1633390571, + 'upload_date': '20211004', + 'uploader': 'TheLonelyProud', + } + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + json_data = self._download_json(f'https://gab.com/api/v1/statuses/{post_id}', post_id) + + entries = [] + for idx, media in enumerate(json_data['media_attachments']): + if media.get('type') not in 
('video', 'gifv'): + continue + metadata = media['meta'] + format_metadata = { + 'acodec': parse_codecs(metadata.get('audio_encode')).get('acodec'), + 'asr': int_or_none((metadata.get('audio_bitrate') or '').split(' ')[0]), + 'fps': metadata.get('fps'), + } + + formats = [{ + 'url': url, + 'width': f.get('width'), + 'height': f.get('height'), + 'tbr': int_or_none(f.get('bitrate'), scale=1000), + **format_metadata, + } for url, f in ((media.get('url'), metadata.get('original') or {}), + (media.get('source_mp4'), metadata.get('playable') or {})) if url] + + self._sort_formats(formats) + + author = json_data.get('account') or {} + entries.append({ + 'id': f'{post_id}-{idx}', + 'title': f'{json_data["account"]["display_name"]} on Gab', + 'timestamp': unified_timestamp(json_data.get('created_at')), + 'formats': formats, + 'description': clean_html(json_data.get('content')), + 'duration': metadata.get('duration') or parse_duration(metadata.get('length')), + 'like_count': json_data.get('favourites_count'), + 'comment_count': json_data.get('replies_count'), + 'repost_count': json_data.get('reblogs_count'), + 'uploader': author.get('username'), + 'uploader_id': author.get('id'), + 'uploader_url': author.get('url'), + }) + + if len(entries) > 1: + return self.playlist_result(entries, post_id) + + return entries[0] diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 0d279016b..51557f0f1 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -56,7 +56,7 @@ from .sportbox import SportBoxIE from .myvi import MyviIE from .condenast import CondeNastIE from .udn import UDNEmbedIE -from .senateisvp import SenateISVPIE +from .senategov import SenateISVPIE from .svt import SVTIE from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE @@ -135,6 +135,8 @@ from .arcpublishing import ArcPublishingIE from .medialaan import MedialaanIE from .simplecast import SimplecastIE from .wimtv import WimTVIE +from .tvp import TVPEmbedIE 
+from .blogger import BloggerIE class GenericIE(InfoExtractor): @@ -359,9 +361,6 @@ class GenericIE(InfoExtractor): 'formats': 'mincount:9', 'upload_date': '20130904', }, - 'params': { - 'format': 'bestvideo', - }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 { @@ -2175,6 +2174,17 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # blogger embed + 'url': 'https://blog.tomeuvizoso.net/2019/01/a-panfrost-milestone.html', + 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', + 'info_dict': { + 'id': 'BLOGGER-video-3c740e3a49197e16-796', + 'ext': 'mp4', + 'title': 'Blogger', + 'thumbnail': r're:^https?://.*', + }, + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2334,12 +2344,43 @@ class GenericIE(InfoExtractor): 'thumbnail': 'https://bogmedia.org/contents/videos_screenshots/21000/21217/preview_480p.mp4.jpg', } }, + { + # Reddit-hosted video that will redirect and be processed by RedditIE + # Redirects to https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ + 'url': 'https://v.redd.it/zv89llsvexdz', + 'md5': '87f5f02f6c1582654146f830f21f8662', + 'info_dict': { + 'id': 'zv89llsvexdz', + 'ext': 'mp4', + 'timestamp': 1501941939.0, + 'title': 'That small heart attack.', + 'upload_date': '20170805', + 'uploader': 'Antw87' + } + }, + { + # 1080p Reddit-hosted video that will redirect and be processed by RedditIE + 'url': 'https://v.redd.it/33hgok7dfbz71/', + 'md5': '7a1d587940242c9bb3bd6eb320b39258', + 'info_dict': { + 'id': '33hgok7dfbz71', + 'ext': 'mp4', + 'title': "The game Didn't want me to Knife that Guy I guess", + 'uploader': 'paraf1ve', + 'timestamp': 1636788683.0, + 'upload_date': '20211113' + } + } + # ] def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) + def report_detected(self, name): + self._downloader.write_debug(f'Identified a {name}') + def _extract_rss(self, url, 
video_id, doc): playlist_title = doc.find('./channel/title').text playlist_desc_el = doc.find('./channel/description') @@ -2555,10 +2596,13 @@ class GenericIE(InfoExtractor): content_type = head_response.headers.get('Content-Type', '').lower() m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type) if m: + self.report_detected('direct video link') format_id = compat_str(m.group('format_id')) subtitles = {} if format_id.endswith('mpegurl'): formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') + elif format_id.endswith('mpd') or format_id.endswith('dash+xml'): + formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id) elif format_id == 'f4m': formats = self._extract_f4m_formats(url, video_id) else: @@ -2595,6 +2639,7 @@ class GenericIE(InfoExtractor): # Is it an M3U playlist? if first_bytes.startswith(b'#EXTM3U'): + self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') self._sort_formats(info_dict['formats']) return info_dict @@ -2625,16 +2670,20 @@ class GenericIE(InfoExtractor): except compat_xml_parse_error: doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': + self.report_detected('RSS feed') return self._extract_rss(url, video_id, doc) elif doc.tag == 'SmoothStreamingMedia': info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) + self.report_detected('ISM manifest') self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): smil = self._parse_smil(doc, url, video_id) + self.report_detected('SMIL file') self._sort_formats(smil['formats']) return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': + self.report_detected('XSPF playlist') return self.playlist_result( self._parse_xspf( doc, video_id, xspf_url=url, @@ -2645,10 +2694,12 @@ 
class GenericIE(InfoExtractor): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) + self.report_detected('DASH manifest') self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) + self.report_detected('F4M manifest') self._sort_formats(info_dict['formats']) return info_dict except compat_xml_parse_error: @@ -2657,6 +2708,7 @@ class GenericIE(InfoExtractor): # Is it a Camtasia project? camtasia_res = self._extract_camtasia(url, video_id, webpage) if camtasia_res is not None: + self.report_detected('Camtasia video') return camtasia_res # Sometimes embedded video player is hidden behind percent encoding @@ -2707,6 +2759,8 @@ class GenericIE(InfoExtractor): 'age_limit': age_limit, }) + self._downloader.write_debug('Looking for video embeds') + # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: @@ -3204,6 +3258,11 @@ class GenericIE(InfoExtractor): if onionstudios_url: return self.url_result(onionstudios_url) + # Look for Blogger embeds + blogger_urls = BloggerIE._extract_urls(webpage) + if blogger_urls: + return self.playlist_from_matches(blogger_urls, video_id, video_title, ie=BloggerIE.ie_key()) + # Look for ViewLift embeds viewlift_url = ViewLiftEmbedIE._extract_url(webpage) if viewlift_url: @@ -3497,9 +3556,14 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) + tvp_urls = TVPEmbedIE._extract_urls(webpage) + if tvp_urls: + return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key()) + # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: + self.report_detected('HTML5 media') if len(entries) == 1: entries[0].update({ 'id': video_id, @@ -3519,6 +3583,7 @@ class 
GenericIE(InfoExtractor): webpage, video_id, transform_source=js_to_json) if jwplayer_data: if isinstance(jwplayer_data.get('playlist'), str): + self.report_detected('JW Player playlist') return { **info_dict, '_type': 'url', @@ -3528,6 +3593,7 @@ class GenericIE(InfoExtractor): try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) + self.report_detected('JW Player data') return merge_dicts(info, info_dict) except ExtractorError: # See https://github.com/ytdl-org/youtube-dl/pull/16735 @@ -3577,6 +3643,7 @@ class GenericIE(InfoExtractor): }, }) if formats or subtitles: + self.report_detected('video.js embed') self._sort_formats(formats) info_dict['formats'] = formats info_dict['subtitles'] = subtitles @@ -3585,6 +3652,7 @@ class GenericIE(InfoExtractor): # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld(webpage, video_id, default={}) if json_ld.get('url'): + self.report_detected('JSON LD') return merge_dicts(json_ld, info_dict) def check_video(vurl): @@ -3601,7 +3669,9 @@ class GenericIE(InfoExtractor): # Start with something easy: JW Player in SWFObject found = filter_video(re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)) - if not found: + if found: + self.report_detected('JW Player in SFWObject') + else: # Look for gorilla-vid style embedding found = filter_video(re.findall(r'''(?sx) (?: @@ -3611,10 +3681,13 @@ class GenericIE(InfoExtractor): ) .*? ['"]?file['"]?\s*:\s*["\'](.*?)["\']''', webpage)) + if found: + self.report_detected('JW Player embed') if not found: # Look for generic KVS player found = re.search(r'<script [^>]*?src="https://.+?/kt_player\.js\?v=(?P<ver>(?P<maj_ver>\d+)(\.\d+)+)".*?>', webpage) if found: + self.report_detected('KWS Player') if found.group('maj_ver') not in ['4', '5']: self.report_warning('Untested major version (%s) in player engine--Download may fail.' 
% found.group('ver')) flashvars = re.search(r'(?ms)<script.*?>.*?var\s+flashvars\s*=\s*(\{.*?\});.*?</script>', webpage) @@ -3660,10 +3733,14 @@ class GenericIE(InfoExtractor): if not found: # Broaden the search a little bit found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)) + if found: + self.report_detected('video file') if not found: # Broaden the findall a little bit: JWPlayer JS loader found = filter_video(re.findall( r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage)) + if found: + self.report_detected('JW Player JS loader') if not found: # Flow player found = filter_video(re.findall(r'''(?xs) @@ -3672,10 +3749,14 @@ class GenericIE(InfoExtractor): \s*\{[^}]+? ["']?clip["']?\s*:\s*\{\s* ["']?url["']?\s*:\s*["']([^"']+)["'] ''', webpage)) + if found: + self.report_detected('Flow Player') if not found: # Cinerama player found = re.findall( r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) + if found: + self.report_detected('Cinerama player') if not found: # Try to find twitter cards info # twitter:player:stream should be checked before twitter:player since @@ -3683,6 +3764,8 @@ class GenericIE(InfoExtractor): # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) found = filter_video(re.findall( r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) + if found: + self.report_detected('Twitter card') if not found: # We look for Open Graph info: # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am) @@ -3690,6 +3773,8 @@ class GenericIE(InfoExtractor): # We only look in og:video if the MIME type is a video, don't try if it's a Flash player: if m_video_type is not None: found = filter_video(re.findall(r'<meta.*?property="og:(?:video|audio)".*?content="(.*?)"', webpage)) + if found: + self.report_detected('Open Graph video info') if not found: REDIRECT_REGEX = 
r'[0-9]{,2};\s*(?:URL|url)=\'?([^\'"]+)' found = re.search( @@ -3721,6 +3806,7 @@ class GenericIE(InfoExtractor): # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) embed_url = self._html_search_meta('twitter:player', webpage, default=None) if embed_url and embed_url != url: + self.report_detected('twitter:player iframe') return self.url_result(embed_url) if not found: diff --git a/yt_dlp/extractor/hotstar.py b/yt_dlp/extractor/hotstar.py index 12e6c53d4..0bdf772a1 100644 --- a/yt_dlp/extractor/hotstar.py +++ b/yt_dlp/extractor/hotstar.py @@ -230,6 +230,11 @@ class HotStarIE(HotStarBaseIE): if tags and 'encryption:plain' not in tags: for f in current_formats: f['has_drm'] = True + if tags and 'language' in tags: + lang = re.search(r'language:(?P<lang>[a-z]+)', tags).group('lang') + for f in current_formats: + if not f.get('langauge'): + f['language'] = lang formats.extend(current_formats) subs = self._merge_subtitles(subs, current_subs) if not formats and geo_restricted: diff --git a/yt_dlp/extractor/imdb.py b/yt_dlp/extractor/imdb.py index a31301985..24f1fde64 100644 --- a/yt_dlp/extractor/imdb.py +++ b/yt_dlp/extractor/imdb.py @@ -111,7 +111,7 @@ class ImdbIE(InfoExtractor): 'formats': formats, 'description': info.get('videoDescription'), 'thumbnail': url_or_none(try_get( - video_metadata, lambda x: x['videoSlate']['source'])), + info, lambda x: x['videoSlate']['source'])), 'duration': parse_duration(info.get('videoRuntime')), } diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py index ccfcddd5b..1fcf97a19 100644 --- a/yt_dlp/extractor/instagram.py +++ b/yt_dlp/extractor/instagram.py @@ -1,4 +1,4 @@ -from __future__ import unicode_literals +# coding: utf-8 import itertools import hashlib @@ -8,7 +8,6 @@ import time from .common import InfoExtractor from ..compat import ( - compat_str, compat_HTTPError, ) from ..utils import ( @@ -18,16 +17,156 @@ from ..utils import ( int_or_none, lowercase_escape, std_headers, 
- try_get, + traverse_obj, url_or_none, - variadic, urlencode_postdata, ) -class InstagramIE(InfoExtractor): - _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))' +class InstagramBaseIE(InfoExtractor): _NETRC_MACHINE = 'instagram' + _IS_LOGGED_IN = False + + def _login(self): + username, password = self._get_login_info() + if username is None or self._IS_LOGGED_IN: + return + + login_webpage = self._download_webpage( + 'https://www.instagram.com/accounts/login/', None, + note='Downloading login webpage', errnote='Failed to download login webpage') + + shared_data = self._parse_json( + self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', + login_webpage, 'shared data', default='{}'), + None) + + login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ + 'Accept': '*/*', + 'X-IG-App-ID': '936619743392459', + 'X-ASBD-ID': '198387', + 'X-IG-WWW-Claim': '0', + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRFToken': shared_data['config']['csrf_token'], + 'X-Instagram-AJAX': shared_data['rollout_hash'], + 'Referer': 'https://www.instagram.com/', + }, data=urlencode_postdata({ + 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', + 'username': username, + 'queryParams': '{}', + 'optIntoOneTap': 'false', + 'stopDeletionNonce': '', + 'trustedDeviceRecords': '{}', + })) + + if not login.get('authenticated'): + if login.get('message'): + raise ExtractorError(f'Unable to login: {login["message"]}') + raise ExtractorError('Unable to login') + InstagramBaseIE._IS_LOGGED_IN = True + + def _real_initialize(self): + self._login() + + def _get_count(self, media, kind, *keys): + return traverse_obj( + media, (kind, 'count'), *((f'edge_media_{key}', 'count') for key in keys), + expected_type=int_or_none) + + def _get_dimension(self, name, media, webpage=None): + return ( + traverse_obj(media, ('dimensions', name), expected_type=int_or_none) + or 
int_or_none(self._html_search_meta( + (f'og:video:{name}', f'video:{name}'), webpage or '', default=None))) + + def _extract_nodes(self, nodes, is_direct=False): + for idx, node in enumerate(nodes, start=1): + if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: + continue + + video_id = node.get('shortcode') + + if is_direct: + info = { + 'id': video_id or node['id'], + 'url': node.get('video_url'), + 'width': self._get_dimension('width', node), + 'height': self._get_dimension('height', node), + 'http_headers': { + 'Referer': 'https://www.instagram.com/', + } + } + elif not video_id: + continue + else: + info = { + '_type': 'url', + 'ie_key': 'Instagram', + 'id': video_id, + 'url': f'https://instagram.com/p/{video_id}', + } + + yield { + **info, + 'title': node.get('title') or (f'Video {idx}' if is_direct else None), + 'description': traverse_obj( + node, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str), + 'thumbnail': traverse_obj( + node, 'display_url', 'thumbnail_src', 'display_src', expected_type=url_or_none), + 'duration': float_or_none(node.get('video_duration')), + 'timestamp': int_or_none(node.get('taken_at_timestamp')), + 'view_count': int_or_none(node.get('video_view_count')), + 'comment_count': self._get_count(node, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'), + 'like_count': self._get_count(node, 'likes', 'preview_like'), + } + + +class InstagramIOSIE(InfoExtractor): + IE_DESC = 'IOS instagram:// URL' + _VALID_URL = r'instagram://media\?id=(?P<id>[\d_]+)' + _TESTS = [{ + 'url': 'instagram://media?id=482584233761418119', + 'md5': '0d2da106a9d2631273e192b372806516', + 'info_dict': { + 'id': 'aye83DjauH', + 'ext': 'mp4', + 'title': 'Video by naomipq', + 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 0, + 'timestamp': 1371748545, + 'upload_date': '20130620', + 'uploader_id': 'naomipq', + 'uploader': 'B E A U T Y F O R A S 
H E S', + 'like_count': int, + 'comment_count': int, + 'comments': list, + }, + 'add_ie': ['Instagram'] + }] + + def _get_id(self, id): + """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" + chrs = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' + media_id = int(id.split('_')[0]) + shortened_id = '' + while media_id > 0: + r = media_id % 64 + media_id = (media_id - r) // 64 + shortened_id = chrs[r] + shortened_id + return shortened_id + + def _real_extract(self, url): + return { + '_type': 'url_transparent', + 'url': f'http://instagram.com/tv/{self._get_id(self._match_id(url))}/', + 'ie_key': 'Instagram', + } + + +class InstagramIE(InstagramBaseIE): + _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -143,71 +282,23 @@ class InstagramIE(InfoExtractor): if mobj: return mobj.group('link') - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - login_webpage = self._download_webpage( - 'https://www.instagram.com/accounts/login/', None, - note='Downloading login webpage', errnote='Failed to download login webpage') - - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - login_webpage, 'shared data', default='{}'), - None) - - login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ - 'Accept': '*/*', - 'X-IG-App-ID': '936619743392459', - 'X-ASBD-ID': '198387', - 'X-IG-WWW-Claim': '0', - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRFToken': shared_data['config']['csrf_token'], - 'X-Instagram-AJAX': shared_data['rollout_hash'], - 'Referer': 'https://www.instagram.com/', - }, data=urlencode_postdata({ - 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', - 'username': username, - 
'queryParams': '{}', - 'optIntoOneTap': 'false', - 'stopDeletionNonce': '', - 'trustedDeviceRecords': '{}', - })) - - if not login.get('authenticated'): - if login.get('message'): - raise ExtractorError(f'Unable to login: {login["message"]}') - raise ExtractorError('Unable to login') - - def _real_initialize(self): - self._login() - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - url = mobj.group('url') - + video_id, url = self._match_valid_url(url).group('id', 'url') webpage, urlh = self._download_webpage_handle(url, video_id) - if 'www.instagram.com/accounts/login' in urlh.geturl().rstrip('/'): + if 'www.instagram.com/accounts/login' in urlh.geturl(): self.raise_login_required('You need to log in to access this content') - (media, video_url, description, thumbnail, timestamp, uploader, - uploader_id, like_count, comment_count, comments, height, - width) = [None] * 12 - shared_data = self._parse_json( self._search_regex( r'window\._sharedData\s*=\s*({.+?});', webpage, 'shared data', default='{}'), video_id, fatal=False) - if shared_data: - media = try_get( - shared_data, - (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], - lambda x: x['entry_data']['PostPage'][0]['media']), - dict) + media = traverse_obj( + shared_data, + ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), + ('entry_data', 'PostPage', 0, 'media'), + expected_type=dict) + # _sharedData.entry_data.PostPage is empty when authenticated (see # https://github.com/ytdl-org/youtube-dl/pull/22880) if not media: @@ -216,123 +307,78 @@ class InstagramIE(InfoExtractor): r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\)\s*;', webpage, 'additional data', default='{}'), video_id, fatal=False) - if additional_data: - media = try_get( - additional_data, lambda x: x['graphql']['shortcode_media'], - dict) - if media: - video_url = media.get('video_url') - height = int_or_none(media.get('dimensions', {}).get('height')) - 
width = int_or_none(media.get('dimensions', {}).get('width')) - description = try_get( - media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) or media.get('caption') - title = media.get('title') - thumbnail = media.get('display_src') or media.get('display_url') - duration = float_or_none(media.get('video_duration')) - timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) - uploader = media.get('owner', {}).get('full_name') - uploader_id = media.get('owner', {}).get('username') - - def get_count(keys, kind): - for key in variadic(keys): - count = int_or_none(try_get( - media, (lambda x: x['edge_media_%s' % key]['count'], - lambda x: x['%ss' % kind]['count']))) - if count is not None: - return count - - like_count = get_count('preview_like', 'like') - comment_count = get_count( - ('preview_comment', 'to_comment', 'to_parent_comment'), 'comment') - - comments = [] - for comment in try_get(media, lambda x: x['edge_media_to_parent_comment']['edges']): - comment_dict = comment.get('node', {}) - comment_text = comment_dict.get('text') - if comment_text: - comments.append({ - 'author': try_get(comment_dict, lambda x: x['owner']['username']), - 'author_id': try_get(comment_dict, lambda x: x['owner']['id']), - 'id': comment_dict.get('id'), - 'text': comment_text, - 'timestamp': int_or_none(comment_dict.get('created_at')), - }) - if not video_url: - edges = try_get( - media, lambda x: x['edge_sidecar_to_children']['edges'], - list) or [] - if edges: - entries = [] - for edge_num, edge in enumerate(edges, start=1): - node = try_get(edge, lambda x: x['node'], dict) - if not node: - continue - node_video_url = url_or_none(node.get('video_url')) - if not node_video_url: - continue - entries.append({ - 'id': node.get('shortcode') or node['id'], - 'title': node.get('title') or 'Video %d' % edge_num, - 'url': node_video_url, - 'thumbnail': node.get('display_url'), - 'duration': float_or_none(node.get('video_duration')), - 
'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), - 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), - 'view_count': int_or_none(node.get('video_view_count')), - }) - return self.playlist_result( - entries, video_id, - 'Post by %s' % uploader_id if uploader_id else None, - description) + media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), expected_type=dict) or {} - if not video_url: - video_url = self._og_search_video_url(webpage, secure=False) - - formats = [{ - 'url': video_url, - 'width': width, - 'height': height, - }] - - if not uploader_id: - uploader_id = self._search_regex( - r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', - webpage, 'uploader id', fatal=False) + uploader_id = traverse_obj(media, ('owner', 'username')) or self._search_regex( + r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'uploader id', fatal=False) + description = ( + traverse_obj(media, ('edge_media_to_caption', 'edges', 0, 'node', 'text'), expected_type=str) + or media.get('caption')) if not description: description = self._search_regex( r'"caption"\s*:\s*"(.+?)"', webpage, 'description', default=None) if description is not None: description = lowercase_escape(description) - if not thumbnail: - thumbnail = self._og_search_thumbnail(webpage) + video_url = media.get('video_url') + if not video_url: + nodes = traverse_obj(media, ('edge_sidecar_to_children', 'edges', ..., 'node'), expected_type=dict) or [] + if nodes: + return self.playlist_result( + self._extract_nodes(nodes, True), video_id, + 'Post by %s' % uploader_id if uploader_id else None, description) + + video_url = self._og_search_video_url(webpage, secure=False) + + formats = [{ + 'url': video_url, + 'width': self._get_dimension('width', media, webpage), + 'height': self._get_dimension('height', media, webpage), + }] + dash = traverse_obj(media, ('dash_info', 'video_dash_manifest')) + if dash: + 
formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash')) + self._sort_formats(formats) + + comments = [{ + 'author': traverse_obj(comment_dict, ('node', 'owner', 'username')), + 'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id')), + 'id': traverse_obj(comment_dict, ('node', 'id')), + 'text': traverse_obj(comment_dict, ('node', 'text')), + 'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), expected_type=int_or_none), + } for comment_dict in traverse_obj(media, ('edge_media_to_parent_comment', 'edges'))] + + display_resources = ( + media.get('display_resources') + or [{'src': media.get(key)} for key in ('display_src', 'display_url')] + or [{'src': self._og_search_thumbnail(webpage)}]) + thumbnails = [{ + 'url': thumbnail['src'], + 'width': thumbnail.get('config_width'), + 'height': thumbnail.get('config_height'), + } for thumbnail in display_resources if thumbnail.get('src')] return { 'id': video_id, 'formats': formats, - 'ext': 'mp4', - 'title': title or 'Video by %s' % uploader_id, + 'title': media.get('title') or 'Video by %s' % uploader_id, 'description': description, - 'duration': duration, - 'thumbnail': thumbnail, - 'timestamp': timestamp, + 'duration': float_or_none(media.get('video_duration')), + 'timestamp': traverse_obj(media, 'taken_at_timestamp', 'date', expected_type=int_or_none), 'uploader_id': uploader_id, - 'uploader': uploader, - 'like_count': like_count, - 'comment_count': comment_count, + 'uploader': traverse_obj(media, ('owner', 'full_name')), + 'like_count': self._get_count(media, 'likes', 'preview_like'), + 'comment_count': self._get_count(media, 'comments', 'preview_comment', 'to_comment', 'to_parent_comment'), 'comments': comments, + 'thumbnails': thumbnails, 'http_headers': { 'Referer': 'https://www.instagram.com/', } } -class InstagramPlaylistIE(InfoExtractor): - # A superclass for handling any kind of query based on GraphQL which - # results in a playlist. 
- +class InstagramPlaylistBaseIE(InstagramBaseIE): _gis_tmpl = None # used to cache GIS request type def _parse_graphql(self, webpage, item_id): @@ -344,10 +390,6 @@ class InstagramPlaylistIE(InfoExtractor): def _extract_graphql(self, data, url): # Parses GraphQL queries containing videos and generates a playlist. - def get_count(suffix): - return int_or_none(try_get( - node, lambda x: x['edge_media_' + suffix]['count'])) - uploader_id = self._match_id(url) csrf_token = data['config']['csrf_token'] rhx_gis = data.get('rhx_gis') or '3c7ca9dcefcf966d11dacf1f151335e8' @@ -396,55 +438,14 @@ class InstagramPlaylistIE(InfoExtractor): continue raise - edges = media.get('edges') - if not edges or not isinstance(edges, list): + nodes = traverse_obj(media, ('edges', ..., 'node'), expected_type=dict) or [] + if not nodes: break + yield from self._extract_nodes(nodes) - for edge in edges: - node = edge.get('node') - if not node or not isinstance(node, dict): - continue - if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: - continue - video_id = node.get('shortcode') - if not video_id: - continue - - info = self.url_result( - 'https://instagram.com/p/%s/' % video_id, - ie=InstagramIE.ie_key(), video_id=video_id) - - description = try_get( - node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], - compat_str) - thumbnail = node.get('thumbnail_src') or node.get('display_src') - timestamp = int_or_none(node.get('taken_at_timestamp')) - - comment_count = get_count('to_comment') - like_count = get_count('preview_like') - view_count = int_or_none(node.get('video_view_count')) - - info.update({ - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'comment_count': comment_count, - 'like_count': like_count, - 'view_count': view_count, - }) - - yield info - - page_info = media.get('page_info') - if not page_info or not isinstance(page_info, dict): - break - - has_next_page = page_info.get('has_next_page') - if 
not has_next_page: - break - - cursor = page_info.get('end_cursor') - if not cursor or not isinstance(cursor, compat_str): + has_next_page = traverse_obj(media, ('page_info', 'has_next_page')) + cursor = traverse_obj(media, ('page_info', 'end_cursor'), expected_type=str) + if not has_next_page or not cursor: break def _real_extract(self, url): @@ -458,11 +459,11 @@ class InstagramPlaylistIE(InfoExtractor): self._extract_graphql(data, url), user_or_tag, user_or_tag) -class InstagramUserIE(InstagramPlaylistIE): +class InstagramUserIE(InstagramPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<id>[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' - _TEST = { + _TESTS = [{ 'url': 'https://instagram.com/porsche', 'info_dict': { 'id': 'porsche', @@ -474,7 +475,7 @@ class InstagramUserIE(InstagramPlaylistIE): 'skip_download': True, 'playlistend': 5, } - } + }] _QUERY_HASH = '42323d64886122307be10013ad2dcc44', @@ -492,11 +493,11 @@ class InstagramUserIE(InstagramPlaylistIE): } -class InstagramTagIE(InstagramPlaylistIE): +class InstagramTagIE(InstagramPlaylistBaseIE): _VALID_URL = r'https?://(?:www\.)?instagram\.com/explore/tags/(?P<id>[^/]+)' IE_DESC = 'Instagram hashtag search' IE_NAME = 'instagram:tag' - _TEST = { + _TESTS = [{ 'url': 'https://instagram.com/explore/tags/lolcats', 'info_dict': { 'id': 'lolcats', @@ -508,7 +509,7 @@ class InstagramTagIE(InstagramPlaylistIE): 'skip_download': True, 'playlistend': 50, } - } + }] _QUERY_HASH = 'f92f56d47dc7a55b606908374b43a314', diff --git a/yt_dlp/extractor/internazionale.py b/yt_dlp/extractor/internazionale.py index 676e8e269..45e2af690 100644 --- a/yt_dlp/extractor/internazionale.py +++ b/yt_dlp/extractor/internazionale.py @@ -20,9 +20,6 @@ class InternazionaleIE(InfoExtractor): 'upload_date': '20150219', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 
'https://www.internazionale.it/video/2018/08/29/telefono-stare-con-noi-stessi', 'md5': '9db8663704cab73eb972d1cee0082c79', @@ -36,9 +33,6 @@ class InternazionaleIE(InfoExtractor): 'upload_date': '20180829', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'params': { - 'format': 'bestvideo', - }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py index 28e660972..347fec1d5 100644 --- a/yt_dlp/extractor/iprima.py +++ b/yt_dlp/extractor/iprima.py @@ -8,12 +8,19 @@ from .common import InfoExtractor from ..utils import ( determine_ext, js_to_json, + urlencode_postdata, + ExtractorError, + parse_qs ) class IPrimaIE(InfoExtractor): - _VALID_URL = r'https?://(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?!cnn)(?:[^/]+)\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' _GEO_BYPASS = False + _NETRC_MACHINE = 'iprima' + _LOGIN_URL = 'https://auth.iprima.cz/oauth2/login' + _TOKEN_URL = 'https://auth.iprima.cz/oauth2/token' + access_token = None _TESTS = [{ 'url': 'https://prima.iprima.cz/particka/92-epizoda', @@ -22,16 +29,8 @@ class IPrimaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Partička (92)', 'description': 'md5:859d53beae4609e6dd7796413f1b6cac', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, { - 'url': 'https://cnn.iprima.cz/videa/70-epizoda', - 'info_dict': { - 'id': 'p681554', - 'ext': 'mp4', - 'title': 'HLAVNÍ ZPRÁVY 3.5.2020', + 'upload_date': '20201103', + 'timestamp': 1604437480, }, 'params': { 'skip_download': True, # m3u8 download @@ -44,11 +43,9 @@ class IPrimaIE(InfoExtractor): 'url': 'http://play.iprima.cz/closer-nove-pripady/closer-nove-pripady-iv-1', 'only_matching': True, }, { - # iframe api.play-backend.iprima.cz 'url': 'https://prima.iprima.cz/my-little-pony/mapa-znameni-2-2', 'only_matching': True, }, { - # iframe prima.iprima.cz 'url': 'https://prima.iprima.cz/porady/jak-se-stavi-sen/rodina-rathousova-praha', 'only_matching': True, }, { @@ -66,9 +63,127 @@ class 
IPrimaIE(InfoExtractor): }, { 'url': 'https://love.iprima.cz/laska-az-za-hrob/slib-dany-bratrovi', 'only_matching': True, - }, { - 'url': 'https://autosalon.iprima.cz/motorsport/7-epizoda-1', - 'only_matching': True, + }] + + def _login(self): + username, password = self._get_login_info() + + if username is None or password is None: + self.raise_login_required('Login is required to access any iPrima content', method='password') + + login_page = self._download_webpage( + self._LOGIN_URL, None, note='Downloading login page', + errnote='Downloading login page failed') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + '_email': username, + '_password': password}) + + _, login_handle = self._download_webpage_handle( + self._LOGIN_URL, None, data=urlencode_postdata(login_form), + note='Logging in') + + code = parse_qs(login_handle.geturl()).get('code')[0] + if not code: + raise ExtractorError('Login failed', expected=True) + + token_request_data = { + 'scope': 'openid+email+profile+phone+address+offline_access', + 'client_id': 'prima_sso', + 'grant_type': 'authorization_code', + 'code': code, + 'redirect_uri': 'https://auth.iprima.cz/sso/auth-check'} + + token_data = self._download_json( + self._TOKEN_URL, None, + note='Downloading token', errnote='Downloading token failed', + data=urlencode_postdata(token_request_data)) + + self.access_token = token_data.get('access_token') + if self.access_token is None: + raise ExtractorError('Getting token failed', expected=True) + + def _raise_access_error(self, error_code): + if error_code == 'PLAY_GEOIP_DENIED': + self.raise_geo_restricted(countries=['CZ'], metadata_available=True) + elif error_code is not None: + self.raise_no_formats('Access to stream infos forbidden', expected=True) + + def _real_initialize(self): + if not self.access_token: + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = 
self._html_search_meta( + ['og:title', 'twitter:title'], + webpage, 'title', default=None) + + video_id = self._search_regex(( + r'productId\s*=\s*([\'"])(?P<id>p\d+)\1', + r'pproduct_id\s*=\s*([\'"])(?P<id>p\d+)\1'), + webpage, 'real id', group='id') + + metadata = self._download_json( + f'https://api.play-backend.iprima.cz/api/v1//products/id-{video_id}/play', + video_id, note='Getting manifest URLs', errnote='Failed to get manifest URLs', + headers={'X-OTT-Access-Token': self.access_token}, + expected_status=403) + + self._raise_access_error(metadata.get('errorCode')) + + stream_infos = metadata.get('streamInfos') + formats = [] + if stream_infos is None: + self.raise_no_formats('Reading stream infos failed', expected=True) + else: + for manifest in stream_infos: + manifest_type = manifest.get('type') + manifest_url = manifest.get('url') + ext = determine_ext(manifest_url) + if manifest_type == 'HLS' or ext == 'm3u8': + formats += self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + elif manifest_type == 'DASH' or ext == 'mpd': + formats += self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash', fatal=False) + self._sort_formats(formats) + + final_result = self._search_json_ld(webpage, video_id) or {} + final_result.update({ + 'id': video_id, + 'title': title, + 'thumbnail': self._html_search_meta( + ['thumbnail', 'og:image', 'twitter:image'], + webpage, 'thumbnail', default=None), + 'formats': formats, + 'description': self._html_search_meta( + ['description', 'og:description', 'twitter:description'], + webpage, 'description', default=None)}) + + return final_result + + +class IPrimaCNNIE(InfoExtractor): + _VALID_URL = r'https?://cnn\.iprima\.cz/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _GEO_BYPASS = False + + _TESTS = [{ + 'url': 'https://cnn.iprima.cz/porady/strunc/24072020-koronaviru-mam-plne-zuby-strasit-druhou-vlnou-je-absurdni-rika-senatorka-dernerova', + 'info_dict': { + 'id': 
'p716177', + 'ext': 'mp4', + 'title': 'md5:277c6b1ed0577e51b40ddd35602ff43e', + }, + 'params': { + 'skip_download': 'm3u8' + } }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/itv.py b/yt_dlp/extractor/itv.py index 6e6a3673c..bdd6af688 100644 --- a/yt_dlp/extractor/itv.py +++ b/yt_dlp/extractor/itv.py @@ -117,7 +117,7 @@ class ITVIE(InfoExtractor): # See: https://github.com/yt-dlp/yt-dlp/issues/986 platform_tag_subs, featureset_subs = next( ((platform_tag, featureset) - for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets + for platform_tag, featuresets in reversed(list(variants.items())) for featureset in featuresets if try_get(featureset, lambda x: x[2]) == 'outband-webvtt'), (None, None)) @@ -146,8 +146,8 @@ class ITVIE(InfoExtractor): # See: https://github.com/yt-dlp/yt-dlp/issues/986 platform_tag_video, featureset_video = next( ((platform_tag, featureset) - for platform_tag, featuresets in reversed(variants.items()) for featureset in featuresets - if try_get(featureset, lambda x: x[:2]) == ['hls', 'aes']), + for platform_tag, featuresets in reversed(list(variants.items())) for featureset in featuresets + if set(try_get(featureset, lambda x: x[:2]) or []) == {'aes', 'hls'}), (None, None)) if not platform_tag_video or not featureset_video: raise ExtractorError('No downloads available', expected=True, video_id=video_id) diff --git a/yt_dlp/extractor/kinopoisk.py b/yt_dlp/extractor/kinopoisk.py index 9e8d01f53..cdbb642e2 100644 --- a/yt_dlp/extractor/kinopoisk.py +++ b/yt_dlp/extractor/kinopoisk.py @@ -23,9 +23,6 @@ class KinoPoiskIE(InfoExtractor): 'duration': 4533, 'age_limit': 12, }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'https://www.kinopoisk.ru/film/81041', 'only_matching': True, diff --git a/yt_dlp/extractor/la7.py b/yt_dlp/extractor/la7.py index 363fbd6a5..de985e450 100644 --- a/yt_dlp/extractor/la7.py +++ b/yt_dlp/extractor/la7.py @@ -7,8 +7,9 @@ from .common import InfoExtractor from 
..utils import ( determine_ext, float_or_none, + HEADRequest, + int_or_none, parse_duration, - smuggle_url, unified_strdate, ) @@ -25,19 +26,38 @@ class LA7IE(InfoExtractor): 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', 'info_dict': { - 'id': '0_42j6wd36', + 'id': 'inccool8-02-10-2015-163722', 'ext': 'mp4', 'title': 'Inc.Cool8', 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', 'thumbnail': 're:^https?://.*', - 'uploader_id': 'kdla7pillole@iltrovatore.it', - 'timestamp': 1443814869, 'upload_date': '20151002', }, }, { 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', 'only_matching': True, }] + _HOST = 'https://awsvodpkg.iltrovatore.it' + + def _generate_mp4_url(self, quality, m3u8_formats): + for f in m3u8_formats: + if f['vcodec'] != 'none' and quality in f['url']: + http_url = '%s%s.mp4' % (self._HOST, quality) + + urlh = self._request_webpage( + HEADRequest(http_url), quality, + note='Check filesize', fatal=False) + if urlh: + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': http_f['format_id'].replace('hls-', 'https-'), + 'url': http_url, + 'protocol': 'https', + 'filesize_approx': int_or_none(urlh.headers.get('Content-Length', None)), + }) + return http_f + return None def _real_extract(self, url): video_id = self._match_id(url) @@ -46,22 +66,30 @@ class LA7IE(InfoExtractor): url = '%s//%s' % (self.http_scheme(), url) webpage = self._download_webpage(url, video_id) + video_path = self._search_regex(r'(/content/.*?).mp4', webpage, 'video_path') - player_data = self._search_regex( - [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'], - webpage, 'player data') - vid = self._search_regex(r'vid\s*:\s*"(.+?)",', player_data, 'vid') + formats = self._extract_mpd_formats( + 
f'{self._HOST}/local/dash/,{video_path}.mp4.urlset/manifest.mpd', + video_id, mpd_id='dash', fatal=False) + m3u8_formats = self._extract_m3u8_formats( + f'{self._HOST}/local/hls/,{video_path}.mp4.urlset/master.m3u8', + video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(m3u8_formats) + + for q in filter(None, video_path.split(',')): + http_f = self._generate_mp4_url(q, m3u8_formats) + if http_f: + formats.append(http_f) + + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'url': smuggle_url('kaltura:103:%s' % vid, { - 'service_url': 'http://nkdam.iltrovatore.it', - }), 'id': video_id, 'title': self._og_search_title(webpage, default=None), 'description': self._og_search_description(webpage, default=None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), - 'ie_key': 'Kaltura', + 'formats': formats, + 'upload_date': unified_strdate(self._search_regex(r'datetime="(.+?)"', webpage, 'upload_date', fatal=False)) } diff --git a/yt_dlp/extractor/lego.py b/yt_dlp/extractor/lego.py index b9d8b167c..901f43bcf 100644 --- a/yt_dlp/extractor/lego.py +++ b/yt_dlp/extractor/lego.py @@ -8,6 +8,7 @@ from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, + join_nonempty, qualities, ) @@ -102,12 +103,8 @@ class LEGOIE(InfoExtractor): m3u8_id=video_source_format, fatal=False)) else: video_source_quality = video_source.get('Quality') - format_id = [] - for v in (video_source_format, video_source_quality): - if v: - format_id.append(v) f = { - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(video_source_format, video_source_quality), 'quality': q(video_source_quality), 'url': video_source_url, } diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index 3ce906e2f..bd76ae166 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -6,18 +6,54 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, + extract_attributes, 
ExtractorError, float_or_none, + get_element_by_class, int_or_none, srt_subtitles_timecode, + strip_or_none, + mimetype2ext, try_get, urlencode_postdata, urljoin, ) -class LinkedInLearningBaseIE(InfoExtractor): +class LinkedInBaseIE(InfoExtractor): _NETRC_MACHINE = 'linkedin' + _logged_in = False + + def _real_initialize(self): + if self._logged_in: + return + email, password = self._get_login_info() + if email is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') + action_url = urljoin(self._LOGIN_URL, self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', + default='https://www.linkedin.com/uas/login-submit', group='url')) + data = self._hidden_inputs(login_page) + data.update({ + 'session_key': email, + 'session_password': password, + }) + login_submit_page = self._download_webpage( + action_url, None, 'Logging in', + data=urlencode_postdata(data)) + error = self._search_regex( + r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>', + login_submit_page, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + LinkedInBaseIE._logged_in = True + + +class LinkedInLearningBaseIE(LinkedInBaseIE): _LOGIN_URL = 'https://www.linkedin.com/uas/login?trk=learning' def _call_api(self, course_slug, fields, video_slug=None, resolution=None): @@ -34,6 +70,8 @@ class LinkedInLearningBaseIE(InfoExtractor): }) sub = ' %dp' % resolution api_url = 'https://www.linkedin.com/learning-api/detailedCourses' + if not self._get_cookies(api_url).get('JSESSIONID'): + self.raise_login_required() return self._download_json( api_url, video_slug, 'Downloading%s JSON metadata' % sub, headers={ 'Csrf-Token': self._get_cookies(api_url)['JSESSIONID'].value, @@ -49,29 +87,47 @@ class LinkedInLearningBaseIE(InfoExtractor): def _get_video_id(self, video_data, course_slug, video_slug): return self._get_urn_id(video_data) or '%s/%s' % (course_slug, video_slug) - def _real_initialize(self): - 
email, password = self._get_login_info() - if email is None: - return - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - action_url = urljoin(self._LOGIN_URL, self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, 'post url', - default='https://www.linkedin.com/uas/login-submit', group='url')) - data = self._hidden_inputs(login_page) - data.update({ - 'session_key': email, - 'session_password': password, - }) - login_submit_page = self._download_webpage( - action_url, None, 'Logging in', - data=urlencode_postdata(data)) - error = self._search_regex( - r'<span[^>]+class="error"[^>]*>\s*(.+?)\s*</span>', - login_submit_page, 'error', default=None) - if error: - raise ExtractorError(error, expected=True) +class LinkedInIE(LinkedInBaseIE): + _VALID_URL = r'https?://(?:www\.)?linkedin\.com/posts/.+?(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.linkedin.com/posts/mishalkhawaja_sendinblueviews-toronto-digitalmarketing-ugcPost-6850898786781339649-mM20', + 'info_dict': { + 'id': '6850898786781339649', + 'ext': 'mp4', + 'title': 'Mishal K. on LinkedIn: #sendinblueviews #toronto #digitalmarketing', + 'description': 'md5:be125430bab1c574f16aeb186a4d5b19', + 'creator': 'Mishal K.' 
+ }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, 'title') + description = clean_html(get_element_by_class('share-update-card__update-text', webpage)) + like_count = int_or_none(get_element_by_class('social-counts-reactions__social-counts-numRections', webpage)) + creator = strip_or_none(clean_html(get_element_by_class('comment__actor-name', webpage))) + + sources = self._parse_json(extract_attributes(self._search_regex(r'(<video[^>]+>)', webpage, 'video'))['data-sources'], video_id) + formats = [{ + 'url': source['src'], + 'ext': mimetype2ext(source.get('type')), + 'tbr': float_or_none(source.get('data-bitrate'), scale=1000), + } for source in sources] + + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'like_count': like_count, + 'creator': creator, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': description, + } class LinkedInLearningIE(LinkedInLearningBaseIE): @@ -102,7 +158,6 @@ class LinkedInLearningIE(LinkedInLearningBaseIE): def _real_extract(self, url): course_slug, video_slug = self._match_valid_url(url).groups() - video_data = None formats = [] for width, height in ((640, 360), (960, 540), (1280, 720)): video_data = self._call_api( diff --git a/yt_dlp/extractor/mdr.py b/yt_dlp/extractor/mdr.py index 0bdd62693..3ca174c2b 100644 --- a/yt_dlp/extractor/mdr.py +++ b/yt_dlp/extractor/mdr.py @@ -2,13 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( determine_ext, int_or_none, + join_nonempty, parse_duration, parse_iso8601, url_or_none, @@ -148,13 +146,9 @@ class MDRIE(InfoExtractor): abr = int_or_none(xpath_text(asset, './bitrateAudio', 'abr'), 1000) filesize = int_or_none(xpath_text(asset, 
'./fileSize', 'file size')) - format_id = [media_type] - if vbr or abr: - format_id.append(compat_str(vbr or abr)) - f = { 'url': video_url, - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty(media_type, vbr or abr), 'filesize': filesize, 'abr': abr, 'vbr': vbr, diff --git a/yt_dlp/extractor/mediaklikk.py b/yt_dlp/extractor/mediaklikk.py index b9b6d739f..18ff3befa 100644 --- a/yt_dlp/extractor/mediaklikk.py +++ b/yt_dlp/extractor/mediaklikk.py @@ -12,8 +12,8 @@ from ..compat import ( class MediaKlikkIE(InfoExtractor): - _VALID_URL = r'''(?x)^https?:\/\/(?:www\.)? - (?:mediaklikk|m4sport|hirado|petofilive)\.hu\/.*?videok?\/ + _VALID_URL = r'''(?x)https?://(?:www\.)? + (?:mediaklikk|m4sport|hirado|petofilive)\.hu/.*?(?:videok?|cikk)/ (?:(?P<year>[0-9]{4})/(?P<month>[0-9]{1,2})/(?P<day>[0-9]{1,2})/)? (?P<id>[^/#?_]+)''' diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py new file mode 100644 index 000000000..a99ddd172 --- /dev/null +++ b/yt_dlp/extractor/mixch.py @@ -0,0 +1,55 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, +) + + +class MixchIE(InfoExtractor): + IE_NAME = 'mixch' + _VALID_URL = r'https?://(?:www\.)?mixch\.tv/u/(?P<id>\d+)' + + TESTS = [{ + 'url': 'https://mixch.tv/u/16236849/live', + 'skip': 'don\'t know if this live persists', + 'info_dict': { + 'id': '16236849', + 'title': '24配信シェア⭕️投票🙏💦', + 'comment_count': 13145, + 'view_count': 28348, + 'timestamp': 1636189377, + 'uploader': '🦥伊咲👶🏻#フレアワ', + 'uploader_id': '16236849', + } + }, { + 'url': 'https://mixch.tv/u/16137876/live', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://mixch.tv/u/{video_id}/live', video_id) + + initial_js_state = self._parse_json(self._search_regex( + r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id) + if not 
initial_js_state.get('liveInfo'): + raise ExtractorError('Livestream has ended.', expected=True) + + return { + 'id': video_id, + 'title': traverse_obj(initial_js_state, ('liveInfo', 'title')), + 'comment_count': traverse_obj(initial_js_state, ('liveInfo', 'comments')), + 'view_count': traverse_obj(initial_js_state, ('liveInfo', 'visitor')), + 'timestamp': traverse_obj(initial_js_state, ('liveInfo', 'created')), + 'uploader': traverse_obj(initial_js_state, ('broadcasterInfo', 'name')), + 'uploader_id': video_id, + 'formats': [{ + 'format_id': 'hls', + 'url': traverse_obj(initial_js_state, ('liveInfo', 'hls')) or 'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_%s.m3u8' % video_id, + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + 'is_live': True, + } diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py index 141dd7deb..be5de0a70 100644 --- a/yt_dlp/extractor/mtv.py +++ b/yt_dlp/extractor/mtv.py @@ -15,6 +15,7 @@ from ..utils import ( float_or_none, HEADRequest, int_or_none, + join_nonempty, RegexNotFoundError, sanitized_Request, strip_or_none, @@ -99,9 +100,9 @@ class MTVServicesInfoExtractor(InfoExtractor): formats.extend([{ 'ext': 'flv' if rtmp_video_url.startswith('rtmp') else ext, 'url': rtmp_video_url, - 'format_id': '-'.join(filter(None, [ + 'format_id': join_nonempty( 'rtmp' if rtmp_video_url.startswith('rtmp') else None, - rendition.get('bitrate')])), + rendition.get('bitrate')), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), }]) @@ -306,20 +307,22 @@ class MTVServicesInfoExtractor(InfoExtractor): mgid = self._extract_triforce_mgid(webpage) if not mgid: - mgid = self._search_regex( - r'"videoConfig":{"videoId":"(mgid:.*?)"', webpage, 'mgid', default=None) - - if not mgid: - mgid = self._search_regex( - r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None) - - if not mgid: data = self._parse_json(self._search_regex( r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) main_container = 
self._extract_child_with_type(data, 'MainContainer') ab_testing = self._extract_child_with_type(main_container, 'ABTesting') video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer') - mgid = video_player['props']['media']['video']['config']['uri'] + if video_player: + mgid = try_get(video_player, lambda x: x['props']['media']['video']['config']['uri']) + else: + flex_wrapper = self._extract_child_with_type(ab_testing or main_container, 'FlexWrapper') + auth_suite_wrapper = self._extract_child_with_type(flex_wrapper, 'AuthSuiteWrapper') + player = self._extract_child_with_type(auth_suite_wrapper or flex_wrapper, 'Player') + if player: + mgid = try_get(player, lambda x: x['props']['videoDetail']['mgid']) + + if not mgid: + raise ExtractorError('Could not extract mgid') return mgid diff --git a/yt_dlp/extractor/n1.py b/yt_dlp/extractor/n1.py index 7a09c6779..fdb7f32db 100644 --- a/yt_dlp/extractor/n1.py +++ b/yt_dlp/extractor/n1.py @@ -3,8 +3,6 @@ from __future__ import unicode_literals import re -from .youtube import YoutubeIE -from .reddit import RedditRIE from .common import InfoExtractor from ..utils import ( unified_timestamp, @@ -40,7 +38,7 @@ class N1InfoAssetIE(InfoExtractor): class N1InfoIIE(InfoExtractor): IE_NAME = 'N1Info:article' - _VALID_URL = r'https?://(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)/(?:[^/]+/){1,2}(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:(?:(?:ba|rs|hr)\.)?n1info\.(?:com|si)|nova\.rs)/(?:[^/]+/){1,2}(?P<id>[^/]+)' _TESTS = [{ # Youtube embedded 'url': 'https://rs.n1info.com/sport-klub/tenis/kako-je-djokovic-propustio-istorijsku-priliku-video/', @@ -90,10 +88,18 @@ class N1InfoIIE(InfoExtractor): 'uploader': 'YouLotWhatDontStop', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { + 'url': 'https://nova.rs/vesti/politika/zaklina-tatalovic-ani-brnabic-pricate-lazi-video/', + 'info_dict': { + 'id': 'tnjganabrnabicizaklinatatalovic100danavladegp-novas-worldwide', + 'ext': 'mp4', + 'title': 
'Žaklina Tatalović Ani Brnabić: Pričate laži (VIDEO)', + 'upload_date': '20211102', + 'timestamp': 1635861677, + }, + }, { 'url': 'https://hr.n1info.com/vijesti/pravobraniteljica-o-ubojstvu-u-zagrebu-radi-se-o-doista-nezapamcenoj-situaciji/', 'only_matching': True, }] @@ -116,16 +122,16 @@ class N1InfoIIE(InfoExtractor): 'title': title, 'thumbnail': video_data.get('data-thumbnail'), 'timestamp': timestamp, - 'ie_key': N1InfoAssetIE.ie_key()}) + 'ie_key': 'N1InfoAsset'}) embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) for embedded_video in embedded_videos: video_data = extract_attributes(embedded_video) - url = video_data.get('src') + url = video_data.get('src') or '' if url.startswith('https://www.youtube.com'): - entries.append(self.url_result(url, ie=YoutubeIE.ie_key())) + entries.append(self.url_result(url, ie='Youtube')) elif url.startswith('https://www.redditmedia.com'): - entries.append(self.url_result(url, ie=RedditRIE.ie_key())) + entries.append(self.url_result(url, ie='RedditR')) return { '_type': 'playlist', diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index 9698a358e..d235805c3 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -1,22 +1,163 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools import json import time +import urllib -from urllib.error import HTTPError -from .common import InfoExtractor -from ..compat import compat_str, compat_urllib_parse_unquote, compat_urllib_parse_quote from ..utils import ( ExtractorError, parse_iso8601, try_get, - urljoin, ) +from .common import InfoExtractor + + +class NebulaBaseIE(InfoExtractor): + _NETRC_MACHINE = 'watchnebula' + + _nebula_api_token = None + _nebula_bearer_token = None + _zype_access_token = None + + def _perform_nebula_auth(self): + username, password = self._get_login_info() + if not (username and password): + self.raise_login_required() + + data = json.dumps({'email': username, 'password': 
password}).encode('utf8') + response = self._download_json( + 'https://api.watchnebula.com/api/v1/auth/login/', + data=data, fatal=False, video_id=None, + headers={ + 'content-type': 'application/json', + # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint + 'cookie': '' + }, + note='Logging in to Nebula with supplied credentials', + errnote='Authentication failed or rejected') + if not response or not response.get('key'): + self.raise_login_required() + + # save nebula token as cookie + self._set_cookie( + 'nebula.app', 'nebula-auth', + urllib.parse.quote( + json.dumps({ + "apiToken": response["key"], + "isLoggingIn": False, + "isLoggingOut": False, + }, separators=(",", ":"))), + expire_time=int(time.time()) + 86400 * 365, + ) + + return response['key'] + + def _retrieve_nebula_api_token(self): + """ + Check cookie jar for valid token. Try to authenticate using credentials if no valid token + can be found in the cookie jar. + """ + nebula_cookies = self._get_cookies('https://nebula.app') + nebula_cookie = nebula_cookies.get('nebula-auth') + if nebula_cookie: + self.to_screen('Authenticating to Nebula with token from cookie jar') + nebula_cookie_value = urllib.parse.unquote(nebula_cookie.value) + nebula_api_token = self._parse_json(nebula_cookie_value, None).get('apiToken') + if nebula_api_token: + return nebula_api_token + + return self._perform_nebula_auth() + def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''): + assert method in ('GET', 'POST',) + assert auth_type in ('api', 'bearer',) -class NebulaIE(InfoExtractor): + def inner_call(): + authorization = f'Token {self._nebula_api_token}' if auth_type == 'api' else f'Bearer {self._nebula_bearer_token}' + return self._download_json( + url, video_id, note=note, headers={'Authorization': authorization}, + data=b'' if method == 'POST' else None) + + try: + return inner_call() + except ExtractorError as exc: + # if 401 or 403, attempt credential re-auth and 
retry + if exc.cause and isinstance(exc.cause, urllib.error.HTTPError) and exc.cause.code in (401, 403): + self.to_screen(f'Reauthenticating to Nebula and retrying, because last {auth_type} call resulted in error {exc.cause.code}') + self._login() + return inner_call() + else: + raise + + def _fetch_nebula_bearer_token(self): + """ + Get a Bearer token for the Nebula API. This will be required to fetch video meta data. + """ + response = self._call_nebula_api('https://api.watchnebula.com/api/v1/authorization/', + method='POST', + note='Authorizing to Nebula') + return response['token'] + def _fetch_zype_access_token(self): + """ + Get a Zype access token, which is required to access video streams -- in our case: to + generate video URLs. + """ + user_object = self._call_nebula_api('https://api.watchnebula.com/api/v1/auth/user/', note='Retrieving Zype access token') + + access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], str) + if not access_token: + if try_get(user_object, lambda x: x['is_subscribed'], bool): + # TODO: Reimplement the same Zype token polling the Nebula frontend implements + # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 + raise ExtractorError( + 'Unable to extract Zype access token from Nebula API authentication endpoint. 
' + 'Open an arbitrary video in a browser with this account to generate a token', + expected=True) + raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') + return access_token + + def _build_video_info(self, episode): + zype_id = episode['zype_id'] + zype_video_url = f'https://player.zype.com/embed/{zype_id}.html?access_token={self._zype_access_token}' + channel_slug = episode['channel_slug'] + return { + 'id': episode['zype_id'], + 'display_id': episode['slug'], + '_type': 'url_transparent', + 'ie_key': 'Zype', + 'url': zype_video_url, + 'title': episode['title'], + 'description': episode['description'], + 'timestamp': parse_iso8601(episode['published_at']), + 'thumbnails': [{ + # 'id': tn.get('name'), # this appears to be null + 'url': tn['original'], + 'height': key, + } for key, tn in episode['assets']['thumbnail'].items()], + 'duration': episode['duration'], + 'channel': episode['channel_title'], + 'channel_id': channel_slug, + 'channel_url': f'https://nebula.app/{channel_slug}', + 'uploader': episode['channel_title'], + 'uploader_id': channel_slug, + 'uploader_url': f'https://nebula.app/{channel_slug}', + 'series': episode['channel_title'], + 'creator': episode['channel_title'], + } + + def _login(self): + self._nebula_api_token = self._retrieve_nebula_api_token() + self._nebula_bearer_token = self._fetch_nebula_bearer_token() + self._zype_access_token = self._fetch_zype_access_token() + + def _real_initialize(self): + self._login() + + +class NebulaIE(NebulaBaseIE): _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)' _TESTS = [ { @@ -30,12 +171,13 @@ class NebulaIE(InfoExtractor): 'upload_date': '20180731', 'timestamp': 1533009600, 'channel': 'Lindsay Ellis', + 'channel_id': 'lindsayellis', 'uploader': 'Lindsay Ellis', + 'uploader_id': 'lindsayellis', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 
'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', @@ -47,13 +189,14 @@ class NebulaIE(InfoExtractor): 'description': r're:^In this episode we explore the unsung heroes of D-Day, the landing craft.', 'upload_date': '20200327', 'timestamp': 1585348140, - 'channel': 'The Logistics of D-Day', - 'uploader': 'The Logistics of D-Day', + 'channel': 'Real Engineering', + 'channel_id': 'realengineering', + 'uploader': 'Real Engineering', + 'uploader_id': 'realengineering', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 'https://nebula.app/videos/money-episode-1-the-draw', @@ -66,173 +209,82 @@ class NebulaIE(InfoExtractor): 'upload_date': '20200323', 'timestamp': 1584980400, 'channel': 'Tom Scott Presents: Money', + 'channel_id': 'tom-scott-presents-money', 'uploader': 'Tom Scott Presents: Money', + 'uploader_id': 'tom-scott-presents-money', }, 'params': { 'usenetrc': True, }, - 'skip': 'All Nebula content requires authentication', }, { 'url': 'https://watchnebula.com/videos/money-episode-1-the-draw', 'only_matching': True, }, ] - _NETRC_MACHINE = 'watchnebula' - _nebula_token = None + def _fetch_video_metadata(self, slug): + return self._call_nebula_api(f'https://content.watchnebula.com/video/{slug}/', + video_id=slug, + auth_type='bearer', + note='Fetching video meta data') - def _retrieve_nebula_auth(self): - """ - Log in to Nebula, and returns a Nebula API token - """ + def _real_extract(self, url): + slug = self._match_id(url) + video = self._fetch_video_metadata(slug) + return self._build_video_info(video) - username, password = self._get_login_info() - if not (username and password): - self.raise_login_required() - self.report_login() - data = json.dumps({'email': username, 'password': password}).encode('utf8') - response = self._download_json( - 'https://api.watchnebula.com/api/v1/auth/login/', - data=data, fatal=False, video_id=None, - headers={ - 'content-type': 
'application/json', - # Submitting the 'sessionid' cookie always causes a 403 on auth endpoint - 'cookie': '' +class NebulaCollectionIE(NebulaBaseIE): + IE_NAME = 'nebula:collection' + _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!videos/)(?P<id>[-\w]+)' + _TESTS = [ + { + 'url': 'https://nebula.app/tom-scott-presents-money', + 'info_dict': { + 'id': 'tom-scott-presents-money', + 'title': 'Tom Scott Presents: Money', + 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', }, - note='Authenticating to Nebula with supplied credentials', - errnote='Authentication failed or rejected') - if not response or not response.get('key'): - self.raise_login_required() - - # save nebula token as cookie - self._set_cookie( - 'nebula.app', 'nebula-auth', - compat_urllib_parse_quote( - json.dumps({ - "apiToken": response["key"], - "isLoggingIn": False, - "isLoggingOut": False, - }, separators=(",", ":"))), - expire_time=int(time.time()) + 86400 * 365, - ) - - return response['key'] - - def _retrieve_zype_api_key(self, page_url, display_id): - """ - Retrieves the Zype API key - """ - - # Find the js that has the API key from the webpage and download it - webpage = self._download_webpage(page_url, video_id=display_id) - main_script_relpath = self._search_regex( - r'<script[^>]*src="(?P<script_relpath>[^"]*main.[0-9a-f]*.chunk.js)"[^>]*>', webpage, - group='script_relpath', name='script relative path', fatal=True) - main_script_abspath = urljoin(page_url, main_script_relpath) - main_script = self._download_webpage(main_script_abspath, video_id=display_id, - note='Retrieving Zype API key') - - api_key = self._search_regex( - r'REACT_APP_ZYPE_API_KEY\s*:\s*"(?P<api_key>[\w-]*)"', main_script, - group='api_key', name='API key', fatal=True) - - return api_key - - def _call_zype_api(self, path, params, video_id, api_key, note): - """ - A helper for making calls to the Zype API. 
- """ - query = {'api_key': api_key, 'per_page': 1} - query.update(params) - return self._download_json('https://api.zype.com' + path, video_id, query=query, note=note) - - def _call_nebula_api(self, path, video_id, access_token, note): - """ - A helper for making calls to the Nebula API. - """ - return self._download_json('https://api.watchnebula.com/api/v1' + path, video_id, headers={ - 'Authorization': 'Token {access_token}'.format(access_token=access_token) - }, note=note) - - def _fetch_zype_access_token(self, video_id): - try: - user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') - except ExtractorError as exc: - # if 401, attempt credential auth and retry - if exc.cause and isinstance(exc.cause, HTTPError) and exc.cause.code == 401: - self._nebula_token = self._retrieve_nebula_auth() - user_object = self._call_nebula_api('/auth/user/', video_id, self._nebula_token, note='Retrieving Zype access token') - else: - raise - - access_token = try_get(user_object, lambda x: x['zype_auth_info']['access_token'], compat_str) - if not access_token: - if try_get(user_object, lambda x: x['is_subscribed'], bool): - # TODO: Reimplement the same Zype token polling the Nebula frontend implements - # see https://github.com/ytdl-org/youtube-dl/pull/24805#issuecomment-749231532 - raise ExtractorError( - 'Unable to extract Zype access token from Nebula API authentication endpoint. 
' - 'Open an arbitrary video in a browser with this account to generate a token', - expected=True) - raise ExtractorError('Unable to extract Zype access token from Nebula API authentication endpoint') - return access_token - - def _extract_channel_title(self, video_meta): - # TODO: Implement the API calls giving us the channel list, - # so that we can do the title lookup and then figure out the channel URL - categories = video_meta.get('categories', []) if video_meta else [] - # the channel name is the value of the first category - for category in categories: - if category.get('value'): - return category['value'][0] - - def _real_initialize(self): - # check cookie jar for valid token - nebula_cookies = self._get_cookies('https://nebula.app') - nebula_cookie = nebula_cookies.get('nebula-auth') - if nebula_cookie: - self.to_screen('Authenticating to Nebula with token from cookie jar') - nebula_cookie_value = compat_urllib_parse_unquote(nebula_cookie.value) - self._nebula_token = self._parse_json(nebula_cookie_value, None).get('apiToken') + 'playlist_count': 5, + 'params': { + 'usenetrc': True, + }, + }, { + 'url': 'https://nebula.app/lindsayellis', + 'info_dict': { + 'id': 'lindsayellis', + 'title': 'Lindsay Ellis', + 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', + }, + 'playlist_mincount': 100, + 'params': { + 'usenetrc': True, + }, + }, + ] - # try to authenticate using credentials if no valid token has been found - if not self._nebula_token: - self._nebula_token = self._retrieve_nebula_auth() + def _generate_playlist_entries(self, collection_id, channel): + episodes = channel['episodes']['results'] + for page_num in itertools.count(2): + for episode in episodes: + yield self._build_video_info(episode) + next_url = channel['episodes']['next'] + if not next_url: + break + channel = self._call_nebula_api(next_url, collection_id, auth_type='bearer', + note=f'Retrieving channel page {page_num}') + episodes = 
channel['episodes']['results'] def _real_extract(self, url): - display_id = self._match_id(url) - api_key = self._retrieve_zype_api_key(url, display_id) - - response = self._call_zype_api('/videos', {'friendly_title': display_id}, - display_id, api_key, note='Retrieving metadata from Zype') - if len(response.get('response') or []) != 1: - raise ExtractorError('Unable to find video on Zype API') - video_meta = response['response'][0] - - video_id = video_meta['_id'] - zype_access_token = self._fetch_zype_access_token(display_id) + collection_id = self._match_id(url) + channel_url = f'https://content.watchnebula.com/video/channels/{collection_id}/' + channel = self._call_nebula_api(channel_url, collection_id, auth_type='bearer', note='Retrieving channel') + channel_details = channel['details'] - channel_title = self._extract_channel_title(video_meta) - - return { - 'id': video_id, - 'display_id': display_id, - '_type': 'url_transparent', - 'ie_key': 'Zype', - 'url': 'https://player.zype.com/embed/%s.html?access_token=%s' % (video_id, zype_access_token), - 'title': video_meta.get('title'), - 'description': video_meta.get('description'), - 'timestamp': parse_iso8601(video_meta.get('published_at')), - 'thumbnails': [{ - 'id': tn.get('name'), # this appears to be null - 'url': tn['url'], - 'width': tn.get('width'), - 'height': tn.get('height'), - } for tn in video_meta.get('thumbnails', [])], - 'duration': video_meta.get('duration'), - 'channel': channel_title, - 'uploader': channel_title, # we chose uploader = channel name - # TODO: uploader_url, channel_id, channel_url - } + return self.playlist_result( + entries=self._generate_playlist_entries(collection_id, channel), + playlist_id=collection_id, + playlist_title=channel_details['title'], + playlist_description=channel_details['description'] + ) diff --git a/yt_dlp/extractor/newgrounds.py b/yt_dlp/extractor/newgrounds.py index bbbd9e8ee..1e1274ef0 100644 --- a/yt_dlp/extractor/newgrounds.py +++ 
b/yt_dlp/extractor/newgrounds.py @@ -6,7 +6,9 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, extract_attributes, + get_element_by_id, int_or_none, parse_count, parse_duration, @@ -29,7 +31,8 @@ class NewgroundsIE(InfoExtractor): 'timestamp': 1378878540, 'upload_date': '20130911', 'duration': 143, - 'description': 'md5:6d885138814015dfd656c2ddb00dacfc', + 'view_count': int, + 'description': 'md5:b8b3c2958875189f07d8e313462e8c4f', }, }, { 'url': 'https://www.newgrounds.com/portal/view/1', @@ -41,6 +44,7 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'Brian-Beaton', 'timestamp': 955064100, 'upload_date': '20000406', + 'view_count': int, 'description': 'Scrotum plays "catch."', 'age_limit': 17, }, @@ -54,7 +58,8 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'ZONE-SAMA', 'timestamp': 1487965140, 'upload_date': '20170224', - 'description': 'ZTV News Episode 8 (February 2017)', + 'view_count': int, + 'description': 'md5:aff9b330ec2e78ed93b1ad6d017accc6', 'age_limit': 17, }, 'params': { @@ -70,7 +75,8 @@ class NewgroundsIE(InfoExtractor): 'uploader': 'Egoraptor', 'timestamp': 1140663240, 'upload_date': '20060223', - 'description': 'Metal Gear is awesome is so is this movie.', + 'view_count': int, + 'description': 'md5:9246c181614e23754571995104da92e0', 'age_limit': 13, } }, { @@ -80,7 +86,7 @@ class NewgroundsIE(InfoExtractor): 'id': '297383', 'ext': 'swf', 'title': 'Metal Gear Awesome', - 'description': 'Metal Gear is awesome is so is this movie.', + 'description': 'Metal Gear Awesome', 'uploader': 'Egoraptor', 'upload_date': '20060223', 'timestamp': 1140663240, @@ -145,10 +151,13 @@ class NewgroundsIE(InfoExtractor): (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)', r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp', default=None)) + duration = parse_duration(self._html_search_regex( r'"duration"\s*:\s*["\']?(\d+)["\']?', webpage, 'duration', default=None)) + description = 
clean_html(get_element_by_id('author_comments', webpage)) or self._og_search_description(webpage) + view_count = parse_count(self._html_search_regex( r'(?s)<dt>\s*(?:Views|Listens)\s*</dt>\s*<dd>([\d\.,]+)</dd>', webpage, 'view count', default=None)) @@ -177,7 +186,7 @@ class NewgroundsIE(InfoExtractor): 'duration': duration, 'formats': formats, 'thumbnail': self._og_search_thumbnail(webpage), - 'description': self._og_search_description(webpage), + 'description': description, 'age_limit': age_limit, 'view_count': view_count, } diff --git a/yt_dlp/extractor/nexx.py b/yt_dlp/extractor/nexx.py index 860d636e2..8aceebd49 100644 --- a/yt_dlp/extractor/nexx.py +++ b/yt_dlp/extractor/nexx.py @@ -385,8 +385,7 @@ class NexxIE(InfoExtractor): elif cdn == 'free': formats = self._extract_free_formats(video, video_id) else: - # TODO: reverse more cdns - assert False + self.raise_no_formats(f'{cdn} formats are currently not supported', video_id) self._sort_formats(formats) @@ -427,7 +426,6 @@ class NexxEmbedIE(InfoExtractor): 'upload_date': '20140305', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 950a3d0d4..4998fed83 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -73,6 +73,7 @@ class NhkBaseIE(InfoExtractor): m3u8_id='hls', fatal=False) for f in info['formats']: f['language'] = lang + self._sort_formats(info['formats']) else: info.update({ '_type': 'url_transparent', diff --git a/yt_dlp/extractor/ninecninemedia.py b/yt_dlp/extractor/ninecninemedia.py index 4aaf21a12..781842721 100644 --- a/yt_dlp/extractor/ninecninemedia.py +++ b/yt_dlp/extractor/ninecninemedia.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -99,3 +98,37 @@ class NineCNineMediaIE(InfoExtractor): } return info + + +class CPTwentyFourIE(InfoExtractor): + IE_NAME = 'cp24' + _GEO_COUNTRIES = ['CA'] 
+ _VALID_URL = r'https?://(?:www\.)?cp24\.com/news/(?P<id>[^?#]+)' + + _TESTS = [{ + 'url': 'https://www.cp24.com/news/video-shows-atm-being-ripped-out-of-business-by-pickup-truck-driver-in-mississauga-1.5676877', + 'info_dict': { + 'id': '2328005', + 'ext': 'mp4', + 'title': 'WATCH: Truck rips ATM from Mississauga business', + 'description': 'md5:cf7498480885f080a754389a2b2f7073', + 'timestamp': 1637618377, + 'episode_number': None, + 'season': 'Season 0', + 'season_number': 0, + 'season_id': 57974, + 'series': 'CTV News Toronto', + 'duration': 26.86, + 'thumbnail': 'http://images2.9c9media.com/image_asset/2014_11_5_2eb609a0-475b-0132-fbd6-34b52f6f1279_jpg_2000x1125.jpg', + 'upload_date': '20211122', + }, + 'params': {'skip_download': True, 'format': 'bv'} + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + id, destination = self._search_regex( + r'getAuthStates\("(?P<id>[^"]+)",\s?"(?P<destination>[^"]+)"\);', + webpage, 'video id and destination', group=('id', 'destination')) + return self.url_result(f'9c9media:{destination}:{id}', ie=NineCNineMediaIE.ie_key(), video_id=id) diff --git a/yt_dlp/extractor/nova.py b/yt_dlp/extractor/nova.py index 3acb88121..0007b6b12 100644 --- a/yt_dlp/extractor/nova.py +++ b/yt_dlp/extractor/nova.py @@ -10,6 +10,7 @@ from ..utils import ( int_or_none, js_to_json, qualities, + traverse_obj, unified_strdate, url_or_none, ) @@ -17,30 +18,44 @@ from ..utils import ( class NovaEmbedIE(InfoExtractor): _VALID_URL = r'https?://media\.cms\.nova\.cz/embed/(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://media.cms.nova.cz/embed/8o0n0r?autoplay=1', - 'md5': 'ee009bafcc794541570edd44b71cbea3', 'info_dict': { 'id': '8o0n0r', - 'ext': 'mp4', 'title': '2180. 
díl', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2578, }, - } + 'params': { + 'skip_download': True, + 'ignore_no_formats_error': True, + }, + 'expected_warnings': ['DRM protected', 'Requested format is not available'], + }, { + 'url': 'https://media.cms.nova.cz/embed/KybpWYvcgOa', + 'info_dict': { + 'id': 'KybpWYvcgOa', + 'ext': 'mp4', + 'title': 'Borhyová oslavila 60? Soutěžící z pořadu odboural moderátora Ondřeje Sokola', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 114, + }, + 'params': {'skip_download': 'm3u8'}, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + has_drm = False duration = None formats = [] player = self._parse_json( self._search_regex( - r'Player\.init\s*\([^,]+,\s*(?:\w+\s*\?\s*{.+?}\s*:\s*)?({.+})\s*,\s*{.+?}\s*\)\s*;', - webpage, 'player', default='{}'), video_id, fatal=False) + r'Player\.init\s*\([^,]+,(?P<cndn>\s*\w+\s*\?)?\s*(?P<json>{(?(cndn).+?|.+)})\s*(?(cndn):|,\s*{.+?}\s*\)\s*;)', + webpage, 'player', default='{}', group='json'), video_id, fatal=False) if player: for format_id, format_list in player['tracks'].items(): if not isinstance(format_list, list): @@ -48,6 +63,10 @@ class NovaEmbedIE(InfoExtractor): for format_dict in format_list: if not isinstance(format_dict, dict): continue + if (not self.get_param('allow_unplayable_formats') + and traverse_obj(format_dict, ('drm', 'keySystem'))): + has_drm = True + continue format_url = url_or_none(format_dict.get('src')) format_type = format_dict.get('type') ext = determine_ext(format_url) @@ -104,6 +123,8 @@ class NovaEmbedIE(InfoExtractor): f['format_id'] = f_id formats.append(f) + if not formats and has_drm: + self.report_drm(video_id) self._sort_formats(formats) title = self._og_search_title( diff --git a/yt_dlp/extractor/nrl.py b/yt_dlp/extractor/nrl.py index 22a2df8d3..0bd5086ae 100644 --- a/yt_dlp/extractor/nrl.py +++ b/yt_dlp/extractor/nrl.py @@ -16,7 +16,6 @@ class NRLTVIE(InfoExtractor): 'params': 
{ # m3u8 download 'skip_download': True, - 'format': 'bestvideo', }, } diff --git a/yt_dlp/extractor/olympics.py b/yt_dlp/extractor/olympics.py index 0bc9206ed..0aad836fa 100644 --- a/yt_dlp/extractor/olympics.py +++ b/yt_dlp/extractor/olympics.py @@ -2,22 +2,26 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + int_or_none, + try_get +) class OlympicsReplayIE(InfoExtractor): - _VALID_URL = r'(?:https?://)(?:www\.)?olympics\.com/tokyo-2020/(?:[a-z]{2}/)?replay/(?P<id>[^/#&?]+)' + _VALID_URL = r'https?://(?:www\.)?olympics\.com(?:/tokyo-2020)?/[a-z]{2}/(?:replay|video)/(?P<id>[^/#&?]+)' _TESTS = [{ - 'url': 'https://olympics.com/tokyo-2020/en/replay/300622eb-abc0-43ea-b03b-c5f2d429ec7b/jumping-team-qualifier', + 'url': 'https://olympics.com/fr/video/men-s-109kg-group-a-weightlifting-tokyo-2020-replays', 'info_dict': { - 'id': '300622eb-abc0-43ea-b03b-c5f2d429ec7b', + 'id': 'f6a0753c-8e6f-4b7d-a435-027054a4f8e9', 'ext': 'mp4', - 'title': 'Jumping Team Qualifier', - 'release_date': '20210806', - 'upload_date': '20210713', + 'title': '+109kg (H) Groupe A - Haltérophilie | Replay de Tokyo 2020', + 'upload_date': '20210801', + 'timestamp': 1627783200, + 'description': 'md5:c66af4a5bc7429dbcc43d15845ff03b3', }, 'params': { - 'format': 'bv', + 'skip_download': True, }, }, { 'url': 'https://olympics.com/tokyo-2020/en/replay/bd242924-4b22-49a5-a846-f1d4c809250d/mens-bronze-medal-match-hun-esp', @@ -26,31 +30,41 @@ class OlympicsReplayIE(InfoExtractor): def _real_extract(self, url): id = self._match_id(url) - # The parameters are hardcoded in the webpage, it's not necessary to download the webpage just for these parameters. - # If in downloading webpage serves other functions aswell, then extract these parameters from it. 
- token_url = 'https://appovptok.ovpobs.tv/api/identity/app/token?api_key=OTk5NDcxOjpvY3N3LWFwaXVzZXI%3D&api_secret=ODY4ODM2MjE3ODMwYmVjNTAxMWZlMDJiMTYxZmY0MjFiMjMwMjllMjJmNDA1YWRiYzA5ODcxYTZjZTljZDkxOTo6NTM2NWIzNjRlMTM1ZmI2YWNjNmYzMGMzOGM3NzZhZTY%3D' - token = self._download_webpage(token_url, id) - headers = {'x-obs-app-token': token} - data_json = self._download_json(f'https://appocswtok.ovpobs.tv/api/schedule-sessions/{id}?include=stream', - id, headers=headers) - meta_data = data_json['data']['attributes'] - for t_dict in data_json['included']: - if t_dict.get('type') == 'Stream': - stream_data = t_dict['attributes'] + + webpage = self._download_webpage(url, id) + title = self._html_search_meta(('title', 'og:title', 'twitter:title'), webpage) + uuid = self._html_search_meta('episode_uid', webpage) + m3u8_url = self._html_search_meta('video_url', webpage) + json_ld = self._search_json_ld(webpage, uuid) + thumbnails_list = json_ld.get('image') + if not thumbnails_list: + thumbnails_list = self._html_search_regex( + r'["\']image["\']:\s*["\']([^"\']+)["\']', webpage, 'images', default='') + thumbnails_list = thumbnails_list.replace('[', '').replace(']', '').split(',') + thumbnails_list = [thumbnail.strip() for thumbnail in thumbnails_list] + thumbnails = [] + for thumbnail in thumbnails_list: + width_a, height_a, width = self._search_regex( + r'/images/image/private/t_(?P<width_a>\d+)-(?P<height_a>\d+)_(?P<width>\d+)/primary/[\W\w\d]+', + thumbnail, 'thumb', group=(1, 2, 3), default=(None, None, None)) + width_a, height_a, width = int_or_none(width_a), int_or_none(height_a), int_or_none(width) + thumbnails.append({ + 'url': thumbnail, + 'width': width, + 'height': int_or_none(try_get(width, lambda x: x * height_a / width_a)) + }) m3u8_url = self._download_json( - 'https://meteringtok.ovpobs.tv/api/playback-sessions', id, headers=headers, query={ - 'alias': stream_data['alias'], - 'stream': stream_data['stream'], - 'type': 'vod' - })['data']['attributes']['url'] - 
formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + f'https://olympics.com/tokenGenerator?url={m3u8_url}', uuid, note='Downloading m3u8 url') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, m3u8_id='hls') self._sort_formats(formats) return { - 'id': id, - 'title': meta_data['title'], - 'release_date': unified_strdate(meta_data.get('start') or meta_data.get('broadcastPublished')), - 'upload_date': unified_strdate(meta_data.get('publishedAt')), + 'id': uuid, + 'title': title, + 'timestamp': json_ld.get('timestamp'), + 'description': json_ld.get('description'), + 'thumbnails': thumbnails, + 'duration': json_ld.get('duration'), 'formats': formats, 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/onefootball.py b/yt_dlp/extractor/onefootball.py new file mode 100644 index 000000000..79501003d --- /dev/null +++ b/yt_dlp/extractor/onefootball.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class OneFootballIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?onefootball\.com/[a-z]{2}/video/[^/&?#]+-(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://onefootball.com/en/video/highlights-fc-zuerich-3-3-fc-basel-34012334', + 'info_dict': { + 'id': '34012334', + 'ext': 'mp4', + 'title': 'Highlights: FC Zürich 3-3 FC Basel', + 'description': 'md5:33d9855cb790702c4fe42a513700aba8', + 'thumbnail': 'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34012334', + 'timestamp': 1635874604, + 'upload_date': '20211102' + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://onefootball.com/en/video/klopp-fumes-at-var-decisions-in-west-ham-defeat-34041020', + 'info_dict': { + 'id': '34041020', + 'ext': 'mp4', + 'title': 'Klopp fumes at VAR decisions in West Ham defeat', + 'description': 'md5:9c50371095a01ad3f63311c73d8f51a5', + 'thumbnail': 
'https://photobooth-api.onefootball.com/api/screenshot/https:%2F%2Fperegrine-api.onefootball.com%2Fv2%2Fphotobooth%2Fcms%2Fen%2F34041020', + 'timestamp': 1636314103, + 'upload_date': '20211107' + }, + 'params': {'skip_download': True} + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + data_json = self._search_json_ld(webpage, id) + m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/.+\.m3u8)', webpage, 'm3u8_url') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) + self._sort_formats(formats) + return { + 'id': id, + 'title': data_json.get('title'), + 'description': data_json.get('description'), + 'thumbnail': data_json.get('thumbnail'), + 'timestamp': data_json.get('timestamp'), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/yt_dlp/extractor/orf.py b/yt_dlp/extractor/orf.py index 428ec97e4..e2b703880 100644 --- a/yt_dlp/extractor/orf.py +++ b/yt_dlp/extractor/orf.py @@ -11,6 +11,7 @@ from ..utils import ( float_or_none, HEADRequest, int_or_none, + join_nonempty, orderedSet, remove_end, str_or_none, @@ -82,12 +83,7 @@ class ORFTVthekIE(InfoExtractor): src = url_or_none(fd.get('src')) if not src: continue - format_id_list = [] - for key in ('delivery', 'quality', 'quality_string'): - value = fd.get(key) - if value: - format_id_list.append(value) - format_id = '-'.join(format_id_list) + format_id = join_nonempty('delivery', 'quality', 'quality_string', from_dict=fd) ext = determine_ext(src) if ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( diff --git a/yt_dlp/extractor/paramountplus.py b/yt_dlp/extractor/paramountplus.py index 338b84d5b..17138985a 100644 --- a/yt_dlp/extractor/paramountplus.py +++ b/yt_dlp/extractor/paramountplus.py @@ -60,7 +60,6 @@ class ParamountPlusIE(CBSBaseIE): }, 'params': { 'skip_download': 'm3u8', - 'format': 'bestvideo', }, 'expected_warnings': ['Ignoring subtitle tracks'], # TODO: Investigate this }, 
{ @@ -76,7 +75,6 @@ class ParamountPlusIE(CBSBaseIE): }, 'params': { 'skip_download': 'm3u8', - 'format': 'bestvideo', }, 'expected_warnings': ['Ignoring subtitle tracks'], }, { diff --git a/yt_dlp/extractor/parliamentliveuk.py b/yt_dlp/extractor/parliamentliveuk.py index 869ebd865..974d65482 100644 --- a/yt_dlp/extractor/parliamentliveuk.py +++ b/yt_dlp/extractor/parliamentliveuk.py @@ -25,9 +25,6 @@ class ParliamentLiveUKIE(InfoExtractor): 'timestamp': 1395153872, 'upload_date': '20140318', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', 'only_matching': True, diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index c7d316efc..d3ee071e0 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -191,7 +191,7 @@ class PatreonIE(InfoExtractor): class PatreonUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?P<id>[-_\w\d]+)/?(?:posts/?)?' + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?P<id>[-\w]+)' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', diff --git a/yt_dlp/extractor/pbs.py b/yt_dlp/extractor/pbs.py index 0eabf9bee..ffaa6bf92 100644 --- a/yt_dlp/extractor/pbs.py +++ b/yt_dlp/extractor/pbs.py @@ -193,7 +193,7 @@ class PBSIE(InfoExtractor): # Article with embedded player (or direct video) (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player - (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ + (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+) ) ''' % '|'.join(list(zip(*_STATIONS))[0]) diff --git a/yt_dlp/extractor/peertv.py b/yt_dlp/extractor/peertv.py new file mode 100644 index 000000000..002d33a88 --- /dev/null +++ b/yt_dlp/extractor/peertv.py @@ -0,0 +1,57 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import js_to_json + + +class 
PeerTVIE(InfoExtractor): + IE_NAME = 'peer.tv' + _VALID_URL = r'https?://(?:www\.)?peer\.tv/(?:de|it|en)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.peer.tv/de/841', + 'info_dict': { + 'id': '841', + 'ext': 'mp4', + 'title': 'Die Brunnenburg', + 'description': 'md5:4395f6142b090338340ab88a3aae24ed', + }, + }, { + 'url': 'https://www.peer.tv/it/404', + 'info_dict': { + 'id': '404', + 'ext': 'mp4', + 'title': 'Cascate di ghiaccio in Val Gardena', + 'description': 'md5:e8e5907f236171842674e8090e3577b8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_key = self._html_search_regex(r'player\.peer\.tv/js/([a-zA-Z0-9]+)', webpage, 'video key') + + js = self._download_webpage(f'https://player.peer.tv/js/{video_key}/', video_id, + headers={'Referer': 'https://www.peer.tv/'}, note='Downloading session id') + + session_id = self._search_regex(r'["\']session_id["\']:\s*["\']([a-zA-Z0-9]+)["\']', js, 'session id') + + player_webpage = self._download_webpage( + f'https://player.peer.tv/jsc/{video_key}/{session_id}?jsr=aHR0cHM6Ly93d3cucGVlci50di9kZS84NDE=&cs=UTF-8&mq=2&ua=0&webm=p&mp4=p&hls=1', + video_id, note='Downloading player webpage') + + m3u8_url = self._search_regex(r'["\']playlist_url["\']:\s*(["\'][^"\']+["\'])', player_webpage, 'm3u8 url') + m3u8_url = self._parse_json(m3u8_url, video_id, transform_source=js_to_json) + + formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title').replace('\xa0', ' '), + 'formats': formats, + 'description': self._html_search_meta(('og:description', 'description'), webpage), + 'thumbnail': self._html_search_meta(('og:image', 'image'), webpage) + } diff --git a/yt_dlp/extractor/peloton.py b/yt_dlp/extractor/peloton.py index 287d341c9..7d832253f 100644 --- a/yt_dlp/extractor/peloton.py +++ 
b/yt_dlp/extractor/peloton.py @@ -203,7 +203,6 @@ class PelotonLiveIE(InfoExtractor): 'chapters': 'count:3' }, 'params': { - 'format': 'bestvideo', 'skip_download': 'm3u8', }, '_skip': 'Account needed' diff --git a/yt_dlp/extractor/picarto.py b/yt_dlp/extractor/picarto.py index e6c51e16b..17d08d69e 100644 --- a/yt_dlp/extractor/picarto.py +++ b/yt_dlp/extractor/picarto.py @@ -111,7 +111,7 @@ class PicartoVodIE(InfoExtractor): vod_info = self._parse_json( self._search_regex( r'(?s)#vod-player["\']\s*,\s*(\{.+?\})\s*\)', webpage, - video_id), + 'vod player'), video_id, transform_source=js_to_json) formats = self._extract_m3u8_formats( diff --git a/yt_dlp/extractor/piksel.py b/yt_dlp/extractor/piksel.py index a362664b2..84c3de2f0 100644 --- a/yt_dlp/extractor/piksel.py +++ b/yt_dlp/extractor/piksel.py @@ -4,11 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( dict_get, ExtractorError, int_or_none, + join_nonempty, parse_iso8601, try_get, unescapeHTML, @@ -116,12 +116,8 @@ class PikselIE(InfoExtractor): elif asset_type == 'audio': tbr = abr - format_id = ['http'] - if tbr: - format_id.append(compat_str(tbr)) - formats.append({ - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty('http', tbr), 'url': unescapeHTML(http_url), 'vbr': vbr, 'abr': abr, @@ -167,7 +163,7 @@ class PikselIE(InfoExtractor): re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, transform_source=transform_source, fatal=False)) - self._sort_formats(formats) + self._sort_formats(formats, ('tbr', )) # Incomplete resolution information subtitles = {} for caption in video_data.get('captions', []): diff --git a/yt_dlp/extractor/planetmarathi.py b/yt_dlp/extractor/planetmarathi.py new file mode 100644 index 000000000..d1d9911f7 --- /dev/null +++ b/yt_dlp/extractor/planetmarathi.py @@ -0,0 +1,76 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor 
+from ..utils import ( + try_get, + unified_strdate, +) + + +class PlanetMarathiIE(InfoExtractor): + _VALID_URL = r'(?:https?://)(?:www\.)?planetmarathi\.com/titles/(?P<id>[^/#&?$]+)' + _TESTS = [{ + 'url': 'https://www.planetmarathi.com/titles/ek-unad-divas', + 'playlist_mincount': 2, + 'info_dict': { + 'id': 'ek-unad-divas', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'ASSETS-MOVIE-ASSET-01_ek-unad-divas', + 'ext': 'mp4', + 'title': 'ek unad divas', + 'alt_title': 'चित्रपट', + 'description': 'md5:41c7ed6b041c2fea9820a3f3125bd881', + 'season_number': None, + 'episode_number': 1, + 'duration': 5539, + 'upload_date': '20210829', + }, + }] # Trailer skipped + }, { + 'url': 'https://www.planetmarathi.com/titles/baap-beep-baap-season-1', + 'playlist_mincount': 10, + 'info_dict': { + 'id': 'baap-beep-baap-season-1', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'ASSETS-CHARACTER-PROFILE-SEASON-01-ASSET-01_baap-beep-baap-season-1', + 'ext': 'mp4', + 'title': 'Manohar Kanhere', + 'alt_title': 'मनोहर कान्हेरे', + 'description': 'md5:285ed45d5c0ab5522cac9a043354ebc6', + 'season_number': 1, + 'episode_number': 1, + 'duration': 29, + 'upload_date': '20210829', + }, + }] # Trailers, Episodes, other Character profiles skipped + }] + + def _real_extract(self, url): + id = self._match_id(url) + entries = [] + json_data = self._download_json(f'https://www.planetmarathi.com/api/v1/titles/{id}/assets', id)['assets'] + for asset in json_data: + asset_title = asset['mediaAssetName']['en'] + if asset_title == 'Movie': + asset_title = id.replace('-', ' ') + asset_id = f'{asset["sk"]}_{id}'.replace('#', '-') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(asset['mediaAssetURL'], asset_id) + self._sort_formats(formats) + entries.append({ + 'id': asset_id, + 'title': asset_title, + 'alt_title': try_get(asset, lambda x: x['mediaAssetName']['mr']), + 'description': try_get(asset, lambda x: x['mediaAssetDescription']['en']), + 'season_number': 
asset.get('mediaAssetSeason'), + 'episode_number': asset.get('mediaAssetIndexForAssetType'), + 'duration': asset.get('mediaAssetDurationInSeconds'), + 'upload_date': unified_strdate(asset.get('created')), + 'formats': formats, + 'subtitles': subtitles, + }) + return self.playlist_result(entries, playlist_id=id) diff --git a/yt_dlp/extractor/polsatgo.py b/yt_dlp/extractor/polsatgo.py new file mode 100644 index 000000000..1e3f46c07 --- /dev/null +++ b/yt_dlp/extractor/polsatgo.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from uuid import uuid4 +import json + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + try_get, + url_or_none, + ExtractorError, +) + + +class PolsatGoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polsat(?:box)?go\.pl/.+/(?P<id>[0-9a-fA-F]+)(?:[/#?]|$)' + _TESTS = [{ + 'url': 'https://polsatgo.pl/wideo/seriale/swiat-wedlug-kiepskich/5024045/sezon-1/5028300/swiat-wedlug-kiepskich-odcinek-88/4121', + 'info_dict': { + 'id': '4121', + 'ext': 'mp4', + 'title': 'Świat według Kiepskich - Odcinek 88', + 'age_limit': 12, + }, + }] + + def _extract_formats(self, sources, video_id): + for source in sources or []: + if not source.get('id'): + continue + url = url_or_none(self._call_api( + 'drm', video_id, 'getPseudoLicense', + {'mediaId': video_id, 'sourceId': source['id']}).get('url')) + if not url: + continue + yield { + 'url': url, + 'height': int_or_none(try_get(source, lambda x: x['quality'][:-1])) + } + + def _real_extract(self, url): + video_id = self._match_id(url) + media = self._call_api('navigation', video_id, 'prePlayData', {'mediaId': video_id})['mediaItem'] + + formats = list(self._extract_formats( + try_get(media, lambda x: x['playback']['mediaSources']), video_id)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': media['displayInfo']['title'], + 'formats': formats, + 'age_limit': int_or_none(media['displayInfo']['ageGroup']) + } + + def _call_api(self, 
endpoint, media_id, method, params): + rand_uuid = str(uuid4()) + res = self._download_json( + f'https://b2c-mobile.redefine.pl/rpc/{endpoint}/', media_id, + note=f'Downloading {method} JSON metadata', + data=json.dumps({ + 'method': method, + 'id': '2137', + 'jsonrpc': '2.0', + 'params': { + **params, + 'userAgentData': { + 'deviceType': 'mobile', + 'application': 'native', + 'os': 'android', + 'build': 10003, + 'widevine': False, + 'portal': 'pg', + 'player': 'cpplayer', + }, + 'deviceId': { + 'type': 'other', + 'value': rand_uuid, + }, + 'clientId': rand_uuid, + 'cpid': 1, + }, + }).encode('utf-8'), + headers={'Content-type': 'application/json'}) + if not res.get('result'): + if res['error']['code'] == 13404: + raise ExtractorError('This video is either unavailable in your region or is DRM protected', expected=True) + raise ExtractorError(f'Solorz said: {res["error"]["message"]} - {res["error"]["data"]["userMessage"]}') + return res['result'] diff --git a/yt_dlp/extractor/polskieradio.py b/yt_dlp/extractor/polskieradio.py index 53fe0340a..b2b3eb29c 100644 --- a/yt_dlp/extractor/polskieradio.py +++ b/yt_dlp/extractor/polskieradio.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import itertools +import json +import math import re from .common import InfoExtractor @@ -12,15 +14,45 @@ from ..compat import ( ) from ..utils import ( extract_attributes, + ExtractorError, + InAdvancePagedList, int_or_none, + js_to_json, + parse_iso8601, strip_or_none, unified_timestamp, unescapeHTML, + url_or_none, ) -class PolskieRadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' +class PolskieRadioBaseExtractor(InfoExtractor): + def _extract_webpage_player_entries(self, webpage, playlist_id, base_data): + media_urls = set() + + for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', webpage): + media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False) + if not 
media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file']) + if media_url in media_urls: + continue + media_urls.add(media_url) + entry = base_data.copy() + entry.update({ + 'id': compat_str(media['id']), + 'url': media_url, + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + }) + entry_title = compat_urllib_parse_unquote(media['desc']) + if entry_title: + entry['title'] = entry_title + yield entry + + +class PolskieRadioIE(PolskieRadioBaseExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio(?:24)?\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' _TESTS = [{ # Old-style single broadcast. 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', 'info_dict': { @@ -59,22 +91,14 @@ class PolskieRadioIE(InfoExtractor): 'thumbnail': r're:^https?://static\.prsa\.pl/images/.*\.jpg$' }, }], - }, { # Old-style multiple broadcast playlist. - 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2487823,Marek-Kondrat-czyta-Mistrza-i-Malgorzate', - 'info_dict': { - 'id': '2487823', - 'title': 'Marek Kondrat czyta "Mistrza i Małgorzatę"', - 'description': 'md5:8422a95cc83834f2aaeff9d82e9c8f39', - }, - 'playlist_mincount': 50, - }, { # New-style multiple broadcast playlist. - 'url': 'https://www.polskieradio.pl/8/4346/Artykul/2541317,Czytamy-Kalendarz-i-klepsydre-Tadeusza-Konwickiego', + }, { + # PR4 audition - other frontend + 'url': 'https://www.polskieradio.pl/10/6071/Artykul/2610977,Poglos-29-pazdziernika-godz-2301', 'info_dict': { - 'id': '2541317', - 'title': 'Czytamy "Kalendarz i klepsydrę" Tadeusza Konwickiego', - 'description': 'md5:0baeaa46d877f1351fb2eeed3e871f9f', + 'id': '2610977', + 'ext': 'mp3', + 'title': 'Pogłos 29 października godz. 
23:01', }, - 'playlist_mincount': 15, }, { 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', 'only_matching': True, @@ -85,6 +109,9 @@ class PolskieRadioIE(InfoExtractor): # with mp4 video 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', 'only_matching': True, + }, { + 'url': 'https://polskieradio24.pl/130/4503/Artykul/2621876,Narusza-nasza-suwerennosc-Publicysci-o-uzaleznieniu-funduszy-UE-od-praworzadnosci', + 'only_matching': True, }] def _real_extract(self, url): @@ -94,39 +121,37 @@ class PolskieRadioIE(InfoExtractor): content = self._search_regex( r'(?s)<div[^>]+class="\s*this-article\s*"[^>]*>(.+?)<div[^>]+class="tags"[^>]*>', - webpage, 'content') + webpage, 'content', default=None) timestamp = unified_timestamp(self._html_search_regex( r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', - webpage, 'timestamp', fatal=False)) + webpage, 'timestamp', default=None)) - thumbnail_url = self._og_search_thumbnail(webpage) + thumbnail_url = self._og_search_thumbnail(webpage, default=None) - entries = [] + title = self._og_search_title(webpage).strip() - media_urls = set() + description = strip_or_none(self._og_search_description(webpage, default=None)) + description = description.replace('\xa0', ' ') if description is not None else None - for data_media in re.findall(r'<[^>]+data-media="?({[^>]+})"?', content): - media = self._parse_json(data_media, playlist_id, transform_source=unescapeHTML, fatal=False) - if not media.get('file') or not media.get('desc'): - continue - media_url = self._proto_relative_url(media['file'], 'http:') - if media_url in media_urls: - continue - media_urls.add(media_url) - entries.append({ - 'id': compat_str(media['id']), - 'url': media_url, - 'title': compat_urllib_parse_unquote(media['desc']), - 'duration': int_or_none(media.get('length')), - 'vcodec': 'none' if media.get('provider') == 'audio' else None, + if not content: + 
return { + 'id': playlist_id, + 'url': self._proto_relative_url( + self._search_regex( + r"source:\s*'(//static\.prsa\.pl/[^']+)'", + webpage, 'audition record url')), + 'title': title, + 'description': description, 'timestamp': timestamp, - 'thumbnail': thumbnail_url - }) + 'thumbnail': thumbnail_url, + } - title = self._og_search_title(webpage).strip() - description = strip_or_none(self._og_search_description(webpage)) - description = description.replace('\xa0', ' ') if description is not None else None + entries = self._extract_webpage_player_entries(content, playlist_id, { + 'title': title, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url, + }) return self.playlist_result(entries, playlist_id, title, description) @@ -207,3 +232,201 @@ class PolskieRadioCategoryIE(InfoExtractor): return self.playlist_result( self._entries(url, webpage, category_id), category_id, title) + + +class PolskieRadioPlayerIE(InfoExtractor): + IE_NAME = 'polskieradio:player' + _VALID_URL = r'https?://player\.polskieradio\.pl/anteny/(?P<id>[^/]+)' + + _BASE_URL = 'https://player.polskieradio.pl' + _PLAYER_URL = 'https://player.polskieradio.pl/main.bundle.js' + _STATIONS_API_URL = 'https://apipr.polskieradio.pl/api/stacje' + + _TESTS = [{ + 'url': 'https://player.polskieradio.pl/anteny/trojka', + 'info_dict': { + 'id': '3', + 'ext': 'm4a', + 'title': 'Trójka', + }, + 'params': { + 'format': 'bestaudio', + 'skip_download': 'endless stream', + }, + }] + + def _get_channel_list(self, channel_url='no_channel'): + player_code = self._download_webpage( + self._PLAYER_URL, channel_url, + note='Downloading js player') + channel_list = js_to_json(self._search_regex( + r';var r="anteny",a=(\[.+?\])},', player_code, 'channel list')) + return self._parse_json(channel_list, channel_url) + + def _real_extract(self, url): + channel_url = self._match_id(url) + channel_list = self._get_channel_list(channel_url) + + channel = next((c for c in channel_list if c.get('url') == channel_url), None) + + if 
not channel: + raise ExtractorError('Channel not found') + + station_list = self._download_json(self._STATIONS_API_URL, channel_url, + note='Downloading stream url list', + headers={ + 'Accept': 'application/json', + 'Referer': url, + 'Origin': self._BASE_URL, + }) + station = next((s for s in station_list + if s.get('Name') == (channel.get('streamName') or channel.get('name'))), None) + if not station: + raise ExtractorError('Station not found even though we extracted channel') + + formats = [] + for stream_url in station['Streams']: + stream_url = self._proto_relative_url(stream_url) + if stream_url.endswith('/playlist.m3u8'): + formats.extend(self._extract_m3u8_formats(stream_url, channel_url, live=True)) + elif stream_url.endswith('/manifest.f4m'): + formats.extend(self._extract_mpd_formats(stream_url, channel_url)) + elif stream_url.endswith('/Manifest'): + formats.extend(self._extract_ism_formats(stream_url, channel_url)) + else: + formats.append({ + 'url': stream_url, + }) + + self._sort_formats(formats) + + return { + 'id': compat_str(channel['id']), + 'formats': formats, + 'title': channel.get('name') or channel.get('streamName'), + 'display_id': channel_url, + 'thumbnail': f'{self._BASE_URL}/images/{channel_url}-color-logo.png', + 'is_live': True, + } + + +class PolskieRadioPodcastBaseExtractor(InfoExtractor): + _API_BASE = 'https://apipodcasts.polskieradio.pl/api' + + def _parse_episode(self, data): + return { + 'id': data['guid'], + 'formats': [{ + 'url': data['url'], + 'filesize': int_or_none(data.get('fileSize')), + }], + 'title': data['title'], + 'description': data.get('description'), + 'duration': int_or_none(data.get('length')), + 'timestamp': parse_iso8601(data.get('publishDate')), + 'thumbnail': url_or_none(data.get('image')), + 'series': data.get('podcastTitle'), + 'episode': data['title'], + } + + +class PolskieRadioPodcastListIE(PolskieRadioPodcastBaseExtractor): + IE_NAME = 'polskieradio:podcast:list' + _VALID_URL = 
r'https?://podcasty\.polskieradio\.pl/podcast/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://podcasty.polskieradio.pl/podcast/8/', + 'info_dict': { + 'id': '8', + 'title': 'Śniadanie w Trójce', + 'description': 'md5:57abcc27bc4c6a6b25baa3061975b9ef', + 'uploader': 'Beata Michniewicz', + }, + 'playlist_mincount': 714, + }] + _PAGE_SIZE = 10 + + def _call_api(self, podcast_id, page): + return self._download_json( + f'{self._API_BASE}/Podcasts/{podcast_id}/?pageSize={self._PAGE_SIZE}&page={page}', + podcast_id, f'Downloading page {page}') + + def _real_extract(self, url): + podcast_id = self._match_id(url) + data = self._call_api(podcast_id, 1) + + def get_page(page_num): + page_data = self._call_api(podcast_id, page_num + 1) if page_num else data + yield from (self._parse_episode(ep) for ep in page_data['items']) + + return { + '_type': 'playlist', + 'entries': InAdvancePagedList( + get_page, math.ceil(data['itemCount'] / self._PAGE_SIZE), self._PAGE_SIZE), + 'id': str(data['id']), + 'title': data['title'], + 'description': data.get('description'), + 'uploader': data.get('announcer'), + } + + +class PolskieRadioPodcastIE(PolskieRadioPodcastBaseExtractor): + IE_NAME = 'polskieradio:podcast' + _VALID_URL = r'https?://podcasty\.polskieradio\.pl/track/(?P<id>[a-f\d]{8}(?:-[a-f\d]{4}){4}[a-f\d]{8})' + _TESTS = [{ + 'url': 'https://podcasty.polskieradio.pl/track/6eafe403-cb8f-4756-b896-4455c3713c32', + 'info_dict': { + 'id': '6eafe403-cb8f-4756-b896-4455c3713c32', + 'ext': 'mp3', + 'title': 'Theresa May rezygnuje. 
Co dalej z brexitem?', + 'description': 'md5:e41c409a29d022b70ef0faa61dbded60', + }, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + data = self._download_json( + f'{self._API_BASE}/audio', + podcast_id, 'Downloading podcast metadata', + data=json.dumps({ + 'guids': [podcast_id], + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) + return self._parse_episode(data[0]) + + +class PolskieRadioRadioKierowcowIE(PolskieRadioBaseExtractor): + _VALID_URL = r'https?://(?:www\.)?radiokierowcow\.pl/artykul/(?P<id>[0-9]+)' + IE_NAME = 'polskieradio:kierowcow' + + _TESTS = [{ + 'url': 'https://radiokierowcow.pl/artykul/2694529', + 'info_dict': { + 'id': '2694529', + 'title': 'Zielona fala reliktem przeszłości?', + 'description': 'md5:343950a8717c9818fdfd4bd2b8ca9ff2', + }, + 'playlist_count': 3, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) + nextjs_build = self._search_nextjs_data(webpage, media_id)['buildId'] + article = self._download_json( + f'https://radiokierowcow.pl/_next/data/{nextjs_build}/artykul/{media_id}.json?articleId={media_id}', + media_id) + data = article['pageProps']['data'] + title = data['title'] + entries = self._extract_webpage_player_entries(data['content'], media_id, { + 'title': title, + }) + + return { + '_type': 'playlist', + 'id': media_id, + 'entries': entries, + 'title': title, + 'description': data.get('lead'), + } diff --git a/yt_dlp/extractor/pornflip.py b/yt_dlp/extractor/pornflip.py index d0aefa2dd..accf45269 100644 --- a/yt_dlp/extractor/pornflip.py +++ b/yt_dlp/extractor/pornflip.py @@ -29,7 +29,6 @@ class PornFlipIE(InfoExtractor): 'age_limit': 18, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, diff --git a/yt_dlp/extractor/radiokapital.py b/yt_dlp/extractor/radiokapital.py new file mode 100644 index 000000000..2e93e034f --- /dev/null +++ b/yt_dlp/extractor/radiokapital.py @@ -0,0 +1,99 @@ 
+# coding: utf-8 + +from .common import InfoExtractor +from ..utils import ( + clean_html, + traverse_obj, + unescapeHTML, +) + +import itertools +from urllib.parse import urlencode + + +class RadioKapitalBaseIE(InfoExtractor): + def _call_api(self, resource, video_id, note='Downloading JSON metadata', qs={}): + return self._download_json( + f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urlencode(qs)}', + video_id, note=note) + + def _parse_episode(self, data): + release = '%s%s%s' % (data['published'][6:11], data['published'][3:6], data['published'][:3]) + return { + '_type': 'url_transparent', + 'url': data['mixcloud_url'], + 'ie_key': 'Mixcloud', + 'title': unescapeHTML(data['title']), + 'description': clean_html(data.get('content')), + 'tags': traverse_obj(data, ('tags', ..., 'name')), + 'release_date': release, + 'series': traverse_obj(data, ('show', 'title')), + } + + +class RadioKapitalIE(RadioKapitalBaseIE): + IE_NAME = 'radiokapital' + _VALID_URL = r'https?://(?:www\.)?radiokapital\.pl/shows/[a-z\d-]+/(?P<id>[a-z\d-]+)' + + _TESTS = [{ + 'url': 'https://radiokapital.pl/shows/tutaj-sa-smoki/5-its-okay-to-be-immaterial', + 'info_dict': { + 'id': 'radiokapital_radio-kapitał-tutaj-są-smoki-5-its-okay-to-be-immaterial-2021-05-20', + 'ext': 'm4a', + 'title': '#5: It’s okay to\xa0be\xa0immaterial', + 'description': 'md5:2499da5fbfb0e88333b7d37ec8e9e4c4', + 'uploader': 'Radio Kapitał', + 'uploader_id': 'radiokapital', + 'timestamp': 1621640164, + 'upload_date': '20210521', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + episode = self._call_api('episodes/%s' % video_id, video_id) + return self._parse_episode(episode) + + +class RadioKapitalShowIE(RadioKapitalBaseIE): + IE_NAME = 'radiokapital:show' + _VALID_URL = r'https?://(?:www\.)?radiokapital\.pl/shows/(?P<id>[a-z\d-]+)/?(?:$|[?#])' + + _TESTS = [{ + 'url': 'https://radiokapital.pl/shows/wesz', + 'info_dict': { + 'id': '100', + 'title': 'WĘSZ', + 'description': 
'md5:3a557a1e0f31af612b0dcc85b1e0ca5c', + }, + 'playlist_mincount': 17, + }] + + def _get_episode_list(self, series_id, page_no): + return self._call_api( + 'episodes', series_id, + f'Downloading episode list page #{page_no}', qs={ + 'show': series_id, + 'page': page_no, + }) + + def _entries(self, series_id): + for page_no in itertools.count(1): + episode_list = self._get_episode_list(series_id, page_no) + yield from (self._parse_episode(ep) for ep in episode_list['items']) + if episode_list['next'] is None: + break + + def _real_extract(self, url): + series_id = self._match_id(url) + + show = self._call_api(f'shows/{series_id}', series_id, 'Downloading show metadata') + entries = self._entries(series_id) + return { + '_type': 'playlist', + 'entries': entries, + 'id': str(show['id']), + 'title': show.get('title'), + 'description': clean_html(show.get('content')), + } diff --git a/yt_dlp/extractor/radiozet.py b/yt_dlp/extractor/radiozet.py new file mode 100644 index 000000000..2e1ff36c2 --- /dev/null +++ b/yt_dlp/extractor/radiozet.py @@ -0,0 +1,51 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import ( + traverse_obj, + strip_or_none, +) + + +class RadioZetPodcastIE(InfoExtractor): + _VALID_URL = r'https?://player\.radiozet\.pl\/Podcasty/.*?/(?P<id>.+)' + _TEST = { + 'url': 'https://player.radiozet.pl/Podcasty/Nie-Ma-Za-Co/O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu', + 'md5': 'e03665c316b4fbc5f6a8f232948bbba3', + 'info_dict': { + 'id': '42154', + 'display_id': 'O-przedmiotach-szkolnych-ktore-przydaja-sie-w-zyciu', + 'title': 'O przedmiotach szkolnych, które przydają się w życiu', + 'description': 'md5:fa72bed49da334b09e5b2f79851f185c', + 'release_timestamp': 1592985480, + 'ext': 'mp3', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 83, + 'series': 'Nie Ma Za Co', + 'creator': 'Katarzyna Pakosińska', + } + } + + def _call_api(self, podcast_id, display_id): + return self._download_json( + 
f'https://player.radiozet.pl/api/podcasts/getPodcast/(node)/{podcast_id}/(station)/radiozet', + display_id) + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + podcast_id = self._html_search_regex(r'<div.*?\sid="player".*?\sdata-id=[\'"]([^\'"]+)[\'"]', + webpage, 'podcast id') + data = self._call_api(podcast_id, display_id)['data'][0] + + return { + 'id': podcast_id, + 'display_id': display_id, + 'title': strip_or_none(data.get('title')), + 'description': strip_or_none(traverse_obj(data, ('program', 'desc'))), + 'release_timestamp': data.get('published_date'), + 'url': traverse_obj(data, ('player', 'stream')), + 'thumbnail': traverse_obj(data, ('program', 'image', 'original')), + 'duration': traverse_obj(data, ('player', 'duration')), + 'series': strip_or_none(traverse_obj(data, ('program', 'title'))), + 'creator': strip_or_none(traverse_obj(data, ('presenter', 0, 'title'))), + } diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index 27cd01801..6aa62c955 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -14,12 +14,15 @@ from ..utils import ( find_xpath_attr, fix_xml_ampersands, GeoRestrictedError, + get_element_by_class, HEADRequest, int_or_none, parse_duration, + parse_list, remove_start, strip_or_none, try_get, + unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -585,3 +588,84 @@ class RaiIE(RaiBaseIE): info.update(relinker_info) return info + + +class RaiPlayRadioBaseIE(InfoExtractor): + _BASE = 'https://www.raiplayradio.it' + + def get_playlist_iter(self, url, uid): + webpage = self._download_webpage(url, uid) + for attrs in parse_list(webpage): + title = attrs['data-title'].strip() + audio_url = urljoin(url, attrs['data-mediapolis']) + entry = { + 'url': audio_url, + 'id': attrs['data-uniquename'].lstrip('ContentItem-'), + 'title': title, + 'ext': 'mp3', + 'language': 'it', + } + if 'data-image' in attrs: + entry['thumbnail'] = 
urljoin(url, attrs['data-image']) + yield entry + + +class RaiPlayRadioIE(RaiPlayRadioBaseIE): + _VALID_URL = r'%s/audio/.+?-(?P<id>%s)\.html' % ( + RaiPlayRadioBaseIE._BASE, RaiBaseIE._UUID_RE) + _TEST = { + 'url': 'https://www.raiplayradio.it/audio/2019/07/RADIO3---LEZIONI-DI-MUSICA-36b099ff-4123-4443-9bf9-38e43ef5e025.html', + 'info_dict': { + 'id': '36b099ff-4123-4443-9bf9-38e43ef5e025', + 'ext': 'mp3', + 'title': 'Dal "Chiaro di luna" al "Clair de lune", prima parte con Giovanni Bietti', + 'thumbnail': r're:^https?://.*\.jpg$', + 'language': 'it', + } + } + + def _real_extract(self, url): + audio_id = self._match_id(url) + list_url = url.replace('.html', '-list.html') + return next(entry for entry in self.get_playlist_iter(list_url, audio_id) if entry['id'] == audio_id) + + +class RaiPlayRadioPlaylistIE(RaiPlayRadioBaseIE): + _VALID_URL = r'%s/playlist/.+?-(?P<id>%s)\.html' % ( + RaiPlayRadioBaseIE._BASE, RaiBaseIE._UUID_RE) + _TEST = { + 'url': 'https://www.raiplayradio.it/playlist/2017/12/Alice-nel-paese-delle-meraviglie-72371d3c-d998-49f3-8860-d168cfdf4966.html', + 'info_dict': { + 'id': '72371d3c-d998-49f3-8860-d168cfdf4966', + 'title': "Alice nel paese delle meraviglie", + 'description': "di Lewis Carrol letto da Aldo Busi", + }, + 'playlist_count': 11, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + playlist_webpage = self._download_webpage(url, playlist_id) + playlist_title = unescapeHTML(self._html_search_regex( + r'data-playlist-title="(.+?)"', playlist_webpage, 'title')) + playlist_creator = self._html_search_meta( + 'nomeProgramma', playlist_webpage) + playlist_description = get_element_by_class( + 'textDescriptionProgramma', playlist_webpage) + + player_href = self._html_search_regex( + r'data-player-href="(.+?)"', playlist_webpage, 'href') + list_url = urljoin(url, player_href) + + entries = list(self.get_playlist_iter(list_url, playlist_id)) + for index, entry in enumerate(entries, start=1): + entry.update({ + 'track': 
entry['title'], + 'track_number': index, + 'artist': playlist_creator, + 'album': playlist_title + }) + + return self.playlist_result( + entries, playlist_id, playlist_title, playlist_description, + creator=playlist_creator) diff --git a/yt_dlp/extractor/rcti.py b/yt_dlp/extractor/rcti.py index 31d9779dd..19b2f451c 100644 --- a/yt_dlp/extractor/rcti.py +++ b/yt_dlp/extractor/rcti.py @@ -85,9 +85,6 @@ class RCTIPlusIE(RCTIPlusBaseIE): 'series': 'iNews Malam', 'channel': 'INews', }, - 'params': { - 'format': 'bestvideo', - }, }, { # Missed event/replay 'url': 'https://www.rctiplus.com/missed-event/2507/mou-signing-ceremony-27-juli-2021-1400-wib', 'md5': '649c5f27250faed1452ca8b91e06922d', @@ -132,7 +129,6 @@ class RCTIPlusIE(RCTIPlusBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, }] _CONVIVA_JSON_TEMPLATE = { @@ -329,7 +325,6 @@ class RCTIPlusTVIE(RCTIPlusBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', } }, { # Returned video will always change diff --git a/yt_dlp/extractor/reddit.py b/yt_dlp/extractor/reddit.py index c75d95a8e..a042a59cc 100644 --- a/yt_dlp/extractor/reddit.py +++ b/yt_dlp/extractor/reddit.py @@ -8,46 +8,11 @@ from ..utils import ( try_get, unescapeHTML, url_or_none, + traverse_obj ) class RedditIE(InfoExtractor): - _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)' - _TEST = { - # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ - 'url': 'https://v.redd.it/zv89llsvexdz', - 'md5': '0a070c53eba7ec4534d95a5a1259e253', - 'info_dict': { - 'id': 'zv89llsvexdz', - 'ext': 'mp4', - 'title': 'zv89llsvexdz', - }, - 'params': { - 'format': 'bestvideo', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - formats = self._extract_m3u8_formats( - 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id, - 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - - formats.extend(self._extract_mpd_formats( - 
'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id, - mpd_id='dash', fatal=False)) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_id, - 'formats': formats, - } - - -class RedditRIE(InfoExtractor): _VALID_URL = r'https?://(?P<subdomain>[^/]+\.)?reddit(?:media)?\.com/r/(?P<slug>[^/]+/comments/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/', @@ -67,7 +32,6 @@ class RedditRIE(InfoExtractor): 'age_limit': 0, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { @@ -151,19 +115,53 @@ class RedditRIE(InfoExtractor): for resolution in resolutions: add_thumbnail(resolution) - return { - '_type': 'url_transparent', - 'url': video_url, + info = { 'title': data.get('title'), 'thumbnails': thumbnails, 'timestamp': float_or_none(data.get('created_utc')), 'uploader': data.get('author'), - 'duration': int_or_none(try_get( - data, - (lambda x: x['media']['reddit_video']['duration'], - lambda x: x['secure_media']['reddit_video']['duration']))), 'like_count': int_or_none(data.get('ups')), 'dislike_count': int_or_none(data.get('downs')), 'comment_count': int_or_none(data.get('num_comments')), 'age_limit': age_limit, } + + # Check if media is hosted on reddit: + reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False) + if reddit_video: + playlist_urls = [ + try_get(reddit_video, lambda x: unescapeHTML(x[y])) + for y in ('dash_url', 'hls_url') + ] + + # Update video_id + display_id = video_id + video_id = self._search_regex( + r'https?://v\.redd\.it/(?P<id>[^/?#&]+)', reddit_video['fallback_url'], + 'video_id', default=display_id) + + dash_playlist_url = playlist_urls[0] or f'https://v.redd.it/{video_id}/DASHPlaylist.mpd' + hls_playlist_url = playlist_urls[1] or f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8' + + formats = self._extract_m3u8_formats( + hls_playlist_url, display_id, 'mp4', + entry_protocol='m3u8_native', 
m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + dash_playlist_url, display_id, mpd_id='dash', fatal=False)) + self._sort_formats(formats) + + return { + **info, + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'duration': int_or_none(reddit_video.get('duration')), + } + + # Not hosted on reddit, must continue extraction + return { + **info, + 'display_id': video_id, + '_type': 'url_transparent', + 'url': video_url, + } diff --git a/yt_dlp/extractor/redgifs.py b/yt_dlp/extractor/redgifs.py new file mode 100644 index 000000000..1257d1344 --- /dev/null +++ b/yt_dlp/extractor/redgifs.py @@ -0,0 +1,94 @@ +# coding: utf-8 + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + qualities, + try_get, +) + + +class RedGifsIE(InfoExtractor): + _VALID_URL = r'https?://(?:(?:www|thumbs2?)\.)?redgifs\.com/(?:watch/)?(?P<id>[^-/?#\.]+)' + _FORMATS = { + 'gif': 250, + 'sd': 480, + 'hd': None, + } + _TESTS = [{ + 'url': 'https://www.redgifs.com/watch/squeakyhelplesswisent', + 'info_dict': { + 'id': 'squeakyhelplesswisent', + 'ext': 'mp4', + 'title': 'Hotwife Legs Thick', + 'timestamp': 1636287915, + 'upload_date': '20211107', + 'uploader': 'ignored52', + 'duration': 16, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + } + }, { + 'url': 'https://thumbs2.redgifs.com/SqueakyHelplessWisent-mobile.mp4#t=0', + 'info_dict': { + 'id': 'squeakyhelplesswisent', + 'ext': 'mp4', + 'title': 'Hotwife Legs Thick', + 'timestamp': 1636287915, + 'upload_date': '20211107', + 'uploader': 'ignored52', + 'duration': 16, + 'view_count': int, + 'like_count': int, + 'categories': list, + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url).lower() + + video_info = self._download_json( + 'https://api.redgifs.com/v2/gifs/%s' % video_id, + video_id, 'Downloading video info') + if 'error' in video_info: + raise ExtractorError(f'RedGifs said: 
{video_info["error"]}', expected=True) + + gif = video_info['gif'] + urls = gif['urls'] + + quality = qualities(tuple(self._FORMATS.keys())) + + orig_height = int_or_none(gif.get('height')) + aspect_ratio = try_get(gif, lambda x: orig_height / x['width']) + + formats = [] + for format_id, height in self._FORMATS.items(): + video_url = urls.get(format_id) + if not video_url: + continue + height = min(orig_height, height or orig_height) + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'width': height * aspect_ratio if aspect_ratio else None, + 'height': height, + 'quality': quality(format_id), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': ' '.join(gif.get('tags') or []) or 'RedGifs', + 'timestamp': int_or_none(gif.get('createDate')), + 'uploader': gif.get('userName'), + 'duration': int_or_none(gif.get('duration')), + 'view_count': int_or_none(gif.get('views')), + 'like_count': int_or_none(gif.get('likes')), + 'categories': gif.get('tags') or [], + 'age_limit': 18, + 'formats': formats, + } diff --git a/yt_dlp/extractor/rmcdecouverte.py b/yt_dlp/extractor/rmcdecouverte.py index 422d47ae9..8bfce3416 100644 --- a/yt_dlp/extractor/rmcdecouverte.py +++ b/yt_dlp/extractor/rmcdecouverte.py @@ -26,7 +26,6 @@ class RMCDecouverteIE(InfoExtractor): 'upload_date': '20210428', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { diff --git a/yt_dlp/extractor/roosterteeth.py b/yt_dlp/extractor/roosterteeth.py index 2c815bda6..18672b2e3 100644 --- a/yt_dlp/extractor/roosterteeth.py +++ b/yt_dlp/extractor/roosterteeth.py @@ -1,25 +1,94 @@ # coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, + join_nonempty, + LazyList, + parse_qs, str_or_none, + traverse_obj, + url_or_none, urlencode_postdata, + urljoin, + update_url_query, ) 
-class RoosterTeethIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' +class RoosterTeethBaseIE(InfoExtractor): _NETRC_MACHINE = 'roosterteeth' + _API_BASE = 'https://svod-be.roosterteeth.com' + _API_BASE_URL = f'{_API_BASE}/api/v1' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + if self._get_cookies(self._API_BASE_URL).get('rt_access_token'): + return + + try: + self._download_json( + 'https://auth.roosterteeth.com/oauth/token', + None, 'Logging in', data=urlencode_postdata({ + 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', + 'grant_type': 'password', + 'username': username, + 'password': password, + })) + except ExtractorError as e: + msg = 'Unable to login' + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + resp = self._parse_json(e.cause.read().decode(), None, fatal=False) + if resp: + error = resp.get('extra_info') or resp.get('error_description') or resp.get('error') + if error: + msg += ': ' + error + self.report_warning(msg) + + def _real_initialize(self): + self._login() + + def _extract_video_info(self, data): + thumbnails = [] + for image in traverse_obj(data, ('included', 'images')): + if image.get('type') not in ('episode_image', 'bonus_feature_image'): + continue + thumbnails.extend([{ + 'id': name, + 'url': url, + } for name, url in (image.get('attributes') or {}).items() if url_or_none(url)]) + + attributes = data.get('attributes') or {} + title = traverse_obj(attributes, 'title', 'display_title') + sub_only = attributes.get('is_sponsors_only') + + return { + 'id': str(data.get('id')), + 'display_id': attributes.get('slug'), + 'title': title, + 'description': traverse_obj(attributes, 'description', 'caption'), + 'series': attributes.get('show_title'), + 'season_number': int_or_none(attributes.get('season_number')), + 'season_id': attributes.get('season_id'), + 'episode': title, + 
'episode_number': int_or_none(attributes.get('number')), + 'episode_id': str_or_none(data.get('uuid')), + 'channel_id': attributes.get('channel_id'), + 'duration': int_or_none(attributes.get('length')), + 'thumbnails': thumbnails, + 'availability': self._availability( + needs_premium=sub_only, needs_subscription=sub_only, needs_auth=sub_only, + is_private=False, is_unlisted=False), + 'tags': attributes.get('genres') + } + + +class RoosterTeethIE(RoosterTeethBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', - 'md5': 'e2bd7764732d785ef797700a2489f212', 'info_dict': { 'id': '9156', 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', @@ -30,19 +99,20 @@ class RoosterTeethIE(InfoExtractor): 'series': 'Million Dollars, But...', 'episode': 'Million Dollars, But... The Game Announcement', }, + 'skip_download': 'm3u8', }, { 'url': 'https://roosterteeth.com/watch/rwby-bonus-25', - 'md5': 'fe8d9d976b272c18a24fe7f1f5830084', 'info_dict': { - 'id': '31', + 'id': '40432', 'display_id': 'rwby-bonus-25', - 'title': 'Volume 2, World of Remnant 3', - 'description': 'md5:8d58d3270292ea11da00ea712bbfb009', - 'episode': 'Volume 2, World of Remnant 3', - 'channel_id': 'fab60c1c-29cb-43bc-9383-5c3538d9e246', + 'title': 'Grimm', + 'description': 'md5:f30ff570741213418a8d2c19868b93ab', + 'episode': 'Grimm', + 'channel_id': '92f780eb-ebfe-4bf5-a3b5-c6ad5460a5f1', 'thumbnail': r're:^https?://.*\.(png|jpe?g)$', 'ext': 'mp4', }, + 'skip_download': 'm3u8', }, { 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', 'only_matching': True, @@ -63,40 +133,10 @@ class RoosterTeethIE(InfoExtractor): 'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement', 
'only_matching': True, }] - _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/watch/' - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - - try: - self._download_json( - 'https://auth.roosterteeth.com/oauth/token', - None, 'Logging in', data=urlencode_postdata({ - 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5', - 'grant_type': 'password', - 'username': username, - 'password': password, - })) - except ExtractorError as e: - msg = 'Unable to login' - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: - resp = self._parse_json(e.cause.read().decode(), None, fatal=False) - if resp: - error = resp.get('extra_info') or resp.get('error_description') or resp.get('error') - if error: - msg += ': ' + error - self.report_warning(msg) - - def _real_initialize(self): - if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'): - return - self._login() def _real_extract(self, url): display_id = self._match_id(url) - api_episode_url = self._EPISODE_BASE_URL + display_id + api_episode_url = f'{self._API_BASE_URL}/watch/{display_id}' try: video_data = self._download_json( @@ -118,36 +158,62 @@ class RoosterTeethIE(InfoExtractor): episode = self._download_json( api_episode_url, display_id, 'Downloading episode JSON metadata')['data'][0] - attributes = episode['attributes'] - title = attributes.get('title') or attributes['display_title'] - video_id = compat_str(episode['id']) - - thumbnails = [] - for image in episode.get('included', {}).get('images', []): - if image.get('type') in ('episode_image', 'bonus_feature_image'): - img_attributes = image.get('attributes') or {} - for k in ('thumb', 'small', 'medium', 'large'): - img_url = img_attributes.get(k) - if img_url: - thumbnails.append({ - 'id': k, - 'url': img_url, - }) return { - 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': attributes.get('description') or attributes.get('caption'), 
- 'thumbnails': thumbnails, - 'series': attributes.get('show_title'), - 'season_number': int_or_none(attributes.get('season_number')), - 'season_id': attributes.get('season_id'), - 'episode': title, - 'episode_number': int_or_none(attributes.get('number')), - 'episode_id': str_or_none(episode.get('uuid')), 'formats': formats, - 'channel_id': attributes.get('channel_id'), - 'duration': int_or_none(attributes.get('length')), - 'subtitles': subtitles + 'subtitles': subtitles, + **self._extract_video_info(episode) + } + + +class RoosterTeethSeriesIE(RoosterTeethBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/series/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://roosterteeth.com/series/rwby?season=7', + 'playlist_count': 13, + 'info_dict': { + 'id': 'rwby-7', + 'title': 'RWBY - Season 7', } + }, { + 'url': 'https://roosterteeth.com/series/role-initiative', + 'playlist_mincount': 16, + 'info_dict': { + 'id': 'role-initiative', + 'title': 'Role Initiative', + } + }, { + 'url': 'https://roosterteeth.com/series/let-s-play-minecraft?season=9', + 'playlist_mincount': 50, + 'info_dict': { + 'id': 'let-s-play-minecraft-9', + 'title': 'Let\'s Play Minecraft - Season 9', + } + }] + + def _entries(self, series_id, season_number): + display_id = join_nonempty(series_id, season_number) + # TODO: extract bonus material + for data in self._download_json( + f'{self._API_BASE_URL}/shows/{series_id}/seasons?order=asc&order_by', display_id)['data']: + idx = traverse_obj(data, ('attributes', 'number')) + if season_number and idx != season_number: + continue + season_url = update_url_query(urljoin(self._API_BASE, data['links']['episodes']), {'per_page': 1000}) + season = self._download_json(season_url, display_id, f'Downloading season {idx} JSON metadata')['data'] + for episode in season: + yield self.url_result( + f'https://www.roosterteeth.com{episode["canonical_links"]["self"]}', + RoosterTeethIE.ie_key(), + **self._extract_video_info(episode)) + + def 
_real_extract(self, url): + series_id = self._match_id(url) + season_number = traverse_obj(parse_qs(url), ('season', 0), expected_type=int_or_none) + + entries = LazyList(self._entries(series_id, season_number)) + return self.playlist_result( + entries, + join_nonempty(series_id, season_number), + join_nonempty(entries[0].get('series'), season_number, delim=' - Season ')) diff --git a/yt_dlp/extractor/rtrfm.py b/yt_dlp/extractor/rtrfm.py new file mode 100644 index 000000000..93d51e8ed --- /dev/null +++ b/yt_dlp/extractor/rtrfm.py @@ -0,0 +1,67 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class RTRFMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?rtrfm\.com\.au/(?:shows|show-episode)/(?P<id>[^/?\#&]+)' + _TESTS = [ + { + 'url': 'https://rtrfm.com.au/shows/breakfast/', + 'md5': '46168394d3a5ce237cf47e85d0745413', + 'info_dict': { + 'id': 'breakfast-2021-11-16', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': r're:^Breakfast with Taylah \d{4}-\d{2}-\d{2}$', + 'description': 'md5:0979c3ab1febfbec3f1ccb743633c611', + }, + 'skip': 'ID and md5 changes daily', + }, + { + 'url': 'https://rtrfm.com.au/show-episode/breakfast-2021-11-11/', + 'md5': '396bedf1e40f96c62b30d4999202a790', + 'info_dict': { + 'id': 'breakfast-2021-11-11', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': 'Breakfast with Taylah 2021-11-11', + 'description': 'md5:0979c3ab1febfbec3f1ccb743633c611', + }, + }, + { + 'url': 'https://rtrfm.com.au/show-episode/breakfast-2020-06-01/', + 'md5': '594027f513ec36a24b15d65007a24dff', + 'info_dict': { + 'id': 'breakfast-2020-06-01', + 'ext': 'mp3', + 'series': 'Breakfast with Taylah', + 'title': 'Breakfast with Taylah 2020-06-01', + 'description': r're:^Breakfast with Taylah ', + }, + 'skip': 'This audio has expired', + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show, date, title = self._search_regex( + 
r'''\.playShow(?:From)?\(['"](?P<show>[^'"]+)['"],\s*['"](?P<date>[0-9]{4}-[0-9]{2}-[0-9]{2})['"],\s*['"](?P<title>[^'"]+)['"]''', + webpage, 'details', group=('show', 'date', 'title')) + url = self._download_json( + 'https://restreams.rtrfm.com.au/rzz', + show, 'Downloading MP3 URL', query={'n': show, 'd': date})['u'] + # This is the only indicator of an error until trying to download the URL and + # downloads of mp4 URLs always fail (403 for current episodes, 404 for missing). + if '.mp4' in url: + url = None + self.raise_no_formats('Expired or no episode on this date', expected=True) + return { + 'id': '%s-%s' % (show, date), + 'title': '%s %s' % (title, date), + 'series': title, + 'url': url, + 'release_date': date, + 'description': self._og_search_description(webpage), + } diff --git a/yt_dlp/extractor/sbs.py b/yt_dlp/extractor/sbs.py index 0a806ee4e..4090f6385 100644 --- a/yt_dlp/extractor/sbs.py +++ b/yt_dlp/extractor/sbs.py @@ -10,7 +10,14 @@ from ..utils import ( class SBSIE(InfoExtractor): IE_DESC = 'sbs.com.au' - _VALID_URL = r'https?://(?:www\.)?sbs\.com\.au/(?:ondemand(?:/video/(?:single/)?|.*?\bplay=|/watch/)|news/(?:embeds/)?video/)(?P<id>[0-9]+)' + _VALID_URL = r'''(?x) + https?://(?:www\.)?sbs\.com\.au/(?: + ondemand(?: + /video/(?:single/)?| + /movie/[^/]+/| + .*?\bplay=|/watch/ + )|news/(?:embeds/)?video/ + )(?P<id>[0-9]+)''' _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -46,6 +53,13 @@ class SBSIE(InfoExtractor): }, { 'url': 'https://www.sbs.com.au/ondemand/watch/1698704451971', 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/movie/coherence/1469404227931', + 'only_matching': True, + }, { + 'note': 'Live stream', + 'url': 'https://www.sbs.com.au/ondemand/video/1726824003663/sbs-24x7-live-stream-nsw', + 'only_matching': True, }] def _real_extract(self, url): @@ -75,4 +89,5 @@ class SBSIE(InfoExtractor): 'ie_key': 'ThePlatform', 'id': video_id, 'url': 
smuggle_url(self._proto_relative_url(theplatform_url), {'force_smil_url': True}), + 'is_live': player_params.get('streamType') == 'live', } diff --git a/yt_dlp/extractor/senategov.py b/yt_dlp/extractor/senategov.py new file mode 100644 index 000000000..6f4240422 --- /dev/null +++ b/yt_dlp/extractor/senategov.py @@ -0,0 +1,213 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_parse_qs, + compat_urlparse, +) +from ..utils import ( + ExtractorError, + parse_qs, + unsmuggle_url, +) + +_COMMITTEES = { + 'ag': ('76440', 'http://ag-f.akamaihd.net'), + 'aging': ('76442', 'http://aging-f.akamaihd.net'), + 'approps': ('76441', 'http://approps-f.akamaihd.net'), + 'arch': ('', 'http://ussenate-f.akamaihd.net'), + 'armed': ('76445', 'http://armed-f.akamaihd.net'), + 'banking': ('76446', 'http://banking-f.akamaihd.net'), + 'budget': ('76447', 'http://budget-f.akamaihd.net'), + 'cecc': ('76486', 'http://srs-f.akamaihd.net'), + 'commerce': ('80177', 'http://commerce1-f.akamaihd.net'), + 'csce': ('75229', 'http://srs-f.akamaihd.net'), + 'dpc': ('76590', 'http://dpc-f.akamaihd.net'), + 'energy': ('76448', 'http://energy-f.akamaihd.net'), + 'epw': ('76478', 'http://epw-f.akamaihd.net'), + 'ethics': ('76449', 'http://ethics-f.akamaihd.net'), + 'finance': ('76450', 'http://finance-f.akamaihd.net'), + 'foreign': ('76451', 'http://foreign-f.akamaihd.net'), + 'govtaff': ('76453', 'http://govtaff-f.akamaihd.net'), + 'help': ('76452', 'http://help-f.akamaihd.net'), + 'indian': ('76455', 'http://indian-f.akamaihd.net'), + 'intel': ('76456', 'http://intel-f.akamaihd.net'), + 'intlnarc': ('76457', 'http://intlnarc-f.akamaihd.net'), + 'jccic': ('85180', 'http://jccic-f.akamaihd.net'), + 'jec': ('76458', 'http://jec-f.akamaihd.net'), + 'judiciary': ('76459', 'http://judiciary-f.akamaihd.net'), + 'rpc': ('76591', 'http://rpc-f.akamaihd.net'), + 'rules': ('76460', 'http://rules-f.akamaihd.net'), + 'saa': 
('76489', 'http://srs-f.akamaihd.net'), + 'smbiz': ('76461', 'http://smbiz-f.akamaihd.net'), + 'srs': ('75229', 'http://srs-f.akamaihd.net'), + 'uscc': ('76487', 'http://srs-f.akamaihd.net'), + 'vetaff': ('76462', 'http://vetaff-f.akamaihd.net'), +} + + +class SenateISVPIE(InfoExtractor): + _IE_NAME = 'senate.gov:isvp' + _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' + + _TESTS = [{ + 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', + 'info_dict': { + 'id': 'judiciary031715', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', + 'info_dict': { + 'id': 'commerce011514', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', + # checksum differs each time + 'info_dict': { + 'id': 'intel090613', + 'ext': 'mp4', + 'title': 'Integrated Senate Video Player' + } + }, { + # From http://www.c-span.org/video/?96791-1 + 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', + 'only_matching': True, + }] + + @staticmethod + def _search_iframe_url(webpage): + mobj = re.search( + r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + qs = compat_parse_qs(self._match_valid_url(url).group('qs')) + if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): + raise 
ExtractorError('Invalid URL', expected=True) + + video_id = re.sub(r'.mp4$', '', qs['filename'][0]) + + webpage = self._download_webpage(url, video_id) + + if smuggled_data.get('force_title'): + title = smuggled_data['force_title'] + else: + title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id) + poster = qs.get('poster') + thumbnail = poster[0] if poster else None + + video_type = qs['type'][0] + committee = video_type if video_type == 'arch' else qs['comm'][0] + + stream_num, domain = _COMMITTEES[committee] + + formats = [] + if video_type == 'arch': + filename = video_id if '.' in video_id else video_id + '.mp4' + m3u8_url = compat_urlparse.urljoin(domain, 'i/' + filename + '/master.m3u8') + formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8') + else: + hdcore_sign = 'hdcore=3.1.0' + url_params = (domain, video_id, stream_num) + f4m_url = f'%s/z/%s_1@%s/manifest.f4m?{hdcore_sign}' % url_params + m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params + for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): + # URLs without the extra param induce an 404 error + entry.update({'extra_param_to_segment_url': hdcore_sign}) + formats.append(entry) + for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): + mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url']) + if mobj: + entry['format_id'] += mobj.group('tag') + formats.append(entry) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnail': thumbnail, + } + + +class SenateGovIE(InfoExtractor): + _IE_NAME = 'senate.gov' + _VALID_URL = r'https?:\/\/(?:www\.)?(help|appropriations|judiciary|banking|armed-services|finance)\.senate\.gov' + _TESTS = [{ + 'url': 'https://www.help.senate.gov/hearings/vaccines-saving-lives-ensuring-confidence-and-protecting-public-health', + 'info_dict': { + 'id': 'help090920', + 'display_id': 
'vaccines-saving-lives-ensuring-confidence-and-protecting-public-health', + 'title': 'Vaccines: Saving Lives, Ensuring Confidence, and Protecting Public Health', + 'description': 'The U.S. Senate Committee on Health, Education, Labor & Pensions', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.appropriations.senate.gov/hearings/watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD', + 'info_dict': { + 'id': 'appropsA051518', + 'display_id': 'watch?hearingid=B8A25434-5056-A066-6020-1F68CB75F0CD', + 'title': 'Review of the FY2019 Budget Request for the U.S. Army', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.banking.senate.gov/hearings/21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization', + 'info_dict': { + 'id': 'banking041521', + 'display_id': '21st-century-communities-public-transportation-infrastructure-investment-and-fast-act-reauthorization', + 'title': '21st Century Communities: Public Transportation Infrastructure Investment and FAST Act Reauthorization', + 'description': 'The Official website of The United States Committee on Banking, Housing, and Urban Affairs', + 'ext': 'mp4', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + display_id = self._generic_id(url) + webpage = self._download_webpage(url, display_id) + parse_info = parse_qs(self._search_regex( + r'<iframe class="[^>"]*streaminghearing[^>"]*"\s[^>]*\bsrc="([^">]*)', webpage, 'hearing URL')) + + stream_num, stream_domain = _COMMITTEES[parse_info['comm'][-1]] + filename = parse_info['filename'][-1] + + formats = self._extract_m3u8_formats( + f'{stream_domain}/i/{filename}_1@{stream_num}/master.m3u8', + display_id, ext='mp4') + self._sort_formats(formats) + + title = self._html_search_regex( + (*self._og_regexes('title'), r'(?s)<title>([^<]*?)</title>'), webpage, 'video title') + + return { + 'id': re.sub(r'.mp4$', '', filename), + 
'display_id': display_id, + 'title': re.sub(r'\s+', ' ', title.split('|')[0]).strip(), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'age_limit': self._rta_search(webpage), + 'formats': formats + } diff --git a/yt_dlp/extractor/senateisvp.py b/yt_dlp/extractor/senateisvp.py deleted file mode 100644 index 8794d47ef..000000000 --- a/yt_dlp/extractor/senateisvp.py +++ /dev/null @@ -1,153 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unsmuggle_url, -) -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) - - -class SenateISVPIE(InfoExtractor): - _COMM_MAP = [ - ['ag', '76440', 'http://ag-f.akamaihd.net'], - ['aging', '76442', 'http://aging-f.akamaihd.net'], - ['approps', '76441', 'http://approps-f.akamaihd.net'], - ['armed', '76445', 'http://armed-f.akamaihd.net'], - ['banking', '76446', 'http://banking-f.akamaihd.net'], - ['budget', '76447', 'http://budget-f.akamaihd.net'], - ['cecc', '76486', 'http://srs-f.akamaihd.net'], - ['commerce', '80177', 'http://commerce1-f.akamaihd.net'], - ['csce', '75229', 'http://srs-f.akamaihd.net'], - ['dpc', '76590', 'http://dpc-f.akamaihd.net'], - ['energy', '76448', 'http://energy-f.akamaihd.net'], - ['epw', '76478', 'http://epw-f.akamaihd.net'], - ['ethics', '76449', 'http://ethics-f.akamaihd.net'], - ['finance', '76450', 'http://finance-f.akamaihd.net'], - ['foreign', '76451', 'http://foreign-f.akamaihd.net'], - ['govtaff', '76453', 'http://govtaff-f.akamaihd.net'], - ['help', '76452', 'http://help-f.akamaihd.net'], - ['indian', '76455', 'http://indian-f.akamaihd.net'], - ['intel', '76456', 'http://intel-f.akamaihd.net'], - ['intlnarc', '76457', 'http://intlnarc-f.akamaihd.net'], - ['jccic', '85180', 'http://jccic-f.akamaihd.net'], - ['jec', '76458', 'http://jec-f.akamaihd.net'], - ['judiciary', '76459', 
'http://judiciary-f.akamaihd.net'], - ['rpc', '76591', 'http://rpc-f.akamaihd.net'], - ['rules', '76460', 'http://rules-f.akamaihd.net'], - ['saa', '76489', 'http://srs-f.akamaihd.net'], - ['smbiz', '76461', 'http://smbiz-f.akamaihd.net'], - ['srs', '75229', 'http://srs-f.akamaihd.net'], - ['uscc', '76487', 'http://srs-f.akamaihd.net'], - ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'], - ['arch', '', 'http://ussenate-f.akamaihd.net/'] - ] - _IE_NAME = 'senate.gov' - _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' - _TESTS = [{ - 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', - 'info_dict': { - 'id': 'judiciary031715', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', - 'info_dict': { - 'id': 'commerce011514', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player' - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', - # checksum differs each time - 'info_dict': { - 'id': 'intel090613', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player' - } - }, { - # From http://www.c-span.org/video/?96791-1 - 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', - 'only_matching': True, - }] - - @staticmethod - def _search_iframe_url(webpage): - mobj = re.search( - r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", - webpage) - if mobj: - return mobj.group('url') - - def _get_info_for_comm(self, committee): - for entry in self._COMM_MAP: - if entry[0] 
== committee: - return entry[1:] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - qs = compat_parse_qs(self._match_valid_url(url).group('qs')) - if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): - raise ExtractorError('Invalid URL', expected=True) - - video_id = re.sub(r'.mp4$', '', qs['filename'][0]) - - webpage = self._download_webpage(url, video_id) - - if smuggled_data.get('force_title'): - title = smuggled_data['force_title'] - else: - title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id) - poster = qs.get('poster') - thumbnail = poster[0] if poster else None - - video_type = qs['type'][0] - committee = video_type if video_type == 'arch' else qs['comm'][0] - stream_num, domain = self._get_info_for_comm(committee) - - formats = [] - if video_type == 'arch': - filename = video_id if '.' in video_id else video_id + '.mp4' - formats = [{ - # All parameters in the query string are necessary to prevent a 403 error - 'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=', - }] - else: - hdcore_sign = 'hdcore=3.1.0' - url_params = (domain, video_id, stream_num) - f4m_url = '%s/z/%s_1@%s/manifest.f4m?' 
% url_params + hdcore_sign - m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params - for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): - # URLs without the extra param induce an 404 error - entry.update({'extra_param_to_segment_url': hdcore_sign}) - formats.append(entry) - for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): - mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url']) - if mobj: - entry['format_id'] += mobj.group('tag') - formats.append(entry) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - } diff --git a/yt_dlp/extractor/sevenplus.py b/yt_dlp/extractor/sevenplus.py index 210c44ab2..9867961f0 100644 --- a/yt_dlp/extractor/sevenplus.py +++ b/yt_dlp/extractor/sevenplus.py @@ -35,7 +35,6 @@ class SevenPlusIE(BrightcoveNewIE): 'episode': 'Wind Surf', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, } }, { diff --git a/yt_dlp/extractor/slideslive.py b/yt_dlp/extractor/slideslive.py index 9409a0100..df6084647 100644 --- a/yt_dlp/extractor/slideslive.py +++ b/yt_dlp/extractor/slideslive.py @@ -35,9 +35,6 @@ class SlidesLiveIE(InfoExtractor): 'ext': 'mp4', 'title': 'Offline Reinforcement Learning: From Algorithms to Practical Challenges', }, - 'params': { - 'format': 'bestvideo', - }, }, { # video_service_name = youtube 'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend', diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 824528474..2bb449220 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -893,5 +893,6 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): break def _get_n_results(self, query, n): - tracks = self._get_collection('search/tracks', query, limit=n, q=query) - return self.playlist_result(tracks, query, query) + return self.playlist_result(itertools.islice( + 
self._get_collection('search/tracks', query, limit=n, q=query), + 0, None if n == float('inf') else n), query, query) diff --git a/yt_dlp/extractor/southpark.py b/yt_dlp/extractor/southpark.py index d49749467..942a52dcf 100644 --- a/yt_dlp/extractor/southpark.py +++ b/yt_dlp/extractor/southpark.py @@ -6,19 +6,18 @@ from .mtv import MTVServicesInfoExtractor class SouthParkIE(MTVServicesInfoExtractor): IE_NAME = 'southpark.cc.com' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark(?:\.cc|studios)\.com/((?:video-)?clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed' _TESTS = [{ - 'url': 'http://southpark.cc.com/clips/104437/bat-daded#tab=featured', + 'url': 'https://southpark.cc.com/video-clips/d7wr06/south-park-you-all-agreed-to-counseling', 'info_dict': { - 'id': 'a7bff6c2-ed00-11e0-aca6-0026b9414f30', 'ext': 'mp4', - 'title': 'South Park|Bat Daded', - 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', - 'timestamp': 1112760000, - 'upload_date': '20050406', + 'title': 'You All Agreed to Counseling', + 'description': 'Kenny, Cartman, Stan, and Kyle visit Mr. Mackey and ask for his help getting Mrs. Nelson to come back. Mr. 
Mackey reveals the only way to get things back to normal is to get the teachers vaccinated.', + 'timestamp': 1615352400, + 'upload_date': '20210310', }, }, { 'url': 'http://southpark.cc.com/collections/7758/fan-favorites/1', @@ -40,11 +39,11 @@ class SouthParkIE(MTVServicesInfoExtractor): class SouthParkEsIE(SouthParkIE): IE_NAME = 'southpark.cc.com:español' - _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))' + _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/es/episodios/(?P<id>.+?)(\?|#|$))' _LANG = 'es' _TESTS = [{ - 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'url': 'http://southpark.cc.com/es/episodios/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', 'info_dict': { 'title': 'Cartman Consigue Una Sonda Anal', 'description': 'Cartman Consigue Una Sonda Anal', diff --git a/yt_dlp/extractor/srgssr.py b/yt_dlp/extractor/srgssr.py index cbc1c47d2..f9919816d 100644 --- a/yt_dlp/extractor/srgssr.py +++ b/yt_dlp/extractor/srgssr.py @@ -7,6 +7,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + join_nonempty, parse_iso8601, qualities, try_get, @@ -94,11 +95,7 @@ class SRGSSRIE(InfoExtractor): continue protocol = source.get('protocol') quality = source.get('quality') - format_id = [] - for e in (protocol, source.get('encoding'), quality): - if e: - format_id.append(e) - format_id = '-'.join(format_id) + format_id = join_nonempty(protocol, source.get('encoding'), quality) if protocol in ('HDS', 'HLS'): if source.get('tokenType') == 'AKAMAI': diff --git a/yt_dlp/extractor/streamff.py b/yt_dlp/extractor/streamff.py new file mode 100644 index 000000000..6b190bb3b --- /dev/null +++ b/yt_dlp/extractor/streamff.py @@ -0,0 +1,31 @@ +# coding: utf-8 +from .common import InfoExtractor +from ..utils import 
int_or_none, parse_iso8601 + + +class StreamFFIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?streamff\.com/v/(?P<id>[a-zA-Z0-9]+)' + + _TESTS = [{ + 'url': 'https://streamff.com/v/55cc94', + 'md5': '8745a67bb5e5c570738efe7983826370', + 'info_dict': { + 'id': '55cc94', + 'ext': 'mp4', + 'title': '55cc94', + 'timestamp': 1634764643, + 'upload_date': '20211020', + 'view_count': int, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json(f'https://streamff.com/api/videos/{video_id}', video_id) + return { + 'id': video_id, + 'title': json_data.get('name') or video_id, + 'url': 'https://streamff.com/%s' % json_data['videoLink'], + 'view_count': int_or_none(json_data.get('views')), + 'timestamp': parse_iso8601(json_data.get('date')), + } diff --git a/yt_dlp/extractor/stripchat.py b/yt_dlp/extractor/stripchat.py new file mode 100644 index 000000000..efd0afc75 --- /dev/null +++ b/yt_dlp/extractor/stripchat.py @@ -0,0 +1,66 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import ( + compat_str, +) +from ..utils import ( + ExtractorError, + lowercase_escape, + try_get, +) + + +class StripchatIE(InfoExtractor): + _VALID_URL = r'https?://stripchat\.com/(?P<id>[0-9A-Za-z-_]+)' + _TESTS = [{ + 'url': 'https://stripchat.com/feel_me', + 'info_dict': { + 'id': 'feel_me', + 'ext': 'mp4', + 'title': 're:^feel_me [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'description': str, + 'is_live': True, + 'age_limit': 18, + }, + 'skip': 'Room is offline', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + 'https://stripchat.com/%s/' % video_id, video_id, + headers=self.geo_verification_headers()) + + data = self._parse_json( + self._search_regex( + r'<script\b[^>]*>\s*window\.__PRELOADED_STATE__\s*=(?P<value>.*?)<\/script>', + webpage, 'data', default='{}', group='value'), + video_id, 
transform_source=lowercase_escape, fatal=False) + if not data: + raise ExtractorError('Unable to find configuration for stream.') + + if try_get(data, lambda x: x['viewCam']['show'], dict): + raise ExtractorError('Model is in private show', expected=True) + elif not try_get(data, lambda x: x['viewCam']['model']['isLive'], bool): + raise ExtractorError('Model is offline', expected=True) + + server = try_get(data, lambda x: x['viewCam']['viewServers']['flashphoner-hls'], compat_str) + host = try_get(data, lambda x: x['config']['data']['hlsStreamHost'], compat_str) + model_id = try_get(data, lambda x: x['viewCam']['model']['id'], int) + + formats = self._extract_m3u8_formats( + 'https://b-%s.%s/hls/%d/%d.m3u8' % (server, host, model_id, model_id), + video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title(video_id), + 'description': self._og_search_description(webpage), + 'is_live': True, + 'formats': formats, + # Stripchat declares the RTA meta-tag, but in an non-standard format so _rta_search() can't be used + 'age_limit': 18, + } diff --git a/yt_dlp/extractor/svt.py b/yt_dlp/extractor/svt.py index 38e0086b3..489f197fe 100644 --- a/yt_dlp/extractor/svt.py +++ b/yt_dlp/extractor/svt.py @@ -168,7 +168,6 @@ class SVTPlayIE(SVTPlayBaseIE): }, }, 'params': { - 'format': 'bestvideo', # skip for now due to download test asserts that segment is > 10000 bytes and svt uses # init segments that are smaller # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B diff --git a/yt_dlp/extractor/telemundo.py b/yt_dlp/extractor/telemundo.py index 18552a0ef..e326bbdd5 100644 --- a/yt_dlp/extractor/telemundo.py +++ b/yt_dlp/extractor/telemundo.py @@ -1,4 +1,4 @@ -# coding=utf-8 +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor diff --git a/yt_dlp/extractor/telequebec.py b/yt_dlp/extractor/telequebec.py index 
800d87b70..4bef2fe76 100644 --- a/yt_dlp/extractor/telequebec.py +++ b/yt_dlp/extractor/telequebec.py @@ -43,9 +43,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): 'uploader_id': '6150020952001', 'upload_date': '20200512', }, - 'params': { - 'format': 'bestvideo', - }, 'add_ie': ['BrightcoveNew'], }, { 'url': 'https://zonevideo.telequebec.tv/media/55267/le-soleil/passe-partout', @@ -58,9 +55,6 @@ class TeleQuebecIE(TeleQuebecBaseIE): 'upload_date': '20200625', 'timestamp': 1593090307, }, - 'params': { - 'format': 'bestvideo', - }, 'add_ie': ['BrightcoveNew'], }, { # no description @@ -157,9 +151,6 @@ class TeleQuebecEmissionIE(InfoExtractor): 'timestamp': 1588713424, 'uploader_id': '6150020952001', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', 'only_matching': True, @@ -220,9 +211,6 @@ class TeleQuebecVideoIE(TeleQuebecBaseIE): 'timestamp': 1603115930, 'uploader_id': '6101674910001', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'https://video.telequebec.tv/player-live/28527', 'only_matching': True, diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index c810cfd0d..5b3222ecf 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -58,7 +58,7 @@ class TenPlayIE(InfoExtractor): 'email': username, 'password': password, })) - return "Bearer " + data['jwt']['accessToken'] + return 'Bearer ' + data['jwt']['accessToken'] def _real_extract(self, url): content_id = self._match_id(url) diff --git a/yt_dlp/extractor/tf1.py b/yt_dlp/extractor/tf1.py index 669eb5015..44785bc65 100644 --- a/yt_dlp/extractor/tf1.py +++ b/yt_dlp/extractor/tf1.py @@ -29,7 +29,6 @@ class TF1IE(InfoExtractor): 'params': { # Sometimes wat serves the whole file with the --test option 'skip_download': True, - 'format': 'bestvideo', }, }, { 'url': 'http://www.tf1.fr/tf1/koh-lanta/videos/replay-koh-lanta-22-mai-2015.html', diff --git 
a/yt_dlp/extractor/threeqsdn.py b/yt_dlp/extractor/threeqsdn.py index bb7610352..e5c6a6de1 100644 --- a/yt_dlp/extractor/threeqsdn.py +++ b/yt_dlp/extractor/threeqsdn.py @@ -9,6 +9,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + join_nonempty, parse_iso8601, ) @@ -119,24 +120,16 @@ class ThreeQSDNIE(InfoExtractor): src = s.get('src') if not (src and self._is_valid_url(src, video_id)): continue - width = None - format_id = ['http'] ext = determine_ext(src) - if ext: - format_id.append(ext) height = int_or_none(s.get('height')) - if height: - format_id.append('%dp' % height) - if aspect: - width = int(height * aspect) formats.append({ 'ext': ext, - 'format_id': '-'.join(format_id), + 'format_id': join_nonempty('http', ext, height and '%dp' % height), 'height': height, 'source_preference': 0, 'url': src, 'vcodec': 'none' if height == 0 else None, - 'width': width, + 'width': int(height * aspect) if height and aspect else None, }) # It seems like this would be correctly handled by default # However, unless someone can confirm this, the old diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 1db6327e2..7d79ad8d5 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -12,6 +12,7 @@ from ..compat import compat_urllib_parse_unquote from ..utils import ( ExtractorError, int_or_none, + join_nonempty, str_or_none, traverse_obj, try_get, @@ -38,8 +39,8 @@ class TikTokBaseIE(InfoExtractor): 'build_number': self._APP_VERSION, 'manifest_version_code': self._MANIFEST_APP_VERSION, 'update_version_code': self._MANIFEST_APP_VERSION, - 'openudid': ''.join(random.choice('0123456789abcdef') for i in range(16)), - 'uuid': ''.join([random.choice(string.digits) for num in range(16)]), + 'openudid': ''.join(random.choice('0123456789abcdef') for _ in range(16)), + 'uuid': ''.join([random.choice(string.digits) for _ in range(16)]), '_rticket': int(time.time() * 1000), 'ts': int(time.time()), 'device_brand': 'Google', @@ 
-66,7 +67,7 @@ class TikTokBaseIE(InfoExtractor): 'as': 'a1qwert123', 'cp': 'cbfhckdckkde1', } - self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for i in range(160))) + self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160))) return self._download_json( 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ @@ -107,8 +108,8 @@ class TikTokBaseIE(InfoExtractor): 'acodec': 'aac', 'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked **add_meta, **parsed_meta, - 'format_note': ' '.join(filter(None, ( - add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else ''))) + 'format_note': join_nonempty( + add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' ') } for url in addr.get('url_list') or []] # Hack: Add direct video links first to prioritize them when removing duplicate formats @@ -416,7 +417,7 @@ class TikTokUserIE(TikTokBaseIE): 'max_cursor': 0, 'min_cursor': 0, 'retry_type': 'no_retry', - 'device_id': ''.join(random.choice(string.digits) for i in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. + 'device_id': ''.join(random.choice(string.digits) for _ in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. 
} max_retries = self.get_param('extractor_retries', 3) @@ -437,6 +438,7 @@ class TikTokUserIE(TikTokBaseIE): **self._parse_aweme_video_app(video), 'ie_key': TikTokIE.ie_key(), 'extractor': 'TikTok', + 'webpage_url': f'https://tiktok.com/@{user_id}/video/{video["aweme_id"]}', } if not post_list.get('has_more'): break diff --git a/yt_dlp/extractor/tokentube.py b/yt_dlp/extractor/tokentube.py index d6362117f..579623fed 100644 --- a/yt_dlp/extractor/tokentube.py +++ b/yt_dlp/extractor/tokentube.py @@ -6,7 +6,10 @@ import re from .common import InfoExtractor from ..utils import ( + clean_html, + get_element_by_class, parse_count, + remove_end, unified_strdate, js_to_json, OnDemandPagedList, @@ -35,7 +38,7 @@ class TokentubeIE(InfoExtractor): 'id': '3950239124', 'ext': 'mp4', 'title': 'Linux Ubuntu Studio perus käyttö', - 'description': 'md5:854ff1dc732ff708976de2880ea32050', + 'description': 'md5:46077d0daaba1974f2dc381257f9d64c', 'uploader': 'jyrilehtonen', 'upload_date': '20210825', }, @@ -45,7 +48,7 @@ class TokentubeIE(InfoExtractor): 'id': '3582463289', 'ext': 'mp4', 'title': 'Police for Freedom - toiminta aloitetaan Suomessa ❤️??', - 'description': 'md5:cd92e620d7f5fa162e8410d0fc9a08be', + 'description': 'md5:37ebf1cb44264e0bf23ed98b337ee63e', 'uploader': 'Voitontie', 'upload_date': '20210428', } @@ -90,7 +93,10 @@ class TokentubeIE(InfoExtractor): r'<a\s*class="place-left"[^>]+>(.+?)</a>', webpage, 'uploader', fatal=False) - description = self._html_search_meta('description', webpage) + description = (clean_html(get_element_by_class('p-d-txt', webpage)) + or self._html_search_meta(('og:description', 'description', 'twitter:description'), webpage)) + + description = remove_end(description, 'Category') self._sort_formats(formats) diff --git a/yt_dlp/extractor/tonline.py b/yt_dlp/extractor/tonline.py index cc11eae2a..9b6a40db5 100644 --- a/yt_dlp/extractor/tonline.py +++ b/yt_dlp/extractor/tonline.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from 
.common import InfoExtractor -from ..utils import int_or_none +from ..utils import int_or_none, join_nonempty class TOnlineIE(InfoExtractor): @@ -30,13 +30,8 @@ class TOnlineIE(InfoExtractor): asset_source = asset.get('source') or asset.get('source2') if not asset_source: continue - formats_id = [] - for field_key in ('type', 'profile'): - field_value = asset.get(field_key) - if field_value: - formats_id.append(field_value) formats.append({ - 'format_id': '-'.join(formats_id), + 'format_id': join_nonempty('type', 'profile', from_dict=asset), 'url': asset_source, }) diff --git a/yt_dlp/extractor/tv2.py b/yt_dlp/extractor/tv2.py index e0851531c..da351eeb0 100644 --- a/yt_dlp/extractor/tv2.py +++ b/yt_dlp/extractor/tv2.py @@ -19,7 +19,7 @@ from ..utils import ( class TV2IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/v/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/v\d*/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.tv2.no/v/916509/', 'info_dict': { @@ -33,6 +33,9 @@ class TV2IE(InfoExtractor): 'view_count': int, 'categories': list, }, + }, { + 'url': 'http://www.tv2.no/v2/916509', + 'only_matching': True, }] _PROTOCOLS = ('HLS', 'DASH') _GEO_COUNTRIES = ['NO'] diff --git a/yt_dlp/extractor/tvp.py b/yt_dlp/extractor/tvp.py index 1e42b33a4..48e2c6e76 100644 --- a/yt_dlp/extractor/tvp.py +++ b/yt_dlp/extractor/tvp.py @@ -2,35 +2,40 @@ from __future__ import unicode_literals import itertools +import random import re from .common import InfoExtractor from ..utils import ( - clean_html, determine_ext, + dict_get, ExtractorError, - get_element_by_attribute, + int_or_none, + js_to_json, orderedSet, + str_or_none, + try_get, ) class TVPIE(InfoExtractor): IE_NAME = 'tvp' IE_DESC = 'Telewizja Polska' - _VALID_URL = r'https?://[^/]+\.tvp\.(?:pl|info)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' _TESTS = [{ 
+ # TVPlayer 2 in js wrapper 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', - 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, odc. 13 – Władek', 'description': 'md5:437f48b93558370b031740546b696e24', + 'age_limit': 12, }, }, { + # TVPlayer legacy 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', - 'md5': 'b0005b542e5b4de643a9690326ab1257', 'info_dict': { 'id': '17916176', 'ext': 'mp4', @@ -38,16 +43,63 @@ class TVPIE(InfoExtractor): 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', }, }, { - # page id is not the same as video id(#7799) - 'url': 'https://wiadomosci.tvp.pl/33908820/28092017-1930', - 'md5': '84cd3c8aec4840046e5ab712416b73d0', + # TVPlayer 2 in iframe + 'url': 'https://wiadomosci.tvp.pl/50725617/dzieci-na-sprzedaz-dla-homoseksualistow', 'info_dict': { - 'id': '33908820', + 'id': '50725617', 'ext': 'mp4', - 'title': 'Wiadomości, 28.09.2017, 19:30', - 'description': 'Wydanie główne codziennego serwisu informacyjnego.' 
+ 'title': 'Dzieci na sprzedaż dla homoseksualistów', + 'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590', + 'age_limit': 12, }, - 'skip': 'HTTP Error 404: Not Found', + }, { + # TVPlayer 2 in client-side rendered website (regional; window.__newsData) + 'url': 'https://warszawa.tvp.pl/25804446/studio-yayo', + 'info_dict': { + 'id': '25804446', + 'ext': 'mp4', + 'title': 'Studio Yayo', + 'upload_date': '20160616', + 'timestamp': 1466075700, + } + }, { + # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData) + 'url': 'https://www.tvp.info/52880236/09042021-0800', + 'info_dict': { + 'id': '52880236', + 'ext': 'mp4', + 'title': '09.04.2021, 08:00', + }, + }, { + # client-side rendered (regional) program (playlist) page + 'url': 'https://opole.tvp.pl/9660819/rozmowa-dnia', + 'info_dict': { + 'id': '9660819', + 'description': 'Od poniedziałku do piątku o 18:55', + 'title': 'Rozmowa dnia', + }, + 'playlist_mincount': 1800, + 'params': { + 'skip_download': True, + } + }, { + # ABC-specific video embeding + # moved to https://bajkowakraina.tvp.pl/wideo/50981130,teleranek,51027049,zubr,51116450 + 'url': 'https://abc.tvp.pl/48636269/zubry-odc-124', + 'info_dict': { + 'id': '48320456', + 'ext': 'mp4', + 'title': 'Teleranek, Żubr', + }, + 'skip': 'unavailable', + }, { + # yet another vue page + 'url': 'https://jp2.tvp.pl/46925618/filmy', + 'info_dict': { + 'id': '46925618', + 'title': 'Filmy', + }, + 'playlist_mincount': 19, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', 'only_matching': True, @@ -66,137 +118,344 @@ class TVPIE(InfoExtractor): }, { 'url': 'http://www.tvp.info/25511919/trwa-rewolucja-wladza-zdecydowala-sie-na-pogwalcenie-konstytucji', 'only_matching': True, + }, { + 'url': 'https://tvp.info/49193823/teczowe-flagi-na-pomnikach-prokuratura-wszczela-postepowanie-wieszwiecej', + 'only_matching': True, + }, { + 'url': 
'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277', + 'only_matching': True, + }, { + 'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm', + 'only_matching': True, }] + def _parse_vue_website_data(self, webpage, page_id): + website_data = self._search_regex([ + # website - regiony, tvp.info + # directory - jp2.tvp.pl + r'window\.__(?:website|directory)Data\s*=\s*({(?:.|\s)+?});', + ], webpage, 'website data') + if not website_data: + return None + return self._parse_json(website_data, page_id, transform_source=js_to_json) + + def _extract_vue_video(self, video_data, page_id=None): + if isinstance(video_data, str): + video_data = self._parse_json(video_data, page_id, transform_source=js_to_json) + thumbnails = [] + image = video_data.get('image') + if image: + for thumb in (image if isinstance(image, list) else [image]): + thmb_url = str_or_none(thumb.get('url')) + if thmb_url: + thumbnails.append({ + 'url': thmb_url, + }) + is_website = video_data.get('type') == 'website' + if is_website: + url = video_data['url'] + fucked_up_url_parts = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url) + if fucked_up_url_parts: + url = f'https://vod.tvp.pl/website/{fucked_up_url_parts.group(2)},{fucked_up_url_parts.group(1)}' + else: + url = 'tvp:' + str_or_none(video_data.get('_id') or page_id) + return { + '_type': 'url_transparent', + 'id': str_or_none(video_data.get('_id') or page_id), + 'url': url, + 'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite', + 'title': str_or_none(video_data.get('title')), + 'description': str_or_none(video_data.get('lead')), + 'timestamp': int_or_none(video_data.get('release_date_long')), + 'duration': int_or_none(video_data.get('duration')), + 'thumbnails': thumbnails, + } + + def _handle_vuejs_page(self, url, webpage, page_id): + # vue client-side rendered sites (all regional pages + tvp.info) + video_data = 
self._search_regex([ + r'window\.__(?:news|video)Data\s*=\s*({(?:.|\s)+?})\s*;', + ], webpage, 'video data', default=None) + if video_data: + return self._extract_vue_video(video_data, page_id=page_id) + # paged playlists + website_data = self._parse_vue_website_data(webpage, page_id) + if website_data: + entries = self._vuejs_entries(url, website_data, page_id) + + return { + '_type': 'playlist', + 'id': page_id, + 'title': str_or_none(website_data.get('title')), + 'description': str_or_none(website_data.get('lead')), + 'entries': entries, + } + raise ExtractorError('Could not extract video/website data') + + def _vuejs_entries(self, url, website_data, page_id): + + def extract_videos(wd): + if wd.get('latestVideo'): + yield self._extract_vue_video(wd['latestVideo']) + for video in wd.get('videos') or []: + yield self._extract_vue_video(video) + for video in wd.get('items') or []: + yield self._extract_vue_video(video) + + yield from extract_videos(website_data) + + if website_data.get('items_total_count') > website_data.get('items_per_page'): + for page in itertools.count(2): + page_website_data = self._parse_vue_website_data( + self._download_webpage(url, page_id, note='Downloading page #%d' % page, + query={'page': page}), + page_id) + if not page_website_data.get('videos') and not page_website_data.get('items'): + break + yield from extract_videos(page_website_data) + def _real_extract(self, url): page_id = self._match_id(url) - webpage = self._download_webpage(url, page_id) + webpage, urlh = self._download_webpage_handle(url, page_id) + + # The URL may redirect to a VOD + # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii + if TVPWebsiteIE.suitable(urlh.url): + return self.url_result(urlh.url, ie=TVPWebsiteIE.ie_key(), video_id=page_id) + + if re.search( + r'window\.__(?:video|news|website|directory)Data\s*=', + webpage): + return self._handle_vuejs_page(url, webpage, page_id) + + # classic server-side rendered sites video_id = 
self._search_regex([ + r'<iframe[^>]+src="[^"]*?embed\.php\?(?:[^&]+&)*ID=(\d+)', r'<iframe[^>]+src="[^"]*?object_id=(\d+)', r"object_id\s*:\s*'(\d+)'", - r'data-video-id="(\d+)"'], webpage, 'video id', default=page_id) + r'data-video-id="(\d+)"', + + # abc.tvp.pl - somehow there are more than one video IDs that seem to be the same video? + # the first one is referenced to as "copyid", and seems to be unused by the website + r'<script>\s*tvpabc\.video\.init\(\s*\d+,\s*(\d+)\s*\)\s*</script>', + ], webpage, 'video id', default=page_id) return { '_type': 'url_transparent', 'url': 'tvp:' + video_id, 'description': self._og_search_description( - webpage, default=None) or self._html_search_meta( - 'description', webpage, default=None), + webpage, default=None) or (self._html_search_meta( + 'description', webpage, default=None) + if '//s.tvp.pl/files/portal/v' in webpage else None), 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'ie_key': 'TVPEmbed', } +class TVPStreamIE(InfoExtractor): + IE_NAME = 'tvp:stream' + _VALID_URL = r'(?:tvpstream:|https?://tvpstream\.vod\.tvp\.pl/(?:\?(?:[^&]+[&;])*channel_id=)?)(?P<id>\d*)' + _TESTS = [{ + # untestable as "video" id changes many times across a day + 'url': 'https://tvpstream.vod.tvp.pl/?channel_id=1455', + 'only_matching': True, + }, { + 'url': 'tvpstream:39821455', + 'only_matching': True, + }, { + # the default stream when you provide no channel_id, most probably TVP Info + 'url': 'tvpstream:', + 'only_matching': True, + }, { + 'url': 'https://tvpstream.vod.tvp.pl/', + 'only_matching': True, + }] + + _PLAYER_BOX_RE = r'<div\s[^>]*id\s*=\s*["\']?tvp_player_box["\']?[^>]+data-%s-id\s*=\s*["\']?(\d+)' + _BUTTON_RE = r'<div\s[^>]*data-channel-id=["\']?%s["\']?[^>]*\sdata-title=(?:"([^"]*)"|\'([^\']*)\')[^>]*\sdata-stationname=(?:"([^"]*)"|\'([^\']*)\')' + + def _real_extract(self, url): + channel_id = self._match_id(url) + channel_url = self._proto_relative_url('//tvpstream.vod.tvp.pl/?channel_id=%s' % 
channel_id or 'default') + webpage = self._download_webpage(channel_url, channel_id, 'Downloading channel webpage') + if not channel_id: + channel_id = self._search_regex(self._PLAYER_BOX_RE % 'channel', + webpage, 'default channel id') + video_id = self._search_regex(self._PLAYER_BOX_RE % 'video', + webpage, 'video id') + audition_title, station_name = self._search_regex( + self._BUTTON_RE % (re.escape(channel_id)), webpage, + 'audition title and station name', + group=(1, 2)) + return { + '_type': 'url_transparent', + 'id': channel_id, + 'url': 'tvp:%s' % video_id, + 'title': audition_title, + 'alt_title': station_name, + 'is_live': True, + 'ie_key': 'TVPEmbed', + } + + class TVPEmbedIE(InfoExtractor): IE_NAME = 'tvp:embed' IE_DESC = 'Telewizja Polska' - _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' + _VALID_URL = r'''(?x) + (?: + tvp: + |https?:// + (?:[^/]+\.)? + (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/ + (?:sess/ + (?:tvplayer\.php\?.*?object_id + |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd]) + |shared/details\.php\?.*?object_id) + =) + (?P<id>\d+) + ''' _TESTS = [{ 'url': 'tvp:194536', - 'md5': 'a21eb0aa862f25414430f15fdfb9e76c', 'info_dict': { 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, odc. 
13 – Władek', + 'description': 'md5:76649d2014f65c99477be17f23a4dead', + 'age_limit': 12, }, }, { - # not available - 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', - 'md5': '8c9cd59d16edabf39331f93bf8a766c7', + 'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&autoplay=false', 'info_dict': { - 'id': '22670268', + 'id': '51247504', 'ext': 'mp4', - 'title': 'Panorama, 07.12.2015, 15:40', + 'title': 'Razmova 091220', }, - 'skip': 'Transmisja została zakończona lub materiał niedostępny', }, { - 'url': 'tvp:22670268', + # TVPlayer2 embed URL + 'url': 'https://tvp.info/sess/TVPlayer2/embed.php?ID=50595757', + 'only_matching': True, + }, { + 'url': 'https://wiadomosci.tvp.pl/sess/TVPlayer2/api.php?id=51233452', + 'only_matching': True, + }, { + # pulsembed on dziennik.pl + 'url': 'https://www.tvp.pl/shared/details.php?copy_id=52205981&object_id=52204505&autoplay=false&is_muted=false&allowfullscreen=true&template=external-embed/video/iframe-video.html', 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage, **kw): + return [m.group('embed') for m in re.finditer( + r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:], + webpage)] + def _real_extract(self, url): video_id = self._match_id(url) + # it could be anything that is a valid JS function name + callback = random.choice(( + 'jebac_pis', + 'jebacpis', + 'ziobro', + 'sasin70', + 'sasin_przejebal_70_milionow_PLN', + 'tvp_is_a_state_propaganda_service', + )) + webpage = self._download_webpage( - 'http://www.tvp.pl/sess/tvplayer.php?object_id=%s' % video_id, video_id) - - error = self._html_search_regex( - r'(?s)<p[^>]+\bclass=["\']notAvailable__text["\'][^>]*>(.+?)</p>', - webpage, 'error', default=None) or clean_html( - get_element_by_attribute('class', 'msg error', webpage)) - if error: - raise ExtractorError('%s said: %s' % ( - self.IE_NAME, clean_html(error)), expected=True) - - title = self._search_regex( - 
r'name\s*:\s*([\'"])Title\1\s*,\s*value\s*:\s*\1(?P<title>.+?)\1', - webpage, 'title', group='title') - series_title = self._search_regex( - r'name\s*:\s*([\'"])SeriesTitle\1\s*,\s*value\s*:\s*\1(?P<series>.+?)\1', - webpage, 'series', group='series', default=None) - if series_title: - title = '%s, %s' % (series_title, title) - - thumbnail = self._search_regex( - r"poster\s*:\s*'([^']+)'", webpage, 'thumbnail', default=None) - - video_url = self._search_regex( - r'0:{src:([\'"])(?P<url>.*?)\1', webpage, - 'formats', group='url', default=None) - if not video_url or 'material_niedostepny.mp4' in video_url: - video_url = self._download_json( - 'http://www.tvp.pl/pub/stat/videofileinfo?video_id=%s' % video_id, - video_id)['video_url'] + ('https://www.tvp.pl/sess/TVPlayer2/api.php?id=%s' + + '&@method=getTvpConfig&@callback=%s') % (video_id, callback), video_id) + + # stripping JSONP padding + datastr = webpage[15 + len(callback):-3] + if datastr.startswith('null,'): + error = self._parse_json(datastr[5:], video_id) + raise ExtractorError(error[0]['desc']) + + content = self._parse_json(datastr, video_id)['content'] + info = content['info'] + is_live = try_get(info, lambda x: x['isLive'], bool) formats = [] - video_url_base = self._search_regex( - r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', - video_url, 'video base url', default=None) - if video_url_base: - # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. - # It's not mentioned in MPEG-DASH standard. Figure that out. 
- # formats.extend(self._extract_mpd_formats( - # video_url_base + '.ism/video.mpd', - # video_id, mpd_id='dash', fatal=False)) - formats.extend(self._extract_ism_formats( - video_url_base + '.ism/Manifest', - video_id, 'mss', fatal=False)) - formats.extend(self._extract_f4m_formats( - video_url_base + '.ism/video.f4m', - video_id, f4m_id='hds', fatal=False)) - m3u8_formats = self._extract_m3u8_formats( - video_url_base + '.ism/video.m3u8', video_id, - 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - self._sort_formats(m3u8_formats) - m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none', m3u8_formats)) - formats.extend(m3u8_formats) - for i, m3u8_format in enumerate(m3u8_formats, 2): - http_url = '%s-%d.mp4' % (video_url_base, i) - if self._is_valid_url(http_url, video_id): - f = m3u8_format.copy() - f.update({ - 'url': http_url, - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - else: - formats = [{ - 'format_id': 'direct', - 'url': video_url, - 'ext': determine_ext(video_url, 'mp4'), - }] + for file in content['files']: + video_url = file.get('url') + if not video_url: + continue + if video_url.endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live)) + elif video_url.endswith('.mpd'): + if is_live: + # doesn't work with either ffmpeg or native downloader + continue + formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False)) + elif video_url.endswith('.f4m'): + formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False)) + elif video_url.endswith('.ism/manifest'): + formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False)) + else: + # mp4, wmv or something + quality = file.get('quality', {}) + formats.append({ + 'format_id': 'direct', + 'url': video_url, + 'ext': determine_ext(video_url, file['type']), + 'fps': 
int_or_none(quality.get('fps')), + 'tbr': int_or_none(quality.get('bitrate')), + 'width': int_or_none(quality.get('width')), + 'height': int_or_none(quality.get('height')), + }) self._sort_formats(formats) - return { + title = dict_get(info, ('subtitle', 'title', 'seoTitle')) + description = dict_get(info, ('description', 'seoDescription')) + thumbnails = [] + for thumb in content.get('posters') or (): + thumb_url = thumb.get('src') + if not thumb_url or '{width}' in thumb_url or '{height}' in thumb_url: + continue + thumbnails.append({ + 'url': thumb.get('src'), + 'width': thumb.get('width'), + 'height': thumb.get('height'), + }) + age_limit = try_get(info, lambda x: x['ageGroup']['minAge'], int) + if age_limit == 1: + age_limit = 0 + duration = try_get(info, lambda x: x['duration'], int) if not is_live else None + + subtitles = {} + for sub in content.get('subtitles') or []: + if not sub.get('url'): + continue + subtitles.setdefault(sub['lang'], []).append({ + 'url': sub['url'], + 'ext': sub.get('type'), + }) + + info_dict = { 'id': video_id, 'title': title, - 'thumbnail': thumbnail, + 'description': description, + 'thumbnails': thumbnails, + 'age_limit': age_limit, + 'is_live': is_live, + 'duration': duration, 'formats': formats, + 'subtitles': subtitles, } + # vod.tvp.pl + if info.get('vortalName') == 'vod': + info_dict.update({ + 'title': '%s, %s' % (info.get('title'), info.get('subtitle')), + 'series': info.get('title'), + 'season': info.get('season'), + 'episode_number': info.get('episode'), + }) + + return info_dict + class TVPWebsiteIE(InfoExtractor): IE_NAME = 'tvp:series' @@ -204,18 +463,20 @@ class TVPWebsiteIE(InfoExtractor): _TESTS = [{ # series - 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312/video', + 'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video', 'info_dict': { - 'id': '38678312', + 'id': '17069012', }, - 'playlist_count': 115, + 'playlist_count': 312, }, { # film - 'url': 
'https://vod.tvp.pl/website/gloria,35139666', + 'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466', 'info_dict': { - 'id': '36637049', + 'id': '51374509', 'ext': 'mp4', - 'title': 'Gloria, Gloria', + 'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie', + 'description': 'md5:2e80823f00f5fc263555482f76f8fa42', + 'age_limit': 12, }, 'params': { 'skip_download': True, diff --git a/yt_dlp/extractor/tvplay.py b/yt_dlp/extractor/tvplay.py index fbafb41f8..b5dbc5526 100644 --- a/yt_dlp/extractor/tvplay.py +++ b/yt_dlp/extractor/tvplay.py @@ -12,9 +12,9 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - parse_duration, parse_iso8601, qualities, + traverse_obj, try_get, update_url_query, url_or_none, @@ -369,7 +369,6 @@ class ViafreeIE(InfoExtractor): 'upload_date': '20201217' }, 'params': { - 'format': 'bestvideo', 'skip_download': True } }, { @@ -432,77 +431,96 @@ class ViafreeIE(InfoExtractor): class TVPlayHomeIE(InfoExtractor): - _VALID_URL = r'https?://(?:tv3?)?play\.(?:tv3\.lt|skaties\.lv|tv3\.ee)/(?:[^/]+/)*[^/?#&]+-(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + (?:tv3?)? + play\.(?:tv3|skaties)\.(?P<country>lv|lt|ee)/ + (?P<live>lives/)? + [^?#&]+(?:episode|programme|clip)-(?P<id>\d+) + ''' _TESTS = [{ - 'url': 'https://tvplay.tv3.lt/aferistai-n-7/aferistai-10047125/', + 'url': 'https://play.tv3.lt/series/gauju-karai-karveliai,serial-2343791/serija-8,episode-2343828', 'info_dict': { - 'id': '366367', + 'id': '2343828', 'ext': 'mp4', - 'title': 'Aferistai', - 'description': 'Aferistai. Kalėdinė pasaka.', - 'series': 'Aferistai [N-7]', - 'season': '1 sezonas', + 'title': 'Gaujų karai. Karveliai (2021) | S01E08: Serija 8', + 'description': 'md5:f6fcfbb236429f05531131640dfa7c81', + 'duration': 2710, + 'season': 'Gaujų karai. 
Karveliai', 'season_number': 1, - 'duration': 464, - 'timestamp': 1394209658, - 'upload_date': '20140307', - 'age_limit': 18, + 'release_year': 2021, + 'episode': 'Serija 8', + 'episode_number': 8, }, 'params': { - 'skip_download': True, + 'skip_download': 'm3u8', }, }, { - 'url': 'https://tvplay.skaties.lv/vinas-melo-labak/vinas-melo-labak-10280317/', - 'only_matching': True, + 'url': 'https://play.tv3.lt/series/moterys-meluoja-geriau-n-7,serial-2574652/serija-25,episode-3284937', + 'info_dict': { + 'id': '3284937', + 'ext': 'mp4', + 'season': 'Moterys meluoja geriau [N-7]', + 'season_number': 14, + 'release_year': 2021, + 'episode': 'Serija 25', + 'episode_number': 25, + 'title': 'Moterys meluoja geriau [N-7] (2021) | S14|E25: Serija 25', + 'description': 'md5:c6926e9710f1a126f028fbe121eddb79', + 'duration': 2440, + }, + 'skip': '404' }, { - 'url': 'https://tvplay.tv3.ee/cool-d-ga-mehhikosse/cool-d-ga-mehhikosse-10044354/', + 'url': 'https://play.tv3.lt/lives/tv6-lt,live-2838694/optibet-a-lygos-rungtynes-marijampoles-suduva--vilniaus-riteriai,programme-3422014', 'only_matching': True, }, { - 'url': 'https://play.tv3.lt/aferistai-10047125', + 'url': 'https://tv3play.skaties.lv/series/women-lie-better-lv,serial-1024464/women-lie-better-lv,episode-1038762', 'only_matching': True, }, { - 'url': 'https://tv3play.skaties.lv/vinas-melo-labak-10280317', + 'url': 'https://play.tv3.ee/series/_,serial-2654462/_,episode-2654474', 'only_matching': True, }, { - 'url': 'https://play.tv3.ee/cool-d-ga-mehhikosse-10044354', + 'url': 'https://tv3play.skaties.lv/clips/tv3-zinas-valsti-lidz-15novembrim-bus-majsede,clip-3464509', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + country, is_live, video_id = self._match_valid_url(url).groups() - asset = self._download_json( - urljoin(url, '/sb/public/asset/' + video_id), video_id) + api_path = 'lives/programmes' if is_live else 'vods' + data = self._download_json( + urljoin(url, 
f'/api/products/{api_path}/{video_id}?platform=BROWSER&lang={country.upper()}'), + video_id) - m3u8_url = asset['movie']['contentUrl'] - video_id = asset['assetId'] - asset_title = asset['title'] - title = asset_title['title'] - - formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + video_type = 'CATCHUP' if is_live else 'MOVIE' + stream_id = data['programRecordingId'] if is_live else video_id + stream = self._download_json( + urljoin(url, f'/api/products/{stream_id}/videos/playlist?videoType={video_type}&platform=BROWSER'), video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + stream['sources']['HLS'][0]['src'], video_id, 'mp4', 'm3u8_native', m3u8_id='hls') self._sort_formats(formats) - thumbnails = None - image_url = asset.get('imageUrl') - if image_url: - thumbnails = [{ - 'url': urljoin(url, image_url), - 'ext': 'jpg', - }] - - metadata = asset.get('metadata') or {} + thumbnails = set(traverse_obj( + data, (('galary', 'images', 'artworks'), ..., ..., ('miniUrl', 'mainUrl')), expected_type=url_or_none)) return { 'id': video_id, - 'title': title, - 'description': asset_title.get('summaryLong') or asset_title.get('summaryShort'), - 'thumbnails': thumbnails, - 'duration': parse_duration(asset_title.get('runTime')), - 'series': asset.get('tvSeriesTitle'), - 'season': asset.get('tvSeasonTitle'), - 'season_number': int_or_none(metadata.get('seasonNumber')), - 'episode': asset_title.get('titleBrief'), - 'episode_number': int_or_none(metadata.get('episodeNumber')), + 'title': self._resolve_title(data), + 'description': traverse_obj(data, 'description', 'lead'), + 'duration': int_or_none(data.get('duration')), + 'season': traverse_obj(data, ('season', 'serial', 'title')), + 'season_number': int_or_none(traverse_obj(data, ('season', 'number'))), + 'episode': data.get('title'), + 'episode_number': int_or_none(data.get('episode')), + 'release_year': int_or_none(traverse_obj(data, ('season', 'serial', 
'year'))), + 'thumbnails': [{'url': url, 'ext': 'jpg'} for url in thumbnails], 'formats': formats, + 'subtitles': subtitles, } + + @staticmethod + def _resolve_title(data): + return try_get(data, lambda x: ( + f'{data["season"]["serial"]["title"]} ({data["season"]["serial"]["year"]}) | ' + f'S{data["season"]["number"]:02d}E{data["episode"]:02d}: {data["title"]}' + )) or data.get('title') diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index be70beed4..cd97f0a24 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -24,6 +24,8 @@ from ..utils import ( parse_iso8601, parse_qs, qualities, + str_or_none, + traverse_obj, try_get, unified_timestamp, update_url_query, @@ -52,6 +54,7 @@ class TwitchBaseIE(InfoExtractor): 'VideoAccessToken_Clip': '36b89d2507fce29e5ca551df756d27c1cfe079e2609642b4390aa4c35796eb11', 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687', + 'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41', } def _real_initialize(self): @@ -249,6 +252,38 @@ class TwitchVodIE(TwitchBaseIE): }, { 'url': 'https://player.twitch.tv/?video=480452374', 'only_matching': True, + }, { + 'url': 'https://www.twitch.tv/videos/635475444', + 'info_dict': { + 'id': 'v635475444', + 'ext': 'mp4', + 'title': 'Riot Games', + 'duration': 11643, + 'uploader': 'Riot Games', + 'uploader_id': 'riotgames', + 'timestamp': 1590770569, + 'upload_date': '20200529', + 'chapters': [ + { + 'start_time': 0, + 'end_time': 573, + 'title': 'League of Legends' + }, + { + 'start_time': 573, + 'end_time': 3922, + 'title': 'Legends of Runeterra' + }, + { + 'start_time': 3922, + 'end_time': 11643, + 'title': 'Art' + } + ], + }, + 'params': { + 'skip_download': True + } }] def _download_info(self, item_id): @@ -259,16 +294,24 @@ class TwitchVodIE(TwitchBaseIE): 'channelLogin': '', 
'videoID': item_id, }, + }, { + 'operationName': 'VideoPlayer_ChapterSelectButtonVideo', + 'variables': { + 'includePrivate': False, + 'videoID': item_id, + }, }], - 'Downloading stream metadata GraphQL')[0]['data'] - video = data.get('video') + 'Downloading stream metadata GraphQL') + + video = traverse_obj(data, (0, 'data', 'video')) + video['moments'] = traverse_obj(data, (1, 'data', 'video', 'moments', 'edges', ..., 'node')) + if video is None: raise ExtractorError( 'Video %s does not exist' % item_id, expected=True) return self._extract_info_gql(video, item_id) - @staticmethod - def _extract_info(info): + def _extract_info(self, info): status = info.get('status') if status == 'recording': is_live = True @@ -302,18 +345,39 @@ class TwitchVodIE(TwitchBaseIE): 'timestamp': parse_iso8601(info.get('recorded_at')), 'view_count': int_or_none(info.get('views')), 'is_live': is_live, + 'was_live': True, } - @staticmethod - def _extract_info_gql(info, item_id): + def _extract_moments(self, info, item_id): + for moment in info.get('moments') or []: + start_time = int_or_none(moment.get('positionMilliseconds'), 1000) + duration = int_or_none(moment.get('durationMilliseconds'), 1000) + name = str_or_none(moment.get('description')) + + if start_time is None or duration is None: + self.report_warning(f'Important chapter information missing for chapter {name}', item_id) + continue + yield { + 'start_time': start_time, + 'end_time': start_time + duration, + 'title': name, + } + + def _extract_info_gql(self, info, item_id): vod_id = info.get('id') or item_id # id backward compatibility for download archives if vod_id[0] != 'v': vod_id = 'v%s' % vod_id thumbnail = url_or_none(info.get('previewThumbnailURL')) + is_live = None if thumbnail: - for p in ('width', 'height'): - thumbnail = thumbnail.replace('{%s}' % p, '0') + if thumbnail.endswith('/404_processing_{width}x{height}.png'): + is_live, thumbnail = True, None + else: + is_live = False + for p in ('width', 'height'): + 
thumbnail = thumbnail.replace('{%s}' % p, '0') + return { 'id': vod_id, 'title': info.get('title') or 'Untitled Broadcast', @@ -324,6 +388,9 @@ class TwitchVodIE(TwitchBaseIE): 'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str), 'timestamp': unified_timestamp(info.get('publishedAt')), 'view_count': int_or_none(info.get('viewCount')), + 'chapters': list(self._extract_moments(info, item_id)), + 'is_live': is_live, + 'was_live': True, } def _real_extract(self, url): diff --git a/yt_dlp/extractor/ustream.py b/yt_dlp/extractor/ustream.py index 8b758795f..4a7a8f879 100644 --- a/yt_dlp/extractor/ustream.py +++ b/yt_dlp/extractor/ustream.py @@ -13,6 +13,7 @@ from ..utils import ( ExtractorError, int_or_none, float_or_none, + join_nonempty, mimetype2ext, str_or_none, ) @@ -139,8 +140,8 @@ class UstreamIE(InfoExtractor): content_type = stream['contentType'] kind = content_type.split('/')[0] f = { - 'format_id': '-'.join(filter(None, [ - 'dash', kind, str_or_none(stream.get('bitrate'))])), + 'format_id': join_nonempty( + 'dash', kind, str_or_none(stream.get('bitrate'))), 'protocol': 'http_dash_segments', # TODO: generate a MPD doc for external players? 
'url': encode_data_uri(b'<MPD/>', 'text/xml'), diff --git a/yt_dlp/extractor/vice.py b/yt_dlp/extractor/vice.py index ca4d3edbd..c8c30559e 100644 --- a/yt_dlp/extractor/vice.py +++ b/yt_dlp/extractor/vice.py @@ -290,7 +290,6 @@ class ViceArticleIE(ViceBaseIE): }, 'params': { 'skip_download': True, - 'format': 'bestvideo', }, 'add_ie': [ViceIE.ie_key()], }, { diff --git a/yt_dlp/extractor/vidlii.py b/yt_dlp/extractor/vidlii.py index f4774256b..ce7487ec1 100644 --- a/yt_dlp/extractor/vidlii.py +++ b/yt_dlp/extractor/vidlii.py @@ -5,9 +5,11 @@ import re from .common import InfoExtractor from ..utils import ( + HEADRequest, float_or_none, get_element_by_id, int_or_none, + str_to_int, strip_or_none, unified_strdate, urljoin, @@ -36,6 +38,25 @@ class VidLiiIE(InfoExtractor): 'tags': ['Vidlii', 'Jan', 'Videogames'], } }, { + 'url': 'https://www.vidlii.com/watch?v=zTAtaAgOLKt', + 'md5': '5778f7366aa4c569b77002f8bf6b614f', + 'info_dict': { + 'id': 'zTAtaAgOLKt', + 'ext': 'mp4', + 'title': 'FULPTUBE SUCKS.', + 'description': 'md5:087b2ca355d4c8f8f77e97c43e72d711', + 'thumbnail': 'https://www.vidlii.com/usfi/thmp/zTAtaAgOLKt.jpg', + 'uploader': 'Homicide', + 'uploader_url': 'https://www.vidlii.com/user/Homicide', + 'upload_date': '20210612', + 'duration': 89, + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['News & Politics'], + 'tags': ['fulp', 'tube', 'sucks', 'bad', 'fulptube'], + }, + }, { 'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0', 'only_matching': True, }] @@ -45,10 +66,20 @@ class VidLiiIE(InfoExtractor): webpage = self._download_webpage( 'https://www.vidlii.com/watch?v=%s' % video_id, video_id) - - video_url = self._search_regex( - r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage, - 'video url', group='url') + formats = [] + + sources = [source[1] for source in re.findall( + r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', + webpage) or []] + for source in sources: + height = 
int(self._search_regex(r'(\d+).mp4', source, 'height', default=360)) + if self._request_webpage(HEADRequest(source), video_id, f'Checking {height}p url', errnote=False): + formats.append({ + 'url': source, + 'format_id': f'{height}p', + 'height': height, + }) + self._sort_formats(formats) title = self._search_regex( (r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage, @@ -82,9 +113,9 @@ class VidLiiIE(InfoExtractor): default=None) or self._search_regex( r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - (r'<strong>(\d+)</strong> views', - r'Views\s*:\s*<strong>(\d+)</strong>'), + view_count = str_to_int(self._search_regex( + (r'<strong>([,0-9]+)</strong> views', + r'Views\s*:\s*<strong>([,0-9]+)</strong>'), webpage, 'view count', fatal=False)) comment_count = int_or_none(self._search_regex( @@ -109,11 +140,11 @@ class VidLiiIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, 'title': title, 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, + 'formats': formats, 'uploader_url': uploader_url, 'upload_date': upload_date, 'duration': duration, diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py index acb5ae550..6a3c5532d 100644 --- a/yt_dlp/extractor/viki.py +++ b/yt_dlp/extractor/viki.py @@ -135,9 +135,6 @@ class VikiIE(VikiBaseIE): 'uploader': 'FCC', 'upload_date': '20201127', }, - 'params': { - 'format': 'bestvideo', - }, }, { 'url': 'http://www.viki.com/videos/1023585v-heirs-episode-14', 'info_dict': { @@ -151,9 +148,6 @@ class VikiIE(VikiBaseIE): 'duration': 3570, 'episode_number': 14, }, - 'params': { - 'format': 'bestvideo', - }, 'skip': 'Blocked in the US', }, { # clip @@ -203,9 +197,6 @@ class VikiIE(VikiBaseIE): 'age_limit': 13, 'episode_number': 1, }, - 'params': { - 'format': 'bestvideo', - }, }, { # youtube external 'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -241,9 +232,6 @@ class VikiIE(VikiBaseIE): 'title': 
'Love In Magic', 'age_limit': 13, }, - 'params': { - 'format': 'bestvideo', - }, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py index 04c504934..e2b86662b 100644 --- a/yt_dlp/extractor/vimeo.py +++ b/yt_dlp/extractor/vimeo.py @@ -18,6 +18,7 @@ from ..utils import ( determine_ext, ExtractorError, get_element_by_class, + HEADRequest, js_to_json, int_or_none, merge_dicts, @@ -35,6 +36,7 @@ from ..utils import ( urlencode_postdata, urljoin, unescapeHTML, + urlhandle_detect_ext, ) @@ -229,27 +231,26 @@ class VimeoBaseInfoExtractor(InfoExtractor): query['unlisted_hash'] = unlisted_hash download_data = self._download_json( url, video_id, fatal=False, query=query, - headers={'X-Requested-With': 'XMLHttpRequest'}) - if download_data: - source_file = download_data.get('source_file') - if isinstance(source_file, dict): - download_url = source_file.get('download_url') - if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): - source_name = source_file.get('public_name', 'Original') - if self._is_valid_url(download_url, video_id, '%s video' % source_name): - ext = (try_get( - source_file, lambda x: x['extension'], - compat_str) or determine_ext( - download_url, None) or 'mp4').lower() - return { - 'url': download_url, - 'ext': ext, - 'width': int_or_none(source_file.get('width')), - 'height': int_or_none(source_file.get('height')), - 'filesize': parse_filesize(source_file.get('size')), - 'format_id': source_name, - 'quality': 1, - } + headers={'X-Requested-With': 'XMLHttpRequest'}, + expected_status=(403, 404)) or {} + source_file = download_data.get('source_file') + download_url = try_get(source_file, lambda x: x['download_url']) + if download_url and not source_file.get('is_cold') and not source_file.get('is_defrosting'): + source_name = source_file.get('public_name', 'Original') + if self._is_valid_url(download_url, video_id, '%s video' % source_name): + ext = (try_get( + source_file, 
lambda x: x['extension'], + compat_str) or determine_ext( + download_url, None) or 'mp4').lower() + return { + 'url': download_url, + 'ext': ext, + 'width': int_or_none(source_file.get('width')), + 'height': int_or_none(source_file.get('height')), + 'filesize': parse_filesize(source_file.get('size')), + 'format_id': source_name, + 'quality': 1, + } jwt_response = self._download_json( 'https://vimeo.com/_rv/viewer', video_id, note='Downloading jwt token', fatal=False) or {} @@ -258,15 +259,19 @@ class VimeoBaseInfoExtractor(InfoExtractor): headers = {'Authorization': 'jwt %s' % jwt_response['jwt']} original_response = self._download_json( f'https://api.vimeo.com/videos/{video_id}', video_id, - headers=headers, fatal=False) or {} - for download_data in original_response.get('download') or {}: + headers=headers, fatal=False, expected_status=(403, 404)) or {} + for download_data in original_response.get('download') or []: download_url = download_data.get('link') if not download_url or download_data.get('quality') != 'source': continue - query = parse_qs(download_url) + ext = determine_ext(parse_qs(download_url).get('filename', [''])[0].lower(), default_ext=None) + if not ext: + urlh = self._request_webpage( + HEADRequest(download_url), video_id, fatal=False, note='Determining source extension') + ext = urlh and urlhandle_detect_ext(urlh) return { 'url': download_url, - 'ext': determine_ext(query.get('filename', [''])[0].lower()), + 'ext': ext or 'unknown_video', 'format_id': download_data.get('public_name', 'Original'), 'width': int_or_none(download_data.get('width')), 'height': int_or_none(download_data.get('height')), @@ -291,7 +296,7 @@ class VimeoIE(VimeoBaseInfoExtractor): )? vimeo(?:pro)?\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) - (?:.*?/)? + (?:[^/]+/)*? 
(?: (?: play_redirect_hls| @@ -362,7 +367,6 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'format': 'best[protocol=https]', }, - 'expected_warnings': ['Unable to download JSON metadata'], }, { 'url': 'http://vimeo.com/68375962', @@ -402,7 +406,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'upload_date': '20130928', 'duration': 187, }, - 'expected_warnings': ['Unable to download JSON metadata'], + 'params': {'format': 'http-1080p'}, }, { 'url': 'http://vimeo.com/76979871', @@ -424,7 +428,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'es': [{'ext': 'vtt'}], 'fr': [{'ext': 'vtt'}], }, - } + }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], }, { # from https://www.ouya.tv/game/Pier-Solar-and-the-Great-Architects/ @@ -469,7 +474,6 @@ class VimeoIE(VimeoBaseInfoExtractor): 'description': 'md5:f2edc61af3ea7a5592681ddbb683db73', 'upload_date': '20200225', }, - 'expected_warnings': ['Unable to download JSON metadata'], }, { # only available via https://vimeo.com/channels/tributes/6213729 and @@ -491,7 +495,6 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, - 'expected_warnings': ['Unable to download JSON metadata'], }, { # redirects to ondemand extractor and should be passed through it @@ -511,7 +514,6 @@ class VimeoIE(VimeoBaseInfoExtractor): 'params': { 'skip_download': True, }, - 'expected_warnings': ['Unable to download JSON metadata'], 'skip': 'this page is no longer available.', }, { @@ -572,14 +574,55 @@ class VimeoIE(VimeoBaseInfoExtractor): 'only_matching': True, }, { + 'note': 'Direct URL with hash', 'url': 'https://vimeo.com/160743502/abd0e13fb4', - 'only_matching': True, + 'info_dict': { + 'id': '160743502', + 'ext': 'mp4', + 'uploader': 'Julian Tryba', + 'uploader_id': 'aliniamedia', + 'title': 'Harrisville New Hampshire', + 'timestamp': 1459259666, + 'upload_date': '20160329', + }, + 'params': {'skip_download': True}, + }, + { + 'url': 'https://vimeo.com/138909882', + 'info_dict': { + 'id': 
'138909882', + 'ext': 'mp4', + 'title': 'Eastnor Castle 2015 Firework Champions - The Promo!', + 'description': 'md5:5967e090768a831488f6e74b7821b3c1', + 'uploader_id': 'fireworkchampions', + 'uploader': 'Firework Champions', + 'upload_date': '20150910', + 'timestamp': 1441901895, + }, + 'params': { + 'skip_download': True, + 'format': 'Original', + }, + }, + { + 'url': 'https://vimeo.com/channels/staffpicks/143603739', + 'info_dict': { + 'id': '143603739', + 'ext': 'mp4', + 'uploader': 'Karim Huu Do', + 'timestamp': 1445846953, + 'upload_date': '20151026', + 'title': 'The Shoes - Submarine Feat. Blaine Harrison', + 'uploader_id': 'karimhd', + 'description': 'md5:8e2eea76de4504c2e8020a9bcfa1e843', + }, + 'params': {'skip_download': 'm3u8'}, }, { # requires passing unlisted_hash(a52724358e) to load_download_config request 'url': 'https://vimeo.com/392479337/a52724358e', 'only_matching': True, - } + }, # https://gettingthingsdone.com/workflowmap/ # vimeo embed with check-password page protected by Referer header ] @@ -708,7 +751,8 @@ class VimeoIE(VimeoBaseInfoExtractor): headers['Referer'] = url # Extract ID from URL - video_id, unlisted_hash = self._match_valid_url(url).groups() + mobj = self._match_valid_url(url).groupdict() + video_id, unlisted_hash = mobj['id'], mobj.get('unlisted_hash') if unlisted_hash: return self._extract_from_api(video_id, unlisted_hash) @@ -768,18 +812,19 @@ class VimeoIE(VimeoBaseInfoExtractor): timestamp = None video_description = None info_dict = {} + config_url = None channel_id = self._search_regex( r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None) if channel_id: config_url = self._html_search_regex( - r'\bdata-config-url="([^"]+)"', webpage, 'config URL') + r'\bdata-config-url="([^"]+)"', webpage, 'config URL', default=None) video_description = clean_html(get_element_by_class('description', webpage)) info_dict.update({ 'channel_id': channel_id, 'channel_url': 'https://vimeo.com/channels/' + channel_id, }) - else: + if 
not config_url: page_config = self._parse_json(self._search_regex( r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});', webpage, 'page config', default='{}'), video_id, fatal=False) @@ -1100,10 +1145,10 @@ class VimeoGroupsIE(VimeoChannelIE): IE_NAME = 'vimeo:group' _VALID_URL = r'https://vimeo\.com/groups/(?P<id>[^/]+)(?:/(?!videos?/\d+)|$)' _TESTS = [{ - 'url': 'https://vimeo.com/groups/kattykay', + 'url': 'https://vimeo.com/groups/meetup', 'info_dict': { - 'id': 'kattykay', - 'title': 'Katty Kay', + 'id': 'meetup', + 'title': 'Vimeo Meetup!', }, 'playlist_mincount': 27, }] @@ -1125,7 +1170,6 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): 'uploader_id': 'user21297594', 'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks", }, - 'expected_warnings': ['Unable to download JSON metadata'], }, { 'note': 'video player needs Referer', 'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053', diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index d8a9b9ab4..9a5c9ee6b 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -51,7 +51,7 @@ class VKBaseIE(InfoExtractor): self._apply_first_set_cookie_header(url_handle, 'remixlhk') login_page = self._download_webpage( - 'https://login.vk.com/?act=login', None, + 'https://vk.com/login', None, note='Logging in', data=urlencode_postdata(login_form)) @@ -471,6 +471,13 @@ class VKIE(VKBaseIE): }) self._sort_formats(formats) + subtitles = {} + for sub in data.get('subs') or {}: + subtitles.setdefault(sub.get('lang', 'en'), []).append({ + 'ext': sub.get('title', '.srt').split('.')[-1], + 'url': url_or_none(sub.get('url')), + }) + return { 'id': video_id, 'formats': formats, @@ -484,6 +491,7 @@ class VKIE(VKBaseIE): 'like_count': int_or_none(mv_data.get('likes')), 'comment_count': int_or_none(mv_data.get('commcount')), 'is_live': is_live, + 'subtitles': subtitles, } diff --git a/yt_dlp/extractor/vlive.py 
b/yt_dlp/extractor/vlive.py index 4340b1d4c..8fccf1b63 100644 --- a/yt_dlp/extractor/vlive.py +++ b/yt_dlp/extractor/vlive.py @@ -12,6 +12,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + LazyList, merge_dicts, str_or_none, strip_or_none, @@ -363,11 +364,10 @@ class VLiveChannelIE(VLiveBaseIE): if board.get('boardType') not in ('STAR', 'VLIVE_PLUS'): raise ExtractorError(f'Board {board_name!r} is not supported', expected=True) - entries = self._entries(posts_id or channel_id, board_name) - first_video = next(entries) - channel_name = first_video['channel'] + entries = LazyList(self._entries(posts_id or channel_id, board_name)) + channel_name = entries[0]['channel'] return self.playlist_result( - itertools.chain([first_video], entries), + entries, f'{channel_id}-{posts_id}' if posts_id else channel_id, f'{channel_name} - {board_name}' if channel_name and board_name else channel_name) diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py index 419602148..7bc55f333 100644 --- a/yt_dlp/extractor/vrv.py +++ b/yt_dlp/extractor/vrv.py @@ -19,6 +19,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + join_nonempty, traverse_obj, ) @@ -141,14 +142,10 @@ class VRVIE(VRVBaseIE): def _extract_vrv_formats(self, url, video_id, stream_format, audio_lang, hardsub_lang): if not url or stream_format not in ('hls', 'dash', 'adaptive_hls'): return [] - stream_id_list = [] - if audio_lang: - stream_id_list.append('audio-%s' % audio_lang) - if hardsub_lang: - stream_id_list.append('hardsub-%s' % hardsub_lang) - format_id = stream_format - if stream_id_list: - format_id += '-' + '-'.join(stream_id_list) + format_id = join_nonempty( + stream_format, + audio_lang and 'audio-%s' % audio_lang, + hardsub_lang and 'hardsub-%s' % hardsub_lang) if 'hls' in stream_format: adaptive_formats = self._extract_m3u8_formats( url, video_id, 'mp4', m3u8_id=format_id, diff --git a/yt_dlp/extractor/vupload.py b/yt_dlp/extractor/vupload.py index 
9846ababc..2229a6591 100644 --- a/yt_dlp/extractor/vupload.py +++ b/yt_dlp/extractor/vupload.py @@ -7,6 +7,7 @@ from ..utils import ( parse_filesize, extract_attributes, int_or_none, + js_to_json ) @@ -28,8 +29,11 @@ class VuploadIE(InfoExtractor): webpage = self._download_webpage(url, video_id) title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title') - video_e = self._html_search_regex(r'\|([a-z0-9]{60})\|', webpage, 'video') - video_url = f'https://wurize.megaupload.to/{video_e}/v.mp4' + video_json = self._parse_json(self._html_search_regex(r'sources:\s*(.+?]),', webpage, 'video'), video_id, transform_source=js_to_json) + formats = [] + for source in video_json: + if source['src'].endswith('.m3u8'): + formats.extend(self._extract_m3u8_formats(source['src'], video_id, m3u8_id='hls')) duration = parse_duration(self._html_search_regex( r'<i\s*class=["\']fad\s*fa-clock["\']></i>\s*([\d:]+)\s*</div>', webpage, 'duration', fatal=False)) filesize_approx = parse_filesize(self._html_search_regex( @@ -40,7 +44,7 @@ class VuploadIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'duration': duration, 'filesize_approx': filesize_approx, 'width': int_or_none(extra_video_info.get('width')), diff --git a/yt_dlp/extractor/wakanim.py b/yt_dlp/extractor/wakanim.py index a61a630e2..a70a71961 100644 --- a/yt_dlp/extractor/wakanim.py +++ b/yt_dlp/extractor/wakanim.py @@ -25,7 +25,6 @@ class WakanimIE(InfoExtractor): 'episode_number': 2, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { diff --git a/yt_dlp/extractor/watchbox.py b/yt_dlp/extractor/watchbox.py index 7469fe962..d19d80102 100644 --- a/yt_dlp/extractor/watchbox.py +++ b/yt_dlp/extractor/watchbox.py @@ -30,7 +30,6 @@ class WatchBoxIE(InfoExtractor): 'release_year': 2009, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to download m3u8 information'], @@ -52,7 +51,6 @@ class WatchBoxIE(InfoExtractor): 
'episode_number': 1, }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, 'expected_warnings': ['Failed to download m3u8 information'], diff --git a/yt_dlp/extractor/wdr.py b/yt_dlp/extractor/wdr.py index f54aa6ff9..d3229d8af 100644 --- a/yt_dlp/extractor/wdr.py +++ b/yt_dlp/extractor/wdr.py @@ -22,7 +22,11 @@ from ..utils import ( class WDRIE(InfoExtractor): - _VALID_URL = r'https?://deviceids-medp\.wdr\.de/ondemand/\d+/(?P<id>\d+)\.js' + _VALID_URL = r'''(?x)https?:// + (?:deviceids-medp\.wdr\.de/ondemand/\d+/| + kinder\.wdr\.de/(?!mediathek/)[^#?]+-) + (?P<id>\d+)\.(?:js|assetjsonp) + ''' _GEO_COUNTRIES = ['DE'] _TEST = { 'url': 'http://deviceids-medp.wdr.de/ondemand/155/1557833.js', diff --git a/yt_dlp/extractor/webcaster.py b/yt_dlp/extractor/webcaster.py index e4b65f54f..a858e992c 100644 --- a/yt_dlp/extractor/webcaster.py +++ b/yt_dlp/extractor/webcaster.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + join_nonempty, xpath_text, ) @@ -34,12 +35,9 @@ class WebcasterIE(InfoExtractor): title = xpath_text(video, './/event_name', 'event name', fatal=True) - def make_id(parts, separator): - return separator.join(filter(None, parts)) - formats = [] for format_id in (None, 'noise'): - track_tag = make_id(('track', format_id), '_') + track_tag = join_nonempty('track', format_id, delim='_') for track in video.findall('.//iphone/%s' % track_tag): track_url = track.text if not track_url: @@ -48,7 +46,7 @@ class WebcasterIE(InfoExtractor): m3u8_formats = self._extract_m3u8_formats( track_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=make_id(('hls', format_id), '-'), fatal=False) + m3u8_id=join_nonempty('hls', format_id, delim='-'), fatal=False) for f in m3u8_formats: f.update({ 'source_preference': 0 if format_id == 'noise' else 1, diff --git a/yt_dlp/extractor/willow.py b/yt_dlp/extractor/willow.py new file mode 100644 index 000000000..4d3d62f95 --- /dev/null +++ 
b/yt_dlp/extractor/willow.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from ..utils import ExtractorError +from .common import InfoExtractor + + +class WillowIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?willow\.tv/videos/(?P<id>[0-9a-z-_]+)' + _GEO_COUNTRIES = ['US'] + + _TESTS = [{ + 'url': 'http://willow.tv/videos/d5winning-moment-eng-vs-ind-streaming-online-4th-test-india-tour-of-england-2021', + 'info_dict': { + 'id': '169662', + 'display_id': 'd5winning-moment-eng-vs-ind-streaming-online-4th-test-india-tour-of-england-2021', + 'ext': 'mp4', + 'title': 'Winning Moment: 4th Test, England vs India', + 'thumbnail': 'https://aimages.willow.tv/ytThumbnails/6748_D5winning_moment.jpg', + 'duration': 233, + 'timestamp': 1630947954, + 'upload_date': '20210906', + 'location': 'Kennington Oval, London', + 'series': 'India tour of England 2021', + }, + 'params': { + 'skip_download': True, # AES-encrypted m3u8 + }, + }, { + 'url': 'http://willow.tv/videos/highlights-short-ind-vs-nz-streaming-online-2nd-t20i-new-zealand-tour-of-india-2021', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + video_data = self._parse_json(self._html_search_regex( + r'var\s+data_js\s*=\s*JSON\.parse\(\'(.+)\'\)', webpage, + 'data_js'), video_id) + + video = next((v for v in video_data.get('trending_videos') or [] + if v.get('secureurl')), None) + if not video: + raise ExtractorError('No videos found') + + formats = self._extract_m3u8_formats(video['secureurl'], video_id, 'mp4') + self._sort_formats(formats) + + return { + 'id': str(video.get('content_id')), + 'display_id': video.get('video_slug'), + 'title': video.get('video_name') or self._html_search_meta('twitter:title', webpage), + 'formats': formats, + 'thumbnail': video.get('yt_thumb_url') or self._html_search_meta( + 'twitter:image', webpage, default=None), + 'duration': video.get('duration_seconds'), + 'timestamp': 
video.get('created_date'), + 'location': video.get('venue'), + 'series': video.get('series_name'), + } diff --git a/yt_dlp/extractor/wppilot.py b/yt_dlp/extractor/wppilot.py new file mode 100644 index 000000000..3003a0f10 --- /dev/null +++ b/yt_dlp/extractor/wppilot.py @@ -0,0 +1,177 @@ +# coding: utf-8 + +from .common import InfoExtractor +from ..utils import ( + try_get, + ExtractorError, +) + +import json +import random +import re + + +class WPPilotBaseIE(InfoExtractor): + _VIDEO_URL = 'https://pilot.wp.pl/api/v1/channel/%s' + _VIDEO_GUEST_URL = 'https://pilot.wp.pl/api/v1/guest/channel/%s' + + _HEADERS_WEB = { + 'Content-Type': 'application/json; charset=UTF-8', + 'Referer': 'https://pilot.wp.pl/tv/', + } + + def _get_channel_list(self, cache=True): + if cache is True: + cache_res = self._downloader.cache.load('wppilot', 'channel-list') + if cache_res: + return cache_res, True + webpage = self._download_webpage('https://pilot.wp.pl/tv/', None, 'Downloading webpage') + page_data_base_url = self._search_regex( + r'<script src="(https://wp-pilot-gatsby\.wpcdn\.pl/v[\d.-]+/desktop)', + webpage, 'gatsby build version') + '/page-data' + page_data = self._download_json(f'{page_data_base_url}/tv/page-data.json', None, 'Downloading page data') + for qhash in page_data['staticQueryHashes']: + qhash_content = self._download_json( + f'{page_data_base_url}/sq/d/{qhash}.json', None, + 'Searching for channel list') + channel_list = try_get(qhash_content, lambda x: x['data']['allChannels']['nodes']) + if channel_list is None: + continue + self._downloader.cache.store('wppilot', 'channel-list', channel_list) + return channel_list, False + raise ExtractorError('Unable to find the channel list') + + def _parse_channel(self, chan): + return { + 'id': str(chan['id']), + 'title': chan['name'], + 'is_live': True, + 'thumbnails': [{ + 'id': key, + 'url': chan[key], + } for key in ('thumbnail', 'thumbnail_mobile', 'icon') if chan.get(key)], + } + + +class WPPilotIE(WPPilotBaseIE): + 
_VALID_URL = r'(?:https?://pilot\.wp\.pl/tv/?#|wppilot:)(?P<id>[a-z\d-]+)' + IE_NAME = 'wppilot' + + _TESTS = [{ + 'url': 'https://pilot.wp.pl/tv/#telewizja-wp-hd', + 'info_dict': { + 'id': '158', + 'ext': 'mp4', + 'title': 'Telewizja WP HD', + }, + 'params': { + 'format': 'bestvideo', + }, + }, { + # audio only + 'url': 'https://pilot.wp.pl/tv/#radio-nowy-swiat', + 'info_dict': { + 'id': '238', + 'ext': 'm4a', + 'title': 'Radio Nowy Świat', + }, + 'params': { + 'format': 'bestaudio', + }, + }, { + 'url': 'wppilot:9', + 'only_matching': True, + }] + + def _get_channel(self, id_or_slug): + video_list, is_cached = self._get_channel_list(cache=True) + key = 'id' if re.match(r'^\d+$', id_or_slug) else 'slug' + for video in video_list: + if video.get(key) == id_or_slug: + return self._parse_channel(video) + # if cached channel not found, download and retry + if is_cached: + video_list, _ = self._get_channel_list(cache=False) + for video in video_list: + if video.get(key) == id_or_slug: + return self._parse_channel(video) + raise ExtractorError('Channel not found') + + def _real_extract(self, url): + video_id = self._match_id(url) + + channel = self._get_channel(video_id) + video_id = str(channel['id']) + + is_authorized = next((c for c in self._downloader.cookiejar if c.name == 'netviapisessid'), None) + # cookies starting with "g:" are assigned to guests + is_authorized = True if is_authorized is not None and not is_authorized.value.startswith('g:') else False + + video = self._download_json( + (self._VIDEO_URL if is_authorized else self._VIDEO_GUEST_URL) % video_id, + video_id, query={ + 'device_type': 'web', + }, headers=self._HEADERS_WEB, + expected_status=(200, 422)) + + stream_token = try_get(video, lambda x: x['_meta']['error']['info']['stream_token']) + if stream_token: + close = self._download_json( + 'https://pilot.wp.pl/api/v1/channels/close', video_id, + 'Invalidating previous stream session', headers=self._HEADERS_WEB, + data=json.dumps({ + 'channelId': 
video_id, + 't': stream_token, + }).encode('utf-8')) + if try_get(close, lambda x: x['data']['status']) == 'ok': + return self.url_result(url, ie=WPPilotIE.ie_key()) + + formats = [] + + for fmt in video['data']['stream_channel']['streams']: + # live DASH does not work for now + # if fmt['type'] == 'dash@live:abr': + # formats.extend( + # self._extract_mpd_formats( + # random.choice(fmt['url']), video_id)) + if fmt['type'] == 'hls@live:abr': + formats.extend( + self._extract_m3u8_formats( + random.choice(fmt['url']), + video_id, live=True)) + + self._sort_formats(formats) + + channel['formats'] = formats + return channel + + +class WPPilotChannelsIE(WPPilotBaseIE): + _VALID_URL = r'(?:https?://pilot\.wp\.pl/(?:tv/?)?(?:\?[^#]*)?#?|wppilot:)$' + IE_NAME = 'wppilot:channels' + + _TESTS = [{ + 'url': 'wppilot:', + 'info_dict': { + 'id': 'wppilot', + 'title': 'WP Pilot', + }, + 'playlist_mincount': 100, + }, { + 'url': 'https://pilot.wp.pl/', + 'only_matching': True, + }] + + def _entries(self): + channel_list, _ = self._get_channel_list() + for chan in channel_list: + entry = self._parse_channel(chan) + entry.update({ + '_type': 'url_transparent', + 'url': f'wppilot:{chan["id"]}', + 'ie_key': WPPilotIE.ie_key(), + }) + yield entry + + def _real_extract(self, url): + return self.playlist_result(self._entries(), 'wppilot', 'WP Pilot') diff --git a/yt_dlp/extractor/xvideos.py b/yt_dlp/extractor/xvideos.py index 8fc64914c..ab07f01af 100644 --- a/yt_dlp/extractor/xvideos.py +++ b/yt_dlp/extractor/xvideos.py @@ -19,7 +19,7 @@ class XVideosIE(InfoExtractor): (?: (?:[^/]+\.)?xvideos2?\.com/video| (?:www\.)?xvideos\.es/video| - flashservice\.xvideos\.com/embedframe/| + (?:www|flashservice)\.xvideos\.com/embedframe/| static-hw\.xvideos\.com/swf/xv-player\.swf\?.*?\bid_video= ) (?P<id>[0-9]+) @@ -38,6 +38,9 @@ class XVideosIE(InfoExtractor): 'url': 'https://flashservice.xvideos.com/embedframe/4588838', 'only_matching': True, }, { + 'url': 
'https://www.xvideos.com/embedframe/4588838', + 'only_matching': True, + }, { 'url': 'http://static-hw.xvideos.com/swf/xv-player.swf?id_video=4588838', 'only_matching': True, }, { @@ -80,9 +83,7 @@ class XVideosIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage( - 'https://www.xvideos.com/video%s/' % video_id, video_id) + webpage = self._download_webpage(url, video_id) mobj = re.search(r'<h1 class="inlineError">(.+?)</h1>', webpage) if mobj: diff --git a/yt_dlp/extractor/yandexvideo.py b/yt_dlp/extractor/yandexvideo.py index 9974d65d6..67095f2fd 100644 --- a/yt_dlp/extractor/yandexvideo.py +++ b/yt_dlp/extractor/yandexvideo.py @@ -7,6 +7,7 @@ import re from .common import InfoExtractor from ..utils import ( determine_ext, + extract_attributes, int_or_none, try_get, url_or_none, @@ -148,7 +149,7 @@ class YandexVideoIE(InfoExtractor): class ZenYandexIE(InfoExtractor): - _VALID_URL = r'https?://zen\.yandex\.ru/media/(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-(?P<id>[a-z0-9-]+)' + _VALID_URL = r'https?://zen\.yandex\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P<id>[a-z0-9-]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/media/popmech/izverjenie-vulkana-iz-spichek-zreliscnyi-opyt-6002240ff8b1af50bb2da5e3', 'info_dict': { @@ -156,19 +157,38 @@ class ZenYandexIE(InfoExtractor): 'ext': 'mp4', 'title': 'Извержение вулкана из спичек: зрелищный опыт', 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633', - 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/3558619/pub_6002240ff8b1af50bb2da5e3_600bad814d953e4132a30b5e/orig', + 'thumbnail': 're:^https://avatars.mds.yandex.net/', 'uploader': 'Популярная механика', }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7', 'info_dict': { 'id': '60c7c443da18892ebfe85ed7', 'ext': 'mp4', 'title': 'ВОТ ЭТО Focus. 
Деды Морозы на гидроциклах', - 'description': 'md5:8684912f6086f298f8078d4af0e8a600', - 'thumbnail': 'https://avatars.mds.yandex.net/get-zen-pub-og/4410519/pub_60c7c443da18892ebfe85ed7_60c7c48e060a163121f42cc3/orig', + 'description': 'md5:f3db3d995763b9bbb7b56d4ccdedea89', + 'thumbnail': 're:^https://avatars.mds.yandex.net/', 'uploader': 'AcademeG DailyStream' }, + 'params': { + 'skip_download': 'm3u8', + 'format': 'bestvideo', + }, + }, { + 'url': 'https://zen.yandex.ru/video/watch/6002240ff8b1af50bb2da5e3', + 'info_dict': { + 'id': '6002240ff8b1af50bb2da5e3', + 'ext': 'mp4', + 'title': 'Извержение вулкана из спичек: зрелищный опыт', + 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633', + 'uploader': 'Популярная механика', + }, + 'params': { + 'skip_download': 'm3u8', + }, }, { 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/novyi-samsung-fold-3-moskvich-barahlit-612f93b7f8d48e7e945792a2?from=channel&rid=2286618386.482.1630817595976.42360', 'only_matching': True, @@ -177,23 +197,37 @@ class ZenYandexIE(InfoExtractor): def _real_extract(self, url): id = self._match_id(url) webpage = self._download_webpage(url, id) - data_json = self._parse_json(self._search_regex(r'w\._data\s?=\s?({.+?});', webpage, 'metadata'), id) - stream_json = try_get(data_json, lambda x: x['publication']['content']['gifContent'], dict) - stream_url = stream_json.get('stream') or try_get(stream_json, lambda x: x['streams']['url']) - formats = self._extract_m3u8_formats(stream_url, id) + data_json = self._parse_json( + self._search_regex(r'data\s*=\s*({["\']_*serverState_*video.+?});', webpage, 'metadata'), id) + serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', + webpage, 'server state').replace('State', 'Settings') + uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)', + webpage, 'uploader', default='<a>') + uploader_name = extract_attributes(uploader).get('aria-label') + video_json = try_get(data_json, lambda x: 
x[serverstate]['exportData']['video'], dict) + stream_urls = try_get(video_json, lambda x: x['video']['streams']) + formats = [] + for s_url in stream_urls: + ext = determine_ext(s_url) + if ext == 'mpd': + formats.extend(self._extract_mpd_formats(s_url, id, mpd_id='dash')) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats(s_url, id, 'mp4')) self._sort_formats(formats) return { 'id': id, - 'title': try_get(data_json, (lambda x: x['og']['title'], lambda x: x['publication']['content']['preview']['title'])), - 'uploader': data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), - 'description': try_get(data_json, lambda x: x['og']['description']), - 'thumbnail': try_get(data_json, lambda x: x['og']['imageUrl']), + 'title': video_json.get('title') or self._og_search_title(webpage), 'formats': formats, + 'duration': int_or_none(video_json.get('duration')), + 'view_count': int_or_none(video_json.get('views')), + 'uploader': uploader_name or data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), + 'description': self._og_search_description(webpage) or try_get(data_json, lambda x: x['og']['description']), + 'thumbnail': self._og_search_thumbnail(webpage) or try_get(data_json, lambda x: x['og']['imageUrl']), } class ZenYandexChannelIE(InfoExtractor): - _VALID_URL = r'https?://zen\.yandex\.ru/(?!media)(?:id/)?(?P<id>[a-z0-9-_]+)' + _VALID_URL = r'https?://zen\.yandex\.ru/(?!media|video)(?:id/)?(?P<id>[a-z0-9-_]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/tok_media', 'info_dict': { diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 658b45fe1..ba135613b 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -9,6 +9,7 @@ import datetime import hashlib import itertools import json +import math import os.path import random import re @@ -28,6 +29,7 @@ from ..compat import ( ) from ..jsinterp import JSInterpreter from ..utils import ( + bug_reports_message, 
bytes_to_intlist, clean_html, datetime_from_str, @@ -39,8 +41,10 @@ from ..utils import ( int_or_none, intlist_to_bytes, is_html, + join_nonempty, mimetype2ext, network_exceptions, + NO_DEFAULT, orderedSet, parse_codecs, parse_count, @@ -65,6 +69,10 @@ from ..utils import ( ) +def get_first(obj, keys, **kwargs): + return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) + + # any clients starting with _ cannot be explicity requested by the user INNERTUBE_CLIENTS = { 'web': { @@ -258,6 +266,70 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False + _INVIDIOUS_SITES = ( + # invidious-redirect websites + r'(?:www\.)?redirect\.invidious\.io', + r'(?:(?:www|dev)\.)?invidio\.us', + # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md + r'(?:www\.)?invidious\.pussthecat\.org', + r'(?:www\.)?invidious\.zee\.li', + r'(?:www\.)?invidious\.ethibox\.fr', + r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', + # youtube-dl invidious instances list + r'(?:(?:www|no)\.)?invidiou\.sh', + r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', + r'(?:www\.)?invidious\.kabi\.tk', + r'(?:www\.)?invidious\.mastodon\.host', + r'(?:www\.)?invidious\.zapashcanon\.fr', + r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', + r'(?:www\.)?invidious\.tinfoil-hat\.net', + r'(?:www\.)?invidious\.himiko\.cloud', + r'(?:www\.)?invidious\.reallyancient\.tech', + r'(?:www\.)?invidious\.tube', + r'(?:www\.)?invidiou\.site', + r'(?:www\.)?invidious\.site', + r'(?:www\.)?invidious\.xyz', + r'(?:www\.)?invidious\.nixnet\.xyz', + r'(?:www\.)?invidious\.048596\.xyz', + r'(?:www\.)?invidious\.drycat\.fr', + r'(?:www\.)?inv\.skyn3t\.in', + r'(?:www\.)?tube\.poal\.co', + r'(?:www\.)?tube\.connect\.cafe', + r'(?:www\.)?vid\.wxzm\.sx', + r'(?:www\.)?vid\.mint\.lgbt', + r'(?:www\.)?vid\.puffyan\.us', + r'(?:www\.)?yewtu\.be', + 
r'(?:www\.)?yt\.elukerio\.org', + r'(?:www\.)?yt\.lelux\.fi', + r'(?:www\.)?invidious\.ggc-project\.de', + r'(?:www\.)?yt\.maisputain\.ovh', + r'(?:www\.)?ytprivate\.com', + r'(?:www\.)?invidious\.13ad\.de', + r'(?:www\.)?invidious\.toot\.koeln', + r'(?:www\.)?invidious\.fdn\.fr', + r'(?:www\.)?watch\.nettohikari\.com', + r'(?:www\.)?invidious\.namazso\.eu', + r'(?:www\.)?invidious\.silkky\.cloud', + r'(?:www\.)?invidious\.exonip\.de', + r'(?:www\.)?invidious\.riverside\.rocks', + r'(?:www\.)?invidious\.blamefran\.net', + r'(?:www\.)?invidious\.moomoo\.de', + r'(?:www\.)?ytb\.trom\.tf', + r'(?:www\.)?yt\.cyberhost\.uk', + r'(?:www\.)?kgg2m7yk5aybusll\.onion', + r'(?:www\.)?qklhadlycap4cnod\.onion', + r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', + r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', + r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', + r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', + r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', + r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', + r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', + r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', + r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', + r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', + ) + def _login(self): """ Attempt to log in to YouTube. 
@@ -437,9 +509,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): Extracts visitorData from an API response or ytcfg Appears to be used to track session state """ - return traverse_obj( - args, (..., ('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))), - expected_type=compat_str, get_all=False) + return get_first( + args, (('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))), + expected_type=str) @property def is_authenticated(self): @@ -696,69 +768,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor): class YoutubeIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube' - _INVIDIOUS_SITES = ( - # invidious-redirect websites - r'(?:www\.)?redirect\.invidious\.io', - r'(?:(?:www|dev)\.)?invidio\.us', - # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md - r'(?:www\.)?invidious\.pussthecat\.org', - r'(?:www\.)?invidious\.zee\.li', - r'(?:www\.)?invidious\.ethibox\.fr', - r'(?:www\.)?invidious\.3o7z6yfxhbw7n3za4rss6l434kmv55cgw2vuziwuigpwegswvwzqipyd\.onion', - # youtube-dl invidious instances list - r'(?:(?:www|no)\.)?invidiou\.sh', - r'(?:(?:www|fi)\.)?invidious\.snopyta\.org', - r'(?:www\.)?invidious\.kabi\.tk', - r'(?:www\.)?invidious\.mastodon\.host', - r'(?:www\.)?invidious\.zapashcanon\.fr', - r'(?:www\.)?(?:invidious(?:-us)?|piped)\.kavin\.rocks', - r'(?:www\.)?invidious\.tinfoil-hat\.net', - r'(?:www\.)?invidious\.himiko\.cloud', - r'(?:www\.)?invidious\.reallyancient\.tech', - r'(?:www\.)?invidious\.tube', - r'(?:www\.)?invidiou\.site', - r'(?:www\.)?invidious\.site', - r'(?:www\.)?invidious\.xyz', - r'(?:www\.)?invidious\.nixnet\.xyz', - r'(?:www\.)?invidious\.048596\.xyz', - r'(?:www\.)?invidious\.drycat\.fr', - r'(?:www\.)?inv\.skyn3t\.in', - r'(?:www\.)?tube\.poal\.co', - r'(?:www\.)?tube\.connect\.cafe', - r'(?:www\.)?vid\.wxzm\.sx', - r'(?:www\.)?vid\.mint\.lgbt', - r'(?:www\.)?vid\.puffyan\.us', - 
r'(?:www\.)?yewtu\.be', - r'(?:www\.)?yt\.elukerio\.org', - r'(?:www\.)?yt\.lelux\.fi', - r'(?:www\.)?invidious\.ggc-project\.de', - r'(?:www\.)?yt\.maisputain\.ovh', - r'(?:www\.)?ytprivate\.com', - r'(?:www\.)?invidious\.13ad\.de', - r'(?:www\.)?invidious\.toot\.koeln', - r'(?:www\.)?invidious\.fdn\.fr', - r'(?:www\.)?watch\.nettohikari\.com', - r'(?:www\.)?invidious\.namazso\.eu', - r'(?:www\.)?invidious\.silkky\.cloud', - r'(?:www\.)?invidious\.exonip\.de', - r'(?:www\.)?invidious\.riverside\.rocks', - r'(?:www\.)?invidious\.blamefran\.net', - r'(?:www\.)?invidious\.moomoo\.de', - r'(?:www\.)?ytb\.trom\.tf', - r'(?:www\.)?yt\.cyberhost\.uk', - r'(?:www\.)?kgg2m7yk5aybusll\.onion', - r'(?:www\.)?qklhadlycap4cnod\.onion', - r'(?:www\.)?axqzx4s6s54s32yentfqojs3x5i7faxza6xo3ehd4bzzsg2ii4fv2iid\.onion', - r'(?:www\.)?c7hqkpkpemu6e7emz5b4vyz7idjgdvgaaa3dyimmeojqbgpea3xqjoid\.onion', - r'(?:www\.)?fz253lmuao3strwbfbmx46yu7acac2jz27iwtorgmbqlkurlclmancad\.onion', - r'(?:www\.)?invidious\.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd\.onion', - r'(?:www\.)?owxfohz4kjyv25fvlqilyxast7inivgiktls3th44jhk3ej3i7ya\.b32\.i2p', - r'(?:www\.)?4l2dgddgsrkf2ous66i6seeyi6etzfgrue332grh2n7madpwopotugyd\.onion', - r'(?:www\.)?w6ijuptxiku4xpnnaetxvnkc5vqcdu7mgns2u77qefoixi63vbvnpnqd\.onion', - r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', - r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', - r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', - ) _VALID_URL = r"""(?x)^ ( (?:https?://|//) # http(s):// or protocol-independent URL @@ -792,7 +801,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?P<id>[0-9A-Za-z_-]{11}) # here is it! the YouTube video ID (?(1).+)? 
# if we found the ID, everything can follow (?:\#|$)""" % { - 'invidious': '|'.join(_INVIDIOUS_SITES), + 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), } _PLAYER_INFO_RE = ( r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', @@ -1666,7 +1675,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # shorts 'url': 'https://www.youtube.com/shorts/BGQWPY4IigY', 'only_matching': True, - }, + }, { + 'note': 'Storyboards', + 'url': 'https://www.youtube.com/watch?v=5KLPxDtMqe8', + 'info_dict': { + 'id': '5KLPxDtMqe8', + 'ext': 'mhtml', + 'format_id': 'sb0', + 'title': 'Your Brain is Plastic', + 'uploader_id': 'scishow', + 'description': 'md5:89cd86034bdb5466cd87c6ba206cd2bc', + 'upload_date': '20140324', + 'uploader': 'SciShow', + }, 'params': {'format': 'mhtml', 'skip_download': True} + } ] @classmethod @@ -1720,7 +1742,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError('Cannot identify player %r' % player_url) return id_m.group('id') - def _load_player(self, video_id, player_url, fatal=True) -> bool: + def _load_player(self, video_id, player_url, fatal=True): player_id = self._extract_player_info(player_url) if player_id not in self._code_cache: code = self._download_webpage( @@ -1729,7 +1751,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): errnote='Download of %s failed' % player_url) if code: self._code_cache[player_id] = code - return player_id in self._code_cache + return self._code_cache.get(player_id) def _extract_signature_function(self, video_id, player_url, example_sig): player_id = self._extract_player_info(player_url) @@ -1743,8 +1765,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if cache_spec is not None: return lambda s: ''.join(s[i] for i in cache_spec) - if self._load_player(video_id, player_url): - code = self._code_cache[player_id] + code = self._load_player(video_id, player_url) + if code: res = self._parse_sig_js(code) test_string = ''.join(map(compat_chr, range(len(example_sig)))) @@ -1755,6 +1777,9 @@ class 
YoutubeIE(YoutubeBaseInfoExtractor): return res def _print_sig_code(self, func, example_sig): + if not self.get_param('youtube_print_sig_code'): + return + def gen_sig_code(idxs): def _genslice(start, end, step): starts = '' if start == 0 else str(start) @@ -1831,13 +1856,58 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) self._player_cache[player_id] = func func = self._player_cache[player_id] - if self.get_param('youtube_print_sig_code'): - self._print_sig_code(func, s) + self._print_sig_code(func, s) return func(s) except Exception as e: - tb = traceback.format_exc() - raise ExtractorError( - 'Signature extraction failed: ' + tb, cause=e) + raise ExtractorError('Signature extraction failed: ' + traceback.format_exc(), cause=e) + + def _decrypt_nsig(self, s, video_id, player_url): + """Turn the encrypted n field into a working signature""" + if player_url is None: + raise ExtractorError('Cannot decrypt nsig without player_url') + if player_url.startswith('//'): + player_url = 'https:' + player_url + elif not re.match(r'https?://', player_url): + player_url = compat_urlparse.urljoin( + 'https://www.youtube.com', player_url) + + sig_id = ('nsig_value', s) + if sig_id in self._player_cache: + return self._player_cache[sig_id] + + try: + player_id = ('nsig', player_url) + if player_id not in self._player_cache: + self._player_cache[player_id] = self._extract_n_function(video_id, player_url) + func = self._player_cache[player_id] + self._player_cache[sig_id] = func(s) + self.write_debug(f'Decrypted nsig {s} => {self._player_cache[sig_id]}') + return self._player_cache[sig_id] + except Exception as e: + raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + + def _extract_n_function_name(self, jscode): + return self._search_regex( + (r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]{3})\([a-zA-Z0-9]\)',), + jscode, 'Initial JS player n function name', group='nfunc') + + def _extract_n_function(self, video_id, player_url): + player_id = 
self._extract_player_info(player_url) + func_code = self._downloader.cache.load('youtube-nsig', player_id) + + if func_code: + jsi = JSInterpreter(func_code) + else: + jscode = self._load_player(video_id, player_url) + funcname = self._extract_n_function_name(jscode) + jsi = JSInterpreter(jscode) + func_code = jsi.extract_function_code(funcname) + self._downloader.cache.store('youtube-nsig', player_id, func_code) + + if self.get_param('youtube_print_sig_code'): + self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') + + return lambda s: jsi.extract_function_from_code(*func_code)([s]) def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): """ @@ -1856,18 +1926,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError(error_msg) self.report_warning(error_msg) return - if self._load_player(video_id, player_url, fatal=fatal): - player_id = self._extract_player_info(player_url) - code = self._code_cache[player_id] + code = self._load_player(video_id, player_url, fatal=fatal) + if code: sts = int_or_none(self._search_regex( r'(?:signatureTimestamp|sts)\s*:\s*(?P<sts>[0-9]{5})', code, 'JS player signature timestamp', group='sts', fatal=fatal)) return sts def _mark_watched(self, video_id, player_responses): - playback_url = traverse_obj( - player_responses, (..., 'playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'), - expected_type=url_or_none, get_all=False) + playback_url = get_first( + player_responses, ('playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'), + expected_type=url_or_none) if not playback_url: self.report_warning('Unable to mark watched') return @@ -2290,18 +2359,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _get_requested_clients(self, url, smuggled_data): requested_clients = [] + default = ['android', 'web'] allowed_clients = sorted( [client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'], key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) 
for client in self._configuration_arg('player_client'): if client in allowed_clients: requested_clients.append(client) + elif client == 'default': + requested_clients.extend(default) elif client == 'all': requested_clients.extend(allowed_clients) else: self.report_warning(f'Skipping unsupported client {client}') if not requested_clients: - requested_clients = ['android', 'web'] + requested_clients = default if smuggled_data.get('is_music_url') or self.is_music_url(url): requested_clients.extend( @@ -2387,7 +2459,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return prs, player_url def _extract_formats(self, streaming_data, video_id, player_url, is_live): - itags, stream_ids = [], [] + itags, stream_ids = {}, [] itag_qualities, res_qualities = {}, {} q = qualities([ # Normally tiny is the smallest video-only formats. But @@ -2440,8 +2512,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor): sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' fmt_url += '&' + sp + '=' + signature + query = parse_qs(fmt_url) + throttled = False + if query.get('ratebypass') != ['yes'] and query.get('n'): + try: + fmt_url = update_url_query(fmt_url, { + 'n': self._decrypt_nsig(query['n'][0], video_id, player_url)}) + except ExtractorError as e: + self.report_warning( + f'nsig extraction failed: You may experience throttling for some formats\n' + f'n = {query["n"][0]} ; player = {player_url}\n{e}', only_once=True) + throttled = True + if itag: - itags.append(itag) + itags[itag] = 'https' stream_ids.append(stream_id) tbr = float_or_none( @@ -2450,11 +2534,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), 'format_id': itag, - 'format_note': ', '.join(filter(None, ( + 'format_note': join_nonempty( '%s%s' % (audio_track.get('displayName') or '', ' (default)' if audio_track.get('audioIsDefault') else ''), - fmt.get('qualityLabel') or quality.replace('audio_quality_', '')))), - 'fps': 
int_or_none(fmt.get('fps')), + fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), + throttled and 'THROTTLED', delim=', '), + 'source_preference': -10 if throttled else -1, + 'fps': int_or_none(fmt.get('fps')) or None, 'height': height, 'quality': q(quality), 'tbr': tbr, @@ -2489,46 +2575,71 @@ class YoutubeIE(YoutubeBaseInfoExtractor): and 'dash' not in skip_manifests and self.get_param('youtube_include_dash_manifest', True)) get_hls = 'hls' not in skip_manifests and self.get_param('youtube_include_hls_manifest', True) - def guess_quality(f): - for val, qdict in ((f.get('format_id'), itag_qualities), (f.get('height'), res_qualities)): - if val in qdict: - return q(qdict[val]) - return -1 + def process_manifest_format(f, proto, itag): + if itag in itags: + if itags[itag] == proto or f'{itag}-{proto}' in itags: + return False + itag = f'{itag}-{proto}' + if itag: + f['format_id'] = itag + itags[itag] = proto + + f['quality'] = next(( + q(qdict[val]) + for val, qdict in ((f.get('format_id', '').split('-')[0], itag_qualities), (f.get('height'), res_qualities)) + if val in qdict), -1) + return True for sd in streaming_data: hls_manifest_url = get_hls and sd.get('hlsManifestUrl') if hls_manifest_url: for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False): - itag = self._search_regex( - r'/itag/(\d+)', f['url'], 'itag', default=None) - if itag in itags: - itag += '-hls' - if itag in itags: - continue - if itag: - f['format_id'] = itag - itags.append(itag) - f['quality'] = guess_quality(f) - yield f + if process_manifest_format(f, 'hls', self._search_regex( + r'/itag/(\d+)', f['url'], 'itag', default=None)): + yield f dash_manifest_url = get_dash and sd.get('dashManifestUrl') if dash_manifest_url: for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False): - itag = f['format_id'] - if itag in itags: - itag += '-dash' - if itag in itags: - continue - if itag: - f['format_id'] = itag - itags.append(itag) - 
f['quality'] = guess_quality(f) - filesize = int_or_none(self._search_regex( - r'/clen/(\d+)', f.get('fragment_base_url') - or f['url'], 'file size', default=None)) - if filesize: - f['filesize'] = filesize - yield f + if process_manifest_format(f, 'dash', f['format_id']): + f['filesize'] = int_or_none(self._search_regex( + r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) + yield f + + def _extract_storyboard(self, player_responses, duration): + spec = get_first( + player_responses, ('storyboards', 'playerStoryboardSpecRenderer', 'spec'), default='').split('|')[::-1] + if not spec: + return + base_url = spec.pop() + L = len(spec) - 1 + for i, args in enumerate(spec): + args = args.split('#') + counts = list(map(int_or_none, args[:5])) + if len(args) != 8 or not all(counts): + self.report_warning(f'Malformed storyboard {i}: {"#".join(args)}{bug_reports_message()}') + continue + width, height, frame_count, cols, rows = counts + N, sigh = args[6:] + + url = base_url.replace('$L', str(L - i)).replace('$N', N) + f'&sigh={sigh}' + fragment_count = frame_count / (cols * rows) + fragment_duration = duration / fragment_count + yield { + 'format_id': f'sb{i}', + 'format_note': 'storyboard', + 'ext': 'mhtml', + 'protocol': 'mhtml', + 'acodec': 'none', + 'vcodec': 'none', + 'url': url, + 'width': width, + 'height': height, + 'fragments': [{ + 'path': url.replace('$M', str(j)), + 'duration': min(fragment_duration, duration - (j * fragment_duration)), + } for j in range(math.ceil(fragment_count))], + } def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -2547,8 +2658,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self._get_requested_clients(url, smuggled_data), video_id, webpage, master_ytcfg) - get_first = lambda obj, keys, **kwargs: traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) - playability_statuses = traverse_obj( player_responses, (..., 'playabilityStatus'), expected_type=dict, default=[]) @@ 
-2574,49 +2683,48 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or search_meta(['og:title', 'twitter:title', 'title'])) video_description = get_first(video_details, 'shortDescription') - if not smuggled_data.get('force_singlefeed', False): - if not self.get_param('noplaylist'): - multifeed_metadata_list = get_first( - player_responses, - ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'), - expected_type=str) - if multifeed_metadata_list: - entries = [] - feed_ids = [] - for feed in multifeed_metadata_list.split(','): - # Unquote should take place before split on comma (,) since textual - # fields may contain comma as well (see - # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs( - compat_urllib_parse_unquote_plus(feed)) - - def feed_entry(name): - return try_get( - feed_data, lambda x: x[name][0], compat_str) - - feed_id = feed_entry('id') - if not feed_id: - continue - feed_title = feed_entry('title') - title = video_title - if feed_title: - title += ' (%s)' % feed_title - entries.append({ - '_type': 'url_transparent', - 'ie_key': 'Youtube', - 'url': smuggle_url( - '%swatch?v=%s' % (base_url, feed_data['id'][0]), - {'force_singlefeed': True}), - 'title': title, - }) - feed_ids.append(feed_id) - self.to_screen( - 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' - % (', '.join(feed_ids), video_id)) - return self.playlist_result( - entries, video_id, video_title, video_description) - else: + multifeed_metadata_list = get_first( + player_responses, + ('multicamera', 'playerLegacyMulticameraRenderer', 'metadataList'), + expected_type=str) + if multifeed_metadata_list and not smuggled_data.get('force_singlefeed'): + if self.get_param('noplaylist'): self.to_screen('Downloading just video %s because of --no-playlist' % video_id) + else: + entries = [] + feed_ids = [] + for feed in multifeed_metadata_list.split(','): + # Unquote should take place before split on comma (,) since textual + # 
fields may contain comma as well (see + # https://github.com/ytdl-org/youtube-dl/issues/8536) + feed_data = compat_parse_qs( + compat_urllib_parse_unquote_plus(feed)) + + def feed_entry(name): + return try_get( + feed_data, lambda x: x[name][0], compat_str) + + feed_id = feed_entry('id') + if not feed_id: + continue + feed_title = feed_entry('title') + title = video_title + if feed_title: + title += ' (%s)' % feed_title + entries.append({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + 'url': smuggle_url( + '%swatch?v=%s' % (base_url, feed_data['id'][0]), + {'force_singlefeed': True}), + 'title': title, + }) + feed_ids.append(feed_id) + self.to_screen( + 'Downloading multifeed video (%s) - add --no-playlist to just download video %s' + % (', '.join(feed_ids), video_id)) + return self.playlist_result( + entries, video_id, video_title, video_description) live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails')) is_live = get_first(video_details, 'isLive') @@ -2645,16 +2753,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if reason: self.raise_no_formats(reason, expected=True) - for f in formats: - if '&c=WEB&' in f['url'] and '&ratebypass=yes&' not in f['url']: # throttled - f['source_preference'] = -10 - # TODO: this method is not reliable - f['format_note'] = format_field(f, 'format_note', '%s ') + '(maybe throttled)' - - # Source is given priority since formats that throttle are given lower source_preference - # When throttling issue is fully fixed, remove this - self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang')) - keywords = get_first(video_details, 'keywords', expected_type=list) or [] if not keywords and webpage: keywords = [ @@ -2742,6 +2840,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not duration and live_endtime and live_starttime: duration = live_endtime - live_starttime + formats.extend(self._extract_storyboard(player_responses, duration)) + + # Source is given priority 
since formats that throttle are given lower source_preference + # When throttling issue is fully fixed, remove this + self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang', 'proto')) + info = { 'id': video_id, 'title': self._live_title(video_title) if is_live else video_title, @@ -3014,494 +3118,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return info -class YoutubeTabIE(YoutubeBaseInfoExtractor): - IE_DESC = 'YouTube Tabs' - _VALID_URL = r'''(?x) - https?:// - (?:\w+\.)? - (?: - youtube(?:kids)?\.com| - invidio\.us - )/ - (?: - (?P<channel_type>channel|c|user|browse)/| - (?P<not_channel> - feed/|hashtag/| - (?:playlist|watch)\?.*?\blist= - )| - (?!(?:%s)\b) # Direct URLs - ) - (?P<id>[^/?\#&]+) - ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES - IE_NAME = 'youtube:tab' - - _TESTS = [{ - 'note': 'playlists, multipage', - 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', - 'playlist_mincount': 94, - 'info_dict': { - 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', - 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - 'uploader': 'Игорь Клейнер', - 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', - }, - }, { - 'note': 'playlists, multipage, different order', - 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', - 'playlist_mincount': 94, - 'info_dict': { - 'id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'title': 'Игорь Клейнер - Playlists', - 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', - 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', - 'uploader': 'Игорь Клейнер', - }, - }, { - 'note': 'playlists, series', - 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', - 'playlist_mincount': 5, - 'info_dict': { - 'id': 'UCYO_jab_esuFRV4b17AJtAw', - 'title': '3Blue1Brown - Playlists', - 'description': 'md5:e1384e8a133307dd10edee76e875d62f', - 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', - 'uploader': '3Blue1Brown', - }, - }, { - 'note': 
'playlists, singlepage', - 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', - 'playlist_mincount': 4, - 'info_dict': { - 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', - 'title': 'ThirstForScience - Playlists', - 'description': 'md5:609399d937ea957b0f53cbffb747a14c', - 'uploader': 'ThirstForScience', - 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', - } - }, { - 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', - 'only_matching': True, - }, { - 'note': 'basic, single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'title': 'youtube-dl public playlist', - }, - 'playlist_count': 1, - }, { - 'note': 'empty playlist', - 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'info_dict': { - 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', - 'uploader': 'Sergey M.', - 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', - 'title': 'youtube-dl empty playlist', - }, - 'playlist_count': 0, - }, { - 'note': 'Home tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Home', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 2, - }, { - 'note': 'Videos tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Videos', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 975, - }, { - 'note': 'Videos tab, sorted by popular', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', - 'info_dict': { - 'id': 
'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Videos', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 199, - }, { - 'note': 'Playlists tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Playlists', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 17, - }, { - 'note': 'Community tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Community', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 18, - }, { - 'note': 'Channels tab', - 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', - 'info_dict': { - 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - 'title': 'lex will - Channels', - 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', - 'uploader': 'lex will', - 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', - }, - 'playlist_mincount': 12, - }, { - 'note': 'Search tab', - 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', - 'playlist_mincount': 40, - 'info_dict': { - 'id': 'UCYO_jab_esuFRV4b17AJtAw', - 'title': '3Blue1Brown - Search - linear algebra', - 'description': 'md5:e1384e8a133307dd10edee76e875d62f', - 'uploader': '3Blue1Brown', - 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', - }, - }, { - 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', - 'only_matching': True, - }, { - 
'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', - 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'info_dict': { - 'title': '29C3: Not my department', - 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', - 'uploader': 'Christiaan008', - 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', - 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', - }, - 'playlist_count': 96, - }, { - 'note': 'Large playlist', - 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', - 'info_dict': { - 'title': 'Uploads from Cauchemar', - 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', - 'uploader': 'Cauchemar', - 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', - }, - 'playlist_mincount': 1123, - }, { - 'note': 'even larger playlist, 8832 videos', - 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', - 'only_matching': True, - }, { - 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', - 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', - 'info_dict': { - 'title': 'Uploads from Interstellar Movie', - 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', - 'uploader': 'Interstellar Movie', - 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', - }, - 'playlist_mincount': 21, - }, { - 'note': 'Playlist with "show unavailable videos" button', - 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', - 'info_dict': { - 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', - 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', - 'uploader': 'Phim Siêu Nhân Nhật Bản', - 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', - }, - 'playlist_mincount': 200, - }, { - 'note': 'Playlist with unavailable videos in page 7', - 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w', - 'info_dict': { - 'title': 'Uploads from BlankTV', - 'id': 'UU8l9frL61Yl5KFOl87nIm2w', - 'uploader': 'BlankTV', - 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w', - }, - 'playlist_mincount': 1000, - }, { - 'note': 
'https://github.com/ytdl-org/youtube-dl/issues/21844', - 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'info_dict': { - 'title': 'Data Analysis with Dr Mike Pound', - 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', - 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', - 'uploader': 'Computerphile', - 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', - }, - 'playlist_mincount': 11, - }, { - 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', - 'only_matching': True, - }, { - 'note': 'Playlist URL that does not actually serve a playlist', - 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', - 'info_dict': { - 'id': 'FqZTN594JQw', - 'ext': 'webm', - 'title': "Smiley's People 01 detective, Adventure Series, Action", - 'uploader': 'STREEM', - 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', - 'upload_date': '20150526', - 'license': 'Standard YouTube License', - 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', - 'categories': ['People & Blogs'], - 'tags': list, - 'view_count': int, - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'This video is not available.', - 'add_ie': [YoutubeIE.ie_key()], - }, { - 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', - 'info_dict': { - 'id': '3yImotZU3tw', # This will keep changing - 'ext': 'mp4', - 'title': compat_str, - 'uploader': 'Sky News', - 'uploader_id': 'skynews', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', - 'upload_date': r're:\d{8}', - 'description': compat_str, - 'categories': ['News & Politics'], - 'tags': list, - 
'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '], - }, { - 'url': 'https://www.youtube.com/user/TheYoungTurks/live', - 'info_dict': { - 'id': 'a48o2S1cPoo', - 'ext': 'mp4', - 'title': 'The Young Turks - Live Main Show', - 'uploader': 'The Young Turks', - 'uploader_id': 'TheYoungTurks', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', - 'upload_date': '20150715', - 'license': 'Standard YouTube License', - 'description': 'md5:438179573adcdff3c97ebb1ee632b891', - 'categories': ['News & Politics'], - 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], - 'like_count': int, - 'dislike_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', - 'only_matching': True, - }, { - 'note': 'A channel that is not live. 
Should raise error', - 'url': 'https://www.youtube.com/user/numberphile/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/trending', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/library', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/history', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/subscriptions', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/feed/watch_later', - 'only_matching': True, - }, { - 'note': 'Recommended - redirects to home page.', - 'url': 'https://www.youtube.com/feed/recommended', - 'only_matching': True, - }, { - 'note': 'inline playlist with not always working continuations', - 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/course', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/zsecurity', - 'only_matching': True, - }, { - 'url': 'http://www.youtube.com/NASAgovVideo/videos', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/TheYoungTurks/live', - 'only_matching': True, - }, { - 'url': 'https://www.youtube.com/hashtag/cctv9', - 'info_dict': { - 'id': 'cctv9', - 'title': '#cctv9', - }, - 'playlist_mincount': 350, - }, { - 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', - 'only_matching': True, - }, { - 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist', - 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'only_matching': True - }, { - 'note': '/browse/ should redirect to /channel/', - 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', - 'only_matching': True - }, { - 'note': 'VLPL, should redirect to 
playlist?list=PL...', - 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'info_dict': { - 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', - 'uploader': 'NoCopyrightSounds', - 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', - 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', - 'title': 'NCS Releases', - }, - 'playlist_mincount': 166, - }, { - 'note': 'Topic, should redirect to playlist?list=UU...', - 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', - 'info_dict': { - 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', - 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', - 'title': 'Uploads from Royalty Free Music - Topic', - 'uploader': 'Royalty Free Music - Topic', - }, - 'expected_warnings': [ - 'A channel/user page was given', - 'The URL does not have a videos tab', - ], - 'playlist_mincount': 101, - }, { - 'note': 'Topic without a UU playlist', - 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', - 'info_dict': { - 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', - 'title': 'UCtFRv9O2AHqOZjjynzrv-xg', - }, - 'expected_warnings': [ - 'A channel/user page was given', - 'The URL does not have a videos tab', - 'Falling back to channel URL', - ], - 'playlist_mincount': 9, - }, { - 'note': 'Youtube music Album', - 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE', - 'info_dict': { - 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0', - 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', - }, - 'playlist_count': 50, - }, { - 'note': 'unlisted single video playlist', - 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'info_dict': { - 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', - 'uploader': 'colethedj', - 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', - 'title': 'yt-dlp unlisted playlist test', - 'availability': 'unlisted' - }, - 'playlist_count': 1, - }, { - 'note': 'API Fallback: Recommended - redirects to home page. 
Requires visitorData', - 'url': 'https://www.youtube.com/feed/recommended', - 'info_dict': { - 'id': 'recommended', - 'title': 'recommended', - }, - 'playlist_mincount': 50, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}} - }, - }, { - 'note': 'API Fallback: /videos tab, sorted by oldest first', - 'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid', - 'info_dict': { - 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', - 'title': 'Cody\'sLab - Videos', - 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa', - 'uploader': 'Cody\'sLab', - 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', - }, - 'playlist_mincount': 650, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}} - }, - }, { - 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', - 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', - 'info_dict': { - 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', - 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', - 'title': 'Uploads from Royalty Free Music - Topic', - 'uploader': 'Royalty Free Music - Topic', - }, - 'expected_warnings': [ - 'A channel/user page was given', - 'The URL does not have a videos tab', - ], - 'playlist_mincount': 101, - 'params': { - 'skip_download': True, - 'extractor_args': {'youtubetab': {'skip': ['webpage']}} - }, - }] - - @classmethod - def suitable(cls, url): - return False if YoutubeIE.suitable(url) else super( - YoutubeTabIE, cls).suitable(url) +class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _extract_channel_id(self, webpage): channel_id = self._html_search_meta( @@ -3684,49 +3301,53 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): if entry: yield entry ''' - def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): - - def extract_entries(parent_renderer): # this needs to called again for continuation to work with feeds - contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] - 
for content in contents: - if not isinstance(content, dict): - continue - is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict) - if not is_renderer: - renderer = content.get('richItemRenderer') - if renderer: - for entry in self._rich_entries(renderer): - yield entry - continuation_list[0] = self._extract_continuation(parent_renderer) + def _extract_entries(self, parent_renderer, continuation_list): + # continuation_list is modified in-place with continuation_list = [continuation_token] + continuation_list[:] = [None] + contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] + for content in contents: + if not isinstance(content, dict): + continue + is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict) + if not is_renderer: + renderer = content.get('richItemRenderer') + if renderer: + for entry in self._rich_entries(renderer): + yield entry + continuation_list[0] = self._extract_continuation(parent_renderer) + continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] + for isr_content in isr_contents: + if not isinstance(isr_content, dict): continue - isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] - for isr_content in isr_contents: - if not isinstance(isr_content, dict): - continue - known_renderers = { - 'playlistVideoListRenderer': self._playlist_entries, - 'gridRenderer': self._grid_entries, - 'shelfRenderer': lambda x: self._shelf_entries(x, tab.get('title') != 'Channels'), - 'backstagePostThreadRenderer': self._post_thread_entries, - 'videoRenderer': lambda x: [self._video_entry(x)], - } - for key, renderer in isr_content.items(): - if key not in known_renderers: - continue - for entry in known_renderers[key](renderer): - if entry: - yield entry - continuation_list[0] = self._extract_continuation(renderer) - break - - if not continuation_list[0]: - continuation_list[0] = self._extract_continuation(is_renderer) + known_renderers = { + 
'playlistVideoListRenderer': self._playlist_entries, + 'gridRenderer': self._grid_entries, + 'shelfRenderer': lambda x: self._shelf_entries(x), + 'backstagePostThreadRenderer': self._post_thread_entries, + 'videoRenderer': lambda x: [self._video_entry(x)], + 'playlistRenderer': lambda x: self._grid_entries({'items': [{'playlistRenderer': x}]}), + 'channelRenderer': lambda x: self._grid_entries({'items': [{'channelRenderer': x}]}), + } + for key, renderer in isr_content.items(): + if key not in known_renderers: + continue + for entry in known_renderers[key](renderer): + if entry: + yield entry + continuation_list[0] = self._extract_continuation(renderer) + break if not continuation_list[0]: - continuation_list[0] = self._extract_continuation(parent_renderer) + continuation_list[0] = self._extract_continuation(is_renderer) + + if not continuation_list[0]: + continuation_list[0] = self._extract_continuation(parent_renderer) - continuation_list = [None] # Python 2 does not support nonlocal + def _entries(self, tab, item_id, ytcfg, account_syncid, visitor_data): + continuation_list = [None] + extract_entries = lambda x: self._extract_entries(x, continuation_list) tab_content = try_get(tab, lambda x: x['content'], dict) if not tab_content: return @@ -4118,6 +3739,519 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor): entry['url'] = smuggle_url(entry['url'], data) yield entry + _SEARCH_PARAMS = None + + def _search_results(self, query, params=NO_DEFAULT): + data = {'query': query} + if params is NO_DEFAULT: + params = self._SEARCH_PARAMS + if params: + data['params'] = params + continuation_list = [None] + for page_num in itertools.count(1): + data.update(continuation_list[0] or {}) + search = self._extract_response( + item_id='query "%s" page %s' % (query, page_num), ep='search', query=data, + check_get_keys=('contents', 'onResponseReceivedCommands')) + slr_contents = try_get( + search, + (lambda x: 
x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], + lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), + list) + yield from self._extract_entries({'contents': slr_contents}, continuation_list) + if not continuation_list[0]: + break + + +class YoutubeTabIE(YoutubeTabBaseInfoExtractor): + IE_DESC = 'YouTube Tabs' + _VALID_URL = r'''(?x: + https?:// + (?:\w+\.)? + (?: + youtube(?:kids)?\.com| + %(invidious)s + )/ + (?: + (?P<channel_type>channel|c|user|browse)/| + (?P<not_channel> + feed/|hashtag/| + (?:playlist|watch)\?.*?\blist= + )| + (?!(?:%(reserved_names)s)\b) # Direct URLs + ) + (?P<id>[^/?\#&]+) + )''' % { + 'reserved_names': YoutubeBaseInfoExtractor._RESERVED_NAMES, + 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), + } + IE_NAME = 'youtube:tab' + + _TESTS = [{ + 'note': 'playlists, multipage', + 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader': 'Игорь Клейнер', + 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', + }, + }, { + 'note': 'playlists, multipage, different order', + 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', + 'playlist_mincount': 94, + 'info_dict': { + 'id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'title': 'Игорь Клейнер - Playlists', + 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', + 'uploader_id': 'UCqj7Cz7revf5maW9g5pgNcg', + 'uploader': 'Игорь Клейнер', + }, + }, { + 'note': 'playlists, series', + 'url': 'https://www.youtube.com/c/3blue1brown/playlists?view=50&sort=dd&shelf_id=3', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Playlists', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader_id': 
'UCYO_jab_esuFRV4b17AJtAw', + 'uploader': '3Blue1Brown', + }, + }, { + 'note': 'playlists, singlepage', + 'url': 'https://www.youtube.com/user/ThirstForScience/playlists', + 'playlist_mincount': 4, + 'info_dict': { + 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + 'title': 'ThirstForScience - Playlists', + 'description': 'md5:609399d937ea957b0f53cbffb747a14c', + 'uploader': 'ThirstForScience', + 'uploader_id': 'UCAEtajcuhQ6an9WEzY9LEMQ', + } + }, { + 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', + 'only_matching': True, + }, { + 'note': 'basic, single video playlist', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'info_dict': { + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'title': 'youtube-dl public playlist', + }, + 'playlist_count': 1, + }, { + 'note': 'empty playlist', + 'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'info_dict': { + 'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', + 'uploader': 'Sergey M.', + 'id': 'PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf', + 'title': 'youtube-dl empty playlist', + }, + 'playlist_count': 0, + }, { + 'note': 'Home tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Home', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 2, + }, { + 'note': 'Videos tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 975, + }, { + 'note': 'Videos tab, sorted by popular', + 'url': 
'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Videos', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 199, + }, { + 'note': 'Playlists tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Playlists', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 17, + }, { + 'note': 'Community tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Community', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 18, + }, { + 'note': 'Channels tab', + 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels', + 'info_dict': { + 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + 'title': 'lex will - Channels', + 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', + 'uploader': 'lex will', + 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', + }, + 'playlist_mincount': 12, + }, { + 'note': 'Search tab', + 'url': 'https://www.youtube.com/c/3blue1brown/search?query=linear%20algebra', + 'playlist_mincount': 40, + 'info_dict': { + 'id': 'UCYO_jab_esuFRV4b17AJtAw', + 'title': '3Blue1Brown - Search - linear algebra', + 'description': 'md5:e1384e8a133307dd10edee76e875d62f', + 'uploader': '3Blue1Brown', + 'uploader_id': 'UCYO_jab_esuFRV4b17AJtAw', + }, + }, { + 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + 
}, { + 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'only_matching': True, + }, { + 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', + 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'info_dict': { + 'title': '29C3: Not my department', + 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', + 'uploader': 'Christiaan008', + 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg', + 'description': 'md5:a14dc1a8ef8307a9807fe136a0660268', + }, + 'playlist_count': 96, + }, { + 'note': 'Large playlist', + 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', + 'info_dict': { + 'title': 'Uploads from Cauchemar', + 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', + 'uploader': 'Cauchemar', + 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + }, + 'playlist_mincount': 1123, + }, { + 'note': 'even larger playlist, 8832 videos', + 'url': 'http://www.youtube.com/user/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos', + 'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', + 'info_dict': { + 'title': 'Uploads from Interstellar Movie', + 'id': 'UUXw-G3eDE9trcvY2sBMM_aA', + 'uploader': 'Interstellar Movie', + 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA', + }, + 'playlist_mincount': 21, + }, { + 'note': 'Playlist with "show unavailable videos" button', + 'url': 'https://www.youtube.com/playlist?list=UUTYLiWFZy8xtPwxFwX9rV7Q', + 'info_dict': { + 'title': 'Uploads from Phim Siêu Nhân Nhật Bản', + 'id': 'UUTYLiWFZy8xtPwxFwX9rV7Q', + 'uploader': 'Phim Siêu Nhân Nhật Bản', + 'uploader_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', + }, + 'playlist_mincount': 200, + }, { + 'note': 'Playlist with unavailable videos in page 7', + 'url': 'https://www.youtube.com/playlist?list=UU8l9frL61Yl5KFOl87nIm2w', + 'info_dict': { + 'title': 'Uploads from BlankTV', + 'id': 'UU8l9frL61Yl5KFOl87nIm2w', + 
'uploader': 'BlankTV', + 'uploader_id': 'UC8l9frL61Yl5KFOl87nIm2w', + }, + 'playlist_mincount': 1000, + }, { + 'note': 'https://github.com/ytdl-org/youtube-dl/issues/21844', + 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'info_dict': { + 'title': 'Data Analysis with Dr Mike Pound', + 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', + 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', + 'uploader': 'Computerphile', + 'description': 'md5:7f567c574d13d3f8c0954d9ffee4e487', + }, + 'playlist_mincount': 11, + }, { + 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', + 'only_matching': True, + }, { + 'note': 'Playlist URL that does not actually serve a playlist', + 'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4', + 'info_dict': { + 'id': 'FqZTN594JQw', + 'ext': 'webm', + 'title': "Smiley's People 01 detective, Adventure Series, Action", + 'uploader': 'STREEM', + 'uploader_id': 'UCyPhqAZgwYWZfxElWVbVJng', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCyPhqAZgwYWZfxElWVbVJng', + 'upload_date': '20150526', + 'license': 'Standard YouTube License', + 'description': 'md5:507cdcb5a49ac0da37a920ece610be80', + 'categories': ['People & Blogs'], + 'tags': list, + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'This video is not available.', + 'add_ie': [YoutubeIE.ie_key()], + }, { + 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', + 'info_dict': { + 'id': '3yImotZU3tw', # This will keep changing + 'ext': 'mp4', + 'title': compat_str, + 'uploader': 'Sky News', + 'uploader_id': 'skynews', + 'uploader_url': 
r're:https?://(?:www\.)?youtube\.com/user/skynews', + 'upload_date': r're:\d{8}', + 'description': compat_str, + 'categories': ['News & Politics'], + 'tags': list, + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Downloading just video ', 'Ignoring subtitle tracks found in '], + }, { + 'url': 'https://www.youtube.com/user/TheYoungTurks/live', + 'info_dict': { + 'id': 'a48o2S1cPoo', + 'ext': 'mp4', + 'title': 'The Young Turks - Live Main Show', + 'uploader': 'The Young Turks', + 'uploader_id': 'TheYoungTurks', + 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', + 'upload_date': '20150715', + 'license': 'Standard YouTube License', + 'description': 'md5:438179573adcdff3c97ebb1ee632b891', + 'categories': ['News & Politics'], + 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], + 'like_count': int, + 'dislike_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/c/CommanderVideoHq/live', + 'only_matching': True, + }, { + 'note': 'A channel that is not live. 
Should raise error', + 'url': 'https://www.youtube.com/user/numberphile/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/trending', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/library', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/history', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/subscriptions', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/feed/watch_later', + 'only_matching': True, + }, { + 'note': 'Recommended - redirects to home page.', + 'url': 'https://www.youtube.com/feed/recommended', + 'only_matching': True, + }, { + 'note': 'inline playlist with not always working continuations', + 'url': 'https://www.youtube.com/watch?v=UC6u0Tct-Fo&list=PL36D642111D65BE7C', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/course', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/zsecurity', + 'only_matching': True, + }, { + 'url': 'http://www.youtube.com/NASAgovVideo/videos', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/TheYoungTurks/live', + 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/hashtag/cctv9', + 'info_dict': { + 'id': 'cctv9', + 'title': '#cctv9', + }, + 'playlist_mincount': 350, + }, { + 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', + 'only_matching': True, + }, { + 'note': 'Requires Premium: should request additional YTM-info webpage (and have format 141) for videos in playlist', + 'url': 'https://music.youtube.com/playlist?list=PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'only_matching': True + }, { + 'note': '/browse/ should redirect to /channel/', + 'url': 'https://music.youtube.com/browse/UC1a8OFewdjuLq6KlF8M_8Ng', + 'only_matching': True + }, { + 'note': 'VLPL, should redirect to playlist?list=PL...', + 'url': 'https://music.youtube.com/browse/VLPLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 
'info_dict': { + 'id': 'PLRBp0Fe2GpgmgoscNFLxNyBVSFVdYmFkq', + 'uploader': 'NoCopyrightSounds', + 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', + 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', + 'title': 'NCS Releases', + }, + 'playlist_mincount': 166, + }, { + 'note': 'Topic, should redirect to playlist?list=UU...', + 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', + 'info_dict': { + 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', + 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', + 'title': 'Uploads from Royalty Free Music - Topic', + 'uploader': 'Royalty Free Music - Topic', + }, + 'expected_warnings': [ + 'A channel/user page was given', + 'The URL does not have a videos tab', + ], + 'playlist_mincount': 101, + }, { + 'note': 'Topic without a UU playlist', + 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', + 'info_dict': { + 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', + 'title': 'UCtFRv9O2AHqOZjjynzrv-xg', + }, + 'expected_warnings': [ + 'A channel/user page was given', + 'The URL does not have a videos tab', + 'Falling back to channel URL', + ], + 'playlist_mincount': 9, + }, { + 'note': 'Youtube music Album', + 'url': 'https://music.youtube.com/browse/MPREb_gTAcphH99wE', + 'info_dict': { + 'id': 'OLAK5uy_l1m0thk3g31NmIIz_vMIbWtyv7eZixlH0', + 'title': 'Album - Royalty Free Music Library V2 (50 Songs)', + }, + 'playlist_count': 50, + }, { + 'note': 'unlisted single video playlist', + 'url': 'https://www.youtube.com/playlist?list=PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'info_dict': { + 'uploader_id': 'UC9zHu_mHU96r19o-wV5Qs1Q', + 'uploader': 'colethedj', + 'id': 'PLwL24UFy54GrB3s2KMMfjZscDi1x5Dajf', + 'title': 'yt-dlp unlisted playlist test', + 'availability': 'unlisted' + }, + 'playlist_count': 1, + }, { + 'note': 'API Fallback: Recommended - redirects to home page. 
Requires visitorData', + 'url': 'https://www.youtube.com/feed/recommended', + 'info_dict': { + 'id': 'recommended', + 'title': 'recommended', + }, + 'playlist_mincount': 50, + 'params': { + 'skip_download': True, + 'extractor_args': {'youtubetab': {'skip': ['webpage']}} + }, + }, { + 'note': 'API Fallback: /videos tab, sorted by oldest first', + 'url': 'https://www.youtube.com/user/theCodyReeder/videos?view=0&sort=da&flow=grid', + 'info_dict': { + 'id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', + 'title': 'Cody\'sLab - Videos', + 'description': 'md5:d083b7c2f0c67ee7a6c74c3e9b4243fa', + 'uploader': 'Cody\'sLab', + 'uploader_id': 'UCu6mSoMNzHQiBIOCkHUa2Aw', + }, + 'playlist_mincount': 650, + 'params': { + 'skip_download': True, + 'extractor_args': {'youtubetab': {'skip': ['webpage']}} + }, + }, { + 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', + 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', + 'info_dict': { + 'id': 'UU9ALqqC4aIeG5iDs7i90Bfw', + 'uploader_id': 'UC9ALqqC4aIeG5iDs7i90Bfw', + 'title': 'Uploads from Royalty Free Music - Topic', + 'uploader': 'Royalty Free Music - Topic', + }, + 'expected_warnings': [ + 'A channel/user page was given', + 'The URL does not have a videos tab', + ], + 'playlist_mincount': 101, + 'params': { + 'skip_download': True, + 'extractor_args': {'youtubetab': {'skip': ['webpage']}} + }, + }] + + @classmethod + def suitable(cls, url): + return False if YoutubeIE.suitable(url) else super( + YoutubeTabIE, cls).suitable(url) + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) if self.is_music_url(url): @@ -4250,12 +4384,15 @@ class YoutubePlaylistIE(InfoExtractor): (?: (?: youtube(?:kids)?\.com| - invidio\.us + %(invidious)s ) /.*?\?.*?\blist= )? 
(?P<id>%(playlist_id)s) - )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} + )''' % { + 'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE, + 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), + } IE_NAME = 'youtube:playlist' _TESTS = [{ 'note': 'issue #673', @@ -4377,7 +4514,7 @@ class YoutubeYtUserIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) return self.url_result( - 'https://www.youtube.com/user/%s' % user_id, + 'https://www.youtube.com/user/%s/videos' % user_id, ie=YoutubeTabIE.ie_key(), video_id=user_id) @@ -4400,77 +4537,25 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): ie=YoutubeTabIE.ie_key()) -class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE): - IE_DESC = 'YouTube searches' +class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, SearchInfoExtractor): + IE_DESC = 'YouTube search' IE_NAME = 'youtube:search' _SEARCH_KEY = 'ytsearch' - _SEARCH_PARAMS = None + _SEARCH_PARAMS = 'EgIQAQ%3D%3D' # Videos only _TESTS = [] - def _search_results(self, query): - data = {'query': query} - if self._SEARCH_PARAMS: - data['params'] = self._SEARCH_PARAMS - continuation = {} - for page_num in itertools.count(1): - data.update(continuation) - search = self._extract_response( - item_id='query "%s" page %s' % (query, page_num), ep='search', query=data, - check_get_keys=('contents', 'onResponseReceivedCommands') - ) - if not search: - break - slr_contents = try_get( - search, - (lambda x: x['contents']['twoColumnSearchResultsRenderer']['primaryContents']['sectionListRenderer']['contents'], - lambda x: x['onResponseReceivedCommands'][0]['appendContinuationItemsAction']['continuationItems']), - list) - if not slr_contents: - break - - # Youtube sometimes adds promoted content to searches, - # changing the index location of videos and token. - # So we search through all entries till we find them. 
- continuation = None - for slr_content in slr_contents: - if not continuation: - continuation = self._extract_continuation({'contents': [slr_content]}) - isr_contents = try_get( - slr_content, - lambda x: x['itemSectionRenderer']['contents'], - list) - if not isr_contents: - continue - for content in isr_contents: - if not isinstance(content, dict): - continue - video = content.get('videoRenderer') - if not isinstance(video, dict): - continue - video_id = video.get('videoId') - if not video_id: - continue - - yield self._extract_video(video) - - if not continuation: - break - - -class YoutubeSearchDateIE(YoutubeSearchIE): +class YoutubeSearchDateIE(SearchInfoExtractor, YoutubeTabBaseInfoExtractor): IE_NAME = YoutubeSearchIE.IE_NAME + ':date' _SEARCH_KEY = 'ytsearchdate' - IE_DESC = 'YouTube searches, newest videos first' - _SEARCH_PARAMS = 'CAI%3D' + IE_DESC = 'YouTube search, newest videos first' + _SEARCH_PARAMS = 'CAISAhAB' # Videos only, sorted by date -class YoutubeSearchURLIE(YoutubeSearchIE): +class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): IE_DESC = 'YouTube search URLs with sorting and filter support' IE_NAME = YoutubeSearchIE.IE_NAME + '_url' - _SEARCH_KEY = None _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' - # _MAX_RESULTS = 100 _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, @@ -4479,19 +4564,22 @@ class YoutubeSearchURLIE(YoutubeSearchIE): 'title': 'youtube-dl test video', } }, { + 'url': 'https://www.youtube.com/results?search_query=python&sp=EgIQAg%253D%253D', + 'playlist_mincount': 5, + 'info_dict': { + 'id': 'python', + 'title': 'python', + } + + }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', 'only_matching': True, }] - @classmethod - def _make_valid_url(cls): - return cls._VALID_URL - def _real_extract(self, url): qs = parse_qs(url) query = (qs.get('search_query') or 
qs.get('q'))[0] - self._SEARCH_PARAMS = qs.get('sp', ('',))[0] - return self._get_n_results(query, self._MAX_RESULTS) + return self.playlist_result(self._search_results(query, qs.get('sp', (None,))[0]), query, query) class YoutubeFeedsInfoExtractor(YoutubeTabIE): diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index a13d12436..98d15604d 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -12,6 +12,7 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, + join_nonempty, try_get, url_or_none, urlencode_postdata, @@ -156,15 +157,9 @@ class ZattooPlatformBaseIE(InfoExtractor): watch_url = url_or_none(watch.get('url')) if not watch_url: continue - format_id_list = [stream_type] - maxrate = watch.get('maxrate') - if maxrate: - format_id_list.append(compat_str(maxrate)) audio_channel = watch.get('audio_channel') - if audio_channel: - format_id_list.append(compat_str(audio_channel)) preference = 1 if audio_channel == 'A' else None - format_id = '-'.join(format_id_list) + format_id = join_nonempty(stream_type, watch.get('maxrate'), audio_channel) if stream_type in ('dash', 'dash_widevine', 'dash_playready'): this_formats = self._extract_mpd_formats( watch_url, video_id, mpd_id=format_id, fatal=False) diff --git a/yt_dlp/extractor/zdf.py b/yt_dlp/extractor/zdf.py index 8c279c5ab..df236c050 100644 --- a/yt_dlp/extractor/zdf.py +++ b/yt_dlp/extractor/zdf.py @@ -9,12 +9,12 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + join_nonempty, merge_dicts, NO_DEFAULT, orderedSet, parse_codecs, qualities, - str_or_none, try_get, unified_timestamp, update_url_query, @@ -70,11 +70,11 @@ class ZDFBaseIE(InfoExtractor): f = {'vcodec': data[0], 'acodec': data[1]} f.update({ 'url': format_url, - 'format_id': '-'.join(filter(str_or_none, ('http', meta.get('type'), meta.get('quality')))), + 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')), }) new_formats = [f] 
formats.extend(merge_dicts(f, { - 'format_note': ', '.join(filter(None, (meta.get('quality'), meta.get('class')))), + 'format_note': join_nonempty('quality', 'class', from_dict=meta, delim=', '), 'language': meta.get('language'), 'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1, 'quality': qualities(self._QUALITIES)(meta.get('quality')), diff --git a/yt_dlp/jsinterp.py b/yt_dlp/jsinterp.py index 7bda59610..a6084ab82 100644 --- a/yt_dlp/jsinterp.py +++ b/yt_dlp/jsinterp.py @@ -1,5 +1,4 @@ -from __future__ import unicode_literals - +from collections.abc import MutableMapping import json import operator import re @@ -22,11 +21,54 @@ _OPERATORS = [ ('*', operator.mul), ] _ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] -_ASSIGN_OPERATORS.append(('=', lambda cur, right: right)) +_ASSIGN_OPERATORS.append(('=', (lambda cur, right: right))) _NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' +class JS_Break(ExtractorError): + def __init__(self): + ExtractorError.__init__(self, 'Invalid break') + + +class JS_Continue(ExtractorError): + def __init__(self): + ExtractorError.__init__(self, 'Invalid continue') + + +class LocalNameSpace(MutableMapping): + def __init__(self, *stack): + self.stack = tuple(stack) + + def __getitem__(self, key): + for scope in self.stack: + if key in scope: + return scope[key] + raise KeyError(key) + + def __setitem__(self, key, value): + for scope in self.stack: + if key in scope: + scope[key] = value + break + else: + self.stack[0][key] = value + return value + + def __delitem__(self, key): + raise NotImplementedError('Deleting is not supported') + + def __iter__(self): + for scope in self.stack: + yield from scope + + def __len__(self, key): + return len(iter(self)) + + def __repr__(self): + return f'LocalNameSpace{self.stack}' + + class JSInterpreter(object): def __init__(self, code, objects=None): if objects is None: @@ -34,11 +76,58 @@ class JSInterpreter(object): self.code = code 
self._functions = {} self._objects = objects + self.__named_object_counter = 0 + + def _named_object(self, namespace, obj): + self.__named_object_counter += 1 + name = f'__yt_dlp_jsinterp_obj{self.__named_object_counter}' + namespace[name] = obj + return name + + @staticmethod + def _seperate(expr, delim=',', max_split=None): + if not expr: + return + parens = {'(': 0, '{': 0, '[': 0, ']': 0, '}': 0, ')': 0} + start, splits, pos, max_pos = 0, 0, 0, len(delim) - 1 + for idx, char in enumerate(expr): + if char in parens: + parens[char] += 1 + is_in_parens = (parens['['] - parens[']'] + or parens['('] - parens[')'] + or parens['{'] - parens['}']) + if char == delim[pos] and not is_in_parens: + if pos == max_pos: + pos = 0 + yield expr[start: idx - max_pos] + start = idx + 1 + splits += 1 + if max_split and splits >= max_split: + break + else: + pos += 1 + else: + pos = 0 + yield expr[start:] + + @staticmethod + def _seperate_at_paren(expr, delim): + seperated = list(JSInterpreter._seperate(expr, delim, 1)) + if len(seperated) < 2: + raise ExtractorError(f'No terminating paren {delim} in {expr}') + return seperated[0][1:].strip(), seperated[1].strip() def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: raise ExtractorError('Recursion limit reached') + sub_statements = list(self._seperate(stmt, ';')) + stmt = (sub_statements or ['']).pop() + for sub_stmt in sub_statements: + ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1) + if should_abort: + return ret + should_abort = False stmt = stmt.lstrip() stmt_m = re.match(r'var\s', stmt) @@ -61,25 +150,122 @@ class JSInterpreter(object): if expr == '': # Empty expression return None + if expr.startswith('{'): + inner, outer = self._seperate_at_paren(expr, '}') + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion - 1) + if not outer or should_abort: + return inner + else: + expr = json.dumps(inner) + outer + if 
expr.startswith('('): - parens_count = 0 - for m in re.finditer(r'[()]', expr): - if m.group(0) == '(': - parens_count += 1 + inner, outer = self._seperate_at_paren(expr, ')') + inner = self.interpret_expression(inner, local_vars, allow_recursion) + if not outer: + return inner + else: + expr = json.dumps(inner) + outer + + if expr.startswith('['): + inner, outer = self._seperate_at_paren(expr, ']') + name = self._named_object(local_vars, [ + self.interpret_expression(item, local_vars, allow_recursion) + for item in self._seperate(inner)]) + expr = name + outer + + m = re.match(r'try\s*', expr) + if m: + if expr[m.end()] == '{': + try_expr, expr = self._seperate_at_paren(expr[m.end():], '}') + else: + try_expr, expr = expr[m.end() - 1:], '' + ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion - 1) + if should_abort: + return ret + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + m = re.match(r'catch\s*\(', expr) + if m: + # We ignore the catch block + _, expr = self._seperate_at_paren(expr, '}') + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + m = re.match(r'for\s*\(', expr) + if m: + constructor, remaining = self._seperate_at_paren(expr[m.end() - 1:], ')') + if remaining.startswith('{'): + body, expr = self._seperate_at_paren(remaining, '}') + else: + m = re.match(r'switch\s*\(', remaining) # FIXME + if m: + switch_val, remaining = self._seperate_at_paren(remaining[m.end() - 1:], ')') + body, expr = self._seperate_at_paren(remaining, '}') + body = 'switch(%s){%s}' % (switch_val, body) else: - parens_count -= 1 - if parens_count == 0: - sub_expr = expr[1:m.start()] - sub_result = self.interpret_expression( - sub_expr, local_vars, allow_recursion) - remaining_expr = expr[m.end():].strip() - if not remaining_expr: - return sub_result - else: - expr = json.dumps(sub_result) + remaining_expr + body, expr = remaining, '' + start, cndn, increment = self._seperate(constructor, ';') 
+ if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]: + raise ExtractorError( + f'Premature return in the initialization of a for loop in {constructor!r}') + while True: + if not self.interpret_expression(cndn, local_vars, allow_recursion): + break + try: + ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion - 1) + if should_abort: + return ret + except JS_Break: + break + except JS_Continue: + pass + if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]: + raise ExtractorError( + f'Premature return in the initialization of a for loop in {constructor!r}') + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + m = re.match(r'switch\s*\(', expr) + if m: + switch_val, remaining = self._seperate_at_paren(expr[m.end() - 1:], ')') + switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion) + body, expr = self._seperate_at_paren(remaining, '}') + items = body.replace('default:', 'case default:').split('case ')[1:] + for default in (False, True): + matched = False + for item in items: + case, stmt = [i.strip() for i in self._seperate(item, ':', 1)] + if default: + matched = matched or case == 'default' + elif not matched: + matched = case != 'default' and switch_val == self.interpret_expression(case, local_vars, allow_recursion) + if not matched: + continue + try: + ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1) + if should_abort: + return ret + except JS_Break: break - else: - raise ExtractorError('Premature end of parens in %r' % expr) + if matched: + break + return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + # Comma seperated statements + sub_expressions = list(self._seperate(expr)) + expr = sub_expressions.pop().strip() if sub_expressions else '' + for sub_expr in sub_expressions: + self.interpret_expression(sub_expr, local_vars, allow_recursion) + + for m in re.finditer(rf'''(?x) + 
(?P<pre_sign>\+\+|--)(?P<var1>{_NAME_RE})| + (?P<var2>{_NAME_RE})(?P<post_sign>\+\+|--)''', expr): + var = m.group('var1') or m.group('var2') + start, end = m.span() + sign = m.group('pre_sign') or m.group('post_sign') + ret = local_vars[var] + local_vars[var] += 1 if sign[0] == '+' else -1 + if m.group('pre_sign'): + ret = local_vars[var] + expr = expr[:start] + json.dumps(ret) + expr[end:] for op, opfunc in _ASSIGN_OPERATORS: m = re.match(r'''(?x) @@ -88,14 +274,13 @@ class JSInterpreter(object): (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr) if not m: continue - right_val = self.interpret_expression( - m.group('expr'), local_vars, allow_recursion - 1) + right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion) if m.groupdict().get('index'): lvar = local_vars[m.group('out')] - idx = self.interpret_expression( - m.group('index'), local_vars, allow_recursion) - assert isinstance(idx, int) + idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) + if not isinstance(idx, int): + raise ExtractorError(f'List indices must be integers: {idx}') cur = lvar[idx] val = opfunc(cur, right_val) lvar[idx] = val @@ -109,8 +294,13 @@ class JSInterpreter(object): if expr.isdigit(): return int(expr) + if expr == 'break': + raise JS_Break() + elif expr == 'continue': + raise JS_Continue() + var_m = re.match( - r'(?!if|return|true|false)(?P<name>%s)$' % _NAME_RE, + r'(?!if|return|true|false|null)(?P<name>%s)$' % _NAME_RE, expr) if var_m: return local_vars[var_m.group('name')] @@ -124,91 +314,154 @@ class JSInterpreter(object): r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) if m: val = local_vars[m.group('in')] - idx = self.interpret_expression( - m.group('idx'), local_vars, allow_recursion - 1) + idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion) return val[idx] + for op, opfunc in _OPERATORS: + seperated = list(self._seperate(expr, op)) + if len(seperated) < 2: + continue + right_val = seperated.pop() + 
left_val = op.join(seperated) + left_val, should_abort = self.interpret_statement( + left_val, local_vars, allow_recursion - 1) + if should_abort: + raise ExtractorError(f'Premature left-side return of {op} in {expr!r}') + right_val, should_abort = self.interpret_statement( + right_val, local_vars, allow_recursion - 1) + if should_abort: + raise ExtractorError(f'Premature right-side return of {op} in {expr!r}') + return opfunc(left_val or 0, right_val) + m = re.match( - r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE, + r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*' % _NAME_RE, expr) if m: variable = m.group('var') member = remove_quotes(m.group('member') or m.group('member2')) - arg_str = m.group('args') - - if variable in local_vars: - obj = local_vars[variable] - else: - if variable not in self._objects: - self._objects[variable] = self.extract_object(variable) - obj = self._objects[variable] - - if arg_str is None: - # Member access - if member == 'length': - return len(obj) - return obj[member] - - assert expr.endswith(')') - # Function call - if arg_str == '': - argvals = tuple() + arg_str = expr[m.end():] + if arg_str.startswith('('): + arg_str, remaining = self._seperate_at_paren(arg_str, ')') else: - argvals = tuple([ + arg_str, remaining = None, arg_str + + def assertion(cndn, msg): + """ assert, but without risk of getting optimized out """ + if not cndn: + raise ExtractorError(f'{member} {msg}: {expr}') + + def eval_method(): + nonlocal member + if variable == 'String': + obj = str + elif variable in local_vars: + obj = local_vars[variable] + else: + if variable not in self._objects: + self._objects[variable] = self.extract_object(variable) + obj = self._objects[variable] + + if arg_str is None: + # Member access + if member == 'length': + return len(obj) + return obj[member] + + # Function call + argvals = [ self.interpret_expression(v, local_vars, allow_recursion) - for v in 
arg_str.split(',')]) - - if member == 'split': - assert argvals == ('',) - return list(obj) - if member == 'join': - assert len(argvals) == 1 - return argvals[0].join(obj) - if member == 'reverse': - assert len(argvals) == 0 - obj.reverse() - return obj - if member == 'slice': - assert len(argvals) == 1 - return obj[argvals[0]:] - if member == 'splice': - assert isinstance(obj, list) - index, howMany = argvals - res = [] - for i in range(index, min(index + howMany, len(obj))): - res.append(obj.pop(index)) - return res - - return obj[member](argvals) - - for op, opfunc in _OPERATORS: - m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr) - if not m: - continue - x, abort = self.interpret_statement( - m.group('x'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature left-side return of %s in %r' % (op, expr)) - y, abort = self.interpret_statement( - m.group('y'), local_vars, allow_recursion - 1) - if abort: - raise ExtractorError( - 'Premature right-side return of %s in %r' % (op, expr)) - return opfunc(x, y) + for v in self._seperate(arg_str)] + + if obj == str: + if member == 'fromCharCode': + assertion(argvals, 'takes one or more arguments') + return ''.join(map(chr, argvals)) + raise ExtractorError(f'Unsupported string method {member}') + + if member == 'split': + assertion(argvals, 'takes one or more arguments') + assertion(argvals == [''], 'with arguments is not implemented') + return list(obj) + elif member == 'join': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(len(argvals) == 1, 'takes exactly one argument') + return argvals[0].join(obj) + elif member == 'reverse': + assertion(not argvals, 'does not take any arguments') + obj.reverse() + return obj + elif member == 'slice': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(len(argvals) == 1, 'takes exactly one argument') + return obj[argvals[0]:] + elif member == 'splice': + assertion(isinstance(obj, list), 'must 
be applied on a list') + assertion(argvals, 'takes one or more arguments') + index, howMany = map(int, (argvals + [len(obj)])[:2]) + if index < 0: + index += len(obj) + add_items = argvals[2:] + res = [] + for i in range(index, min(index + howMany, len(obj))): + res.append(obj.pop(index)) + for i, item in enumerate(add_items): + obj.insert(index + i, item) + return res + elif member == 'unshift': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(argvals, 'takes one or more arguments') + for item in reversed(argvals): + obj.insert(0, item) + return obj + elif member == 'pop': + assertion(isinstance(obj, list), 'must be applied on a list') + assertion(not argvals, 'does not take any arguments') + if not obj: + return + return obj.pop() + elif member == 'push': + assertion(argvals, 'takes one or more arguments') + obj.extend(argvals) + return obj + elif member == 'forEach': + assertion(argvals, 'takes one or more arguments') + assertion(len(argvals) <= 2, 'takes at-most 2 arguments') + f, this = (argvals + [''])[:2] + return [f((item, idx, obj), this=this) for idx, item in enumerate(obj)] + elif member == 'indexOf': + assertion(argvals, 'takes one or more arguments') + assertion(len(argvals) <= 2, 'takes at-most 2 arguments') + idx, start = (argvals + [0])[:2] + try: + return obj.index(idx, start) + except ValueError: + return -1 + + if isinstance(obj, list): + member = int(member) + return obj[member](argvals) + + if remaining: + return self.interpret_expression( + self._named_object(local_vars, eval_method()) + remaining, + local_vars, allow_recursion) + else: + return eval_method() - m = re.match( - r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) + m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) if m: fname = m.group('func') argvals = tuple([ int(v) if v.isdigit() else local_vars[v] - for v in m.group('args').split(',')]) if len(m.group('args')) > 0 else tuple() - if fname not in 
self._functions: + for v in self._seperate(m.group('args'))]) + if fname in local_vars: + return local_vars[fname](argvals) + elif fname not in self._functions: self._functions[fname] = self.extract_function(fname) return self._functions[fname](argvals) - raise ExtractorError('Unsupported JS expression %r' % expr) + if expr: + raise ExtractorError('Unsupported JS expression %r' % expr) def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' @@ -233,30 +486,55 @@ class JSInterpreter(object): return obj - def extract_function(self, funcname): + def extract_function_code(self, funcname): + """ @returns argnames, code """ func_m = re.search( r'''(?x) (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* \((?P<args>[^)]*)\)\s* - \{(?P<code>[^}]+)\}''' % ( + (?P<code>\{(?:(?!};)[^"]|"([^"]|\\")*")+\})''' % ( re.escape(funcname), re.escape(funcname), re.escape(funcname)), self.code) + code, _ = self._seperate_at_paren(func_m.group('code'), '}') # refine the match if func_m is None: raise ExtractorError('Could not find JS function %r' % funcname) - argnames = func_m.group('args').split(',') + return func_m.group('args').split(','), code - return self.build_function(argnames, func_m.group('code')) + def extract_function(self, funcname): + return self.extract_function_from_code(*self.extract_function_code(funcname)) + + def extract_function_from_code(self, argnames, code, *global_stack): + local_vars = {} + while True: + mobj = re.search(r'function\((?P<args>[^)]*)\)\s*{', code) + if mobj is None: + break + start, body_start = mobj.span() + body, remaining = self._seperate_at_paren(code[body_start - 1:], '}') + name = self._named_object( + local_vars, + self.extract_function_from_code( + [str.strip(x) for x in mobj.group('args').split(',')], + body, local_vars, *global_stack)) + code = code[:start] + name + remaining + return self.build_function(argnames, code, local_vars, *global_stack) def 
call_function(self, funcname, *args): - f = self.extract_function(funcname) - return f(args) - - def build_function(self, argnames, code): - def resf(args): - local_vars = dict(zip(argnames, args)) - for stmt in code.split(';'): - res, abort = self.interpret_statement(stmt, local_vars) - if abort: + return self.extract_function(funcname)(args) + + def build_function(self, argnames, code, *global_stack): + global_stack = list(global_stack) or [{}] + local_vars = global_stack.pop(0) + + def resf(args, **kwargs): + local_vars.update({ + **dict(zip(argnames, args)), + **kwargs + }) + var_stack = LocalNameSpace(local_vars, *global_stack) + for stmt in self._seperate(code.replace('\n', ''), ';'): + ret, should_abort = self.interpret_statement(stmt, var_stack) + if should_abort: break - return res + return ret return resf diff --git a/yt_dlp/options.py b/yt_dlp/options.py index eb86f9e0c..9da37af28 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -151,25 +151,25 @@ def parseOpts(overrideArguments=None): def _dict_from_options_callback( option, opt_str, value, parser, - allowed_keys=r'[\w-]+', delimiter=':', default_key=None, process=None, multiple_keys=True): + allowed_keys=r'[\w-]+', delimiter=':', default_key=None, process=None, multiple_keys=True, + process_key=str.lower): out_dict = getattr(parser.values, option.dest) if multiple_keys: allowed_keys = r'(%s)(,(%s))*' % (allowed_keys, allowed_keys) mobj = re.match(r'(?i)(?P<keys>%s)%s(?P<val>.*)$' % (allowed_keys, delimiter), value) if mobj is not None: - keys = [k.strip() for k in mobj.group('keys').lower().split(',')] - val = mobj.group('val') + keys, val = mobj.group('keys').split(','), mobj.group('val') elif default_key is not None: keys, val = [default_key], value else: raise optparse.OptionValueError( 'wrong %s formatting; it should be %s, not "%s"' % (opt_str, option.metavar, value)) try: + keys = map(process_key, keys) if process_key else keys val = process(val) if process else val except Exception as 
err: - raise optparse.OptionValueError( - 'wrong %s formatting; %s' % (opt_str, err)) + raise optparse.OptionValueError(f'wrong {opt_str} formatting; {err}') for key in keys: out_dict[key] = val @@ -205,7 +205,7 @@ def parseOpts(overrideArguments=None): general.add_option( '-i', '--ignore-errors', action='store_true', dest='ignoreerrors', - help='Ignore download and postprocessing errors. The download will be considered successfull even if the postprocessing fails') + help='Ignore download and postprocessing errors. The download will be considered successful even if the postprocessing fails') general.add_option( '--no-abort-on-error', action='store_const', dest='ignoreerrors', const='only_download', @@ -274,7 +274,7 @@ def parseOpts(overrideArguments=None): 'allowed_values': { 'filename', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles', 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', - 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', + 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 'embed-metadata', 'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', }, 'aliases': { 'youtube-dl': ['-multistreams', 'all'], @@ -379,7 +379,7 @@ def parseOpts(overrideArguments=None): '--date', metavar='DATE', dest='date', default=None, help=( - 'Download only videos uploaded in this date. ' + 'Download only videos uploaded on this date. ' 'The date can be "YYYYMMDD" or in the format ' '"(now|today)[+-][0-9](day|week|month|year)(s)?"')) selection.add_option( @@ -630,7 +630,7 @@ def parseOpts(overrideArguments=None): action='callback', dest='subtitleslangs', metavar='LANGS', type='str', default=[], callback=_list_from_options_callback, help=( - 'Languages of the subtitles to download (can be regex) or "all" separated by commas. 
(Eg: --sub-langs en.*,ja) ' + 'Languages of the subtitles to download (can be regex) or "all" separated by commas. (Eg: --sub-langs "en.*,ja") ' 'You can prefix the language code with a "-" to exempt it from the requested languages. (Eg: --sub-langs all,-live_chat) ' 'Use --list-subs for a list of available language tags')) @@ -788,7 +788,7 @@ def parseOpts(overrideArguments=None): '--add-header', metavar='FIELD:VALUE', dest='headers', default={}, type='str', action='callback', callback=_dict_from_options_callback, - callback_kwargs={'multiple_keys': False}, + callback_kwargs={'multiple_keys': False, 'process_key': None}, help='Specify a custom HTTP header and its value, separated by a colon ":". You can use this option multiple times', ) workarounds.add_option( @@ -836,7 +836,7 @@ def parseOpts(overrideArguments=None): '--ignore-no-formats-error', action='store_true', dest='ignore_no_formats_error', default=False, help=( - 'Ignore "No video formats" error. Usefull for extracting metadata ' + 'Ignore "No video formats" error. Useful for extracting metadata ' 'even if the videos are not actually available for download (experimental)')) verbosity.add_option( '--no-ignore-no-formats-error', @@ -931,7 +931,7 @@ def parseOpts(overrideArguments=None): 'Template for progress outputs, optionally prefixed with one of "download:" (default), ' '"download-title:" (the console title), "postprocess:", or "postprocess-title:". ' 'The video\'s fields are accessible under the "info" key and ' - 'the progress attributes are accessible under "progress" key. Eg: ' + 'the progress attributes are accessible under "progress" key. 
E.g.: ' # TODO: Document the fields inside "progress" '--console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s"')) verbosity.add_option( @@ -1024,11 +1024,11 @@ def parseOpts(overrideArguments=None): filesystem.add_option( '--windows-filenames', action='store_true', dest='windowsfilenames', default=False, - help='Force filenames to be windows compatible') + help='Force filenames to be Windows-compatible') filesystem.add_option( '--no-windows-filenames', action='store_false', dest='windowsfilenames', - help='Make filenames windows compatible only if using windows (default)') + help='Make filenames Windows-compatible only if using Windows (default)') filesystem.add_option( '--trim-filenames', '--trim-file-names', metavar='LENGTH', dest='trim_file_name', default=0, type=int, @@ -1211,7 +1211,7 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--audio-quality', metavar='QUALITY', dest='audioquality', default='5', - help='Specify ffmpeg audio quality, insert a value between 0 (better) and 9 (worse) for VBR or a specific bitrate like 128K (default %default)') + help='Specify ffmpeg audio quality, insert a value between 0 (best) and 10 (worst) for VBR or a specific bitrate like 128K (default %default)') postproc.add_option( '--remux-video', metavar='FORMAT', dest='remuxvideo', default=None, @@ -1283,7 +1283,9 @@ def parseOpts(overrideArguments=None): postproc.add_option( '--embed-metadata', '--add-metadata', action='store_true', dest='addmetadata', default=False, - help='Embed metadata to the video file. Also adds chapters to file unless --no-add-chapters is used (Alias: --add-metadata)') + help=( + 'Embed metadata to the video file. 
Also embeds chapters/infojson if present ' + 'unless --no-embed-chapters/--no-embed-info-json are used (Alias: --add-metadata)')) postproc.add_option( '--no-embed-metadata', '--no-add-metadata', action='store_false', dest='addmetadata', @@ -1297,6 +1299,14 @@ def parseOpts(overrideArguments=None): action='store_false', dest='addchapters', help='Do not add chapter markers (default) (Alias: --no-add-chapters)') postproc.add_option( + '--embed-info-json', + action='store_true', dest='embed_infojson', default=None, + help='Embed the infojson as an attachment to mkv/mka video files') + postproc.add_option( + '--no-embed-info-json', + action='store_false', dest='embed_infojson', + help='Do not embed the infojson as an attachment to the video file') + postproc.add_option( '--metadata-from-title', metavar='FORMAT', dest='metafromtitle', help=optparse.SUPPRESS_HELP) diff --git a/yt_dlp/postprocessor/__init__.py b/yt_dlp/postprocessor/__init__.py index 07c87b76a..4ae230d2f 100644 --- a/yt_dlp/postprocessor/__init__.py +++ b/yt_dlp/postprocessor/__init__.py @@ -2,6 +2,7 @@ from ..utils import load_plugins +from .common import PostProcessor from .embedthumbnail import EmbedThumbnailPP from .exec import ExecPP, ExecAfterDownloadPP from .ffmpeg import ( @@ -39,5 +40,5 @@ def get_postprocessor(key): return globals()[key + 'PP'] -__all__ = [name for name in globals().keys() if name.endswith('IE')] -__all__.append('FFmpegPostProcessor') +__all__ = [name for name in globals().keys() if name.endswith('PP')] +__all__.extend(('PostProcessor', 'FFmpegPostProcessor')) diff --git a/yt_dlp/postprocessor/ffmpeg.py b/yt_dlp/postprocessor/ffmpeg.py index b7fcc569b..f712547a8 100644 --- a/yt_dlp/postprocessor/ffmpeg.py +++ b/yt_dlp/postprocessor/ffmpeg.py @@ -16,7 +16,8 @@ from ..utils import ( encodeArgument, encodeFilename, float_or_none, - get_exe_version, + _get_exe_version_output, + detect_exe_version, is_outdated_version, ISO639Utils, orderedSet, @@ -27,6 +28,7 @@ from ..utils import ( 
shell_quote, traverse_obj, variadic, + write_json_file, ) @@ -51,6 +53,7 @@ ACODECS = { 'opus': 'libopus', 'vorbis': 'libvorbis', 'wav': None, + 'alac': None, } @@ -75,15 +78,20 @@ class FFmpegPostProcessor(PostProcessor): self.report_warning(warning) @staticmethod + def get_versions_and_features(downloader=None): + pp = FFmpegPostProcessor(downloader) + return pp._versions, pp._features + + @staticmethod def get_versions(downloader=None): - return FFmpegPostProcessor(downloader)._versions + return FFmpegPostProcessor.get_version_and_features(downloader)[0] def _determine_executables(self): programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - prefer_ffmpeg = True - def get_ffmpeg_version(path): - ver = get_exe_version(path, args=['-version']) + def get_ffmpeg_version(path, prog): + out = _get_exe_version_output(path, ['-bsfs']) + ver = detect_exe_version(out) if out else False if ver: regexs = [ r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1] @@ -94,42 +102,52 @@ class FFmpegPostProcessor(PostProcessor): mobj = re.match(regex, ver) if mobj: ver = mobj.group(1) - return ver + self._versions[prog] = ver + if prog != 'ffmpeg' or not out: + return + + mobj = re.search(r'(?m)^\s+libavformat\s+(?:[0-9. ]+)\s+/\s+(?P<runtime>[0-9. ]+)', out) + lavf_runtime_version = mobj.group('runtime').replace(' ', '') if mobj else None + self._features = { + 'fdk': '--enable-libfdk-aac' in out, + 'setts': 'setts' in out.splitlines(), + 'needs_adtstoasc': is_outdated_version(lavf_runtime_version, '57.56.100', False), + } self.basename = None self.probe_basename = None - self._paths = None self._versions = None - if self._downloader: - prefer_ffmpeg = self.get_param('prefer_ffmpeg', True) - location = self.get_param('ffmpeg_location') - if location is not None: - if not os.path.exists(location): - self.report_warning( - 'ffmpeg-location %s does not exist! ' - 'Continuing without ffmpeg.' 
% (location)) - self._versions = {} - return - elif os.path.isdir(location): - dirname, basename = location, None - else: - basename = os.path.splitext(os.path.basename(location))[0] - basename = next((p for p in programs if basename.startswith(p)), 'ffmpeg') - dirname = os.path.dirname(os.path.abspath(location)) - if basename in ('ffmpeg', 'ffprobe'): - prefer_ffmpeg = True - - self._paths = dict( - (p, os.path.join(dirname, p)) for p in programs) - if basename: - self._paths[basename] = location - self._versions = dict( - (p, get_ffmpeg_version(self._paths[p])) for p in programs) - if self._versions is None: - self._versions = dict( - (p, get_ffmpeg_version(p)) for p in programs) - self._paths = dict((p, p) for p in programs) + self._features = {} + + prefer_ffmpeg = self.get_param('prefer_ffmpeg', True) + location = self.get_param('ffmpeg_location') + if location is None: + self._paths = {p: p for p in programs} + else: + if not os.path.exists(location): + self.report_warning( + 'ffmpeg-location %s does not exist! ' + 'Continuing without ffmpeg.' 
% (location)) + self._versions = {} + return + elif os.path.isdir(location): + dirname, basename = location, None + else: + basename = os.path.splitext(os.path.basename(location))[0] + basename = next((p for p in programs if basename.startswith(p)), 'ffmpeg') + dirname = os.path.dirname(os.path.abspath(location)) + if basename in ('ffmpeg', 'ffprobe'): + prefer_ffmpeg = True + + self._paths = dict( + (p, os.path.join(dirname, p)) for p in programs) + if basename: + self._paths[basename] = location + + self._versions = {} + for p in programs: + get_ffmpeg_version(self._paths[p], p) if prefer_ffmpeg is False: prefs = ('avconv', 'ffmpeg') @@ -233,22 +251,23 @@ class FFmpegPostProcessor(PostProcessor): None) return num, len(streams) - def _get_real_video_duration(self, info, fatal=True): + def _get_real_video_duration(self, filepath, fatal=True): try: - if '_real_duration' not in info: - info['_real_duration'] = float_or_none( - traverse_obj(self.get_metadata_object(info['filepath']), ('format', 'duration'))) - if not info['_real_duration']: + duration = float_or_none( + traverse_obj(self.get_metadata_object(filepath), ('format', 'duration'))) + if not duration: raise PostProcessingError('ffprobe returned empty duration') + return duration except PostProcessingError as e: if fatal: - raise PostProcessingError(f'Unable to determine video duration; {e}') - return info.setdefault('_real_duration', None) + raise PostProcessingError(f'Unable to determine video duration: {e.msg}') def _duration_mismatch(self, d1, d2): if not d1 or not d2: return None - return abs(d1 - d2) > 1 + # The duration is often only known to nearest second. So there can be <1sec disparity natually. + # Further excuse an additional <1sec difference. 
+ return abs(d1 - d2) > 2 def run_ffmpeg_multiple_files(self, input_paths, out_path, opts, **kwargs): return self.real_run_ffmpeg( @@ -366,14 +385,36 @@ class FFmpegPostProcessor(PostProcessor): class FFmpegExtractAudioPP(FFmpegPostProcessor): COMMON_AUDIO_EXTS = ('wav', 'flac', 'm4a', 'aiff', 'mp3', 'ogg', 'mka', 'opus', 'wma') - SUPPORTED_EXTS = ('best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav') + SUPPORTED_EXTS = ('best', 'aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav', 'alac') def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, nopostoverwrites=False): FFmpegPostProcessor.__init__(self, downloader) self._preferredcodec = preferredcodec or 'best' - self._preferredquality = preferredquality + self._preferredquality = float_or_none(preferredquality) self._nopostoverwrites = nopostoverwrites + def _quality_args(self, codec): + if self._preferredquality is None: + return [] + elif self._preferredquality > 10: + return ['-b:a', f'{self._preferredquality}k'] + + limits = { + 'libmp3lame': (10, 0), + 'libvorbis': (0, 10), + # FFmpeg's AAC encoder does not have an upper limit for the value of -q:a. 
+ # Experimentally, with values over 4, bitrate changes were minimal or non-existent + 'aac': (0.1, 4), + 'libfdk_aac': (1, 5), + }.get(codec) + if not limits: + return [] + + q = limits[1] + (limits[0] - limits[1]) * (self._preferredquality / 10) + if codec == 'libfdk_aac': + return ['-vbr', f'{int(q)}'] + return ['-q:a', f'{q}'] + def run_ffmpeg(self, path, out_path, codec, more_opts): if codec is None: acodec_opts = [] @@ -387,7 +428,7 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): @PostProcessor._restrict_to(images=False) def run(self, information): - path = information['filepath'] + orig_path = path = information['filepath'] orig_ext = information['ext'] if self._preferredcodec == 'best' and orig_ext in self.COMMON_AUDIO_EXTS: @@ -413,65 +454,67 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): more_opts = ['-f', 'adts'] if filecodec == 'vorbis': extension = 'ogg' + elif filecodec == 'alac': + acodec = None + extension = 'm4a' + more_opts += ['-acodec', 'alac'] else: # MP3 otherwise. 
acodec = 'libmp3lame' extension = 'mp3' - more_opts = [] - if self._preferredquality is not None: - if int(self._preferredquality) < 10: - more_opts += ['-q:a', self._preferredquality] - else: - more_opts += ['-b:a', self._preferredquality + 'k'] + more_opts = self._quality_args(acodec) else: # We convert the audio (lossy if codec is lossy) acodec = ACODECS[self._preferredcodec] + if acodec == 'aac' and self._features.get('fdk'): + acodec = 'libfdk_aac' extension = self._preferredcodec - more_opts = [] - if self._preferredquality is not None: - # The opus codec doesn't support the -aq option - if int(self._preferredquality) < 10 and extension != 'opus': - more_opts += ['-q:a', self._preferredquality] - else: - more_opts += ['-b:a', self._preferredquality + 'k'] + more_opts = self._quality_args(acodec) if self._preferredcodec == 'aac': more_opts += ['-f', 'adts'] - if self._preferredcodec == 'm4a': + elif self._preferredcodec == 'm4a': more_opts += ['-bsf:a', 'aac_adtstoasc'] - if self._preferredcodec == 'vorbis': + elif self._preferredcodec == 'vorbis': extension = 'ogg' - if self._preferredcodec == 'wav': + elif self._preferredcodec == 'wav': extension = 'wav' more_opts += ['-f', 'wav'] + elif self._preferredcodec == 'alac': + extension = 'm4a' + more_opts += ['-acodec', 'alac'] prefix, sep, ext = path.rpartition('.') # not os.path.splitext, since the latter does not work on unicode in all setups - new_path = prefix + sep + extension + temp_path = new_path = prefix + sep + extension - information['filepath'] = new_path - information['ext'] = extension - - # If we download foo.mp3 and convert it to... foo.mp3, then don't delete foo.mp3, silly. 
- if (new_path == path - or (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)))): + if new_path == path: + orig_path = prepend_extension(path, 'orig') + temp_path = prepend_extension(path, 'temp') + if (self._nopostoverwrites and os.path.exists(encodeFilename(new_path)) + and os.path.exists(encodeFilename(orig_path))): self.to_screen('Post-process file %s exists, skipping' % new_path) return [], information try: - self.to_screen('Destination: ' + new_path) - self.run_ffmpeg(path, new_path, acodec, more_opts) + self.to_screen(f'Destination: {new_path}') + self.run_ffmpeg(path, temp_path, acodec, more_opts) except AudioConversionError as e: raise PostProcessingError( 'audio conversion failed: ' + e.msg) except Exception: raise PostProcessingError('error running ' + self.basename) + os.replace(path, orig_path) + os.replace(temp_path, new_path) + information['filepath'] = new_path + information['ext'] = extension + # Try to update the date time for extracted audio file. if information.get('filetime') is not None: self.try_utime( new_path, time.time(), information['filetime'], errnote='Cannot update utime of audio file') - return [path], information + return [orig_path], information class FFmpegVideoConvertorPP(FFmpegPostProcessor): @@ -533,22 +576,22 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): self._already_have_subtitle = already_have_subtitle @PostProcessor._restrict_to(images=False) - def run(self, information): - if information['ext'] not in ('mp4', 'webm', 'mkv'): + def run(self, info): + if info['ext'] not in ('mp4', 'webm', 'mkv'): self.to_screen('Subtitles can only be embedded in mp4, webm or mkv files') - return [], information - subtitles = information.get('requested_subtitles') + return [], info + subtitles = info.get('requested_subtitles') if not subtitles: self.to_screen('There aren\'t any subtitles to embed') - return [], information + return [], info - filename = information['filepath'] - if information.get('duration') and 
self._duration_mismatch( - self._get_real_video_duration(information, False), information['duration']): + filename = info['filepath'] + if info.get('duration') and not info.get('__real_download') and self._duration_mismatch( + self._get_real_video_duration(filename, False), info['duration']): self.to_screen(f'Skipping {self.pp_key()} since the real and expected durations mismatch') - return [], information + return [], info - ext = information['ext'] + ext = info['ext'] sub_langs, sub_names, sub_filenames = [], [], [] webm_vtt_warn = False mp4_ass_warn = False @@ -573,7 +616,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): self.report_warning('ASS subtitles cannot be properly embedded in mp4 files; expect issues') if not sub_langs: - return [], information + return [], info input_files = [filename] + sub_filenames @@ -586,7 +629,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): # https://trac.ffmpeg.org/ticket/6016) '-map', '-0:d', ] - if information['ext'] == 'mp4': + if info['ext'] == 'mp4': opts += ['-c:s', 'mov_text'] for i, (lang, name) in enumerate(zip(sub_langs, sub_names)): opts.extend(['-map', '%d:0' % (i + 1)]) @@ -602,15 +645,16 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): os.replace(temp_filename, filename) files_to_delete = [] if self._already_have_subtitle else sub_filenames - return files_to_delete, information + return files_to_delete, info class FFmpegMetadataPP(FFmpegPostProcessor): - def __init__(self, downloader, add_metadata=True, add_chapters=True): + def __init__(self, downloader, add_metadata=True, add_chapters=True, add_infojson='if_exists'): FFmpegPostProcessor.__init__(self, downloader) self._add_metadata = add_metadata self._add_chapters = add_chapters + self._add_infojson = add_infojson @staticmethod def _options(target_ext): @@ -623,13 +667,23 @@ class FFmpegMetadataPP(FFmpegPostProcessor): @PostProcessor._restrict_to(images=False) def run(self, info): filename, metadata_filename = info['filepath'], None - options = 
[] + files_to_delete, options = [], [] if self._add_chapters and info.get('chapters'): metadata_filename = replace_extension(filename, 'meta') options.extend(self._get_chapter_opts(info['chapters'], metadata_filename)) + files_to_delete.append(metadata_filename) if self._add_metadata: options.extend(self._get_metadata_opts(info)) + if self._add_infojson: + if info['ext'] in ('mkv', 'mka'): + infojson_filename = info.get('infojson_filename') + options.extend(self._get_infojson_opts(info, infojson_filename)) + if not infojson_filename: + files_to_delete.append(info.get('infojson_filename')) + elif self._add_infojson is True: + self.to_screen('The info-json can only be attached to mkv/mka files') + if not options: self.to_screen('There isn\'t any metadata to add') return [], info @@ -639,8 +693,8 @@ class FFmpegMetadataPP(FFmpegPostProcessor): self.run_ffmpeg_multiple_files( (filename, metadata_filename), temp_filename, itertools.chain(self._options(info['ext']), *options)) - if metadata_filename: - os.remove(metadata_filename) + for file in filter(None, files_to_delete): + os.remove(file) # Don't obey --keep-files os.replace(temp_filename, filename) return [], info @@ -692,6 +746,9 @@ class FFmpegMetadataPP(FFmpegPostProcessor): add('season_number') add('episode_id', ('episode', 'episode_id')) add('episode_sort', 'episode_number') + if 'embed-metadata' in self.get_param('compat_opts', []): + add('comment', 'description') + metadata.pop('synopsis', None) for key, value in info.items(): if value is not None and key != meta_prefix and key.startswith(meta_prefix): @@ -709,15 +766,26 @@ class FFmpegMetadataPP(FFmpegPostProcessor): yield ('-metadata:s:%d' % (stream_idx + i), 'language=%s' % lang) stream_idx += stream_count - if ('no-attach-info-json' not in self.get_param('compat_opts', []) - and '__infojson_filename' in info and info['ext'] in ('mkv', 'mka')): - old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json') - 
if old_stream is not None: - yield ('-map', '-0:%d' % old_stream) - new_stream -= 1 + def _get_infojson_opts(self, info, infofn): + if not infofn or not os.path.exists(infofn): + if self._add_infojson is not True: + return + infofn = infofn or '%s.temp' % ( + self._downloader.prepare_filename(info, 'infojson') + or replace_extension(self._downloader.prepare_filename(info), 'info.json', info['ext'])) + if not self._downloader._ensure_dir_exists(infofn): + return + self.write_debug(f'Writing info-json to: {infofn}') + write_json_file(self._downloader.sanitize_info(info, self.get_param('clean_infojson', True)), infofn) + info['infojson_filename'] = infofn + + old_stream, new_stream = self.get_stream_number(info['filepath'], ('tags', 'mimetype'), 'application/json') + if old_stream is not None: + yield ('-map', '-0:%d' % old_stream) + new_stream -= 1 - yield ('-attach', info['__infojson_filename'], - '-metadata:s:%d' % new_stream, 'mimetype=application/json') + yield ('-attach', infofn, + '-metadata:s:%d' % new_stream, 'mimetype=application/json') class FFmpegMergerPP(FFmpegPostProcessor): @@ -788,10 +856,21 @@ class FFmpegFixupM4aPP(FFmpegFixupPostProcessor): class FFmpegFixupM3u8PP(FFmpegFixupPostProcessor): + def _needs_fixup(self, info): + yield info['ext'] in ('mp4', 'm4a') + yield info['protocol'].startswith('m3u8') + try: + metadata = self.get_metadata_object(info['filepath']) + except PostProcessingError as e: + self.report_warning(f'Unable to extract metadata: {e.msg}') + yield True + else: + yield traverse_obj(metadata, ('format', 'format_name'), casesense=False) == 'mpegts' + @PostProcessor._restrict_to(images=False) def run(self, info): - if self.get_audio_codec(info['filepath']) == 'aac': - self._fixup('Fixing malformed AAC bitstream', info['filepath'], [ + if all(self._needs_fixup(info)): + self._fixup('Fixing MPEG-TS in MP4 container', info['filepath'], [ '-c', 'copy', '-map', '0', '-dn', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']) return [], info @@ 
-806,11 +885,10 @@ class FFmpegFixupTimestampPP(FFmpegFixupPostProcessor): @PostProcessor._restrict_to(images=False) def run(self, info): - required_version = '4.4' - if is_outdated_version(self._versions[self.basename], required_version): + if not self._features.get('setts'): self.report_warning( 'A re-encode is needed to fix timestamps in older versions of ffmpeg. ' - f'Please install ffmpeg {required_version} or later to fixup without re-encoding') + 'Please install ffmpeg 4.4 or later to fixup without re-encoding') opts = ['-vf', 'setpts=PTS-STARTPTS'] else: opts = ['-c', 'copy', '-bsf', 'setts=ts=TS-STARTPTS'] diff --git a/yt_dlp/postprocessor/modify_chapters.py b/yt_dlp/postprocessor/modify_chapters.py index dca876200..0728bdcf5 100644 --- a/yt_dlp/postprocessor/modify_chapters.py +++ b/yt_dlp/postprocessor/modify_chapters.py @@ -38,7 +38,7 @@ class ModifyChaptersPP(FFmpegPostProcessor): if not chapters and not sponsor_chapters: return [], info - real_duration = self._get_real_video_duration(info) + real_duration = self._get_real_video_duration(info['filepath']) if not chapters: chapters = [{'start_time': 0, 'end_time': real_duration, 'title': info['title']}] @@ -72,7 +72,6 @@ class ModifyChaptersPP(FFmpegPostProcessor): os.replace(out_file, in_file) files_to_remove.append(uncut_file) - info['_real_duration'] = info['chapters'][-1]['end_time'] return files_to_remove, info def _mark_chapters_to_remove(self, chapters, sponsor_chapters): diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index e70c5f909..36597d41a 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -2006,6 +2006,23 @@ class HTMLAttributeParser(compat_HTMLParser): self.attrs = dict(attrs) +class HTMLListAttrsParser(compat_HTMLParser): + """HTML parser to gather the attributes for the elements of a list""" + + def __init__(self): + compat_HTMLParser.__init__(self) + self.items = [] + self._level = 0 + + def handle_starttag(self, tag, attrs): + if tag == 'li' and self._level == 0: + 
self.items.append(dict(attrs)) + self._level += 1 + + def handle_endtag(self, tag): + self._level -= 1 + + def extract_attributes(html_element): """Given a string for an HTML element such as <el @@ -2032,6 +2049,15 @@ def extract_attributes(html_element): return parser.attrs +def parse_list(webpage): + """Given a string for an series of HTML <li> elements, + return a dictionary of their attributes""" + parser = HTMLListAttrsParser() + parser.feed(webpage) + parser.close() + return parser.items + + def clean_html(html): """Clean an HTML snippet into a readable string""" @@ -2433,7 +2459,14 @@ def bug_reports_message(before=';'): class YoutubeDLError(Exception): """Base exception for YoutubeDL errors.""" - pass + msg = None + + def __init__(self, msg=None): + if msg is not None: + self.msg = msg + elif self.msg is None: + self.msg = type(self).__name__ + super().__init__(self.msg) network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error] @@ -2518,7 +2551,7 @@ class EntryNotInPlaylist(YoutubeDLError): This exception will be thrown by YoutubeDL when a requested entry is not found in the playlist info_dict """ - pass + msg = 'Entry not found in info' class SameFileError(YoutubeDLError): @@ -2527,7 +2560,12 @@ class SameFileError(YoutubeDLError): This exception will be thrown by FileDownloader objects if they detect multiple files would have to be downloaded to the same file on disk. 
""" - pass + msg = 'Fixed output name but more than one file to download' + + def __init__(self, filename=None): + if filename is not None: + self.msg += f': {filename}' + super().__init__(self.msg) class PostProcessingError(YoutubeDLError): @@ -2546,11 +2584,6 @@ class DownloadCancelled(YoutubeDLError): """ Exception raised when the download queue should be interrupted """ msg = 'The download was cancelled' - def __init__(self, msg=None): - if msg is not None: - self.msg = msg - YoutubeDLError.__init__(self, self.msg) - class ExistingVideoReached(DownloadCancelled): """ --break-on-existing triggered """ @@ -2569,7 +2602,7 @@ class MaxDownloadsReached(DownloadCancelled): class ThrottledDownload(YoutubeDLError): """ Download speed below --throttled-rate. """ - pass + msg = 'The download speed is below throttle limit' class UnavailableVideoError(YoutubeDLError): @@ -2578,7 +2611,12 @@ class UnavailableVideoError(YoutubeDLError): This exception will be thrown when a video is requested in a format that is not available for that video. """ - pass + msg = 'Unable to download video' + + def __init__(self, err=None): + if err is not None: + self.msg += f': {err}' + super().__init__(self.msg) class ContentTooShortError(YoutubeDLError): @@ -3871,7 +3909,7 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): return default try: return int(v) * invscale // scale - except (ValueError, TypeError): + except (ValueError, TypeError, OverflowError): return default @@ -4007,10 +4045,7 @@ def check_executable(exe, args=[]): return exe -def get_exe_version(exe, args=['--version'], - version_re=None, unrecognized='present'): - """ Returns the version of the specified executable, - or False if the executable is not present """ +def _get_exe_version_output(exe, args): try: # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers # SIGTTOU if yt-dlp is run in the background. 
@@ -4022,7 +4057,7 @@ def get_exe_version(exe, args=['--version'], return False if isinstance(out, bytes): # Python 2.x out = out.decode('ascii', 'ignore') - return detect_exe_version(out, version_re, unrecognized) + return out def detect_exe_version(output, version_re=None, unrecognized='present'): @@ -4036,6 +4071,14 @@ def detect_exe_version(output, version_re=None, unrecognized='present'): return unrecognized +def get_exe_version(exe, args=['--version'], + version_re=None, unrecognized='present'): + """ Returns the version of the specified executable, + or False if the executable is not present """ + out = _get_exe_version_output(exe, args) + return detect_exe_version(out, version_re, unrecognized) if out else False + + class LazyList(collections.abc.Sequence): ''' Lazy immutable list from an iterable Note that slices of a LazyList are lists and not LazyList''' @@ -4043,10 +4086,10 @@ class LazyList(collections.abc.Sequence): class IndexError(IndexError): pass - def __init__(self, iterable): + def __init__(self, iterable, *, reverse=False, _cache=None): self.__iterable = iter(iterable) - self.__cache = [] - self.__reversed = False + self.__cache = [] if _cache is None else _cache + self.__reversed = reverse def __iter__(self): if self.__reversed: @@ -4112,9 +4155,17 @@ class LazyList(collections.abc.Sequence): self.__exhaust() return len(self.__cache) - def reverse(self): - self.__reversed = not self.__reversed - return self + def __reversed__(self): + return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache) + + def __copy__(self): + return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache) + + def __deepcopy__(self, memo): + # FIXME: This is actually just a shallow copy + id_ = id(self) + memo[id_] = self.__copy__() + return memo[id_] def __repr__(self): # repr and str should mimic a list. 
So we exhaust the iterable @@ -4125,6 +4176,10 @@ class LazyList(collections.abc.Sequence): class PagedList: + + class IndexError(IndexError): + pass + def __len__(self): # This is only useful for tests return len(self.getslice()) @@ -4136,7 +4191,9 @@ class PagedList: self._cache = {} def getpage(self, pagenum): - page_results = self._cache.get(pagenum) or list(self._pagefunc(pagenum)) + page_results = self._cache.get(pagenum) + if page_results is None: + page_results = list(self._pagefunc(pagenum)) if self._use_cache: self._cache[pagenum] = page_results return page_results @@ -4152,7 +4209,9 @@ class PagedList: if not isinstance(idx, int) or idx < 0: raise TypeError('indices must be non-negative integers') entries = self.getslice(idx, idx + 1) - return entries[0] if entries else None + if not entries: + raise self.IndexError() + return entries[0] class OnDemandPagedList(PagedList): @@ -4656,19 +4715,18 @@ def parse_codecs(codecs_str): str.strip, codecs_str.strip().strip(',').split(',')))) vcodec, acodec, hdr = None, None, None for full_codec in split_codecs: - codec = full_codec.split('.')[0] - if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora', 'dvh1', 'dvhe'): + parts = full_codec.split('.') + codec = parts[0].replace('0', '') + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', + 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'): if not vcodec: - vcodec = full_codec + vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1') else full_codec if codec in ('dvh1', 'dvhe'): hdr = 'DV' - elif codec == 'vp9' and vcodec.startswith('vp9.2'): + elif codec == 'av1' and len(parts) > 3 and parts[3] == '10': + hdr = 'HDR10' + elif full_codec.replace('0', '').startswith('vp9.2'): hdr = 'HDR10' - elif codec == 'av01': - parts = full_codec.split('.') - if len(parts) > 3 and parts[3] == '10': - hdr = 'HDR10' - vcodec = '.'.join(parts[:4]) elif codec in ('mp4a', 'opus', 
'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): if not acodec: acodec = full_codec @@ -4759,10 +4817,11 @@ def determine_protocol(info_dict): return compat_urllib_parse_urlparse(url).scheme -def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False): - """ Render a list of rows, each as a list of values """ +def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False): + """ Render a list of rows, each as a list of values. + Text after a \t will be right aligned """ def width(string): - return len(remove_terminal_sequences(string)) + return len(remove_terminal_sequences(string).replace('\t', '')) def get_max_lens(table): return [max(width(str(v)) for v in col) for col in zip(*table)] @@ -4770,21 +4829,24 @@ def render_table(header_row, data, delim=False, extraGap=0, hideEmpty=False): def filter_using_list(row, filterArray): return [col for (take, col) in zip(filterArray, row) if take] - if hideEmpty: + if hide_empty: max_lens = get_max_lens(data) header_row = filter_using_list(header_row, max_lens) data = [filter_using_list(row, max_lens) for row in data] table = [header_row] + data max_lens = get_max_lens(table) - extraGap += 1 + extra_gap += 1 if delim: - table = [header_row] + [[delim * (ml + extraGap) for ml in max_lens]] + data - max_lens[-1] = 0 + table = [header_row, [delim * (ml + extra_gap) for ml in max_lens]] + data + table[1][-1] = table[1][-1][:-extra_gap] # Remove extra_gap from end of delimiter for row in table: for pos, text in enumerate(map(str, row)): - row[pos] = text + (' ' * (max_lens[pos] - width(text) + extraGap)) - ret = '\n'.join(''.join(row) for row in table) + if '\t' in text: + row[pos] = text.replace('\t', ' ' * (max_lens[pos] - width(text))) + ' ' * extra_gap + else: + row[pos] = text + ' ' * (max_lens[pos] - width(text) + extra_gap) + ret = '\n'.join(''.join(row).rstrip() for row in table) return ret @@ -6412,10 +6474,10 @@ def traverse_obj( def _traverse_obj(obj, 
path, _current_depth=0): nonlocal depth - if obj is None: - return None path = tuple(variadic(path)) for i, key in enumerate(path): + if obj is None: + return None if isinstance(key, (list, tuple)): obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key] key = ... @@ -6540,3 +6602,9 @@ def remove_terminal_sequences(string): def number_of_digits(number): return len('%d' % number) + + +def join_nonempty(*values, delim='-', from_dict=None): + if from_dict is not None: + values = map(from_dict.get, values) + return delim.join(map(str, filter(None, values))) diff --git a/yt_dlp/version.py b/yt_dlp/version.py index e7203be6b..5290afa2d 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2021.10.22' +__version__ = '2021.11.10.1' |