diff options
| author | Tom-Oliver Heidel <github@tom-oliver.eu> | 2020-11-30 02:20:18 +0100 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2020-11-30 02:20:18 +0100 | 
| commit | ef5a4db06cb7667dc555f0dc1982de2c5258eb37 (patch) | |
| tree | c947a547303ccf19483e83ea9d3ea77c74b5b7a8 | |
| parent | 93201d50aa5e0bce0ecf6941cd6f9ea6c9ef86a5 (diff) | |
| parent | c78b936af4366259605e3e706bdeb5e173bf3d9b (diff) | |
| download | hypervideo-pre-ef5a4db06cb7667dc555f0dc1982de2c5258eb37.tar.lz hypervideo-pre-ef5a4db06cb7667dc555f0dc1982de2c5258eb37.tar.xz hypervideo-pre-ef5a4db06cb7667dc555f0dc1982de2c5258eb37.zip | |
Merge pull request #245 from pukkandan/merge-main
Merge youtube-dl and fix Youtube Feeds
66 files changed, 3617 insertions, 2252 deletions
| diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index e6de72b33..c27ef9781 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -61,7 +61,7 @@ def build_lazy_ie(ie, name):      return s -# find the correct sorting and add the required base classes so that sublcasses +# find the correct sorting and add the required base classes so that subclasses  # can be correctly created  classes = _ALL_CLASSES[:-1]  ordered_cls = [] diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 968593cd9..0b183b272 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -59,9 +59,9 @@   - **ARD:mediathek**   - **ARDBetaMediathek**   - **Arkena** - - **arte.tv:+7** - - **arte.tv:embed** - - **arte.tv:playlist** + - **ArteTV** + - **ArteTVEmbed** + - **ArteTVPlaylist**   - **AsianCrush**   - **AsianCrushPlaylist**   - **AtresPlayer** @@ -111,6 +111,7 @@   - **Bloomberg**   - **BokeCC**   - **BostonGlobe** + - **Box**   - **Bpb**: Bundeszentrale für politische Bildung   - **BR**: Bayerischer Rundfunk   - **BravoTV** @@ -158,6 +159,7 @@   - **Chilloutzone**   - **chirbit**   - **chirbit:profile** + - **cielotv.it**   - **Cinchcast**   - **Cinemax**   - **CiscoLiveSearch** @@ -425,6 +427,7 @@   - **la7.it**   - **laola1tv**   - **laola1tv:embed** + - **lbry.tv**   - **LCI**   - **Lcp**   - **LcpPlay** @@ -475,6 +478,7 @@   - **massengeschmack.tv**   - **MatchTV**   - **MDR**: MDR.DE and KiKA + - **MedalTV**   - **media.ccc.de**   - **media.ccc.de:lists**   - **Medialaan** @@ -618,6 +622,7 @@   - **Nuvid**   - **NYTimes**   - **NYTimesArticle** + - **NYTimesCooking**   - **NZZ**   - **ocw.mit.edu**   - **OdaTV** @@ -670,6 +675,8 @@   - **PicartoVod**   - **Piksel**   - **Pinkbike** + - **Pinterest** + - **PinterestCollection**   - **Pladform**   - **Platzi**   - **PlatziCourse** @@ -766,6 +773,7 @@   - **RTVNH**   - **RTVS**   - **RUHD** + - **RumbleEmbed**   - **rutube**: Rutube videos   - **rutube:channel**: Rutube channels   - **rutube:embed**: Rutube embedded videos @@ -836,12 +844,14 @@   - **SpankBangPlaylist**   - **Spankwire**   - **Spiegel** - - **Spiegel:Article**: Articles on spiegel.de - - **Spiegeltv**   - **sport.francetvinfo.fr**   - **Sport5**   - **SportBox**   - **SportDeutschland** + - **Spreaker** + - **SpreakerPage** + - **SpreakerShow** + - **SpreakerShowPage**   - **SpringboardPlatform**   - **Sprout**   - **sr:mediathek**: Saarländischer Rundfunk @@ -945,6 +955,7 @@   - **TV2DKBornholmPlay**   - **TV4**: tv4.se and tv4play.se   - **TV5MondePlus**: TV5MONDE+ + - **tv8.it**   - **TVA**   - **TVANouvelles**   - **TVANouvellesArticle** @@ -1059,7 +1070,7 @@   - **vk:wallpost**   - **vlive**   - **vlive:channel** - - **vlive:playlist** + - **vlive:post**   - **Vodlocker**   - **VODPl**   - **VODPlatform** @@ -1148,20 +1159,17 @@   - **YourPorn**   - **YourUpload**   - **youtube**: YouTube.com - - **youtube:channel**: YouTube.com channels - - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication) + - **youtube:favorites**: YouTube.com liked videos, ":ytfav" for short (requires authentication)   - **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication) - - **youtube:live**: YouTube.com live streams   - **youtube:playlist**: YouTube.com playlists - - **youtube:playlists**: YouTube.com user/channel playlists   - **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication) - - **youtube:search**: YouTube.com searches - - **youtube:search:date**: YouTube.com searches, newest videos first + - **youtube:search**: YouTube.com searches, "ytsearch" keyword + - **youtube:search:date**: YouTube.com searches, newest videos first, "ytsearchdate" keyword   - **youtube:search_url**: YouTube.com search URLs - - **youtube:show**: YouTube.com (multi-season) shows - - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication) - - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword) + - **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication) + - **youtube:tab**: YouTube.com tab   - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication) + - **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword   - **Zapiks**   - **Zaq1**   - **Zattoo** diff --git a/test/parameters.json b/test/parameters.json index 7bf59c25f..65fd54428 100644 --- a/test/parameters.json +++ b/test/parameters.json @@ -37,7 +37,7 @@      "writeinfojson": true,       "writesubtitles": false,      "allsubtitles": false, -    "listssubtitles": false, +    "listsubtitles": false,      "socket_timeout": 20,      "fixup": "never"  } diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 6d02c2a54..a9e649191 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -919,6 +919,76 @@ class TestYoutubeDL(unittest.TestCase):          self.assertEqual(downloaded['extractor'], 'testex')          self.assertEqual(downloaded['extractor_key'], 'TestEx') +    # Test case for https://github.com/ytdl-org/youtube-dl/issues/27064 +    def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self): + +        class _YDL(YDL): +            def __init__(self, *args, **kwargs): +                super(_YDL, self).__init__(*args, **kwargs) + +            def trouble(self, s, tb=None): +                pass + +        ydl = _YDL({ +            'format': 'extra', +            'ignoreerrors': True, +        }) + +        class VideoIE(InfoExtractor): +            _VALID_URL = r'video:(?P<id>\d+)' + +            def _real_extract(self, url): +                video_id = self._match_id(url) +                formats = [{ +                    'format_id': 'default', +                    'url': 'url:', +                }] +                if video_id == '0': +                    raise ExtractorError('foo') +                if video_id == '2': +                    formats.append({ +                        'format_id': 'extra', +                        'url': TEST_URL, +                    }) +                return { +                    'id': video_id, +                    'title': 'Video %s' % video_id, +                    'formats': formats, +                } + +        class PlaylistIE(InfoExtractor): +            _VALID_URL = r'playlist:' + +            def _entries(self): +                for n in range(3): +                    video_id = compat_str(n) +                    yield { +                        '_type': 'url_transparent', +                        'ie_key': VideoIE.ie_key(), +                        'id': video_id, +                        'url': 'video:%s' % video_id, +                        'title': 'Video Transparent %s' % video_id, +                    } + +            def _real_extract(self, url): +                return self.playlist_result(self._entries()) + +        ydl.add_info_extractor(VideoIE(ydl)) +        ydl.add_info_extractor(PlaylistIE(ydl)) +        info = ydl.extract_info('playlist:') +        entries = info['entries'] +        self.assertEqual(len(entries), 3) +        self.assertTrue(entries[0] is None) +        self.assertTrue(entries[1] is None) +        self.assertEqual(len(ydl.downloaded_info_dicts), 1) +        downloaded = ydl.downloaded_info_dicts[0] +        self.assertEqual(entries[2], downloaded) +        self.assertEqual(downloaded['url'], TEST_URL) +        self.assertEqual(downloaded['title'], 'Video Transparent 2') +        self.assertEqual(downloaded['id'], '2') +        self.assertEqual(downloaded['extractor'], 'Video') +        self.assertEqual(downloaded['extractor_key'], 'Video') +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 548bc6750..8dcdc4e58 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -31,45 +31,47 @@ class TestAllURLsMatching(unittest.TestCase):      def test_youtube_playlist_matching(self):          assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist']) +        assertTab = lambda url: self.assertMatch(url, ['youtube:tab'])          assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')          assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q')  # 585 -        assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') +        assertPlaylist('PL63F0C78739B09958') +        assertTab('https://www.youtube.com/AsapSCIENCE') +        assertTab('https://www.youtube.com/embedded') +        assertTab('https://www.youtube.com/feed')  # Own channel's home page +        assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')          assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') -        assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') -        assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')  # 668 +        assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') +        assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')  # 668          self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))          # Top tracks -        assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101') +        assertTab('https://www.youtube.com/playlist?list=MCUS.20142101')      def test_youtube_matching(self):          self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))          self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012'))  # 668          self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube']) -        self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) +        # self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])  # /v/ is no longer valid          self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])          self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])      def test_youtube_channel_matching(self): -        assertChannel = lambda url: self.assertMatch(url, ['youtube:channel']) +        assertChannel = lambda url: self.assertMatch(url, ['youtube:tab'])          assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')          assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')          assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') -    def test_youtube_user_matching(self): -        self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user']) +    # def test_youtube_user_matching(self): +    #     self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])      def test_youtube_feeds(self): -        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater']) -        self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions']) -        self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended']) -        self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites']) - -    def test_youtube_show_matching(self): -        self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show']) - -    def test_youtube_search_matching(self): -        self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) -        self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) +        self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab']) +        self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab']) +        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab']) +        self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab']) + +    # def test_youtube_search_matching(self): +    #     self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url']) +    #     self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])      def test_youtube_extract(self):          assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) diff --git a/test/test_utils.py b/test/test_utils.py index 95231200b..16ad40831 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase):          self.assertEqual(d['x'], 1)          self.assertEqual(d['y'], 'a') +        # Just drop ! prefix for now though this results in a wrong value +        on = js_to_json('''{ +            a: !0, +            b: !1, +            c: !!0, +            d: !!42.42, +            e: !!![], +            f: !"abc", +            g: !"", +            !42: 42 +        }''') +        self.assertEqual(json.loads(on), { +            'a': 0, +            'b': 1, +            'c': 0, +            'd': 42.42, +            'e': [], +            'f': "abc", +            'g': "", +            '42': 42 +        }) +          on = js_to_json('["abc", "def",]')          self.assertEqual(json.loads(on), ['abc', 'def']) @@ -994,6 +1016,12 @@ class TestUtil(unittest.TestCase):          on = js_to_json('{42:4.2e1}')          self.assertEqual(json.loads(on), {'42': 42.0}) +        on = js_to_json('{ "0x40": "0x40" }') +        self.assertEqual(json.loads(on), {'0x40': '0x40'}) + +        on = js_to_json('{ "040": "040" }') +        self.assertEqual(json.loads(on), {'040': '040'}) +      def test_js_to_json_malformed(self):          self.assertEqual(js_to_json('42a1'), '42"a1"')          self.assertEqual(js_to_json('42a-1'), '42"a"-1') diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index bf02192eb..ef6fe0a78 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -830,34 +830,23 @@ class YoutubeDL(object):                                      'and will probably not work.')              try: -                try: -                    temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) -                except (AssertionError, IndexError): -                    temp_id = None -                if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): -                    self.to_screen("[%s] %s: has already been recorded in archive" % ( -                                   ie_key, temp_id)) -                    break +                temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url) +            except (AssertionError, IndexError, AttributeError): +                temp_id = None +            if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): +                self.to_screen("[%s] %s: has already been recorded in archive" % ( +                               ie_key, temp_id)) +                break -                ie_result = ie.extract(url) -                if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here) -                    break -                if isinstance(ie_result, list): -                    # Backwards compatibility: old IE result format -                    ie_result = { -                        '_type': 'compat_list', -                        'entries': ie_result, -                    } -                if info_dict: -                    if info_dict.get('id'): -                        ie_result['id'] = info_dict['id'] -                    if info_dict.get('title'): -                        ie_result['title'] = info_dict['title'] -                self.add_default_extra_info(ie_result, ie, url) -                if process: -                    return self.process_ie_result(ie_result, download, extra_info) -                else: -                    return ie_result +            return self.__extract_info(url, ie, download, extra_info, process, info_dict) + +        else: +            self.report_error('no suitable InfoExtractor for URL %s' % url) + +    def __handle_extraction_exceptions(func): +        def wrapper(self, *args, **kwargs): +            try: +                return func(self, *args, **kwargs)              except GeoRestrictedError as e:                  msg = e.msg                  if e.countries: @@ -865,20 +854,38 @@ class YoutubeDL(object):                          map(ISO3166Utils.short2full, e.countries))                  msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'                  self.report_error(msg) -                break              except ExtractorError as e:  # An error we somewhat expected                  self.report_error(compat_str(e), e.format_traceback()) -                break              except MaxDownloadsReached:                  raise              except Exception as e:                  if self.params.get('ignoreerrors', False):                      self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc())) -                    break                  else:                      raise +        return wrapper + +    @__handle_extraction_exceptions +    def __extract_info(self, url, ie, download, extra_info, process, info_dict): +        ie_result = ie.extract(url) +        if ie_result is None:  # Finished already (backwards compatibility; listformats and friends should be moved here) +            return +        if isinstance(ie_result, list): +            # Backwards compatibility: old IE result format +            ie_result = { +                '_type': 'compat_list', +                'entries': ie_result, +            } +        if info_dict: +            if info_dict.get('id'): +                ie_result['id'] = info_dict['id'] +            if info_dict.get('title'): +                ie_result['title'] = info_dict['title'] +        self.add_default_extra_info(ie_result, ie, url) +        if process: +            return self.process_ie_result(ie_result, download, extra_info)          else: -            self.report_error('no suitable InfoExtractor for URL %s' % url) +            return ie_result      def add_default_extra_info(self, ie_result, ie, url):          self.add_extra_info(ie_result, { @@ -1057,9 +1064,8 @@ class YoutubeDL(object):                          self.to_screen('[download] ' + reason)                          continue -                entry_result = self.process_ie_result(entry, -                                                      download=download, -                                                      extra_info=extra) +                entry_result = self.__process_iterable_entry(entry, download, extra) +                # TODO: skip failed (empty) entries?                  playlist_results.append(entry_result)              ie_result['entries'] = playlist_results              self.to_screen('[download] Finished downloading playlist: %s' % playlist) @@ -1088,6 +1094,11 @@ class YoutubeDL(object):          else:              raise Exception('Invalid result type: %s' % result_type) +    @__handle_extraction_exceptions +    def __process_iterable_entry(self, entry, download, extra_info): +        return self.process_ie_result( +            entry, download=download, extra_info=extra_info) +      def _build_format_filter(self, filter_spec):          " Returns a function to filter the formats according to the filter_spec " diff --git a/youtube_dlc/compat.py b/youtube_dlc/compat.py index 1cf7efed6..ac889ddd7 100644 --- a/youtube_dlc/compat.py +++ b/youtube_dlc/compat.py @@ -2345,7 +2345,7 @@ except ImportError:  # Python <3.4          # HTMLParseError has been deprecated in Python 3.3 and removed in          # Python 3.5. Introducing dummy exception for Python >3.5 for compatible -        # and uniform cross-version exceptiong handling +        # and uniform cross-version exception handling          class compat_HTMLParseError(Exception):              pass diff --git a/youtube_dlc/downloader/fragment.py b/youtube_dlc/downloader/fragment.py index 9339b3a62..cf4fd41da 100644 --- a/youtube_dlc/downloader/fragment.py +++ b/youtube_dlc/downloader/fragment.py @@ -97,12 +97,15 @@ class FragmentFD(FileDownloader):      def _download_fragment(self, ctx, frag_url, info_dict, headers=None):          fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index']) -        success = ctx['dl'].download(fragment_filename, { +        fragment_info_dict = {              'url': frag_url,              'http_headers': headers or info_dict.get('http_headers'), -        }) +        } +        success = ctx['dl'].download(fragment_filename, fragment_info_dict)          if not success:              return False, None +        if fragment_info_dict.get('filetime'): +            ctx['fragment_filetime'] = fragment_info_dict.get('filetime')          down, frag_sanitized = sanitize_open(fragment_filename, 'rb')          ctx['fragment_filename_sanitized'] = frag_sanitized          frag_content = down.read() @@ -258,6 +261,13 @@ class FragmentFD(FileDownloader):              downloaded_bytes = ctx['complete_frags_downloaded_bytes']          else:              self.try_rename(ctx['tmpfilename'], ctx['filename']) +            if self.params.get('updatetime', True): +                filetime = ctx.get('fragment_filetime') +                if filetime: +                    try: +                        os.utime(ctx['filename'], (time.time(), filetime)) +                    except Exception: +                        pass              downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename']))          self._hook_progress({ diff --git a/youtube_dlc/downloader/http.py b/youtube_dlc/downloader/http.py index 96379caf1..d8ac41dcc 100644 --- a/youtube_dlc/downloader/http.py +++ b/youtube_dlc/downloader/http.py @@ -109,7 +109,9 @@ class HttpFD(FileDownloader):                  try:                      ctx.data = self.ydl.urlopen(request)                  except (compat_urllib_error.URLError, ) as err: -                    if isinstance(err.reason, socket.timeout): +                    # reason may not be available, e.g. for urllib2.HTTPError on python 2.6 +                    reason = getattr(err, 'reason', None) +                    if isinstance(reason, socket.timeout):                          raise RetryDownload(err)                      raise err                  # When trying to resume, Content-Range HTTP header of response has to be checked diff --git a/youtube_dlc/extractor/afreecatv.py b/youtube_dlc/extractor/afreecatv.py index 6275e5209..b56abb1e6 100644 --- a/youtube_dlc/extractor/afreecatv.py +++ b/youtube_dlc/extractor/afreecatv.py @@ -275,7 +275,7 @@ class AfreecaTVIE(InfoExtractor):          video_element = video_xml.findall(compat_xpath('./track/video'))[-1]          if video_element is None or video_element.text is None:              raise ExtractorError( -                'Video %s video does not exist' % video_id, expected=True) +                'Video %s does not exist' % video_id, expected=True)          video_url = video_element.text.strip() diff --git a/youtube_dlc/extractor/amara.py b/youtube_dlc/extractor/amara.py new file mode 100644 index 000000000..61d469574 --- /dev/null +++ b/youtube_dlc/extractor/amara.py @@ -0,0 +1,103 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import YoutubeIE +from .vimeo import VimeoIE +from ..utils import ( +    int_or_none, +    parse_iso8601, +    update_url_query, +) + + +class AmaraIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)' +    _TESTS = [{ +        # Youtube +        'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video', +        'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae', +        'info_dict': { +            'id': 'h6ZuVdvYnfE', +            'ext': 'mp4', +            'title': 'Why jury trials are becoming less common', +            'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1', +            'thumbnail': r're:^https?://.*\.jpg$', +            'subtitles': dict, +            'upload_date': '20160813', +            'uploader': 'PBS NewsHour', +            'uploader_id': 'PBSNewsHour', +            'timestamp': 1549639570, +        } +    }, { +        # Vimeo +        'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011', +        'md5': '99392c75fa05d432a8f11df03612195e', +        'info_dict': { +            'id': '18622084', +            'ext': 'mov', +            'title': 'Vimeo at CES 2011!', +            'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', +            'thumbnail': r're:^https?://.*\.jpg$', +            'subtitles': dict, +            'timestamp': 1294763658, +            'upload_date': '20110111', +            'uploader': 'Sam Morrill', +            'uploader_id': 'sammorrill' +        } +    }, { +        # Direct Link +        'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/', +        'md5': 'd3970f08512738ee60c5807311ff5d3f', +        'info_dict': { +            'id': 's8KL7I3jLmh6', +            'ext': 'mp4', +            'title': 'The danger of a single story', +            'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23', +            'thumbnail': r're:^https?://.*\.jpg$', +            'subtitles': dict, +            'upload_date': '20091007', +            'timestamp': 1254942511, +        } +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        meta = self._download_json( +            'https://amara.org/api/videos/%s/' % video_id, +            video_id, query={'format': 'json'}) +        title = meta['title'] +        video_url = meta['all_urls'][0] + +        subtitles = {} +        for language in (meta.get('languages') or []): +            subtitles_uri = language.get('subtitles_uri') +            if not (subtitles_uri and language.get('published')): +                continue +            subtitle = subtitles.setdefault(language.get('code') or 'en', []) +            for f in ('json', 'srt', 'vtt'): +                subtitle.append({ +                    'ext': f, +                    'url': update_url_query(subtitles_uri, {'format': f}), +                }) + +        info = { +            'url': video_url, +            'id': video_id, +            'subtitles': subtitles, +            'title': title, +            'description': meta.get('description'), +            'thumbnail': meta.get('thumbnail'), +            'duration': int_or_none(meta.get('duration')), +            'timestamp': parse_iso8601(meta.get('created')), +        } + +        for ie in (YoutubeIE, VimeoIE): +            if ie.suitable(video_url): +                info.update({ +                    '_type': 'url_transparent', +                    'ie_key': ie.ie_key(), +                }) +                break + +        return info diff --git a/youtube_dlc/extractor/arte.py b/youtube_dlc/extractor/arte.py index 2bd3bfe8a..03abdbfaf 100644 --- a/youtube_dlc/extractor/arte.py +++ b/youtube_dlc/extractor/arte.py @@ -4,23 +4,57 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( +    compat_str, +    compat_urlparse, +)  from ..utils import (      ExtractorError,      int_or_none,      qualities,      try_get,      unified_strdate, +    url_or_none,  ) -# There are different sources of video in arte.tv, the extraction process -# is different for each one. The videos usually expire in 7 days, so we can't -# add tests. -  class ArteTVBaseIE(InfoExtractor): -    def _extract_from_json_url(self, json_url, video_id, lang, title=None): -        info = self._download_json(json_url, video_id) +    _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' +    _API_BASE = 'https://api.arte.tv/api/player/v1' + + +class ArteTVIE(ArteTVBaseIE): +    _VALID_URL = r'''(?x) +                    https?:// +                        (?: +                            (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos| +                            api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s) +                        ) +                        /(?P<id>\d{6}-\d{3}-[AF]) +                    ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} +    _TESTS = [{ +        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', +        'info_dict': { +            'id': '088501-000-A', +            'ext': 'mp4', +            'title': 'Mexico: Stealing Petrol to Survive', +            'upload_date': '20190628', +        }, +    }, { +        'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', +        'only_matching': True, +    }, { +        'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        mobj = re.match(self._VALID_URL, url) +        video_id = mobj.group('id') +        lang = mobj.group('lang') or mobj.group('lang_2') + +        info = self._download_json( +            '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)          player_info = info['videoJsonPlayer']          vsr = try_get(player_info, lambda x: x['VSR'], dict) @@ -37,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor):          if not upload_date_str:              upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] -        title = (player_info.get('VTI') or title or player_info['VID']).strip() +        title = (player_info.get('VTI') or player_info['VID']).strip()          subtitle = player_info.get('VSU', '').strip()          if subtitle:              title += ' - %s' % subtitle -        info_dict = { -            'id': player_info['VID'], -            'title': title, -            'description': player_info.get('VDE'), -            'upload_date': unified_strdate(upload_date_str), -            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), -        }          qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])          LANGS = { @@ -65,6 +92,10 @@ class ArteTVBaseIE(InfoExtractor):          formats = []          for format_id, format_dict in vsr.items():              f = dict(format_dict) +            format_url = url_or_none(f.get('url')) +            streamer = f.get('streamer') +            if not format_url and not streamer: +                continue              versionCode = f.get('versionCode')              l = re.escape(langcode) @@ -107,6 +138,16 @@ class ArteTVBaseIE(InfoExtractor):              else:                  lang_pref = -1 +            media_type = f.get('mediaType') +            if media_type == 'hls': +                m3u8_formats = self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id=format_id, fatal=False) +                for m3u8_format in m3u8_formats: +                    m3u8_format['language_preference'] = lang_pref +                formats.extend(m3u8_formats) +                continue +              format = {                  'format_id': format_id,                  'preference': -10 if f.get('videoFormat') == 'M3U8' else None, @@ -118,7 +159,7 @@ class ArteTVBaseIE(InfoExtractor):                  'quality': qfunc(f.get('quality')),              } -            if f.get('mediaType') == 'rtmp': +            if media_type == 'rtmp':                  format['url'] = f['streamer']                  format['play_path'] = 'mp4:' + f['url']                  format['ext'] = 'flv' @@ -127,56 +168,50 @@ class ArteTVBaseIE(InfoExtractor):              formats.append(format) -        self._check_formats(formats, video_id)          self._sort_formats(formats) -        info_dict['formats'] = formats -        return info_dict - +        return { +            'id': player_info.get('VID') or video_id, +            'title': title, +            'description': player_info.get('VDE'), +            'upload_date': unified_strdate(upload_date_str), +            'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), +            'formats': formats, +        } -class ArteTVPlus7IE(ArteTVBaseIE): -    IE_NAME = 'arte.tv:+7' -    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])' +class ArteTVEmbedIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'      _TESTS = [{ -        'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', +        'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',          'info_dict': { -            'id': '088501-000-A', +            'id': '100605-013-A',              'ext': 'mp4', -            'title': 'Mexico: Stealing Petrol to Survive', -            'upload_date': '20190628', +            'title': 'United we Stream November Lockdown Edition #13', +            'description': 'md5:be40b667f45189632b78c1425c7c2ce1', +            'upload_date': '20201116',          }, +    }, { +        'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', +        'only_matching': True,      }] -    def _real_extract(self, url): -        lang, video_id = re.match(self._VALID_URL, url).groups() -        return self._extract_from_json_url( -            'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id), -            video_id, lang) - - -class ArteTVEmbedIE(ArteTVPlus7IE): -    IE_NAME = 'arte.tv:embed' -    _VALID_URL = r'''(?x) -        https://www\.arte\.tv -        /player/v3/index\.php\?json_url= -        (?P<json_url> -            https?://api\.arte\.tv/api/player/v1/config/ -            (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF]) -        ) -    ''' - -    _TESTS = [] +    @staticmethod +    def _extract_urls(webpage): +        return [url for _, url in re.findall( +            r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', +            webpage)]      def _real_extract(self, url): -        json_url, lang, video_id = re.match(self._VALID_URL, url).groups() -        return self._extract_from_json_url(json_url, video_id, lang) +        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) +        json_url = qs['json_url'][0] +        video_id = ArteTVIE._match_id(json_url) +        return self.url_result( +            json_url, ie=ArteTVIE.ie_key(), video_id=video_id)  class ArteTVPlaylistIE(ArteTVBaseIE): -    IE_NAME = 'arte.tv:playlist' -    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})' - +    _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES      _TESTS = [{          'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',          'info_dict': { @@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE):              'description': 'md5:d322c55011514b3a7241f7fb80d494c2',          },          'playlist_mincount': 6, +    }, { +        'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', +        'only_matching': True,      }]      def _real_extract(self, url):          lang, playlist_id = re.match(self._VALID_URL, url).groups()          collection = self._download_json( -            'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos' -            % (lang, playlist_id), playlist_id) +            '%s/collectionData/%s/%s?source=videos' +            % (self._API_BASE, lang, playlist_id), playlist_id) +        entries = [] +        for video in collection['videos']: +            if not isinstance(video, dict): +                continue +            video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) +            if not video_url: +                continue +            video_id = video.get('programId') +            entries.append({ +                '_type': 'url_transparent', +                'url': video_url, +                'id': video_id, +                'title': video.get('title'), +                'alt_title': video.get('subtitle'), +                'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), +                'duration': int_or_none(video.get('durationSeconds')), +                'view_count': int_or_none(video.get('views')), +                'ie_key': ArteTVIE.ie_key(), +            })          title = collection.get('title')          description = collection.get('shortDescription') or collection.get('teaserText') -        entries = [ -            self._extract_from_json_url( -                video['jsonUrl'], video.get('programId') or playlist_id, lang) -            for video in collection['videos'] if video.get('jsonUrl')]          return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dlc/extractor/bandcamp.py b/youtube_dlc/extractor/bandcamp.py index 0e7492764..69e673a26 100644 --- a/youtube_dlc/extractor/bandcamp.py +++ b/youtube_dlc/extractor/bandcamp.py @@ -1,3 +1,4 @@ +# coding: utf-8  from __future__ import unicode_literals  import random @@ -5,10 +6,7 @@ import re  import time  from .common import InfoExtractor -from ..compat import ( -    compat_str, -    compat_urlparse, -) +from ..compat import compat_str  from ..utils import (      ExtractorError,      float_or_none, @@ -17,71 +15,32 @@ from ..utils import (      parse_filesize,      str_or_none,      try_get, -    unescapeHTML,      update_url_query,      unified_strdate,      unified_timestamp,      url_or_none, +    urljoin,  ) -class BandcampBaseIE(InfoExtractor): -    """Provide base functions for Bandcamp extractors""" - -    def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id): -        json_string = self._html_search_regex( -            r' data-%s="([^"]*)' % suffix, -            webpage, '%s json' % suffix, default='{}') - -        return self._parse_json(json_string, video_id) - -    def _parse_json_track(self, json): -        formats = [] -        file_ = json.get('file') -        if isinstance(file_, dict): -            for format_id, format_url in file_.items(): -                if not url_or_none(format_url): -                    continue -                ext, abr_str = format_id.split('-', 1) -                formats.append({ -                    'format_id': format_id, -                    'url': self._proto_relative_url(format_url, 'http:'), -                    'ext': ext, -                    'vcodec': 'none', -                    'acodec': ext, -                    'abr': int_or_none(abr_str), -                }) - -        return { -            'duration': float_or_none(json.get('duration')), -            'id': str_or_none(json.get('track_id') or json.get('id')), -            'title': json.get('title'), -            'title_link': json.get('title_link'), -            'number': int_or_none(json.get('track_num')), -            'formats': formats -        } - - -class BandcampIE(BandcampBaseIE): -    IE_NAME = "Bandcamp:track" -    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)' +class BandcampIE(InfoExtractor): +    _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'      _TESTS = [{          'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',          'md5': 'c557841d5e50261777a6585648adf439',          'info_dict': {              'id': '1812978515',              'ext': 'mp3', -            'title': "youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl  \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", +            'title': "youtube-dl  \"'/\\ä↭ - youtube-dl  \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",              'duration': 9.8485, -            'uploader': "youtube-dl  \"'/\\\u00e4\u21ad", -            'timestamp': 1354224127, +            'uploader': 'youtube-dl  "\'/\\ä↭',              'upload_date': '20121129', +            'timestamp': 1354224127,          },          '_skip': 'There is a limit of 200 free downloads / month for the test song'      }, {          # free download          'url': 'http://benprunty.bandcamp.com/track/lanius-battle', -        'md5': '5d92af55811e47f38962a54c30b07ef0',          'info_dict': {              'id': '2650410135',              'ext': 'aiff', @@ -120,52 +79,59 @@ class BandcampIE(BandcampBaseIE):          },      }] +    def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True): +        return self._parse_json(self._html_search_regex( +            r'data-%s=(["\'])({.+?})\1' % attr, webpage, +            attr + ' data', group=2), video_id, fatal=fatal) +      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        title = mobj.group('title') -        url_track_title = title +        title = self._match_id(url)          webpage = self._download_webpage(url, title) -        thumbnail = self._html_search_meta('og:image', webpage, default=None) - -        json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title) -        json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title) - -        json_tracks = json_tralbum.get('trackinfo') -        if not json_tracks: -            raise ExtractorError('Could not extract track') - -        track = self._parse_json_track(json_tracks[0]) -        artist = json_tralbum.get('artist') -        album_title = json_embed.get('album_title') - -        json_album = json_tralbum.get('packages') -        if json_album: -            json_album = json_album[0] -            album_publish_date = json_album.get('album_publish_date') -            album_release_date = json_album.get('album_release_date') -        else: -            album_publish_date = None -            album_release_date = json_tralbum.get('album_release_date') - -        timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date) -        release_date = unified_strdate(album_release_date) - -        download_link = self._search_regex( -            r'freeDownloadPage(?:["\']|"):\s*(["\']|")(?P<url>(?:(?!\1).)+)\1', webpage, -            'download link', default=None, group='url') +        tralbum = self._extract_data_attr(webpage, title) +        thumbnail = self._og_search_thumbnail(webpage) + +        track_id = None +        track = None +        track_number = None +        duration = None + +        formats = [] +        track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict) +        if track_info: +            file_ = track_info.get('file') +            if isinstance(file_, dict): +                for format_id, format_url in file_.items(): +                    if not url_or_none(format_url): +                        continue +                    ext, abr_str = format_id.split('-', 1) +                    formats.append({ +                        'format_id': format_id, +                        'url': self._proto_relative_url(format_url, 'http:'), +                        'ext': ext, +                        'vcodec': 'none', +                        'acodec': ext, +                        'abr': int_or_none(abr_str), +                    }) +            track = track_info.get('title') +            track_id = str_or_none( +                track_info.get('track_id') or track_info.get('id')) +            track_number = int_or_none(track_info.get('track_num')) +            duration = float_or_none(track_info.get('duration')) + +        embed = self._extract_data_attr(webpage, title, 'embed', False) +        current = tralbum.get('current') or {} +        artist = embed.get('artist') or current.get('artist') or tralbum.get('artist') +        timestamp = unified_timestamp( +            current.get('publish_date') or tralbum.get('album_publish_date')) + +        download_link = tralbum.get('freeDownloadPage')          if download_link: -            track_id = self._search_regex( -                r'\?id=(?P<id>\d+)&', -                download_link, 'track id') +            track_id = compat_str(tralbum['id'])              download_webpage = self._download_webpage(                  download_link, track_id, 'Downloading free downloads page') -            blob = self._parse_json( -                self._search_regex( -                    r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage, -                    'blob', group='blob'), -                track_id, transform_source=unescapeHTML) +            blob = self._extract_data_attr(download_webpage, track_id, 'blob')              info = try_get(                  blob, (lambda x: x['digital_items'][0], @@ -173,6 +139,8 @@ class BandcampIE(BandcampBaseIE):              if info:                  downloads = info.get('downloads')                  if isinstance(downloads, dict): +                    if not track: +                        track = info.get('title')                      if not artist:                          artist = info.get('artist')                      if not thumbnail: @@ -206,7 +174,7 @@ class BandcampIE(BandcampBaseIE):                          retry_url = url_or_none(stat.get('retry_url'))                          if not retry_url:                              continue -                        track['formats'].append({ +                        formats.append({                              'url': self._proto_relative_url(retry_url, 'http:'),                              'ext': download_formats.get(format_id),                              'format_id': format_id, @@ -215,30 +183,34 @@ class BandcampIE(BandcampBaseIE):                              'vcodec': 'none',                          }) -        self._sort_formats(track['formats']) +        self._sort_formats(formats) -        title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title') +        title = '%s - %s' % (artist, track) if artist else track + +        if not duration: +            duration = float_or_none(self._html_search_meta( +                'duration', webpage, default=None))          return { -            'album': album_title, -            'artist': artist, -            'duration': track['duration'], -            'formats': track['formats'], -            'id': track['id'], -            'release_date': release_date, +            'id': track_id, +            'title': title,              'thumbnail': thumbnail, +            'uploader': artist,              'timestamp': timestamp, -            'title': title, -            'track': track['title'], -            'track_id': track['id'], -            'track_number': track['number'], -            'uploader': artist +            'release_date': unified_strdate(tralbum.get('album_release_date')), +            'duration': duration, +            'track': track, +            'track_number': track_number, +            'track_id': track_id, +            'artist': artist, +            'album': embed.get('album_title'), +            'formats': formats,          } -class BandcampAlbumIE(BandcampBaseIE): +class BandcampAlbumIE(BandcampIE):      IE_NAME = 'Bandcamp:album' -    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' +    _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'      _TESTS = [{          'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', @@ -248,7 +220,10 @@ class BandcampAlbumIE(BandcampBaseIE):                  'info_dict': {                      'id': '1353101989',                      'ext': 'mp3', -                    'title': 'Intro', +                    'title': 'Blazo - Intro', +                    'timestamp': 1311756226, +                    'upload_date': '20110727', +                    'uploader': 'Blazo',                  }              },              { @@ -256,7 +231,10 @@ class BandcampAlbumIE(BandcampBaseIE):                  'info_dict': {                      'id': '38097443',                      'ext': 'mp3', -                    'title': 'Kero One - Keep It Alive (Blazo remix)', +                    'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)', +                    'timestamp': 1311757238, +                    'upload_date': '20110727', +                    'uploader': 'Blazo',                  }              },          ], @@ -292,6 +270,7 @@ class BandcampAlbumIE(BandcampBaseIE):              'title': '"Entropy" EP',              'uploader_id': 'jstrecords',              'id': 'entropy-ep', +            'description': 'md5:0ff22959c943622972596062f2f366a5',          },          'playlist_mincount': 3,      }, { @@ -301,6 +280,7 @@ class BandcampAlbumIE(BandcampBaseIE):              'id': 'we-are-the-plague',              'title': 'WE ARE THE PLAGUE',              'uploader_id': 'insulters', +            'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',          },          'playlist_count': 2,      }] @@ -312,41 +292,34 @@ class BandcampAlbumIE(BandcampBaseIE):                  else super(BandcampAlbumIE, cls).suitable(url))      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        uploader_id = mobj.group('subdomain') -        album_id = mobj.group('album_id') +        uploader_id, album_id = re.match(self._VALID_URL, url).groups()          playlist_id = album_id or uploader_id          webpage = self._download_webpage(url, playlist_id) - -        json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id) -        json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id) - -        json_tracks = json_tralbum.get('trackinfo') -        if not json_tracks: -            raise ExtractorError('Could not extract album tracks') - -        album_title = json_embed.get('album_title') - +        tralbum = self._extract_data_attr(webpage, playlist_id) +        track_info = tralbum.get('trackinfo') +        if not track_info: +            raise ExtractorError('The page doesn\'t contain any tracks')          # Only tracks with duration info have songs -        tracks = [self._parse_json_track(track) for track in json_tracks]          entries = [              self.url_result( -                compat_urlparse.urljoin(url, track['title_link']), -                ie=BandcampIE.ie_key(), video_id=track['id'], -                video_title=track['title']) -            for track in tracks -            if track.get('duration')] +                urljoin(url, t['title_link']), BandcampIE.ie_key(), +                str_or_none(t.get('track_id') or t.get('id')), t.get('title')) +            for t in track_info +            if t.get('duration')] + +        current = tralbum.get('current') or {}          return {              '_type': 'playlist',              'uploader_id': uploader_id,              'id': playlist_id, -            'title': album_title, -            'entries': entries +            'title': current.get('title'), +            'description': current.get('about'), +            'entries': entries,          } -class BandcampWeeklyIE(InfoExtractor): +class BandcampWeeklyIE(BandcampIE):      IE_NAME = 'Bandcamp:weekly'      _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'      _TESTS = [{ @@ -361,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor):              'release_date': '20170404',              'series': 'Bandcamp Weekly',              'episode': 'Magic Moments', -            'episode_number': 208,              'episode_id': '224', -        } +        }, +        'params': { +            'format': 'opus-lo', +        },      }, {          'url': 'https://bandcamp.com/?blah/blah@&show=228',          'only_matching': True      }]      def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) - -        blob = self._parse_json( -            self._search_regex( -                r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage, -                'blob', group='blob'), -            video_id, transform_source=unescapeHTML) +        show_id = self._match_id(url) +        webpage = self._download_webpage(url, show_id) -        show = blob['bcw_show'] +        blob = self._extract_data_attr(webpage, show_id, 'blob') -        # This is desired because any invalid show id redirects to `bandcamp.com` -        # which happens to expose the latest Bandcamp Weekly episode. -        show_id = int_or_none(show.get('show_id')) or int_or_none(video_id) +        show = blob['bcw_data'][show_id]          formats = []          for format_id, format_url in show['audio_stream'].items(): @@ -408,20 +375,8 @@ class BandcampWeeklyIE(InfoExtractor):          if subtitle:              title += ' - %s' % subtitle -        episode_number = None -        seq = blob.get('bcw_seq') - -        if seq and isinstance(seq, list): -            try: -                episode_number = next( -                    int_or_none(e.get('episode_number')) -                    for e in seq -                    if isinstance(e, dict) and int_or_none(e.get('id')) == show_id) -            except StopIteration: -                pass -          return { -            'id': video_id, +            'id': show_id,              'title': title,              'description': show.get('desc') or show.get('short_desc'),              'duration': float_or_none(show.get('audio_duration')), @@ -429,7 +384,6 @@ class BandcampWeeklyIE(InfoExtractor):              'release_date': unified_strdate(show.get('published_date')),              'series': 'Bandcamp Weekly',              'episode': show.get('subtitle'), -            'episode_number': episode_number, -            'episode_id': compat_str(video_id), +            'episode_id': show_id,              'formats': formats          } diff --git a/youtube_dlc/extractor/bbc.py b/youtube_dlc/extractor/bbc.py index 002c39c39..54cbcdc8e 100644 --- a/youtube_dlc/extractor/bbc.py +++ b/youtube_dlc/extractor/bbc.py @@ -981,7 +981,7 @@ class BBCIE(BBCCoUkIE):          group_id = self._search_regex(              r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,              webpage, 'group id', default=None) -        if playlist_id: +        if group_id:              return self.url_result(                  'https://www.bbc.co.uk/programmes/%s' % group_id,                  ie=BBCCoUkIE.ie_key()) @@ -1092,10 +1092,26 @@ class BBCIE(BBCCoUkIE):              self._search_regex(                  r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,                  'bbcthree config', default='{}'), -            playlist_id, transform_source=js_to_json, fatal=False) -        if bbc3_config: +            playlist_id, transform_source=js_to_json, fatal=False) or {} +        payload = bbc3_config.get('payload') or {} +        if payload: +            clip = payload.get('currentClip') or {} +            clip_vpid = clip.get('vpid') +            clip_title = clip.get('title') +            if clip_vpid and clip_title: +                formats, subtitles = self._download_media_selector(clip_vpid) +                self._sort_formats(formats) +                return { +                    'id': clip_vpid, +                    'title': clip_title, +                    'thumbnail': dict_get(clip, ('poster', 'imageUrl')), +                    'description': clip.get('description'), +                    'duration': parse_duration(clip.get('duration')), +                    'formats': formats, +                    'subtitles': subtitles, +                }              bbc3_playlist = try_get( -                bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'], +                payload, lambda x: x['content']['bbcMedia']['playlist'],                  dict)              if bbc3_playlist:                  playlist_title = bbc3_playlist.get('title') or playlist_title @@ -1118,6 +1134,39 @@ class BBCIE(BBCCoUkIE):                  return self.playlist_result(                      entries, playlist_id, playlist_title, playlist_description) +        initial_data = self._parse_json(self._search_regex( +            r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, +            'preload state', default='{}'), playlist_id, fatal=False) +        if initial_data: +            def parse_media(media): +                if not media: +                    return +                for item in (try_get(media, lambda x: x['media']['items'], list) or []): +                    item_id = item.get('id') +                    item_title = item.get('title') +                    if not (item_id and item_title): +                        continue +                    formats, subtitles = self._download_media_selector(item_id) +                    self._sort_formats(formats) +                    entries.append({ +                        'id': item_id, +                        'title': item_title, +                        'thumbnail': item.get('holdingImageUrl'), +                        'formats': formats, +                        'subtitles': subtitles, +                    }) +            for resp in (initial_data.get('data') or {}).values(): +                name = resp.get('name') +                if name == 'media-experience': +                    parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict)) +                elif name == 'article': +                    for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []): +                        if block.get('type') != 'media': +                            continue +                        parse_media(block.get('model')) +            return self.playlist_result( +                entries, playlist_id, playlist_title, playlist_description) +          def extract_all(pattern):              return list(filter(None, map(                  lambda s: self._parse_json(s, playlist_id, fatal=False), diff --git a/youtube_dlc/extractor/box.py b/youtube_dlc/extractor/box.py new file mode 100644 index 000000000..aae82d1af --- /dev/null +++ b/youtube_dlc/extractor/box.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    parse_iso8601, +    # try_get, +    update_url_query, +) + + +class BoxIE(InfoExtractor): +    _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)' +    _TEST = { +        'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538', +        'md5': '1f81b2fd3960f38a40a3b8823e5fcd43', +        'info_dict': { +            'id': '510727257538', +            'ext': 'mp4', +            'title': 'Garber   St. Louis will be 28th MLS team  +scarving.mp4', +            'uploader': 'MLS Video', +            'timestamp': 1566320259, +            'upload_date': '20190820', +            'uploader_id': '235196876', +        } +    } + +    def _real_extract(self, url): +        shared_name, file_id = re.match(self._VALID_URL, url).groups() +        webpage = self._download_webpage(url, file_id) +        request_token = self._parse_json(self._search_regex( +            r'Box\.config\s*=\s*({.+?});', webpage, +            'Box config'), file_id)['requestToken'] +        access_token = self._download_json( +            'https://app.box.com/app-api/enduserapp/elements/tokens', file_id, +            'Downloading token JSON metadata', +            data=json.dumps({'fileIDs': [file_id]}).encode(), headers={ +                'Content-Type': 'application/json', +                'X-Request-Token': request_token, +                'X-Box-EndUser-API': 'sharedName=' + shared_name, +            })[file_id]['read'] +        shared_link = 'https://app.box.com/s/' + shared_name +        f = self._download_json( +            'https://api.box.com/2.0/files/' + file_id, file_id, +            'Downloading file JSON metadata', headers={ +                'Authorization': 'Bearer ' + access_token, +                'BoxApi': 'shared_link=' + shared_link, +                'X-Rep-Hints': '[dash]',  # TODO: extract `hls` formats +            }, query={ +                'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size' +            }) +        title = f['name'] + +        query = { +            'access_token': access_token, +            'shared_link': shared_link +        } + +        formats = [] + +        # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []): +        #     entry_url_template = try_get( +        #         entry, lambda x: x['content']['url_template']) +        #     if not entry_url_template: +        #         continue +        #     representation = entry.get('representation') +        #     if representation == 'dash': +        #         TODO: append query to every fragment URL +        #         formats.extend(self._extract_mpd_formats( +        #             entry_url_template.replace('{+asset_path}', 'manifest.mpd'), +        #             file_id, query=query)) + +        authenticated_download_url = f.get('authenticated_download_url') +        if authenticated_download_url and f.get('is_download_available'): +            formats.append({ +                'ext': f.get('extension') or determine_ext(title), +                'filesize': f.get('size'), +                'format_id': 'download', +                'url': update_url_query(authenticated_download_url, query), +            }) + +        self._sort_formats(formats) + +        creator = f.get('created_by') or {} + +        return { +            'id': file_id, +            'title': title, +            'formats': formats, +            'description': f.get('description') or None, +            'uploader': creator.get('name'), +            'timestamp': parse_iso8601(f.get('created_at')), +            'uploader_id': creator.get('id'), +        } diff --git a/youtube_dlc/extractor/brightcove.py b/youtube_dlc/extractor/brightcove.py index 638673c31..c6ca939dd 100644 --- a/youtube_dlc/extractor/brightcove.py +++ b/youtube_dlc/extractor/brightcove.py @@ -147,7 +147,7 @@ class BrightcoveLegacyIE(InfoExtractor):      ]      @classmethod -    def _build_brighcove_url(cls, object_str): +    def _build_brightcove_url(cls, object_str):          """          Build a Brightcove url from a xml string containing          <object class="BrightcoveExperience">{params}</object> @@ -217,7 +217,7 @@ class BrightcoveLegacyIE(InfoExtractor):          return cls._make_brightcove_url(params)      @classmethod -    def _build_brighcove_url_from_js(cls, object_js): +    def _build_brightcove_url_from_js(cls, object_js):          # The layout of JS is as follows:          # customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {          #   // build Brightcove <object /> XML @@ -272,12 +272,12 @@ class BrightcoveLegacyIE(InfoExtractor):              ).+?>\s*</object>''',              webpage)          if matches: -            return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) +            return list(filter(None, [cls._build_brightcove_url(m) for m in matches]))          matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)          if matches:              return list(filter(None, [ -                cls._build_brighcove_url_from_js(custom_bc) +                cls._build_brightcove_url_from_js(custom_bc)                  for custom_bc in matches]))          return [src for _, src in re.findall(              r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] diff --git a/youtube_dlc/extractor/cda.py b/youtube_dlc/extractor/cda.py index 0c3af23d5..d67900e62 100644 --- a/youtube_dlc/extractor/cda.py +++ b/youtube_dlc/extractor/cda.py @@ -5,10 +5,16 @@ import codecs  import re  from .common import InfoExtractor +from ..compat import ( +    compat_chr, +    compat_ord, +    compat_urllib_parse_unquote, +)  from ..utils import (      ExtractorError,      float_or_none,      int_or_none, +    merge_dicts,      multipart_encode,      parse_duration,      random_birthday, @@ -107,8 +113,9 @@ class CDAIE(InfoExtractor):              r'Odsłony:(?:\s| )*([0-9]+)', webpage,              'view_count', default=None)          average_rating = self._search_regex( -            r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', -            webpage, 'rating', fatal=False, group='rating_value') +            (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)', +             r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False, +            group='rating_value')          info_dict = {              'id': video_id, @@ -123,6 +130,24 @@ class CDAIE(InfoExtractor):              'age_limit': 18 if need_confirm_age else 0,          } +        # Source: https://www.cda.pl/js/player.js?t=1606154898 +        def decrypt_file(a): +            for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'): +                a = a.replace(p, '') +            a = compat_urllib_parse_unquote(a) +            b = [] +            for c in a: +                f = compat_ord(c) +                b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f)) +            a = ''.join(b) +            a = a.replace('.cda.mp4', '') +            for p in ('.2cda.pl', '.3cda.pl'): +                a = a.replace(p, '.cda.pl') +            if '/upstream' in a: +                a = a.replace('/upstream', '.mp4/upstream') +                return 'https://' + a +            return 'https://' + a + '.mp4' +          def extract_format(page, version):              json_str = self._html_search_regex(                  r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page, @@ -141,6 +166,8 @@ class CDAIE(InfoExtractor):                  video['file'] = codecs.decode(video['file'], 'rot_13')                  if video['file'].endswith('adc.mp4'):                      video['file'] = video['file'].replace('adc.mp4', '.mp4') +            elif not video['file'].startswith('http'): +                video['file'] = decrypt_file(video['file'])              f = {                  'url': video['file'],              } @@ -179,4 +206,6 @@ class CDAIE(InfoExtractor):          self._sort_formats(formats) -        return info_dict +        info = self._search_json_ld(webpage, video_id, default={}) + +        return merge_dicts(info_dict, info) diff --git a/youtube_dlc/extractor/cnbc.py b/youtube_dlc/extractor/cnbc.py index 6889b0f40..7b9f4536a 100644 --- a/youtube_dlc/extractor/cnbc.py +++ b/youtube_dlc/extractor/cnbc.py @@ -1,6 +1,7 @@  # coding: utf-8  from __future__ import unicode_literals +import re  from .common import InfoExtractor  from ..utils import smuggle_url @@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor):  class CNBCVideoIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)' +    _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'      _TEST = {          'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',          'info_dict': { @@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor):      }      def _real_extract(self, url): -        display_id = self._match_id(url) -        webpage = self._download_webpage(url, display_id) -        video_id = self._search_regex( -            r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id, -            'video id') +        path, display_id = re.match(self._VALID_URL, url).groups() +        video_id = self._download_json( +            'https://webql-redesign.cnbcfm.com/graphql', display_id, query={ +                'query': '''{ +  page(path: "%s") { +    vcpsId +  } +}''' % path, +            })['data']['page']['vcpsId']          return self.url_result( -            'http://video.cnbc.com/gallery/?video=%s' % video_id, +            'http://video.cnbc.com/gallery/?video=%d' % video_id,              CNBCIE.ie_key()) diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py index 4b42d699f..aacdf06fe 100644 --- a/youtube_dlc/extractor/common.py +++ b/youtube_dlc/extractor/common.py @@ -1456,9 +1456,10 @@ class InfoExtractor(object):          try:              self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)              return True -        except ExtractorError: +        except ExtractorError as e:              self.to_screen( -                '%s: %s URL is invalid, skipping' % (video_id, item)) +                '%s: %s URL is invalid, skipping: %s' +                % (video_id, item, error_to_compat_str(e.cause)))              return False      def http_scheme(self): @@ -1663,7 +1664,7 @@ class InfoExtractor(object):          # just the media without qualities renditions.          # Fortunately, master playlist can be easily distinguished from media          # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] -        # master playlist tags MUST NOT appear in a media playist and vice versa. +        # master playlist tags MUST NOT appear in a media playlist and vice versa.          # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every          # media playlist and MUST NOT appear in master playlist thus we can          # clearly detect media playlist with this criterion. @@ -2596,6 +2597,7 @@ class InfoExtractor(object):      def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):          formats = [] +          hdcore_sign = 'hdcore=3.7.0'          f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')          hds_host = hosts.get('hds') @@ -2608,6 +2610,7 @@ class InfoExtractor(object):          for entry in f4m_formats:              entry.update({'extra_param_to_segment_url': hdcore_sign})          formats.extend(f4m_formats) +          m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')          hls_host = hosts.get('hls')          if hls_host: @@ -2615,6 +2618,31 @@ class InfoExtractor(object):          formats.extend(self._extract_m3u8_formats(              m3u8_url, video_id, 'mp4', 'm3u8_native',              m3u8_id='hls', fatal=False)) + +        http_host = hosts.get('http') +        if http_host and 'hdnea=' not in manifest_url: +            REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+' +            qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',') +            qualities_length = len(qualities) +            if len(formats) in (qualities_length + 1, qualities_length * 2 + 1): +                i = 0 +                http_formats = [] +                for f in formats: +                    if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none': +                        for protocol in ('http', 'https'): +                            http_f = f.copy() +                            del http_f['manifest_url'] +                            http_url = re.sub( +                                REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url']) +                            http_f.update({ +                                'format_id': http_f['format_id'].replace('hls-', protocol + '-'), +                                'url': http_url, +                                'protocol': protocol, +                            }) +                            http_formats.append(http_f) +                        i += 1 +                formats.extend(http_formats) +          return formats      def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): diff --git a/youtube_dlc/extractor/condenast.py b/youtube_dlc/extractor/condenast.py index ed278fefc..d5e77af32 100644 --- a/youtube_dlc/extractor/condenast.py +++ b/youtube_dlc/extractor/condenast.py @@ -16,6 +16,8 @@ from ..utils import (      mimetype2ext,      orderedSet,      parse_iso8601, +    strip_or_none, +    try_get,  ) @@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor):              'uploader': 'gq',              'upload_date': '20170321',              'timestamp': 1490126427, +            'description': 'How much grimmer would things be if these people were competent?',          },      }, {          # JS embed @@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor):              'title': '3D printed TSA Travel Sentry keys really do open TSA locks',              'uploader': 'arstechnica',              'upload_date': '20150916', -            'timestamp': 1442434955, +            'timestamp': 1442434920,          }      }, {          'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', @@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor):              })          self._sort_formats(formats) +        subtitles = {} +        for t, caption in video_info.get('captions', {}).items(): +            caption_url = caption.get('src') +            if not (t in ('vtt', 'srt', 'tml') and caption_url): +                continue +            subtitles.setdefault('en', []).append({'url': caption_url}) +          return {              'id': video_id,              'formats': formats, @@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor):              'season': video_info.get('season_title'),              'timestamp': parse_iso8601(video_info.get('premiere_date')),              'categories': video_info.get('categories'), +            'subtitles': subtitles,          }      def _real_extract(self, url): @@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor):          if url_type == 'series':              return self._extract_series(url, webpage)          else: -            params = self._extract_video_params(webpage, display_id) -            info = self._search_json_ld( -                webpage, display_id, fatal=False) +            video = try_get(self._parse_json(self._search_regex( +                r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage, +                'preload state', '{}'), display_id), +                lambda x: x['transformed']['video']) +            if video: +                params = {'videoId': video['id']} +                info = {'description': strip_or_none(video.get('description'))} +            else: +                params = self._extract_video_params(webpage, display_id) +                info = self._search_json_ld( +                    webpage, display_id, fatal=False)              info.update(self._extract_video(params))              return info diff --git a/youtube_dlc/extractor/discoverynetworks.py b/youtube_dlc/extractor/discoverynetworks.py index 607a54948..c512b95d0 100644 --- a/youtube_dlc/extractor/discoverynetworks.py +++ b/youtube_dlc/extractor/discoverynetworks.py @@ -7,7 +7,7 @@ from .dplay import DPlayIE  class DiscoveryNetworksDeIE(DPlayIE): -    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)' +    _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)'      _TESTS = [{          'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', @@ -29,6 +29,9 @@ class DiscoveryNetworksDeIE(DPlayIE):      }, {          'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B',          'only_matching': True, +    }, { +        'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', +        'only_matching': True,      }]      def _real_extract(self, url): diff --git a/youtube_dlc/extractor/europa.py b/youtube_dlc/extractor/europa.py index 1efc0b2ec..2c1c747a1 100644 --- a/youtube_dlc/extractor/europa.py +++ b/youtube_dlc/extractor/europa.py @@ -60,7 +60,7 @@ class EuropaIE(InfoExtractor):          title = get_item('title', preferred_langs) or video_id          description = get_item('description', preferred_langs) -        thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail') +        thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail')          upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date'))          duration = parse_duration(xpath_text(playlist, './info/duration', 'duration'))          view_count = int_or_none(xpath_text(playlist, './info/views', 'views')) @@ -85,7 +85,7 @@ class EuropaIE(InfoExtractor):              'id': video_id,              'title': title,              'description': description, -            'thumbnail': thumbnmail, +            'thumbnail': thumbnail,              'upload_date': upload_date,              'duration': duration,              'view_count': view_count, diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 90232c2a7..772746bb2 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -36,6 +36,7 @@ from .afreecatv import AfreecaTVIE  from .airmozilla import AirMozillaIE  from .aljazeera import AlJazeeraIE  from .alphaporno import AlphaPornoIE +from .amara import AmaraIE  from .alura import (      AluraIE,      AluraCourseIE @@ -62,7 +63,7 @@ from .ard import (      ARDMediathekIE,  )  from .arte import ( -    ArteTVPlus7IE, +    ArteTVIE,      ArteTVEmbedIE,      ArteTVPlaylistIE,  ) @@ -129,6 +130,7 @@ from .blinkx import BlinkxIE  from .bloomberg import BloombergIE  from .bokecc import BokeCCIE  from .bostonglobe import BostonGlobeIE +from .box import BoxIE  from .bpb import BpbIE  from .br import (      BRIE, @@ -546,6 +548,7 @@ from .laola1tv import (      EHFTVIE,      ITTFIE,  ) +from .lbry import LBRYIE  from .lci import LCIIE  from .lcp import (      LcpPlayIE, @@ -621,6 +624,7 @@ from .markiza import (  from .massengeschmacktv import MassengeschmackTVIE  from .matchtv import MatchTVIE  from .mdr import MDRIE +from .medaltv import MedalTVIE  from .mediaset import MediasetIE  from .mediasite import (      MediasiteIE, @@ -803,6 +807,7 @@ from .ntvru import NTVRuIE  from .nytimes import (      NYTimesIE,      NYTimesArticleIE, +    NYTimesCookingIE,  )  from .nuvid import NuvidIE  from .nzz import NZZIE @@ -865,6 +870,10 @@ from .picarto import (  )  from .piksel import PikselIE  from .pinkbike import PinkbikeIE +from .pinterest import ( +    PinterestIE, +    PinterestCollectionIE, +)  from .pladform import PladformIE  from .platzi import (      PlatziIE, @@ -983,6 +992,7 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe  from .rtvnh import RTVNHIE  from .rtvs import RTVSIE  from .ruhd import RUHDIE +from .rumble import RumbleEmbedIE  from .rutube import (      RutubeIE,      RutubeChannelIE, @@ -1083,8 +1093,7 @@ from .spankbang import (      SpankBangPlaylistIE,  )  from .spankwire import SpankwireIE -from .spiegel import SpiegelIE, SpiegelArticleIE -from .spiegeltv import SpiegeltvIE +from .spiegel import SpiegelIE  from .spike import (      BellatorIE,      ParamountNetworkIE, @@ -1098,6 +1107,12 @@ from .stitcher import StitcherIE  from .sport5 import Sport5IE  from .sportbox import SportBoxIE  from .sportdeutschland import SportDeutschlandIE +from .spreaker import ( +    SpreakerIE, +    SpreakerPageIE, +    SpreakerShowIE, +    SpreakerShowPageIE, +)  from .springboardplatform import SpringboardPlatformIE  from .sprout import SproutIE  from .srgssr import ( @@ -1391,8 +1406,8 @@ from .vk import (  )  from .vlive import (      VLiveIE, +    VLivePostIE,      VLiveChannelIE, -    VLivePlaylistIE  )  from .vodlocker import VodlockerIE  from .vodpl import VODPlIE @@ -1509,21 +1524,18 @@ from .yourporn import YourPornIE  from .yourupload import YourUploadIE  from .youtube import (      YoutubeIE, -    YoutubeChannelIE,      YoutubeFavouritesIE,      YoutubeHistoryIE, -    YoutubeLiveIE, +    YoutubeTabIE,      YoutubePlaylistIE, -    YoutubePlaylistsIE,      YoutubeRecommendedIE,      YoutubeSearchDateIE,      YoutubeSearchIE,      YoutubeSearchURLIE, -    YoutubeShowIE,      YoutubeSubscriptionsIE,      YoutubeTruncatedIDIE,      YoutubeTruncatedURLIE, -    YoutubeUserIE, +    YoutubeYtUserIE,      YoutubeWatchLaterIE,  )  from .zapiks import ZapiksIE diff --git a/youtube_dlc/extractor/franceinter.py b/youtube_dlc/extractor/franceinter.py index 05806895c..ae822a50e 100644 --- a/youtube_dlc/extractor/franceinter.py +++ b/youtube_dlc/extractor/franceinter.py @@ -16,6 +16,7 @@ class FranceInterIE(InfoExtractor):              'ext': 'mp3',              'title': 'Affaire Cahuzac : le contentieux du compte en Suisse',              'description': 'md5:401969c5d318c061f86bda1fa359292b', +            'thumbnail': r're:^https?://.*\.jpg',              'upload_date': '20160907',          },      } @@ -31,6 +32,7 @@ class FranceInterIE(InfoExtractor):          title = self._og_search_title(webpage)          description = self._og_search_description(webpage) +        thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)          upload_date_str = self._search_regex(              r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<', @@ -48,6 +50,7 @@ class FranceInterIE(InfoExtractor):              'id': video_id,              'title': title,              'description': description, +            'thumbnail': thumbnail,              'upload_date': upload_date,              'formats': [{                  'url': video_url, diff --git a/youtube_dlc/extractor/francetv.py b/youtube_dlc/extractor/francetv.py index e340cddba..ab0df1bed 100644 --- a/youtube_dlc/extractor/francetv.py +++ b/youtube_dlc/extractor/francetv.py @@ -17,6 +17,7 @@ from ..utils import (      parse_duration,      try_get,      url_or_none, +    urljoin,  )  from .dailymotion import DailymotionIE @@ -128,18 +129,38 @@ class FranceTVIE(InfoExtractor):          is_live = None -        formats = [] -        for video in info['videos']: -            if video['statut'] != 'ONLINE': +        videos = [] + +        for video in (info.get('videos') or []): +            if video.get('statut') != 'ONLINE':                  continue -            video_url = video['url'] +            if not video.get('url'): +                continue +            videos.append(video) + +        if not videos: +            for device_type in ['desktop', 'mobile']: +                fallback_info = self._download_json( +                    'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id, +                    video_id, 'Downloading fallback %s video JSON' % device_type, query={ +                        'device_type': device_type, +                        'browser': 'chrome', +                    }, fatal=False) + +                if fallback_info and fallback_info.get('video'): +                    videos.append(fallback_info['video']) + +        formats = [] +        for video in videos: +            video_url = video.get('url')              if not video_url:                  continue              if is_live is None:                  is_live = (try_get( -                    video, lambda x: x['plages_ouverture'][0]['direct'], -                    bool) is True) or '/live.francetv.fr/' in video_url -            format_id = video['format'] +                    video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True +                    or video.get('is_live') is True +                    or '/live.francetv.fr/' in video_url) +            format_id = video.get('format')              ext = determine_ext(video_url)              if ext == 'f4m':                  if georestricted: @@ -154,6 +175,9 @@ class FranceTVIE(InfoExtractor):                      sign(video_url, format_id), video_id, 'mp4',                      entry_protocol='m3u8_native', m3u8_id=format_id,                      fatal=False)) +            elif ext == 'mpd': +                formats.extend(self._extract_mpd_formats( +                    sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))              elif video_url.startswith('rtmp'):                  formats.append({                      'url': video_url, @@ -166,6 +190,7 @@ class FranceTVIE(InfoExtractor):                          'url': video_url,                          'format_id': format_id,                      }) +          self._sort_formats(formats)          title = info['titre'] @@ -185,10 +210,10 @@ class FranceTVIE(InfoExtractor):          return {              'id': video_id,              'title': self._live_title(title) if is_live else title, -            'description': clean_html(info['synopsis']), -            'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), -            'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), -            'timestamp': int_or_none(info['diffusion']['timestamp']), +            'description': clean_html(info.get('synopsis')), +            'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')), +            'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')), +            'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),              'is_live': is_live,              'formats': formats,              'subtitles': subtitles, diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py index aba06b328..db4d3a933 100644 --- a/youtube_dlc/extractor/generic.py +++ b/youtube_dlc/extractor/generic.py @@ -91,6 +91,7 @@ from .piksel import PikselIE  from .videa import VideaIE  from .twentymin import TwentyMinutenIE  from .ustream import UstreamIE +from .arte import ArteTVEmbedIE  from .videopress import VideoPressIE  from .rutube import RutubeIE  from .limelight import LimelightBaseIE @@ -841,7 +842,7 @@ class GenericIE(InfoExtractor):                  'skip_download': True,              }          }, -        # MTVSercices embed +        # MTVServices embed          {              'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html',              'md5': 'ca1aef97695ef2c1d6973256a57e5252', @@ -2760,11 +2761,9 @@ class GenericIE(InfoExtractor):              return self.url_result(ustream_url, UstreamIE.ie_key())          # Look for embedded arte.tv player -        mobj = re.search( -            r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"', -            webpage) -        if mobj is not None: -            return self.url_result(mobj.group('url'), 'ArteTVEmbed') +        arte_urls = ArteTVEmbedIE._extract_urls(webpage) +        if arte_urls: +            return self.playlist_from_matches(arte_urls, video_id, video_title)          # Look for embedded francetv player          mobj = re.search( diff --git a/youtube_dlc/extractor/googledrive.py b/youtube_dlc/extractor/googledrive.py index ec0d58a57..fdb15795a 100644 --- a/youtube_dlc/extractor/googledrive.py +++ b/youtube_dlc/extractor/googledrive.py @@ -3,11 +3,13 @@ from __future__ import unicode_literals  import re  from .common import InfoExtractor +from ..compat import compat_parse_qs  from ..utils import (      determine_ext,      ExtractorError,      int_or_none,      lowercase_escape, +    try_get,      update_url_query,  ) @@ -38,21 +40,10 @@ class GoogleDriveIE(InfoExtractor):          # video can't be watched anonymously due to view count limit reached,          # but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)          'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view', -        'md5': 'bfbd670d03a470bb1e6d4a257adec12e', -        'info_dict': { -            'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ', -            'ext': 'mp4', -            'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4', -        } +        'only_matching': True,      }, {          # video id is longer than 28 characters          'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit', -        'info_dict': { -            'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ', -            'ext': 'mp4', -            'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4', -            'duration': 189, -        },          'only_matching': True,      }, {          'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28', @@ -171,23 +162,21 @@ class GoogleDriveIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) -        webpage = self._download_webpage( -            'http://docs.google.com/file/d/%s' % video_id, video_id) +        video_info = compat_parse_qs(self._download_webpage( +            'https://drive.google.com/get_video_info', +            video_id, query={'docid': video_id})) + +        def get_value(key): +            return try_get(video_info, lambda x: x[key][0]) -        title = self._search_regex( -            r'"title"\s*,\s*"([^"]+)', webpage, 'title', -            default=None) or self._og_search_title(webpage) -        duration = int_or_none(self._search_regex( -            r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', -            default=None)) +        reason = get_value('reason') +        title = get_value('title') +        if not title and reason: +            raise ExtractorError(reason, expected=True)          formats = [] -        fmt_stream_map = self._search_regex( -            r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, -            'fmt stream map', default='').split(',') -        fmt_list = self._search_regex( -            r'"fmt_list"\s*,\s*"([^"]+)', webpage, -            'fmt_list', default='').split(',') +        fmt_stream_map = (get_value('fmt_stream_map') or '').split(',') +        fmt_list = (get_value('fmt_list') or '').split(',')          if fmt_stream_map and fmt_list:              resolutions = {}              for fmt in fmt_list: @@ -257,19 +246,14 @@ class GoogleDriveIE(InfoExtractor):                          if urlh and urlh.headers.get('Content-Disposition'):                              add_source_format(urlh) -        if not formats: -            reason = self._search_regex( -                r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None) -            if reason: -                raise ExtractorError(reason, expected=True) +        if not formats and reason: +            raise ExtractorError(reason, expected=True)          self._sort_formats(formats) -        hl = self._search_regex( -            r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None) +        hl = get_value('hl')          subtitles_id = None -        ttsurl = self._search_regex( -            r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None) +        ttsurl = get_value('ttsurl')          if ttsurl:              # the video Id for subtitles will be the last value in the ttsurl              # query string @@ -281,8 +265,8 @@ class GoogleDriveIE(InfoExtractor):          return {              'id': video_id,              'title': title, -            'thumbnail': self._og_search_thumbnail(webpage, default=None), -            'duration': duration, +            'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id, +            'duration': int_or_none(get_value('length_seconds')),              'formats': formats,              'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),              'automatic_captions': self.extract_automatic_captions( diff --git a/youtube_dlc/extractor/infoq.py b/youtube_dlc/extractor/infoq.py index 18249cf9b..0a70a1fb4 100644 --- a/youtube_dlc/extractor/infoq.py +++ b/youtube_dlc/extractor/infoq.py @@ -54,7 +54,7 @@ class InfoQIE(BokeCCBaseIE):      def _extract_rtmp_video(self, webpage):          # The server URL is hardcoded -        video_url = 'rtmpe://video.infoq.com/cfx/st/' +        video_url = 'rtmpe://videof.infoq.com/cfx/st/'          # Extract video URL          encoded_id = self._search_regex( @@ -86,17 +86,18 @@ class InfoQIE(BokeCCBaseIE):          return [{              'format_id': 'http_video',              'url': http_video_url, +            'http_headers': {'Referer': 'https://www.infoq.com/'},          }]      def _extract_http_audio(self, webpage, video_id): -        fields = self._hidden_inputs(webpage) +        fields = self._form_hidden_inputs('mp3Form', webpage)          http_audio_url = fields.get('filename')          if not http_audio_url:              return []          # base URL is found in the Location header in the response returned by          # GET https://www.infoq.com/mp3download.action?filename=... when logged in. -        http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url) +        http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url)          http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage))          # audio file seem to be missing some times even if there is a download link diff --git a/youtube_dlc/extractor/iqiyi.py b/youtube_dlc/extractor/iqiyi.py index cd11aa70f..5df674daf 100644 --- a/youtube_dlc/extractor/iqiyi.py +++ b/youtube_dlc/extractor/iqiyi.py @@ -150,7 +150,7 @@ class IqiyiSDKInterpreter(object):              elif function in other_functions:                  other_functions[function]()              else: -                raise ExtractorError('Unknown funcion %s' % function) +                raise ExtractorError('Unknown function %s' % function)          return sdk.target diff --git a/youtube_dlc/extractor/kusi.py b/youtube_dlc/extractor/kusi.py index 6a7e3baa7..9833d35eb 100644 --- a/youtube_dlc/extractor/kusi.py +++ b/youtube_dlc/extractor/kusi.py @@ -64,7 +64,7 @@ class KUSIIE(InfoExtractor):          duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000)          description = xpath_text(doc, 'ABSTRACT')          thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME') -        createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate')) +        creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate'))          quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content')          formats = [] @@ -84,5 +84,5 @@ class KUSIIE(InfoExtractor):              'duration': duration,              'formats': formats,              'thumbnail': thumbnail, -            'timestamp': createtion_time, +            'timestamp': creation_time,          } diff --git a/youtube_dlc/extractor/lbry.py b/youtube_dlc/extractor/lbry.py new file mode 100644 index 000000000..6177297ab --- /dev/null +++ b/youtube_dlc/extractor/lbry.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    determine_ext, +    ExtractorError, +    int_or_none, +    mimetype2ext, +    try_get, +) + + +class LBRYIE(InfoExtractor): +    IE_NAME = 'lbry.tv' +    _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[^:]+:[0-9a-z]+/[^:]+:[0-9a-z])' +    _TESTS = [{ +        # Video +        'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', +        'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', +        'info_dict': { +            'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', +            'ext': 'mp4', +            'title': 'First day in LBRY? Start HERE!', +            'description': 'md5:f6cb5c704b332d37f5119313c2c98f51', +            'timestamp': 1595694354, +            'upload_date': '20200725', +        } +    }, { +        # Audio +        'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e', +        'md5': 'c94017d3eba9b49ce085a8fad6b98d00', +        'info_dict': { +            'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', +            'ext': 'mp3', +            'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding', +            'description': 'md5:661ac4f1db09f31728931d7b88807a61', +            'timestamp': 1591312601, +            'upload_date': '20200604', +        } +    }, { +        'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', +        'only_matching': True, +    }, { +        'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b", +        'only_matching': True, +    }] + +    def _call_api_proxy(self, method, display_id, params): +        return self._download_json( +            'https://api.lbry.tv/api/v1/proxy', display_id, +            headers={'Content-Type': 'application/json-rpc'}, +            data=json.dumps({ +                'method': method, +                'params': params, +            }).encode())['result'] + +    def _real_extract(self, url): +        display_id = self._match_id(url).replace(':', '#') +        uri = 'lbry://' + display_id +        result = self._call_api_proxy( +            'resolve', display_id, {'urls': [uri]})[uri] +        result_value = result['value'] +        if result_value.get('stream_type') not in ('video', 'audio'): +            raise ExtractorError('Unsupported URL', expected=True) +        streaming_url = self._call_api_proxy( +            'get', display_id, {'uri': uri})['streaming_url'] +        source = result_value.get('source') or {} +        media = result_value.get('video') or result_value.get('audio') or {} +        signing_channel = result_value.get('signing_channel') or {} + +        return { +            'id': result['claim_id'], +            'title': result_value['title'], +            'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str), +            'description': result_value.get('description'), +            'license': result_value.get('license'), +            'timestamp': int_or_none(result.get('timestamp')), +            'tags': result_value.get('tags'), +            'width': int_or_none(media.get('width')), +            'height': int_or_none(media.get('height')), +            'duration': int_or_none(media.get('duration')), +            'channel': signing_channel.get('name'), +            'channel_id': signing_channel.get('claim_id'), +            'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')), +            'filesize': int_or_none(source.get('size')), +            'url': streaming_url, +        } diff --git a/youtube_dlc/extractor/lrt.py b/youtube_dlc/extractor/lrt.py index f5c997ef4..89d549858 100644 --- a/youtube_dlc/extractor/lrt.py +++ b/youtube_dlc/extractor/lrt.py @@ -5,28 +5,26 @@ import re  from .common import InfoExtractor  from ..utils import ( -    determine_ext, -    int_or_none, -    parse_duration, -    remove_end, +    clean_html, +    merge_dicts,  )  class LRTIE(InfoExtractor):      IE_NAME = 'lrt.lt' -    _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)' +    _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))'      _TESTS = [{          # m3u8 download -        'url': 'http://www.lrt.lt/mediateka/irasas/54391/', -        'md5': 'fe44cf7e4ab3198055f2c598fc175cb0', +        'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', +        'md5': '85cb2bb530f31d91a9c65b479516ade4',          'info_dict': { -            'id': '54391', +            'id': '2000127261',              'ext': 'mp4', -            'title': 'Septynios Kauno dienos', -            'description': 'md5:24d84534c7dc76581e59f5689462411a', -            'duration': 1783, -            'view_count': int, -            'like_count': int, +            'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė', +            'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa', +            'duration': 3035, +            'timestamp': 1604079000, +            'upload_date': '20201030',          },      }, {          # direct mp3 download @@ -43,52 +41,35 @@ class LRTIE(InfoExtractor):          },      }] +    def _extract_js_var(self, webpage, var_name, default): +        return self._search_regex( +            r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, +            webpage, var_name.replace('_', ' '), default, group=2) +      def _real_extract(self, url): -        video_id = self._match_id(url) +        path, video_id = re.match(self._VALID_URL, url).groups()          webpage = self._download_webpage(url, video_id) -        title = remove_end(self._og_search_title(webpage), ' - LRT') - -        formats = [] -        for _, file_url in re.findall( -                r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): -            ext = determine_ext(file_url) -            if ext not in ('m3u8', 'mp3'): -                continue -            # mp3 served as m3u8 produces stuttered media file -            if ext == 'm3u8' and '.mp3' in file_url: -                continue -            if ext == 'm3u8': -                formats.extend(self._extract_m3u8_formats( -                    file_url, video_id, 'mp4', entry_protocol='m3u8_native', -                    fatal=False)) -            elif ext == 'mp3': -                formats.append({ -                    'url': file_url, -                    'vcodec': 'none', -                }) -        self._sort_formats(formats) +        media_url = self._extract_js_var(webpage, 'main_url', path) +        media = self._download_json(self._extract_js_var( +            webpage, 'media_info_url', +            'https://www.lrt.lt/servisai/stream_url/vod/media_info/'), +            video_id, query={'url': media_url}) +        jw_data = self._parse_jwplayer_data( +            media['playlist_item'], video_id, base_url=url) -        thumbnail = self._og_search_thumbnail(webpage) -        description = self._og_search_description(webpage) -        duration = parse_duration(self._search_regex( -            r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1', -            webpage, 'duration', default=None, group='duration')) +        json_ld_data = self._search_json_ld(webpage, video_id) -        view_count = int_or_none(self._html_search_regex( -            r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>', -            webpage, 'view count', fatal=False, group='count')) -        like_count = int_or_none(self._search_regex( -            r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<', -            webpage, 'like count', fatal=False, group='count')) +        tags = [] +        for tag in (media.get('tags') or []): +            tag_name = tag.get('name') +            if not tag_name: +                continue +            tags.append(tag_name) -        return { -            'id': video_id, -            'title': title, -            'formats': formats, -            'thumbnail': thumbnail, -            'description': description, -            'duration': duration, -            'view_count': view_count, -            'like_count': like_count, +        clean_info = { +            'description': clean_html(media.get('content')), +            'tags': tags,          } + +        return merge_dicts(clean_info, jw_data, json_ld_data) diff --git a/youtube_dlc/extractor/malltv.py b/youtube_dlc/extractor/malltv.py index 6f4fd927f..fadfd9338 100644 --- a/youtube_dlc/extractor/malltv.py +++ b/youtube_dlc/extractor/malltv.py @@ -1,10 +1,16 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from ..utils import merge_dicts +from ..utils import ( +    clean_html, +    dict_get, +    float_or_none, +    int_or_none, +    merge_dicts, +    parse_duration, +    try_get, +)  class MallTVIE(InfoExtractor): @@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor):              'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',              'ext': 'mp4',              'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?', -            'description': 'md5:25fc0ec42a72ba602b602c683fa29deb', +            'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35',              'duration': 216,              'timestamp': 1538870400,              'upload_date': '20181007', @@ -37,20 +43,46 @@ class MallTVIE(InfoExtractor):          webpage = self._download_webpage(              url, display_id, headers=self.geo_verification_headers()) -        SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b' +        video = self._parse_json(self._search_regex( +            r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', +            webpage, 'video object'), display_id) +        video_source = video['VideoSource']          video_id = self._search_regex( -            SOURCE_RE, webpage, 'video id', group='id') +            r'/([\da-z]+)/index\b', video_source, 'video id') + +        formats = self._extract_m3u8_formats( +            video_source + '.m3u8', video_id, 'mp4', 'm3u8_native') +        self._sort_formats(formats) + +        subtitles = {} +        for s in (video.get('Subtitles') or {}): +            s_url = s.get('Url') +            if not s_url: +                continue +            subtitles.setdefault(s.get('Language') or 'cz', []).append({ +                'url': s_url, +            }) + +        entity_counts = video.get('EntityCounts') or {} -        media = self._parse_html5_media_entries( -            url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id, -            m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] +        def get_count(k): +            v = entity_counts.get(k + 's') or {} +            return int_or_none(dict_get(v, ('Count', 'StrCount')))          info = self._search_json_ld(webpage, video_id, default={}) -        return merge_dicts(media, info, { +        return merge_dicts({              'id': video_id,              'display_id': display_id, -            'title': self._og_search_title(webpage, default=None) or display_id, -            'description': self._og_search_description(webpage, default=None), -            'thumbnail': self._og_search_thumbnail(webpage, default=None), -        }) +            'title': video.get('Title'), +            'description': clean_html(video.get('Description')), +            'thumbnail': video.get('ThumbnailUrl'), +            'formats': formats, +            'subtitles': subtitles, +            'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')), +            'view_count': get_count('View'), +            'like_count': get_count('Like'), +            'dislike_count': get_count('Dislike'), +            'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])), +            'comment_count': get_count('Comment'), +        }, info) diff --git a/youtube_dlc/extractor/medaltv.py b/youtube_dlc/extractor/medaltv.py new file mode 100644 index 000000000..1603b55f6 --- /dev/null +++ b/youtube_dlc/extractor/medaltv.py @@ -0,0 +1,131 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    ExtractorError, +    float_or_none, +    int_or_none, +    str_or_none, +    try_get, +) + + +class MedalTVIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[0-9]+)' +    _TESTS = [{ +        'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr', +        'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', +        'info_dict': { +            'id': '34934644', +            'ext': 'mp4', +            'title': 'Quad Cold', +            'description': 'Medal,https://medal.tv/desktop/', +            'uploader': 'MowgliSB', +            'timestamp': 1603165266, +            'upload_date': '20201020', +            'uploader_id': 10619174, +        } +    }, { +        'url': 'https://medal.tv/clips/36787208', +        'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', +        'info_dict': { +            'id': '36787208', +            'ext': 'mp4', +            'title': 'u tk me i tk u bigger', +            'description': 'Medal,https://medal.tv/desktop/', +            'uploader': 'Mimicc', +            'timestamp': 1605580939, +            'upload_date': '20201117', +            'uploader_id': 5156321, +        } +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        webpage = self._download_webpage(url, video_id) + +        hydration_data = self._parse_json(self._search_regex( +            r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', +            webpage, 'hydration data', default='{}'), video_id) + +        clip = try_get( +            hydration_data, lambda x: x['clips'][video_id], dict) or {} +        if not clip: +            raise ExtractorError( +                'Could not find video information.', video_id=video_id) + +        title = clip['contentTitle'] + +        source_width = int_or_none(clip.get('sourceWidth')) +        source_height = int_or_none(clip.get('sourceHeight')) + +        aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9 + +        def add_item(container, item_url, height, id_key='format_id', item_id=None): +            item_id = item_id or '%dp' % height +            if item_id not in item_url: +                return +            width = int(round(aspect_ratio * height)) +            container.append({ +                'url': item_url, +                id_key: item_id, +                'width': width, +                'height': height +            }) + +        formats = [] +        thumbnails = [] +        for k, v in clip.items(): +            if not (v and isinstance(v, compat_str)): +                continue +            mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k) +            if not mobj: +                continue +            prefix = mobj.group(1) +            height = int_or_none(mobj.group(2)) +            if prefix == 'contentUrl': +                add_item( +                    formats, v, height or source_height, +                    item_id=None if height else 'source') +            elif prefix == 'thumbnail': +                add_item(thumbnails, v, height, 'id') + +        error = clip.get('error') +        if not formats and error: +            if error == 404: +                raise ExtractorError( +                    'That clip does not exist.', +                    expected=True, video_id=video_id) +            else: +                raise ExtractorError( +                    'An unknown error occurred ({0}).'.format(error), +                    video_id=video_id) + +        self._sort_formats(formats) + +        # Necessary because the id of the author is not known in advance. +        # Won't raise an issue if no profile can be found as this is optional. +        author = try_get( +            hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} +        author_id = str_or_none(author.get('id')) +        author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +            'thumbnails': thumbnails, +            'description': clip.get('contentDescription'), +            'uploader': author.get('displayName'), +            'timestamp': float_or_none(clip.get('created'), 1000), +            'uploader_id': author_id, +            'uploader_url': author_url, +            'duration': int_or_none(clip.get('videoLengthSeconds')), +            'view_count': int_or_none(clip.get('views')), +            'like_count': int_or_none(clip.get('likes')), +            'comment_count': int_or_none(clip.get('comments')), +        } diff --git a/youtube_dlc/extractor/mgtv.py b/youtube_dlc/extractor/mgtv.py index 71fc3ec56..cab3aa045 100644 --- a/youtube_dlc/extractor/mgtv.py +++ b/youtube_dlc/extractor/mgtv.py @@ -17,9 +17,8 @@ from ..utils import (  class MGTVIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html' +    _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'      IE_DESC = '芒果TV' -    _GEO_COUNTRIES = ['CN']      _TESTS = [{          'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', @@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor):      }, {          'url': 'http://www.mgtv.com/b/301817/3826653.html',          'only_matching': True, +    }, { +        'url': 'https://w.mgtv.com/b/301817/3826653.html', +        'only_matching': True,      }]      def _real_extract(self, url):          video_id = self._match_id(url) +        tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1]          try:              api_data = self._download_json(                  'https://pcweb.api.mgtv.com/player/video', video_id, query={ -                    'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1], +                    'tk2': tk2,                      'video_id': video_id,                  }, headers=self.geo_verification_headers())['data']          except ExtractorError as e: @@ -56,6 +59,7 @@ class MGTVIE(InfoExtractor):          stream_data = self._download_json(              'https://pcweb.api.mgtv.com/player/getSource', video_id, query={                  'pm2': api_data['atc']['pm2'], +                'tk2': tk2,                  'video_id': video_id,              }, headers=self.geo_verification_headers())['data']          stream_domain = stream_data['stream_domain'][0] diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py index 04cc95b6a..d31f53137 100644 --- a/youtube_dlc/extractor/mtv.py +++ b/youtube_dlc/extractor/mtv.py @@ -403,6 +403,18 @@ class MTVIE(MTVServicesInfoExtractor):          'only_matching': True,      }] +    @staticmethod +    def extract_child_with_type(parent, t): +        children = parent['children'] +        return next(c for c in children if c.get('type') == t) + +    def _extract_mgid(self, webpage): +        data = self._parse_json(self._search_regex( +            r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None) +        main_container = self.extract_child_with_type(data, 'MainContainer') +        video_player = self.extract_child_with_type(main_container, 'VideoPlayer') +        return video_player['props']['media']['video']['config']['uri'] +  class MTVJapanIE(MTVServicesInfoExtractor):      IE_NAME = 'mtvjapan' diff --git a/youtube_dlc/extractor/nbc.py b/youtube_dlc/extractor/nbc.py index 6f3cb3003..ea5f5a315 100644 --- a/youtube_dlc/extractor/nbc.py +++ b/youtube_dlc/extractor/nbc.py @@ -10,7 +10,6 @@ from .adobepass import AdobePassIE  from ..compat import compat_urllib_parse_unquote  from ..utils import (      int_or_none, -    js_to_json,      parse_duration,      smuggle_url,      try_get, @@ -394,8 +393,8 @@ class NBCNewsIE(ThePlatformIE):          webpage = self._download_webpage(url, video_id)          data = self._parse_json(self._search_regex( -            r'window\.__data\s*=\s*({.+});', webpage, -            'bootstrap json'), video_id, js_to_json) +            r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>', +            webpage, 'bootstrap json'), video_id)['props']['initialState']          video_data = try_get(data, lambda x: x['video']['current'], dict)          if not video_data:              video_data = data['article']['content'][0]['primaryMedia']['video'] diff --git a/youtube_dlc/extractor/ndr.py b/youtube_dlc/extractor/ndr.py index f3897c71b..81abb3120 100644 --- a/youtube_dlc/extractor/ndr.py +++ b/youtube_dlc/extractor/ndr.py @@ -83,6 +83,29 @@ class NDRIE(NDRBaseIE):              'skip_download': True,          },      }, { +        # with subtitles +        'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', +        'info_dict': { +            'id': 'extra18674', +            'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', +            'ext': 'mp4', +            'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', +            'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6', +            'uploader': 'ndrtv', +            'upload_date': '20201113', +            'duration': 1749, +            'subtitles': { +                'de': [{ +                    'ext': 'ttml', +                    'url': r're:^https://www\.ndr\.de.+', +                }], +            }, +        }, +        'params': { +            'skip_download': True, +        }, +        'expected_warnings': ['Unable to download f4m manifest'], +    }, {          'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',          'only_matching': True,      }] @@ -242,6 +265,20 @@ class NDREmbedBaseIE(InfoExtractor):                  'preference': quality_key(thumbnail.get('quality')),              }) +        subtitles = {} +        tracks = config.get('tracks') +        if tracks and isinstance(tracks, list): +            for track in tracks: +                if not isinstance(track, dict): +                    continue +                track_url = urljoin(url, track.get('src')) +                if not track_url: +                    continue +                subtitles.setdefault(track.get('srclang') or 'de', []).append({ +                    'url': track_url, +                    'ext': 'ttml', +                }) +          return {              'id': video_id,              'title': title, @@ -251,6 +288,7 @@ class NDREmbedBaseIE(InfoExtractor):              'duration': duration,              'thumbnails': thumbnails,              'formats': formats, +            'subtitles': subtitles,          } diff --git a/youtube_dlc/extractor/npr.py b/youtube_dlc/extractor/npr.py index 53acc6e57..9d1122f0c 100644 --- a/youtube_dlc/extractor/npr.py +++ b/youtube_dlc/extractor/npr.py @@ -33,7 +33,7 @@ class NprIE(InfoExtractor):              },          }],      }, { -        # mutlimedia, not media title +        # multimedia, not media title          'url': 'https://www.npr.org/2017/06/19/533198237/tigers-jaw-tiny-desk-concert',          'info_dict': {              'id': '533198237', diff --git a/youtube_dlc/extractor/nrk.py b/youtube_dlc/extractor/nrk.py index 84aacbcda..4a395546f 100644 --- a/youtube_dlc/extractor/nrk.py +++ b/youtube_dlc/extractor/nrk.py @@ -9,6 +9,7 @@ from ..compat import (      compat_urllib_parse_unquote,  )  from ..utils import ( +    determine_ext,      ExtractorError,      int_or_none,      js_to_json, @@ -16,185 +17,13 @@ from ..utils import (      parse_age_limit,      parse_duration,      try_get, +    url_or_none,  )  class NRKBaseIE(InfoExtractor):      _GEO_COUNTRIES = ['NO'] -    _api_host = None - -    def _real_extract(self, url): -        video_id = self._match_id(url) - -        api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS - -        for api_host in api_hosts: -            data = self._download_json( -                'http://%s/mediaelement/%s' % (api_host, video_id), -                video_id, 'Downloading mediaelement JSON', -                fatal=api_host == api_hosts[-1]) -            if not data: -                continue -            self._api_host = api_host -            break - -        title = data.get('fullTitle') or data.get('mainTitle') or data['title'] -        video_id = data.get('id') or video_id - -        entries = [] - -        conviva = data.get('convivaStatistics') or {} -        live = (data.get('mediaElementType') == 'Live' -                or data.get('isLive') is True or conviva.get('isLive')) - -        def make_title(t): -            return self._live_title(t) if live else t - -        media_assets = data.get('mediaAssets') -        if media_assets and isinstance(media_assets, list): -            def video_id_and_title(idx): -                return ((video_id, title) if len(media_assets) == 1 -                        else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) -            for num, asset in enumerate(media_assets, 1): -                asset_url = asset.get('url') -                if not asset_url: -                    continue -                formats = self._extract_akamai_formats(asset_url, video_id) -                if not formats: -                    continue -                self._sort_formats(formats) - -                # Some f4m streams may not work with hdcore in fragments' URLs -                for f in formats: -                    extra_param = f.get('extra_param_to_segment_url') -                    if extra_param and 'hdcore' in extra_param: -                        del f['extra_param_to_segment_url'] - -                entry_id, entry_title = video_id_and_title(num) -                duration = parse_duration(asset.get('duration')) -                subtitles = {} -                for subtitle in ('webVtt', 'timedText'): -                    subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) -                    if subtitle_url: -                        subtitles.setdefault('no', []).append({ -                            'url': compat_urllib_parse_unquote(subtitle_url) -                        }) -                entries.append({ -                    'id': asset.get('carrierId') or entry_id, -                    'title': make_title(entry_title), -                    'duration': duration, -                    'subtitles': subtitles, -                    'formats': formats, -                }) - -        if not entries: -            media_url = data.get('mediaUrl') -            if media_url: -                formats = self._extract_akamai_formats(media_url, video_id) -                self._sort_formats(formats) -                duration = parse_duration(data.get('duration')) -                entries = [{ -                    'id': video_id, -                    'title': make_title(title), -                    'duration': duration, -                    'formats': formats, -                }] - -        if not entries: -            MESSAGES = { -                'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', -                'ProgramRightsHasExpired': 'Programmet har gått ut', -                'NoProgramRights': 'Ikke tilgjengelig', -                'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', -            } -            message_type = data.get('messageType', '') -            # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* -            if 'IsGeoBlocked' in message_type: -                self.raise_geo_restricted( -                    msg=MESSAGES.get('ProgramIsGeoBlocked'), -                    countries=self._GEO_COUNTRIES) -            raise ExtractorError( -                '%s said: %s' % (self.IE_NAME, MESSAGES.get( -                    message_type, message_type)), -                expected=True) - -        series = conviva.get('seriesName') or data.get('seriesTitle') -        episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') - -        season_number = None -        episode_number = None -        if data.get('mediaElementType') == 'Episode': -            _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ -                data.get('relativeOriginUrl', '') -            EPISODENUM_RE = [ -                r'/s(?P<season>\d{,2})e(?P<episode>\d{,2})\.', -                r'/sesong-(?P<season>\d{,2})/episode-(?P<episode>\d{,2})', -            ] -            season_number = int_or_none(self._search_regex( -                EPISODENUM_RE, _season_episode, 'season number', -                default=None, group='season')) -            episode_number = int_or_none(self._search_regex( -                EPISODENUM_RE, _season_episode, 'episode number', -                default=None, group='episode')) - -        thumbnails = None -        images = data.get('images') -        if images and isinstance(images, dict): -            web_images = images.get('webImages') -            if isinstance(web_images, list): -                thumbnails = [{ -                    'url': image['imageUrl'], -                    'width': int_or_none(image.get('width')), -                    'height': int_or_none(image.get('height')), -                } for image in web_images if image.get('imageUrl')] - -        description = data.get('description') -        category = data.get('mediaAnalytics', {}).get('category') - -        common_info = { -            'description': description, -            'series': series, -            'episode': episode, -            'season_number': season_number, -            'episode_number': episode_number, -            'categories': [category] if category else None, -            'age_limit': parse_age_limit(data.get('legalAge')), -            'thumbnails': thumbnails, -        } - -        vcodec = 'none' if data.get('mediaType') == 'Audio' else None - -        for entry in entries: -            entry.update(common_info) -            for f in entry['formats']: -                f['vcodec'] = vcodec - -        points = data.get('shortIndexPoints') -        if isinstance(points, list): -            chapters = [] -            for next_num, point in enumerate(points, start=1): -                if not isinstance(point, dict): -                    continue -                start_time = parse_duration(point.get('startPoint')) -                if start_time is None: -                    continue -                end_time = parse_duration( -                    data.get('duration') -                    if next_num == len(points) -                    else points[next_num].get('startPoint')) -                if end_time is None: -                    continue -                chapters.append({ -                    'start_time': start_time, -                    'end_time': end_time, -                    'title': point.get('title'), -                }) -            if chapters and len(entries) == 1: -                entries[0]['chapters'] = chapters - -        return self.playlist_result(entries, video_id, title, description) -  class NRKIE(NRKBaseIE):      _VALID_URL = r'''(?x) @@ -202,13 +31,13 @@ class NRKIE(NRKBaseIE):                              nrk:|                              https?://                                  (?: -                                    (?:www\.)?nrk\.no/video/PS\*| +                                    (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)|                                      v8[-.]psapi\.nrk\.no/mediaelement/                                  )                              ) -                            (?P<id>[^?#&]+) +                            (?P<id>[^?\#&]+)                          ''' -    _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no') +      _TESTS = [{          # video          'url': 'http://www.nrk.no/video/PS*150533', @@ -240,8 +69,76 @@ class NRKIE(NRKBaseIE):      }, {          'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9',          'only_matching': True, +    }, { +        'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533', +        'only_matching': True, +    }, { +        'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999', +        'only_matching': True,      }] +    def _extract_from_playback(self, video_id): +        manifest = self._download_json( +            'http://psapi.nrk.no/playback/manifest/%s' % video_id, +            video_id, 'Downloading manifest JSON') + +        playable = manifest['playable'] + +        formats = [] +        for asset in playable['assets']: +            if not isinstance(asset, dict): +                continue +            if asset.get('encrypted'): +                continue +            format_url = url_or_none(asset.get('url')) +            if not format_url: +                continue +            if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id='hls', fatal=False)) +        self._sort_formats(formats) + +        data = self._download_json( +            'http://psapi.nrk.no/playback/metadata/%s' % video_id, +            video_id, 'Downloading metadata JSON') + +        preplay = data['preplay'] +        titles = preplay['titles'] +        title = titles['title'] +        alt_title = titles.get('subtitle') + +        description = preplay.get('description') +        duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration')) + +        thumbnails = [] +        for image in try_get( +                preplay, lambda x: x['poster']['images'], list) or []: +            if not isinstance(image, dict): +                continue +            image_url = url_or_none(image.get('url')) +            if not image_url: +                continue +            thumbnails.append({ +                'url': image_url, +                'width': int_or_none(image.get('pixelWidth')), +                'height': int_or_none(image.get('pixelHeight')), +            }) + +        return { +            'id': video_id, +            'title': title, +            'alt_title': alt_title, +            'description': description, +            'duration': duration, +            'thumbnails': thumbnails, +            'formats': formats, +        } + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        return self._extract_from_playback(video_id) +  class NRKTVIE(NRKBaseIE):      IE_DESC = 'NRK TV and NRK Radio' @@ -380,6 +277,181 @@ class NRKTVIE(NRKBaseIE):          'only_matching': True,      }] +    _api_host = None + +    def _extract_from_mediaelement(self, video_id): +        api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS + +        for api_host in api_hosts: +            data = self._download_json( +                'http://%s/mediaelement/%s' % (api_host, video_id), +                video_id, 'Downloading mediaelement JSON', +                fatal=api_host == api_hosts[-1]) +            if not data: +                continue +            self._api_host = api_host +            break + +        title = data.get('fullTitle') or data.get('mainTitle') or data['title'] +        video_id = data.get('id') or video_id + +        entries = [] + +        conviva = data.get('convivaStatistics') or {} +        live = (data.get('mediaElementType') == 'Live' +                or data.get('isLive') is True or conviva.get('isLive')) + +        def make_title(t): +            return self._live_title(t) if live else t + +        media_assets = data.get('mediaAssets') +        if media_assets and isinstance(media_assets, list): +            def video_id_and_title(idx): +                return ((video_id, title) if len(media_assets) == 1 +                        else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx))) +            for num, asset in enumerate(media_assets, 1): +                asset_url = asset.get('url') +                if not asset_url: +                    continue +                formats = self._extract_akamai_formats(asset_url, video_id) +                if not formats: +                    continue +                self._sort_formats(formats) + +                # Some f4m streams may not work with hdcore in fragments' URLs +                for f in formats: +                    extra_param = f.get('extra_param_to_segment_url') +                    if extra_param and 'hdcore' in extra_param: +                        del f['extra_param_to_segment_url'] + +                entry_id, entry_title = video_id_and_title(num) +                duration = parse_duration(asset.get('duration')) +                subtitles = {} +                for subtitle in ('webVtt', 'timedText'): +                    subtitle_url = asset.get('%sSubtitlesUrl' % subtitle) +                    if subtitle_url: +                        subtitles.setdefault('no', []).append({ +                            'url': compat_urllib_parse_unquote(subtitle_url) +                        }) +                entries.append({ +                    'id': asset.get('carrierId') or entry_id, +                    'title': make_title(entry_title), +                    'duration': duration, +                    'subtitles': subtitles, +                    'formats': formats, +                }) + +        if not entries: +            media_url = data.get('mediaUrl') +            if media_url: +                formats = self._extract_akamai_formats(media_url, video_id) +                self._sort_formats(formats) +                duration = parse_duration(data.get('duration')) +                entries = [{ +                    'id': video_id, +                    'title': make_title(title), +                    'duration': duration, +                    'formats': formats, +                }] + +        if not entries: +            MESSAGES = { +                'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet', +                'ProgramRightsHasExpired': 'Programmet har gått ut', +                'NoProgramRights': 'Ikke tilgjengelig', +                'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge', +            } +            message_type = data.get('messageType', '') +            # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked* +            if 'IsGeoBlocked' in message_type: +                self.raise_geo_restricted( +                    msg=MESSAGES.get('ProgramIsGeoBlocked'), +                    countries=self._GEO_COUNTRIES) +            raise ExtractorError( +                '%s said: %s' % (self.IE_NAME, MESSAGES.get( +                    message_type, message_type)), +                expected=True) + +        series = conviva.get('seriesName') or data.get('seriesTitle') +        episode = conviva.get('episodeName') or data.get('episodeNumberOrDate') + +        season_number = None +        episode_number = None +        if data.get('mediaElementType') == 'Episode': +            _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \ +                data.get('relativeOriginUrl', '') +            EPISODENUM_RE = [ +                r'/s(?P<season>\d{,2})e(?P<episode>\d{,2})\.', +                r'/sesong-(?P<season>\d{,2})/episode-(?P<episode>\d{,2})', +            ] +            season_number = int_or_none(self._search_regex( +                EPISODENUM_RE, _season_episode, 'season number', +                default=None, group='season')) +            episode_number = int_or_none(self._search_regex( +                EPISODENUM_RE, _season_episode, 'episode number', +                default=None, group='episode')) + +        thumbnails = None +        images = data.get('images') +        if images and isinstance(images, dict): +            web_images = images.get('webImages') +            if isinstance(web_images, list): +                thumbnails = [{ +                    'url': image['imageUrl'], +                    'width': int_or_none(image.get('width')), +                    'height': int_or_none(image.get('height')), +                } for image in web_images if image.get('imageUrl')] + +        description = data.get('description') +        category = data.get('mediaAnalytics', {}).get('category') + +        common_info = { +            'description': description, +            'series': series, +            'episode': episode, +            'season_number': season_number, +            'episode_number': episode_number, +            'categories': [category] if category else None, +            'age_limit': parse_age_limit(data.get('legalAge')), +            'thumbnails': thumbnails, +        } + +        vcodec = 'none' if data.get('mediaType') == 'Audio' else None + +        for entry in entries: +            entry.update(common_info) +            for f in entry['formats']: +                f['vcodec'] = vcodec + +        points = data.get('shortIndexPoints') +        if isinstance(points, list): +            chapters = [] +            for next_num, point in enumerate(points, start=1): +                if not isinstance(point, dict): +                    continue +                start_time = parse_duration(point.get('startPoint')) +                if start_time is None: +                    continue +                end_time = parse_duration( +                    data.get('duration') +                    if next_num == len(points) +                    else points[next_num].get('startPoint')) +                if end_time is None: +                    continue +                chapters.append({ +                    'start_time': start_time, +                    'end_time': end_time, +                    'title': point.get('title'), +                }) +            if chapters and len(entries) == 1: +                entries[0]['chapters'] = chapters + +        return self.playlist_result(entries, video_id, title, description) + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        return self._extract_from_mediaelement(video_id) +  class NRKTVEpisodeIE(InfoExtractor):      _VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)' diff --git a/youtube_dlc/extractor/nytimes.py b/youtube_dlc/extractor/nytimes.py index fc78ca56c..976b1c694 100644 --- a/youtube_dlc/extractor/nytimes.py +++ b/youtube_dlc/extractor/nytimes.py @@ -221,3 +221,41 @@ class NYTimesArticleIE(NYTimesBaseIE):               r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'),              webpage, 'podcast data')          return self._extract_podcast_from_json(podcast_data, page_id, webpage) + + +class NYTimesCookingIE(NYTimesBaseIE): +    _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)' +    _TESTS = [{ +        'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart', +        'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3', +        'info_dict': { +            'id': '100000004756089', +            'ext': 'mov', +            'timestamp': 1479383008, +            'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON', +            'title': 'Cranberry Tart', +            'upload_date': '20161117', +            'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.', +        }, +    }, { +        'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey', +        'md5': '4b2e8c70530a89b8d905a2b572316eb8', +        'info_dict': { +            'id': '100000003951728', +            'ext': 'mov', +            'timestamp': 1445509539, +            'description': 'Turkey guide', +            'upload_date': '20151022', +            'title': 'Turkey', +        } +    }] + +    def _real_extract(self, url): +        page_id = self._match_id(url) + +        webpage = self._download_webpage(url, page_id) + +        video_id = self._search_regex( +            r'data-video-id=["\'](\d+)', webpage, 'video id') + +        return self._extract_video_from_id(video_id) diff --git a/youtube_dlc/extractor/pbs.py b/youtube_dlc/extractor/pbs.py index 4dbe661be..d4baa16ee 100644 --- a/youtube_dlc/extractor/pbs.py +++ b/youtube_dlc/extractor/pbs.py @@ -477,7 +477,7 @@ class PBSIE(InfoExtractor):              if media_id:                  return media_id, presumptive_id, upload_date, description -            # Fronline video embedded via flp +            # Frontline video embedded via flp              video_id = self._search_regex(                  r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None)              if video_id: diff --git a/youtube_dlc/extractor/pinterest.py b/youtube_dlc/extractor/pinterest.py new file mode 100644 index 000000000..b249c9eda --- /dev/null +++ b/youtube_dlc/extractor/pinterest.py @@ -0,0 +1,201 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    determine_ext, +    float_or_none, +    int_or_none, +    try_get, +    unified_timestamp, +    url_or_none, +) + + +class PinterestBaseIE(InfoExtractor): +    _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)' + +    def _call_api(self, resource, video_id, options): +        return self._download_json( +            'https://www.pinterest.com/resource/%sResource/get/' % resource, +            video_id, 'Download %s JSON metadata' % resource, query={ +                'data': json.dumps({'options': options}) +            })['resource_response'] + +    def _extract_video(self, data, extract_formats=True): +        video_id = data['id'] + +        title = (data.get('title') or data.get('grid_title') or video_id).strip() + +        formats = [] +        duration = None +        if extract_formats: +            for format_id, format_dict in data['videos']['video_list'].items(): +                if not isinstance(format_dict, dict): +                    continue +                format_url = url_or_none(format_dict.get('url')) +                if not format_url: +                    continue +                duration = float_or_none(format_dict.get('duration'), scale=1000) +                ext = determine_ext(format_url) +                if 'hls' in format_id.lower() or ext == 'm3u8': +                    formats.extend(self._extract_m3u8_formats( +                        format_url, video_id, 'mp4', entry_protocol='m3u8_native', +                        m3u8_id=format_id, fatal=False)) +                else: +                    formats.append({ +                        'url': format_url, +                        'format_id': format_id, +                        'width': int_or_none(format_dict.get('width')), +                        'height': int_or_none(format_dict.get('height')), +                        'duration': duration, +                    }) +            self._sort_formats( +                formats, field_preference=('height', 'width', 'tbr', 'format_id')) + +        description = data.get('description') or data.get('description_html') or data.get('seo_description') +        timestamp = unified_timestamp(data.get('created_at')) + +        def _u(field): +            return try_get(data, lambda x: x['closeup_attribution'][field], compat_str) + +        uploader = _u('full_name') +        uploader_id = _u('id') + +        repost_count = int_or_none(data.get('repin_count')) +        comment_count = int_or_none(data.get('comment_count')) +        categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list) +        tags = data.get('hashtags') + +        thumbnails = [] +        images = data.get('images') +        if isinstance(images, dict): +            for thumbnail_id, thumbnail in images.items(): +                if not isinstance(thumbnail, dict): +                    continue +                thumbnail_url = url_or_none(thumbnail.get('url')) +                if not thumbnail_url: +                    continue +                thumbnails.append({ +                    'url': thumbnail_url, +                    'width': int_or_none(thumbnail.get('width')), +                    'height': int_or_none(thumbnail.get('height')), +                }) + +        return { +            'id': video_id, +            'title': title, +            'description': description, +            'duration': duration, +            'timestamp': timestamp, +            'thumbnails': thumbnails, +            'uploader': uploader, +            'uploader_id': uploader_id, +            'repost_count': repost_count, +            'comment_count': comment_count, +            'categories': categories, +            'tags': tags, +            'formats': formats, +            'extractor_key': PinterestIE.ie_key(), +        } + + +class PinterestIE(PinterestBaseIE): +    _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE +    _TESTS = [{ +        'url': 'https://www.pinterest.com/pin/664281013778109217/', +        'md5': '6550c2af85d6d9f3fe3b88954d1577fc', +        'info_dict': { +            'id': '664281013778109217', +            'ext': 'mp4', +            'title': 'Origami', +            'description': 'md5:b9d90ddf7848e897882de9e73344f7dd', +            'duration': 57.7, +            'timestamp': 1593073622, +            'upload_date': '20200625', +            'uploader': 'Love origami -I am Dafei', +            'uploader_id': '586523688879454212', +            'repost_count': 50, +            'comment_count': 0, +            'categories': list, +            'tags': list, +        }, +    }, { +        'url': 'https://co.pinterest.com/pin/824721750502199491/', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        data = self._call_api( +            'Pin', video_id, { +                'field_set_key': 'unauth_react_main_pin', +                'id': video_id, +            })['data'] +        return self._extract_video(data) + + +class PinterestCollectionIE(PinterestBaseIE): +    _VALID_URL = r'%s/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE +    _TESTS = [{ +        'url': 'https://www.pinterest.ca/mashal0407/cool-diys/', +        'info_dict': { +            'id': '585890301462791043', +            'title': 'cool diys', +        }, +        'playlist_count': 8, +    }, { +        'url': 'https://www.pinterest.ca/fudohub/videos/', +        'info_dict': { +            'id': '682858430939307450', +            'title': 'VIDEOS', +        }, +        'playlist_mincount': 365, +        'skip': 'Test with extract_formats=False', +    }] + +    @classmethod +    def suitable(cls, url): +        return False if PinterestIE.suitable(url) else super( +            PinterestCollectionIE, cls).suitable(url) + +    def _real_extract(self, url): +        username, slug = re.match(self._VALID_URL, url).groups() +        board = self._call_api( +            'Board', slug, { +                'slug': slug, +                'username': username +            })['data'] +        board_id = board['id'] +        options = { +            'board_id': board_id, +            'page_size': 250, +        } +        bookmark = None +        entries = [] +        while True: +            if bookmark: +                options['bookmarks'] = [bookmark] +            board_feed = self._call_api('BoardFeed', board_id, options) +            for item in (board_feed.get('data') or []): +                if not isinstance(item, dict) or item.get('type') != 'pin': +                    continue +                video_id = item.get('id') +                if video_id: +                    # Some pins may not be available anonymously via pin URL +                    # video = self._extract_video(item, extract_formats=False) +                    # video.update({ +                    #     '_type': 'url_transparent', +                    #     'url': 'https://www.pinterest.com/pin/%s/' % video_id, +                    # }) +                    # entries.append(video) +                    entries.append(self._extract_video(item)) +            bookmark = board_feed.get('bookmark') +            if not bookmark: +                break +        return self.playlist_result( +            entries, playlist_id=board_id, playlist_title=board.get('name')) diff --git a/youtube_dlc/extractor/rai.py b/youtube_dlc/extractor/rai.py index 51a310f5c..5eef7c633 100644 --- a/youtube_dlc/extractor/rai.py +++ b/youtube_dlc/extractor/rai.py @@ -16,6 +16,7 @@ from ..utils import (      GeoRestrictedError,      int_or_none,      parse_duration, +    remove_start,      strip_or_none,      try_get,      unified_strdate, @@ -30,7 +31,6 @@ class RaiBaseIE(InfoExtractor):      _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'      _GEO_COUNTRIES = ['IT']      _GEO_BYPASS = False -    _BASE_URL = 'https://www.raiplay.it'      def _extract_relinker_info(self, relinker_url, video_id):          if not re.match(r'https?://', relinker_url): @@ -68,7 +68,7 @@ class RaiBaseIE(InfoExtractor):              # This does not imply geo restriction (e.g.              # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) -            if media_url == 'http://download.rai.it/video_no_available.mp4': +            if '/video_no_available.mp4' in media_url:                  continue              ext = determine_ext(media_url) @@ -123,7 +123,7 @@ class RaiBaseIE(InfoExtractor):  class RaiPlayIE(RaiBaseIE): -    _VALID_URL = r'(?P<url>(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)(?P<ext>\.(?:html|json)))' % RaiBaseIE._UUID_RE +    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE      _TESTS = [{          'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',          'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ -131,11 +131,13 @@ class RaiPlayIE(RaiBaseIE):              'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',              'ext': 'mp4',              'title': 'Report del 07/04/2014', -            'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014 ', +            'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014',              'description': 'md5:d730c168a58f4bb35600fc2f881ec04e',              'thumbnail': r're:^https?://.*\.jpg$',              'uploader': 'Rai Gulp',              'duration': 6160, +            'series': 'Report', +            'season': '2013/14',          },          'params': {              'skip_download': True, @@ -146,11 +148,10 @@ class RaiPlayIE(RaiBaseIE):      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        url, base, video_id, ext = mobj.group('url', 'base', 'id', 'ext') +        base, video_id = re.match(self._VALID_URL, url).groups()          media = self._download_json( -            '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') +            base + '.json', video_id, 'Downloading video JSON')          title = media['name']          video = media['video'] @@ -159,34 +160,39 @@ class RaiPlayIE(RaiBaseIE):          self._sort_formats(relinker_info['formats'])          thumbnails = [] -        if 'images' in media: -            for _, value in media.get('images').items(): -                if value: -                    thumbnails.append({ -                        'url': urljoin(RaiBaseIE._BASE_URL, value.replace('[RESOLUTION]', '600x400')) -                    }) +        for _, value in media.get('images', {}).items(): +            if value: +                thumbnails.append({ +                    'url': urljoin(url, value), +                }) -        timestamp = unified_timestamp(try_get( -            media, lambda x: x['availabilities'][0]['start'], compat_str)) +        date_published = media.get('date_published') +        time_published = media.get('time_published') +        if date_published and time_published: +            date_published += ' ' + time_published          subtitles = self._extract_subtitles(url, video.get('subtitles')) +        program_info = media.get('program_info') or {} +        season = media.get('season') +          info = { -            'id': video_id, +            'id': remove_start(media.get('id'), 'ContentItem-') or video_id, +            'display_id': video_id,              'title': self._live_title(title) if relinker_info.get(                  'is_live') else title, -            'alt_title': media.get('subtitle'), +            'alt_title': strip_or_none(media.get('subtitle')),              'description': media.get('description'),              'uploader': strip_or_none(media.get('channel')), -            'creator': strip_or_none(media.get('editor')), +            'creator': strip_or_none(media.get('editor') or None),              'duration': parse_duration(video.get('duration')), -            'timestamp': timestamp, +            'timestamp': unified_timestamp(date_published),              'thumbnails': thumbnails, -            'series': try_get( -                media, lambda x: x['isPartOf']['name'], compat_str), -            'season_number': int_or_none(try_get( -                media, lambda x: x['isPartOf']['numeroStagioni'])), -            'season': media.get('stagione') or None, +            'series': program_info.get('name'), +            'season_number': int_or_none(season), +            'season': season if (season and not season.isdigit()) else None, +            'episode': media.get('episode_title'), +            'episode_number': int_or_none(media.get('episode')),              'subtitles': subtitles,          } @@ -194,9 +200,9 @@ class RaiPlayIE(RaiBaseIE):          return info -class RaiPlayLiveIE(RaiBaseIE): -    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)' -    _TEST = { +class RaiPlayLiveIE(RaiPlayIE): +    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' +    _TESTS = [{          'url': 'http://www.raiplay.it/dirette/rainews24',          'info_dict': {              'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c', @@ -211,40 +217,11 @@ class RaiPlayLiveIE(RaiBaseIE):          'params': {              'skip_download': True,          }, -    } - -    def _real_extract(self, url): -        display_id = self._match_id(url) - -        media = self._download_json( -            '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'dirette/' + display_id), -            display_id, 'Downloading channel JSON') - -        title = media['name'] -        video = media['video'] -        video_id = media['id'].replace('ContentItem-', '') - -        relinker_info = self._extract_relinker_info(video['content_url'], video_id) -        self._sort_formats(relinker_info['formats']) - -        info = { -            'id': video_id, -            'display_id': display_id, -            'title': self._live_title(title) if relinker_info.get( -                'is_live') else title, -            'alt_title': media.get('subtitle'), -            'description': media.get('description'), -            'uploader': strip_or_none(media.get('channel')), -            'creator': strip_or_none(media.get('editor')), -            'duration': parse_duration(video.get('duration')), -        } - -        info.update(relinker_info) -        return info +    }]  class RaiPlayPlaylistIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' +    _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))'      _TESTS = [{          'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/',          'info_dict': { @@ -256,29 +233,34 @@ class RaiPlayPlaylistIE(InfoExtractor):      }]      def _real_extract(self, url): -        playlist_id = self._match_id(url) - -        media = self._download_json( -            '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), -            playlist_id, 'Downloading program JSON') - -        title = media['name'] -        description = media['program_info']['description'] +        base, playlist_id = re.match(self._VALID_URL, url).groups() -        content_sets = [s['id'] for b in media['blocks'] for s in b['sets']] +        program = self._download_json( +            base + '.json', playlist_id, 'Downloading program JSON')          entries = [] -        for cs in content_sets: -            medias = self._download_json( -                '%s/%s.json' % (urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), cs), -                cs, 'Downloading content set JSON') -            for m in medias['items']: -                video_url = urljoin(url, m['path_id']) -                entries.append(self.url_result( -                    video_url, ie=RaiPlayIE.ie_key(), -                    video_id=RaiPlayIE._match_id(video_url))) - -        return self.playlist_result(entries, playlist_id, title, description) +        for b in (program.get('blocks') or []): +            for s in (b.get('sets') or []): +                s_id = s.get('id') +                if not s_id: +                    continue +                medias = self._download_json( +                    '%s/%s.json' % (base, s_id), s_id, +                    'Downloading content set JSON', fatal=False) +                if not medias: +                    continue +                for m in (medias.get('items') or []): +                    path_id = m.get('path_id') +                    if not path_id: +                        continue +                    video_url = urljoin(url, path_id) +                    entries.append(self.url_result( +                        video_url, ie=RaiPlayIE.ie_key(), +                        video_id=RaiPlayIE._match_id(video_url))) + +        return self.playlist_result( +            entries, playlist_id, program.get('name'), +            try_get(program, lambda x: x['program_info']['description']))  class RaiIE(RaiBaseIE): @@ -294,7 +276,8 @@ class RaiIE(RaiBaseIE):              'thumbnail': r're:^https?://.*\.jpg$',              'duration': 1758,              'upload_date': '20140612', -        } +        }, +        'skip': 'This content is available only in Italy',      }, {          # with ContentItem in many metas          'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', @@ -440,7 +423,7 @@ class RaiIE(RaiBaseIE):              except ExtractorError:                  pass -        relinker_url = self._search_regex( +        relinker_url = self._proto_relative_url(self._search_regex(              r'''(?x)                  (?:                      var\s+videoURL| @@ -452,7 +435,7 @@ class RaiIE(RaiBaseIE):                      //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\?                      (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1              ''', -            webpage, 'relinker URL', group='url') +            webpage, 'relinker URL', group='url'))          relinker_info = self._extract_relinker_info(              urljoin(url, relinker_url), video_id) diff --git a/youtube_dlc/extractor/rumble.py b/youtube_dlc/extractor/rumble.py new file mode 100644 index 000000000..4a0225109 --- /dev/null +++ b/youtube_dlc/extractor/rumble.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    determine_ext, +    int_or_none, +    parse_iso8601, +    try_get, +) + + +class RumbleEmbedIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' +    _TESTS = [{ +        'url': 'https://rumble.com/embed/v5pv5f', +        'md5': '36a18a049856720189f30977ccbb2c34', +        'info_dict': { +            'id': 'v5pv5f', +            'ext': 'mp4', +            'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', +            'timestamp': 1571611968, +            'upload_date': '20191020', +        } +    }, { +        'url': 'https://rumble.com/embed/ufe9n.v5pv5f', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        video_id = self._match_id(url) +        video = self._download_json( +            'https://rumble.com/embedJS/', video_id, +            query={'request': 'video', 'v': video_id}) +        title = video['title'] + +        formats = [] +        for height, ua in (video.get('ua') or {}).items(): +            for i in range(2): +                f_url = try_get(ua, lambda x: x[i], compat_str) +                if f_url: +                    ext = determine_ext(f_url) +                    f = { +                        'ext': ext, +                        'format_id': '%s-%sp' % (ext, height), +                        'height': int_or_none(height), +                        'url': f_url, +                    } +                    bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) +                    if bitrate: +                        f['tbr'] = int_or_none(bitrate) +                    formats.append(f) +        self._sort_formats(formats) + +        author = video.get('author') or {} + +        return { +            'id': video_id, +            'title': title, +            'formats': formats, +            'thumbnail': video.get('i'), +            'timestamp': parse_iso8601(video.get('pubDate')), +            'channel': author.get('name'), +            'channel_url': author.get('url'), +            'duration': int_or_none(video.get('duration')), +        } diff --git a/youtube_dlc/extractor/servus.py b/youtube_dlc/extractor/servus.py index 9401bf2cf..1610ddc2c 100644 --- a/youtube_dlc/extractor/servus.py +++ b/youtube_dlc/extractor/servus.py @@ -1,9 +1,15 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor +from ..utils import ( +    determine_ext, +    float_or_none, +    int_or_none, +    unified_timestamp, +    urlencode_postdata, +    url_or_none, +)  class ServusIE(InfoExtractor): @@ -12,20 +18,29 @@ class ServusIE(InfoExtractor):                          (?:www\.)?                          (?:                              servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)| -                            servustv\.com/videos +                            (?:servustv|pm-wissen)\.com/videos                          )                          /(?P<id>[aA]{2}-\w+|\d+-\d+)                      '''      _TESTS = [{          # new URL schema          'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/', -        'md5': '3e1dd16775aa8d5cbef23628cfffc1f4', +        'md5': '60474d4c21f3eb148838f215c37f02b9',          'info_dict': {              'id': 'AA-1T6VBU5PW1W12',              'ext': 'mp4',              'title': 'Die Grünen aus Sicht des Volkes', +            'alt_title': 'Talk im Hangar-7 Voxpops Gruene',              'description': 'md5:1247204d85783afe3682644398ff2ec4',              'thumbnail': r're:^https?://.*\.jpg', +            'duration': 62.442, +            'timestamp': 1605193976, +            'upload_date': '20201112', +            'series': 'Talk im Hangar-7', +            'season': 'Season 9', +            'season_number': 9, +            'episode': 'Episode 31 - September 14', +            'episode_number': 31,          }      }, {          # old URL schema @@ -40,30 +55,94 @@ class ServusIE(InfoExtractor):      }, {          'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',          'only_matching': True, +    }, { +        'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/', +        'only_matching': True,      }]      def _real_extract(self, url):          video_id = self._match_id(url).upper() -        webpage = self._download_webpage(url, video_id) -        title = self._search_regex( -            (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', -             r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'), -            webpage, 'title', default=None, -            group='title') or self._og_search_title(webpage) -        title = re.sub(r'\s*-\s*Servus TV\s*$', '', title) -        description = self._og_search_description(webpage) -        thumbnail = self._og_search_thumbnail(webpage) +        token = self._download_json( +            'https://auth.redbullmediahouse.com/token', video_id, +            'Downloading token', data=urlencode_postdata({ +                'grant_type': 'client_credentials', +            }), headers={ +                'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==', +            }) +        access_token = token['access_token'] +        token_type = token.get('token_type', 'Bearer') -        formats = self._extract_m3u8_formats( -            'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id, -            video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') +        video = self._download_json( +            'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id, +            video_id, 'Downloading video JSON', headers={ +                'Authorization': '%s %s' % (token_type, access_token), +            }) + +        formats = [] +        thumbnail = None +        for resource in video['resources']: +            if not isinstance(resource, dict): +                continue +            format_url = url_or_none(resource.get('url')) +            if not format_url: +                continue +            extension = resource.get('extension') +            type_ = resource.get('type') +            if extension == 'jpg' or type_ == 'reference_keyframe': +                thumbnail = format_url +                continue +            ext = determine_ext(format_url) +            if type_ == 'dash' or ext == 'mpd': +                formats.extend(self._extract_mpd_formats( +                    format_url, video_id, mpd_id='dash', fatal=False)) +            elif type_ == 'hls' or ext == 'm3u8': +                formats.extend(self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', entry_protocol='m3u8_native', +                    m3u8_id='hls', fatal=False)) +            elif extension == 'mp4' or ext == 'mp4': +                formats.append({ +                    'url': format_url, +                    'format_id': type_, +                    'width': int_or_none(resource.get('width')), +                    'height': int_or_none(resource.get('height')), +                })          self._sort_formats(formats) +        attrs = {} +        for attribute in video['attributes']: +            if not isinstance(attribute, dict): +                continue +            key = attribute.get('fieldKey') +            value = attribute.get('fieldValue') +            if not key or not value: +                continue +            attrs[key] = value + +        title = attrs.get('title_stv') or video_id +        alt_title = attrs.get('title') +        description = attrs.get('long_description') or attrs.get('short_description') +        series = attrs.get('label') +        season = attrs.get('season') +        episode = attrs.get('chapter') +        duration = float_or_none(attrs.get('duration'), scale=1000) +        season_number = int_or_none(self._search_regex( +            r'Season (\d+)', season or '', 'season number', default=None)) +        episode_number = int_or_none(self._search_regex( +            r'Episode (\d+)', episode or '', 'episode number', default=None)) +          return {              'id': video_id,              'title': title, +            'alt_title': alt_title,              'description': description,              'thumbnail': thumbnail, +            'duration': duration, +            'timestamp': unified_timestamp(video.get('lastPublished')), +            'series': series, +            'season': season, +            'season_number': season_number, +            'episode': episode, +            'episode_number': episode_number,              'formats': formats,          } diff --git a/youtube_dlc/extractor/soundcloud.py b/youtube_dlc/extractor/soundcloud.py index ed70b7169..47f68bf19 100644 --- a/youtube_dlc/extractor/soundcloud.py +++ b/youtube_dlc/extractor/soundcloud.py @@ -649,7 +649,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):  class SoundcloudPagedPlaylistBaseIE(SoundcloudIE):      def _extract_playlist(self, base_url, playlist_id, playlist_title): -        # Per the SoundCloud documentation, the maximum limit for a linked partioning query is 200. +        # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200.          # https://developers.soundcloud.com/blog/offset-pagination-deprecated          COMMON_QUERY = {              'limit': 200, diff --git a/youtube_dlc/extractor/spiegel.py b/youtube_dlc/extractor/spiegel.py index 4df7f4ddc..2da32b9b2 100644 --- a/youtube_dlc/extractor/spiegel.py +++ b/youtube_dlc/extractor/spiegel.py @@ -1,159 +1,54 @@  # coding: utf-8  from __future__ import unicode_literals -import re -  from .common import InfoExtractor -from .nexx import ( -    NexxIE, -    NexxEmbedIE, -) -from .spiegeltv import SpiegeltvIE -from ..compat import compat_urlparse -from ..utils import ( -    parse_duration, -    strip_or_none, -    unified_timestamp, -) +from .jwplatform import JWPlatformIE  class SpiegelIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' +    _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' +    _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE      _TESTS = [{          'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html', -        'md5': 'b57399839d055fccfeb9a0455c439868', +        'md5': '50c7948883ec85a3e431a0a44b7ad1d6',          'info_dict': { -            'id': '563747', +            'id': 'II0BUyxY', +            'display_id': '1259285',              'ext': 'mp4', -            'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', +            'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft',              'description': 'md5:8029d8310232196eb235d27575a8b9f4', -            'duration': 49, +            'duration': 48.0,              'upload_date': '20130311', -            'timestamp': 1362994320, +            'timestamp': 1362997920,          },      }, {          'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', -        'md5': '5b6c2f4add9d62912ed5fc78a1faed80', -        'info_dict': { -            'id': '580988', -            'ext': 'mp4', -            'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', -            'description': 'md5:c2322b65e58f385a820c10fa03b2d088', -            'duration': 983, -            'upload_date': '20131115', -            'timestamp': 1384546642, -        }, +        'only_matching': True,      }, { -        'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', -        'md5': '97b91083a672d72976faa8433430afb9', -        'info_dict': { -            'id': '601883', -            'ext': 'mp4', -            'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', -            'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', -            'upload_date': '20140904', -            'timestamp': 1409834160, -        } +        'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html', +        'only_matching': True,      }, { -        'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', +        'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7',          'only_matching': True,      }, { -        # nexx video          'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html',          'only_matching': True, +    }, { +        'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', +        'only_matching': True,      }]      def _real_extract(self, url):          video_id = self._match_id(url) -        metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id -        handle = self._request_webpage(metadata_url, video_id) - -        # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html -        if SpiegeltvIE.suitable(handle.geturl()): -            return self.url_result(handle.geturl(), 'Spiegeltv') - -        video_data = self._parse_json(self._webpage_read_content( -            handle, metadata_url, video_id), video_id) -        title = video_data['title'] -        nexx_id = video_data['nexxOmniaId'] -        domain_id = video_data.get('nexxOmniaDomain') or '748' - +        webpage = self._download_webpage(url, video_id) +        media_id = self._html_search_regex( +            r'("|["\'])mediaId\1\s*:\s*("|["\'])(?P<id>(?:(?!\2).)+)\2', +            webpage, 'media id', group='id')          return {              '_type': 'url_transparent',              'id': video_id, -            'url': 'nexx:%s:%s' % (domain_id, nexx_id), -            'title': title, -            'description': strip_or_none(video_data.get('teaser')), -            'duration': parse_duration(video_data.get('duration')), -            'timestamp': unified_timestamp(video_data.get('datum')), -            'ie_key': NexxIE.ie_key(), +            'display_id': video_id, +            'url': 'jwplatform:%s' % media_id, +            'title': self._og_search_title(webpage, default=None), +            'ie_key': JWPlatformIE.ie_key(),          } - - -class SpiegelArticleIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html' -    IE_NAME = 'Spiegel:Article' -    IE_DESC = 'Articles on spiegel.de' -    _TESTS = [{ -        'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html', -        'info_dict': { -            'id': '1516455', -            'ext': 'mp4', -            'title': 'Faszination Badminton: Nennt es bloß nicht Federball', -            'description': 're:^Patrick Kämnitz gehört.{100,}', -            'upload_date': '20140825', -        }, -    }, { -        'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', -        'info_dict': { - -        }, -        'playlist_count': 6, -    }, { -        # Nexx iFrame embed -        'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html', -        'info_dict': { -            'id': '161464', -            'ext': 'mp4', -            'title': 'Nervenkitzel Achterbahn', -            'alt_title': 'Karussellbauer in Deutschland', -            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc', -            'release_year': 2005, -            'creator': 'SPIEGEL TV', -            'thumbnail': r're:^https?://.*\.jpg$', -            'duration': 2761, -            'timestamp': 1394021479, -            'upload_date': '20140305', -        }, -        'params': { -            'format': 'bestvideo', -            'skip_download': True, -        }, -    }] - -    def _real_extract(self, url): -        video_id = self._match_id(url) -        webpage = self._download_webpage(url, video_id) - -        # Single video on top of the page -        video_link = self._search_regex( -            r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage, -            'video page URL', default=None) -        if video_link: -            video_url = compat_urlparse.urljoin( -                self.http_scheme() + '//spiegel.de/', video_link) -            return self.url_result(video_url) - -        # Multiple embedded videos -        embeds = re.findall( -            r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"', -            webpage) -        entries = [ -            self.url_result(compat_urlparse.urljoin( -                self.http_scheme() + '//spiegel.de/', embed_path)) -            for embed_path in embeds] -        if embeds: -            return self.playlist_result(entries) - -        return self.playlist_from_matches( -            NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key()) diff --git a/youtube_dlc/extractor/spreaker.py b/youtube_dlc/extractor/spreaker.py new file mode 100644 index 000000000..6c7e40ae4 --- /dev/null +++ b/youtube_dlc/extractor/spreaker.py @@ -0,0 +1,176 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( +    float_or_none, +    int_or_none, +    str_or_none, +    try_get, +    unified_timestamp, +    url_or_none, +) + + +def _extract_episode(data, episode_id=None): +    title = data['title'] +    download_url = data['download_url'] + +    series = try_get(data, lambda x: x['show']['title'], compat_str) +    uploader = try_get(data, lambda x: x['author']['fullname'], compat_str) + +    thumbnails = [] +    for image in ('image_original', 'image_medium', 'image'): +        image_url = url_or_none(data.get('%s_url' % image)) +        if image_url: +            thumbnails.append({'url': image_url}) + +    def stats(key): +        return int_or_none(try_get( +            data, +            (lambda x: x['%ss_count' % key], +             lambda x: x['stats']['%ss' % key]))) + +    def duration(key): +        return float_or_none(data.get(key), scale=1000) + +    return { +        'id': compat_str(episode_id or data['episode_id']), +        'url': download_url, +        'display_id': data.get('permalink'), +        'title': title, +        'description': data.get('description'), +        'timestamp': unified_timestamp(data.get('published_at')), +        'uploader': uploader, +        'uploader_id': str_or_none(data.get('author_id')), +        'creator': uploader, +        'duration': duration('duration') or duration('length'), +        'view_count': stats('play'), +        'like_count': stats('like'), +        'comment_count': stats('message'), +        'format': 'MPEG Layer 3', +        'format_id': 'mp3', +        'container': 'mp3', +        'ext': 'mp3', +        'thumbnails': thumbnails, +        'series': series, +        'extractor_key': SpreakerIE.ie_key(), +    } + + +class SpreakerIE(InfoExtractor): +    _VALID_URL = r'''(?x) +                    https?:// +                        api\.spreaker\.com/ +                        (?: +                            (?:download/)?episode| +                            v2/episodes +                        )/ +                        (?P<id>\d+) +                    ''' +    _TESTS = [{ +        'url': 'https://api.spreaker.com/episode/12534508', +        'info_dict': { +            'id': '12534508', +            'display_id': 'swm-ep15-how-to-market-your-music-part-2', +            'ext': 'mp3', +            'title': 'EP:15 | Music Marketing (Likes) - Part 2', +            'description': 'md5:0588c43e27be46423e183076fa071177', +            'timestamp': 1502250336, +            'upload_date': '20170809', +            'uploader': 'SWM', +            'uploader_id': '9780658', +            'duration': 1063.42, +            'view_count': int, +            'like_count': int, +            'comment_count': int, +            'series': 'Success With Music (SWM)', +        }, +    }, { +        'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3', +        'only_matching': True, +    }, { +        'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        episode_id = self._match_id(url) +        data = self._download_json( +            'https://api.spreaker.com/v2/episodes/%s' % episode_id, +            episode_id)['response']['episode'] +        return _extract_episode(data, episode_id) + + +class SpreakerPageIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)' +    _TESTS = [{ +        'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        episode_id = self._search_regex( +            (r'data-episode_id=["\'](?P<id>\d+)', +             r'episode_id\s*:\s*(?P<id>\d+)'), webpage, 'episode id') +        return self.url_result( +            'https://api.spreaker.com/episode/%s' % episode_id, +            ie=SpreakerIE.ie_key(), video_id=episode_id) + + +class SpreakerShowIE(InfoExtractor): +    _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)' +    _TESTS = [{ +        'url': 'https://api.spreaker.com/show/4652058', +        'info_dict': { +            'id': '4652058', +        }, +        'playlist_mincount': 118, +    }] + +    def _entries(self, show_id): +        for page_num in itertools.count(1): +            episodes = self._download_json( +                'https://api.spreaker.com/show/%s/episodes' % show_id, +                show_id, note='Downloading JSON page %d' % page_num, query={ +                    'page': page_num, +                    'max_per_page': 100, +                }) +            pager = try_get(episodes, lambda x: x['response']['pager'], dict) +            if not pager: +                break +            results = pager.get('results') +            if not results or not isinstance(results, list): +                break +            for result in results: +                if not isinstance(result, dict): +                    continue +                yield _extract_episode(result) +            if page_num == pager.get('last_page'): +                break + +    def _real_extract(self, url): +        show_id = self._match_id(url) +        return self.playlist_result(self._entries(show_id), playlist_id=show_id) + + +class SpreakerShowPageIE(InfoExtractor): +    _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)' +    _TESTS = [{ +        'url': 'https://www.spreaker.com/show/success-with-music', +        'only_matching': True, +    }] + +    def _real_extract(self, url): +        display_id = self._match_id(url) +        webpage = self._download_webpage(url, display_id) +        show_id = self._search_regex( +            r'show_id\s*:\s*(?P<id>\d+)', webpage, 'show id') +        return self.url_result( +            'https://api.spreaker.com/show/%s' % show_id, +            ie=SpreakerShowIE.ie_key(), video_id=show_id) diff --git a/youtube_dlc/extractor/svt.py b/youtube_dlc/extractor/svt.py index 2f6887d86..a0b6ef4db 100644 --- a/youtube_dlc/extractor/svt.py +++ b/youtube_dlc/extractor/svt.py @@ -9,6 +9,7 @@ from ..utils import (      determine_ext,      dict_get,      int_or_none, +    unified_timestamp,      str_or_none,      strip_or_none,      try_get, @@ -44,7 +45,8 @@ class SVTBaseIE(InfoExtractor):                      'format_id': player_type,                      'url': vurl,                  }) -        if not formats and video_info.get('rights', {}).get('geoBlockedSweden'): +        rights = try_get(video_info, lambda x: x['rights'], dict) or {} +        if not formats and rights.get('geoBlockedSweden'):              self.raise_geo_restricted(                  'This video is only available in Sweden',                  countries=self._GEO_COUNTRIES) @@ -70,6 +72,7 @@ class SVTBaseIE(InfoExtractor):          episode = video_info.get('episodeTitle')          episode_number = int_or_none(video_info.get('episodeNumber')) +        timestamp = unified_timestamp(rights.get('validFrom'))          duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration')))          age_limit = None          adult = dict_get( @@ -84,6 +87,7 @@ class SVTBaseIE(InfoExtractor):              'formats': formats,              'subtitles': subtitles,              'duration': duration, +            'timestamp': timestamp,              'age_limit': age_limit,              'series': series,              'season_number': season_number, @@ -136,26 +140,39 @@ class SVTPlayIE(SVTPlayBaseIE):      IE_DESC = 'SVT Play and Öppet arkiv'      _VALID_URL = r'''(?x)                      (?: -                        svt:(?P<svt_id>[^/?#&]+)| +                        (?: +                            svt:| +                            https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/ +                        ) +                        (?P<svt_id>[^/?#&]+)|                          https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)                      )                      '''      _TESTS = [{ -        'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', -        'md5': '2b6704fe4a28801e1a098bbf3c5ac611', +        'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen', +        'md5': '2382036fd6f8c994856c323fe51c426e',          'info_dict': { -            'id': '5996901', +            'id': 'jNwpV9P',              'ext': 'mp4', -            'title': 'Flygplan till Haile Selassie', -            'duration': 3527, -            'thumbnail': r're:^https?://.*[\.-]jpg$', +            'title': 'Det här är himlen', +            'timestamp': 1586044800, +            'upload_date': '20200405', +            'duration': 3515, +            'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',              'age_limit': 0,              'subtitles': {                  'sv': [{ -                    'ext': 'wsrt', +                    'ext': 'vtt',                  }]              },          }, +        'params': { +            'format': 'bestvideo', +            # skip for now due to download test asserts that segment is > 10000 bytes and svt uses +            # init segments that are smaller +            # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B +            'skip_download': True, +        },      }, {          # geo restricted to Sweden          'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', @@ -172,6 +189,12 @@ class SVTPlayIE(SVTPlayBaseIE):      }, {          'url': 'svt:14278044',          'only_matching': True, +    }, { +        'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/', +        'only_matching': True, +    }, { +        'url': 'svt:eWv5MLX', +        'only_matching': True,      }]      def _adjust_title(self, info): @@ -236,7 +259,10 @@ class SVTPlayIE(SVTPlayBaseIE):                   r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'),                  webpage, 'video id') -        return self._extract_by_video_id(svt_id, webpage) +        info_dict = self._extract_by_video_id(svt_id, webpage) +        info_dict['thumbnail'] = thumbnail + +        return info_dict  class SVTSeriesIE(SVTPlayBaseIE): @@ -360,7 +386,7 @@ class SVTPageIE(InfoExtractor):      @classmethod      def suitable(cls, url): -        return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url) +        return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url)      def _real_extract(self, url):          path, display_id = re.match(self._VALID_URL, url).groups() diff --git a/youtube_dlc/extractor/tagesschau.py b/youtube_dlc/extractor/tagesschau.py index c351b7545..8ceab7e35 100644 --- a/youtube_dlc/extractor/tagesschau.py +++ b/youtube_dlc/extractor/tagesschau.py @@ -86,7 +86,7 @@ class TagesschauPlayerIE(InfoExtractor):          #     return self._extract_via_api(kind, video_id)          # JSON api does not provide some audio formats (e.g. ogg) thus -        # extractiong audio via webpage +        # extracting audio via webpage          webpage = self._download_webpage(url, video_id) diff --git a/youtube_dlc/extractor/theplatform.py b/youtube_dlc/extractor/theplatform.py index 07055513a..41bfbe80f 100644 --- a/youtube_dlc/extractor/theplatform.py +++ b/youtube_dlc/extractor/theplatform.py @@ -208,7 +208,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):          if m:              return [m.group('url')] -        # Are whitesapces ignored in URLs? +        # Are whitespaces ignored in URLs?          # https://github.com/ytdl-org/youtube-dl/issues/12044          matches = re.findall(              r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) diff --git a/youtube_dlc/extractor/turner.py b/youtube_dlc/extractor/turner.py index 4a6cbfbb8..2964504a2 100644 --- a/youtube_dlc/extractor/turner.py +++ b/youtube_dlc/extractor/turner.py @@ -56,9 +56,9 @@ class TurnerBaseIE(AdobePassIE):          content_id = xpath_text(video_data, 'contentId') or video_id          # rtmp_src = xpath_text(video_data, 'akamai/src')          # if rtmp_src: -        #     splited_rtmp_src = rtmp_src.split(',') -        #     if len(splited_rtmp_src) == 2: -        #         rtmp_src = splited_rtmp_src[1] +        #     split_rtmp_src = rtmp_src.split(',') +        #     if len(split_rtmp_src) == 2: +        #         rtmp_src = split_rtmp_src[1]          # aifp = xpath_text(video_data, 'akamai/aifp', default='')          urls = [] diff --git a/youtube_dlc/extractor/twentythreevideo.py b/youtube_dlc/extractor/twentythreevideo.py index aa0c6e90f..dc5609192 100644 --- a/youtube_dlc/extractor/twentythreevideo.py +++ b/youtube_dlc/extractor/twentythreevideo.py @@ -8,8 +8,8 @@ from ..utils import int_or_none  class TwentyThreeVideoIE(InfoExtractor):      IE_NAME = '23video' -    _VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)' -    _TEST = { +    _VALID_URL = r'https?://(?P<domain>[^.]+\.(?:twentythree\.net|23video\.com|filmweb\.no))/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)' +    _TESTS = [{          'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1',          'md5': '75fcf216303eb1dae9920d651f85ced4',          'info_dict': { @@ -21,11 +21,14 @@ class TwentyThreeVideoIE(InfoExtractor):              'uploader_id': '12258964',              'uploader': 'Rasmus Bysted',          } -    } +    }, { +        'url': 'https://bonnier-publications-danmark.23video.com/v.ihtml/player.html?token=f0dc46476e06e13afd5a1f84a29e31e8&source=embed&photo%5fid=36137620', +        'only_matching': True, +    }]      def _real_extract(self, url):          domain, query, photo_id = re.match(self._VALID_URL, url).groups() -        base_url = 'https://video.%s' % domain +        base_url = 'https://%s' % domain          photo_data = self._download_json(              base_url + '/api/photo/list?' + query, photo_id, query={                  'format': 'json', diff --git a/youtube_dlc/extractor/urplay.py b/youtube_dlc/extractor/urplay.py index 4bc2b78fb..2c41f78bd 100644 --- a/youtube_dlc/extractor/urplay.py +++ b/youtube_dlc/extractor/urplay.py @@ -2,8 +2,11 @@  from __future__ import unicode_literals  from .common import InfoExtractor -from ..utils import unified_timestamp -import re +from ..utils import ( +    dict_get, +    int_or_none, +    unified_timestamp, +)  class URPlayIE(InfoExtractor): @@ -14,7 +17,7 @@ class URPlayIE(InfoExtractor):          'info_dict': {              'id': '203704',              'ext': 'mp4', -            'title': 'Om vetenskap, kritiskt tänkande och motstånd', +            'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',              'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',              'timestamp': 1513292400,              'upload_date': '20171214', @@ -26,7 +29,7 @@ class URPlayIE(InfoExtractor):              'ext': 'mp4',              'title': 'Tripp, Trapp, Träd : Sovkudde',              'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', -            'timestamp': 1440093600, +            'timestamp': 1440086400,              'upload_date': '20150820',          },      }, { @@ -36,28 +39,27 @@ class URPlayIE(InfoExtractor):      def _real_extract(self, url):          video_id = self._match_id(url) - +        url = url.replace('skola.se/Produkter', 'play.se/program')          webpage = self._download_webpage(url, video_id) -        urplayer_data = re.sub(""", "\"", self._search_regex( -            r'components\/Player\/Player\" data-react-props=\"({.+?})\"', -            webpage, 'urplayer data')) -        urplayer_data = self._parse_json(urplayer_data, video_id) -        for i in range(len(urplayer_data['accessibleEpisodes'])): -            if urplayer_data.get('accessibleEpisodes', {})[i].get('id') == int(video_id): -                urplayer_data = urplayer_data['accessibleEpisodes'][i] -                break +        urplayer_data = self._parse_json(self._html_search_regex( +            r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"', +            webpage, 'urplayer data'), video_id)['currentProduct'] +        episode = urplayer_data['title']          host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']          formats = [] -        urplayer_streams = urplayer_data.get("streamingInfo") -        for quality in ('sd'), ('hd'): -            location = (urplayer_streams.get("raw", {}).get(quality, {}).get("location") -                        or urplayer_streams.get("sweComplete", {}).get(quality, {}).get("location")) -            if location: +        urplayer_streams = urplayer_data.get('streamingInfo', {}) + +        for k, v in urplayer_streams.get('raw', {}).items(): +            if not (k in ('sd', 'hd') and isinstance(v, dict)): +                continue +            file_http = v.get('location') +            if file_http:                  formats.extend(self._extract_wowza_formats( -                               'http://%s/%s/playlist.m3u8' % (host, location), video_id, -                               skip_protocols=['f4m', 'rtmp', 'rtsp'])) +                    'http://%s/%splaylist.m3u8' % (host, file_http), +                    video_id, skip_protocols=['f4m', 'rtmp', 'rtsp']))          self._sort_formats(formats) +          subtitles = {}          subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location")          if subs: @@ -65,14 +67,37 @@ class URPlayIE(InfoExtractor):                  'url': subs,              }) +        image = urplayer_data.get('image') or {} +        thumbnails = [] +        for k, v in image.items(): +            t = { +                'id': k, +                'url': v, +            } +            wh = k.split('x') +            if len(wh) == 2: +                t.update({ +                    'width': int_or_none(wh[0]), +                    'height': int_or_none(wh[1]), +                }) +            thumbnails.append(t) + +        series = urplayer_data.get('series') or {} +        series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle')) +          return {              'id': video_id, -            'title': urplayer_data['title'], -            'description': self._og_search_description(webpage), -            'thumbnail': urplayer_data.get('image', {}).get('1280x720'), -            'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), -                                           webpage, 'timestamp')), -            'series': urplayer_data.get('seriesTitle'),              'subtitles': subtitles, +            'title': '%s : %s' % (series_title, episode) if series_title else episode, +            'description': urplayer_data.get('description'), +            'thumbnails': thumbnails, +            'timestamp': unified_timestamp(urplayer_data.get('publishedAt')), +            'series': series_title,              'formats': formats, +            'duration': int_or_none(urplayer_data.get('duration')), +            'categories': urplayer_data.get('categories'), +            'tags': urplayer_data.get('keywords'), +            'season': series.get('label'), +            'episode': episode, +            'episode_number': int_or_none(urplayer_data.get('episodeNumber')),          } diff --git a/youtube_dlc/extractor/usanetwork.py b/youtube_dlc/extractor/usanetwork.py index 54c7495cc..d953e460b 100644 --- a/youtube_dlc/extractor/usanetwork.py +++ b/youtube_dlc/extractor/usanetwork.py @@ -1,74 +1,24 @@  # coding: utf-8  from __future__ import unicode_literals -from .adobepass import AdobePassIE -from ..utils import ( -    NO_DEFAULT, -    smuggle_url, -    update_url_query, -) +from .nbc import NBCIE -class USANetworkIE(AdobePassIE): -    _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)' -    _TEST = { -        'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity', -        'md5': '33c0d2ba381571b414024440d08d57fd', +class USANetworkIE(NBCIE): +    _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/(?:[^/]+/videos?|movies?)/(?:[^/]+/)?(?P<id>\d+))' +    _TESTS = [{ +        'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302',          'info_dict': { -            'id': '3086229', +            'id': '4185302',              'ext': 'mp4', -            'title': 'HPE Cybersecurity', -            'description': 'The more we digitize our world, the more vulnerable we are.', -            'upload_date': '20160818', -            'timestamp': 1471535460, -            'uploader': 'NBCU-USA', +            'title': 'Intelligence (Trailer)', +            'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.', +            'upload_date': '20200715', +            'timestamp': 1594785600, +            'uploader': 'NBCU-MPAT',          }, -    } - -    def _real_extract(self, url): -        display_id = self._match_id(url) -        webpage = self._download_webpage(url, display_id) - -        def _x(name, default=NO_DEFAULT): -            return self._search_regex( -                r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name, -                webpage, name, default=default, group='value') - -        video_id = _x('mpx-guid') -        title = _x('episode-title') -        mpx_account_id = _x('mpx-account-id', '2304992029') - -        query = { -            'mbr': 'true', -        } -        if _x('is-full-episode', None) == '1': -            query['manifest'] = 'm3u' - -        if _x('is-entitlement', None) == '1': -            adobe_pass = {} -            drupal_settings = self._search_regex( -                r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);', -                webpage, 'drupal settings', fatal=False) -            if drupal_settings: -                drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False) -                if drupal_settings: -                    adobe_pass = drupal_settings.get('adobePass', {}) -            resource = self._get_mvpd_resource( -                adobe_pass.get('adobePassResourceId', 'usa'), -                title, video_id, _x('episode-rating', 'TV-14')) -            query['auth'] = self._extract_mvpd_auth( -                url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource) - -        info = self._search_json_ld(webpage, video_id, default={}) -        info.update({ -            '_type': 'url_transparent', -            'url': smuggle_url(update_url_query( -                'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id), -                query), {'force_smil_url': True}), -            'id': video_id, -            'title': title, -            'series': _x('show-title', None), -            'episode': title, -            'ie_key': 'ThePlatform', -        }) -        return info +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }] diff --git a/youtube_dlc/extractor/ustream.py b/youtube_dlc/extractor/ustream.py index 582090d0d..9e860aeb7 100644 --- a/youtube_dlc/extractor/ustream.py +++ b/youtube_dlc/extractor/ustream.py @@ -19,7 +19,7 @@ from ..utils import (  class UstreamIE(InfoExtractor): -    _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'      IE_NAME = 'ustream'      _TESTS = [{          'url': 'http://www.ustream.tv/recorded/20274954', @@ -67,12 +67,15 @@ class UstreamIE(InfoExtractor):          'params': {              'skip_download': True,  # m3u8 download          }, +    }, { +        'url': 'https://video.ibm.com/embed/recorded/128240221?&autoplay=true&controls=true&volume=100', +        'only_matching': True,      }]      @staticmethod      def _extract_url(webpage):          mobj = re.search( -            r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage) +            r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage)          if mobj is not None:              return mobj.group('url') diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py index 6bddf8be9..09da4338d 100644 --- a/youtube_dlc/extractor/viki.py +++ b/youtube_dlc/extractor/viki.py @@ -1,6 +1,7 @@  # coding: utf-8  from __future__ import unicode_literals +import base64  import hashlib  import hmac  import itertools @@ -9,6 +10,10 @@ import re  import time  from .common import InfoExtractor +from ..compat import ( +    compat_parse_qs, +    compat_urllib_parse_urlparse, +)  from ..utils import (      ExtractorError,      int_or_none, @@ -16,6 +21,7 @@ from ..utils import (      parse_age_limit,      parse_iso8601,      sanitized_Request, +    std_headers,  ) @@ -166,19 +172,20 @@ class VikiIE(VikiBaseIE):      }, {          # episode          'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', -        'md5': '5fa476a902e902783ac7a4d615cdbc7a', +        'md5': '94e0e34fd58f169f40c184f232356cfe',          'info_dict': {              'id': '44699v',              'ext': 'mp4',              'title': 'Boys Over Flowers - Episode 1',              'description': 'md5:b89cf50038b480b88b5b3c93589a9076', -            'duration': 4204, +            'duration': 4172,              'timestamp': 1270496524,              'upload_date': '20100405',              'uploader': 'group8',              'like_count': int,              'age_limit': 13, -        } +        }, +        'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],      }, {          # youtube external          'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1', @@ -195,14 +202,15 @@ class VikiIE(VikiBaseIE):              'uploader_id': 'ad14065n',              'like_count': int,              'age_limit': 13, -        } +        }, +        'skip': 'Page not found!',      }, {          'url': 'http://www.viki.com/player/44699v',          'only_matching': True,      }, {          # non-English description          'url': 'http://www.viki.com/videos/158036v-love-in-magic', -        'md5': '1713ae35df5a521b31f6dc40730e7c9c', +        'md5': 'adf9e321a0ae5d0aace349efaaff7691',          'info_dict': {              'id': '158036v',              'ext': 'mp4', @@ -218,71 +226,13 @@ class VikiIE(VikiBaseIE):      def _real_extract(self, url):          video_id = self._match_id(url) -        video = self._call_api( -            'videos/%s.json' % video_id, video_id, 'Downloading video JSON') - -        streams = self._call_api( -            'videos/%s/streams.json' % video_id, video_id, -            'Downloading video streams JSON') - -        formats = [] -        for format_id, stream_dict in streams.items(): -            height = int_or_none(self._search_regex( -                r'^(\d+)[pP]$', format_id, 'height', default=None)) -            for protocol, format_dict in stream_dict.items(): -                # rtmps URLs does not seem to work -                if protocol == 'rtmps': -                    continue -                format_url = format_dict.get('url') -                format_drms = format_dict.get('drms') -                format_stream_id = format_dict.get('id') -                if format_id == 'm3u8': -                    m3u8_formats = self._extract_m3u8_formats( -                        format_url, video_id, 'mp4', -                        entry_protocol='m3u8_native', -                        m3u8_id='m3u8-%s' % protocol, fatal=False) -                    # Despite CODECS metadata in m3u8 all video-only formats -                    # are actually video+audio -                    for f in m3u8_formats: -                        if f.get('acodec') == 'none' and f.get('vcodec') != 'none': -                            f['acodec'] = None -                    formats.extend(m3u8_formats) -                elif format_id == 'mpd': -                    mpd_formats = self._extract_mpd_formats( -                        format_url, video_id, -                        mpd_id='mpd-%s' % protocol, fatal=False) -                    formats.extend(mpd_formats) -                elif format_id == 'mpd': - -                    formats.extend(mpd_formats) -                elif format_url.startswith('rtmp'): -                    mobj = re.search( -                        r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', -                        format_url) -                    if not mobj: -                        continue -                    formats.append({ -                        'format_id': 'rtmp-%s' % format_id, -                        'ext': 'flv', -                        'url': mobj.group('url'), -                        'play_path': mobj.group('playpath'), -                        'app': mobj.group('app'), -                        'page_url': url, -                        'drms': format_drms, -                        'stream_id': format_stream_id, -                    }) -                else: -                    urlh = self._request_webpage( -                        HEADRequest(format_url), video_id, 'Checking file size', fatal=False) -                    formats.append({ -                        'url': format_url, -                        'format_id': '%s-%s' % (format_id, protocol), -                        'height': height, -                        'drms': format_drms, -                        'stream_id': format_stream_id, -                        'filesize': int_or_none(urlh.headers.get('Content-Length')), -                    }) -        self._sort_formats(formats) +        resp = self._download_json( +            'https://www.viki.com/api/videos/' + video_id, +            video_id, 'Downloading video JSON', headers={ +                'x-client-user-agent': std_headers['User-Agent'], +                'x-viki-app-ver': '4.0.57', +            }) +        video = resp['video']          self._check_errors(video) @@ -342,12 +292,84 @@ class VikiIE(VikiBaseIE):              'subtitles': subtitles,          } -        if 'external' in streams: -            result.update({ -                '_type': 'url_transparent', -                'url': streams['external']['url'], -            }) -            return result +        formats = [] + +        def add_format(format_id, format_dict, protocol='http'): +            # rtmps URLs does not seem to work +            if protocol == 'rtmps': +                return +            format_url = format_dict.get('url') +            if not format_url: +                return +            format_drms = format_dict.get('drms') +            format_stream_id = format_dict.get('id') +            qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query) +            stream = qs.get('stream', [None])[0] +            if stream: +                format_url = base64.b64decode(stream).decode() +            if format_id in ('m3u8', 'hls'): +                m3u8_formats = self._extract_m3u8_formats( +                    format_url, video_id, 'mp4', +                    entry_protocol='m3u8_native', +                    m3u8_id='m3u8-%s' % protocol, fatal=False) +                # Despite CODECS metadata in m3u8 all video-only formats +                # are actually video+audio +                for f in m3u8_formats: +                    if '_drm/index_' in f['url']: +                        continue +                    if f.get('acodec') == 'none' and f.get('vcodec') != 'none': +                        f['acodec'] = None +                    formats.append(f) +            elif format_id in ('mpd', 'dash'): +                formats.extend(self._extract_mpd_formats( +                    format_url, video_id, 'mpd-%s' % protocol, fatal=False)) +            elif format_url.startswith('rtmp'): +                mobj = re.search( +                    r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$', +                    format_url) +                if not mobj: +                    return +                formats.append({ +                    'format_id': 'rtmp-%s' % format_id, +                    'ext': 'flv', +                    'url': mobj.group('url'), +                    'play_path': mobj.group('playpath'), +                    'app': mobj.group('app'), +                    'page_url': url, +                    'drms': format_drms, +                    'stream_id': format_stream_id, +                }) +            else: +                urlh = self._request_webpage( +                    HEADRequest(format_url), video_id, 'Checking file size', fatal=False) +                formats.append({ +                    'url': format_url, +                    'format_id': '%s-%s' % (format_id, protocol), +                    'height': int_or_none(self._search_regex( +                        r'^(\d+)[pP]$', format_id, 'height', default=None)), +                    'drms': format_drms, +                    'stream_id': format_stream_id, +                    'filesize': int_or_none(urlh.headers.get('Content-Length')), +                }) + +        for format_id, format_dict in (resp.get('streams') or {}).items(): +            add_format(format_id, format_dict) +        if not formats: +            streams = self._call_api( +                'videos/%s/streams.json' % video_id, video_id, +                'Downloading video streams JSON') + +            if 'external' in streams: +                result.update({ +                    '_type': 'url_transparent', +                    'url': streams['external']['url'], +                }) +                return result + +            for format_id, stream_dict in streams.items(): +                for protocol, format_dict in stream_dict.items(): +                    add_format(format_id, format_dict, protocol) +        self._sort_formats(formats)          result['formats'] = formats          return result diff --git a/youtube_dlc/extractor/vimeo.py b/youtube_dlc/extractor/vimeo.py index 9839657ca..51a0ab2fa 100644 --- a/youtube_dlc/extractor/vimeo.py +++ b/youtube_dlc/extractor/vimeo.py @@ -922,7 +922,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):      }]      _PAGE_SIZE = 100 -    def _fetch_page(self, album_id, authorizaion, hashed_pass, page): +    def _fetch_page(self, album_id, authorization, hashed_pass, page):          api_page = page + 1          query = {              'fields': 'link,uri', @@ -934,7 +934,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):          videos = self._download_json(              'https://api.vimeo.com/albums/%s/videos' % album_id,              album_id, 'Downloading page %d' % api_page, query=query, headers={ -                'Authorization': 'jwt ' + authorizaion, +                'Authorization': 'jwt ' + authorization,              })['data']          for video in videos:              link = video.get('link') @@ -946,10 +946,13 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):      def _real_extract(self, url):          album_id = self._match_id(url) -        webpage = self._download_webpage(url, album_id) -        viewer = self._parse_json(self._search_regex( -            r'bootstrap_data\s*=\s*({.+?})</script>', -            webpage, 'bootstrap data'), album_id)['viewer'] +        viewer = self._download_json( +            'https://vimeo.com/_rv/viewer', album_id, fatal=False) +        if not viewer: +            webpage = self._download_webpage(url, album_id) +            viewer = self._parse_json(self._search_regex( +                r'bootstrap_data\s*=\s*({.+?})</script>', +                webpage, 'bootstrap data'), album_id)['viewer']          jwt = viewer['jwt']          album = self._download_json(              'https://api.vimeo.com/albums/' + album_id, diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py index 935560b57..223709b1e 100644 --- a/youtube_dlc/extractor/vlive.py +++ b/youtube_dlc/extractor/vlive.py @@ -1,55 +1,50 @@  # coding: utf-8  from __future__ import unicode_literals -import re -import time  import itertools +import json -from .common import InfoExtractor  from .naver import NaverBaseIE -from ..compat import compat_str +from ..compat import ( +    compat_HTTPError, +    compat_str, +)  from ..utils import (      ExtractorError, +    int_or_none,      merge_dicts, +    str_or_none, +    strip_or_none,      try_get,      urlencode_postdata,  ) -class VLiveIE(NaverBaseIE): +class VLiveBaseIE(NaverBaseIE): +    _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + + +class VLiveIE(VLiveBaseIE):      IE_NAME = 'vlive' -    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|post)/(?P<id>(?:\d-)?[0-9]+)' +    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)'      _NETRC_MACHINE = 'vlive'      _TESTS = [{ -        'url': 'https://www.vlive.tv/video/1326', +        'url': 'http://www.vlive.tv/video/1326',          'md5': 'cc7314812855ce56de70a06a27314983',          'info_dict': {              'id': '1326',              'ext': 'mp4', -            'title': "[V LIVE] Girl's Day's Broadcast", +            'title': "Girl's Day's Broadcast",              'creator': "Girl's Day",              'view_count': int,              'uploader_id': 'muploader_a',          }, -    }, -        { -        'url': 'https://vlive.tv/post/1-18244258', -        'md5': 'cc7314812855ce56de70a06a27314983', -        'info_dict': { -            'id': '1326', -            'ext': 'mp4', -            'title': "[V LIVE] Girl's Day's Broadcast", -            'creator': "Girl's Day", -            'view_count': int, -            'uploader_id': 'muploader_a', -        }, -    }, -        { -        'url': 'https://www.vlive.tv/video/16937', +    }, { +        'url': 'http://www.vlive.tv/video/16937',          'info_dict': {              'id': '16937',              'ext': 'mp4', -            'title': '[V LIVE] 첸백시 걍방', +            'title': '첸백시 걍방',              'creator': 'EXO',              'view_count': int,              'subtitles': 'mincount:12', @@ -70,12 +65,15 @@ class VLiveIE(NaverBaseIE):              'subtitles': 'mincount:10',          },          'skip': 'This video is only available for CH+ subscribers', +    }, { +        'url': 'https://www.vlive.tv/embed/1326', +        'only_matching': True, +    }, { +        # works only with gcc=KR +        'url': 'https://www.vlive.tv/video/225019', +        'only_matching': True,      }] -    @classmethod -    def suitable(cls, url): -        return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url) -      def _real_initialize(self):          self._login() @@ -107,118 +105,159 @@ class VLiveIE(NaverBaseIE):          if not is_logged_in():              raise ExtractorError('Unable to log in', expected=True) -    def _real_extract(self, url): -        # url may match on a post or a video url with a post_id potentially matching a video_id -        working_id = self._match_id(url) -        webpage = self._download_webpage(url, working_id) - -        PARAMS_RE = r'window\.__PRELOADED_STATE__\s*=\s*({.*});?\s*</script>' -        PARAMS_FIELD = 'params' - -        params = self._search_regex( -            PARAMS_RE, webpage, PARAMS_FIELD, default='', flags=re.DOTALL) -        params = self._parse_json(params, working_id, fatal=False) - -        video_params = try_get(params, lambda x: x["postDetail"]["post"]["officialVideo"], dict) - -        if video_params is None: -            error = try_get(params, lambda x: x["postDetail"]["error"], dict) -            error_data = try_get(error, lambda x: x["data"], dict) -            error_video = try_get(error_data, lambda x: x["officialVideo"], dict) -            error_msg = try_get(error, lambda x: x["message"], compat_str) -            product_type = try_get(error_data, -                                   [lambda x: x["officialVideo"]["productType"], -                                    lambda x: x["board"]["boardType"]], -                                   compat_str) - -            if error_video is not None: -                if product_type in ('VLIVE_PLUS', 'VLIVE+'): -                    self.raise_login_required('This video is only available with V LIVE+.') -                elif error_msg is not None: -                    raise ExtractorError('V LIVE reported the following error: %s' % error_msg) -                else: -                    raise ExtractorError('Failed to extract video parameters.') -            elif 'post' in url: -                raise ExtractorError('Url does not appear to be a video post.', expected=True) -            else: -                raise ExtractorError('Failed to extract video parameters.') - -        video_id = working_id if 'video' in url else str(video_params["videoSeq"]) +    def _call_api(self, path_template, video_id, fields=None): +        query = {'appId': self._APP_ID, 'gcc': 'KR'} +        if fields: +            query['fields'] = fields +        try: +            return self._download_json( +                'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id, +                'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0], +                headers={'Referer': 'https://www.vlive.tv/'}, query=query) +        except ExtractorError as e: +            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: +                self.raise_login_required(json.loads(e.cause.read().decode())['message']) +            raise -        video_type = video_params["type"] -        if video_type in ('VOD'): -            encoding_status = video_params["encodingStatus"] -            if encoding_status == 'COMPLETE': -                return self._replay(video_id, webpage, params, video_params) -            else: -                raise ExtractorError('VOD encoding not yet complete. Please try again later.', -                                     expected=True) -        elif video_type in ('LIVE'): -            video_status = video_params["status"] -            if video_status in ('RESERVED'): +    def _real_extract(self, url): +        video_id = self._match_id(url) + +        post = self._call_api( +            'post/v1.0/officialVideoPost-%s', video_id, +            'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}') + +        video = post['officialVideo'] + +        def get_common_fields(): +            channel = post.get('channel') or {} +            return { +                'title': video.get('title'), +                'creator': post.get('author', {}).get('nickname'), +                'channel': channel.get('channelName'), +                'channel_id': channel.get('channelCode'), +                'duration': int_or_none(video.get('playTime')), +                'view_count': int_or_none(video.get('playCount')), +                'like_count': int_or_none(video.get('likeCount')), +                'comment_count': int_or_none(video.get('commentCount')), +            } + +        video_type = video.get('type') +        if video_type == 'VOD': +            inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey'] +            vod_id = video['vodId'] +            return merge_dicts( +                get_common_fields(), +                self._extract_video_info(video_id, vod_id, inkey)) +        elif video_type == 'LIVE': +            status = video.get('status') +            if status == 'ON_AIR': +                stream_url = self._call_api( +                    'old/v3/live/%s/playInfo', +                    video_id)['result']['adaptiveStreamUrl'] +                formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4') +                info = get_common_fields() +                info.update({ +                    'title': self._live_title(video['title']), +                    'id': video_id, +                    'formats': formats, +                    'is_live': True, +                }) +                return info +            elif status == 'ENDED': +                raise ExtractorError( +                    'Uploading for replay. Please wait...', expected=True) +            elif status == 'RESERVED':                  raise ExtractorError('Coming soon!', expected=True) -            elif video_status in ('ENDED', 'END'): -                raise ExtractorError('Uploading for replay. Please wait...', expected=True) +            elif video.get('exposeStatus') == 'CANCEL': +                raise ExtractorError( +                    'We are sorry, but the live broadcast has been canceled.', +                    expected=True)              else: -                return self._live(video_id, webpage, params) -        else: -            raise ExtractorError('Unknown video type %s' % video_type) - -    def _get_common_fields(self, webpage, params): -        title = self._og_search_title(webpage) -        description = self._html_search_meta( -            ['og:description', 'description', 'twitter:description'], -            webpage, 'description', default=None) -        creator = (try_get(params, lambda x: x["channel"]["channel"]["channelName"], compat_str) -                   or self._search_regex(r'on (.*) channel', description or '', 'creator', fatal=False)) -        thumbnail = self._og_search_thumbnail(webpage) -        return { -            'title': title, -            'creator': creator, -            'thumbnail': thumbnail, -        } - -    def _live(self, video_id, webpage, params): -        LIVE_INFO_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/old/v3/live/%s/playInfo' % video_id -        play_info = self._download_json(LIVE_INFO_ENDPOINT, video_id, -                                        headers={"referer": "https://www.vlive.tv"}) +                raise ExtractorError('Unknown status ' + status) -        streams = try_get(play_info, lambda x: x["result"]["streamList"], list) or [] -        formats = [] -        for stream in streams: -            formats.extend(self._extract_m3u8_formats( -                stream['serviceUrl'], video_id, 'mp4', -                fatal=False, live=True)) -        self._sort_formats(formats) +class VLivePostIE(VLiveIE): +    IE_NAME = 'vlive:post' +    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/post/(?P<id>\d-\d+)' +    _TESTS = [{ +        # uploadType = SOS +        'url': 'https://www.vlive.tv/post/1-20088044', +        'info_dict': { +            'id': '1-20088044', +            'title': 'Hola estrellitas la tierra les dice hola (si era así no?) Ha...', +            'description': 'md5:fab8a1e50e6e51608907f46c7fa4b407', +        }, +        'playlist_count': 3, +    }, { +        # uploadType = V +        'url': 'https://www.vlive.tv/post/1-20087926', +        'info_dict': { +            'id': '1-20087926', +            'title': 'James Corden: And so, the baby becamos the Papa💜😭💪😭', +        }, +        'playlist_count': 1, +    }] +    _FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s' +    _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo' +    _INKEY_TMPL = _FVIDEO_TMPL % 'inKey' -        info = self._get_common_fields(webpage, params) -        info.update({ -            'title': self._live_title(info['title']), -            'id': video_id, -            'formats': formats, -            'is_live': True, -        }) -        return info +    def _real_extract(self, url): +        post_id = self._match_id(url) -    def _replay(self, video_id, webpage, params, video_params): -        long_video_id = video_params["vodId"] +        post = self._call_api( +            'post/v1.0/post-%s', post_id, +            'attachments{video},officialVideo{videoSeq},plainBody,title') -        VOD_KEY_ENDPOINT = 'https://www.vlive.tv/globalv-web/vam-web/video/v1.0/vod/%s/inkey' % video_id -        key_json = self._download_json(VOD_KEY_ENDPOINT, video_id, -                                       headers={"referer": "https://www.vlive.tv"}) -        key = key_json["inkey"] +        video_seq = str_or_none(try_get( +            post, lambda x: x['officialVideo']['videoSeq'])) +        if video_seq: +            return self.url_result( +                'http://www.vlive.tv/video/' + video_seq, +                VLiveIE.ie_key(), video_seq) -        return merge_dicts( -            self._get_common_fields(webpage, params), -            self._extract_video_info(video_id, long_video_id, key)) +        title = post['title'] +        entries = [] +        for idx, video in enumerate(post['attachments']['video'].values()): +            video_id = video.get('videoId') +            if not video_id: +                continue +            upload_type = video.get('uploadType') +            upload_info = video.get('uploadInfo') or {} +            entry = None +            if upload_type == 'SOS': +                download = self._call_api( +                    self._SOS_TMPL, video_id)['videoUrl']['download'] +                formats = [] +                for f_id, f_url in download.items(): +                    formats.append({ +                        'format_id': f_id, +                        'url': f_url, +                        'height': int_or_none(f_id[:-1]), +                    }) +                self._sort_formats(formats) +                entry = { +                    'formats': formats, +                    'id': video_id, +                    'thumbnail': upload_info.get('imageUrl'), +                } +            elif upload_type == 'V': +                vod_id = upload_info.get('videoId') +                if not vod_id: +                    continue +                inkey = self._call_api(self._INKEY_TMPL, video_id)['inKey'] +                entry = self._extract_video_info(video_id, vod_id, inkey) +            if entry: +                entry['title'] = '%s_part%s' % (title, idx) +                entries.append(entry) +        return self.playlist_result( +            entries, post_id, title, strip_or_none(post.get('plainBody'))) -class VLiveChannelIE(InfoExtractor): +class VLiveChannelIE(VLiveBaseIE):      IE_NAME = 'vlive:channel' -    _VALID_URL = r'https?://(?:(?:www|m)\.)?(?:channels\.vlive\.tv/|vlive\.tv/channels?/)(?P<id>[0-9A-Z]+)' +    _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)'      _TESTS = [{ -        'url': 'https://channels.vlive.tv/FCD4B', +        'url': 'http://channels.vlive.tv/FCD4B',          'info_dict': {              'id': 'FCD4B',              'title': 'MAMAMOO', @@ -226,63 +265,39 @@ class VLiveChannelIE(InfoExtractor):          'playlist_mincount': 110      }, {          'url': 'https://www.vlive.tv/channel/FCD4B', -        'info_dict': { -            'id': 'FCD4B', -            'title': 'MAMAMOO', -        }, -        'playlist_mincount': 110 +        'only_matching': True,      }] -    _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b' + +    def _call_api(self, path, channel_key_suffix, channel_value, note, query): +        q = { +            'app_id': self._APP_ID, +            'channel' + channel_key_suffix: channel_value, +        } +        q.update(query) +        return self._download_json( +            'http://api.vfan.vlive.tv/vproxy/channelplus/' + path, +            channel_value, note='Downloading ' + note, query=q)['result']      def _real_extract(self, url):          channel_code = self._match_id(url) -        webpage = self._download_webpage( -            'http://channels.vlive.tv/%s/video' % channel_code, channel_code) - -        app_id = None - -        app_js_url = self._search_regex( -            r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1', -            webpage, 'app js', default=None, group='url') - -        if app_js_url: -            app_js = self._download_webpage( -                app_js_url, channel_code, 'Downloading app JS', fatal=False) -            if app_js: -                app_id = self._search_regex( -                    r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]', -                    app_js, 'app id', default=None) - -        app_id = app_id or self._APP_ID - -        channel_info = self._download_json( -            'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode', -            channel_code, note='Downloading decode channel code', -            query={ -                'app_id': app_id, -                'channelCode': channel_code, -                '_': int(time.time()) -            }) +        channel_seq = self._call_api( +            'decodeChannelCode', 'Code', channel_code, +            'decode channel code', {})['channelSeq'] -        channel_seq = channel_info['result']['channelSeq']          channel_name = None          entries = []          for page_num in itertools.count(1): -            video_list = self._download_json( -                'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList', -                channel_code, note='Downloading channel list page #%d' % page_num, -                query={ -                    'app_id': app_id, -                    'channelSeq': channel_seq, +            video_list = self._call_api( +                'getChannelVideoList', 'Seq', channel_seq, +                'channel list page #%d' % page_num, {                      # Large values of maxNumOfRows (~300 or above) may cause                      # empty responses (see [1]), e.g. this happens for [2] that                      # has more than 300 videos.                      # 1. https://github.com/ytdl-org/youtube-dl/issues/13830                      # 2. http://channels.vlive.tv/EDBF.                      'maxNumOfRows': 100, -                    '_': int(time.time()),                      'pageNo': page_num                  }              ) @@ -290,11 +305,11 @@ class VLiveChannelIE(InfoExtractor):              if not channel_name:                  channel_name = try_get(                      video_list, -                    lambda x: x['result']['channelInfo']['channelName'], +                    lambda x: x['channelInfo']['channelName'],                      compat_str)              videos = try_get( -                video_list, lambda x: x['result']['videoList'], list) +                video_list, lambda x: x['videoList'], list)              if not videos:                  break @@ -310,79 +325,3 @@ class VLiveChannelIE(InfoExtractor):          return self.playlist_result(              entries, channel_code, channel_name) - - -class VLivePlaylistIE(InfoExtractor): -    IE_NAME = 'vlive:playlist' -    _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)' -    _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s' -    _TESTS = [{ -        # regular working playlist -        'url': 'https://www.vlive.tv/video/117956/playlist/117963', -        'info_dict': { -            'id': '117963', -            'title': '아이돌룸(IDOL ROOM) 41회 - (여자)아이들' -        }, -        'playlist_mincount': 10 -    }, { -        # playlist with no playlistVideoSeqs -        'url': 'http://www.vlive.tv/video/22867/playlist/22912', -        'info_dict': { -            'id': '22867', -            'ext': 'mp4', -            'title': '[V LIVE] Valentine Day Message from MINA', -            'creator': 'TWICE', -            'view_count': int -        }, -        'params': { -            'skip_download': True, -        } -    }] - -    def _build_video_result(self, video_id, message): -        self.to_screen(message) -        return self.url_result( -            self._VIDEO_URL_TEMPLATE % video_id, -            ie=VLiveIE.ie_key(), video_id=video_id) - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        video_id, playlist_id = mobj.group('video_id', 'id') - -        if self._downloader.params.get('noplaylist'): -            return self._build_video_result( -                video_id, -                'Downloading just video %s because of --no-playlist' -                % video_id) - -        self.to_screen( -            'Downloading playlist %s - add --no-playlist to just download video' -            % playlist_id) - -        webpage = self._download_webpage( -            'http://www.vlive.tv/video/%s/playlist/%s' -            % (video_id, playlist_id), playlist_id) - -        raw_item_ids = self._search_regex( -            r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage, -            'playlist video seqs', default=None, fatal=False) - -        if not raw_item_ids: -            return self._build_video_result( -                video_id, -                'Downloading just video %s because no playlist was found' -                % video_id) - -        item_ids = self._parse_json(raw_item_ids, playlist_id) - -        entries = [ -            self.url_result( -                self._VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(), -                video_id=compat_str(item_id)) -            for item_id in item_ids] - -        playlist_name = self._html_search_regex( -            r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)', -            webpage, 'playlist title', fatal=False) - -        return self.playlist_result(entries, playlist_id, playlist_name) diff --git a/youtube_dlc/extractor/xiami.py b/youtube_dlc/extractor/xiami.py index 618da8382..769aab331 100644 --- a/youtube_dlc/extractor/xiami.py +++ b/youtube_dlc/extractor/xiami.py @@ -54,17 +54,17 @@ class XiamiBaseIE(InfoExtractor):      def _decrypt(origin):          n = int(origin[0])          origin = origin[1:] -        short_lenth = len(origin) // n -        long_num = len(origin) - short_lenth * n +        short_length = len(origin) // n +        long_num = len(origin) - short_length * n          l = tuple()          for i in range(0, n): -            length = short_lenth +            length = short_length              if i < long_num:                  length += 1              l += (origin[0:length], )              origin = origin[length:]          ans = '' -        for i in range(0, short_lenth + 1): +        for i in range(0, short_length + 1):              for j in range(0, n):                  if len(l[j]) > i:                      ans += l[j][i] diff --git a/youtube_dlc/extractor/xtube.py b/youtube_dlc/extractor/xtube.py index 081c5e2e7..98d2adb99 100644 --- a/youtube_dlc/extractor/xtube.py +++ b/youtube_dlc/extractor/xtube.py @@ -5,7 +5,6 @@ import re  from .common import InfoExtractor  from ..utils import ( -    ExtractorError,      int_or_none,      js_to_json,      orderedSet, @@ -34,7 +33,7 @@ class XTubeIE(InfoExtractor):              'title': 'strange erotica',              'description': 'contains:an ET kind of thing',              'uploader': 'greenshowers', -            'duration': 449, +            'duration': 450,              'view_count': int,              'comment_count': int,              'age_limit': 18, @@ -74,24 +73,16 @@ class XTubeIE(InfoExtractor):          title, thumbnail, duration = [None] * 3 -        json_config_string = self._search_regex( -            r'playerConf=({.+?}),loaderConf', -            webpage, 'config', default=None) -        if not json_config_string: -            raise ExtractorError("Could not extract video player data") - -        json_config_string = json_config_string.replace("!0", "true").replace("!1", "false") - -        config = self._parse_json(json_config_string, video_id, transform_source=js_to_json, fatal=False) -        if not config: -            raise ExtractorError("Could not extract video player data") - -        config = config.get('mainRoll') -        if isinstance(config, dict): -            title = config.get('title') -            thumbnail = config.get('poster') -            duration = int_or_none(config.get('duration')) -            sources = config.get('sources') or config.get('format') +        config = self._parse_json(self._search_regex( +            r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config', +            default='{}'), video_id, transform_source=js_to_json, fatal=False) +        if config: +            config = config.get('mainRoll') +            if isinstance(config, dict): +                title = config.get('title') +                thumbnail = config.get('poster') +                duration = int_or_none(config.get('duration')) +                sources = config.get('sources') or config.get('format')          if not isinstance(sources, dict):              sources = self._parse_json(self._search_regex( diff --git a/youtube_dlc/extractor/youporn.py b/youtube_dlc/extractor/youporn.py index e7fca22de..7b9feafeb 100644 --- a/youtube_dlc/extractor/youporn.py +++ b/youtube_dlc/extractor/youporn.py @@ -29,7 +29,6 @@ class YouPornIE(InfoExtractor):              'upload_date': '20101217',              'average_rating': int,              'view_count': int, -            'comment_count': int,              'categories': list,              'tags': list,              'age_limit': 18, @@ -48,7 +47,6 @@ class YouPornIE(InfoExtractor):              'upload_date': '20110418',              'average_rating': int,              'view_count': int, -            'comment_count': int,              'categories': list,              'tags': list,              'age_limit': 18, @@ -156,7 +154,8 @@ class YouPornIE(InfoExtractor):              r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',              webpage, 'uploader', fatal=False)          upload_date = unified_strdate(self._html_search_regex( -            [r'Date\s+[Aa]dded:\s*<span>([^<]+)', +            [r'UPLOADED:\s*<span>([^<]+)', +             r'Date\s+[Aa]dded:\s*<span>([^<]+)',               r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'],              webpage, 'upload date', fatal=False)) @@ -171,7 +170,7 @@ class YouPornIE(InfoExtractor):              webpage, 'view count', fatal=False, group='count'))          comment_count = str_to_int(self._search_regex(              r'>All [Cc]omments? \(([\d,.]+)\)', -            webpage, 'comment count', fatal=False)) +            webpage, 'comment count', default=None))          def extract_tag_box(regex, title):              tag_box = self._search_regex(regex, webpage, title, default=None) diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 97cc793f9..ad56b9b01 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -16,7 +16,6 @@ from ..jsinterp import JSInterpreter  from ..swfinterp import SWFInterpreter  from ..compat import (      compat_chr, -    compat_HTTPError,      compat_kwargs,      compat_parse_qs,      compat_urllib_parse_unquote, @@ -30,15 +29,11 @@ from ..utils import (      bool_or_none,      clean_html,      error_to_compat_str, -    extract_attributes,      ExtractorError,      float_or_none, -    get_element_by_attribute,      get_element_by_id,      int_or_none, -    js_to_json,      mimetype2ext, -    orderedSet,      parse_codecs,      parse_count,      parse_duration, @@ -51,9 +46,11 @@ from ..utils import (      unescapeHTML,      unified_strdate,      unsmuggle_url, +    update_url_query,      uppercase_escape,      url_or_none,      urlencode_postdata, +    urljoin,  ) @@ -66,13 +63,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor):      _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'      _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' +    _RESERVED_NAMES = ( +        r'course|embed|channel|c|user|playlist|watch|w|results|storefront|oops|' +        r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|' +        r'feed/(watch_later|history|subscriptions|library|trending|recommended)') +      _NETRC_MACHINE = 'youtube'      # If True it will raise an error if no login info is provided      _LOGIN_REQUIRED = False -    _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}' -    _INITIAL_DATA_RE = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});' -    _YTCFG_DATA_RE = r"ytcfg.set\(({.*?})\)" +    _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'      _YOUTUBE_CLIENT_HEADERS = {          'x-youtube-client-name': '1', @@ -297,147 +297,36 @@ class YoutubeBaseInfoExtractor(InfoExtractor):          if not self._login():              return +    _DEFAULT_API_DATA = { +        'context': { +            'client': { +                'clientName': 'WEB', +                'clientVersion': '2.20201021.03.00', +            } +        }, +    } -class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor): - -    def _find_entries_in_json(self, extracted): -        entries = [] -        c = {} - -        def _real_find(obj): -            if obj is None or isinstance(obj, str): -                return - -            if type(obj) is list: -                for elem in obj: -                    _real_find(elem) - -            if type(obj) is dict: -                if self._is_entry(obj): -                    entries.append(obj) -                    return - -                if 'continuationCommand' in obj: -                    c['continuation'] = obj -                    return - -                for _, o in obj.items(): -                    _real_find(o) - -        _real_find(extracted) - -        return entries, try_get(c, lambda x: x["continuation"]) - -    def _entries(self, page, playlist_id, max_pages=None): -        seen = [] - -        yt_conf = {} -        for m in re.finditer(self._YTCFG_DATA_RE, page): -            parsed = self._parse_json(m.group(1), playlist_id, -                                      transform_source=js_to_json, fatal=False) -            if parsed: -                yt_conf.update(parsed) - -        data_json = self._parse_json(self._search_regex(self._INITIAL_DATA_RE, page, 'ytInitialData'), None) - -        for page_num in range(1, max_pages + 1) if max_pages is not None else itertools.count(1): -            entries, continuation = self._find_entries_in_json(data_json) -            processed = self._process_entries(entries, seen) - -            if not processed: -                break -            for entry in processed: -                yield entry - -            if not continuation or not yt_conf: -                break -            continuation_token = try_get(continuation, lambda x: x['continuationCommand']['token']) -            continuation_url = try_get(continuation, lambda x: x['commandMetadata']['webCommandMetadata']['apiUrl']) -            if not continuation_token or not continuation_url: -                break - -            count = 0 -            retries = 3 -            while count <= retries: -                try: -                    # Downloading page may result in intermittent 5xx HTTP error -                    # that is usually worked around with a retry -                    data_json = self._download_json( -                        'https://www.youtube.com%s' % continuation_url, -                        playlist_id, -                        'Downloading continuation page #%s%s' % (page_num, ' (retry #%d)' % count if count else ''), - -                        transform_source=uppercase_escape, -                        query={ -                            'key': try_get(yt_conf, lambda x: x['INNERTUBE_API_KEY']) -                        }, -                        data=str(json.dumps({ -                            'context': try_get(yt_conf, lambda x: x['INNERTUBE_CONTEXT']), -                            'continuation': continuation_token -                        })).encode(encoding='UTF-8', errors='strict'), -                        headers={ -                            'Content-Type': 'application/json' -                        } -                    ) -                    break -                except ExtractorError as e: -                    if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503): -                        count += 1 -                        if count <= retries: -                            continue -                    raise - -    def _extract_title(self, renderer): -        title = try_get(renderer, lambda x: x['title']['runs'][0]['text'], compat_str) -        if title: -            return title -        return try_get(renderer, lambda x: x['title']['simpleText'], compat_str) - - -class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): -    def _is_entry(self, obj): -        return 'videoId' in obj - -    def _process_entries(self, entries, seen): -        ids_in_page = [] -        titles_in_page = [] -        for renderer in entries: -            video_id = try_get(renderer, lambda x: x['videoId']) -            video_title = self._extract_title(renderer) - -            if video_id is None or video_title is None: -                # we do not have a videoRenderer or title extraction broke -                continue - -            video_title = video_title.strip() - -            try: -                idx = ids_in_page.index(video_id) -                if video_title and not titles_in_page[idx]: -                    titles_in_page[idx] = video_title -            except ValueError: -                ids_in_page.append(video_id) -                titles_in_page.append(video_title) - -        for video_id, video_title in zip(ids_in_page, titles_in_page): -            yield self.url_result(video_id, 'Youtube', video_id, video_title) - +    _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' -class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor): -    def _is_entry(self, obj): -        return 'playlistId' in obj +    def _call_api(self, ep, query, video_id): +        data = self._DEFAULT_API_DATA.copy() +        data.update(query) -    def _process_entries(self, entries, seen): -        for playlist_id in orderedSet(try_get(r, lambda x: x['playlistId']) for r in entries): +        response = self._download_json( +            'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id, +            note='Downloading API JSON', errnote='Unable to download API page', +            data=json.dumps(data).encode('utf8'), +            headers={'content-type': 'application/json'}, +            query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'}) -            yield self.url_result( -                'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist') +        return response -    def _real_extract(self, url): -        playlist_id = self._match_id(url) -        webpage = self._download_webpage(url, playlist_id) -        title = self._og_search_title(webpage, fatal=False) -        return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title) +    def _extract_yt_initial_data(self, video_id, webpage): +        return self._parse_json( +            self._search_regex( +                (r'%s\s*\n' % self._YT_INITIAL_DATA_RE, +                 self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'), +            video_id)  class YoutubeIE(YoutubeBaseInfoExtractor): @@ -498,7 +387,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                           |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=                           )                       )?                                                       # all until now is optional -> you can pass the naked ID -                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID +                     (?P<id>[0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID                       (?!.*?\blist=                          (?:                              %(playlist_id)s|                                  # combined list/video URLs are handled by the playlist IE @@ -662,7 +551,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              }          },          { -            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY', +            'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',              'note': 'Use the first video ID in the URL',              'info_dict': {                  'id': 'BaW_jenozKc', @@ -703,6 +592,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              },              'skip': 'format 141 not served anymore',          }, +        # DASH manifest with encrypted signature +        { +            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA', +            'info_dict': { +                'id': 'IB3lcPjvWLA', +                'ext': 'm4a', +                'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson', +                'description': 'md5:8f5e2b82460520b619ccac1f509d43bf', +                'duration': 244, +                'uploader': 'AfrojackVEVO', +                'uploader_id': 'AfrojackVEVO', +                'upload_date': '20131011', +            }, +            'params': { +                'youtube_include_dash_manifest': True, +                'format': '141/bestaudio[ext=m4a]', +            }, +        },          # Controversy video          {              'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8', @@ -734,6 +641,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'age_limit': 18,              },          }, +        # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421) +        # YouTube Red ad is not captured for creator +        { +            'url': '__2ABJjxzNo', +            'info_dict': { +                'id': '__2ABJjxzNo', +                'ext': 'mp4', +                'duration': 266, +                'upload_date': '20100430', +                'uploader_id': 'deadmau5', +                'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5', +                'creator': 'Dada Life, deadmau5', +                'description': 'md5:12c56784b8032162bb936a5f76d55360', +                'uploader': 'deadmau5', +                'title': 'Deadmau5 - Some Chords (HD)', +                'alt_title': 'This Machine Kills Some Chords', +            }, +            'expected_warnings': [ +                'DASH manifest missing', +            ] +        },          # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)          {              'url': 'lqQg6PlCWgI', @@ -1073,10 +1001,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              'only_matching': True,          },          { -            'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', -            'only_matching': True, -        }, -        {              'url': 'https://invidio.us/watch?v=BaW_jenozKc',              'only_matching': True,          }, @@ -1128,73 +1052,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              },          },          { -            # Youtube Music Auto-generated description -            # Retrieve 'artist' field from 'Artist:' in video description -            # when it is present on youtube music video -            'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', -            'info_dict': { -                'id': 'k0jLE7tTwjY', -                'ext': 'mp4', -                'title': 'Latch Feat. Sam Smith', -                'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335', -                'upload_date': '20150110', -                'uploader': 'Various Artists - Topic', -                'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w', -                'artist': 'Disclosure', -                'track': 'Latch Feat. Sam Smith', -                'album': 'Latch Featuring Sam Smith', -                'release_date': '20121008', -                'release_year': 2012, -            }, -            'params': { -                'skip_download': True, -            }, -        }, -        { -            # Youtube Music Auto-generated description -            # handle multiple artists on youtube music video -            'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', -            'info_dict': { -                'id': '74qn0eJSjpA', -                'ext': 'mp4', -                'title': 'Eastside', -                'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2', -                'upload_date': '20180710', -                'uploader': 'Benny Blanco - Topic', -                'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A', -                'artist': 'benny blanco, Halsey, Khalid', -                'track': 'Eastside', -                'album': 'Eastside', -                'release_date': '20180713', -                'release_year': 2018, -            }, -            'params': { -                'skip_download': True, -            }, -        }, -        { -            # Youtube Music Auto-generated description -            # handle youtube music video with release_year and no release_date -            'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', -            'info_dict': { -                'id': '-hcAI0g-f5M', -                'ext': 'mp4', -                'title': 'Put It On Me', -                'description': 'md5:f6422397c07c4c907c6638e1fee380a5', -                'upload_date': '20180426', -                'uploader': 'Matt Maeson - Topic', -                'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ', -                'artist': 'Matt Maeson', -                'track': 'Put It On Me', -                'album': 'The Hearse', -                'release_date': None, -                'release_year': 2018, -            }, -            'params': { -                'skip_download': True, -            }, -        }, -        {              'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',              'only_matching': True,          }, @@ -1234,6 +1091,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  'skip_download': True,              },          }, +        { +            # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093) +            'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no', +            'info_dict': { +                'id': 'CHqg6qOn4no', +                'ext': 'mp4', +                'title': 'Part 77   Sort a list of simple types in c#', +                'description': 'md5:b8746fa52e10cdbf47997903f13b20dc', +                'upload_date': '20130831', +                'uploader_id': 'kudvenkat', +                'uploader': 'kudvenkat', +            }, +            'params': { +                'skip_download': True, +            }, +        },      ]      def __init__(self, *args, **kwargs): @@ -1455,7 +1328,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              # https://github.com/ytdl-org/youtube-dl/pull/7599)              r';ytplayer\.config\s*=\s*({.+?});ytplayer',              r';ytplayer\.config\s*=\s*({.+?});', -            r'ytInitialPlayerResponse\s*=\s*({.+?});var meta'          )          config = self._search_regex(              patterns, webpage, 'ytplayer.config', default=None) @@ -1463,44 +1335,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              return self._parse_json(                  uppercase_escape(config), video_id, fatal=False) -    def _get_music_metadata_from_yt_initial(self, yt_initial): -        music_metadata = [] -        key_map = { -            'Album': 'album', -            'Artist': 'artist', -            'Song': 'track' -        } -        contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents']) -        if type(contents) is list: -            for content in contents: -                music_track = {} -                if type(content) is not dict: -                    continue -                videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer']) -                if type(videoSecondaryInfoRenderer) is not dict: -                    continue -                rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows']) -                if type(rows) is not list: -                    continue -                for row in rows: -                    metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer']) -                    if type(metadataRowRenderer) is not dict: -                        continue -                    key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText']) -                    value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \ -                        try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text']) -                    if type(key) is not str or type(value) is not str: -                        continue -                    if key in key_map: -                        if key_map[key] in music_track: -                            # we've started on a new track -                            music_metadata.append(music_track) -                            music_track = {} -                        music_track[key_map[key]] = value -                if len(music_track.keys()): -                    music_metadata.append(music_track) -        return music_metadata -      def _get_automatic_captions(self, video_id, webpage):          """We need the webpage for getting the captions url, pass it as an             argument to speed up the process.""" @@ -1511,11 +1345,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              self._downloader.report_warning(err_msg)              return {}          try: -            if "args" in player_config and "ttsurl" in player_config["args"]: -                args = player_config['args'] -                caption_url = args['ttsurl'] +            args = player_config['args'] +            caption_url = args.get('ttsurl') +            if caption_url:                  timestamp = args['timestamp'] -                  # We get the available subtitles                  list_params = compat_urllib_parse_urlencode({                      'type': 'list', @@ -1571,24 +1404,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  return captions              # New captions format as of 22.06.2017 -            if "args" in player_config: -                player_response = player_config["args"].get('player_response') -            else: -                # New player system (ytInitialPlayerResponse) as of October 2020 -                player_response = player_config - -            if player_response: -                if isinstance(player_response, compat_str): -                    player_response = self._parse_json( -                        player_response, video_id, fatal=False) - -                renderer = player_response['captions']['playerCaptionsTracklistRenderer'] -                caption_tracks = renderer['captionTracks'] -                for caption_track in caption_tracks: -                    if 'kind' not in caption_track: -                        # not an automatic transcription -                        continue -                    base_url = caption_track['baseUrl'] +            player_response = args.get('player_response') +            if player_response and isinstance(player_response, compat_str): +                player_response = self._parse_json( +                    player_response, video_id, fatal=False) +                if player_response: +                    renderer = player_response['captions']['playerCaptionsTracklistRenderer'] +                    base_url = renderer['captionTracks'][0]['baseUrl']                      sub_lang_list = []                      for lang in renderer['translationLanguages']:                          lang_code = lang.get('languageCode') @@ -1596,25 +1418,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                              sub_lang_list.append(lang_code)                      return make_captions(base_url, sub_lang_list) -                self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id) -                return {} - -            if "args" in player_config: -                args = player_config["args"] - -                # Some videos don't provide ttsurl but rather caption_tracks and -                # caption_translation_languages (e.g. 20LmZk1hakA) -                # Does not used anymore as of 22.06.2017 -                caption_tracks = args['caption_tracks'] -                caption_translation_languages = args['caption_translation_languages'] -                caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] -                sub_lang_list = [] -                for lang in caption_translation_languages.split(','): -                    lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) -                    sub_lang = lang_qs.get('lc', [None])[0] -                    if sub_lang: -                        sub_lang_list.append(sub_lang) -                return make_captions(caption_url, sub_lang_list) +            # Some videos don't provide ttsurl but rather caption_tracks and +            # caption_translation_languages (e.g. 20LmZk1hakA) +            # Does not used anymore as of 22.06.2017 +            caption_tracks = args['caption_tracks'] +            caption_translation_languages = args['caption_translation_languages'] +            caption_url = compat_parse_qs(caption_tracks.split(',')[0])['u'][0] +            sub_lang_list = [] +            for lang in caption_translation_languages.split(','): +                lang_qs = compat_parse_qs(compat_urllib_parse_unquote_plus(lang)) +                sub_lang = lang_qs.get('lc', [None])[0] +                if sub_lang: +                    sub_lang_list.append(sub_lang) +            return make_captions(caption_url, sub_lang_list)          # An extractor error can be raise by the download process if there are          # no automatic captions but there are subtitles          except (KeyError, IndexError, ExtractorError): @@ -1695,15 +1511,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):      def _extract_chapters_from_json(self, webpage, video_id, duration):          if not webpage:              return -        initial_data = self._parse_json( -            self._search_regex( -                r'window\["ytInitialData"\] = (.+);\n', webpage, -                'player args', default='{}'), -            video_id, fatal=False) -        if not initial_data or not isinstance(initial_data, dict): +        data = self._extract_yt_initial_data(video_id, webpage) +        if not data or not isinstance(data, dict):              return          chapters_list = try_get( -            initial_data, +            data,              lambda x: x['playerOverlays']                         ['playerOverlayRenderer']                         ['decoratedPlayerBarRenderer'] @@ -1937,8 +1749,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              age_gate = False              # Try looking directly into the video webpage              ytplayer_config = self._get_ytplayer_config(video_id, video_webpage) -            args = ytplayer_config.get("args") -            if args is not None: +            if ytplayer_config: +                args = ytplayer_config.get('args', {})                  if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):                      # Convert to the same format returned by compat_parse_qs                      video_info = dict((k, [v]) for k, v in args.items()) @@ -1953,11 +1765,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                      is_live = True                  if not player_response:                      player_response = extract_player_response(args.get('player_response'), video_id) -            elif not player_response: -                player_response = ytplayer_config              if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):                  add_dash_mpd_pr(player_response) +        if not video_info and not player_response: +            player_response = extract_player_response( +                self._search_regex( +                    r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage, +                    'initial player response', default='{}'), +                video_id) +          def extract_unavailable_message():              messages = []              for tag, kind in (('h1', 'message'), ('div', 'submessage')): @@ -2162,7 +1979,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  if cipher:                      if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): -                        ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))' +                        ASSETS_RE = ( +                            r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base', +                            r'"jsUrl"\s*:\s*("[^"]+")', +                            r'"assets":.+?"js":\s*("[^"]+")')                          jsplayer_url_json = self._search_regex(                              ASSETS_RE,                              embed_webpage if age_gate else video_webpage, @@ -2298,6 +2118,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):              else:                  error_message = extract_unavailable_message()                  if not error_message: +                    reason_list = try_get( +                        player_response, +                        lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'], +                        list) or [] +                    for reason in reason_list: +                        if not isinstance(reason, dict): +                            continue +                        reason_text = try_get(reason, lambda x: x['text'], compat_str) +                        if reason_text: +                            if not error_message: +                                error_message = '' +                            error_message += reason_text +                    if error_message: +                        error_message = clean_html(error_message) +                if not error_message:                      error_message = clean_html(try_get(                          player_response, lambda x: x['playabilityStatus']['reason'],                          compat_str)) @@ -2422,7 +2257,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          # Youtube Music Auto-generated description          release_date = release_year = None          if video_description: -            mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) +            mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)              if mobj:                  if not track:                      track = mobj.group('track').strip() @@ -2439,13 +2274,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor):                  if release_year:                      release_year = int(release_year) -        yt_initial = self._get_yt_initial_data(video_id, video_webpage) -        if yt_initial: -            music_metadata = self._get_music_metadata_from_yt_initial(yt_initial) -            if len(music_metadata): -                album = music_metadata[0].get('album') -                artist = music_metadata[0].get('artist') -                track = music_metadata[0].get('track') +        yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage) +        contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] +        for content in contents: +            rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or [] +            multiple_songs = False +            for row in rows: +                if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True: +                    multiple_songs = True +                    break +            for row in rows: +                mrr = row.get('metadataRowRenderer') or {} +                mrr_title = try_get( +                    mrr, lambda x: x['title']['simpleText'], compat_str) +                mrr_contents = try_get( +                    mrr, lambda x: x['contents'][0], dict) or {} +                mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str) +                if not (mrr_title and mrr_contents_text): +                    continue +                if mrr_title == 'License': +                    video_license = mrr_contents_text +                elif not multiple_songs: +                    if mrr_title == 'Album': +                        album = mrr_contents_text +                    elif mrr_title == 'Artist': +                        artist = mrr_contents_text +                    elif mrr_title == 'Song': +                        track = mrr_contents_text          m_episode = re.search(              r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', @@ -2478,8 +2333,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          def _extract_count(count_name):              return str_to_int(self._search_regex( -                r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}' -                % re.escape(count_name), +                (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name), +                 r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),                  video_webpage, count_name, default=None))          like_count = _extract_count('like') @@ -2656,44 +2511,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor):          } -class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): -    IE_DESC = 'YouTube.com playlists' -    _VALID_URL = r"""(?x)(?: -                        (?:https?://)? +class YoutubeTabIE(YoutubeBaseInfoExtractor): +    IE_DESC = 'YouTube.com tab' +    _VALID_URL = r'''(?x) +                    https?://                          (?:\w+\.)?                          (?: -                            (?: -                                youtube(?:kids)?\.com| -                                invidio\.us -                            ) -                            / -                            (?: -                               (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) -                               \? (?:.*?[&;])*? (?:p|a|list)= -                            |  p/ +                            youtube(?:kids)?\.com| +                            invidio\.us +                        )/ +                        (?: +                            (?:channel|c|user)/| +                            (?P<not_channel> +                                feed/| +                                (?:playlist|watch)\?.*?\blist=                              )| -                            youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist= +                            (?!(%s)([/#?]|$))  # Direct URLs                          ) -                        ( -                            (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,} -                            # Top tracks, they can also include dots -                            |(?:MC)[\w\.]* -                        ) -                        .* -                     | -                        (%(playlist_id)s) -                     )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} -    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' -    _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?' -    _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})' -    IE_NAME = 'youtube:playlist' -    _YTM_PLAYLIST_PREFIX = 'RDCLAK5uy_' -    _YTM_CHANNEL_INFO = { -        'uploader': 'Youtube Music', -        'uploader_id': 'music',  # or "UC-9-kyTW8ZkZNDHQJ6FgpwQ" -        'uploader_url': 'https://www.youtube.com/music' -    } +                        (?P<id>[^/?\#&]+) +                    ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES +    IE_NAME = 'youtube:tab' +      _TESTS = [{ +        # playlists, multipage +        'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid', +        'playlist_mincount': 94, +        'info_dict': { +            'id': 'UCqj7Cz7revf5maW9g5pgNcg', +            'title': 'Игорь Клейнер - Playlists', +            'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', +        }, +    }, { +        # playlists, multipage, different order +        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', +        'playlist_mincount': 94, +        'info_dict': { +            'id': 'UCqj7Cz7revf5maW9g5pgNcg', +            'title': 'Игорь Клейнер - Playlists', +            'description': 'md5:be97ee0f14ee314f1f002cf187166ee2', +        }, +    }, { +        # playlists, singlepage +        'url': 'https://www.youtube.com/user/ThirstForScience/playlists', +        'playlist_mincount': 4, +        'info_dict': { +            'id': 'UCAEtajcuhQ6an9WEzY9LEMQ', +            'title': 'ThirstForScience - Playlists', +            'description': 'md5:609399d937ea957b0f53cbffb747a14c', +        } +    }, { +        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', +        'only_matching': True, +    }, { +        # basic, single video playlist          'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',          'info_dict': {              'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', @@ -2703,6 +2573,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):          },          'playlist_count': 1,      }, { +        # empty playlist          'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',          'info_dict': {              'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA', @@ -2712,71 +2583,92 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):          },          'playlist_count': 0,      }, { -        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', -        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', +        # Home tab +        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',          'info_dict': { -            'title': '29C3: Not my department', -            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', -            'uploader': 'Christiaan008', -            'uploader_id': 'ChRiStIaAn008', +            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', +            'title': 'lex will - Home', +            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',          }, -        'playlist_count': 96, +        'playlist_mincount': 2,      }, { -        'note': 'issue #673', -        'url': 'PLBB231211A4F62143', +        # Videos tab +        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',          'info_dict': { -            'title': '[OLD]Team Fortress 2 (Class-based LP)', -            'id': 'PLBB231211A4F62143', -            'uploader': 'Wickydoo', -            'uploader_id': 'Wickydoo', +            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', +            'title': 'lex will - Videos', +            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',          }, -        'playlist_mincount': 26, +        'playlist_mincount': 975,      }, { -        'note': 'Large playlist', -        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', +        # Videos tab, sorted by popular +        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',          'info_dict': { -            'title': 'Uploads from Cauchemar', -            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', -            'uploader': 'Cauchemar', -            'uploader_id': 'Cauchemar89', +            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', +            'title': 'lex will - Videos', +            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',          }, -        'playlist_mincount': 799, +        'playlist_mincount': 199,      }, { -        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', +        # Playlists tab +        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',          'info_dict': { -            'title': 'YDL_safe_search', -            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl', +            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', +            'title': 'lex will - Playlists', +            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',          }, -        'playlist_count': 2, -        'skip': 'This playlist is private', +        'playlist_mincount': 17,      }, { -        'note': 'embedded', -        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', -        'playlist_count': 4, +        # Community tab +        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',          'info_dict': { -            'title': 'JODA15', -            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', -            'uploader': 'milan', -            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw', -        } +            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', +            'title': 'lex will - Community', +            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', +        }, +        'playlist_mincount': 18,      }, { -        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', -        'playlist_mincount': 485, +        # Channels tab +        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',          'info_dict': { -            'title': '2018 Chinese New Singles (11/6 updated)', -            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', -            'uploader': 'LBK', -            'uploader_id': 'sdragonfang', -        } +            'id': 'UCKfVa3S1e4PHvxWcwyMMg8w', +            'title': 'lex will - Channels', +            'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488', +        }, +        'playlist_mincount': 138,      }, { -        'note': 'Embedded SWF player', -        'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', -        'playlist_count': 4, +        'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA', +        'only_matching': True, +    }, { +        'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', +        'only_matching': True, +    }, { +        'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', +        'only_matching': True, +    }, { +        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.', +        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',          'info_dict': { -            'title': 'JODA7', -            'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ', +            'title': '29C3: Not my department', +            'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC', +            'uploader': 'Christiaan008', +            'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',          }, -        'skip': 'This playlist does not exist', +        'playlist_count': 96, +    }, { +        'note': 'Large playlist', +        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q', +        'info_dict': { +            'title': 'Uploads from Cauchemar', +            'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q', +            'uploader': 'Cauchemar', +            'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', +        }, +        'playlist_mincount': 1123, +    }, { +        # even larger playlist, 8832 videos +        'url': 'http://www.youtube.com/user/NASAgovVideo/videos', +        'only_matching': True,      }, {          'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',          'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA', @@ -2784,10 +2676,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):              'title': 'Uploads from Interstellar Movie',              'id': 'UUXw-G3eDE9trcvY2sBMM_aA',              'uploader': 'Interstellar Movie', -            'uploader_id': 'InterstellarMovie1', +            'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',          },          'playlist_mincount': 21,      }, { +        # https://github.com/ytdl-org/youtube-dl/issues/21844 +        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', +        'info_dict': { +            'title': 'Data Analysis with Dr Mike Pound', +            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', +            'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA', +            'uploader': 'Computerphile', +        }, +        'playlist_mincount': 11, +    }, { +        'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc', +        'only_matching': True, +    }, {          # Playlist URL that does not actually serve a playlist          'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',          'info_dict': { @@ -2812,491 +2717,719 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):          'skip': 'This video is not available.',          'add_ie': [YoutubeIE.ie_key()],      }, { -        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', +        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', +        'only_matching': True, +    }, { +        'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM', +        'only_matching': True, +    }, { +        'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',          'info_dict': { -            'id': 'yeWKywCrFtk', +            'id': '9Auq9mYxFEE',              'ext': 'mp4', -            'title': 'Small Scale Baler and Braiding Rugs', -            'uploader': 'Backus-Page House Museum', -            'uploader_id': 'backuspagemuseum', -            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', -            'upload_date': '20161008', -            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', -            'categories': ['Nonprofits & Activism'], +            'title': 'Watch Sky News live', +            'uploader': 'Sky News', +            'uploader_id': 'skynews', +            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews', +            'upload_date': '20191102', +            'description': 'md5:78de4e1c2359d0ea3ed829678e38b662', +            'categories': ['News & Politics'],              'tags': list,              'like_count': int,              'dislike_count': int,          },          'params': { -            'noplaylist': True,              'skip_download': True,          },      }, { -        # https://github.com/ytdl-org/youtube-dl/issues/21844 -        'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', +        'url': 'https://www.youtube.com/user/TheYoungTurks/live',          'info_dict': { -            'title': 'Data Analysis with Dr Mike Pound', -            'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba', -            'uploader_id': 'Computerphile', -            'uploader': 'Computerphile', +            'id': 'a48o2S1cPoo', +            'ext': 'mp4', +            'title': 'The Young Turks - Live Main Show', +            'uploader': 'The Young Turks', +            'uploader_id': 'TheYoungTurks', +            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', +            'upload_date': '20150715', +            'license': 'Standard YouTube License', +            'description': 'md5:438179573adcdff3c97ebb1ee632b891', +            'categories': ['News & Politics'], +            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], +            'like_count': int, +            'dislike_count': int,          }, -        'playlist_mincount': 11, +        'params': { +            'skip_download': True, +        }, +        'only_matching': True,      }, { -        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21', +        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',          'only_matching': True,      }, { -        'url': 'TLGGrESM50VT6acwMjAyMjAxNw', +        'url': 'https://www.youtube.com/c/CommanderVideoHq/live',          'only_matching': True,      }, { -        # music album playlist -        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM', +        'url': 'https://www.youtube.com/feed/trending',          'only_matching': True,      }, { -        'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU', +        # needs auth +        'url': 'https://www.youtube.com/feed/library',          'only_matching': True,      }, { -        'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g', +        # needs auth +        'url': 'https://www.youtube.com/feed/history',          'only_matching': True, -    }] +    }, { +        # needs auth +        'url': 'https://www.youtube.com/feed/subscriptions', +        'only_matching': True, +    }, { +        # needs auth +        'url': 'https://www.youtube.com/feed/watch_later', +        'only_matching': True, +    }, { +        # no longer available? +        'url': 'https://www.youtube.com/feed/recommended', +        'only_matching': True, +    } +        # TODO +        # { +        #     'url': 'https://www.youtube.com/TheYoungTurks/live', +        #     'only_matching': True, +        # } +    ] -    def _real_initialize(self): -        self._login() +    def _extract_channel_id(self, webpage): +        channel_id = self._html_search_meta( +            'channelId', webpage, 'channel id', default=None) +        if channel_id: +            return channel_id +        channel_url = self._html_search_meta( +            ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url', +             'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad', +             'twitter:app:url:googleplay'), webpage, 'channel url') +        return self._search_regex( +            r'https?://(?:www\.)?youtube\.com/channel/([^/?#&])+', +            channel_url, 'channel id') -    def extract_videos_from_page(self, page): -        ids_in_page = [] -        titles_in_page = [] - -        for item in re.findall( -                r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page): -            attrs = extract_attributes(item) -            video_id = attrs['data-video-id'] -            video_title = unescapeHTML(attrs.get('data-title')) -            if video_title: -                video_title = video_title.strip() -            ids_in_page.append(video_id) -            titles_in_page.append(video_title) - -        # Fallback with old _VIDEO_RE -        self.extract_videos_from_page_impl( -            self._VIDEO_RE, page, ids_in_page, titles_in_page) - -        # Relaxed fallbacks -        self.extract_videos_from_page_impl( -            r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page, -            ids_in_page, titles_in_page) -        self.extract_videos_from_page_impl( -            r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page, -            ids_in_page, titles_in_page) - -        return zip(ids_in_page, titles_in_page) - -    def _extract_mix_ids_from_yt_initial(self, yt_initial): -        ids = [] -        playlist_contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['contents'], list) -        if playlist_contents: -            for item in playlist_contents: -                videoId = try_get(item, lambda x: x['playlistPanelVideoRenderer']['videoId'], compat_str) -                if videoId: -                    ids.append(videoId) -        return ids - -    def _extract_mix(self, playlist_id): -        # The mixes are generated from a single video -        # the id of the playlist is just 'RD' + video_id -        ids = [] -        yt_initial = None -        last_id = playlist_id[-11:] -        for n in itertools.count(1): -            url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id) -            webpage = self._download_webpage( -                url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n)) -            new_ids = orderedSet(re.findall( -                r'''(?xs)data-video-username=".*?".*? -                           href="/watch\?v=([0-9A-Za-z_-]{11})&[^"]*?list=%s''' % re.escape(playlist_id), -                webpage)) - -            # if no ids in html of page, try using embedded json -            if (len(new_ids) == 0): -                yt_initial = self._get_yt_initial_data(playlist_id, webpage) -                if yt_initial: -                    new_ids = self._extract_mix_ids_from_yt_initial(yt_initial) - -            # Fetch new pages until all the videos are repeated, it seems that -            # there are always 51 unique videos. -            new_ids = [_id for _id in new_ids if _id not in ids] -            if not new_ids: -                break -            ids.extend(new_ids) -            last_id = ids[-1] +    @staticmethod +    def _extract_grid_item_renderer(item): +        for item_kind in ('Playlist', 'Video', 'Channel'): +            renderer = item.get('grid%sRenderer' % item_kind) +            if renderer: +                return renderer + +    def _extract_video(self, renderer): +        video_id = renderer.get('videoId') +        title = try_get( +            renderer, +            (lambda x: x['title']['runs'][0]['text'], +             lambda x: x['title']['simpleText']), compat_str) +        description = try_get( +            renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'], +            compat_str) +        duration = parse_duration(try_get( +            renderer, lambda x: x['lengthText']['simpleText'], compat_str)) +        view_count_text = try_get( +            renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or '' +        view_count = str_to_int(self._search_regex( +            r'^([\d,]+)', re.sub(r'\s', '', view_count_text), +            'view count', default=None)) +        uploader = try_get( +            renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str) +        return { +            '_type': 'url_transparent', +            'ie_key': YoutubeIE.ie_key(), +            'id': video_id, +            'url': video_id, +            'title': title, +            'description': description, +            'duration': duration, +            'view_count': view_count, +            'uploader': uploader, +        } + +    def _grid_entries(self, grid_renderer): +        for item in grid_renderer['items']: +            if not isinstance(item, dict): +                continue +            renderer = self._extract_grid_item_renderer(item) +            if not isinstance(renderer, dict): +                continue +            title = try_get( +                renderer, lambda x: x['title']['runs'][0]['text'], compat_str) +            # playlist +            playlist_id = renderer.get('playlistId') +            if playlist_id: +                yield self.url_result( +                    'https://www.youtube.com/playlist?list=%s' % playlist_id, +                    ie=YoutubeTabIE.ie_key(), video_id=playlist_id, +                    video_title=title) +            # video +            video_id = renderer.get('videoId') +            if video_id: +                yield self._extract_video(renderer) +            # channel +            channel_id = renderer.get('channelId') +            if channel_id: +                title = try_get( +                    renderer, lambda x: x['title']['simpleText'], compat_str) +                yield self.url_result( +                    'https://www.youtube.com/channel/%s' % channel_id, +                    ie=YoutubeTabIE.ie_key(), video_title=title) + +    def _shelf_entries_from_content(self, shelf_renderer): +        content = shelf_renderer.get('content') +        if not isinstance(content, dict): +            return +        renderer = content.get('gridRenderer') +        if renderer: +            # TODO: add support for nested playlists so each shelf is processed +            # as separate playlist +            # TODO: this includes only first N items +            for entry in self._grid_entries(renderer): +                yield entry +        renderer = content.get('horizontalListRenderer') +        if renderer: +            # TODO +            pass + +    def _shelf_entries(self, shelf_renderer): +        ep = try_get( +            shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], +            compat_str) +        shelf_url = urljoin('https://www.youtube.com', ep) +        if shelf_url: +            title = try_get( +                shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str) +            yield self.url_result(shelf_url, video_title=title) +        # Shelf may not contain shelf URL, fallback to extraction from content +        for entry in self._shelf_entries_from_content(shelf_renderer): +            yield entry + +    def _playlist_entries(self, video_list_renderer): +        for content in video_list_renderer['contents']: +            if not isinstance(content, dict): +                continue +            renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer') +            if not isinstance(renderer, dict): +                continue +            video_id = renderer.get('videoId') +            if not video_id: +                continue +            yield self._extract_video(renderer) + +    r""" # Not needed in the new implementation +    def _itemSection_entries(self, item_sect_renderer): +        for content in item_sect_renderer['contents']: +            if not isinstance(content, dict): +                continue +            renderer = content.get('videoRenderer', {}) +            if not isinstance(renderer, dict): +                continue +            video_id = renderer.get('videoId') +            if not video_id: +                continue +            yield self._extract_video(renderer) +    """ -        url_results = self._ids_to_results(ids) +    def _rich_entries(self, rich_grid_renderer): +        renderer = try_get( +            rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {} +        video_id = renderer.get('videoId') +        if not video_id: +            return +        yield self._extract_video(renderer) -        search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage) -        title_span = ( -            search_title('playlist-title') -            or search_title('title long-title') -            or search_title('title')) -        title = clean_html(title_span) +    def _video_entry(self, video_renderer): +        video_id = video_renderer.get('videoId') +        if video_id: +            return self._extract_video(video_renderer) -        if not title: -            title = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist']['title'], compat_str) +    def _post_thread_entries(self, post_thread_renderer): +        post_renderer = try_get( +            post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict) +        if not post_renderer: +            return +        # video attachment +        video_renderer = try_get( +            post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict) +        video_id = None +        if video_renderer: +            entry = self._video_entry(video_renderer) +            if entry: +                yield entry +        # inline video links +        runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or [] +        for run in runs: +            if not isinstance(run, dict): +                continue +            ep_url = try_get( +                run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str) +            if not ep_url: +                continue +            if not YoutubeIE.suitable(ep_url): +                continue +            ep_video_id = YoutubeIE._match_id(ep_url) +            if video_id == ep_video_id: +                continue +            yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=video_id) -        return self.playlist_result(url_results, playlist_id, title) +    def _post_thread_continuation_entries(self, post_thread_continuation): +        contents = post_thread_continuation.get('contents') +        if not isinstance(contents, list): +            return +        for content in contents: +            renderer = content.get('backstagePostThreadRenderer') +            if not isinstance(renderer, dict): +                continue +            for entry in self._post_thread_entries(renderer): +                yield entry -    def _extract_playlist(self, playlist_id): -        url = self._TEMPLATE_URL % playlist_id -        page = self._download_webpage(url, playlist_id) +    @staticmethod +    def _extract_next_continuation_data(renderer): +        next_continuation = try_get( +            renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict) +        if not next_continuation: +            return +        continuation = next_continuation.get('continuation') +        if not continuation: +            return +        ctp = next_continuation.get('clickTrackingParams') +        return { +            'ctoken': continuation, +            'continuation': continuation, +            'itct': ctp, +        } -        # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604) -        for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page): -            match = match.strip() -            # Check if the playlist exists or is private -            mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match) -            if mobj: -                reason = mobj.group('reason') -                message = 'This playlist %s' % reason -                if 'private' in reason: -                    message += ', use --username or --netrc to access it' -                message += '.' -                raise ExtractorError(message, expected=True) -            elif re.match(r'[^<]*Invalid parameters[^<]*', match): -                raise ExtractorError( -                    'Invalid parameters. Maybe URL is incorrect.', -                    expected=True) -            elif re.match(r'[^<]*Choose your language[^<]*', match): +    @classmethod +    def _extract_continuation(cls, renderer): +        next_continuation = cls._extract_next_continuation_data(renderer) +        if next_continuation: +            return next_continuation +        contents = renderer.get('contents') +        if not isinstance(contents, list): +            return +        for content in contents: +            if not isinstance(content, dict):                  continue -            else: -                self.report_warning('Youtube gives an alert message: ' + match) +            continuation_ep = try_get( +                content, lambda x: x['continuationItemRenderer']['continuationEndpoint'], +                dict) +            if not continuation_ep: +                continue +            continuation = try_get( +                continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) +            if not continuation: +                continue +            ctp = continuation_ep.get('clickTrackingParams') +            if not ctp: +                continue +            return { +                'ctoken': continuation, +                'continuation': continuation, +                'itct': ctp, +            } -        playlist_title = self._html_search_regex( -            r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>', -            page, 'title', default=None) +    def _entries(self, tab, identity_token): -        _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref=' -        uploader = self._html_search_regex( -            r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE, -            page, 'uploader', default=None) -        mobj = re.search( -            r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE, -            page) -        if mobj: -            uploader_id = mobj.group('uploader_id') -            uploader_url = compat_urlparse.urljoin(url, mobj.group('path')) -        else: -            uploader_id = uploader_url = None +        def extract_entries(parent_renderer):  # this needs to called again for continuation to work with feeds +            contents = try_get(parent_renderer, lambda x: x['contents'], list) or [] +            for content in contents: +                if not isinstance(content, dict): +                    continue +                is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict) +                if not is_renderer: +                    renderer = content.get('richItemRenderer') +                    if renderer: +                        for entry in self._rich_entries(renderer): +                            yield entry +                        continuation_list[0] = self._extract_continuation(parent_renderer) +                    continue +                isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] +                for isr_content in isr_contents: +                    if not isinstance(isr_content, dict): +                        continue +                    renderer = isr_content.get('playlistVideoListRenderer') +                    if renderer: +                        for entry in self._playlist_entries(renderer): +                            yield entry +                        continuation_list[0] = self._extract_continuation(renderer) +                        continue +                    renderer = isr_content.get('gridRenderer') +                    if renderer: +                        for entry in self._grid_entries(renderer): +                            yield entry +                        continuation_list[0] = self._extract_continuation(renderer) +                        continue +                    renderer = isr_content.get('shelfRenderer') +                    if renderer: +                        for entry in self._shelf_entries(renderer): +                            yield entry +                        continue +                    renderer = isr_content.get('backstagePostThreadRenderer') +                    if renderer: +                        for entry in self._post_thread_entries(renderer): +                            yield entry +                        continuation_list[0] = self._extract_continuation(renderer) +                        continue +                    renderer = isr_content.get('videoRenderer') +                    if renderer: +                        entry = self._video_entry(renderer) +                        if entry: +                            yield entry + +                if not continuation_list[0]: +                    continuation_list[0] = self._extract_continuation(is_renderer) + +            if not continuation_list[0]: +                continuation_list[0] = self._extract_continuation(parent_renderer) + +        continuation_list = [None]  # Python 2 doesnot support nonlocal +        parent_renderer = ( +            try_get(tab, lambda x: x['sectionListRenderer'], dict) +            or try_get(tab, lambda x: x['richGridRenderer'], dict) or {}) +        for entry in extract_entries(parent_renderer): +            yield entry +        continuation = continuation_list[0] + +        headers = { +            'x-youtube-client-name': '1', +            'x-youtube-client-version': '2.20201112.04.01', +        } +        if identity_token: +            headers['x-youtube-identity-token'] = identity_token -        has_videos = True +        for page_num in itertools.count(1): +            if not continuation: +                break +            browse = self._download_json( +                'https://www.youtube.com/browse_ajax', None, +                'Downloading page %d' % page_num, +                headers=headers, query=continuation, fatal=False) +            if not browse: +                break +            response = try_get(browse, lambda x: x[1]['response'], dict) +            if not response: +                break -        if not playlist_title: -            try: -                # Some playlist URLs don't actually serve a playlist (e.g. -                # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4) -                next(self._entries(page, playlist_id)) -            except StopIteration: -                has_videos = False +            continuation_contents = try_get( +                response, lambda x: x['continuationContents'], dict) +            if continuation_contents: +                continuation_renderer = continuation_contents.get('playlistVideoListContinuation') +                if continuation_renderer: +                    for entry in self._playlist_entries(continuation_renderer): +                        yield entry +                    continuation = self._extract_continuation(continuation_renderer) +                    continue +                continuation_renderer = continuation_contents.get('gridContinuation') +                if continuation_renderer: +                    for entry in self._grid_entries(continuation_renderer): +                        yield entry +                    continuation = self._extract_continuation(continuation_renderer) +                    continue +                continuation_renderer = continuation_contents.get('itemSectionContinuation') +                if continuation_renderer: +                    for entry in self._post_thread_continuation_entries(continuation_renderer): +                        yield entry +                    continuation = self._extract_continuation(continuation_renderer) +                    continue +                continuation_renderer = continuation_contents.get('sectionListContinuation')  # for feeds +                if continuation_renderer: +                    continuation_list = [None] +                    for entry in extract_entries(continuation_renderer): +                        yield entry +                    continuation = continuation_list[0] +                    continue + +            continuation_items = try_get( +                response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list) +            if continuation_items: +                continuation_item = continuation_items[0] +                if not isinstance(continuation_item, dict): +                    continue +                renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer') +                if renderer: +                    video_list_renderer = {'contents': continuation_items} +                    for entry in self._playlist_entries(video_list_renderer): +                        yield entry +                    continuation = self._extract_continuation(video_list_renderer) +                    continue +            break + +    @staticmethod +    def _extract_selected_tab(tabs): +        for tab in tabs: +            if try_get(tab, lambda x: x['tabRenderer']['selected'], bool): +                return tab['tabRenderer'] +        else: +            raise ExtractorError('Unable to find selected tab') +    @staticmethod +    def _extract_uploader(data): +        uploader = {} +        sidebar_renderer = try_get( +            data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list) +        if sidebar_renderer: +            for item in sidebar_renderer: +                if not isinstance(item, dict): +                    continue +                renderer = item.get('playlistSidebarSecondaryInfoRenderer') +                if not isinstance(renderer, dict): +                    continue +                owner = try_get( +                    renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) +                if owner: +                    uploader['uploader'] = owner.get('text') +                    uploader['uploader_id'] = try_get( +                        owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) +                    uploader['uploader_url'] = urljoin( +                        'https://www.youtube.com/', +                        try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) +        return uploader + +    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token): +        selected_tab = self._extract_selected_tab(tabs) +        renderer = try_get( +            data, lambda x: x['metadata']['channelMetadataRenderer'], dict) +        playlist_id = title = description = None +        if renderer: +            channel_title = renderer.get('title') or item_id +            tab_title = selected_tab.get('title') +            title = channel_title or item_id +            if tab_title: +                title += ' - %s' % tab_title +            description = renderer.get('description') +            playlist_id = renderer.get('externalId') +        renderer = try_get( +            data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) +        if renderer: +            title = renderer.get('title') +            description = None +            playlist_id = item_id +        if playlist_id is None: +            playlist_id = item_id +        if title is None: +            title = "Youtube " + playlist_id.title()          playlist = self.playlist_result( -            self._entries(page, playlist_id), playlist_id, playlist_title) -        playlist.update({ -            'uploader': uploader, -            'uploader_id': uploader_id, -            'uploader_url': uploader_url, -        }) -        if playlist_id.startswith(self._YTM_PLAYLIST_PREFIX): -            playlist.update(self._YTM_CHANNEL_INFO) +            self._entries(selected_tab['content'], identity_token), +            playlist_id=playlist_id, playlist_title=title, +            playlist_description=description) +        playlist.update(self._extract_uploader(data)) +        return playlist -        return has_videos, playlist +    def _extract_from_playlist(self, item_id, data, playlist): +        title = playlist.get('title') or try_get( +            data, lambda x: x['titleText']['simpleText'], compat_str) +        playlist_id = playlist.get('playlistId') or item_id +        return self.playlist_result( +            self._playlist_entries(playlist), playlist_id=playlist_id, +            playlist_title=title) + +    def _extract_alerts(self, data): +        for alert_dict in try_get(data, lambda x: x['alerts'], list) or []: +            for renderer in alert_dict: +                alert = alert_dict[renderer] +                alert_type = alert.get('type') +                if not alert_type: +                    continue +                message = try_get(alert, lambda x: x['text']['simpleText'], compat_str) +                if message: +                    yield alert_type, message +                for run in try_get(alert, lambda x: x['text']['runs'], list) or []: +                    message = try_get(run, lambda x: x['text'], compat_str) +                    if message: +                        yield alert_type, message -    def _check_download_just_video(self, url, playlist_id): -        # Check if it's a video-specific URL -        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) -        video_id = query_dict.get('v', [None])[0] or self._search_regex( -            r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, -            'video id', default=None) -        if video_id: +    def _real_extract(self, url): +        item_id = self._match_id(url) +        url = compat_urlparse.urlunparse( +            compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) +        is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url) +        if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed': +            self._downloader.report_warning( +                'A channel/user page was given. All the channel\'s videos will be downloaded. ' +                'To download only the videos in the home page, add a "/home" to the URL') +            url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '') + +        # Handle both video/playlist URLs +        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) +        video_id = qs.get('v', [None])[0] +        playlist_id = qs.get('list', [None])[0] + +        if is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id: +            if playlist_id: +                self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id)) +                url = 'https://www.youtube.com/playlist?list=%s' % playlist_id +                # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key()) +            else: +                raise ExtractorError('Unable to recognize tab page') +        if video_id and playlist_id:              if self._downloader.params.get('noplaylist'):                  self.to_screen('Downloading just video %s because of --no-playlist' % video_id) -                return video_id, self.url_result(video_id, 'Youtube', video_id=video_id) -            else: -                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) -                return video_id, None -        return None, None +                return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) +            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id)) + +        webpage = self._download_webpage(url, item_id) +        identity_token = self._search_regex( +            r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage, +            'identity token', default=None) +        data = self._extract_yt_initial_data(item_id, webpage) +        for alert_type, alert_message in self._extract_alerts(data): +            self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message)) +        tabs = try_get( +            data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list) +        if tabs: +            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token) +        playlist = try_get( +            data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict) +        if playlist: +            return self._extract_from_playlist(item_id, data, playlist) +        # Fallback to video extraction if no playlist alike page is recognized. +        # First check for the current video then try the v attribute of URL query. +        video_id = try_get( +            data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'], +            compat_str) or video_id +        if video_id: +            return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id) +        # Failed to recognize +        raise ExtractorError('Unable to recognize tab page') -    def _real_extract(self, url): -        # Extract playlist id -        mobj = re.match(self._VALID_URL, url) -        if mobj is None: -            raise ExtractorError('Invalid URL: %s' % url) -        playlist_id = mobj.group(1) or mobj.group(2) - -        video_id, video = self._check_download_just_video(url, playlist_id) -        if video: -            return video - -        if playlist_id.startswith(('RD', 'UL', 'PU')): -            if not playlist_id.startswith(self._YTM_PLAYLIST_PREFIX): -                # Mixes require a custom extraction process, -                # Youtube Music playlists act like normal playlists (with randomized order) -                return self._extract_mix(playlist_id) - -        has_videos, playlist = self._extract_playlist(playlist_id) -        if has_videos or not video_id: -            return playlist - -        # Some playlist URLs don't actually serve a playlist (see -        # https://github.com/ytdl-org/youtube-dl/issues/10537). -        # Fallback to plain video extraction if there is a video id -        # along with playlist id. -        return self.url_result(video_id, 'Youtube', video_id=video_id) - - -class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): -    IE_DESC = 'YouTube.com channels' -    _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)' -    _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos' -    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?' -    IE_NAME = 'youtube:channel' + +class YoutubePlaylistIE(InfoExtractor): +    IE_DESC = 'YouTube.com playlists' +    _VALID_URL = r'''(?x)(?: +                        (?:https?://)? +                        (?:\w+\.)? +                        (?: +                            (?: +                                youtube(?:kids)?\.com| +                                invidio\.us| +                                youtu\.be +                            ) +                            /.*?\?.*?\blist= +                        )? +                        (?P<id>%(playlist_id)s) +                     )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE} +    IE_NAME = 'youtube:playlist'      _TESTS = [{ -        'note': 'paginated channel', -        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w', -        'playlist_mincount': 91, +        'note': 'issue #673', +        'url': 'PLBB231211A4F62143',          'info_dict': { -            'id': 'UUKfVa3S1e4PHvxWcwyMMg8w', -            'title': 'Uploads from lex will', -            'uploader': 'lex will', -            'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w', -        } +            'title': '[OLD]Team Fortress 2 (Class-based LP)', +            'id': 'PLBB231211A4F62143', +            'uploader': 'Wickydoo', +            'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', +        }, +        'playlist_mincount': 29,      }, { -        'note': 'Age restricted channel', -        # from https://www.youtube.com/user/DeusExOfficial -        'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w', -        'playlist_mincount': 64, +        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',          'info_dict': { -            'id': 'UUs0ifCMCm1icqRbqhUINa0w', -            'title': 'Uploads from Deus Ex', -            'uploader': 'Deus Ex', -            'uploader_id': 'DeusExOfficial', +            'title': 'YDL_safe_search', +            'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',          }, +        'playlist_count': 2, +        'skip': 'This playlist is private',      }, { -        'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA', -        'only_matching': True, -    }, { -        'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA', -        'only_matching': True, -    }] - -    @classmethod -    def suitable(cls, url): -        return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) -                else super(YoutubeChannelIE, cls).suitable(url)) - -    def _build_template_url(self, url, channel_id): -        return self._TEMPLATE_URL % channel_id - -    def _real_extract(self, url): -        channel_id = self._match_id(url) - -        url = self._build_template_url(url, channel_id) - -        # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) -        # Workaround by extracting as a playlist if managed to obtain channel playlist URL -        # otherwise fallback on channel by page extraction -        channel_page = self._download_webpage( -            url + '?view=57', channel_id, -            'Downloading channel page', fatal=False) -        if channel_page is False: -            channel_playlist_id = False -        else: -            channel_playlist_id = self._html_search_meta( -                'channelId', channel_page, 'channel id', default=None) -            if not channel_playlist_id: -                channel_url = self._html_search_meta( -                    ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), -                    channel_page, 'channel url', default=None) -                if channel_url: -                    channel_playlist_id = self._search_regex( -                        r'vnd\.youtube://user/([0-9A-Za-z_-]+)', -                        channel_url, 'channel id', default=None) -        if channel_playlist_id and channel_playlist_id.startswith('UC'): -            playlist_id = 'UU' + channel_playlist_id[2:] -            return self.url_result( -                compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist') - -        channel_page = self._download_webpage(url, channel_id, 'Downloading page #1') -        autogenerated = re.search(r'''(?x) -                class="[^"]*?(?: -                    channel-header-autogenerated-label| -                    yt-channel-title-autogenerated -                )[^"]*"''', channel_page) is not None - -        if autogenerated: -            # The videos are contained in a single page -            # the ajax pages can't be used, they are empty -            entries = [ -                self.url_result( -                    video_id, 'Youtube', video_id=video_id, -                    video_title=video_title) -                for video_id, video_title in self.extract_videos_from_page(channel_page)] -            return self.playlist_result(entries, channel_id) - -        try: -            next(self._entries(channel_page, channel_id)) -        except StopIteration: -            alert_message = self._html_search_regex( -                r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', -                channel_page, 'alert', default=None, group='alert') -            if alert_message: -                raise ExtractorError('Youtube said: %s' % alert_message, expected=True) - -        return self.playlist_result(self._entries(channel_page, channel_id), channel_id) - - -class YoutubeUserIE(YoutubeChannelIE): -    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' -    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)' -    _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' -    IE_NAME = 'youtube:user' - -    _TESTS = [{ -        'url': 'https://www.youtube.com/user/TheLinuxFoundation', -        'playlist_mincount': 320, +        'note': 'embedded', +        'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', +        'playlist_count': 4,          'info_dict': { -            'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', -            'title': 'Uploads from The Linux Foundation', -            'uploader': 'The Linux Foundation', -            'uploader_id': 'TheLinuxFoundation', +            'title': 'JODA15', +            'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', +            'uploader': 'milan', +            'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',          }      }, { -        # Only available via https://www.youtube.com/c/12minuteathlete/videos -        # but not https://www.youtube.com/user/12minuteathlete/videos -        'url': 'https://www.youtube.com/c/12minuteathlete/videos', -        'playlist_mincount': 249, +        'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', +        'playlist_mincount': 982,          'info_dict': { -            'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', -            'title': 'Uploads from 12 Minute Athlete', -            'uploader': '12 Minute Athlete', -            'uploader_id': 'the12minuteathlete', +            'title': '2018 Chinese New Singles (11/6 updated)', +            'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', +            'uploader': 'LBK', +            'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',          }      }, { -        'url': 'ytuser:phihag', -        'only_matching': True, -    }, { -        'url': 'https://www.youtube.com/c/gametrailers', -        'only_matching': True, +        'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5', +        'info_dict': { +            'id': 'yeWKywCrFtk', +            'ext': 'mp4', +            'title': 'Small Scale Baler and Braiding Rugs', +            'uploader': 'Backus-Page House Museum', +            'uploader_id': 'backuspagemuseum', +            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum', +            'upload_date': '20161008', +            'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a', +            'categories': ['Nonprofits & Activism'], +            'tags': list, +            'like_count': int, +            'dislike_count': int, +        }, +        'params': { +            'noplaylist': True, +            'skip_download': True, +        },      }, { -        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak', +        'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',          'only_matching': True,      }, { -        'url': 'https://www.youtube.com/gametrailers', +        'url': 'TLGGrESM50VT6acwMjAyMjAxNw',          'only_matching': True,      }, { -        # This channel is not available, geo restricted to JP -        'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', +        # music album playlist +        'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',          'only_matching': True,      }]      @classmethod      def suitable(cls, url): -        # Don't return True if the url can be extracted with other youtube -        # extractor, the regex would is too permissive and it would match. -        other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls) -        if any(ie.suitable(url) for ie in other_yt_ies): -            return False -        else: -            return super(YoutubeUserIE, cls).suitable(url) - -    def _build_template_url(self, url, channel_id): -        mobj = re.match(self._VALID_URL, url) -        return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) +        return False if YoutubeTabIE.suitable(url) else super( +            YoutubePlaylistIE, cls).suitable(url) +    def _real_extract(self, url): +        playlist_id = self._match_id(url) +        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) +        if not qs: +            qs = {'list': playlist_id} +        return self.url_result( +            update_url_query('https://www.youtube.com/playlist', qs), +            ie=YoutubeTabIE.ie_key(), video_id=playlist_id) -class YoutubeLiveIE(YoutubeBaseInfoExtractor): -    IE_DESC = 'YouTube.com live streams' -    _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live' -    IE_NAME = 'youtube:live' +class YoutubeYtUserIE(InfoExtractor): +    _VALID_URL = r'ytuser:(?P<id>.+)'      _TESTS = [{ -        'url': 'https://www.youtube.com/user/TheYoungTurks/live', -        'info_dict': { -            'id': 'a48o2S1cPoo', -            'ext': 'mp4', -            'title': 'The Young Turks - Live Main Show', -            'uploader': 'The Young Turks', -            'uploader_id': 'TheYoungTurks', -            'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks', -            'upload_date': '20150715', -            'license': 'Standard YouTube License', -            'description': 'md5:438179573adcdff3c97ebb1ee632b891', -            'categories': ['News & Politics'], -            'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'], -            'like_count': int, -            'dislike_count': int, -        }, -        'params': { -            'skip_download': True, -        }, -    }, { -        'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live', -        'only_matching': True, -    }, { -        'url': 'https://www.youtube.com/c/CommanderVideoHq/live', -        'only_matching': True, -    }, { -        'url': 'https://www.youtube.com/TheYoungTurks/live', +        'url': 'ytuser:phihag',          'only_matching': True,      }]      def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        channel_id = mobj.group('id') -        base_url = mobj.group('base_url') -        webpage = self._download_webpage(url, channel_id, fatal=False) -        if webpage: -            page_type = self._og_search_property( -                'type', webpage, 'page type', default='') -            video_id = self._html_search_meta( -                'videoId', webpage, 'video id', default=None) -            if page_type.startswith('video') and video_id and re.match( -                    r'^[0-9A-Za-z_-]{11}$', video_id): -                return self.url_result(video_id, YoutubeIE.ie_key()) -        return self.url_result(base_url) - - -class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): -    IE_DESC = 'YouTube.com user/channel playlists' -    _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists' -    IE_NAME = 'youtube:playlists' +        user_id = self._match_id(url) +        return self.url_result( +            'https://www.youtube.com/user/%s' % user_id, +            ie=YoutubeTabIE.ie_key(), video_id=user_id) + +class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): +    IE_NAME = 'youtube:favorites' +    IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)' +    _VALID_URL = r':ytfav(?:ou?rite)?s?' +    _LOGIN_REQUIRED = True      _TESTS = [{ -        'url': 'https://www.youtube.com/user/ThirstForScience/playlists', -        'playlist_mincount': 4, -        'info_dict': { -            'id': 'ThirstForScience', -            'title': 'ThirstForScience', -        }, -    }, { -        # with "Load more" button -        'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd', -        'playlist_mincount': 70, -        'info_dict': { -            'id': 'igorkle1', -            'title': 'Игорь Клейнер', -        }, -    }, { -        'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists', -        'playlist_mincount': 17, -        'info_dict': { -            'id': 'UCiU1dHvZObB2iP6xkJ__Icw', -            'title': 'Chem Player', -        }, -        'skip': 'Blocked', +        'url': ':ytfav', +        'only_matching': True,      }, { -        'url': 'https://www.youtube.com/c/ChristophLaimer/playlists', +        'url': ':ytfavorites',          'only_matching': True,      }] +    def _real_extract(self, url): +        return self.url_result( +            'https://www.youtube.com/playlist?list=LL', +            ie=YoutubeTabIE.ie_key()) + -class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistBaseInfoExtractor): +class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):      IE_DESC = 'YouTube.com searches'      # there doesn't appear to be a real limit, for example if you search for      # 'python' you get more than 8.000.000 results @@ -3393,10 +3526,11 @@ class YoutubeSearchDateIE(YoutubeSearchIE):      _SEARCH_PARAMS = 'CAI%3D' -class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): +class YoutubeSearchURLIE(YoutubeSearchIE):      IE_DESC = 'YouTube.com search URLs' -    IE_NAME = 'youtube:search_url' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' +    IE_NAME = YoutubeSearchIE.IE_NAME + '_url' +    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)' +    # _MAX_RESULTS = 100      _TESTS = [{          'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',          'playlist_mincount': 5, @@ -3408,47 +3542,25 @@ class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor):          'only_matching': True,      }] -    def _process_json_dict(self, obj, videos, c): -        if "videoId" in obj: -            videos.append(obj) -            return - -        if "nextContinuationData" in obj: -            c["continuation"] = obj["nextContinuationData"] -            return - -    def _real_extract(self, url): -        mobj = re.match(self._VALID_URL, url) -        query = compat_urllib_parse_unquote_plus(mobj.group('query')) -        webpage = self._download_webpage(url, query) -        return self.playlist_result(self._entries(webpage, query, max_pages=5), playlist_title=query) - - -class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor): -    IE_DESC = 'YouTube.com (multi-season) shows' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' -    IE_NAME = 'youtube:show' -    _TESTS = [{ -        'url': 'https://www.youtube.com/show/airdisasters', -        'playlist_mincount': 5, -        'info_dict': { -            'id': 'airdisasters', -            'title': 'Air Disasters', -        } -    }] +    @classmethod +    def _make_valid_url(cls): +        return cls._VALID_URL      def _real_extract(self, url): -        playlist_id = self._match_id(url) -        return super(YoutubeShowIE, self)._real_extract( -            'https://www.youtube.com/show/%s/playlists' % playlist_id) +        qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query) +        query = (qs.get('search_query') or qs.get('q'))[0] +        self._SEARCH_PARAMS = qs.get('sp', ('',))[0] +        return self._get_n_results(query, self._MAX_RESULTS) -class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor): +class YoutubeFeedsInfoExtractor(YoutubeTabIE):      """      Base class for feed extractors -    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties. +    Subclasses must define the _FEED_NAME property.      """      _LOGIN_REQUIRED = True +    # _MAX_PAGES = 5 +    _TESTS = []      @property      def IE_NAME(self): @@ -3457,89 +3569,63 @@ class YoutubeFeedsInfoExtractor(YoutubePlaylistBaseInfoExtractor):      def _real_initialize(self):          self._login() -    def _process_entries(self, entries, seen): -        new_info = [] -        for v in entries: -            v_id = try_get(v, lambda x: x['videoId']) -            if not v_id: -                continue - -            have_video = False -            for old in seen: -                if old['videoId'] == v_id: -                    have_video = True -                    break - -            if not have_video: -                new_info.append(v) - -        if not new_info: -            return - -        seen.extend(new_info) -        for video in new_info: -            yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=self._extract_title(video)) -      def _real_extract(self, url): -        page = self._download_webpage( +        return self.url_result(              'https://www.youtube.com/feed/%s' % self._FEED_NAME, -            self._PLAYLIST_TITLE) -        return self.playlist_result(self._entries(page, self._PLAYLIST_TITLE), -                                    playlist_title=self._PLAYLIST_TITLE) +            ie=YoutubeTabIE.ie_key()) -class YoutubeWatchLaterIE(YoutubePlaylistIE): +class YoutubeWatchLaterIE(InfoExtractor):      IE_NAME = 'youtube:watchlater'      IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater' - +    _VALID_URL = r':ytwatchlater'      _TESTS = [{ -        'url': 'https://www.youtube.com/playlist?list=WL', -        'only_matching': True, -    }, { -        'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL', +        'url': ':ytwatchlater',          'only_matching': True,      }]      def _real_extract(self, url): -        _, video = self._check_download_just_video(url, 'WL') -        if video: -            return video -        _, playlist = self._extract_playlist('WL') -        return playlist - - -class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): -    IE_NAME = 'youtube:favorites' -    IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?' -    _LOGIN_REQUIRED = True - -    def _real_extract(self, url): -        webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos') -        playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id') -        return self.url_result(playlist_id, 'YoutubePlaylist') +        return self.url_result( +            'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key())  class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):      IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?' +    _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'      _FEED_NAME = 'recommended' -    _PLAYLIST_TITLE = 'Youtube Recommended videos' +    _TESTS = [{ +        'url': ':ytrec', +        'only_matching': True, +    }, { +        'url': ':ytrecommended', +        'only_matching': True, +    }, { +        'url': 'https://youtube.com', +        'only_matching': True, +    }]  class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor): -    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?' +    IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)' +    _VALID_URL = r':ytsub(?:scription)?s?'      _FEED_NAME = 'subscriptions' -    _PLAYLIST_TITLE = 'Youtube Subscriptions' +    _TESTS = [{ +        'url': ':ytsubs', +        'only_matching': True, +    }, { +        'url': ':ytsubscriptions', +        'only_matching': True, +    }]  class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):      IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)' -    _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory' +    _VALID_URL = r':ythistory'      _FEED_NAME = 'history' -    _PLAYLIST_TITLE = 'Youtube History' +    _TESTS = [{ +        'url': ':ythistory', +        'only_matching': True, +    }]  class YoutubeTruncatedURLIE(InfoExtractor): @@ -3606,3 +3692,25 @@ class YoutubeTruncatedIDIE(InfoExtractor):          raise ExtractorError(              'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),              expected=True) + + +# Do Youtube show urls even exist anymore? I couldn't find any +r''' +class YoutubeShowIE(YoutubeTabIE): +    IE_DESC = 'YouTube.com (multi-season) shows' +    _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)' +    IE_NAME = 'youtube:show' +    _TESTS = [{ +        'url': 'https://www.youtube.com/show/airdisasters', +        'playlist_mincount': 5, +        'info_dict': { +            'id': 'airdisasters', +            'title': 'Air Disasters', +        } +    }] + +    def _real_extract(self, url): +        playlist_id = self._match_id(url) +        return super(YoutubeShowIE, self)._real_extract( +            'https://www.youtube.com/show/%s/playlists' % playlist_id) +''' diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index f5dc1bdaf..68b4ca944 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -2460,7 +2460,7 @@ class XAttrMetadataError(YoutubeDLError):          # Parsing code and msg          if (self.code in (errno.ENOSPC, errno.EDQUOT) -                or 'No space left' in self.msg or 'Disk quota excedded' in self.msg): +                or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):              self.reason = 'NO_SPACE'          elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:              self.reason = 'VALUE_TOO_LONG' @@ -4085,7 +4085,7 @@ def js_to_json(code):          v = m.group(0)          if v in ('true', 'false', 'null'):              return v -        elif v.startswith('/*') or v.startswith('//') or v == ',': +        elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':              return ""          if v[0] in ("'", '"'): @@ -4095,12 +4095,12 @@ def js_to_json(code):                  '\\\n': '',                  '\\x': '\\u00',              }.get(m.group(0), m.group(0)), v[1:-1]) - -        for regex, base in INTEGER_TABLE: -            im = re.match(regex, v) -            if im: -                i = int(im.group(1), base) -                return '"%d":' % i if v.endswith(':') else '%d' % i +        else: +            for regex, base in INTEGER_TABLE: +                im = re.match(regex, v) +                if im: +                    i = int(im.group(1), base) +                    return '"%d":' % i if v.endswith(':') else '%d' % i          return '"%s"' % v @@ -4110,7 +4110,8 @@ def js_to_json(code):          {comment}|,(?={skip}[\]}}])|          (?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|          \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| -        [0-9]+(?={skip}:) +        [0-9]+(?={skip}:)| +        !+          '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) @@ -4214,10 +4215,10 @@ def parse_codecs(codecs_str):      # http://tools.ietf.org/html/rfc6381      if not codecs_str:          return {} -    splited_codecs = list(filter(None, map( +    split_codecs = list(filter(None, map(          lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))      vcodec, acodec = None, None -    for full_codec in splited_codecs: +    for full_codec in split_codecs:          codec = full_codec.split('.')[0]          if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):              if not vcodec: @@ -4228,10 +4229,10 @@ def parse_codecs(codecs_str):          else:              write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)      if not vcodec and not acodec: -        if len(splited_codecs) == 2: +        if len(split_codecs) == 2:              return { -                'vcodec': splited_codecs[0], -                'acodec': splited_codecs[1], +                'vcodec': split_codecs[0], +                'acodec': split_codecs[1],              }      else:          return { @@ -5470,7 +5471,7 @@ def encode_base_n(num, n, table=None):  def decode_packed_codes(code):      mobj = re.search(PACKED_CODES_RE, code) -    obfucasted_code, base, count, symbols = mobj.groups() +    obfuscated_code, base, count, symbols = mobj.groups()      base = int(base)      count = int(count)      symbols = symbols.split('|') @@ -5483,7 +5484,7 @@ def decode_packed_codes(code):      return re.sub(          r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)], -        obfucasted_code) +        obfuscated_code)  def caesar(s, alphabet, shift): | 
