diff options
| -rw-r--r-- | test/test_all_urls.py | 9 | ||||
| -rw-r--r-- | test/test_execution.py | 10 | ||||
| -rw-r--r-- | test/test_youtube_misc.py | 26 | ||||
| -rw-r--r-- | yt_dlp/extractor/bbc.py | 31 | ||||
| -rw-r--r-- | yt_dlp/extractor/go.py | 50 | ||||
| -rw-r--r-- | yt_dlp/extractor/youtube.py | 11 | 
6 files changed, 110 insertions, 27 deletions
| diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 06099a679..5f3c77d8e 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -72,15 +72,6 @@ class TestAllURLsMatching(unittest.TestCase):          self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])          self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url']) -    def test_youtube_extract(self): -        assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) -        assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') -        assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') -        assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') -        assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') -        assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') -        assertExtractId('BaW_jenozKc', 'BaW_jenozKc') -      def test_facebook_matching(self):          self.assertTrue(FacebookIE.suitable('https://www.facebook.com/Shiniknoh#!/photo.php?v=10153317450565268'))          self.assertTrue(FacebookIE.suitable('https://www.facebook.com/cindyweather?fref=ts#!/photo.php?v=10152183998945793')) diff --git a/test/test_execution.py b/test/test_execution.py index 2aea4df1b..8a0d65bfb 100644 --- a/test/test_execution.py +++ b/test/test_execution.py @@ -39,6 +39,16 @@ class TestExecution(unittest.TestCase):          _, stderr = p.communicate()          self.assertFalse(stderr) +    def test_lazy_extractors(self): +        try: +            subprocess.check_call([sys.executable, 'devscripts/make_lazy_extractors.py', 'yt_dlp/extractor/lazy_extractors.py'], cwd=rootDir, stdout=_DEV_NULL) +            subprocess.check_call([sys.executable, 'test/test_all_urls.py'], cwd=rootDir, stdout=_DEV_NULL) +        finally: +            try: +                os.remove('yt_dlp/extractor/lazy_extractors.py') +            except (IOError, OSError): +                pass +  if __name__ == '__main__':      unittest.main() diff --git a/test/test_youtube_misc.py b/test/test_youtube_misc.py new file mode 100644 index 000000000..d9bb10d26 --- /dev/null +++ b/test/test_youtube_misc.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python +from __future__ import unicode_literals + +# Allow direct execution +import os +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +from yt_dlp.extractor import YoutubeIE + + +class TestYoutubeMisc(unittest.TestCase): +    def test_youtube_extract(self): +        assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id) +        assertExtractId('http://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') +        assertExtractId('https://www.youtube.com/watch?&v=BaW_jenozKc', 'BaW_jenozKc') +        assertExtractId('https://www.youtube.com/watch?feature=player_embedded&v=BaW_jenozKc', 'BaW_jenozKc') +        assertExtractId('https://www.youtube.com/watch_popup?v=BaW_jenozKc', 'BaW_jenozKc') +        assertExtractId('http://www.youtube.com/watch?v=BaW_jenozKcsharePLED17F32AD9753930', 'BaW_jenozKc') +        assertExtractId('BaW_jenozKc', 'BaW_jenozKc') + + +if __name__ == '__main__': +    unittest.main() diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index 333796c80..edc2c697b 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -11,6 +11,7 @@ from ..compat import (      compat_etree_Element,      compat_HTTPError,      compat_parse_qs, +    compat_str,      compat_urllib_parse_urlparse,      compat_urlparse,  ) @@ -25,8 +26,10 @@ from ..utils import (      js_to_json,      parse_duration,      parse_iso8601, +    strip_or_none,      try_get,      unescapeHTML, +    unified_timestamp,      url_or_none,      urlencode_postdata,      urljoin, @@ -761,8 +764,17 @@ class BBCIE(BBCCoUkIE):          'only_matching': True,      }, {          # custom redirection to www.bbc.com +        # also, video with window.__INITIAL_DATA__          'url': 'http://www.bbc.co.uk/news/science-environment-33661876', -        'only_matching': True, +        'info_dict': { +            'id': 'p02xzws1', +            'ext': 'mp4', +            'title': "Pluto may have 'nitrogen glaciers'", +            'description': 'md5:6a95b593f528d7a5f2605221bc56912f', +            'thumbnail': r're:https?://.+/.+\.jpg', +            'timestamp': 1437785037, +            'upload_date': '20150725', +        },      }, {          # single video article embedded with data-media-vpid          'url': 'http://www.bbc.co.uk/sport/rowing/35908187', @@ -1164,12 +1176,29 @@ class BBCIE(BBCCoUkIE):                          continue                      formats, subtitles = self._download_media_selector(item_id)                      self._sort_formats(formats) +                    item_desc = None +                    blocks = try_get(media, lambda x: x['summary']['blocks'], list) +                    if blocks: +                        summary = [] +                        for block in blocks: +                            text = try_get(block, lambda x: x['model']['text'], compat_str) +                            if text: +                                summary.append(text) +                        if summary: +                            item_desc = '\n\n'.join(summary) +                    item_time = None +                    for meta in try_get(media, lambda x: x['metadata']['items'], list) or []: +                        if try_get(meta, lambda x: x['label']) == 'Published': +                            item_time = unified_timestamp(meta.get('timestamp')) +                            break                      entries.append({                          'id': item_id,                          'title': item_title,                          'thumbnail': item.get('holdingImageUrl'),                          'formats': formats,                          'subtitles': subtitles, +                        'timestamp': item_time, +                        'description': strip_or_none(item_desc),                      })              for resp in (initial_data.get('data') or {}).values():                  name = resp.get('name') diff --git a/yt_dlp/extractor/go.py b/yt_dlp/extractor/go.py index c31e07a0c..7dcdc864f 100644 --- a/yt_dlp/extractor/go.py +++ b/yt_dlp/extractor/go.py @@ -4,12 +4,14 @@ from __future__ import unicode_literals  import re  from .adobepass import AdobePassIE +from ..compat import compat_str  from ..utils import (      int_or_none,      determine_ext,      parse_age_limit,      remove_start,      remove_end, +    try_get,      urlencode_postdata,      ExtractorError,  ) @@ -119,6 +121,18 @@ class GoIE(AdobePassIE):              'skip_download': True,          },      }, { +        'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', +        'info_dict': { +            'id': 'VDKA22600213', +            'ext': 'mp4', +            'title': 'Pilot', +            'description': 'md5:74306df917cfc199d76d061d66bebdb4', +        }, +        'params': { +            # m3u8 download +            'skip_download': True, +        }, +    }, {          'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',          'only_matching': True,      }, { @@ -154,18 +168,30 @@ class GoIE(AdobePassIE):          brand = site_info.get('brand')          if not video_id or not site_info:              webpage = self._download_webpage(url, display_id or video_id) -            video_id = self._search_regex( -                ( -                    # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" -                    # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood -                    r'data-video-id=["\']*(VDKA\w+)', -                    # https://github.com/ytdl-org/youtube-dl/pull/25216/files -                    #  The following is based on the pull request on the line above. Changed the ABC.com URL to a show available now. -                    # https://abc.com/shows/the-rookie/episode-guide/season-02/19-the-q-word -                    r'\bvideoIdCode["\']\s*:\s*["\'](vdka\w+)', -                    # Deprecated  fallback pattern -                    r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' -                ), webpage, 'video id', default=video_id) +            data = self._parse_json( +                self._search_regex( +                    r'["\']__abc_com__["\']\s*\]\s*=\s*({.+?})\s*;', webpage, +                    'data', default='{}'), +                display_id or video_id, fatal=False) +            # https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot +            layout = try_get(data, lambda x: x['page']['content']['video']['layout'], dict) +            video_id = None +            if layout: +                video_id = try_get( +                    layout, +                    (lambda x: x['videoid'], lambda x: x['video']['id']), +                    compat_str) +            if not video_id: +                video_id = self._search_regex( +                    ( +                        # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" +                        # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood +                        r'data-video-id=["\']*(VDKA\w+)', +                        # page.analytics.videoIdCode +                        r'\bvideoIdCode["\']\s*:\s*["\']((?:vdka|VDKA)\w+)', +                        # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet +                        r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)' +                    ), webpage, 'video id', default=video_id)              if not site_info:                  brand = self._search_regex(                      (r'data-brand=\s*["\']\s*(\d+)', diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index eaa2db1da..edc985d19 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -77,11 +77,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):      _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' -    def _ids_to_results(self, ids): -        return [ -            self.url_result(vid_id, 'Youtube', video_id=vid_id) -            for vid_id in ids] -      def _login(self):          """          Attempt to log in to YouTube. @@ -1313,6 +1308,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):      @classmethod      def suitable(cls, url): +        # Hack for lazy extractors until more generic solution is implemented +        # (see #28780) +        from .youtube import parse_qs          qs = parse_qs(url)          if qs.get('list', [None])[0]:              return False @@ -3595,6 +3593,9 @@ class YoutubePlaylistIE(InfoExtractor):      def suitable(cls, url):          if YoutubeTabIE.suitable(url):              return False +        # Hack for lazy extractors until more generic solution is implemented +        # (see #28780) +        from .youtube import parse_qs          qs = parse_qs(url)          if qs.get('v', [None])[0]:              return False | 
