diff options
| author | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-08-01 06:52:03 +0530 | 
|---|---|---|
| committer | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-08-02 01:08:16 +0530 | 
| commit | 8f97a15d1c7ebc10d0b51ce24632ac17b34a5f69 (patch) | |
| tree | 218588a6ee85435864e9848ce4450d2731c2e35b | |
| parent | 47304e07dc4a044242f7d5a14c3f6c3e5f3ad8ba (diff) | |
| download | hypervideo-pre-8f97a15d1c7ebc10d0b51ce24632ac17b34a5f69.tar.lz hypervideo-pre-8f97a15d1c7ebc10d0b51ce24632ac17b34a5f69.tar.xz hypervideo-pre-8f97a15d1c7ebc10d0b51ce24632ac17b34a5f69.zip | |
[extractor] Framework for embed detection (#4307)
| -rw-r--r-- | devscripts/lazy_load_template.py | 6 | ||||
| -rw-r--r-- | devscripts/make_lazy_extractors.py | 7 | ||||
| -rw-r--r-- | yt_dlp/YoutubeDL.py | 3 | ||||
| -rw-r--r-- | yt_dlp/extractor/brightcove.py | 4 | ||||
| -rw-r--r-- | yt_dlp/extractor/common.py | 99 | ||||
| -rw-r--r-- | yt_dlp/extractor/generic.py | 101 | ||||
| -rw-r--r-- | yt_dlp/extractor/spotify.py | 2 | ||||
| -rw-r--r-- | yt_dlp/utils.py | 4 | 
8 files changed, 149 insertions, 77 deletions
| diff --git a/devscripts/lazy_load_template.py b/devscripts/lazy_load_template.py index cdafaf1ef..a6e26b6f6 100644 --- a/devscripts/lazy_load_template.py +++ b/devscripts/lazy_load_template.py @@ -9,11 +9,13 @@ from ..utils import (      write_string,  ) +# These bloat the lazy_extractors, so allow them to passthrough silently +ALLOWED_CLASSMETHODS = {'get_testcases', 'extract_from_webpage'} +  class LazyLoadMetaClass(type):      def __getattr__(cls, name): -        # "_TESTS" bloat the lazy_extractors -        if '_real_class' not in cls.__dict__ and name != 'get_testcases': +        if '_real_class' not in cls.__dict__ and name not in ALLOWED_CLASSMETHODS:              write_string(                  'WARNING: Falling back to normal extractor since lazy extractor '                  f'{cls.__name__} does not have attribute {name}{bug_reports_message()}\n') diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py index 60fcc5ef0..c9fdfb562 100644 --- a/devscripts/make_lazy_extractors.py +++ b/devscripts/make_lazy_extractors.py @@ -11,7 +11,7 @@ import optparse  from inspect import getsource  NO_ATTR = object() -STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_WORKING', '_NETRC_MACHINE', 'age_limit'] +STATIC_CLASS_PROPERTIES = ['IE_NAME', 'IE_DESC', 'SEARCH_KEY', '_VALID_URL', '_WORKING', '_NETRC_MACHINE', 'age_limit']  CLASS_METHODS = [      'ie_key', 'working', 'description', 'suitable', '_match_valid_url', '_match_id', 'get_temp_id', 'is_suitable'  ] @@ -116,11 +116,6 @@ def build_lazy_ie(ie, name, attr_base):      }.get(base.__name__, base.__name__) for base in ie.__bases__)      s = IE_TEMPLATE.format(name=name, module=ie.__module__, bases=bases) -    valid_url = getattr(ie, '_VALID_URL', None) -    if not valid_url and hasattr(ie, '_make_valid_url'): -        valid_url = ie._make_valid_url() -    if valid_url: -        s += f'    _VALID_URL = {valid_url!r}\n'      return s + '\n'.join(extra_ie_code(ie, attr_base)) diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index ce8ac2e89..f6f97b8ec 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1566,7 +1566,8 @@ class YoutubeDL:          result_type = ie_result.get('_type', 'video')          if result_type in ('url', 'url_transparent'): -            ie_result['url'] = sanitize_url(ie_result['url']) +            ie_result['url'] = sanitize_url( +                ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https')              if ie_result.get('original_url'):                  extra_info.setdefault('original_url', ie_result['original_url']) diff --git a/yt_dlp/extractor/brightcove.py b/yt_dlp/extractor/brightcove.py index a5412897d..99a216fb4 100644 --- a/yt_dlp/extractor/brightcove.py +++ b/yt_dlp/extractor/brightcove.py @@ -402,11 +402,11 @@ class BrightcoveNewIE(AdobePassIE):      @staticmethod      def _extract_url(ie, webpage): -        urls = BrightcoveNewIE._extract_urls(ie, webpage) +        urls = BrightcoveNewIE._extract_brightcove_urls(ie, webpage)          return urls[0] if urls else None      @staticmethod -    def _extract_urls(ie, webpage): +    def _extract_brightcove_urls(ie, webpage):          # Reference:          # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe          # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d168763e0..b8347fe4c 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -14,6 +14,7 @@ import random  import re  import sys  import time +import types  import urllib.parse  import urllib.request  import xml.etree.ElementTree @@ -23,6 +24,7 @@ from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name  from ..downloader import FileDownloader  from ..downloader.f4m import get_base_url, remove_encrypted_media  from ..utils import ( +    IDENTITY,      JSON_LD_RE,      NO_DEFAULT,      ExtractorError, @@ -59,6 +61,7 @@ from ..utils import (      parse_m3u8_attributes,      parse_resolution,      sanitize_filename, +    sanitize_url,      sanitized_Request,      str_or_none,      str_to_int, @@ -431,14 +434,26 @@ class InfoExtractor:      title, description etc. -    Subclasses of this should define a _VALID_URL regexp and, re-define the -    _real_extract() and (optionally) _real_initialize() methods. -    Probably, they should also be added to the list of extractors. +    Subclasses of this should also be added to the list of extractors and +    should define a _VALID_URL regexp and, re-define the _real_extract() and +    (optionally) _real_initialize() methods.      Subclasses may also override suitable() if necessary, but ensure the function      signature is preserved and that this function imports everything it needs      (except other extractors), so that lazy_extractors works correctly. +    Subclasses can define a list of _EMBED_REGEX, which will be searched for in +    the HTML of Generic webpages. It may also override _extract_embed_urls +    or _extract_from_webpage as necessary. While these are normally classmethods, +    _extract_from_webpage is allowed to be an instance method. + +    _extract_from_webpage may raise self.StopExtraction() to stop further +    processing of the webpage and obtain exclusive rights to it. This is useful +    when the extractor cannot reliably be matched using just the URL. +    Eg: invidious/peertube instances + +    Embed-only extractors can be defined by setting _VALID_URL = False. +      To support username + password (or netrc) login, the extractor must define a      _NETRC_MACHINE and re-define _perform_login(username, password) and      (optionally) _initialize_pre_login() methods. The _perform_login method will @@ -476,6 +491,8 @@ class InfoExtractor:      _NETRC_MACHINE = None      IE_DESC = None      SEARCH_KEY = None +    _VALID_URL = None +    _EMBED_REGEX = []      def _login_hint(self, method=NO_DEFAULT, netrc=None):          password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' @@ -499,12 +516,12 @@ class InfoExtractor:      @classmethod      def _match_valid_url(cls, url): +        if cls._VALID_URL is False: +            return None          # This does not use has/getattr intentionally - we want to know whether          # we have cached the regexp for *this* class, whereas getattr would also          # match the superclass          if '_VALID_URL_RE' not in cls.__dict__: -            if '_VALID_URL' not in cls.__dict__: -                cls._VALID_URL = cls._make_valid_url()              cls._VALID_URL_RE = re.compile(cls._VALID_URL)          return cls._VALID_URL_RE.match(url) @@ -1143,10 +1160,12 @@ class InfoExtractor:              'url': url,          } -    def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs): -        urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {})) -                for m in orderedSet(map(getter, matches) if getter else matches)) -        return self.playlist_result(urls, playlist_id, playlist_title, **kwargs) +    @classmethod +    def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None, +                              getter=IDENTITY, ie=None, video_kwargs=None, **kwargs): +        return cls.playlist_result( +            (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)), +            playlist_id, playlist_title, **kwargs)      @staticmethod      def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs): @@ -1353,12 +1372,20 @@ class InfoExtractor:      def _dc_search_uploader(self, html):          return self._html_search_meta('dc.creator', html, 'uploader') -    def _rta_search(self, html): +    @staticmethod +    def _rta_search(html):          # See http://www.rtalabel.org/index.php?content=howtofaq#single          if re.search(r'(?ix)<meta\s+name="rating"\s+'                       r'     content="RTA-5042-1996-1400-1577-RTA"',                       html):              return 18 + +        # And then there are the jokers who advertise that they use RTA, but actually don't. +        AGE_LIMIT_MARKERS = [ +            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', +        ] +        if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS): +            return 18          return 0      def _media_rating_search(self, html): @@ -1965,14 +1992,9 @@ class InfoExtractor:              else 'https:')      def _proto_relative_url(self, url, scheme=None): -        if url is None: -            return url -        if url.startswith('//'): -            if scheme is None: -                scheme = self.http_scheme() -            return scheme + url -        else: -            return url +        scheme = scheme or self.http_scheme() +        assert scheme.endswith(':') +        return sanitize_url(url, scheme=scheme[:-1])      def _sleep(self, timeout, video_id, msg_template=None):          if msg_template is None: @@ -3767,10 +3789,12 @@ class InfoExtractor:              headers['Ytdl-request-proxy'] = geo_verification_proxy          return headers -    def _generic_id(self, url): +    @staticmethod +    def _generic_id(url):          return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) -    def _generic_title(self, url): +    @staticmethod +    def _generic_title(url):          return urllib.parse.unquote(os.path.splitext(url_basename(url))[0])      @staticmethod @@ -3816,6 +3840,37 @@ class InfoExtractor:          self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}')          return True +    @classmethod +    def extract_from_webpage(cls, ydl, url, webpage): +        ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType) +              else ydl.get_info_extractor(cls.ie_key())) +        yield from ie._extract_from_webpage(url, webpage) or [] + +    @classmethod +    def _extract_from_webpage(cls, url, webpage): +        for embed_url in orderedSet( +                cls._extract_embed_urls(url, webpage) or [], lazy=True): +            yield cls.url_result(embed_url, cls) + +    @classmethod +    def _extract_embed_urls(cls, url, webpage): +        """@returns all the embed urls on the webpage""" +        if '_EMBED_URL_RE' not in cls.__dict__: +            assert isinstance(cls._EMBED_REGEX, (list, tuple)) +            for idx, regex in enumerate(cls._EMBED_REGEX): +                assert regex.count('(?P<url>') == 1, \ +                    f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}' +            cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX)) + +        for regex in cls._EMBED_URL_RE: +            for mobj in regex.finditer(webpage): +                embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url'))) +                if cls._VALID_URL is False or cls.suitable(embed_url): +                    yield embed_url + +    class StopExtraction(Exception): +        pass +  class SearchInfoExtractor(InfoExtractor):      """ @@ -3826,8 +3881,8 @@ class SearchInfoExtractor(InfoExtractor):      _MAX_RESULTS = float('inf') -    @classmethod -    def _make_valid_url(cls): +    @classproperty +    def _VALID_URL(cls):          return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY      def _real_extract(self, query): diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index f8311820e..d6a6166a0 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -3,6 +3,8 @@ import re  import urllib.parse  import xml.etree.ElementTree +from . import gen_extractor_classes +from .common import InfoExtractor  # isort: split  from .ant1newsgr import Ant1NewsGrEmbedIE  from .anvato import AnvatoIE  from .apa import APAIE @@ -14,7 +16,6 @@ from .blogger import BloggerIE  from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE  from .channel9 import Channel9IE  from .cloudflarestream import CloudflareStreamIE -from .common import InfoExtractor  from .commonprotocols import RtmpIE  from .condenast import CondeNastIE  from .dailymail import DailyMailIE @@ -115,6 +116,7 @@ from ..utils import (      determine_ext,      dict_get,      float_or_none, +    format_field,      int_or_none,      is_html,      js_to_json, @@ -2641,8 +2643,15 @@ class GenericIE(InfoExtractor):          """Report information extraction."""          self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) -    def report_detected(self, name): -        self._downloader.write_debug(f'Identified a {name}') +    def report_detected(self, name, num=1, note=None): +        if num > 1: +            name += 's' +        elif not num: +            return +        else: +            num = 'a' + +        self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}')      def _extract_rss(self, url, video_id, doc):          NS_MAP = { @@ -2854,8 +2863,7 @@ class GenericIE(InfoExtractor):          if not self.get_param('test', False) and not is_intentional:              force = self.get_param('force_generic_extractor', False) -            self.report_warning( -                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back')) +            self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on'))          first_bytes = full_response.read(512) @@ -2933,6 +2941,22 @@ class GenericIE(InfoExtractor):              self.report_detected('Camtasia video')              return camtasia_res +        info_dict.update({ +            # it's tempting to parse this further, but you would +            # have to take into account all the variations like +            #   Video Title - Site Name +            #   Site Name | Video Title +            #   Video Title - Tagline | Site Name +            # and so on and so forth; it's just not practical +            'title': (self._og_search_title(webpage, default=None) +                      or self._html_extract_title(webpage, 'video title', default='video')), +            'description': self._og_search_description(webpage, default=None), +            'thumbnail': self._og_search_thumbnail(webpage, default=None), +            'age_limit': self._rta_search(webpage), +        }) + +        domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') +          # Sometimes embedded video player is hidden behind percent encoding          # (e.g. https://github.com/ytdl-org/youtube-dl/issues/2448)          # Unescaping the whole page allows to handle those cases in a generic way @@ -2946,40 +2970,12 @@ class GenericIE(InfoExtractor):              r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>',              lambda x: unescapeHTML(x.group(0)), webpage) -        # it's tempting to parse this further, but you would -        # have to take into account all the variations like -        #   Video Title - Site Name -        #   Site Name | Video Title -        #   Video Title - Tagline | Site Name -        # and so on and so forth; it's just not practical -        video_title = (self._og_search_title(webpage, default=None) -                       or self._html_extract_title(webpage, 'video title', default='video')) - -        # Try to detect age limit automatically -        age_limit = self._rta_search(webpage) -        # And then there are the jokers who advertise that they use RTA, -        # but actually don't. -        AGE_LIMIT_MARKERS = [ -            r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', -        ] -        if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): -            age_limit = 18 - -        # video uploader is domain name -        video_uploader = self._search_regex( -            r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') - -        video_description = self._og_search_description(webpage, default=None) -        video_thumbnail = self._og_search_thumbnail(webpage, default=None) - -        info_dict.update({ -            'title': video_title, -            'description': video_description, -            'thumbnail': video_thumbnail, -            'age_limit': age_limit, -        }) +        # TODO: Remove +        video_title, video_description, video_thumbnail, age_limit, video_uploader = \ +            info_dict['title'], info_dict['description'], info_dict['thumbnail'], info_dict['age_limit'], domain_name -        self._downloader.write_debug('Looking for video embeds') +        # TODO: Move Embeds +        self._downloader.write_debug('Looking for single embeds')          # Look for Brightcove Legacy Studio embeds          bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) @@ -2998,7 +2994,7 @@ class GenericIE(InfoExtractor):              }          # Look for Brightcove New Studio embeds -        bc_urls = BrightcoveNewIE._extract_urls(self, webpage) +        bc_urls = BrightcoveNewIE._extract_brightcove_urls(self, webpage)          if bc_urls:              return self.playlist_from_matches(                  bc_urls, video_id, video_title, @@ -3246,7 +3242,7 @@ class GenericIE(InfoExtractor):              return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key())          # Look for embedded Spotify player -        spotify_urls = SpotifyBaseIE._extract_embed_urls(webpage) +        spotify_urls = SpotifyBaseIE._extract_urls(webpage)          if spotify_urls:              return self.playlist_from_matches(spotify_urls, video_id, video_title) @@ -3837,6 +3833,30 @@ class GenericIE(InfoExtractor):          tiktok_urls = TikTokIE._extract_urls(webpage)          if tiktok_urls:              return self.playlist_from_matches(tiktok_urls, video_id, video_title) +        # TODO: END: Move Embeds + +        self._downloader.write_debug('Looking for embeds') +        embeds = [] +        for ie in gen_extractor_classes(): +            gen = ie.extract_from_webpage(self._downloader, url, webpage) +            current_embeds = [] +            try: +                while True: +                    current_embeds.append(next(gen)) +            except self.StopExtraction: +                self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds), +                                     embeds and 'discarding other embeds') +                embeds = current_embeds +                break +            except StopIteration: +                self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds)) +                embeds.extend(current_embeds) + +        del current_embeds +        if len(embeds) == 1: +            return {**info_dict, **embeds[0]} +        elif embeds: +            return self.playlist_result(embeds, **info_dict)          # Look for HTML5 media          entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') @@ -4119,7 +4139,6 @@ class GenericIE(InfoExtractor):                  entries.append(self.url_result(video_url, 'Youtube'))                  continue -            # here's a fun little line of code for you:              video_id = os.path.splitext(video_id)[0]              headers = {                  'referer': full_response.geturl() diff --git a/yt_dlp/extractor/spotify.py b/yt_dlp/extractor/spotify.py index fef8d8dd2..f476b7022 100644 --- a/yt_dlp/extractor/spotify.py +++ b/yt_dlp/extractor/spotify.py @@ -98,7 +98,7 @@ class SpotifyBaseIE(InfoExtractor):          }      @classmethod -    def _extract_embed_urls(cls, webpage): +    def _extract_urls(cls, webpage):          return re.findall(              r'<iframe[^>]+src="(https?://open\.spotify.com/embed/[^"]+)"',              webpage) diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index 57c9961c1..545c02763 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -705,13 +705,13 @@ def sanitize_path(s, force=False):      return os.path.join(*sanitized_path) -def sanitize_url(url): +def sanitize_url(url, *, scheme='http'):      # Prepend protocol-less URLs with `http:` scheme in order to mitigate      # the number of unwanted failures due to missing protocol      if url is None:          return      elif url.startswith('//'): -        return 'http:%s' % url +        return f'{scheme}:{url}'      # Fix some common typos seen so far      COMMON_TYPOS = (          # https://github.com/ytdl-org/youtube-dl/issues/15649 | 
