diff options
author | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-08-01 06:52:03 +0530 |
---|---|---|
committer | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-08-02 01:08:16 +0530 |
commit | 8f97a15d1c7ebc10d0b51ce24632ac17b34a5f69 (patch) | |
tree | 218588a6ee85435864e9848ce4450d2731c2e35b /yt_dlp/extractor/common.py | |
parent | 47304e07dc4a044242f7d5a14c3f6c3e5f3ad8ba (diff) | |
download | hypervideo-pre-8f97a15d1c7ebc10d0b51ce24632ac17b34a5f69.tar.lz hypervideo-pre-8f97a15d1c7ebc10d0b51ce24632ac17b34a5f69.tar.xz hypervideo-pre-8f97a15d1c7ebc10d0b51ce24632ac17b34a5f69.zip |
[extractor] Framework for embed detection (#4307)
Diffstat (limited to 'yt_dlp/extractor/common.py')
-rw-r--r-- | yt_dlp/extractor/common.py | 99 |
1 files changed, 77 insertions, 22 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index d168763e0..b8347fe4c 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -14,6 +14,7 @@ import random import re import sys import time +import types import urllib.parse import urllib.request import xml.etree.ElementTree @@ -23,6 +24,7 @@ from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name from ..downloader import FileDownloader from ..downloader.f4m import get_base_url, remove_encrypted_media from ..utils import ( + IDENTITY, JSON_LD_RE, NO_DEFAULT, ExtractorError, @@ -59,6 +61,7 @@ from ..utils import ( parse_m3u8_attributes, parse_resolution, sanitize_filename, + sanitize_url, sanitized_Request, str_or_none, str_to_int, @@ -431,14 +434,26 @@ class InfoExtractor: title, description etc. - Subclasses of this should define a _VALID_URL regexp and, re-define the - _real_extract() and (optionally) _real_initialize() methods. - Probably, they should also be added to the list of extractors. + Subclasses of this should also be added to the list of extractors and + should define a _VALID_URL regexp and, re-define the _real_extract() and + (optionally) _real_initialize() methods. Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs (except other extractors), so that lazy_extractors works correctly. + Subclasses can define a list of _EMBED_REGEX, which will be searched for in + the HTML of Generic webpages. It may also override _extract_embed_urls + or _extract_from_webpage as necessary. While these are normally classmethods, + _extract_from_webpage is allowed to be an instance method. + + _extract_from_webpage may raise self.StopExtraction() to stop further + processing of the webpage and obtain exclusive rights to it. This is useful + when the extractor cannot reliably be matched using just the URL. + Eg: invidious/peertube instances + + Embed-only extractors can be defined by setting _VALID_URL = False. + To support username + password (or netrc) login, the extractor must define a _NETRC_MACHINE and re-define _perform_login(username, password) and (optionally) _initialize_pre_login() methods. The _perform_login method will @@ -476,6 +491,8 @@ class InfoExtractor: _NETRC_MACHINE = None IE_DESC = None SEARCH_KEY = None + _VALID_URL = None + _EMBED_REGEX = [] def _login_hint(self, method=NO_DEFAULT, netrc=None): password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' @@ -499,12 +516,12 @@ class InfoExtractor: @classmethod def _match_valid_url(cls, url): + if cls._VALID_URL is False: + return None # This does not use has/getattr intentionally - we want to know whether # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: - if '_VALID_URL' not in cls.__dict__: - cls._VALID_URL = cls._make_valid_url() cls._VALID_URL_RE = re.compile(cls._VALID_URL) return cls._VALID_URL_RE.match(url) @@ -1143,10 +1160,12 @@ class InfoExtractor: 'url': url, } - def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs): - urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {})) - for m in orderedSet(map(getter, matches) if getter else matches)) - return self.playlist_result(urls, playlist_id, playlist_title, **kwargs) + @classmethod + def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None, + getter=IDENTITY, ie=None, video_kwargs=None, **kwargs): + return cls.playlist_result( + (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)), + playlist_id, playlist_title, **kwargs) @staticmethod def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs): @@ -1353,12 +1372,20 @@ class InfoExtractor: def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader') - def _rta_search(self, html): + @staticmethod + def _rta_search(html): # See http://www.rtalabel.org/index.php?content=howtofaq#single if re.search(r'(?ix)<meta\s+name="rating"\s+' r' content="RTA-5042-1996-1400-1577-RTA"', html): return 18 + + # And then there are the jokers who advertise that they use RTA, but actually don't. + AGE_LIMIT_MARKERS = [ + r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', + ] + if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS): + return 18 return 0 def _media_rating_search(self, html): @@ -1965,14 +1992,9 @@ class InfoExtractor: else 'https:') def _proto_relative_url(self, url, scheme=None): - if url is None: - return url - if url.startswith('//'): - if scheme is None: - scheme = self.http_scheme() - return scheme + url - else: - return url + scheme = scheme or self.http_scheme() + assert scheme.endswith(':') + return sanitize_url(url, scheme=scheme[:-1]) def _sleep(self, timeout, video_id, msg_template=None): if msg_template is None: @@ -3767,10 +3789,12 @@ class InfoExtractor: headers['Ytdl-request-proxy'] = geo_verification_proxy return headers - def _generic_id(self, url): + @staticmethod + def _generic_id(url): return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) - def _generic_title(self, url): + @staticmethod + def _generic_title(url): return urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) @staticmethod @@ -3816,6 +3840,37 @@ class InfoExtractor: self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}') return True + @classmethod + def extract_from_webpage(cls, ydl, url, webpage): + ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType) + else ydl.get_info_extractor(cls.ie_key())) + yield from ie._extract_from_webpage(url, webpage) or [] + + @classmethod + def _extract_from_webpage(cls, url, webpage): + for embed_url in orderedSet( + cls._extract_embed_urls(url, webpage) or [], lazy=True): + yield cls.url_result(embed_url, cls) + + @classmethod + def _extract_embed_urls(cls, url, webpage): + """@returns all the embed urls on the webpage""" + if '_EMBED_URL_RE' not in cls.__dict__: + assert isinstance(cls._EMBED_REGEX, (list, tuple)) + for idx, regex in enumerate(cls._EMBED_REGEX): + assert regex.count('(?P<url>') == 1, \ + f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}' + cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX)) + + for regex in cls._EMBED_URL_RE: + for mobj in regex.finditer(webpage): + embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url'))) + if cls._VALID_URL is False or cls.suitable(embed_url): + yield embed_url + + class StopExtraction(Exception): + pass + class SearchInfoExtractor(InfoExtractor): """ @@ -3826,8 +3881,8 @@ class SearchInfoExtractor(InfoExtractor): _MAX_RESULTS = float('inf') - @classmethod - def _make_valid_url(cls): + @classproperty + def _VALID_URL(cls): return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY def _real_extract(self, query): |