aboutsummaryrefslogtreecommitdiffstats
path: root/yt_dlp/extractor/common.py
diff options
context:
space:
mode:
authorpukkandan <pukkandan.ytdlp@gmail.com>2022-05-31 23:13:26 +0530
committerpukkandan <pukkandan.ytdlp@gmail.com>2022-06-01 01:57:16 +0530
commit617f658b7ec1193749848c1b7343acab125dbc46 (patch)
treee2489b678d04f09ee31734a3faece9b5461bc325 /yt_dlp/extractor/common.py
parent8a7f6d7a155bc0966c40736336faea81db92315b (diff)
downloadhypervideo-pre-617f658b7ec1193749848c1b7343acab125dbc46.tar.lz
hypervideo-pre-617f658b7ec1193749848c1b7343acab125dbc46.tar.xz
hypervideo-pre-617f658b7ec1193749848c1b7343acab125dbc46.zip
[extractor, cleanup] Refactor `_download_...` methods
Diffstat (limited to 'yt_dlp/extractor/common.py')
-rw-r--r--yt_dlp/extractor/common.py269
1 files changed, 101 insertions, 168 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index a589fb7fa..6f0de61df 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -791,8 +791,35 @@ class InfoExtractor:
"""
Return a tuple (page content as string, URL handle).
- See _download_webpage docstring for arguments specification.
+ Arguments:
+ url_or_request -- plain text URL as a string or
+ a compat_urllib_request.Request object
+ video_id -- Video/playlist/item identifier (string)
+
+ Keyword arguments:
+ note -- note printed before downloading (string)
+ errnote -- note printed in case of an error (string)
+ fatal -- flag denoting whether error should be considered fatal,
+ i.e. whether it should cause ExtractorError to be raised,
+ otherwise a warning will be reported and extraction continued
+ encoding -- encoding for a page content decoding, guessed automatically
+ when not explicitly specified
+ data -- POST data (bytes)
+ headers -- HTTP headers (dict)
+ query -- URL query (dict)
+ expected_status -- allows accepting failed HTTP requests (non-2xx
+ status code) by explicitly specifying a set of accepted status
+ codes. Can be any of the following entities:
+ - an integer type specifying an exact failed status code to
+ accept
+ - a list or a tuple of integer types specifying a list of
+ failed status codes to accept
+ - a callable accepting an actual failed status code and
+ returning True if it should be accepted
+ Note that this argument does not affect success status codes (2xx)
+ which are always accepted.
"""
+
# Strip hashes from the URL (#1038)
if isinstance(url_or_request, (compat_str, str)):
url_or_request = url_or_request.partition('#')[0]
@@ -887,102 +914,6 @@ class InfoExtractor:
return content
- def _download_webpage(
- self, url_or_request, video_id, note=None, errnote=None,
- fatal=True, tries=1, timeout=5, encoding=None, data=None,
- headers={}, query={}, expected_status=None):
- """
- Return the data of the page as a string.
-
- Arguments:
- url_or_request -- plain text URL as a string or
- a compat_urllib_request.Requestobject
- video_id -- Video/playlist/item identifier (string)
-
- Keyword arguments:
- note -- note printed before downloading (string)
- errnote -- note printed in case of an error (string)
- fatal -- flag denoting whether error should be considered fatal,
- i.e. whether it should cause ExtractionError to be raised,
- otherwise a warning will be reported and extraction continued
- tries -- number of tries
- timeout -- sleep interval between tries
- encoding -- encoding for a page content decoding, guessed automatically
- when not explicitly specified
- data -- POST data (bytes)
- headers -- HTTP headers (dict)
- query -- URL query (dict)
- expected_status -- allows to accept failed HTTP requests (non 2xx
- status code) by explicitly specifying a set of accepted status
- codes. Can be any of the following entities:
- - an integer type specifying an exact failed status code to
- accept
- - a list or a tuple of integer types specifying a list of
- failed status codes to accept
- - a callable accepting an actual failed status code and
- returning True if it should be accepted
- Note that this argument does not affect success status codes (2xx)
- which are always accepted.
- """
-
- success = False
- try_count = 0
- while success is False:
- try:
- res = self._download_webpage_handle(
- url_or_request, video_id, note, errnote, fatal,
- encoding=encoding, data=data, headers=headers, query=query,
- expected_status=expected_status)
- success = True
- except compat_http_client.IncompleteRead as e:
- try_count += 1
- if try_count >= tries:
- raise e
- self._sleep(timeout, video_id)
- if res is False:
- return res
- else:
- content, _ = res
- return content
-
- def _download_xml_handle(
- self, url_or_request, video_id, note='Downloading XML',
- errnote='Unable to download XML', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={},
- expected_status=None):
- """
- Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
-
- See _download_webpage docstring for arguments specification.
- """
- res = self._download_webpage_handle(
- url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding, data=data, headers=headers, query=query,
- expected_status=expected_status)
- if res is False:
- return res
- xml_string, urlh = res
- return self._parse_xml(
- xml_string, video_id, transform_source=transform_source,
- fatal=fatal), urlh
-
- def _download_xml(
- self, url_or_request, video_id,
- note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True, encoding=None,
- data=None, headers={}, query={}, expected_status=None):
- """
- Return the xml as an xml.etree.ElementTree.Element.
-
- See _download_webpage docstring for arguments specification.
- """
- res = self._download_xml_handle(
- url_or_request, video_id, note=note, errnote=errnote,
- transform_source=transform_source, fatal=fatal, encoding=encoding,
- data=data, headers=headers, query=query,
- expected_status=expected_status)
- return res if res is False else res[0]
-
def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
if transform_source:
xml_string = transform_source(xml_string)
@@ -995,44 +926,6 @@ class InfoExtractor:
else:
self.report_warning(errmsg + str(ve))
- def _download_json_handle(
- self, url_or_request, video_id, note='Downloading JSON metadata',
- errnote='Unable to download JSON metadata', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={},
- expected_status=None):
- """
- Return a tuple (JSON object, URL handle).
-
- See _download_webpage docstring for arguments specification.
- """
- res = self._download_webpage_handle(
- url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding, data=data, headers=headers, query=query,
- expected_status=expected_status)
- if res is False:
- return res
- json_string, urlh = res
- return self._parse_json(
- json_string, video_id, transform_source=transform_source,
- fatal=fatal), urlh
-
- def _download_json(
- self, url_or_request, video_id, note='Downloading JSON metadata',
- errnote='Unable to download JSON metadata', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={},
- expected_status=None):
- """
- Return the JSON object as a dict.
-
- See _download_webpage docstring for arguments specification.
- """
- res = self._download_json_handle(
- url_or_request, video_id, note=note, errnote=errnote,
- transform_source=transform_source, fatal=fatal, encoding=encoding,
- data=data, headers=headers, query=query,
- expected_status=expected_status)
- return res if res is False else res[0]
-
def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, lenient=False):
if transform_source:
json_string = transform_source(json_string)
@@ -1058,43 +951,83 @@ class InfoExtractor:
data[data.find('{'):data.rfind('}') + 1],
video_id, transform_source, fatal)
- def _download_socket_json_handle(
- self, url_or_request, video_id, note='Polling socket',
- errnote='Unable to poll socket', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={},
- expected_status=None):
- """
- Return a tuple (JSON object, URL handle).
+ def __create_download_methods(name, parser, note, errnote, return_value):  # factory called from the class body; returns the (handle, content) method pair for one content type
+
+ def parse(ie, content, *args, **kwargs):  # turn the downloaded text into the final value
+ if parser is None:
+ return content  # no parser configured: hand the page text back unchanged
+ # parser is the *name* of a method; it is looked up on the instance at call time so subclasses can override it
+ return getattr(ie, parser)(content, *args, **kwargs)
+
+ def download_handle(self, url_or_request, video_id, note=note, errnote=errnote,
+ transform_source=None, fatal=True, *args, **kwargs):  # note/errnote default to the factory's arguments
+ res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, *args, **kwargs)
+ if res is False:
+ return res  # propagate the False result instead of trying to parse it
+ content, urlh = res
+ return parse(self, content, video_id, transform_source, fatal), urlh  # (parsed value, URL handle)
+
+ def download_content(
+ self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, *args, **kwargs):  # same call as the handle variant, minus the URL handle in the result
+ args = [url_or_request, video_id, note, errnote, transform_source, *args]
+ if parser is None:
+ args.pop(4) # transform_source -- dropped so the next positional (fatal) lines up with the parser-less handle call above
+ # The handle method is looked up by its (impersonated) name so subclasses can override _download_..._handle
+ res = getattr(self, download_handle.__name__)(*args, **kwargs)
+ return res if res is False else res[0]  # keep False as-is, otherwise return only the parsed content
+
+ def impersonate(func, name, return_value):  # rename func and attach a generated docstring
+ func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}'
+ func.__doc__ = f'''
+ @param transform_source Apply this transformation before parsing
+ @returns {return_value}
+
+ See _download_webpage_handle docstring for other arguments specification
+ '''
+
+ impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)')  # download_content reads this new __name__ at call time
+ impersonate(download_content, f'_download_{name}', f'{return_value}')
+ return download_handle, download_content
+
+ _download_xml_handle, _download_xml = __create_download_methods(  # each call below runs while the class body executes
+ 'xml', '_parse_xml', 'Downloading XML', 'Unable to download XML', 'xml as an xml.etree.ElementTree.Element')
+ _download_json_handle, _download_json = __create_download_methods(
+ 'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict')
+ _download_socket_json_handle, _download_socket_json = __create_download_methods(
+ 'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict')
+ __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1]  # parser=None; only the content variant ([1]) is kept
- See _download_webpage docstring for arguments specification.
- """
- res = self._download_webpage_handle(
- url_or_request, video_id, note, errnote, fatal=fatal,
- encoding=encoding, data=data, headers=headers, query=query,
- expected_status=expected_status)
- if res is False:
- return res
- webpage, urlh = res
- return self._parse_socket_response_as_json(
- webpage, video_id, transform_source=transform_source,
- fatal=fatal), urlh
-
- def _download_socket_json(
- self, url_or_request, video_id, note='Polling socket',
- errnote='Unable to poll socket', transform_source=None,
- fatal=True, encoding=None, data=None, headers={}, query={},
- expected_status=None):
+ def _download_webpage(
+ self, url_or_request, video_id, note=None, errnote=None,
+ fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs):
"""
- Return the JSON object as a dict.
+ Return the data of the page as a string.
- See _download_webpage docstring for arguments specification.
+ Keyword arguments:
+ tries -- total number of attempts; only IncompleteRead triggers a retry
+ timeout -- sleep interval between tries (defaults to 5)
+
+ See _download_webpage_handle docstring for other arguments specification.
"""
- res = self._download_socket_json_handle(
- url_or_request, video_id, note=note, errnote=errnote,
- transform_source=transform_source, fatal=fatal, encoding=encoding,
- data=data, headers=headers, query=query,
- expected_status=expected_status)
- return res if res is False else res[0]
+
+ R''' # NB: These are unused; should they be deprecated?
+ if tries != 1:
+ self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage')
+ if timeout is not NO_DEFAULT:
+ self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage')
+ '''
+ if timeout is NO_DEFAULT:
+ timeout = 5  # pre-refactor default; the sentinel must never reach self._sleep()
+
+ try_count = 0
+ while True:
+ try:
+ return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs)  # None fills the transform_source slot
+ except compat_http_client.IncompleteRead as e:  # only truncated reads are retried
+ try_count += 1
+ if try_count >= tries:
+ raise e  # attempts exhausted: surface the last IncompleteRead
+ self._sleep(timeout, video_id)
def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs):
idstr = format_field(video_id, template='%s: ')