author     Jesús <heckyel@hyperbola.info>    2022-03-22 00:48:28 +0800
committer  Jesús <heckyel@hyperbola.info>    2022-03-22 00:48:28 +0800
commit     7a74bc5d1e54299e51b73492e09c70da994f4b35 (patch)
tree       e59a64b5b386d2381906e99912153aabd5d4ab0d
parent     3c69360ec3cb4a951d7e37150c7cfae8a0491cd2 (diff)
parent     84842aee2ba8dc50601c86dc6fbb12d0fa438449 (diff)
download   hypervideo-pre-7a74bc5d1e54299e51b73492e09c70da994f4b35.tar.lz
           hypervideo-pre-7a74bc5d1e54299e51b73492e09c70da994f4b35.tar.xz
           hypervideo-pre-7a74bc5d1e54299e51b73492e09c70da994f4b35.zip
updated from upstream | 22/03/2022 at 00:48
99 files changed, 1266 insertions, 1414 deletions
diff --git a/docs/.gitignore b/docs/.gitignore
deleted file mode 100644
index 69fa449dd..000000000
--- a/docs/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-_build/
diff --git a/docs/Changelog.md b/docs/Changelog.md
deleted file mode 100644
index 99de25fb1..000000000
--- a/docs/Changelog.md
+++ /dev/null
@@ -1,5 +0,0 @@
----
-orphan: true
----
-```{include} ../Changelog.md
-```
diff --git a/docs/Collaborators.md b/docs/Collaborators.md
deleted file mode 100644
index 5f493d814..000000000
--- a/docs/Collaborators.md
+++ /dev/null
@@ -1,5 +0,0 @@
----
-orphan: true
----
-```{include} ../Collaborators.md
-```
diff --git a/docs/Contributing.md b/docs/Contributing.md
deleted file mode 100644
index 60fe46909..000000000
--- a/docs/Contributing.md
+++ /dev/null
@@ -1,5 +0,0 @@
----
-orphan: true
----
-```{include} ../Contributing.md
-```
diff --git a/docs/LICENSE.md b/docs/LICENSE.md
deleted file mode 100644
index 8521669f8..000000000
--- a/docs/LICENSE.md
+++ /dev/null
@@ -1,6 +0,0 @@
----
-orphan: true
----
-# LICENSE
-```{include} ../LICENSE
-```
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index 1a8e3cb1c..000000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,177 +0,0 @@
-# Makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line.
-SPHINXOPTS    =
-SPHINXBUILD   = sphinx-build
-PAPER         =
-BUILDDIR      = _build
-
-# User-friendly check for sphinx-build
-ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
-$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
-endif
-
-# Internal variables.
-PAPEROPT_a4     = -D latex_paper_size=a4
-PAPEROPT_letter = -D latex_paper_size=letter
-ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-# the i18n builder cannot share the environment and doctrees with the others
-I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
-
-.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
-
-help:
-	@echo "Please use \`make <target>' where <target> is one of"
-	@echo "  html       to make standalone HTML files"
-	@echo "  dirhtml    to make HTML files named index.html in directories"
-	@echo "  singlehtml to make a single large HTML file"
-	@echo "  pickle     to make pickle files"
-	@echo "  json       to make JSON files"
-	@echo "  htmlhelp   to make HTML files and a HTML help project"
-	@echo "  qthelp     to make HTML files and a qthelp project"
-	@echo "  devhelp    to make HTML files and a Devhelp project"
-	@echo "  epub       to make an epub"
-	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
-	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
-	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
-	@echo "  text       to make text files"
-	@echo "  man        to make manual pages"
-	@echo "  texinfo    to make Texinfo files"
-	@echo "  info       to make Texinfo files and run them through makeinfo"
-	@echo "  gettext    to make PO message catalogs"
-	@echo "  changes    to make an overview of all changed/added/deprecated items"
-	@echo "  xml        to make Docutils-native XML files"
-	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
-	@echo "  linkcheck  to check all external links for integrity"
-	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
-
-clean:
-	rm -rf $(BUILDDIR)/*
-
-html:
-	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
-
-dirhtml:
-	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
-	@echo
-	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
-
-singlehtml:
-	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
-	@echo
-	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
-
-pickle:
-	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
-	@echo
-	@echo "Build finished; now you can process the pickle files."
-
-json:
-	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
-	@echo
-	@echo "Build finished; now you can process the JSON files."
-
-htmlhelp:
-	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
-	@echo
-	@echo "Build finished; now you can run HTML Help Workshop with the" \
-	      ".hhp project file in $(BUILDDIR)/htmlhelp."
-
-qthelp:
-	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
-	@echo
-	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
-	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
-	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/yt-dlp.qhcp"
-	@echo "To view the help file:"
-	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/yt-dlp.qhc"
-
-devhelp:
-	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
-	@echo
-	@echo "Build finished."
-	@echo "To view the help file:"
-	@echo "# mkdir -p $$HOME/.local/share/devhelp/yt-dlp"
-	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/yt-dlp"
-	@echo "# devhelp"
-
-epub:
-	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
-	@echo
-	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
-
-latex:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo
-	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
-	@echo "Run \`make' in that directory to run these through (pdf)latex" \
-	      "(use \`make latexpdf' here to do that automatically)."
-
-latexpdf:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through pdflatex..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-latexpdfja:
-	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
-	@echo "Running LaTeX files through platex and dvipdfmx..."
-	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
-	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
-
-text:
-	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
-	@echo
-	@echo "Build finished. The text files are in $(BUILDDIR)/text."
-
-man:
-	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
-	@echo
-	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
-
-texinfo:
-	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
-	@echo
-	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
-	@echo "Run \`make' in that directory to run these through makeinfo" \
-	      "(use \`make info' here to do that automatically)."
-
-info:
-	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
-	@echo "Running Texinfo files through makeinfo..."
-	make -C $(BUILDDIR)/texinfo info
-	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
-
-gettext:
-	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
-	@echo
-	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
-
-changes:
-	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
-	@echo
-	@echo "The overview file is in $(BUILDDIR)/changes."
-
-linkcheck:
-	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
-	@echo
-	@echo "Link check complete; look for any errors in the above output " \
-	      "or in $(BUILDDIR)/linkcheck/output.txt."
-
-doctest:
-	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
-	@echo "Testing of doctests in the sources finished, look at the " \
-	      "results in $(BUILDDIR)/doctest/output.txt."
-
-xml:
-	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
-	@echo
-	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
-
-pseudoxml:
-	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
-	@echo
-	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/docs/README.md b/docs/README.md
deleted file mode 100644
index 451bedaec..000000000
--- a/docs/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-```{include} ../README.md
-```
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index c4010bbc7..000000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# coding: utf-8
-#
-# yt-dlp documentation build configuration file
-
-import sys
-import os
-
-# Allows to import yt-dlp
-sys.path.insert(0, os.path.abspath('..'))
-
-# -- General configuration ------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
-    'myst_parser',
-]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-# The master toctree document.
-master_doc = 'README'
-
-# General information about the project.
-project = u'yt-dlp'
-author = u'yt-dlp'
-copyright = u'UNLICENSE'
-
-# The version info for the project you're documenting, acts as replacement for
-# |version| and |release|, also used in various other places throughout the
-# built documents.
-#
-# The short X.Y version.
-from yt_dlp.version import __version__
-version = __version__
-# The full version, including alpha/beta/rc tags.
-release = version
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-exclude_patterns = ['_build']
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
-
-# -- Options for HTML output ----------------------------------------------
-
-# The theme to use for HTML and HTML Help pages.  See the documentation for
-# a list of builtin themes.
-html_theme = 'default'
-
-# Disable highlights
-highlight_language = 'none'
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-# html_static_path = ['_static']
-
-# Enable heading anchors
-myst_heading_anchors = 4
-
-# Suppress heading warnings
-suppress_warnings = [
-    'myst.header',
-]
diff --git a/docs/requirements.txt b/docs/requirements.txt
deleted file mode 100644
index f0694bdc0..000000000
--- a/docs/requirements.txt
+++ /dev/null
@@ -1 +0,0 @@
-myst-parser
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
deleted file mode 100644
index 55c023415..000000000
--- a/docs/supportedsites.md
+++ /dev/null
@@ -1,5 +0,0 @@
----
-orphan: true
----
-```{include} ../supportedsites.md
-```
diff --git a/docs/ytdlp_plugins.md b/docs/ytdlp_plugins.md
deleted file mode 100644
index 483b9c46e..000000000
--- a/docs/ytdlp_plugins.md
+++ /dev/null
@@ -1,6 +0,0 @@
----
-orphan: true
----
-# ytdlp_plugins
-
-See [https://github.com/yt-dlp/yt-dlp/tree/master/ytdlp_plugins](https://github.com/yt-dlp/yt-dlp/tree/master/ytdlp_plugins).
diff --git a/test/test_netrc.py b/test/test_netrc.py index 36b943591..94a703406 100644 --- a/test/test_netrc.py +++ b/test/test_netrc.py @@ -7,18 +7,19 @@ import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from yt_dlp.extractor import ( - gen_extractors, -) +from yt_dlp.extractor import gen_extractor_classes +from yt_dlp.extractor.common import InfoExtractor + +NO_LOGIN = InfoExtractor._perform_login class TestNetRc(unittest.TestCase): def test_netrc_present(self): - for ie in gen_extractors(): - if not hasattr(ie, '_login'): + for ie in gen_extractor_classes(): + if ie._perform_login is NO_LOGIN: continue self.assertTrue( - hasattr(ie, '_NETRC_MACHINE'), + ie._NETRC_MACHINE, 'Extractor %s supports login, but is missing a _NETRC_MACHINE property' % ie.IE_NAME) diff --git a/test/test_utils.py b/test/test_utils.py index 6be5bb642..a7f1b0e94 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -1780,6 +1780,7 @@ Line 1 self.assertEqual(format_bytes(1024**6), '1.00EiB') self.assertEqual(format_bytes(1024**7), '1.00ZiB') self.assertEqual(format_bytes(1024**8), '1.00YiB') + self.assertEqual(format_bytes(1024**9), '1024.00YiB') def test_hide_login_info(self): self.assertEqual(Config.hide_login_info(['-u', 'foo', '-p', 'bar']), diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 150764629..a5c7348b2 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1427,7 +1427,7 @@ class YoutubeDL(object): min_wait, max_wait = self.params.get('wait_for_video') diff = try_get(ie_result, lambda x: x['release_timestamp'] - time.time()) if diff is None and ie_result.get('live_status') == 'is_upcoming': - diff = random.randrange(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait) + diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0) self.report_warning('Release time of video is not known') elif (diff or 0) <= 0: self.report_warning('Video should already be available according to extracted info') @@ -2858,14 +2858,13 @@ class YoutubeDL(object): # Does nothing under normal operation - for backward compatibility of process_info self.post_extract(info_dict) + self._num_downloads += 1 # info_dict['_filename'] needs to be set for backward compatibility info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True) temp_filename = self.prepare_filename(info_dict, 'temp') files_to_move = {} - self._num_downloads += 1 - # Forced printings self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict)) diff --git a/yt_dlp/downloader/fragment.py b/yt_dlp/downloader/fragment.py index 95fb2f9e7..6b75dfc62 100644 --- a/yt_dlp/downloader/fragment.py +++ b/yt_dlp/downloader/fragment.py @@ -133,19 +133,19 @@ class FragmentFD(FileDownloader): } success = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: - return False, None + return False if fragment_info_dict.get('filetime'): ctx['fragment_filetime'] = fragment_info_dict.get('filetime') ctx['fragment_filename_sanitized'] = fragment_filename - try: - return True, self._read_fragment(ctx) - except FileNotFoundError: - if not info_dict.get('is_live'): - raise - return False, None + return True def _read_fragment(self, ctx): - down, frag_sanitized = self.sanitize_open(ctx['fragment_filename_sanitized'], 'rb') + try: + down, frag_sanitized = self.sanitize_open(ctx['fragment_filename_sanitized'], 'rb') + except FileNotFoundError: + if ctx.get('live'): + return None + 
raise ctx['fragment_filename_sanitized'] = frag_sanitized frag_content = down.read() down.close() @@ -457,7 +457,7 @@ class FragmentFD(FileDownloader): def download_fragment(fragment, ctx): if not interrupt_trigger[0]: - return False, fragment['frag_index'] + return frag_index = ctx['fragment_index'] = fragment['frag_index'] ctx['last_error'] = None @@ -467,14 +467,12 @@ class FragmentFD(FileDownloader): headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) # Never skip the first fragment - fatal = is_fatal(fragment.get('index') or (frag_index - 1)) - count, frag_content = 0, None + fatal, count = is_fatal(fragment.get('index') or (frag_index - 1)), 0 while count <= fragment_retries: try: - success, frag_content = self._download_fragment(ctx, fragment['url'], info_dict, headers) - if not success: - return False, frag_index - break + if self._download_fragment(ctx, fragment['url'], info_dict, headers): + break + return except (compat_urllib_error.HTTPError, http.client.IncompleteRead) as err: # Unavailable (possibly temporary) fragments may be served. # First we try to retry then either skip or abort. @@ -491,13 +489,9 @@ class FragmentFD(FileDownloader): break raise - if count > fragment_retries: - if not fatal: - return False, frag_index + if count > fragment_retries and fatal: ctx['dest_stream'].close() self.report_error('Giving up after %s fragment retries' % fragment_retries) - return False, frag_index - return frag_content, frag_index def append_fragment(frag_content, frag_index, ctx): if not frag_content: @@ -520,23 +514,23 @@ class FragmentFD(FileDownloader): def _download_fragment(fragment): ctx_copy = ctx.copy() - frag_content, frag_index = download_fragment(fragment, ctx_copy) - return fragment, frag_content, frag_index, ctx_copy.get('fragment_filename_sanitized') + download_fragment(fragment, ctx_copy) + return fragment, fragment['frag_index'], ctx_copy.get('fragment_filename_sanitized') self.report_warning('The download speed shown is only of one thread. 
This is a known issue and patches are welcome') with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: - for fragment, frag_content, frag_index, frag_filename in pool.map(_download_fragment, fragments): + for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): ctx['fragment_filename_sanitized'] = frag_filename ctx['fragment_index'] = frag_index - result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx) + result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), frag_index, ctx) if not result: return False else: for fragment in fragments: if not interrupt_trigger[0]: break - frag_content, frag_index = download_fragment(fragment, ctx) - result = append_fragment(decrypt_fragment(fragment, frag_content), frag_index, ctx) + download_fragment(fragment, ctx) + result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), fragment['frag_index'], ctx) if not result: return False diff --git a/yt_dlp/downloader/http.py b/yt_dlp/downloader/http.py index 10ba61024..8e096b76b 100644 --- a/yt_dlp/downloader/http.py +++ b/yt_dlp/downloader/http.py @@ -1,8 +1,7 @@ from __future__ import unicode_literals -import errno import os -import socket +import ssl import time import random @@ -10,6 +9,7 @@ from .common import FileDownloader from ..compat import ( compat_str, compat_urllib_error, + compat_http_client ) from ..utils import ( ContentTooShortError, @@ -18,11 +18,14 @@ from ..utils import ( parse_http_range, sanitized_Request, ThrottledDownload, + try_get, write_xattr, XAttrMetadataError, XAttrUnavailableError, ) +RESPONSE_READ_EXCEPTIONS = (TimeoutError, ConnectionError, ssl.SSLError, compat_http_client.HTTPException) + class HttpFD(FileDownloader): def real_download(self, filename, info_dict): @@ -53,7 +56,6 @@ class HttpFD(FileDownloader): ctx.open_mode = 'wb' ctx.resume_len = 0 - ctx.data_len = None ctx.block_size = self.params.get('buffersize', 1024) ctx.start_time = time.time() ctx.chunk_size = None @@ -100,6 +102,8 @@ class HttpFD(FileDownloader): if ctx.is_resume: self.report_resuming_byte(ctx.resume_len) ctx.open_mode = 'ab' + elif req_start is not None: + range_start = req_start elif ctx.chunk_size > 0: range_start = 0 else: @@ -116,23 +120,21 @@ class HttpFD(FileDownloader): else: range_end = None - if range_end and ctx.data_len is not None and range_end >= ctx.data_len: - range_end = ctx.data_len - 1 - has_range = range_start is not None - ctx.has_range = has_range + if try_get(None, lambda _: range_start > range_end): + ctx.resume_len = 0 + ctx.open_mode = 'wb' + raise RetryDownload(Exception(f'Conflicting range. (start={range_start} > end={range_end})')) + + if try_get(None, lambda _: range_end >= ctx.content_len): + range_end = ctx.content_len - 1 + request = sanitized_Request(url, request_data, headers) + has_range = range_start is not None if has_range: set_range(request, range_start, range_end) # Establish connection try: - try: - ctx.data = self.ydl.urlopen(request) - except (compat_urllib_error.URLError, ) as err: - # reason may not be available, e.g. for urllib2.HTTPError on python 2.6 - reason = getattr(err, 'reason', None) - if isinstance(reason, socket.timeout): - raise RetryDownload(err) - raise err + ctx.data = self.ydl.urlopen(request) # When trying to resume, Content-Range HTTP header of response has to be checked # to match the value of requested Range HTTP header. 
This is due to a webservers # that don't support resuming and serve a whole file with no Content-Range @@ -151,7 +153,8 @@ class HttpFD(FileDownloader): or content_range_end == range_end or content_len < range_end) if accept_content_len: - ctx.data_len = content_len + ctx.content_len = content_len + ctx.data_len = min(content_len, req_end or content_len) - (req_start or 0) return # Content-Range is either not present or invalid. Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file @@ -159,8 +162,7 @@ class HttpFD(FileDownloader): self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' - ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) - return + ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None)) except (compat_urllib_error.HTTPError, ) as err: if err.code == 416: # Unable to resume (requested range not satisfiable) @@ -202,13 +204,14 @@ class HttpFD(FileDownloader): # Unexpected HTTP error raise raise RetryDownload(err) - except socket.timeout as err: + except compat_urllib_error.URLError as err: + if isinstance(err.reason, ssl.CertificateError): + raise + raise RetryDownload(err) + # In urllib.request.AbstractHTTPHandler, the response is partially read on request. + # Any errors that occur during this will not be wrapped by URLError + except RESPONSE_READ_EXCEPTIONS as err: raise RetryDownload(err) - except socket.error as err: - if err.errno in (errno.ECONNRESET, errno.ETIMEDOUT): - # Connection reset is no problem, just retry - raise RetryDownload(err) - raise def download(): nonlocal throttle_start @@ -254,16 +257,8 @@ class HttpFD(FileDownloader): try: # Download and write data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter)) - # socket.timeout is a subclass of socket.error but may not have - # errno set - except socket.timeout as e: - retry(e) - except socket.error as e: - # SSLError on python 2 (inherits socket.error) may have - # no errno set but this error message - if e.errno in (errno.ECONNRESET, errno.ETIMEDOUT) or getattr(e, 'message', None) == 'The read operation timed out': - retry(e) - raise + except RESPONSE_READ_EXCEPTIONS as err: + retry(err) byte_counter += len(data_block) @@ -343,7 +338,7 @@ class HttpFD(FileDownloader): elif speed: throttle_start = None - if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: + if not is_test and ctx.chunk_size and ctx.content_len is not None and byte_counter < ctx.content_len: ctx.resume_len = byte_counter # ctx.block_size = block_size raise NextFragment() diff --git a/yt_dlp/downloader/ism.py b/yt_dlp/downloader/ism.py index 09516abe5..4d5618c83 100644 --- a/yt_dlp/downloader/ism.py +++ b/yt_dlp/downloader/ism.py @@ -263,9 +263,11 @@ class IsmFD(FragmentFD): count = 0 while count <= fragment_retries: try: - success, frag_content = self._download_fragment(ctx, segment['url'], info_dict) + success = self._download_fragment(ctx, segment['url'], info_dict) if not success: return False + frag_content = self._read_fragment(ctx) + if not extra_state['ism_track_written']: tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd']) info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0] diff --git a/yt_dlp/downloader/mhtml.py b/yt_dlp/downloader/mhtml.py index bc86fd1bf..54e711792 100644 --- a/yt_dlp/downloader/mhtml.py +++ b/yt_dlp/downloader/mhtml.py @@ -171,9 +171,10 @@ body > figure > img { 
assert fragment_base_url fragment_url = urljoin(fragment_base_url, fragment['path']) - success, frag_content = self._download_fragment(ctx, fragment_url, info_dict) + success = self._download_fragment(ctx, fragment_url, info_dict) if not success: continue + frag_content = self._read_fragment(ctx) mime_type = b'image/jpeg' if frag_content.startswith(b'\x89PNG\r\n\x1a\n'): diff --git a/yt_dlp/downloader/youtube_live_chat.py b/yt_dlp/downloader/youtube_live_chat.py index b28d1ec17..cfca686ee 100644 --- a/yt_dlp/downloader/youtube_live_chat.py +++ b/yt_dlp/downloader/youtube_live_chat.py @@ -115,9 +115,10 @@ class YoutubeLiveChatFD(FragmentFD): count = 0 while count <= fragment_retries: try: - success, raw_fragment = dl_fragment(url, request_data, headers) + success = dl_fragment(url, request_data, headers) if not success: return False, None, None, None + raw_fragment = self._read_fragment(ctx) try: data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace')) except RegexNotFoundError: @@ -145,9 +146,10 @@ class YoutubeLiveChatFD(FragmentFD): self._prepare_and_start_frag_download(ctx, info_dict) - success, raw_fragment = dl_fragment(info_dict['url']) + success = dl_fragment(info_dict['url']) if not success: return False + raw_fragment = self._read_fragment(ctx) try: data = ie.extract_yt_initial_data(video_id, raw_fragment.decode('utf-8', 'replace')) except RegexNotFoundError: diff --git a/yt_dlp/extractor/abematv.py b/yt_dlp/extractor/abematv.py index 360fa4699..a839f0c1f 100644 --- a/yt_dlp/extractor/abematv.py +++ b/yt_dlp/extractor/abematv.py @@ -291,15 +291,7 @@ class AbemaTVIE(AbemaTVBaseIE): return self._MEDIATOKEN - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - # No authentication to be performed - if not username: - return True - + def _perform_login(self, username, password): if '@' in username: # don't strictly check if it's email address or not ep, method = 'user/email', 'email' else: diff --git a/yt_dlp/extractor/adn.py b/yt_dlp/extractor/adn.py index 0863e0d85..fca6e605d 100644 --- a/yt_dlp/extractor/adn.py +++ b/yt_dlp/extractor/adn.py @@ -126,10 +126,7 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' }]) return subtitles - def _real_initialize(self): - username, password = self._get_login_info() - if not username: - return + def _perform_login(self, username, password): try: access_token = (self._download_json( self._API_BASE_URL + 'authentication/login', None, diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index f25fc47fa..77f0e3c10 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -32,7 +32,7 @@ class AfreecaTVIE(InfoExtractor): /app/(?:index|read_ucc_bbs)\.cgi| /player/[Pp]layer\.(?:swf|html) )\?.*?\bnTitleNo=| - vod\.afreecatv\.com/PLAYER/STATION/ + vod\.afreecatv\.com/(PLAYER/STATION|player)/ ) (?P<id>\d+) ''' @@ -170,6 +170,9 @@ class AfreecaTVIE(InfoExtractor): }, { 'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030', 'only_matching': True, + }, { + 'url': 'http://vod.afreecatv.com/player/15055030', + 'only_matching': True, }] @staticmethod @@ -181,14 +184,7 @@ class AfreecaTVIE(InfoExtractor): video_key['part'] = int(m.group('part')) return video_key - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_form = { 'szWork': 'login', 'szType': 
'json', diff --git a/yt_dlp/extractor/alura.py b/yt_dlp/extractor/alura.py index f5325de2f..d2e2df270 100644 --- a/yt_dlp/extractor/alura.py +++ b/yt_dlp/extractor/alura.py @@ -74,14 +74,7 @@ class AluraIE(InfoExtractor): "formats": formats } - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - pass + def _perform_login(self, username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login popup') diff --git a/yt_dlp/extractor/animelab.py b/yt_dlp/extractor/animelab.py index 4fb7ee424..1c2cc47dd 100644 --- a/yt_dlp/extractor/animelab.py +++ b/yt_dlp/extractor/animelab.py @@ -15,25 +15,21 @@ from ..compat import compat_HTTPError class AnimeLabBaseIE(InfoExtractor): - _LOGIN_REQUIRED = True _LOGIN_URL = 'https://www.animelab.com/login' _NETRC_MACHINE = 'animelab' + _LOGGED_IN = False - def _login(self): - def is_logged_in(login_webpage): - return 'Sign In' not in login_webpage + def _is_logged_in(self, login_page=None): + if not self._LOGGED_IN: + if not login_page: + login_page = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page') + AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in login_page + return self._LOGGED_IN - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - # Check if already logged in - if is_logged_in(login_page): + def _perform_login(self, username, password): + if self._is_logged_in(): return - (username, password) = self._get_login_info() - if username is None and self._LOGIN_REQUIRED: - self.raise_login_required('Login is required to access any AnimeLab content') - login_form = { 'email': username, 'password': password, @@ -47,17 +43,14 @@ class AnimeLabBaseIE(InfoExtractor): except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: raise ExtractorError('Unable to log in (wrong credentials?)', expected=True) - else: - raise + raise - # if login was successful - if is_logged_in(response): - return - - raise ExtractorError('Unable to login (cannot verify if logged in)') + if not self._is_logged_in(response): + raise ExtractorError('Unable to login (cannot verify if logged in)') def _real_initialize(self): - self._login() + if not self._is_logged_in(): + self.raise_login_required('Login is required to access any AnimeLab content') class AnimeLabIE(AnimeLabBaseIE): diff --git a/yt_dlp/extractor/animeondemand.py b/yt_dlp/extractor/animeondemand.py index 5694f7240..2e674d58f 100644 --- a/yt_dlp/extractor/animeondemand.py +++ b/yt_dlp/extractor/animeondemand.py @@ -53,11 +53,7 @@ class AnimeOnDemandIE(InfoExtractor): 'only_matching': True, }] - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') @@ -93,9 +89,6 @@ class AnimeOnDemandIE(InfoExtractor): raise ExtractorError('Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') - def _real_initialize(self): - self._login() - def _real_extract(self, url): anime_id = self._match_id(url) diff --git a/yt_dlp/extractor/arte.py b/yt_dlp/extractor/arte.py index a7ffdc24c..c2f2c1bd3 100644 --- a/yt_dlp/extractor/arte.py +++ b/yt_dlp/extractor/arte.py @@ -138,6 +138,7 @@ class ArteTVIE(ArteTVBaseIE): break else: lang_pref = -1 + format_note = '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')) media_type = 
f.get('mediaType') if media_type == 'hls': @@ -145,14 +146,17 @@ class ArteTVIE(ArteTVBaseIE): format_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) for m3u8_format in m3u8_formats: - m3u8_format['language_preference'] = lang_pref + m3u8_format.update({ + 'language_preference': lang_pref, + 'format_note': format_note, + }) formats.extend(m3u8_formats) continue format = { 'format_id': format_id, 'language_preference': lang_pref, - 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')), + 'format_note': format_note, 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'tbr': int_or_none(f.get('bitrate')), diff --git a/yt_dlp/extractor/atresplayer.py b/yt_dlp/extractor/atresplayer.py index 6d843966a..465af4ed3 100644 --- a/yt_dlp/extractor/atresplayer.py +++ b/yt_dlp/extractor/atresplayer.py @@ -37,9 +37,6 @@ class AtresPlayerIE(InfoExtractor): ] _API_BASE = 'https://api.atresplayer.com/' - def _real_initialize(self): - self._login() - def _handle_error(self, e, code): if isinstance(e.cause, compat_HTTPError) and e.cause.code == code: error = self._parse_json(e.cause.read(), None) @@ -48,11 +45,7 @@ class AtresPlayerIE(InfoExtractor): raise ExtractorError(error['error_description'], expected=True) raise - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): self._request_webpage( self._API_BASE + 'login', None, 'Downloading login page') diff --git a/yt_dlp/extractor/azmedien.py b/yt_dlp/extractor/azmedien.py index fee640e14..b3cabbf94 100644 --- a/yt_dlp/extractor/azmedien.py +++ b/yt_dlp/extractor/azmedien.py @@ -15,7 +15,8 @@ class AZMedienIE(InfoExtractor): (?P<host> telezueri\.ch| telebaern\.tv| - telem1\.ch + telem1\.ch| + tvo-online\.ch )/ [^/]+/ (?P<id> diff --git a/yt_dlp/extractor/bbc.py b/yt_dlp/extractor/bbc.py index b664a7007..823155730 100644 --- a/yt_dlp/extractor/bbc.py +++ b/yt_dlp/extractor/bbc.py @@ -264,11 +264,7 @@ class BBCCoUkIE(InfoExtractor): 'only_matching': True, }] - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login_page = self._download_webpage( self._LOGIN_URL, None, 'Downloading signin page') @@ -294,9 +290,6 @@ class BBCCoUkIE(InfoExtractor): 'Unable to login: %s' % error, expected=True) raise ExtractorError('Unable to log in') - def _real_initialize(self): - self._login() - class MediaSelectionError(Exception): def __init__(self, id): self.id = id diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index 1bbf7ca1c..b4eb20642 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -821,11 +821,7 @@ class BiliIntlBaseIE(InfoExtractor): 'extractor_key': BiliIntlIE.ie_key(), } - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): try: from Cryptodome.PublicKey import RSA from Cryptodome.Cipher import PKCS1_v1_5 @@ -856,9 +852,6 @@ class BiliIntlBaseIE(InfoExtractor): else: raise ExtractorError('Unable to log in') - def _real_initialize(self): - self._login() - class BiliIntlIE(BiliIntlBaseIE): _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)' diff --git a/yt_dlp/extractor/canvas.py b/yt_dlp/extractor/canvas.py index 82fded4e1..31e7d7de6 100644 --- a/yt_dlp/extractor/canvas.py +++ 
b/yt_dlp/extractor/canvas.py @@ -274,14 +274,7 @@ class VrtNUIE(GigyaBaseIE): _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' _CONTEXT_ID = 'R3595707040' - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): auth_info = self._gigya_login({ 'APIKey': self._APIKEY, 'targetEnv': 'jssdk', diff --git a/yt_dlp/extractor/cbs.py b/yt_dlp/extractor/cbs.py index ae9ce5862..2af36ea82 100644 --- a/yt_dlp/extractor/cbs.py +++ b/yt_dlp/extractor/cbs.py @@ -77,21 +77,21 @@ class CBSIE(CBSBaseIE): (?: cbs:| https?://(?:www\.)?(?: - cbs\.com/(?:shows/[^/]+/video|movies/[^/]+)/| + cbs\.com/(?:shows|movies)/(?:video|[^/]+/video|[^/]+)/| colbertlateshow\.com/(?:video|podcasts)/) )(?P<id>[\w-]+)''' # All tests are blocked outside US _TESTS = [{ - 'url': 'https://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', + 'url': 'https://www.cbs.com/shows/video/xrUyNLtl9wd8D_RWWAg9NU2F_V6QpB3R/', 'info_dict': { - 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', + 'id': 'xrUyNLtl9wd8D_RWWAg9NU2F_V6QpB3R', 'ext': 'mp4', - 'title': 'Connect Chat feat. Garth Brooks', - 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', - 'duration': 1495, - 'timestamp': 1385585425, - 'upload_date': '20131127', + 'title': 'Tough As Nails - Dreams Never Die', + 'description': 'md5:a3535a62531cdd52b0364248a2c1ae33', + 'duration': 2588, + 'timestamp': 1639015200, + 'upload_date': '20211209', 'uploader': 'CBSI-NEW', }, 'params': { @@ -99,14 +99,14 @@ class CBSIE(CBSBaseIE): 'skip_download': True, }, }, { - 'url': 'https://www.cbs.com/shows/the-late-show-with-stephen-colbert/video/60icOhMb9NcjbcWnF_gub9XXHdeBcNk2/the-late-show-6-23-21-christine-baranski-joy-oladokun-', + 'url': 'https://www.cbs.com/shows/video/sZH1MGgomIosZgxGJ1l263MFq16oMtW1/', 'info_dict': { - 'id': '60icOhMb9NcjbcWnF_gub9XXHdeBcNk2', - 'title': 'The Late Show - 6/23/21 (Christine Baranski, Joy Oladokun)', - 'timestamp': 1624507140, - 'description': 'md5:e01af24e95c74d55e8775aef86117b95', + 'id': 'sZH1MGgomIosZgxGJ1l263MFq16oMtW1', + 'title': 'The Late Show - 3/16/22 (Michael Buble, Rose Matafeo)', + 'timestamp': 1647488100, + 'description': 'md5:d0e6ec23c544b7fa8e39a8e6844d2439', 'uploader': 'CBSI-NEW', - 'upload_date': '20210624', + 'upload_date': '20220317', }, 'params': { 'ignore_no_formats_error': True, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 345da9a72..f3ae3fd4c 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -432,7 +432,15 @@ class InfoExtractor(object): Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs - (except other extractors), so that lazy_extractors works correctly + (except other extractors), so that lazy_extractors works correctly. + + To support username + password (or netrc) login, the extractor must define a + _NETRC_MACHINE and re-define _perform_login(username, password) and + (optionally) _initialize_pre_login() methods. The _perform_login method will + be called between _initialize_pre_login and _real_initialize if credentials + are passed by the user. 
In cases where it is necessary to have the login + process as part of the extraction rather than initialization, _perform_login + can be left undefined. _GEO_BYPASS attribute may be set to False in order to disable geo restriction bypass mechanisms for a particular extractor. @@ -460,9 +468,10 @@ class InfoExtractor(object): _GEO_COUNTRIES = None _GEO_IP_BLOCKS = None _WORKING = True + _NETRC_MACHINE = None _LOGIN_HINTS = { - 'any': 'Use --cookies, --username and --password, or --netrc to provide account credentials', + 'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials', 'cookies': ( 'Use --cookies-from-browser or --cookies for the authentication. ' 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'), @@ -512,6 +521,10 @@ class InfoExtractor(object): """Getter method for _WORKING.""" return cls._WORKING + @classmethod + def supports_login(cls): + return bool(cls._NETRC_MACHINE) + def initialize(self): """Initializes an instance (authentication, etc).""" self._printed_messages = set() @@ -520,6 +533,13 @@ class InfoExtractor(object): 'ip_blocks': self._GEO_IP_BLOCKS, }) if not self._ready: + self._initialize_pre_login() + if self.supports_login(): + username, password = self._get_login_info() + if username: + self._perform_login(username, password) + elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE): + self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}') self._real_initialize() self._ready = True @@ -665,6 +685,14 @@ class InfoExtractor(object): """Sets a YoutubeDL instance as the downloader for this IE.""" self._downloader = downloader + def _initialize_pre_login(self): + """ Intialization before login. Redefine in subclasses.""" + pass + + def _perform_login(self, username, password): + """ Login with username and password. Redefine in subclasses.""" + pass + def _real_initialize(self): """Real initialization process. Redefine in subclasses.""" pass @@ -1098,12 +1126,15 @@ class InfoExtractor(object): def raise_login_required( self, msg='This video is only available for registered users', - metadata_available=False, method='any'): + metadata_available=False, method=NO_DEFAULT): if metadata_available and ( self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) return + if method is NO_DEFAULT: + method = 'any' if self.supports_login() else 'cookies' if method is not None: + assert method in self._LOGIN_HINTS, 'Invalid login method' msg = '%s. 
%s' % (msg, self._LOGIN_HINTS[method]) raise ExtractorError(msg, expected=True) @@ -3680,9 +3711,8 @@ class InfoExtractor(object): def mark_watched(self, *args, **kwargs): if not self.get_param('mark_watched', False): return - if (hasattr(self, '_NETRC_MACHINE') and self._get_login_info()[0] is not None - or self.get_param('cookiefile') - or self.get_param('cookiesfrombrowser')): + if (self.supports_login() and self._get_login_info()[0] is not None + or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')): self._mark_watched(*args, **kwargs) def _mark_watched(self, *args, **kwargs): diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index b6ba5ef56..bf1bf8c1c 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -57,10 +57,7 @@ class CrunchyrollBaseIE(InfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded', }) - def _login(self): - username, password = self._get_login_info() - if username is None: - return + def _perform_login(self, username, password): if self._get_cookies(self._LOGIN_URL).get('etp_rt'): return @@ -89,9 +86,6 @@ class CrunchyrollBaseIE(InfoExtractor): if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): raise ExtractorError('Login succeeded but did not set etp_rt cookie') - def _real_initialize(self): - self._login() - @staticmethod def _add_skip_wall(url): parsed_url = compat_urlparse.urlparse(url) diff --git a/yt_dlp/extractor/curiositystream.py b/yt_dlp/extractor/curiositystream.py index 485b6031f..b8abcf7a5 100644 --- a/yt_dlp/extractor/curiositystream.py +++ b/yt_dlp/extractor/curiositystream.py @@ -33,14 +33,11 @@ class CuriosityStreamBaseIE(InfoExtractor): self._handle_errors(result) return result['data'] - def _real_initialize(self): - email, password = self._get_login_info() - if email is None: - return + def _perform_login(self, username, password): result = self._download_json( 'https://api.curiositystream.com/v1/login', None, note='Logging in', data=urlencode_postdata({ - 'email': email, + 'email': username, 'password': password, })) self._handle_errors(result) diff --git a/yt_dlp/extractor/daftsex.py b/yt_dlp/extractor/daftsex.py index 03672b35d..6037fd9ca 100644 --- a/yt_dlp/extractor/daftsex.py +++ b/yt_dlp/extractor/daftsex.py @@ -4,30 +4,50 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( - get_elements_by_class, int_or_none, js_to_json, parse_count, parse_duration, + traverse_obj, try_get, + unified_timestamp, ) class DaftsexIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?daftsex\.com/watch/(?P<id>-?\d+_\d+)' _TESTS = [{ + 'url': 'https://daftsex.com/watch/-35370899_456246186', + 'md5': 'd95135e6cea2d905bea20dbe82cda64a', + 'info_dict': { + 'id': '-35370899_456246186', + 'ext': 'mp4', + 'title': 'just relaxing', + 'description': 'just relaxing - Watch video Watch video in high quality', + 'upload_date': '20201113', + 'timestamp': 1605261911, + 'thumbnail': r're:https://[^/]+/impf/-43BuMDIawmBGr3GLcZ93CYwWf2PBv_tVWoS1A/dnu41DnARU4\.jpg\?size=800x450&quality=96&keep_aspect_ratio=1&background=000000&sign=6af2c26ff4a45e55334189301c867384&type=video_thumb', + }, + }, { 'url': 'https://daftsex.com/watch/-156601359_456242791', 'info_dict': { 'id': '-156601359_456242791', 'ext': 'mp4', 'title': 'Skye Blue - Dinner And A Show', + 'description': 'Skye Blue - Dinner And A Show - Watch video Watch video in high quality', + 'upload_date': '20200916', + 'timestamp': 1600250735, + 
'thumbnail': 'https://psv153-1.crazycloud.ru/videos/-156601359/456242791/thumb.jpg?extra=i3D32KaBbBFf9TqDRMAVmQ', }, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = get_elements_by_class('heading', webpage)[-1] + title = self._html_search_meta('name', webpage, 'title') + timestamp = unified_timestamp(self._html_search_meta('uploadDate', webpage, 'Upload Date', default=None)) + description = self._html_search_meta('description', webpage, 'Description', default=None) + duration = parse_duration(self._search_regex( r'Duration: ((?:[0-9]{2}:){0,2}[0-9]{2})', webpage, 'duration', fatal=False)) @@ -52,28 +72,75 @@ class DaftsexIE(InfoExtractor): video_id, transform_source=js_to_json) server_domain = 'https://%s' % compat_b64decode(video_params['server'][::-1]).decode('utf-8') + + cdn_files = traverse_obj(video_params, ('video', 'cdn_files')) or {} + if cdn_files: + formats = [] + for format_id, format_data in cdn_files.items(): + ext, height = format_id.split('_') + formats.append({ + 'format_id': format_id, + 'url': f'{server_domain}/videos/{video_id.replace("_", "/")}/{height}.mp4?extra={format_data.split(".")[-1]}', + 'height': int_or_none(height), + 'ext': ext, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': description, + 'duration': duration, + 'thumbnail': try_get(video_params, lambda vi: 'https:' + compat_b64decode(vi['video']['thumb']).decode('utf-8')), + 'timestamp': timestamp, + 'view_count': views, + 'age_limit': 18, + } + + item = self._download_json( + f'{server_domain}/method/video.get/{video_id}', video_id, + headers={'Referer': url}, query={ + 'token': video_params['video']['access_token'], + 'videos': video_id, + 'ckey': video_params['c_key'], + 'credentials': video_params['video']['credentials'], + })['response']['items'][0] + formats = [] - for format_id, format_data in video_params['video']['cdn_files'].items(): - ext, height = format_id.split('_') - extra_quality_data = format_data.split('.')[-1] - url = f'{server_domain}/videos/{video_id.replace("_", "/")}/{height}.mp4?extra={extra_quality_data}' - formats.append({ - 'format_id': format_id, - 'url': url, - 'height': int_or_none(height), - 'ext': ext, - }) + for f_id, f_url in item.get('files', {}).items(): + if f_id == 'external': + return self.url_result(f_url) + ext, height = f_id.split('_') + height_extra_key = traverse_obj(video_params, ('video', 'partial', 'quality', height)) + if height_extra_key: + formats.append({ + 'format_id': f'{height}p', + 'url': f'{server_domain}/{f_url[8:]}&videos={video_id}&extra_key={height_extra_key}', + 'height': int_or_none(height), + 'ext': ext, + }) self._sort_formats(formats) - thumbnail = try_get(video_params, - lambda vi: 'https:' + compat_b64decode(vi['video']['thumb']).decode('utf-8')) + thumbnails = [] + for k, v in item.items(): + if k.startswith('photo_') and v: + width = k.replace('photo_', '') + thumbnails.append({ + 'id': width, + 'url': v, + 'width': int_or_none(width), + }) return { 'id': video_id, 'title': title, 'formats': formats, + 'comment_count': int_or_none(item.get('comments')), + 'description': description, 'duration': duration, - 'thumbnail': thumbnail, + 'thumbnails': thumbnails, + 'timestamp': timestamp, 'view_count': views, 'age_limit': 18, } diff --git a/yt_dlp/extractor/digitalconcerthall.py b/yt_dlp/extractor/digitalconcerthall.py index 9b302a9a0..8398ae30e 100644 --- a/yt_dlp/extractor/digitalconcerthall.py 
+++ b/yt_dlp/extractor/digitalconcerthall.py @@ -45,10 +45,7 @@ class DigitalConcertHallIE(InfoExtractor): 'playlist_count': 3, }] - def _login(self): - username, password = self._get_login_info() - if not username: - self.raise_login_required() + def _perform_login(self, username, password): token_response = self._download_json( self._OAUTH_URL, None, 'Obtaining token', errnote='Unable to obtain token', data=urlencode_postdata({ @@ -78,7 +75,8 @@ class DigitalConcertHallIE(InfoExtractor): self.raise_login_required(msg='Login info incorrect') def _real_initialize(self): - self._login() + if not self._ACCESS_TOKEN: + self.raise_login_required(method='password') def _entries(self, items, language, **kwargs): for item in items: diff --git a/yt_dlp/extractor/eroprofile.py b/yt_dlp/extractor/eroprofile.py index a8396f1d3..5d5e7f244 100644 --- a/yt_dlp/extractor/eroprofile.py +++ b/yt_dlp/extractor/eroprofile.py @@ -39,11 +39,7 @@ class EroProfileIE(InfoExtractor): 'skip': 'Requires login', }] - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): query = compat_urllib_parse_urlencode({ 'username': username, 'password': password, @@ -62,9 +58,6 @@ class EroProfileIE(InfoExtractor): r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url') self._download_webpage(redirect_url, None, False) - def _real_initialize(self): - self._login() - def _real_extract(self, url): display_id = self._match_id(url) diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 09b795c56..4eda27cdc 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -622,6 +622,7 @@ from .hse import ( HSEProductIE, ) from .huajiao import HuajiaoIE +from .huya import HuyaLiveIE from .huffpost import HuffPostIE from .hungama import ( HungamaIE, diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index ef57b221c..2deed585f 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -329,11 +329,7 @@ class FacebookIE(InfoExtractor): urls.append(mobj.group('url')) return urls - def _login(self): - useremail, password = self._get_login_info() - if useremail is None: - return - + def _perform_login(self, username, password): login_page_req = sanitized_Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') login_page = self._download_webpage(login_page_req, None, @@ -345,7 +341,7 @@ class FacebookIE(InfoExtractor): lgnrnd = self._search_regex(r'name="lgnrnd" value="([^"]*?)"', login_page, 'lgnrnd') login_form = { - 'email': useremail, + 'email': username, 'pass': password, 'lsd': lsd, 'lgnrnd': lgnrnd, @@ -392,9 +388,6 @@ class FacebookIE(InfoExtractor): self.report_warning('unable to log in: %s' % error_to_compat_str(err)) return - def _real_initialize(self): - self._login() - def _extract_from_url(self, url, video_id): webpage = self._download_webpage( url.replace('://m.facebook.com/', '://www.facebook.com/'), video_id) diff --git a/yt_dlp/extractor/fancode.py b/yt_dlp/extractor/fancode.py index 978df31ff..7ea16c61d 100644 --- a/yt_dlp/extractor/fancode.py +++ b/yt_dlp/extractor/fancode.py @@ -49,30 +49,26 @@ class FancodeVodIE(InfoExtractor): 'referer': 'https://fancode.com', } - def _login(self): + def _perform_login(self, username, password): # Access tokens are shortlived, so get them using the refresh token. 
- username, password = self._get_login_info() - if username == 'refresh' and password is not None: - self.report_login() - data = '''{ - "query":"mutation RefreshToken($refreshToken: String\\u0021) { refreshToken(refreshToken: $refreshToken) { accessToken }}", - "variables":{ - "refreshToken":"%s" - }, - "operationName":"RefreshToken" - }''' % password - - token_json = self.download_gql('refresh token', data, "Getting the Access token") - self._ACCESS_TOKEN = try_get(token_json, lambda x: x['data']['refreshToken']['accessToken']) - if self._ACCESS_TOKEN is None: - self.report_warning('Failed to get Access token') - else: - self.headers.update({'Authorization': 'Bearer %s' % self._ACCESS_TOKEN}) - elif username is not None: + if username != 'refresh': self.report_warning(f'Login using username and password is not currently supported. {self._LOGIN_HINT}') - def _real_initialize(self): - self._login() + self.report_login() + data = '''{ + "query":"mutation RefreshToken($refreshToken: String\\u0021) { refreshToken(refreshToken: $refreshToken) { accessToken }}", + "variables":{ + "refreshToken":"%s" + }, + "operationName":"RefreshToken" + }''' % password + + token_json = self.download_gql('refresh token', data, "Getting the Access token") + self._ACCESS_TOKEN = try_get(token_json, lambda x: x['data']['refreshToken']['accessToken']) + if self._ACCESS_TOKEN is None: + self.report_warning('Failed to get Access token') + else: + self.headers.update({'Authorization': 'Bearer %s' % self._ACCESS_TOKEN}) def _check_login_required(self, is_available, is_premium): msg = None diff --git a/yt_dlp/extractor/franceculture.py b/yt_dlp/extractor/franceculture.py index 14f4cb489..9dc28d801 100644 --- a/yt_dlp/extractor/franceculture.py +++ b/yt_dlp/extractor/franceculture.py @@ -1,18 +1,45 @@ # coding: utf-8 from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import ( determine_ext, extract_attributes, int_or_none, + traverse_obj, + unified_strdate, ) class FranceCultureIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', + # playlist + 'url': 'https://www.franceculture.fr/emissions/serie/hasta-dente', + 'playlist_count': 12, + 'info_dict': { + 'id': 'hasta-dente', + 'title': 'Hasta Dente', + 'description': 'md5:57479af50648d14e9bb649e6b1f8f911', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20201024', + }, + 'playlist': [{ + 'info_dict': { + 'id': '3c1c2e55-41a0-11e5-9fe0-005056a87c89', + 'ext': 'mp3', + 'title': 'Jeudi, vous avez dit bizarre ?', + 'description': 'md5:47cf1e00cc21c86b0210279996a812c6', + 'duration': 604, + 'upload_date': '20201024', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1603576680 + }, + }, + ], + }, { + 'url': 'https://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { 'id': 'rendez-vous-au-pays-des-geeks', 'display_id': 'rendez-vous-au-pays-des-geeks', @@ -20,9 +47,9 @@ class FranceCultureIE(InfoExtractor): 'title': 'Rendez-vous au pays des geeks', 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140301', - 'timestamp': 1393700400, 'vcodec': 'none', - } + 'duration': 3569, + }, }, { # no thumbnail 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', @@ -31,9 +58,54 @@ class FranceCultureIE(InfoExtractor): def _real_extract(self, 
url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + info = { + 'id': display_id, + 'title': self._html_search_regex( + r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', + webpage, 'title', default=self._og_search_title(webpage)), + 'description': self._html_search_regex( + r'(?s)<div[^>]+class="excerpt"[^>]*>(.*?)</div>', webpage, 'description', default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': self._html_search_regex( + r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None), + 'upload_date': unified_strdate(self._html_search_regex( + r'(?s)class="teaser-text-date".*?(\d{2}/\d{2}/\d{4})', webpage, 'date', default=None)), + } + + playlist_data = self._search_regex( + r'''(?sx) + <section[^>]+data-xiti-place="[^"]*?liste_episodes[^"?]*?"[^>]*> + (.*?) + </section> + ''', + webpage, 'playlist data', fatal=False, default=None) + + if playlist_data: + entries = [] + for item, item_description in re.findall( + r'(?s)(<button[^<]*class="[^"]*replay-button[^>]*>).*?<p[^>]*class="[^"]*teaser-text-chapo[^>]*>(.*?)</p>', + playlist_data): + + item_attributes = extract_attributes(item) + entries.append({ + 'id': item_attributes.get('data-emission-uuid'), + 'url': item_attributes.get('data-url'), + 'title': item_attributes.get('data-diffusion-title'), + 'duration': int_or_none(traverse_obj(item_attributes, 'data-duration-seconds', 'data-duration-seconds')), + 'description': item_description, + 'timestamp': int_or_none(item_attributes.get('data-start-time')), + 'thumbnail': info['thumbnail'], + 'uploader': info['uploader'], + }) + + return { + '_type': 'playlist', + 'entries': entries, + **info + } + video_data = extract_attributes(self._search_regex( r'''(?sx) (?: @@ -43,31 +115,14 @@ class FranceCultureIE(InfoExtractor): (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) ''', webpage, 'video data')) - - video_url = video_data.get('data-url') or video_data['data-asset-source'] - title = video_data.get('data-asset-title') or video_data.get('data-diffusion-title') or self._og_search_title(webpage) - - description = self._html_search_regex( - r'(?s)<div[^>]+class="intro"[^>]*>.*?<h2>(.+?)</h2>', - webpage, 'description', default=None) - thumbnail = self._search_regex( - r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+(?:data-dejavu-)?src="([^"]+)"', - webpage, 'thumbnail', default=None) - uploader = self._html_search_regex( - r'(?s)<span class="author">(.*?)</span>', - webpage, 'uploader', default=None) + video_url = traverse_obj(video_data, 'data-url', 'data-asset-source') ext = determine_ext(video_url.lower()) return { - 'id': display_id, 'display_id': display_id, 'url': video_url, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, 'ext': ext, 'vcodec': 'none' if ext == 'mp3' else None, - 'uploader': uploader, - 'timestamp': int_or_none(video_data.get('data-start-time')) or int_or_none(video_data.get('data-asset-created-date')), 'duration': int_or_none(video_data.get('data-duration')), + **info } diff --git a/yt_dlp/extractor/frontendmasters.py b/yt_dlp/extractor/frontendmasters.py index 0d29da29b..fc67a8437 100644 --- a/yt_dlp/extractor/frontendmasters.py +++ b/yt_dlp/extractor/frontendmasters.py @@ -28,14 +28,7 @@ class FrontendMastersBaseIE(InfoExtractor): 'high': {'width': 1920, 'height': 1080} } - def _real_initialize(self): - self._login() - - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - + 
diff --git a/yt_dlp/extractor/frontendmasters.py b/yt_dlp/extractor/frontendmasters.py
index 0d29da29b..fc67a8437 100644
--- a/yt_dlp/extractor/frontendmasters.py
+++ b/yt_dlp/extractor/frontendmasters.py
@@ -28,14 +28,7 @@ class FrontendMastersBaseIE(InfoExtractor):
         'high': {'width': 1920, 'height': 1080}
     }

-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        (username, password) = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')
diff --git a/yt_dlp/extractor/funimation.py b/yt_dlp/extractor/funimation.py
index 96dad2ca3..36a9c4772 100644
--- a/yt_dlp/extractor/funimation.py
+++ b/yt_dlp/extractor/funimation.py
@@ -36,9 +36,8 @@ class FunimationBaseIE(InfoExtractor):
                 note='Checking geo-location',
                 errnote='Unable to fetch geo-location information'),
             'region') or 'US'

-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
+    def _perform_login(self, username, password):
+        if self._TOKEN:
             return
         try:
             data = self._download_json(
@@ -47,7 +46,7 @@ class FunimationBaseIE(InfoExtractor):
                     'username': username,
                     'password': password,
                 }))
-            return data['token']
+            FunimationBaseIE._TOKEN = data['token']
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
                 error = self._parse_json(e.cause.read().decode(), None)['error']
@@ -90,8 +89,6 @@ class FunimationPageIE(FunimationBaseIE):
     def _real_initialize(self):
         if not self._REGION:
             FunimationBaseIE._REGION = self._get_region()
-        if not self._TOKEN:
-            FunimationBaseIE._TOKEN = self._login()

     def _real_extract(self, url):
         locale, show, episode = self._match_valid_url(url).group('lang', 'show', 'episode')
@@ -154,10 +151,6 @@ class FunimationIE(FunimationBaseIE):
         },
     }]

-    def _real_initialize(self):
-        if not self._TOKEN:
-            FunimationBaseIE._TOKEN = self._login()
-
     @staticmethod
     def _get_experiences(episode):
         for lang, lang_data in episode.get('languages', {}).items():
diff --git a/yt_dlp/extractor/gaia.py b/yt_dlp/extractor/gaia.py
index 7821fb783..5b0195c63 100644
--- a/yt_dlp/extractor/gaia.py
+++ b/yt_dlp/extractor/gaia.py
@@ -56,24 +56,22 @@ class GaiaIE(InfoExtractor):
     def _real_initialize(self):
         auth = self._get_cookies('https://www.gaia.com/').get('auth')
         if auth:
-            auth = self._parse_json(
-                compat_urllib_parse_unquote(auth.value),
-                None, fatal=False)
-        if not auth:
-            username, password = self._get_login_info()
-            if username is None:
-                return
-            auth = self._download_json(
-                'https://auth.gaia.com/v1/login',
-                None, data=urlencode_postdata({
-                    'username': username,
-                    'password': password
-                }))
-            if auth.get('success') is False:
-                raise ExtractorError(', '.join(auth['messages']), expected=True)
-        if auth:
+            auth = self._parse_json(compat_urllib_parse_unquote(auth.value), None, fatal=False)
             self._jwt = auth.get('jwt')

+    def _perform_login(self, username, password):
+        if self._jwt:
+            return
+        auth = self._download_json(
+            'https://auth.gaia.com/v1/login',
+            None, data=urlencode_postdata({
+                'username': username,
+                'password': password
+            }))
+        if auth.get('success') is False:
+            raise ExtractorError(', '.join(auth['messages']), expected=True)
+        self._jwt = auth.get('jwt')
+
     def _real_extract(self, url):
         display_id, vtype = self._match_valid_url(url).groups()
         node_id = self._download_json(
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 6a8b8543b..97e34808f 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -153,6 +153,7 @@ class GenericIE(InfoExtractor):
     IE_DESC = 'Generic downloader that works on some sites'
     _VALID_URL = r'.*'
     IE_NAME = 'generic'
+    _NETRC_MACHINE = False  # Suppress username warning
     _TESTS = [
         # Direct link to a video
         {
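All of these per-site hunks converge on the same pair of hooks, and the generic.py line just above only makes sense against the common.py side of the refactor, which this diff does not show. A condensed sketch of how the new driver presumably wires them together (approximate; details may differ from the actual source):

# Approximate shape of the login driver added to common.py by this
# refactor (not part of this diff; condensed and simplified).
# supports_login() is roughly bool(cls._NETRC_MACHINE), which is why
# GenericIE can opt out of login handling with _NETRC_MACHINE = False.
def initialize(self):
    if self._ready:
        return
    self._initialize_pre_login()   # e.g. ImgGaming/HRTi build headers here
    if self.supports_login():
        username, password = self._get_login_info()
        if username:
            self._perform_login(username, password)
    self._real_initialize()        # remaining per-site setup runs last
    self._ready = True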
diff --git a/yt_dlp/extractor/hidive.py b/yt_dlp/extractor/hidive.py
index 15bd444f9..46d7d62ab 100644
--- a/yt_dlp/extractor/hidive.py
+++ b/yt_dlp/extractor/hidive.py
@@ -35,18 +35,14 @@ class HiDiveIE(InfoExtractor):
         'skip': 'Requires Authentication',
     }]

-    def _real_initialize(self):
-        email, password = self._get_login_info()
-        if email is None:
-            return
-
+    def _perform_login(self, username, password):
         webpage = self._download_webpage(self._LOGIN_URL, None)
         form = self._search_regex(
             r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>',
             webpage, 'login form')
         data = self._hidden_inputs(form)
         data.update({
-            'Email': email,
+            'Email': username,
             'Password': password,
         })
         self._download_webpage(
diff --git a/yt_dlp/extractor/hrti.py b/yt_dlp/extractor/hrti.py
index dc5b9670c..36d600773 100644
--- a/yt_dlp/extractor/hrti.py
+++ b/yt_dlp/extractor/hrti.py
@@ -27,8 +27,9 @@ class HRTiBaseIE(InfoExtractor):
     _APP_VERSION = '1.1'
     _APP_PUBLICATION_ID = 'all_in_one'
     _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json'
+    _token = None

-    def _initialize_api(self):
+    def _initialize_pre_login(self):
         init_data = {
             'application_publication_id': self._APP_PUBLICATION_ID
         }
@@ -64,12 +65,7 @@ class HRTiBaseIE(InfoExtractor):

         self._logout_url = modules['user']['resources']['logout']['uri']

-    def _login(self):
-        username, password = self._get_login_info()
-        # TODO: figure out authentication with cookies
-        if username is None or password is None:
-            self.raise_login_required()
-
+    def _perform_login(self, username, password):
         auth_data = {
             'username': username,
             'password': password,
@@ -94,8 +90,9 @@ class HRTiBaseIE(InfoExtractor):
         self._token = auth_info['secure_streaming_token']

     def _real_initialize(self):
-        self._initialize_api()
-        self._login()
+        if not self._token:
+            # TODO: figure out authentication with cookies
+            self.raise_login_required(method='password')


 class HRTiIE(HRTiBaseIE):
diff --git a/yt_dlp/extractor/huya.py b/yt_dlp/extractor/huya.py
new file mode 100644
index 000000000..b81439682
--- /dev/null
+++ b/yt_dlp/extractor/huya.py
@@ -0,0 +1,138 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import random
+
+from ..compat import compat_urlparse, compat_b64decode
+
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    js_to_json,
+    str_or_none,
+    try_get,
+    unescapeHTML,
+    update_url_query,
+)
+
+from .common import InfoExtractor
+
+
+class HuyaLiveIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.|m\.)?huya\.com/(?P<id>[^/#?&]+)(?:\D|$)'
+    IE_NAME = 'huya:live'
+    IE_DESC = 'huya.com'
+    TESTS = [{
+        'url': 'https://www.huya.com/572329',
+        'info_dict': {
+            'id': '572329',
+            'title': str,
+            'description': str,
+            'is_live': True,
+            'view_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://www.huya.com/xiaoyugame',
+        'only_matching': True
+    }]
+
+    _RESOLUTION = {
+        '蓝光4M': {
+            'width': 1920,
+            'height': 1080,
+        },
+        '超清': {
+            'width': 1280,
+            'height': 720,
+        },
+        '流畅': {
+            'width': 800,
+            'height': 480
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id=video_id)
+        json_stream = self._search_regex(r'"stream":\s+"([a-zA-Z0-9+=/]+)"', webpage, 'stream', default=None)
+        if not json_stream:
+            raise ExtractorError('Video is offline', expected=True)
+        stream_data = self._parse_json(compat_b64decode(json_stream).decode(), video_id=video_id,
+                                       transform_source=js_to_json)
+        room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo'])
+        if not room_info:
+            raise ExtractorError('Can not extract the room info', expected=True)
+        title = room_info.get('roomName') or room_info.get('introduction') or self._html_search_regex(
+            r'<title>([^<]+)</title>', webpage, 'title')
+        screen_type = room_info.get('screenType')
+        live_source_type = room_info.get('liveSourceType')
+        stream_info_list = stream_data['data'][0]['gameStreamInfoList']
+        formats = []
+        for stream_info in stream_info_list:
+            stream_url = stream_info.get('sFlvUrl')
+            if not stream_url:
+                continue
+            stream_name = stream_info.get('sStreamName')
+            re_secret = not screen_type and live_source_type in (0, 8, 13)
+            params = dict(compat_urlparse.parse_qsl(unescapeHTML(stream_info['sFlvAntiCode'])))
+            fm, ss = '', ''
+            if re_secret:
+                fm, ss = self.encrypt(params, stream_info, stream_name)
+            for si in stream_data.get('vMultiStreamInfo'):
+                rate = si.get('iBitRate')
+                if rate:
+                    params['ratio'] = rate
+                else:
+                    params.pop('ratio', None)
+                if re_secret:
+                    params['wsSecret'] = hashlib.md5(
+                        '_'.join([fm, params['u'], stream_name, ss, params['wsTime']]))
+                formats.append({
+                    'ext': stream_info.get('sFlvUrlSuffix'),
+                    'format_id': str_or_none(stream_info.get('iLineIndex')),
+                    'tbr': rate,
+                    'url': update_url_query(f'{stream_url}/{stream_name}.{stream_info.get("sFlvUrlSuffix")}',
+                                            query=params),
+                    **self._RESOLUTION.get(si.get('sDisplayName'), {}),
+                })
+
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'view_count': room_info.get('totalCount'),
+            'thumbnail': room_info.get('screenshot'),
+            'description': room_info.get('contentIntro'),
+            'http_headers': {
+                'Origin': 'https://www.huya.com',
+                'Referer': 'https://www.huya.com/',
+            },
+        }
+
+    def encrypt(self, params, stream_info, stream_name):
+        ct = int_or_none(params.get('wsTime'), 16) + random.random()
+        presenter_uid = stream_info['lPresenterUid']
+        if not stream_name.startswith(str(presenter_uid)):
+            uid = presenter_uid
+        else:
+            uid = int_or_none(ct % 1e7 * 1e6 % 0xffffffff)
+        u1 = uid & 0xffffffff00000000
+        u2 = uid & 0xffffffff
+        u3 = uid & 0xffffff
+        u = u1 | u2 >> 24 | u3 << 8
+        params.update({
+            'u': str_or_none(u),
+            'seqid': str_or_none(int_or_none(ct * 1000) + uid),
+            'ver': '1',
+            'uuid': int_or_none(ct % 1e7 * 1e6 % 0xffffffff),
+            't': '100',
+        })
+        fm = compat_b64decode(params['fm']).decode().split('_', 1)[0]
+        ss = hashlib.md5('|'.join([params['seqid'], params['ctype'], params['t']]))
+        return fm, ss
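The encrypt() helper above implements Huya's client-side anti-leech signing: fm is a base64-encoded template whose first '_'-separated field acts as a secret prefix, ss hashes the per-request parameters, and wsSecret binds both to the stream name and timestamp. As committed, the code appears to pass raw hashlib.md5(...) objects where digest strings look intended ('_'.join(...) would raise TypeError on them), so the re_secret path may never have been exercised. A standalone sketch of the apparent scheme with the digests made explicit (function name and call shape are mine, not from the patch):

import base64
import hashlib


def ws_secret(params, stream_name):
    # params is the parsed sFlvAntiCode query dict (string values), as in
    # the extractor above; .hexdigest() is assumed where the committed
    # code passes md5 objects.
    fm = base64.b64decode(params['fm']).decode().split('_', 1)[0]
    ss = hashlib.md5(
        '|'.join([params['seqid'], params['ctype'], params['t']]).encode()).hexdigest()
    return hashlib.md5(
        '_'.join([fm, params['u'], stream_name, ss, params['wsTime']]).encode()).hexdigest()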
diff --git a/yt_dlp/extractor/imggaming.py b/yt_dlp/extractor/imggaming.py
index 230dc86d3..ce7b21ab2 100644
--- a/yt_dlp/extractor/imggaming.py
+++ b/yt_dlp/extractor/imggaming.py
@@ -21,25 +21,26 @@ class ImgGamingBaseIE(InfoExtractor):
     _REALM = None
     _VALID_URL_TEMPL = r'https?://(?P<domain>%s)/(?P<type>live|playlist|video)/(?P<id>\d+)(?:\?.*?\bplaylistId=(?P<playlist_id>\d+))?'

-    def _real_initialize(self):
+    def _initialize_pre_login(self):
         self._HEADERS = {
             'Realm': 'dce.' + self._REALM,
             'x-api-key': self._API_KEY,
         }

-        email, password = self._get_login_info()
-        if email is None:
-            self.raise_login_required()
-
+    def _perform_login(self, username, password):
         p_headers = self._HEADERS.copy()
         p_headers['Content-Type'] = 'application/json'
         self._HEADERS['Authorization'] = 'Bearer ' + self._download_json(
             self._API_BASE + 'login',
             None, 'Logging in', data=json.dumps({
-                'id': email,
+                'id': username,
                 'secret': password,
             }).encode(), headers=p_headers)['authorisationToken']

+    def _real_initialize(self):
+        if not self._HEADERS.get('Authorization'):
+            self.raise_login_required(method='password')
+
     def _call_api(self, path, media_id):
         return self._download_json(
             self._API_BASE + path + media_id, media_id, headers=self._HEADERS)
diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py
index 3bb786d6a..970f2c8ab 100644
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@@ -29,9 +29,8 @@ class InstagramBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'instagram'
     _IS_LOGGED_IN = False

-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None or self._IS_LOGGED_IN:
+    def _perform_login(self, username, password):
+        if self._IS_LOGGED_IN:
             return

         login_webpage = self._download_webpage(
@@ -72,9 +71,6 @@ class InstagramBaseIE(InfoExtractor):
             raise ExtractorError('Unable to login')
         InstagramBaseIE._IS_LOGGED_IN = True

-    def _real_initialize(self):
-        self._login()
-
     def _get_count(self, media, kind, *keys):
         return traverse_obj(
             media, (kind, 'count'), *((f'edge_media_{key}', 'count') for key in keys),
diff --git a/yt_dlp/extractor/iprima.py b/yt_dlp/extractor/iprima.py
index 347fec1d5..1a2038453 100644
--- a/yt_dlp/extractor/iprima.py
+++ b/yt_dlp/extractor/iprima.py
@@ -65,11 +65,9 @@ class IPrimaIE(InfoExtractor):
         'only_matching': True,
     }]

-    def _login(self):
-        username, password = self._get_login_info()
-
-        if username is None or password is None:
-            self.raise_login_required('Login is required to access any iPrima content', method='password')
+    def _perform_login(self, username, password):
+        if self.access_token:
+            return

         login_page = self._download_webpage(
             self._LOGIN_URL, None, note='Downloading login page',
@@ -105,16 +103,16 @@ class IPrimaIE(InfoExtractor):
         if self.access_token is None:
             raise ExtractorError('Getting token failed', expected=True)

+    def _real_initialize(self):
+        if not self.access_token:
+            self.raise_login_required('Login is required to access any iPrima content', method='password')
+
     def _raise_access_error(self, error_code):
         if error_code == 'PLAY_GEOIP_DENIED':
             self.raise_geo_restricted(countries=['CZ'], metadata_available=True)
         elif error_code is not None:
             self.raise_no_formats('Access to stream infos forbidden', expected=True)

-    def _real_initialize(self):
-        if not self.access_token:
-            self._login()
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
diff --git a/yt_dlp/extractor/lecturio.py b/yt_dlp/extractor/lecturio.py
index 9d2228700..0ee1eeb4d 100644
--- a/yt_dlp/extractor/lecturio.py
+++ b/yt_dlp/extractor/lecturio.py
@@ -22,14 +22,7 @@ class LecturioBaseIE(InfoExtractor):
     _LOGIN_URL = 'https://app.lecturio.com/en/login'
     _NETRC_MACHINE = 'lecturio'

-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         # Sets some cookies
         _, urlh = self._download_webpage_handle(
             self._LOGIN_URL, None, 'Downloading login popup')
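A pattern worth noting across the HRTi, ImgGaming and iPrima hunks above: sites where login is mandatory can no longer raise from inside the login routine, because _perform_login only runs when the user actually supplied credentials. The mandatory check therefore moves into _real_initialize, which always runs afterwards. Schematically (class and _fetch_token are hypothetical names of mine; the real shapes are in the hunks above):

from yt_dlp.extractor.common import InfoExtractor


class SomePaywalledIE(InfoExtractor):
    _TOKEN = None

    def _perform_login(self, username, password):
        # invoked by the framework only when credentials are available
        SomePaywalledIE._TOKEN = self._fetch_token(username, password)

    def _real_initialize(self):
        # always runs after the (optional) login step
        if not self._TOKEN:
            self.raise_login_required(method='password')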
diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py
index bd76ae166..bf549e164 100644
--- a/yt_dlp/extractor/linkedin.py
+++ b/yt_dlp/extractor/linkedin.py
@@ -25,12 +25,9 @@ class LinkedInBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'linkedin'
     _logged_in = False

-    def _real_initialize(self):
+    def _perform_login(self, username, password):
         if self._logged_in:
             return
-        email, password = self._get_login_info()
-        if email is None:
-            return

         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')
@@ -39,7 +36,7 @@ class LinkedInBaseIE(InfoExtractor):
             default='https://www.linkedin.com/uas/login-submit', group='url'))
         data = self._hidden_inputs(login_page)
         data.update({
-            'session_key': email,
+            'session_key': username,
             'session_password': password,
         })
         login_submit_page = self._download_webpage(
diff --git a/yt_dlp/extractor/linuxacademy.py b/yt_dlp/extractor/linuxacademy.py
index 2053970d1..6aff88e13 100644
--- a/yt_dlp/extractor/linuxacademy.py
+++ b/yt_dlp/extractor/linuxacademy.py
@@ -75,14 +75,7 @@ class LinuxAcademyIE(InfoExtractor):
     _CLIENT_ID = 'KaWxNn1C2Gc7n83W9OFeXltd8Utb5vvx'
     _NETRC_MACHINE = 'linuxacademy'

-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         def random_string():
             return ''.join([
                 random.choice('0123456789ABCDEFGHIJKLMNOPQRSTUVXYZabcdefghijklmnopqrstuvwxyz-._~')
diff --git a/yt_dlp/extractor/lynda.py b/yt_dlp/extractor/lynda.py
index 58cf17239..ce304743f 100644
--- a/yt_dlp/extractor/lynda.py
+++ b/yt_dlp/extractor/lynda.py
@@ -21,9 +21,6 @@ class LyndaBaseIE(InfoExtractor):
     _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to provide lynda.com account credentials.'
_NETRC_MACHINE = 'lynda' - def _real_initialize(self): - self._login() - @staticmethod def _check_error(json_string, key_or_keys): keys = [key_or_keys] if isinstance(key_or_keys, compat_str) else key_or_keys @@ -32,7 +29,7 @@ class LyndaBaseIE(InfoExtractor): if error: raise ExtractorError('Unable to login: %s' % error, expected=True) - def _login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url): + def _perform_login_step(self, form_html, fallback_action_url, extra_form_data, note, referrer_url): action_url = self._search_regex( r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_html, 'post url', default=fallback_action_url, group='url') @@ -55,11 +52,7 @@ class LyndaBaseIE(InfoExtractor): return response, action_url - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): # Step 1: download signin page signin_page = self._download_webpage( self._SIGNIN_URL, None, 'Downloading signin page') diff --git a/yt_dlp/extractor/nebula.py b/yt_dlp/extractor/nebula.py index d235805c3..b77ef5f28 100644 --- a/yt_dlp/extractor/nebula.py +++ b/yt_dlp/extractor/nebula.py @@ -148,14 +148,12 @@ class NebulaBaseIE(InfoExtractor): 'creator': episode['channel_title'], } - def _login(self): + def _perform_login(self, username=None, password=None): + # FIXME: username should be passed from here to inner functions self._nebula_api_token = self._retrieve_nebula_api_token() self._nebula_bearer_token = self._fetch_nebula_bearer_token() self._zype_access_token = self._fetch_zype_access_token() - def _real_initialize(self): - self._login() - class NebulaIE(NebulaBaseIE): _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)' diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 8f56fc95b..74828f833 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -2,36 +2,36 @@ from __future__ import unicode_literals import datetime -import itertools import functools +import itertools import json import re +import time from .common import InfoExtractor, SearchInfoExtractor -from ..postprocessor.ffmpeg import FFmpegPostProcessor from ..compat import ( - compat_str, compat_parse_qs, compat_urllib_parse_urlparse, compat_HTTPError, ) from ..utils import ( ExtractorError, - dict_get, + OnDemandPagedList, + bug_reports_message, + clean_html, float_or_none, int_or_none, - OnDemandPagedList, + join_nonempty, parse_duration, + parse_filesize, parse_iso8601, - PostProcessingError, remove_start, - str_or_none, traverse_obj, try_get, unescapeHTML, - unified_timestamp, + update_url_query, + url_or_none, urlencode_postdata, - xpath_text, ) @@ -41,7 +41,7 @@ class NiconicoIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.nicovideo.jp/watch/sm22312215', - 'md5': 'a5bad06f1347452102953f323c69da34s', + 'md5': 'd1a75c0823e2f629128c43e1212760f9', 'info_dict': { 'id': 'sm22312215', 'ext': 'mp4', @@ -164,35 +164,42 @@ class NiconicoIE(InfoExtractor): }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, + }, { + 'note': 'a video that is only served as an ENCRYPTED HLS.', + 'url': 'https://www.nicovideo.jp/watch/so38016254', + 'only_matching': True, }] - _VALID_URL = r'https?://(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch/(?P<id>(?:[a-z]{2})?[0-9]+)' + _VALID_URL = r'https?://(?:(?:www\.|secure\.|sp\.)?nicovideo\.jp/watch|nico\.ms)/(?P<id>(?:[a-z]{2})?[0-9]+)' _NETRC_MACHINE = 'niconico' - + 
_COMMENT_API_ENDPOINTS = ( + 'https://nvcomment.nicovideo.jp/legacy/api.json', + 'https://nmsg.nicovideo.jp/api.json',) _API_HEADERS = { 'X-Frontend-ID': '6', - 'X-Frontend-Version': '0' + 'X-Frontend-Version': '0', + 'X-Niconico-Language': 'en-us', + 'Referer': 'https://www.nicovideo.jp/', + 'Origin': 'https://www.nicovideo.jp', } - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - # No authentication to be performed - if not username: - return True - - # Log in + def _perform_login(self, username, password): login_ok = True login_form_strs = { 'mail_tel': username, 'password': password, } + self._request_webpage( + 'https://account.nicovideo.jp/login', None, + note='Acquiring Login session') urlh = self._request_webpage( - 'https://account.nicovideo.jp/api/v1/login', None, + 'https://account.nicovideo.jp/login/redirector?show_button_twitter=1&site=niconico&show_button_facebook=1', None, note='Logging in', errnote='Unable to log in', - data=urlencode_postdata(login_form_strs)) + data=urlencode_postdata(login_form_strs), + headers={ + 'Referer': 'https://account.nicovideo.jp/login', + 'Content-Type': 'application/x-www-form-urlencoded', + }) if urlh is False: login_ok = False else: @@ -204,8 +211,8 @@ class NiconicoIE(InfoExtractor): return login_ok def _get_heartbeat_info(self, info_dict): - video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/') + dmc_protocol = info_dict['_expected_protocol'] api_data = ( info_dict.get('_api_data') @@ -220,49 +227,50 @@ class NiconicoIE(InfoExtractor): session_api_endpoint = try_get(session_api_data, lambda x: x['urls'][0]) def ping(): - status = try_get( - self._download_json( - 'https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', video_id, - query={'t': try_get(api_data, lambda x: x['media']['delivery']['trackingId'])}, - note='Acquiring permission for downloading video', - headers=self._API_HEADERS), - lambda x: x['meta']['status']) - if status != 200: - self.report_warning('Failed to acquire permission for playing video. The video may not download.') + tracking_id = traverse_obj(api_data, ('media', 'delivery', 'trackingId')) + if tracking_id: + tracking_url = update_url_query('https://nvapi.nicovideo.jp/v1/2ab0cbaa/watch', {'t': tracking_id}) + watch_request_response = self._download_json( + tracking_url, video_id, + note='Acquiring permission for downloading video', fatal=False, + headers=self._API_HEADERS) + if traverse_obj(watch_request_response, ('meta', 'status')) != 200: + self.report_warning('Failed to acquire permission for playing video. 
Video download may fail.') yesno = lambda x: 'yes' if x else 'no' - # m3u8 (encryption) - if try_get(api_data, lambda x: x['media']['delivery']['encryption']) is not None: + if dmc_protocol == 'http': + protocol = 'http' + protocol_parameters = { + 'http_output_download_parameters': { + 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']), + 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']), + } + } + elif dmc_protocol == 'hls': protocol = 'm3u8' - encryption = self._parse_json(session_api_data['token'], video_id)['hls_encryption'] - session_api_http_parameters = { - 'parameters': { - 'hls_parameters': { - 'encryption': { - encryption: { - 'encrypted_key': try_get(api_data, lambda x: x['media']['delivery']['encryption']['encryptedKey']), - 'key_uri': try_get(api_data, lambda x: x['media']['delivery']['encryption']['keyUri']) - } - }, - 'transfer_preset': '', - 'use_ssl': yesno(session_api_endpoint['isSsl']), - 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']), - 'segment_duration': 6000, - } + segment_duration = try_get(self._configuration_arg('segment_duration'), lambda x: int(x[0])) or 6000 + parsed_token = self._parse_json(session_api_data['token'], video_id) + encryption = traverse_obj(api_data, ('media', 'delivery', 'encryption')) + protocol_parameters = { + 'hls_parameters': { + 'segment_duration': segment_duration, + 'transfer_preset': '', + 'use_ssl': yesno(session_api_data['urls'][0]['isSsl']), + 'use_well_known_port': yesno(session_api_data['urls'][0]['isWellKnownPort']), } } - # http - else: - protocol = 'http' - session_api_http_parameters = { - 'parameters': { - 'http_output_download_parameters': { - 'use_ssl': yesno(session_api_endpoint['isSsl']), - 'use_well_known_port': yesno(session_api_endpoint['isWellKnownPort']), + if 'hls_encryption' in parsed_token and encryption: + protocol_parameters['hls_parameters']['encryption'] = { + parsed_token['hls_encryption']: { + 'encrypted_key': encryption['encryptedKey'], + 'key_uri': encryption['keyUri'], } } - } + else: + protocol = 'm3u8_native' + else: + raise ExtractorError(f'Unsupported DMC protocol: {dmc_protocol}') session_response = self._download_json( session_api_endpoint['url'], video_id, @@ -296,11 +304,13 @@ class NiconicoIE(InfoExtractor): 'lifetime': session_api_data.get('heartbeatLifetime') } }, - 'priority': session_api_data.get('priority'), + 'priority': session_api_data['priority'], 'protocol': { 'name': 'http', 'parameters': { - 'http_parameters': session_api_http_parameters + 'http_parameters': { + 'parameters': protocol_parameters + } } }, 'recipe_id': session_api_data.get('recipeId'), @@ -328,36 +338,35 @@ class NiconicoIE(InfoExtractor): return info_dict, heartbeat_info_dict - def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality): - def parse_format_id(id_code): - mobj = re.match(r'''(?x) - (?:archive_)? - (?:(?P<codec>[^_]+)_)? - (?:(?P<br>[\d]+)kbps_)? - (?:(?P<res>[\d+]+)p_)? 
- ''', '%s_' % id_code) - return mobj.groupdict() if mobj else {} - - protocol = 'niconico_dmc' - format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality])) - vdict = parse_format_id(video_quality['id']) - adict = parse_format_id(audio_quality['id']) - resolution = try_get(video_quality, lambda x: x['metadata']['resolution'], dict) or {'height': vdict.get('res')} - vbr = try_get(video_quality, lambda x: x['metadata']['bitrate'], float) + def _extract_format_for_quality(self, video_id, audio_quality, video_quality, dmc_protocol): + + if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): + return None + + def extract_video_quality(video_quality): + return parse_filesize('%sB' % self._search_regex( + r'\| ([0-9]*\.?[0-9]*[MK])', video_quality, 'vbr', default='')) + + format_id = '-'.join( + [remove_start(s['id'], 'archive_') for s in (video_quality, audio_quality)] + [dmc_protocol]) + + vid_qual_label = traverse_obj(video_quality, ('metadata', 'label')) + vid_quality = traverse_obj(video_quality, ('metadata', 'bitrate')) return { - 'url': '%s:%s/%s/%s' % (protocol, video_id, video_quality['id'], audio_quality['id']), + 'url': 'niconico_dmc:%s/%s/%s' % (video_id, video_quality['id'], audio_quality['id']), 'format_id': format_id, - 'format_note': 'DMC %s' % try_get(video_quality, lambda x: x['metadata']['label'], compat_str), + 'format_note': join_nonempty('DMC', vid_qual_label, dmc_protocol.upper(), delim=' '), 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4 - 'vcodec': vdict.get('codec'), - 'acodec': adict.get('codec'), - 'vbr': float_or_none(vbr, 1000) or float_or_none(vdict.get('br')), - 'abr': float_or_none(audio_quality.get('bitrate'), 1000) or float_or_none(adict.get('br')), - 'height': int_or_none(resolution.get('height', vdict.get('res'))), - 'width': int_or_none(resolution.get('width')), - 'quality': -2 if 'low' in format_id else -1, # Default quality value is -1 - 'protocol': protocol, + 'acodec': 'aac', + 'vcodec': 'h264', + 'abr': float_or_none(traverse_obj(audio_quality, ('metadata', 'bitrate')), 1000), + 'vbr': float_or_none(vid_quality if vid_quality > 0 else extract_video_quality(vid_qual_label), 1000), + 'height': traverse_obj(video_quality, ('metadata', 'resolution', 'height')), + 'width': traverse_obj(video_quality, ('metadata', 'resolution', 'width')), + 'quality': -2 if 'low' in video_quality['id'] else None, + 'protocol': 'niconico_dmc', + '_expected_protocol': dmc_protocol, 'http_headers': { 'Origin': 'https://www.nicovideo.jp', 'Referer': 'https://www.nicovideo.jp/watch/' + video_id, @@ -367,248 +376,157 @@ class NiconicoIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - # Get video webpage for API data. 
- webpage, handle = self._download_webpage_handle( - 'http://www.nicovideo.jp/watch/' + video_id, video_id) - if video_id.startswith('so'): - video_id = self._match_id(handle.geturl()) - - api_data = self._parse_json(self._html_search_regex( - 'data-api-data="([^"]+)"', webpage, - 'API data', default='{}'), video_id) - - def get_video_info_web(items): - return dict_get(api_data['video'], items) - - # Get video info - video_info_xml = self._download_xml( - 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, - video_id, note='Downloading video info page') - - def get_video_info_xml(items): - if not isinstance(items, list): - items = [items] - for item in items: - ret = xpath_text(video_info_xml, './/' + item) - if ret: - return ret - - if get_video_info_xml('error'): - error_code = get_video_info_xml('code') - - if error_code == 'DELETED': - raise ExtractorError('The video has been deleted.', - expected=True) - elif error_code == 'NOT_FOUND': - raise ExtractorError('The video is not found.', - expected=True) - elif error_code == 'COMMUNITY': - self.to_screen('%s: The video is community members only.' % video_id) - else: - raise ExtractorError('%s reports error: %s' % (self.IE_NAME, error_code)) + try: + webpage, handle = self._download_webpage_handle( + 'http://www.nicovideo.jp/watch/' + video_id, video_id) + if video_id.startswith('so'): + video_id = self._match_id(handle.geturl()) + + api_data = self._parse_json(self._html_search_regex( + 'data-api-data="([^"]+)"', webpage, + 'API data', default='{}'), video_id) + except ExtractorError as e: + try: + api_data = self._download_json( + 'https://www.nicovideo.jp/api/watch/v3/%s?_frontendId=6&_frontendVersion=0&actionTrackId=AAAAAAAAAA_%d' % (video_id, round(time.time() * 1000)), video_id, + note='Downloading API JSON', errnote='Unable to fetch data')['data'] + except ExtractorError: + if not isinstance(e.cause, compat_HTTPError): + raise + webpage = e.cause.read().decode('utf-8', 'replace') + error_msg = self._html_search_regex( + r'(?s)<section\s+class="(?:(?:ErrorMessage|WatchExceptionPage-message)\s*)+">(.+?)</section>', + webpage, 'error reason', default=None) + if not error_msg: + raise + raise ExtractorError(re.sub(r'\s+', ' ', error_msg), expected=True) - # Start extracting video formats formats = [] - # Get HTML5 videos info - quality_info = try_get(api_data, lambda x: x['media']['delivery']['movie']) - if not quality_info: - raise ExtractorError('The video can\'t be downloaded', expected=True) - - for audio_quality in quality_info.get('audios') or {}: - for video_quality in quality_info.get('videos') or {}: - if not audio_quality.get('isAvailable') or not video_quality.get('isAvailable'): - continue - formats.append(self._extract_format_for_quality( - api_data, video_id, audio_quality, video_quality)) + def get_video_info(*items, get_first=True, **kwargs): + return traverse_obj(api_data, ('video', *items), get_all=not get_first, **kwargs) - # Get flv/swf info - timestamp = None - video_real_url = try_get(api_data, lambda x: x['video']['smileInfo']['url']) - if video_real_url: - is_economy = video_real_url.endswith('low') - - if is_economy: - self.report_warning('Site is currently in economy mode! 
You will only have access to lower quality streams') - - # Invoking ffprobe to determine resolution - pp = FFmpegPostProcessor(self._downloader) - cookies = self._get_cookies('https://nicovideo.jp').output(header='', sep='; path=/; domain=nicovideo.jp;\n') - - self.to_screen('%s: %s' % (video_id, 'Checking smile format with ffprobe')) - - try: - metadata = pp.get_metadata_object(video_real_url, ['-cookies', cookies]) - except PostProcessingError as err: - raise ExtractorError(err.msg, expected=True) - - v_stream = a_stream = {} - - # Some complex swf files doesn't have video stream (e.g. nm4809023) - for stream in metadata['streams']: - if stream['codec_type'] == 'video': - v_stream = stream - elif stream['codec_type'] == 'audio': - a_stream = stream - - # Community restricted videos seem to have issues with the thumb API not returning anything at all - filesize = int( - (get_video_info_xml('size_high') if not is_economy else get_video_info_xml('size_low')) - or metadata['format']['size'] - ) - extension = ( - get_video_info_xml('movie_type') - or 'mp4' if 'mp4' in metadata['format']['format_name'] else metadata['format']['format_name'] - ) - - # 'creation_time' tag on video stream of re-encoded SMILEVIDEO mp4 files are '1970-01-01T00:00:00.000000Z'. - timestamp = ( - parse_iso8601(get_video_info_web('first_retrieve')) - or unified_timestamp(get_video_info_web('postedDateTime')) - ) - metadata_timestamp = ( - parse_iso8601(try_get(v_stream, lambda x: x['tags']['creation_time'])) - or timestamp if extension != 'mp4' else 0 - ) - - # According to compconf, smile videos from pre-2017 are always better quality than their DMC counterparts - smile_threshold_timestamp = parse_iso8601('2016-12-08T00:00:00+09:00') - - is_source = timestamp < smile_threshold_timestamp or metadata_timestamp > 0 - - # If movie file size is unstable, old server movie is not source movie. - if filesize > 1: - formats.append({ - 'url': video_real_url, - 'format_id': 'smile' if not is_economy else 'smile_low', - 'format_note': 'SMILEVIDEO source' if not is_economy else 'SMILEVIDEO low quality', - 'ext': extension, - 'container': extension, - 'vcodec': v_stream.get('codec_name'), - 'acodec': a_stream.get('codec_name'), - # Some complex swf files doesn't have total bit rate metadata (e.g. 
nm6049209) - 'tbr': int_or_none(metadata['format'].get('bit_rate'), scale=1000), - 'vbr': int_or_none(v_stream.get('bit_rate'), scale=1000), - 'abr': int_or_none(a_stream.get('bit_rate'), scale=1000), - 'height': int_or_none(v_stream.get('height')), - 'width': int_or_none(v_stream.get('width')), - 'source_preference': 5 if not is_economy else -2, - 'quality': 5 if is_source and not is_economy else None, - 'filesize': filesize - }) + quality_info = api_data['media']['delivery']['movie'] + session_api_data = quality_info['session'] + for (audio_quality, video_quality, protocol) in itertools.product(quality_info['audios'], quality_info['videos'], session_api_data['protocols']): + fmt = self._extract_format_for_quality(video_id, audio_quality, video_quality, protocol) + if fmt: + formats.append(fmt) self._sort_formats(formats) # Start extracting information - title = ( - get_video_info_xml('title') # prefer to get the untranslated original title - or get_video_info_web(['originalTitle', 'title']) - or self._og_search_title(webpage, default=None) - or self._html_search_regex( - r'<span[^>]+class="videoHeaderTitle"[^>]*>([^<]+)</span>', - webpage, 'video title')) - - watch_api_data_string = self._html_search_regex( - r'<div[^>]+id="watchAPIDataContainer"[^>]+>([^<]+)</div>', - webpage, 'watch api data', default=None) - watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {} - video_detail = watch_api_data.get('videoDetail', {}) - - thumbnail = ( - self._html_search_regex(r'<meta property="og:image" content="([^"]+)">', webpage, 'thumbnail data', default=None) - or dict_get( # choose highest from 720p to 240p - get_video_info_web('thumbnail'), - ['ogp', 'player', 'largeUrl', 'middleUrl', 'url']) - or self._html_search_meta('image', webpage, 'thumbnail', default=None) - or video_detail.get('thumbnail')) - - description = get_video_info_web('description') - - if not timestamp: - match = self._html_search_meta('datePublished', webpage, 'date published', default=None) - if match: - timestamp = parse_iso8601(match.replace('+', ':00+')) - if not timestamp and video_detail.get('postedAt'): - timestamp = parse_iso8601( - video_detail['postedAt'].replace('/', '-'), - delimiter=' ', timezone=datetime.timedelta(hours=9)) - timestamp = timestamp or try_get(api_data, lambda x: parse_iso8601(x['video']['registeredAt'])) - - view_count = int_or_none(get_video_info_web(['view_counter', 'viewCount'])) - if not view_count: - match = self._html_search_regex( - r'>Views: <strong[^>]*>([^<]+)</strong>', - webpage, 'view count', default=None) - if match: - view_count = int_or_none(match.replace(',', '')) - view_count = ( - view_count - or video_detail.get('viewCount') - or try_get(api_data, lambda x: x['video']['count']['view'])) - - comment_count = ( - int_or_none(get_video_info_web('comment_num')) - or video_detail.get('commentCount') - or try_get(api_data, lambda x: x['video']['count']['comment'])) - - if not comment_count: - match = self._html_search_regex( - r'>Comments: <strong[^>]*>([^<]+)</strong>', - webpage, 'comment count', default=None) - if match: - comment_count = int_or_none(match.replace(',', '')) - - duration = (parse_duration( - get_video_info_web('length') - or self._html_search_meta( - 'video:duration', webpage, 'video duration', default=None)) - or video_detail.get('length') - or get_video_info_web('duration')) - - webpage_url = get_video_info_web('watch_url') or url - - # for channel movie and community movie - channel_id = try_get( - api_data, - 
(lambda x: x['channel']['globalId'], - lambda x: x['community']['globalId'])) - channel = try_get( - api_data, - (lambda x: x['channel']['name'], - lambda x: x['community']['name'])) - - # Note: cannot use api_data.get('owner', {}) because owner may be set to "null" - # in the JSON, which will cause None to be returned instead of {}. - owner = try_get(api_data, lambda x: x.get('owner'), dict) or {} - uploader_id = str_or_none( - get_video_info_web(['ch_id', 'user_id']) - or owner.get('id') - or channel_id - ) - uploader = ( - get_video_info_web(['ch_name', 'user_nickname']) - or owner.get('nickname') - or channel - ) + tags = None + if webpage: + # use og:video:tag (not logged in) + og_video_tags = re.finditer(r'<meta\s+property="og:video:tag"\s*content="(.*?)">', webpage) + tags = list(filter(None, (clean_html(x.group(1)) for x in og_video_tags))) + if not tags: + # use keywords and split with comma (not logged in) + kwds = self._html_search_meta('keywords', webpage, default=None) + if kwds: + tags = [x for x in kwds.split(',') if x] + if not tags: + # find in json (logged in) + tags = traverse_obj(api_data, ('tag', 'items', ..., 'name')) return { 'id': video_id, '_api_data': api_data, - 'title': title, + 'title': get_video_info(('originalTitle', 'title')) or self._og_search_title(webpage, default=None), 'formats': formats, - 'thumbnail': thumbnail, - 'description': description, - 'uploader': uploader, - 'timestamp': timestamp, - 'uploader_id': uploader_id, - 'channel': channel, - 'channel_id': channel_id, - 'view_count': view_count, - 'comment_count': comment_count, - 'duration': duration, - 'webpage_url': webpage_url, + 'thumbnail': get_video_info('thumbnail', 'url') or self._html_search_meta( + ('image', 'og:image'), webpage, 'thumbnail', default=None), + 'description': clean_html(get_video_info('description')), + 'uploader': traverse_obj(api_data, ('owner', 'nickname')), + 'timestamp': parse_iso8601(get_video_info('registeredAt')) or parse_iso8601( + self._html_search_meta('video:release_date', webpage, 'date published', default=None)), + 'uploader_id': traverse_obj(api_data, ('owner', 'id')), + 'channel': traverse_obj(api_data, ('channel', 'name'), ('community', 'name')), + 'channel_id': traverse_obj(api_data, ('channel', 'id'), ('community', 'id')), + 'view_count': int_or_none(get_video_info('count', 'view')), + 'tags': tags, + 'genre': traverse_obj(api_data, ('genre', 'label'), ('genre', 'key')), + 'comment_count': get_video_info('count', 'comment', expected_type=int), + 'duration': ( + parse_duration(self._html_search_meta('video:duration', webpage, 'video duration', default=None)) + or get_video_info('duration')), + 'webpage_url': url_or_none(url) or f'https://www.nicovideo.jp/watch/{video_id}', + 'subtitles': self.extract_subtitles(video_id, api_data, session_api_data), } + def _get_subtitles(self, video_id, api_data, session_api_data): + comment_user_key = traverse_obj(api_data, ('comment', 'keys', 'userKey')) + user_id_str = session_api_data.get('serviceUserId') + + thread_ids = [x for x in traverse_obj(api_data, ('comment', 'threads')) or [] if x['isActive']] + raw_danmaku = self._extract_all_comments(video_id, thread_ids, user_id_str, comment_user_key) + if not raw_danmaku: + self.report_warning(f'Failed to get comments. 
{bug_reports_message()}') + return + return { + 'comments': [{ + 'ext': 'json', + 'data': json.dumps(raw_danmaku), + }], + } + + def _extract_all_comments(self, video_id, threads, user_id, user_key): + auth_data = { + 'user_id': user_id, + 'userkey': user_key, + } if user_id and user_key else {'user_id': ''} + + # Request Start + post_data = [{'ping': {'content': 'rs:0'}}] + for i, thread in enumerate(threads): + thread_id = thread['id'] + thread_fork = thread['fork'] + # Post Start (2N) + post_data.append({'ping': {'content': f'ps:{i * 2}'}}) + post_data.append({'thread': { + 'fork': thread_fork, + 'language': 0, + 'nicoru': 3, + 'scores': 1, + 'thread': thread_id, + 'version': '20090904', + 'with_global': 1, + **auth_data, + }}) + # Post Final (2N) + post_data.append({'ping': {'content': f'pf:{i * 2}'}}) + + # Post Start (2N+1) + post_data.append({'ping': {'content': f'ps:{i * 2 + 1}'}}) + post_data.append({'thread_leaves': { + # format is '<bottom of minute range>-<top of minute range>:<comments per minute>,<total last comments' + # unfortunately NND limits (deletes?) comment returns this way, so you're only able to grab the last 1000 per language + 'content': '0-999999:999999,999999,nicoru:999999', + 'fork': thread_fork, + 'language': 0, + 'nicoru': 3, + 'scores': 1, + 'thread': thread_id, + **auth_data, + }}) + # Post Final (2N+1) + post_data.append({'ping': {'content': f'pf:{i * 2 + 1}'}}) + # Request Final + post_data.append({'ping': {'content': 'rf:0'}}) + + for api_url in self._COMMENT_API_ENDPOINTS: + comments = self._download_json( + api_url, video_id, data=json.dumps(post_data).encode(), fatal=False, + headers={ + 'Referer': 'https://www.nicovideo.jp/watch/%s' % video_id, + 'Origin': 'https://www.nicovideo.jp', + 'Content-Type': 'text/plain;charset=UTF-8', + }, + note='Downloading comments', errnote=f'Failed to access endpoint {api_url}') + if comments: + return comments + class NiconicoPlaylistBaseIE(InfoExtractor): _PAGE_SIZE = 100 diff --git a/yt_dlp/extractor/nitter.py b/yt_dlp/extractor/nitter.py index a0546cda0..8bb709cd7 100644 --- a/yt_dlp/extractor/nitter.py +++ b/yt_dlp/extractor/nitter.py @@ -5,7 +5,6 @@ from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( parse_count, - unified_strdate, unified_timestamp, remove_end, determine_ext, @@ -25,6 +24,16 @@ class NitterIE(InfoExtractor): 'nitter.v6vgyqpa7yefkorazmg5d5fimstmvm2vtbirt6676mt7qmllrcnwycqd.onion', 'i23nv6w3juvzlw32xzoxcqzktegd4i4fu3nmnc2ewv4ggiu4ledwklad.onion', '26oq3gioiwcmfojub37nz5gzbkdiqp7fue5kvye7d4txv4ny6fb4wwid.onion', + 'vfaomgh4jxphpbdfizkm5gbtjahmei234giqj4facbwhrfjtcldauqad.onion', + 'iwgu3cv7ywf3gssed5iqtavmrlszgsxazkmwwnt4h2kdait75thdyrqd.onion', + 'erpnncl5nhyji3c32dcfmztujtl3xaddqb457jsbkulq24zqq7ifdgad.onion', + 'ckzuw5misyahmg7j5t5xwwuj3bwy62jfolxyux4brfflramzsvvd3syd.onion', + 'jebqj47jgxleaiosfcxfibx2xdahjettuydlxbg64azd4khsxv6kawid.onion', + 'nttr2iupbb6fazdpr2rgbooon2tzbbsvvkagkgkwohhodjzj43stxhad.onion', + 'nitraeju2mipeziu2wtcrqsxg7h62v5y4eqgwi75uprynkj74gevvuqd.onion', + 'nitter.lqs5fjmajyp7rvp4qvyubwofzi6d4imua7vs237rkc4m5qogitqwrgyd.onion', + 'ibsboeui2im5o7dxnik3s5yghufumgy5abevtij5nbizequfpu4qi4ad.onion', + 'ec5nvbycpfa5k6ro77blxgkyrzbkv7uy6r5cngcbkadtjj2733nm3uyd.onion', 'nitter.i2p', 'u6ikd6zndl3c4dsdq4mmujpntgeevdk5qzkfb57r4tnfeccrn2qa.b32.i2p', @@ -36,28 +45,55 @@ class NitterIE(InfoExtractor): 'nitter.42l.fr', 'nitter.pussthecat.org', 'nitter.nixnet.services', - 'nitter.mastodont.cat', - 'nitter.tedomum.net', 'nitter.fdn.fr', 
'nitter.1d4.us', 'nitter.kavin.rocks', - 'tweet.lambda.dance', - 'nitter.cc', - 'nitter.vxempire.xyz', 'nitter.unixfox.eu', 'nitter.domain.glass', - 'nitter.himiko.cloud', 'nitter.eu', 'nitter.namazso.eu', - 'nitter.mailstation.de', 'nitter.actionsack.com', - 'nitter.cattube.org', - 'nitter.dark.fail', 'birdsite.xanny.family', - 'nitter.40two.app', - 'nitter.skrep.in', + 'nitter.hu', + 'twitr.gq', + 'nitter.moomoo.me', + 'nittereu.moomoo.me', + 'bird.from.tf', + 'nitter.it', + 'twitter.censors.us', + 'twitter.grimneko.de', + 'nitter.alefvanoon.xyz', + 'n.hyperborea.cloud', + 'nitter.ca', + 'twitter.076.ne.jp', + 'twitter.mstdn.social', + 'nitter.fly.dev', + 'notabird.site', + 'nitter.weiler.rocks', + 'nitter.silkky.cloud', + 'nitter.sethforprivacy.com', + 'nttr.stream', + 'nitter.cutelab.space', + 'nitter.nl', + 'nitter.mint.lgbt', + 'nitter.bus-hit.me', + 'fuckthesacklers.network', + 'nitter.govt.land', + 'nitter.datatunnel.xyz', + 'nitter.esmailelbob.xyz', + 'tw.artemislena.eu', + 'de.nttr.stream', + 'nitter.winscloud.net', + 'nitter.tiekoetter.com', + 'nitter.spaceint.fr', + 'twtr.bch.bar', + 'nitter.exonip.de', + 'nitter.mastodon.pro', + 'nitter.notraxx.ch', + # not in the list anymore + 'nitter.skrep.in', 'nitter.snopyta.org', ) @@ -68,96 +104,121 @@ class NitterIE(InfoExtractor): # official, rate limited 'nitter.net', # offline + 'is-nitter.resolv.ee', + 'lu-nitter.resolv.ee', 'nitter.13ad.de', + 'nitter.40two.app', + 'nitter.cattube.org', + 'nitter.cc', + 'nitter.dark.fail', + 'nitter.himiko.cloud', + 'nitter.koyu.space', + 'nitter.mailstation.de', + 'nitter.mastodont.cat', + 'nitter.tedomum.net', + 'nitter.tokhmi.xyz', 'nitter.weaponizedhumiliation.com', + 'nitter.vxempire.xyz', + 'tweet.lambda.dance', ) INSTANCES = NON_HTTP_INSTANCES + HTTP_INSTANCES + DEAD_INSTANCES - _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')' - _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE} + _INSTANCES_RE = f'(?:{"|".join(map(re.escape, INSTANCES))})' + _VALID_URL = fr'https?://{_INSTANCES_RE}/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' current_instance = random.choice(HTTP_INSTANCES) _TESTS = [ { # GIF (wrapped in mp4) - 'url': 'https://%s/firefox/status/1314279897502629888#m' % current_instance, + 'url': f'https://{current_instance}/firefox/status/1314279897502629888#m', 'info_dict': { 'id': '1314279897502629888', 'ext': 'mp4', - 'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet', - 'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. \n\nReport harmful YouTube recommendations so others can avoid watching them. 
➡️ https://mzl.la/3iFIiyg\n\n#UnfckTheInternet', + 'title': 'md5:7890a9277da4639ab624dd899424c5d8', + 'description': 'md5:5fea96a4d3716c350f8b95b21b3111fe', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Firefox 🔥', 'uploader_id': 'firefox', - 'uploader_url': 'https://%s/firefox' % current_instance, + 'uploader_url': f'https://{current_instance}/firefox', 'upload_date': '20201008', 'timestamp': 1602183720, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, }, }, { # normal video - 'url': 'https://%s/Le___Doc/status/1299715685392756737#m' % current_instance, + 'url': f'https://{current_instance}/Le___Doc/status/1299715685392756737#m', 'info_dict': { 'id': '1299715685392756737', 'ext': 'mp4', - 'title': 'Le Doc - "Je ne prédis jamais rien"\nD Raoult, Août 2020...', + 'title': 're:^.* - "Je ne prédis jamais rien"\nD Raoult, Août 2020...', 'description': '"Je ne prédis jamais rien"\nD Raoult, Août 2020...', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Le Doc', + 'uploader': 're:^Le *Doc', 'uploader_id': 'Le___Doc', - 'uploader_url': 'https://%s/Le___Doc' % current_instance, + 'uploader_url': f'https://{current_instance}/Le___Doc', 'upload_date': '20200829', - 'timestamp': 1598711341, + 'timestamp': 1598711340, 'view_count': int, 'like_count': int, 'repost_count': int, 'comment_count': int, }, }, { # video embed in a "Streaming Political Ads" box - 'url': 'https://%s/mozilla/status/1321147074491092994#m' % current_instance, + 'url': f'https://{current_instance}/mozilla/status/1321147074491092994#m', 'info_dict': { 'id': '1321147074491092994', 'ext': 'mp4', - 'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. \n\nLearn more ➡️ https://mzl.la/StreamingAds", - 'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows?\n\nThis isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. 
\n\nLearn more ➡️ https://mzl.la/StreamingAds", + 'title': 'md5:8290664aabb43b9189145c008386bf12', + 'description': 'md5:9cf2762d49674bc416a191a689fb2aaa', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Mozilla', 'uploader_id': 'mozilla', - 'uploader_url': 'https://%s/mozilla' % current_instance, + 'uploader_url': f'https://{current_instance}/mozilla', 'upload_date': '20201027', - 'timestamp': 1603820982 + 'timestamp': 1603820940, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, }, + 'expected_warnings': ['Ignoring subtitle tracks found in the HLS manifest'], }, { # not the first tweet but main-tweet - 'url': 'https://%s/TheNaturalNu/status/1379050895539724290#m' % current_instance, + 'url': f'https://{current_instance}/firefox/status/1354848277481414657#m', 'info_dict': { - 'id': '1379050895539724290', + 'id': '1354848277481414657', 'ext': 'mp4', - 'title': 'Dorothy Zbornak - This had me hollering!!', - 'description': 'This had me hollering!!', + 'title': 'md5:bef647f03bd1c6b15b687ea70dfc9700', + 'description': 'md5:5efba25e2f9dac85ebcd21160cb4341f', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Dorothy Zbornak', - 'uploader_id': 'TheNaturalNu', - 'uploader_url': 'https://%s/TheNaturalNu' % current_instance, - 'timestamp': 1617626329, - 'upload_date': '20210405' + 'uploader': 'Firefox 🔥', + 'uploader_id': 'firefox', + 'uploader_url': f'https://{current_instance}/firefox', + 'upload_date': '20210128', + 'timestamp': 1611855960, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, } } ] def _real_extract(self, url): - video_id = self._match_id(url) + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') parsed_url = compat_urlparse.urlparse(url) - base_url = '%s://%s' % (parsed_url.scheme, parsed_url.netloc) + base_url = f'{parsed_url.scheme}://{parsed_url.netloc}' self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on') - full_webpage = self._download_webpage(url, video_id) + full_webpage = webpage = self._download_webpage(url, video_id) main_tweet_start = full_webpage.find('class="main-tweet"') if main_tweet_start > 0: webpage = full_webpage[main_tweet_start:] - if not webpage: - webpage = full_webpage - video_url = '%s%s' % (base_url, self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')) + video_url = '%s%s' % (base_url, self._html_search_regex( + r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')) ext = determine_ext(video_url) if ext == 'unknown_video': @@ -168,61 +229,49 @@ class NitterIE(InfoExtractor): 'ext': ext }] - title = self._og_search_description(full_webpage) - if not title: - title = self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title') - description = title + title = description = self._og_search_description(full_webpage) or self._html_search_regex( + r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title', fatal=False) - mobj = self._match_valid_url(url) - uploader_id = ( - mobj.group('uploader_id') - or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False) - ) + uploader_id = self._html_search_regex( + r'<a class="username"[^>]+title="@([^"]+)"', webpage, 'uploader id', fatal=False) or uploader_id - if uploader_id: - uploader_url = '%s/%s' % (base_url, uploader_id) + uploader = self._html_search_regex( + r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False) + if 
uploader: + title = f'{uploader} - {title}' - uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False) + counts = { + f'{x[0]}_count': self._html_search_regex( + fr'<span[^>]+class="icon-{x[1]}[^>]*></span>([^<]*)</div>', + webpage, f'{x[0]} count', fatal=False) + for x in (('view', 'play'), ('like', 'heart'), ('repost', 'retweet'), ('comment', 'comment')) + } + counts = {field: 0 if count == '' else parse_count(count) for field, count in counts.items()} - if uploader: - title = '%s - %s' % (uploader, title) - - view_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)</div>', webpage, 'view count', fatal=False)) - like_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)</div>', webpage, 'like count', fatal=False)) - repost_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False)) - comment_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False)) - - thumbnail = self._html_search_meta('og:image', full_webpage, 'thumbnail url') - if not thumbnail: - thumbnail = '%s%s' % (base_url, self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)) - thumbnail = remove_end(thumbnail, '%3Asmall') - - thumbnails = [] - thumbnail_ids = ('thumb', 'small', 'large', 'medium', 'orig') - for id in thumbnail_ids: - thumbnails.append({ - 'id': id, - 'url': thumbnail + '%3A' + id, - }) - - date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False) - upload_date = unified_strdate(date) - timestamp = unified_timestamp(date) + thumbnail = ( + self._html_search_meta('og:image', full_webpage, 'thumbnail url') + or remove_end('%s%s' % (base_url, self._html_search_regex( + r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)), '%3Asmall')) + + thumbnails = [ + {'id': id, 'url': f'{thumbnail}%3A{id}'} + for id in ('thumb', 'small', 'large', 'medium', 'orig') + ] + + date = self._html_search_regex( + r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', + webpage, 'upload date', default='').replace('·', '') return { 'id': video_id, 'title': title, 'description': description, 'uploader': uploader, - 'timestamp': timestamp, + 'timestamp': unified_timestamp(date), 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - 'view_count': view_count, - 'like_count': like_count, - 'repost_count': repost_count, - 'comment_count': comment_count, + 'uploader_url': f'{base_url}/{uploader_id}', 'formats': formats, 'thumbnails': thumbnails, 'thumbnail': thumbnail, - 'upload_date': upload_date, + **counts, } diff --git a/yt_dlp/extractor/njpwworld.py b/yt_dlp/extractor/njpwworld.py index 89380d039..68c8c8e52 100644 --- a/yt_dlp/extractor/njpwworld.py +++ b/yt_dlp/extractor/njpwworld.py @@ -43,15 +43,7 @@ class NJPWWorldIE(InfoExtractor): _LOGIN_URL = 'https://front.njpwworld.com/auth/login' - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - # No authentication to be performed - if not username: - return True - + def _perform_login(self, username, password): # Setup session (will set necessary cookies) self._request_webpage( 'https://njpwworld.com/', None, note='Setting up session') diff --git a/yt_dlp/extractor/noco.py 
b/yt_dlp/extractor/noco.py index 78c4952f4..28af909d5 100644 --- a/yt_dlp/extractor/noco.py +++ b/yt_dlp/extractor/noco.py @@ -61,14 +61,7 @@ class NocoIE(InfoExtractor): } ] - def _real_initialize(self): - self._login() - - def _login(self): - username, password = self._get_login_info() - if username is None: - return - + def _perform_login(self, username, password): login = self._download_json( self._LOGIN_URL, None, 'Logging in', data=urlencode_postdata({ diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py index 36927009d..fe4740aae 100644 --- a/yt_dlp/extractor/openload.py +++ b/yt_dlp/extractor/openload.py @@ -207,7 +207,7 @@ class PhantomJSwrapper(object): replaces = self.options replaces['url'] = url - user_agent = headers.get('User-Agent') or self.get_param('http_headers')['User-Agent'] + user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent'] replaces['ua'] = user_agent.replace('"', '\\"') replaces['jscode'] = jscode diff --git a/yt_dlp/extractor/packtpub.py b/yt_dlp/extractor/packtpub.py index c06fca795..62c52cd6e 100644 --- a/yt_dlp/extractor/packtpub.py +++ b/yt_dlp/extractor/packtpub.py @@ -47,10 +47,7 @@ class PacktPubIE(PacktPubBaseIE): _NETRC_MACHINE = 'packtpub' _TOKEN = None - def _real_initialize(self): - username, password = self._get_login_info() - if username is None: - return + def _perform_login(self, username, password): try: self._TOKEN = self._download_json( 'https://services.packtpub.com/auth-v1/users/tokens', None, diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index d458dfe50..3388f7f39 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -18,12 +18,39 @@ from ..utils import ( int_or_none, OnDemandPagedList, parse_qs, + srt_subtitles_timecode, traverse_obj, ) class PanoptoBaseIE(InfoExtractor): - BASE_URL_RE = r'(?P<base_url>https?://[\w.]+\.panopto.(?:com|eu)/Panopto)' + BASE_URL_RE = r'(?P<base_url>https?://[\w.-]+\.panopto.(?:com|eu)/Panopto)' + + # see panopto core.js + _SUB_LANG_MAPPING = { + 0: 'en-US', + 1: 'en-GB', + 2: 'es-MX', + 3: 'es-ES', + 4: 'de-DE', + 5: 'fr-FR', + 6: 'nl-NL', + 7: 'th-TH', + 8: 'zh-CN', + 9: 'zh-TW', + 10: 'ko-KR', + 11: 'ja-JP', + 12: 'ru-RU', + 13: 'pt-PT', + 14: 'pl-PL', + 15: 'en-AU', + 16: 'da-DK', + 17: 'fi-FI', + 18: 'hu-HU', + 19: 'nb-NO', + 20: 'sv-SE', + 21: 'it-IT' + } def _call_api(self, base_url, path, video_id, data=None, fatal=True, **kwargs): response = self._download_json( @@ -31,7 +58,7 @@ class PanoptoBaseIE(InfoExtractor): fatal=fatal, headers={'accept': 'application/json', 'content-type': 'application/json'}, **kwargs) if not response: return - error_code = response.get('ErrorCode') + error_code = traverse_obj(response, 'ErrorCode') if error_code == 2: self.raise_login_required(method='cookies') elif error_code is not None: @@ -62,10 +89,11 @@ class PanoptoIE(PanoptoBaseIE): 'id': '26b3ae9e-4a48-4dcc-96ba-0befba08a0fb', 'title': 'Panopto for Business - Use Cases', 'timestamp': 1459184200, - 'thumbnail': r're:https://demo\.hosted\.panopto\.com/Panopto/Services/FrameGrabber\.svc/FrameRedirect\?objectId=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb&mode=Delivery&random=[\d.]+', + 'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+', 'upload_date': '20160328', 'ext': 'mp4', 'cast': [], + 'chapters': [], 'duration': 88.17099999999999, 'average_rating': int, 'uploader_id': '2db6b718-47a0-4b0b-9e17-ab0b00f42b1e', @@ -80,10 +108,10 @@ class PanoptoIE(PanoptoBaseIE): 'title': 'Overcoming Top 4 Challenges of 
@@ -80,10 +108,10 @@ class PanoptoIE(PanoptoBaseIE):
             'title': 'Overcoming Top 4 Challenges of Enterprise Video',
             'uploader': 'Panopto Support',
             'timestamp': 1449409251,
-            'thumbnail': r're:https://demo\.hosted\.panopto\.com/Panopto/Services/FrameGrabber\.svc/FrameRedirect\?objectId=ed01b077-c9e5-4c7b-b8ff-15fa306d7a59&mode=Delivery&random=[\d.]+',
+            'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+',
             'upload_date': '20151206',
             'ext': 'mp4',
-            'chapters': 'count:21',
+            'chapters': 'count:12',
             'cast': ['Panopto Support'],
             'uploader_id': 'a96d1a31-b4de-489b-9eee-b4a5b414372c',
             'average_rating': int,
@@ -104,8 +132,9 @@ class PanoptoIE(PanoptoBaseIE):
             'uploader_id': '316a0a58-7fa2-4cd9-be1c-64270d284a56',
             'timestamp': 1569845768,
             'tags': ['Viewer', 'Enterprise'],
+            'chapters': [],
             'upload_date': '20190930',
-            'thumbnail': r're:https://howtovideos\.hosted\.panopto\.com/Panopto/Services/FrameGrabber.svc/FrameRedirect\?objectId=5fa74e93-3d87-4694-b60e-aaa4012214ed&mode=Delivery&random=[\d.]+',
+            'thumbnail': r're:https://howtovideos\.hosted\.panopto\.com/.+',
             'description': 'md5:2d844aaa1b1a14ad0e2601a0993b431f',
             'title': 'Getting Started: View a Video',
             'average_rating': int,
@@ -121,6 +150,7 @@ class PanoptoIE(PanoptoBaseIE):
             'id': '9d9a0fa3-e99a-4ebd-a281-aac2017f4da4',
             'ext': 'mp4',
             'cast': ['LTS CLI Script'],
+            'chapters': [],
             'duration': 2178.45,
             'description': 'md5:ee5cf653919f55b72bce2dbcf829c9fa',
             'channel_id': 'b23e673f-c287-4cb1-8344-aae9005a69f8',
@@ -129,12 +159,78 @@ class PanoptoIE(PanoptoBaseIE):
             'uploader': 'LTS CLI Script',
             'timestamp': 1572458134,
             'title': 'WW2 Vets Interview 3 Ronald Stanley George',
-            'thumbnail': r're:https://unisa\.au\.panopto\.com/Panopto/Services/FrameGrabber.svc/FrameRedirect\?objectId=9d9a0fa3-e99a-4ebd-a281-aac2017f4da4&mode=Delivery&random=[\d.]+',
+            'thumbnail': r're:https://unisa\.au\.panopto\.com/.+',
             'channel': 'World War II Veteran Interviews',
             'upload_date': '20191030',
         },
     }, {
+        # Slides/storyboard
+        'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=a7f12f1d-3872-4310-84b0-f8d8ab15326b',
+        'info_dict': {
+            'id': 'a7f12f1d-3872-4310-84b0-f8d8ab15326b',
+            'ext': 'mhtml',
+            'timestamp': 1448798857,
+            'duration': 4712.681,
+            'title': 'Cache Memory - CompSci 15-213, Lecture 12',
+            'channel_id': 'e4c6a2fc-1214-4ca0-8fb7-aef2e29ff63a',
+            'uploader_id': 'a96d1a31-b4de-489b-9eee-b4a5b414372c',
+            'upload_date': '20151129',
+            'average_rating': 0,
+            'uploader': 'Panopto Support',
+            'channel': 'Showcase Videos',
+            'description': 'md5:55e51d54233ddb0e6c2ed388ca73822c',
+            'cast': ['ISR Videographer', 'Panopto Support'],
+            'chapters': 'count:28',
+            'thumbnail': r're:https://demo\.hosted\.panopto\.com/.+',
+        },
+        'params': {'format': 'mhtml', 'skip_download': True}
+    },
+    {
+        'url': 'https://na-training-1.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=8285224a-9a2b-4957-84f2-acb0000c4ea9',
+        'info_dict': {
+            'id': '8285224a-9a2b-4957-84f2-acb0000c4ea9',
+            'ext': 'mp4',
+            'chapters': [],
+            'title': 'Company Policy',
+            'average_rating': 0,
+            'timestamp': 1615058901,
+            'channel': 'Human Resources',
+            'tags': ['HumanResources'],
+            'duration': 1604.243,
+            'thumbnail': r're:https://na-training-1\.hosted\.panopto\.com/.+',
+            'uploader_id': '8e8ba0a3-424f-40df-a4f1-ab3a01375103',
+            'uploader': 'Cait M.',
+            'upload_date': '20210306',
+            'cast': ['Cait M.'],
+            'subtitles': {'en-US': [{'ext': 'srt', 'data': 'md5:a3f4d25963fdeace838f327097c13265'}],
+                          'es-ES': [{'ext': 'srt', 'data': 'md5:57e9dad365fd0fbaf0468eac4949f189'}]},
+        },
+        'params': {'writesubtitles': True, 'skip_download': True}
+    }, {
+        # On Panopto there are two subs: "Default" and en-US. en-US is blank and should be skipped.
+        'url': 'https://na-training-1.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=940cbd41-f616-4a45-b13e-aaf1000c915b',
+        'info_dict': {
+            'id': '940cbd41-f616-4a45-b13e-aaf1000c915b',
+            'ext': 'mp4',
+            'subtitles': 'count:1',
+            'title': 'HR Benefits Review Meeting*',
+            'cast': ['Panopto Support'],
+            'chapters': [],
+            'timestamp': 1575024251,
+            'thumbnail': r're:https://na-training-1\.hosted\.panopto\.com/.+',
+            'channel': 'Zoom',
+            'description': 'md5:04f90a9c2c68b7828144abfb170f0106',
+            'uploader': 'Panopto Support',
+            'average_rating': 0,
+            'duration': 409.34499999999997,
+            'uploader_id': 'b6ac04ad-38b8-4724-a004-a851004ea3df',
+            'upload_date': '20191129',
+
+        },
+        'params': {'writesubtitles': True, 'skip_download': True}
+    },
+    {
         'url': 'https://ucc.cloud.panopto.eu/Panopto/Pages/Viewer.aspx?id=0e8484a4-4ceb-4d98-a63f-ac0200b455cb',
         'only_matching': True
     },
@@ -178,19 +274,82 @@ class PanoptoIE(PanoptoBaseIE):
             note='Marking watched', errnote='Unable to mark watched')
 
     @staticmethod
-    def _extract_chapters(delivery):
+    def _extract_chapters(timestamps):
         chapters = []
-        for timestamp in delivery.get('Timestamps', []):
+        for timestamp in timestamps or []:
+            caption = timestamp.get('Caption')
             start, duration = int_or_none(timestamp.get('Time')), int_or_none(timestamp.get('Duration'))
-            if start is None or duration is None:
+            if not caption or start is None or duration is None:
                 continue
             chapters.append({
                 'start_time': start,
                 'end_time': start + duration,
-                'title': timestamp.get('Caption')
+                'title': caption
             })
         return chapters
 
+    @staticmethod
+    def _extract_mhtml_formats(base_url, timestamps):
+        image_frags = {}
+        for timestamp in timestamps or []:
+            duration = timestamp.get('Duration')
+            obj_id, obj_sn = timestamp.get('ObjectIdentifier'), timestamp.get('ObjectSequenceNumber'),
+            if timestamp.get('EventTargetType') == 'PowerPoint' and obj_id is not None and obj_sn is not None:
+                image_frags.setdefault('slides', []).append({
+                    'url': base_url + f'/Pages/Viewer/Image.aspx?id={obj_id}&number={obj_sn}',
+                    'duration': duration
+                })
+
+            obj_pid, session_id, abs_time = timestamp.get('ObjectPublicIdentifier'), timestamp.get('SessionID'), timestamp.get('AbsoluteTime')
+            if None not in (obj_pid, session_id, abs_time):
+                image_frags.setdefault('chapter', []).append({
+                    'url': base_url + f'/Pages/Viewer/Thumb.aspx?eventTargetPID={obj_pid}&sessionPID={session_id}&number={obj_sn}&isPrimary=false&absoluteTime={abs_time}',
+                    'duration': duration,
+                })
+        for name, fragments in image_frags.items():
+            yield {
+                'format_id': name,
+                'ext': 'mhtml',
+                'protocol': 'mhtml',
+                'acodec': 'none',
+                'vcodec': 'none',
+                'url': 'about:invalid',
+                'fragments': fragments
+            }
+
+    @staticmethod
+    def _json2srt(data, delivery):
+        def _gen_lines():
+            for i, line in enumerate(data):
+                start_time = line['Time']
+                duration = line.get('Duration')
+                if duration:
+                    end_time = start_time + duration
+                else:
+                    end_time = traverse_obj(data, (i + 1, 'Time')) or delivery['Duration']
+                yield f'{i + 1}\n{srt_subtitles_timecode(start_time)} --> {srt_subtitles_timecode(end_time)}\n{line["Caption"]}'
+        return '\n\n'.join(_gen_lines())
+
+    def _get_subtitles(self, base_url, video_id, delivery):
+        subtitles = {}
+        for lang in delivery.get('AvailableLanguages') or []:
+            response = self._call_api(
+                base_url, '/Pages/Viewer/DeliveryInfo.aspx', video_id, fatal=False,
+                note='Downloading captions JSON metadata', query={
+                    'deliveryId': video_id,
+                    'getCaptions': True,
+                    'language': str(lang),
+                    'responseType': 'json'
+                }
+            )
+            if not isinstance(response, list):
+                continue
+            subtitles.setdefault(self._SUB_LANG_MAPPING.get(lang) or 'default', []).append({
+                'ext': 'srt',
+                'data': self._json2srt(response, delivery),
+            })
+        return subtitles
+
     def _extract_streams_formats_and_subtitles(self, video_id, streams, **fmt_kwargs):
         formats = []
         subtitles = {}
@@ -240,6 +399,7 @@ class PanoptoIE(PanoptoBaseIE):
 
         delivery = delivery_info['Delivery']
         session_start_time = int_or_none(delivery.get('SessionStartTime'))
+        timestamps = delivery.get('Timestamps')
 
         # Podcast stream is usually the combined streams. We will prefer that by default.
         podcast_formats, podcast_subtitles = self._extract_streams_formats_and_subtitles(
@@ -249,9 +409,11 @@ class PanoptoIE(PanoptoBaseIE):
             video_id, delivery.get('Streams'), preference=-10)
 
         formats = podcast_formats + streams_formats
-        subtitles = self._merge_subtitles(podcast_subtitles, streams_subtitles)
-        self._sort_formats(formats)
+        formats.extend(self._extract_mhtml_formats(base_url, timestamps))
+        subtitles = self._merge_subtitles(
+            podcast_subtitles, streams_subtitles, self.extract_subtitles(base_url, video_id, delivery))
 
+        self._sort_formats(formats)
         self.mark_watched(base_url, video_id, delivery_info)
 
         return {
@@ -262,7 +424,7 @@ class PanoptoIE(PanoptoBaseIE):
             'duration': delivery.get('Duration'),
             'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}',
             'average_rating': delivery.get('AverageRating'),
-            'chapters': self._extract_chapters(delivery) or None,
+            'chapters': self._extract_chapters(timestamps),
             'uploader': delivery.get('OwnerDisplayName') or None,
             'uploader_id': delivery.get('OwnerId'),
             'description': delivery.get('SessionAbstract'),
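A toy run of the `_json2srt` helper added above. The input shape (`Time`/`Duration`/`Caption` keys, with `delivery['Duration']` as the final fallback end time) is taken from this diff; the sample values are made up:

    from yt_dlp.extractor.panopto import PanoptoIE

    captions = [
        {'Time': 0.0, 'Duration': 2.5, 'Caption': 'Hello'},
        {'Time': 2.5, 'Caption': 'World'},  # no Duration: ends at next cue or delivery end
    ]
    print(PanoptoIE._json2srt(captions, {'Duration': 5.0}))
    # 1
    # 00:00:00,000 --> 00:00:02,500
    # Hello
    #
    # 2
    # 00:00:02,500 --> 00:00:05,000
    # World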
diff --git a/yt_dlp/extractor/paramountplus.py b/yt_dlp/extractor/paramountplus.py
index 17138985a..94a9319ea 100644
--- a/yt_dlp/extractor/paramountplus.py
+++ b/yt_dlp/extractor/paramountplus.py
@@ -1,4 +1,5 @@
 from __future__ import unicode_literals
+import itertools
 
 from .common import InfoExtractor
 from .cbs import CBSBaseIE
@@ -13,12 +14,12 @@ class ParamountPlusIE(CBSBaseIE):
                 (?:
                     paramountplus:|
                     https?://(?:www\.)?(?:
-                        paramountplus\.com/(?:shows/[^/]+/video|movies/[^/]+)/
+                        paramountplus\.com/(?:shows|movies)/(?:video|[^/]+/video|[^/]+)/
                 )(?P<id>[\w-]+))'''
 
     # All tests are blocked outside US
     _TESTS = [{
-        'url': 'https://www.paramountplus.com/shows/catdog/video/Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k/catdog-climb-every-catdog-the-canine-mutiny/',
+        'url': 'https://www.paramountplus.com/shows/video/Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k/',
         'info_dict': {
             'id': 'Oe44g5_NrlgiZE3aQVONleD6vXc8kP0k',
             'ext': 'mp4',
@@ -33,7 +34,7 @@ class ParamountPlusIE(CBSBaseIE):
             'skip_download': 'm3u8',
         },
     }, {
-        'url': 'https://www.paramountplus.com/shows/tooning-out-the-news/video/6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd/7-23-21-week-in-review-rep-jahana-hayes-howard-fineman-sen-michael-bennet-sheera-frenkel-cecilia-kang-/',
+        'url': 'https://www.paramountplus.com/shows/video/6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd/',
         'info_dict': {
             'id': '6hSWYWRrR9EUTz7IEe5fJKBhYvSUfexd',
             'ext': 'mp4',
@@ -48,7 +49,7 @@ class ParamountPlusIE(CBSBaseIE):
         },
     }, {
-        'url': 'https://www.paramountplus.com/movies/daddys-home/vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC',
+        'url': 'https://www.paramountplus.com/movies/video/vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC/',
         'info_dict': {
             'id': 'vM2vm0kE6vsS2U41VhMRKTOVHyQAr6pC',
             'ext': 'mp4',
@@ -63,7 +64,7 @@ class ParamountPlusIE(CBSBaseIE):
         },
         'expected_warnings': ['Ignoring subtitle tracks'],  # TODO: Investigate this
     }, {
-        'url': 'https://www.paramountplus.com/movies/sonic-the-hedgehog/5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc',
+        'url': 'https://www.paramountplus.com/movies/video/5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc/',
         'info_dict': {
             'id': '5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc',
             'ext': 'mp4',
@@ -78,10 +79,16 @@ class ParamountPlusIE(CBSBaseIE):
         },
         'expected_warnings': ['Ignoring subtitle tracks'],
     }, {
-        'url': 'https://www.paramountplus.com/shows/all-rise/video/QmR1WhNkh1a_IrdHZrbcRklm176X_rVc/all-rise-space/',
+        'url': 'https://www.paramountplus.com/shows/the-real-world/video/mOVeHeL9ub9yWdyzSZFYz8Uj4ZBkVzQg/the-real-world-reunion/',
         'only_matching': True,
     }, {
-        'url': 'https://www.paramountplus.com/movies/million-dollar-american-princesses-meghan-and-harry/C0LpgNwXYeB8txxycdWdR9TjxpJOsdCq',
+        'url': 'https://www.paramountplus.com/shows/video/mOVeHeL9ub9yWdyzSZFYz8Uj4ZBkVzQg/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.paramountplus.com/movies/video/W0VyStQqUnqKzJkrpSAIARuCc9YuYGNy/',
+        'only_matching': True,
+    }, {
+        'url': 'https://www.paramountplus.com/movies/paw-patrol-the-movie/W0VyStQqUnqKzJkrpSAIARuCc9YuYGNy/',
         'only_matching': True,
     }]
 
@@ -128,11 +135,13 @@ class ParamountPlusSeriesIE(InfoExtractor):
             'id': 'spongebob-squarepants',
         }
     }]
-    _API_URL = 'https://www.paramountplus.com/shows/{}/xhr/episodes/page/0/size/100000/xs/0/season/0/'
 
     def _entries(self, show_name):
-        show_json = self._download_json(self._API_URL.format(show_name), video_id=show_name)
-        if show_json.get('success'):
+        for page in itertools.count():
+            show_json = self._download_json(
+                f'https://www.paramountplus.com/shows/{show_name}/xhr/episodes/page/{page}/size/50/xs/0/season/0', show_name)
+            if not show_json.get('success'):
+                return
             for episode in show_json['result']['data']:
                 yield self.url_result(
                     'https://www.paramountplus.com%s' % episode['url'],
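The series extractor above now pages through the XHR endpoint 50 episodes at a time instead of requesting a single page of size 100000. The generic shape of that loop as a standalone sketch, where `fetch_page` is a hypothetical stand-in for the `_download_json` call:

    import itertools

    def paged_entries(fetch_page):
        # Walk pages until the API stops reporting success.
        for page in itertools.count():
            data = fetch_page(page)
            if not data.get('success'):
                return
            yield from data['result']['data']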
diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py
index d3ee071e0..963a0d6fb 100644
--- a/yt_dlp/extractor/patreon.py
+++ b/yt_dlp/extractor/patreon.py
@@ -88,11 +88,7 @@ class PatreonIE(InfoExtractor):
     # Currently Patreon exposes download URL via hidden CSS, so login is not
     # needed. Keeping this commented for when this inevitably changes.
     '''
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_form = {
             'redirectUrl': 'http://www.patreon.com/',
             'email': username,
@@ -108,8 +104,6 @@ class PatreonIE(InfoExtractor):
         if re.search(r'onLoginFailed', login_page):
             raise ExtractorError('Unable to login, incorrect username and/or password', expected=True)
 
-    def _real_initialize(self):
-        self._login()
     '''
 
     def _real_extract(self, url):

diff --git a/yt_dlp/extractor/piapro.py b/yt_dlp/extractor/piapro.py
index 497e1edbc..c4eb4913f 100644
--- a/yt_dlp/extractor/piapro.py
+++ b/yt_dlp/extractor/piapro.py
@@ -29,13 +29,9 @@ class PiaproIE(InfoExtractor):
         }
     }]
 
-    def _real_initialize(self):
-        self._login_status = self._login()
+    _login_status = False
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if not username:
-            return False
+    def _perform_login(self, username, password):
         login_ok = True
         login_form_strs = {
             '_username': username,
@@ -57,7 +53,7 @@ class PiaproIE(InfoExtractor):
         if not login_ok:
             self.report_warning(
                 'unable to log in: bad username or password')
-        return login_ok
+        self._login_status = login_ok
 
     def _real_extract(self, url):
         video_id = self._match_id(url)

diff --git a/yt_dlp/extractor/platzi.py b/yt_dlp/extractor/platzi.py
index 23c8256b5..17f52e7f4 100644
--- a/yt_dlp/extractor/platzi.py
+++ b/yt_dlp/extractor/platzi.py
@@ -22,14 +22,7 @@ class PlatziBaseIE(InfoExtractor):
     _LOGIN_URL = 'https://platzi.com/login/'
     _NETRC_MACHINE = 'platzi'
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')

diff --git a/yt_dlp/extractor/playplustv.py b/yt_dlp/extractor/playplustv.py
index fd72a3717..cad2c3a0f 100644
--- a/yt_dlp/extractor/playplustv.py
+++ b/yt_dlp/extractor/playplustv.py
@@ -38,14 +38,10 @@ class PlayPlusTVIE(InfoExtractor):
                 'Authorization': 'Bearer ' + self._token,
             }, query=query)
 
-    def _real_initialize(self):
-        email, password = self._get_login_info()
-        if email is None:
-            self.raise_login_required()
-
+    def _perform_login(self, username, password):
         req = PUTRequest(
             'https://api.playplus.tv/api/web/login', json.dumps({
-                'email': email,
+                'email': username,
                 'password': password,
             }).encode(), {
                 'Content-Type': 'application/json; charset=utf-8',
@@ -61,6 +57,10 @@ class PlayPlusTVIE(InfoExtractor):
 
         self._profile = self._call_api('Profiles')['list'][0]['_id']
 
+    def _real_initialize(self):
+        if not self._token:
+            self.raise_login_required(method='password')
+
     def _real_extract(self, url):
         project_id, media_id = self._match_valid_url(url).groups()
         media = self._call_api(

diff --git a/yt_dlp/extractor/pluralsight.py b/yt_dlp/extractor/pluralsight.py
index 801057ee1..2a5e0e488 100644
--- a/yt_dlp/extractor/pluralsight.py
+++ b/yt_dlp/extractor/pluralsight.py
@@ -162,14 +162,7 @@ query viewClip {
   }
 }'''
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')
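PlayPlusTV above (and PokerGo just below) illustrate the other half of the login refactor: `_perform_login` stores a token, while `_real_initialize`, which always runs, merely checks that one exists. A condensed, hypothetical sketch of the pattern, with names mirroring the diff:

    class TokenGatedIE:  # stand-in for the InfoExtractor subclasses in this diff
        _token = None

        def _perform_login(self, username, password):
            # placeholder for the site's real sign-in API call
            TokenGatedIE._token = f'token-for-{username}'

        def _real_initialize(self):
            if not self._token:
                # yt-dlp uses self.raise_login_required(method='password') here
                raise RuntimeError('login required (method: password)')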
diff --git a/yt_dlp/extractor/pokergo.py b/yt_dlp/extractor/pokergo.py
index d27031c91..c9e2fed12 100644
--- a/yt_dlp/extractor/pokergo.py
+++ b/yt_dlp/extractor/pokergo.py
@@ -15,11 +15,9 @@ class PokerGoBaseIE(InfoExtractor):
     _AUTH_TOKEN = None
     _PROPERTY_ID = '1dfb3940-7d53-4980-b0b0-f28b369a000d'
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if not username:
-            self.raise_login_required(method='password')
-
+    def _perform_login(self, username, password):
+        if self._AUTH_TOKEN:
+            return self.report_login()
         PokerGoBaseIE._AUTH_TOKEN = self._download_json(
             f'https://subscription.pokergo.com/properties/{self._PROPERTY_ID}/sign-in', None,
@@ -30,7 +28,7 @@ class PokerGoBaseIE(InfoExtractor):
 
     def _real_initialize(self):
         if not self._AUTH_TOKEN:
-            self._login()
+            self.raise_login_required(method='password')
 
 
 class PokerGoIE(PokerGoBaseIE):

diff --git a/yt_dlp/extractor/roosterteeth.py b/yt_dlp/extractor/roosterteeth.py
index 652fdd116..a55dd4f8b 100644
--- a/yt_dlp/extractor/roosterteeth.py
+++ b/yt_dlp/extractor/roosterteeth.py
@@ -21,10 +21,7 @@ class RoosterTeethBaseIE(InfoExtractor):
     _API_BASE = 'https://svod-be.roosterteeth.com'
     _API_BASE_URL = f'{_API_BASE}/api/v1'
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
+    def _perform_login(self, username, password):
         if self._get_cookies(self._API_BASE_URL).get('rt_access_token'):
             return
 
@@ -47,9 +44,6 @@ class RoosterTeethBaseIE(InfoExtractor):
                 msg += ': ' + error
             self.report_warning(msg)
 
-    def _real_initialize(self):
-        self._login()
-
    def _extract_video_info(self, data):
         thumbnails = []
         for image in traverse_obj(data, ('included', 'images')):

diff --git a/yt_dlp/extractor/rumble.py b/yt_dlp/extractor/rumble.py
index 49c1f4485..a0d5f88d9 100644
--- a/yt_dlp/extractor/rumble.py
+++ b/yt_dlp/extractor/rumble.py
@@ -11,6 +11,7 @@ from ..utils import (
     int_or_none,
     parse_iso8601,
     try_get,
+    unescapeHTML,
     ExtractorError,
 )
 
@@ -28,6 +29,20 @@ class RumbleEmbedIE(InfoExtractor):
             'upload_date': '20191020',
         }
     }, {
+        'url': 'https://rumble.com/embed/vslb7v',
+        'md5': '7418035de1a30a178b8af34dc2b6a52b',
+        'info_dict': {
+            'id': 'vslb7v',
+            'ext': 'mp4',
+            'title': 'Defense Sec. says US Commitment to NATO Defense \'Ironclad\'',
+            'timestamp': 1645142135,
+            'upload_date': '20220217',
+            'channel_url': 'https://rumble.com/c/CyberTechNews',
+            'channel': 'CTNews',
+            'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg',
+            'duration': 901,
+        }
+    }, {
         'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
         'only_matching': True,
     }]
@@ -45,7 +60,7 @@ class RumbleEmbedIE(InfoExtractor):
         video = self._download_json(
             'https://rumble.com/embedJS/', video_id,
             query={'request': 'video', 'v': video_id})
-        title = video['title']
+        title = unescapeHTML(video['title'])
 
         formats = []
         for height, ua in (video.get('ua') or {}).items():

diff --git a/yt_dlp/extractor/rutv.py b/yt_dlp/extractor/rutv.py
index 66ac32deb..0ea8253fa 100644
--- a/yt_dlp/extractor/rutv.py
+++ b/yt_dlp/extractor/rutv.py
@@ -181,7 +181,6 @@ class RUTVIE(InfoExtractor):
                         'rtmp_live': True,
                         'ext': 'flv',
                         'vbr': str_to_int(quality),
-                        'quality': preference,
                     }
                 elif transport == 'm3u8':
                     formats.extend(self._extract_m3u8_formats(
@@ -192,9 +191,10 @@ class RUTVIE(InfoExtractor):
                         'url': url
                     }
                 fmt.update({
-                    'width': width,
-                    'height': height,
+                    'width': int_or_none(quality, default=height, invscale=width, scale=height),
+                    'height': int_or_none(quality, default=height),
                     'format_id': '%s-%s' % (transport, quality),
+                    'source_preference': preference,
                 })
                 formats.append(fmt)

diff --git a/yt_dlp/extractor/safari.py b/yt_dlp/extractor/safari.py
index cca4464ca..7b4571daa 100644
--- a/yt_dlp/extractor/safari.py
+++ b/yt_dlp/extractor/safari.py
@@ -25,14 +25,7 @@ class SafariBaseIE(InfoExtractor):
 
     LOGGED_IN = False
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         _, urlh = self._download_webpage_handle(
             'https://learning.oreilly.com/accounts/login-check/', None,
             'Downloading login page')

diff --git a/yt_dlp/extractor/scte.py b/yt_dlp/extractor/scte.py
index ca1de63b6..7215cf5d1 100644
--- a/yt_dlp/extractor/scte.py
+++ b/yt_dlp/extractor/scte.py
@@ -14,14 +14,7 @@ class SCTEBaseIE(InfoExtractor):
     _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
     _NETRC_MACHINE = 'scte'
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_popup = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login popup')

diff --git a/yt_dlp/extractor/shahid.py b/yt_dlp/extractor/shahid.py
index 42de41a11..ab45d9ce4 100644
--- a/yt_dlp/extractor/shahid.py
+++ b/yt_dlp/extractor/shahid.py
@@ -79,16 +79,12 @@ class ShahidIE(ShahidBaseIE):
         'only_matching': True
     }]
 
-    def _real_initialize(self):
-        email, password = self._get_login_info()
-        if email is None:
-            return
-
+    def _perform_login(self, username, password):
         try:
             user_data = self._download_json(
                 'https://shahid.mbc.net/wd/service/users/login',
                 None, 'Logging in', data=json.dumps({
-                    'email': email,
+                    'email': username,
                     'password': password,
                     'basic': 'false',
                 }).encode('utf-8'), headers={
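The RUTV hunk above derives the frame size from the stream quality. `int_or_none` returns `int(v) * invscale // scale` (or `default` when `v` is missing), so with `quality` as the new vertical size, the width is rescaled to keep the original aspect ratio. Sample numbers made up:

    from yt_dlp.utils import int_or_none

    width, height, quality = 1280, 720, 480
    assert int_or_none(quality, default=height) == 480   # new height
    assert int_or_none(quality, default=height, invscale=width, scale=height) == 853  # 480 * 1280 // 720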
diff --git a/yt_dlp/extractor/sonyliv.py b/yt_dlp/extractor/sonyliv.py
index a5026b2e0..5b6849fc9 100644
--- a/yt_dlp/extractor/sonyliv.py
+++ b/yt_dlp/extractor/sonyliv.py
@@ -75,9 +75,12 @@ class SonyLIVIE(InfoExtractor):
                 t[i] = '{:x}'.format(3 & n | 8)
         return ''.join(t) + '-' + str(int(time.time() * 1000))
 
-    def _login(self, username, password):
+    def _perform_login(self, username, password):
+        self._HEADERS['device_id'] = self._get_device_id()
+        self._HEADERS['content-type'] = 'application/json'
+
         if username.lower() == 'token' and len(password) > 1198:
-            return password
+            self._HEADERS['authorization'] = password
         elif len(username) != 10 or not username.isdigit():
             raise ExtractorError(f'Invalid username/password; {self._LOGIN_HINT}')
 
@@ -99,7 +102,7 @@ class SonyLIVIE(InfoExtractor):
             None, note='Verifying OTP', data=data.encode(), headers=self._HEADERS)
         if otp_verify_json['resultCode'] == 'KO':
             raise ExtractorError(otp_request_json['message'], expected=True)
-        return otp_verify_json['resultObj']['accessToken']
+        self._HEADERS['authorization'] = otp_verify_json['resultObj']['accessToken']
 
     def _call_api(self, version, path, video_id):
         try:
@@ -118,13 +121,8 @@ class SonyLIVIE(InfoExtractor):
                 raise ExtractorError(message)
             raise
 
-    def _real_initialize(self):
+    def _initialize_pre_login(self):
         self._HEADERS['security_token'] = self._call_api('1.4', 'ALL/GETTOKEN', None)
-        username, password = self._get_login_info()
-        if username:
-            self._HEADERS['device_id'] = self._get_device_id()
-            self._HEADERS['content-type'] = 'application/json'
-            self._HEADERS['authorization'] = self._login(username, password)
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
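SonyLIV above (and Zee5 later in this diff) share a convention where a pre-acquired JWT can be passed directly as `--username token --password <token>`. A condensed, standalone sketch of that credential triage, with the OTP branch elided:

    def resolve_auth(username, password):
        # Token mode: the password field carries a long JWT verbatim.
        if username.lower() == 'token' and len(password) > 1198:
            return password
        # A 10-digit phone number triggers the OTP flow instead (not shown here).
        if len(username) == 10 and username.isdigit():
            raise NotImplementedError('OTP flow elided in this sketch')
        raise ValueError('invalid username/password')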
diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
index 64b8a71b6..bbc79c2be 100644
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -107,30 +107,24 @@ class SoundcloudBaseIE(InfoExtractor):
                 return False
             raise
 
-    def _real_initialize(self):
+    def _initialize_pre_login(self):
         self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf'
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
-        if username == 'oauth' and password is not None:
-            self._access_token = password
-            query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
-            payload = {'session': {'access_token': self._access_token}}
-            token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
-            response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
-            if response is not False:
-                self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
-                self.report_login()
-            else:
-                self.report_warning('Provided authorization token seems to be invalid. Continue as guest')
-        elif username is not None:
+
+    def _perform_login(self, username, password):
+        if username != 'oauth':
             self.report_warning(
                 'Login using username and password is not currently supported. '
                 'Use "--username oauth --password <oauth_token>" to login using an oauth token')
+        self._access_token = password
+        query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
+        payload = {'session': {'access_token': self._access_token}}
+        token_verification = sanitized_Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8'))
+        response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False)
+        if response is not False:
+            self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
+            self.report_login()
+        else:
+            self.report_warning('Provided authorization token seems to be invalid. Continue as guest')
 
     r'''
     def genDevId():

diff --git a/yt_dlp/extractor/teachable.py b/yt_dlp/extractor/teachable.py
index 37eae82bc..232eaa521 100644
--- a/yt_dlp/extractor/teachable.py
+++ b/yt_dlp/extractor/teachable.py
@@ -40,8 +40,7 @@ class TeachableBaseIE(InfoExtractor):
         if self._logged_in:
             return
 
-        username, password = self._get_login_info(
-            netrc_machine=self._SITES.get(site, site))
+        username, password = self._get_login_info(netrc_machine=self._SITES.get(site, site))
         if username is None:
             return

diff --git a/yt_dlp/extractor/teamtreehouse.py b/yt_dlp/extractor/teamtreehouse.py
index d347e97ef..64522ec4c 100644
--- a/yt_dlp/extractor/teamtreehouse.py
+++ b/yt_dlp/extractor/teamtreehouse.py
@@ -51,17 +51,14 @@ class TeamTreeHouseIE(InfoExtractor):
     }]
     _NETRC_MACHINE = 'teamtreehouse'
 
-    def _real_initialize(self):
-        email, password = self._get_login_info()
-        if email is None:
-            return
+    def _perform_login(self, username, password):
         signin_page = self._download_webpage(
             'https://teamtreehouse.com/signin',
             None, 'Downloading signin page')
         data = self._form_hidden_inputs('new_user_session', signin_page)
         data.update({
-            'user_session[email]': email,
+            'user_session[email]': username,
             'user_session[password]': password,
         })
         error_message = get_element_by_class('error-message', self._download_webpage(

diff --git a/yt_dlp/extractor/tennistv.py b/yt_dlp/extractor/tennistv.py
index a39a2fc60..58fdecebe 100644
--- a/yt_dlp/extractor/tennistv.py
+++ b/yt_dlp/extractor/tennistv.py
@@ -30,11 +30,9 @@ class TennisTVIE(InfoExtractor):
         'skip': 'Requires email and password of a subscribed account',
     }
     _NETRC_MACHINE = 'tennistv'
+    _session_token = None
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if not username or not password:
-            raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
+    def _perform_login(self, username, password):
         login_form = {
             'Email': username,
@@ -63,7 +61,8 @@ class TennisTVIE(InfoExtractor):
         self._session_token = login_result['sessionToken']
 
     def _real_initialize(self):
-        self._login()
+        if not self._session_token:
+            raise self.raise_login_required('Login info is needed for this website', method='password')
 
     def _real_extract(self, url):
         video_id = self._match_id(url)

diff --git a/yt_dlp/extractor/toutv.py b/yt_dlp/extractor/toutv.py
index 6c84c211c..1d5da1040 100644
--- a/yt_dlp/extractor/toutv.py
+++ b/yt_dlp/extractor/toutv.py
@@ -40,17 +40,14 @@ class TouTvIE(RadioCanadaIE):
     }]
     _CLIENT_KEY = '90505c8d-9c34-4f34-8da1-3a85bdc6d4f4'
 
-    def _real_initialize(self):
-        email, password = self._get_login_info()
-        if email is None:
-            return
+    def _perform_login(self, username, password):
         try:
             self._access_token = self._download_json(
                 'https://services.radio-canada.ca/toutv/profiling/accounts/login',
                 None, 'Logging in', data=json.dumps({
                     'ClientId': self._CLIENT_KEY,
                     'ClientSecret': '34026772-244b-49b6-8b06-317b30ac9a20',
-                    'Email': email,
+                    'Email': username,
                     'Password': password,
                     'Scope': 'id.write media-validation.read',
                 }).encode(), headers={

diff --git a/yt_dlp/extractor/tubitv.py b/yt_dlp/extractor/tubitv.py
index e9b66ec77..31feb9a70 100644
--- a/yt_dlp/extractor/tubitv.py
+++ b/yt_dlp/extractor/tubitv.py
@@ -54,10 +54,7 @@ class TubiTvIE(InfoExtractor):
         },
     }]
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
+    def _perform_login(self, username, password):
         self.report_login()
         form_data = {
             'username': username,
@@ -72,9 +69,6 @@ class TubiTvIE(InfoExtractor):
             raise ExtractorError(
                 'Login failed (invalid username/password)', expected=True)
 
-    def _real_initialize(self):
-        self._login()
-
     def _real_extract(self, url):
         video_id = self._match_id(url)
         video_data = self._download_json(

diff --git a/yt_dlp/extractor/tumblr.py b/yt_dlp/extractor/tumblr.py
index a3e0e15f2..8086f613d 100644
--- a/yt_dlp/extractor/tumblr.py
+++ b/yt_dlp/extractor/tumblr.py
@@ -247,11 +247,7 @@ class TumblrIE(InfoExtractor):
 
     _ACCESS_TOKEN = None
 
-    def _real_initialize(self):
-        self.get_access_token()
-        self._login()
-
-    def get_access_token(self):
+    def _initialize_pre_login(self):
         login_page = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page', fatal=False)
         if login_page:
@@ -260,11 +256,7 @@ class TumblrIE(InfoExtractor):
         if not self._ACCESS_TOKEN:
             self.report_warning('Failed to get access token; metadata will be missing and some videos may not work')
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if not username:
-            return
-
+    def _perform_login(self, username, password):
         if not self._ACCESS_TOKEN:
             return

diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py
index bee26c3a3..10de74c8e 100644
--- a/yt_dlp/extractor/twitch.py
+++ b/yt_dlp/extractor/twitch.py
@@ -57,14 +57,7 @@ class TwitchBaseIE(InfoExtractor):
         'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41',
     }
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         def fail(message):
             raise ExtractorError(
                 'Unable to login. Twitch said: %s' % message, expected=True)
diff --git a/yt_dlp/extractor/udemy.py b/yt_dlp/extractor/udemy.py
index 25b28e98e..235f89713 100644
--- a/yt_dlp/extractor/udemy.py
+++ b/yt_dlp/extractor/udemy.py
@@ -168,14 +168,7 @@ class UdemyIE(InfoExtractor):
             self._handle_error(response)
         return response
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_popup = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login popup')

diff --git a/yt_dlp/extractor/veo.py b/yt_dlp/extractor/veo.py
index 4e57a52d1..d87bb5b47 100644
--- a/yt_dlp/extractor/veo.py
+++ b/yt_dlp/extractor/veo.py
@@ -6,13 +6,14 @@ from .common import InfoExtractor
 from ..utils import (
     int_or_none,
     mimetype2ext,
+    str_or_none,
     unified_timestamp,
     url_or_none,
 )
 
 
 class VeoIE(InfoExtractor):
-    _VALID_URL = r'https?://app\.veo\.co/matches/(?P<id>[0-9A-Za-z-]+)'
+    _VALID_URL = r'https?://app\.veo\.co/matches/(?P<id>[0-9A-Za-z-_]+)'
 
     _TESTS = [{
         'url': 'https://app.veo.co/matches/20201027-last-period/',
@@ -24,7 +25,11 @@ class VeoIE(InfoExtractor):
             'upload_date': '20201028',
             'timestamp': 1603847208,
             'duration': 1916,
+            'view_count': int,
         }
+    }, {
+        'url': 'https://app.veo.co/matches/20220313-2022-03-13_u15m-plsjq-vs-csl/',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -36,39 +41,41 @@ class VeoIE(InfoExtractor):
         video_data = self._download_json(
             'https://app.veo.co/api/app/matches/%s/videos' % video_id, video_id, 'Downloading video data')
 
-        title = metadata.get('title')
-        thumbnail = url_or_none(metadata.get('thumbnail'))
-
-        timestamp = unified_timestamp(metadata.get('created'))
-        duration = int_or_none(metadata.get('duration'))
-        view_count = int_or_none(metadata.get('view_count'))
-
         formats = []
         for fmt in video_data:
-            mimetype = fmt.get('mime_type')
+            mimetype = str_or_none(fmt.get('mime_type'))
+            format_url = url_or_none(fmt.get('url'))
             # skip configuration file for panoramic video
-            if mimetype == 'video/mp2t':
+            if not format_url or mimetype == 'video/mp2t':
                 continue
+
             height = int_or_none(fmt.get('height'))
-            bitrate = int_or_none(fmt.get('bit_rate'), scale=1000)
-            render_type = fmt.get('render_type')
+            render_type = str_or_none(fmt.get('render_type'))
+            format_id = f'{render_type}-{height}p' if render_type and height else None
+
+            # Veo returns panoramic video information even if panoramic video is not available.
+            # e.g. https://app.veo.co/matches/20201027-last-period/
+            if render_type == 'panorama':
+                if not self._is_valid_url(format_url, video_id, format_id):
+                    continue
+
             formats.append({
-                'url': url_or_none(fmt.get('url')),
-                'format_id': '%s-%sp' % (render_type, height),
+                'url': format_url,
+                'format_id': format_id,
                 'ext': mimetype2ext(mimetype),
                 'width': int_or_none(fmt.get('width')),
                 'height': height,
-                'vbr': bitrate
+                'vbr': int_or_none(fmt.get('bit_rate'), scale=1000),
             })
 
         self._sort_formats(formats)
 
         return {
             'id': video_id,
-            'title': title,
+            'title': str_or_none(metadata.get('title')),
             'formats': formats,
-            'thumbnail': thumbnail,
-            'timestamp': timestamp,
-            'view_count': view_count,
-            'duration': duration
+            'thumbnail': url_or_none(metadata.get('thumbnail')),
+            'timestamp': unified_timestamp(metadata.get('created')),
+            'view_count': int_or_none(metadata.get('view_count')),
+            'duration': int_or_none(metadata.get('duration')),
         }

diff --git a/yt_dlp/extractor/vidio.py b/yt_dlp/extractor/vidio.py
index e99dbdefa..6bfb8d442 100644
--- a/yt_dlp/extractor/vidio.py
+++ b/yt_dlp/extractor/vidio.py
@@ -23,11 +23,7 @@ class VidioBaseIE(InfoExtractor):
     _LOGIN_URL = 'https://www.vidio.com/users/login'
     _NETRC_MACHINE = 'vidio'
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         def is_logged_in():
             res = self._download_json(
                 'https://www.vidio.com/interactions.json', None, 'Checking if logged in', fatal=False) or {}
@@ -63,10 +59,9 @@ class VidioBaseIE(InfoExtractor):
                     'Unable to log in: %s. %s' % (reason, clean_html(subreason)), expected=True)
             raise ExtractorError('Unable to log in')
 
-    def _real_initialize(self):
+    def _initialize_pre_login(self):
         self._api_key = self._download_json(
             'https://www.vidio.com/auth', None, data=b'')['api_key']
-        self._login()
 
     def _call_api(self, url, video_id, note=None):
         return self._download_json(url, video_id, note=note, headers={
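The new panorama guard in the veo.py hunk above relies on `InfoExtractor._is_valid_url`, which probes the URL and returns False instead of raising when it is unreachable, useful because Veo advertises a panoramic rendition even when the file does not exist. Roughly equivalent standalone logic, as a sketch rather than the yt-dlp implementation:

    import urllib.request

    def url_exists(url):
        try:
            req = urllib.request.Request(url, method='HEAD')
            with urllib.request.urlopen(req) as resp:
                return resp.status < 400
        except OSError:  # URLError/HTTPError both derive from OSError
            return False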
diff --git a/yt_dlp/extractor/viewlift.py b/yt_dlp/extractor/viewlift.py
index 5b558d890..4627f66fd 100644
--- a/yt_dlp/extractor/viewlift.py
+++ b/yt_dlp/extractor/viewlift.py
@@ -36,9 +36,6 @@ class ViewLiftBaseIE(InfoExtractor):
     def _fetch_token(self, site, url):
         if self._TOKENS.get(site):
             return
-        email, password = self._get_login_info(netrc_machine=site)
-        if email:
-            self.report_warning('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
 
         cookies = self._get_cookies(url)
         if cookies and cookies.get('token'):

diff --git a/yt_dlp/extractor/viki.py b/yt_dlp/extractor/viki.py
index 19b09121c..8234ba7df 100644
--- a/yt_dlp/extractor/viki.py
+++ b/yt_dlp/extractor/viki.py
@@ -99,14 +99,7 @@ class VikiBaseIE(InfoExtractor):
             self.raise_login_required(message)
         self._raise_error(message)
 
-    def _real_initialize(self):
-        self._login()
-
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         self._token = self._call_api(
             'sessions.json', None, 'Logging in', fatal=False,
             data={'username': username, 'password': password}).get('token')

diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py
index 458a751fe..051cf1b17 100644
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -44,12 +44,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
     _LOGIN_REQUIRED = False
     _LOGIN_URL = 'https://vimeo.com/log_in'
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            if self._LOGIN_REQUIRED:
-                raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
-            return
+    def _perform_login(self, username, password):
         webpage = self._download_webpage(
             self._LOGIN_URL, None, 'Downloading login page')
         token, vuid = self._extract_xsrft_and_vuid(webpage)
@@ -75,6 +70,10 @@ class VimeoBaseInfoExtractor(InfoExtractor):
                     expected=True)
             raise ExtractorError('Unable to log in')
 
+    def _real_initialize(self):
+        if self._LOGIN_REQUIRED and not self._get_cookies('https://vimeo.com').get('vuid'):
+            self._raise_login_required()
+
     def _get_video_password(self):
         password = self.get_param('videopassword')
         if password is None:
@@ -701,9 +700,6 @@ class VimeoIE(VimeoBaseInfoExtractor):
                 raise ExtractorError('Wrong video password', expected=True)
             return checked
 
-    def _real_initialize(self):
-        self._login()
-
     def _extract_from_api(self, video_id, unlisted_hash=None):
         token = self._download_json(
             'https://vimeo.com/_rv/jwt', video_id, headers={
@@ -1231,9 +1227,6 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
         'skip': 'video gone',
     }]
 
-    def _real_initialize(self):
-        self._login()
-
     def _real_extract(self, url):
         page_url, video_id = self._match_valid_url(url).groups()
         data = self._download_json(
@@ -1275,9 +1268,6 @@ class VimeoWatchLaterIE(VimeoChannelIE):
         'only_matching': True,
     }]
 
-    def _real_initialize(self):
-        self._login()
-
     def _page_url(self, base_url, pagenum):
         url = '%s/page:%d/' % (base_url, pagenum)
         request = sanitized_Request(url)

diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py
index 18eb33b57..cbc315961 100644
--- a/yt_dlp/extractor/vk.py
+++ b/yt_dlp/extractor/vk.py
@@ -29,11 +29,7 @@ from .youtube import YoutubeIE
 class VKBaseIE(InfoExtractor):
     _NETRC_MACHINE = 'vk'
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if username is None:
-            return
-
+    def _perform_login(self, username, password):
         login_page, url_handle = self._download_webpage_handle(
             'https://vk.com', None, 'Downloading login page')
 
@@ -57,9 +53,6 @@ class VKBaseIE(InfoExtractor):
             raise ExtractorError(
                 'Unable to login, incorrect username and/or password', expected=True)
 
-    def _real_initialize(self):
-        self._login()
-
     def _download_payload(self, path, video_id, data, fatal=True):
         data['al'] = 1
         code, payload = self._download_json(
diff --git a/yt_dlp/extractor/vlive.py b/yt_dlp/extractor/vlive.py
index 74dc349d5..ae35c976c 100644
--- a/yt_dlp/extractor/vlive.py
+++ b/yt_dlp/extractor/vlive.py
@@ -26,22 +26,16 @@ class VLiveBaseIE(NaverBaseIE):
     _NETRC_MACHINE = 'vlive'
     _logged_in = False
 
-    def _real_initialize(self):
-        if not self._logged_in:
-            VLiveBaseIE._logged_in = self._login()
-
-    def _login(self):
-        email, password = self._get_login_info()
-        if email is None:
-            return False
-
+    def _perform_login(self, username, password):
+        if self._logged_in:
+            return
         LOGIN_URL = 'https://www.vlive.tv/auth/email/login'
         self._request_webpage(
             LOGIN_URL, None, note='Downloading login cookies')
 
         self._download_webpage(
             LOGIN_URL, None, note='Logging in',
-            data=urlencode_postdata({'email': email, 'pwd': password}),
+            data=urlencode_postdata({'email': username, 'pwd': password}),
             headers={
                 'Referer': LOGIN_URL,
                 'Content-Type': 'application/x-www-form-urlencoded'
@@ -54,7 +48,7 @@ class VLiveBaseIE(NaverBaseIE):
         if not try_get(login_info, lambda x: x['message']['login'], bool):
             raise ExtractorError('Unable to log in', expected=True)
 
-        return True
+        VLiveBaseIE._logged_in = True
 
     def _call_api(self, path_template, video_id, fields=None, query_add={}, note=None):
         if note is None:

diff --git a/yt_dlp/extractor/vrv.py b/yt_dlp/extractor/vrv.py
index 7bc55f333..10e6be7ed 100644
--- a/yt_dlp/extractor/vrv.py
+++ b/yt_dlp/extractor/vrv.py
@@ -85,7 +85,7 @@ class VRVBaseIE(InfoExtractor):
             'resource_key': resource_key,
         })['__links__']['cms_resource']['href']
 
-    def _real_initialize(self):
+    def _initialize_pre_login(self):
         webpage = self._download_webpage(
             'https://vrv.co/', None, headers=self.geo_verification_headers())
         self._API_PARAMS = self._parse_json(self._search_regex(
@@ -124,16 +124,10 @@ class VRVIE(VRVBaseIE):
     }]
     _NETRC_MACHINE = 'vrv'
 
-    def _real_initialize(self):
-        super(VRVIE, self)._real_initialize()
-
-        email, password = self._get_login_info()
-        if email is None:
-            return
-
+    def _perform_login(self, username, password):
         token_credentials = self._call_api(
             'authenticate/by:credentials', None, 'Token Credentials', data={
-                'email': email,
+                'email': username,
                 'password': password,
             })
         self._TOKEN = token_credentials['oauth_token']
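Worth noting in the VLive hunk above: the success flag is written to `VLiveBaseIE._logged_in` (the class) rather than `self._logged_in` (the instance), so one successful login is visible to every extractor instance for the site. Minimal illustration:

    class Base:
        _logged_in = False

        def login(self):
            if self._logged_in:     # class attribute, read through the instance
                return
            Base._logged_in = True  # written on the class, not the instance

    a, b = Base(), Base()
    a.login()
    assert b._logged_in             # the second instance sees the shared state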
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 4fe9cec5b..d74d5b0e9 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -263,7 +263,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
 
     _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
 
-    _NETRC_MACHINE = 'youtube'
+    # _NETRC_MACHINE = 'youtube'
 
     # If True it will raise an error if no login info is provided
     _LOGIN_REQUIRED = False
@@ -334,21 +334,6 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
         r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion',
     )
 
-    def _login(self):
-        """
-        Attempt to log in to YouTube.
-        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
-        """
-
-        if (self._LOGIN_REQUIRED
-                and self.get_param('cookiefile') is None
-                and self.get_param('cookiesfrombrowser') is None):
-            self.raise_login_required(
-                'Login details are needed to download this content', method='cookies')
-        username, password = self._get_login_info()
-        if username:
-            self.report_warning(f'Cannot login to YouTube using username and password. {self._LOGIN_HINTS["cookies"]}')
-
     def _initialize_consent(self):
         cookies = self._get_cookies('https://www.youtube.com/')
         if cookies.get('__Secure-3PSID'):
@@ -379,7 +364,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
     def _real_initialize(self):
         self._initialize_pref()
         self._initialize_consent()
-        self._login()
+        if (self._LOGIN_REQUIRED
+                and self.get_param('cookiefile') is None
+                and self.get_param('cookiesfrombrowser') is None):
+            self.raise_login_required('Login details are needed to download this content', method='cookies')
 
     _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
     _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;'
@@ -458,7 +446,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep),
             video_id=video_id, fatal=fatal, note=note, errnote=errnote,
             data=json.dumps(data).encode('utf8'), headers=real_headers,
-            query={'key': api_key or self._extract_api_key()})
+            query={'key': api_key or self._extract_api_key(), 'prettyPrint': 'false'})
 
     def extract_yt_initial_data(self, item_id, webpage, fatal=True):
         data = self._search_regex(
@@ -1297,7 +1285,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             },
             'expected_warnings': [
                 'DASH manifest missing',
-                'Some formats are possibly damaged'
             ]
         },
         # Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
@@ -3013,7 +3000,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 self.report_warning(last_error)
         return prs, player_url
 
-    def _extract_formats(self, streaming_data, video_id, player_url, is_live):
+    def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration):
         itags, stream_ids = {}, []
         itag_qualities, res_qualities = {}, {}
         q = qualities([
@@ -3024,7 +3011,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'small', 'medium', 'large', 'hd720', 'hd1080', 'hd1440', 'hd2160', 'hd2880', 'highres'
         ])
         streaming_formats = traverse_obj(streaming_data, (..., ('formats', 'adaptiveFormats'), ...), default=[])
-        approx_duration = max(traverse_obj(streaming_formats, (..., 'approxDurationMs'), expected_type=float_or_none) or [0]) or None
 
         for fmt in streaming_formats:
             if fmt.get('targetDurationSec') or fmt.get('drmFamilies'):
@@ -3091,7 +3077,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 else -1)
             # Some formats may have much smaller duration than others (possibly damaged during encoding)
             # Eg: 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823
-            is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) < approx_duration - 10000)
+            # Make sure to avoid false positives with small duration differences.
+            # Eg: __2ABJjxzNo, ySuUZEjARPY
+            is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500)
             if is_damaged:
                 self.report_warning(f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True)
 
             dct = {
@@ -3227,14 +3215,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
         return webpage, master_ytcfg, player_responses, player_url
 
-    def _list_formats(self, video_id, microformats, video_details, player_responses, player_url):
+    def _list_formats(self, video_id, microformats, video_details, player_responses, player_url, duration=None):
         live_broadcast_details = traverse_obj(microformats, (..., 'liveBroadcastDetails'))
         is_live = get_first(video_details, 'isLive')
         if is_live is None:
             is_live = get_first(live_broadcast_details, 'isLiveNow')
 
         streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[])
-        formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live))
+        formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration))
 
         return live_broadcast_details, is_live, streaming_data, formats
 
@@ -3315,7 +3303,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             return self.playlist_result(
                 entries, video_id, video_title, video_description)
 
-        live_broadcast_details, is_live, streaming_data, formats = self._list_formats(video_id, microformats, video_details, player_responses, player_url)
+        duration = int_or_none(
+            get_first(video_details, 'lengthSeconds')
+            or get_first(microformats, 'lengthSeconds')
+            or parse_duration(search_meta('duration'))) or None
+
+        live_broadcast_details, is_live, streaming_data, formats = self._list_formats(
+            video_id, microformats, video_details, player_responses, player_url, duration)
 
         if not formats:
             if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')):
@@ -3387,10 +3381,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             get_first(video_details, 'channelId')
             or get_first(microformats, 'externalChannelId')
             or search_meta('channelId'))
-        duration = int_or_none(
-            get_first(video_details, 'lengthSeconds')
-            or get_first(microformats, 'lengthSeconds')
-            or parse_duration(search_meta('duration'))) or None
         owner_profile_url = get_first(microformats, 'ownerProfileUrl')
 
         live_content = get_first(video_details, 'isLiveContent')
@@ -3926,6 +3916,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
                     if entry:
                         yield entry
             '''
+
     def _extract_entries(self, parent_renderer, continuation_list):
         # continuation_list is modified in-place with continuation_list = [continuation_token]
         continuation_list[:] = [None]
@@ -4024,6 +4015,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
                 continue
 
             known_renderers = {
+                'videoRenderer': (self._grid_entries, 'items'),  # for membership tab
                 'gridPlaylistRenderer': (self._grid_entries, 'items'),
                 'gridVideoRenderer': (self._grid_entries, 'items'),
                 'gridChannelRenderer': (self._grid_entries, 'items'),
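Arithmetic behind the reworked `is_damaged` check above: `approxDurationMs` is in milliseconds and `duration` in seconds, so a healthy format yields a ratio near 1000, and a ratio under 500 means the format covers less than half the video. Sample numbers made up:

    duration = 600                         # whole video, in seconds
    healthy, damaged = 600_000, 120_000    # approxDurationMs values
    assert healthy / duration >= 500       # kept as-is
    assert damaged / duration < 500        # flagged and deprioritized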
diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py
index 9435920b2..c02b4ca14 100644
--- a/yt_dlp/extractor/zattoo.py
+++ b/yt_dlp/extractor/zattoo.py
@@ -25,13 +25,11 @@ class ZattooPlatformBaseIE(InfoExtractor):
     def _host_url(self):
         return 'https://%s' % (self._API_HOST if hasattr(self, '_API_HOST') else self._HOST)
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if not username or not password:
-            self.raise_login_required(
-                'A valid %s account is needed to access this media.'
-                % self._NETRC_MACHINE)
+    def _real_initialize(self):
+        if not self._power_guide_hash:
+            self.raise_login_required('An account is needed to access this media', method='password')
 
+    def _perform_login(self, username, password):
         try:
             data = self._download_json(
                 '%s/zapi/v2/account/login' % self._host_url(), None, 'Logging in',
@@ -52,7 +50,7 @@ class ZattooPlatformBaseIE(InfoExtractor):
 
         self._power_guide_hash = data['session']['power_guide_hash']
 
-    def _real_initialize(self):
+    def _initialize_pre_login(self):
         webpage = self._download_webpage(
             self._host_url(), None, 'Downloading app token')
         app_token = self._html_search_regex(
@@ -72,8 +70,6 @@ class ZattooPlatformBaseIE(InfoExtractor):
                 'format': 'json',
             }))
 
-        self._login()
-
     def _extract_cid(self, video_id, channel_name):
         channel_groups = self._download_json(
             '%s/zapi/v2/cached/channels/%s' % (self._host_url(),

diff --git a/yt_dlp/extractor/zee5.py b/yt_dlp/extractor/zee5.py
index ebe393ec7..3e3f11b15 100644
--- a/yt_dlp/extractor/zee5.py
+++ b/yt_dlp/extractor/zee5.py
@@ -93,32 +93,27 @@ class Zee5IE(InfoExtractor):
     _NETRC_MACHINE = 'zee5'
     _GEO_COUNTRIES = ['IN']
 
-    def _login(self):
-        username, password = self._get_login_info()
-        if username:
-            if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None:
-                self.report_login()
-                otp_request_json = self._download_json('https://b2bapi.zee5.com/device/sendotp_v1.php?phoneno=91{}'.format(username),
-                                                       None, note='Sending OTP')
-                if otp_request_json['code'] == 0:
-                    self.to_screen(otp_request_json['message'])
-                else:
-                    raise ExtractorError(otp_request_json['message'], expected=True)
-                otp_code = self._get_tfa_info('OTP')
-                otp_verify_json = self._download_json('https://b2bapi.zee5.com/device/verifyotp_v1.php?phoneno=91{}&otp={}&guest_token={}&platform=web'.format(username, otp_code, self._DEVICE_ID),
-                                                      None, note='Verifying OTP', fatal=False)
-                if not otp_verify_json:
-                    raise ExtractorError('Unable to verify OTP.', expected=True)
-                self._USER_TOKEN = otp_verify_json.get('token')
-                if not self._USER_TOKEN:
-                    raise ExtractorError(otp_request_json['message'], expected=True)
-            elif username.lower() == 'token' and len(password) > 1198:
-                self._USER_TOKEN = password
+    def _perform_login(self, username, password):
+        if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None:
+            self.report_login()
+            otp_request_json = self._download_json('https://b2bapi.zee5.com/device/sendotp_v1.php?phoneno=91{}'.format(username),
+                                                   None, note='Sending OTP')
+            if otp_request_json['code'] == 0:
+                self.to_screen(otp_request_json['message'])
             else:
-                raise ExtractorError(self._LOGIN_HINT, expected=True)
-
-    def _real_initialize(self):
-        self._login()
+                raise ExtractorError(otp_request_json['message'], expected=True)
+            otp_code = self._get_tfa_info('OTP')
+            otp_verify_json = self._download_json('https://b2bapi.zee5.com/device/verifyotp_v1.php?phoneno=91{}&otp={}&guest_token={}&platform=web'.format(username, otp_code, self._DEVICE_ID),
+                                                  None, note='Verifying OTP', fatal=False)
+            if not otp_verify_json:
+                raise ExtractorError('Unable to verify OTP.', expected=True)
+            self._USER_TOKEN = otp_verify_json.get('token')
+            if not self._USER_TOKEN:
+                raise ExtractorError(otp_request_json['message'], expected=True)
+        elif username.lower() == 'token' and len(password) > 1198:
+            self._USER_TOKEN = password
+        else:
+            raise ExtractorError(self._LOGIN_HINT, expected=True)
 
     def _real_extract(self, url):
         video_id, display_id = self._match_valid_url(url).group('id', 'display_id')
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index c9b57c2f0..da6f27801 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -2279,8 +2279,9 @@ def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
     num, factor = float_or_none(num), float(factor)
     if num is None or num < 0:
         return None
-    exponent = 0 if num == 0 else int(math.log(num, factor))
-    suffix = ['', *'kMGTPEZY'][exponent]
+    POSSIBLE_SUFFIXES = 'kMGTPEZY'
+    exponent = 0 if num == 0 else min(int(math.log(num, factor)), len(POSSIBLE_SUFFIXES))
+    suffix = ['', *POSSIBLE_SUFFIXES][exponent]
     if factor == 1024:
         suffix = {'k': 'Ki', '': ''}.get(suffix, f'{suffix}i')
     converted = num / (factor ** exponent)
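Effect of the clamping fix above: exponents beyond the largest suffix no longer index past the end of the suffix list, so absurdly large values fall back to 'Y' instead of raising an IndexError:

    from yt_dlp.utils import format_decimal_suffix

    assert format_decimal_suffix(1000) == '1k'
    assert format_decimal_suffix(10**28) == '10000Y'  # previously indexed out of range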