author    | Jesús <heckyel@hyperbola.info> | 2022-12-02 05:21:10 +0800
committer | Jesús <heckyel@hyperbola.info> | 2022-12-02 05:21:10 +0800
commit    | eaeeef9c1d1bedb76fea953c332ef84d53bffe2c (patch)
tree      | c3cb5582247e47fc67c24cd7ff8ea857fb76821e /hypervideo_dl
parent    | 1e5a50b71d8f0eae6007bedc329eecb24bb5aba3 (diff)
download  | hypervideo-eaeeef9c1d1bedb76fea953c332ef84d53bffe2c.tar.lz
          | hypervideo-eaeeef9c1d1bedb76fea953c332ef84d53bffe2c.tar.xz
          | hypervideo-eaeeef9c1d1bedb76fea953c332ef84d53bffe2c.zip
update from upstream
Diffstat (limited to 'hypervideo_dl')
1092 files changed, 32411 insertions, 23399 deletions
diff --git a/hypervideo_dl/YoutubeDL.py b/hypervideo_dl/YoutubeDL.py index 276f42d..012c3b8 100644 --- a/hypervideo_dl/YoutubeDL.py +++ b/hypervideo_dl/YoutubeDL.py @@ -1,8 +1,3 @@ -#!/usr/bin/env python3 -# coding: utf-8 - -from __future__ import absolute_import, unicode_literals - import collections import contextlib import datetime @@ -15,7 +10,7 @@ import json import locale import operator import os -import platform +import random import re import shutil import subprocess @@ -24,151 +19,141 @@ import tempfile import time import tokenize import traceback -import random import unicodedata - -from enum import Enum +import urllib.request from string import ascii_letters -from .compat import ( - compat_basestring, - compat_brotli, - compat_get_terminal_size, - compat_kwargs, - compat_numeric_types, - compat_os_name, - compat_pycrypto_AES, - compat_shlex_quote, - compat_str, - compat_tokenize_tokenize, - compat_urllib_error, - compat_urllib_request, - compat_urllib_request_DataHandler, - windows_enable_vt_mode, -) +from .cache import Cache +from .compat import compat_os_name, compat_shlex_quote from .cookies import load_cookies +from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name +from .downloader.rtmp import rtmpdump_version +from .extractor import gen_extractor_classes, get_info_extractor +from .extractor.common import UnsupportedURLIE +from .extractor.openload import PhantomJSwrapper +from .minicurses import format_text +from .postprocessor import _PLUGIN_CLASSES as plugin_postprocessors +from .postprocessor import ( + EmbedThumbnailPP, + FFmpegFixupDuplicateMoovPP, + FFmpegFixupDurationPP, + FFmpegFixupM3u8PP, + FFmpegFixupM4aPP, + FFmpegFixupStretchedPP, + FFmpegFixupTimestampPP, + FFmpegMergerPP, + FFmpegPostProcessor, + FFmpegVideoConvertorPP, + MoveFilesAfterDownloadPP, + get_postprocessor, +) from .utils import ( + DEFAULT_OUTTMPL, + IDENTITY, + LINK_TEMPLATES, + MEDIA_EXTENSIONS, + NO_DEFAULT, + NUMBER_RE, + OUTTMPL_TYPES, + POSTPROCESS_WHEN, + STR_FORMAT_RE_TMPL, + STR_FORMAT_TYPES, + ContentTooShortError, + DateRange, + DownloadCancelled, + DownloadError, + EntryNotInPlaylist, + ExistingVideoReached, + ExtractorError, + FormatSorter, + GeoRestrictedError, + HEADRequest, + ISO3166Utils, + LazyList, + MaxDownloadsReached, + Namespace, + PagedList, + PerRequestProxyHandler, + PlaylistEntries, + Popen, + PostProcessingError, + ReExtractInfo, + RejectedVideoReached, + SameFileError, + UnavailableVideoError, + UserNotLive, + YoutubeDLCookieProcessor, + YoutubeDLHandler, + YoutubeDLRedirectHandler, age_restricted, args_to_str, - ContentTooShortError, + bug_reports_message, date_from_str, - DateRange, - DEFAULT_OUTTMPL, + deprecation_warning, determine_ext, determine_protocol, - DownloadCancelled, - DownloadError, encode_compat_str, encodeFilename, - EntryNotInPlaylist, error_to_compat_str, - ExistingVideoReached, + escapeHTML, expand_path, - ExtractorError, filter_dict, float_or_none, format_bytes, - format_field, format_decimal_suffix, + format_field, formatSeconds, - GeoRestrictedError, + get_compatible_ext, get_domain, - has_certifi, - HEADRequest, - InAdvancePagedList, int_or_none, iri_to_uri, - ISO3166Utils, + is_path_like, join_nonempty, - LazyList, - LINK_TEMPLATES, locked_file, + make_archive_id, make_dir, make_HTTPS_handler, - MaxDownloadsReached, merge_headers, network_exceptions, - NO_DEFAULT, number_of_digits, orderedSet, - OUTTMPL_TYPES, - PagedList, + orderedSet_from_options, parse_filesize, - PerRequestProxyHandler, - platform_name, - Popen, - 
POSTPROCESS_WHEN, - PostProcessingError, preferredencoding, prepend_extension, - ReExtractInfo, register_socks_protocols, - RejectedVideoReached, remove_terminal_sequences, render_table, replace_extension, - SameFileError, sanitize_filename, sanitize_path, sanitize_url, sanitized_Request, std_headers, - STR_FORMAT_RE_TMPL, - STR_FORMAT_TYPES, str_or_none, strftime_or_none, subtitles_filename, supports_terminal_sequences, + system_identifier, timetuple_from_msec, to_high_limit_path, traverse_obj, + try_call, try_get, - UnavailableVideoError, url_basename, variadic, version_tuple, + windows_enable_vt_mode, write_json_file, write_string, - YoutubeDLCookieProcessor, - YoutubeDLHandler, - YoutubeDLRedirectHandler, -) -from .cache import Cache -from .minicurses import format_text -from .extractor import ( - gen_extractor_classes, - get_info_extractor, - _LAZY_LOADER, - _PLUGIN_CLASSES as plugin_extractors -) -from .extractor.openload import PhantomJSwrapper -from .downloader import ( - FFmpegFD, - get_suitable_downloader, - shorten_protocol_name -) -from .downloader.rtmp import rtmpdump_version -from .postprocessor import ( - get_postprocessor, - EmbedThumbnailPP, - FFmpegFixupDuplicateMoovPP, - FFmpegFixupDurationPP, - FFmpegFixupM3u8PP, - FFmpegFixupM4aPP, - FFmpegFixupStretchedPP, - FFmpegFixupTimestampPP, - FFmpegMergerPP, - FFmpegPostProcessor, - MoveFilesAfterDownloadPP, - _PLUGIN_CLASSES as plugin_postprocessors ) -from .version import __version__ +from .version import RELEASE_GIT_HEAD, VARIANT, __version__ if compat_os_name == 'nt': import ctypes -class YoutubeDL(object): +class YoutubeDL: """YoutubeDL class. YoutubeDL objects are the ones responsible of downloading the @@ -211,13 +196,6 @@ class YoutubeDL(object): For compatibility, a single list is also accepted print_to_file: A dict with keys WHEN (same as forceprint) mapped to a list of tuples with (template, filename) - forceurl: Force printing final URL. (Deprecated) - forcetitle: Force printing title. (Deprecated) - forceid: Force printing ID. (Deprecated) - forcethumbnail: Force printing thumbnail URL. (Deprecated) - forcedescription: Force printing description. (Deprecated) - forcefilename: Force printing final filename. (Deprecated) - forceduration: Force printing duration. (Deprecated) forcejson: Force printing info_dict as JSON. dump_single_json: Force printing the info_dict of the whole playlist (or video) as a single JSON line. @@ -261,22 +239,20 @@ class YoutubeDL(object): Default is 'only_download' for CLI, but False for API skip_playlist_after_errors: Number of allowed failures until the rest of the playlist is skipped - force_generic_extractor: Force downloader to use the generic extractor + allowed_extractors: List of regexes to match against extractor names that are allowed overwrites: Overwrite all video and metadata files if True, overwrite only non-video files if None and don't overwrite any file if False For compatibility with youtube-dl, "nooverwrites" may also be used instead - playliststart: Playlist item to start at. - playlistend: Playlist item to end at. playlist_items: Specific indices of playlist to download. - playlistreverse: Download playlist items in reverse order. playlistrandom: Download playlist items in random order. + lazy_playlist: Process playlist entries as they are received. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. logger: Log messages to a logging.Logger instance. - logtostderr: Log messages to stderr instead of stdout. 
- consoletitle: Display progress in console window's titlebar. + logtostderr: Print everything to stderr instead of stdout. + consoletitle: Display progress in console window's titlebar. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file clean_infojson: Remove private fields from the infojson @@ -294,15 +270,12 @@ class YoutubeDL(object): writedesktoplink: Write a Linux internet shortcut file (.desktop) writesubtitles: Write the video subtitles to a file writeautomaticsub: Write the automatically generated subtitles to a file - allsubtitles: Deprecated - Use subtitleslangs = ['all'] - Downloads all the subtitles of the video - (requires writesubtitles or writeautomaticsub) listsubtitles: Lists all available subtitles for the video subtitlesformat: The format code for subtitles subtitleslangs: List of languages of the subtitles to download (can be regex). The list may contain "all" to refer to all the available subtitles. The language can be prefixed with a "-" to - exclude it from the requested languages. Eg: ['all', '-live_chat'] + exclude it from the requested languages, e.g. ['all', '-live_chat'] keepvideo: Keep the video file after post-processing daterange: A DateRange object, download only if the upload_date is in the range. skip_download: Skip the actual download of the video file @@ -320,24 +293,28 @@ class YoutubeDL(object): downloaded. Videos without view count information are always downloaded. None for no limit. - download_archive: File name of a file where all downloads are recorded. - Videos already present in the file are not downloaded - again. + download_archive: A set, or the name of a file where all downloads are recorded. + Videos already present in the file are not downloaded again. break_on_existing: Stop the download process after attempting to download a file that is in the archive. break_on_reject: Stop the download process when encountering a video that has been filtered out. break_per_url: Whether break_on_reject and break_on_existing should act on each input URL as opposed to for the entire queue - cookiefile: File name where cookies should be read from and dumped to + cookiefile: File name or text stream from where cookies should be read and dumped to cookiesfrombrowser: A tuple containing the name of the browser, the profile - name/pathfrom where cookies are loaded, and the name of the - keyring. Eg: ('chrome', ) or ('vivaldi', 'default', 'BASICTEXT') + name/path from where cookies are loaded, the name of the keyring, + and the container name, e.g. ('chrome', ) or + ('vivaldi', 'default', 'BASICTEXT') or ('firefox', 'default', None, 'Meta') legacyserverconnect: Explicitly allow HTTPS connection to servers that do not support RFC 5746 secure renegotiation nocheckcertificate: Do not verify SSL certificates + client_certificate: Path to client certificate file in PEM format. May include the private key + client_certificate_key: Path to private key file for client certificate + client_certificate_password: Password for client certificate private key, if encrypted. + If not provided and the key is encrypted, hypervideo will ask interactively prefer_insecure: Use HTTP instead of HTTPS to retrieve information. - At the moment, this is only supported by YouTube. 
+ (Only supported by some extractors) http_headers: A dictionary of custom headers to be used for all requests proxy: URL of the proxy server to use geo_verification_proxy: URL of the proxy to use for IP address verification @@ -346,13 +323,17 @@ class YoutubeDL(object): bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi debug_printtraffic:Print out sent and received HTTP traffic - include_ads: Download ads as well (deprecated) default_search: Prepend this string if an input url is not valid. 'auto' for elaborate guessing encoding: Use this encoding instead of the system-specified. - extract_flat: Do not resolve URLs, return the immediate result. - Pass in 'in_playlist' to only show this behavior for - playlist items. + extract_flat: Whether to resolve and process url_results further + * False: Always process (default) + * True: Never process + * 'in_playlist': Do not process inside playlist/multi_video + * 'discard': Always process, but don't return the result + from inside playlist/multi_video + * 'discard_in_playlist': Same as "discard", but only for + playlists (not multi_video) wait_for_video: If given, wait for scheduled streams to become available. The value should be a tuple containing the range (min_secs, max_secs) to wait between retries @@ -362,10 +343,6 @@ class YoutubeDL(object): * when: When to run the postprocessor. Allowed values are the entries of utils.POSTPROCESS_WHEN Assumed to be 'post_process' if not given - post_hooks: Deprecated - Register a custom postprocessor instead - A list of functions that get called as the final step - for each video file, after all postprocessors have been - called. The filename will be passed as the only argument. progress_hooks: A list of functions that get called on download progress, with a dictionary with the entries * status: One of "downloading", "error", or "finished". @@ -400,7 +377,7 @@ class YoutubeDL(object): Progress hooks are guaranteed to be called at least twice (with status "started" and "finished") if the processing is successful. - merge_output_format: Extension to use when merging formats. + merge_output_format: "/" separated list of extensions to use when merging formats. final_ext: Expected final extension; used to detect when the file was already downloaded and converted fixup: Automatically correct known faults of the file. @@ -410,8 +387,6 @@ class YoutubeDL(object): - "detect_or_warn": check whether we can do anything about it, warn otherwise (default) source_address: Client-side IP address to bind to. - call_home: Boolean, true iff we are allowed to contact the - hypervideo servers for debugging. (BROKEN) sleep_interval_requests: Number of seconds to sleep between requests during extraction sleep_interval: Number of seconds to sleep before each download when @@ -427,10 +402,14 @@ class YoutubeDL(object): sleep_interval_subtitles: Number of seconds to sleep before each subtitle download listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. - match_filter: A function that gets called with the info_dict of - every video. - If it returns a message, the video is ignored. - If it returns None, the video is downloaded. + match_filter: A function that gets called for every video with the signature + (info_dict, *, incomplete: bool) -> Optional[str] + For backward compatibility with youtube-dl, the signature + (info_dict) -> Optional[str] is also allowed. + - If it returns a message, the video is ignored. 
+ - If it returns None, the video is downloaded. + - If it returns utils.NO_DEFAULT, the user is interactively + asked whether to download the video. match_filter_func in utils.py is one example for this. no_color: Do not emit color codes in output. geo_bypass: Bypass geographic restriction via faking X-Forwarded-For @@ -442,17 +421,10 @@ class YoutubeDL(object): geo_bypass_ip_block: IP range in CIDR notation that will be used similarly to geo_bypass_country - - The following options determine which downloader is picked: external_downloader: A dictionary of protocol keys and the executable of the external downloader to use for it. The allowed protocols are default|http|ftp|m3u8|dash|rtsp|rtmp|mms. Set the value to 'native' to use the native downloader - hls_prefer_native: Deprecated - Use external_downloader = {'m3u8': 'native'} - or {'m3u8': 'ffmpeg'} instead. - Use the native HLS downloader instead of ffmpeg/avconv - if True, otherwise use ffmpeg/avconv if False, otherwise - use downloader suggested by extractor if None. compat_opts: Compatibility options. See "Differences in default behavior". The following options do not work when used through the API: filename, abort-on-error, multistreams, no-live-chat, format-sort @@ -462,17 +434,29 @@ class YoutubeDL(object): Allowed keys are 'download', 'postprocess', 'download-title' (console title) and 'postprocess-title'. The template is mapped on a dictionary with keys 'progress' and 'info' + retry_sleep_functions: Dictionary of functions that takes the number of attempts + as argument and returns the time to sleep in seconds. + Allowed keys are 'http', 'fragment', 'file_access' + download_ranges: A callback function that gets called for every video with + the signature (info_dict, ydl) -> Iterable[Section]. + Only the returned sections will be downloaded. + Each Section is a dict with the following keys: + * start_time: Start time of the section in seconds + * end_time: End time of the section in seconds + * title: Section title (Optional) + * index: Section number (Optional) + force_keyframes_at_cuts: Re-encode the video when downloading ranges to get precise cuts + noprogress: Do not print the progress bar + live_from_start: Whether to download livestreams videos from the start The following parameters are not used by YoutubeDL itself, they are used by the downloader (see hypervideo_dl/downloader/common.py): nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries, - continuedl, noprogress, xattr_set_filesize, hls_use_mpegts, http_chunk_size, + continuedl, xattr_set_filesize, hls_use_mpegts, http_chunk_size, external_downloader_args, concurrent_fragment_downloads. The following options are used by the post processors: - prefer_ffmpeg: If False, use avconv instead of ffmpeg if both are available, - otherwise prefer ffmpeg. (avconv support is deprecated) ffmpeg_location: Location of the ffmpeg/avconv binary; either the path to the binary or its containing directory. postprocessor_args: A dictionary of postprocessor/executable keys (in lower case) @@ -490,44 +474,89 @@ class YoutubeDL(object): discontinuities such as ad breaks (default: False) extractor_args: A dictionary of arguments to be passed to the extractors. See "EXTRACTOR ARGUMENTS" for details. - Eg: {'youtube': {'skip': ['dash', 'hls']}} + E.g. {'youtube': {'skip': ['dash', 'hls']}} mark_watched: Mark videos watched (even with --simulate). 
Only for YouTube - youtube_include_dash_manifest: Deprecated - Use extractor_args instead. + + The following options are deprecated and may be removed in the future: + + force_generic_extractor: Force downloader to use the generic extractor + - Use allowed_extractors = ['generic', 'default'] + playliststart: - Use playlist_items + Playlist item to start at. + playlistend: - Use playlist_items + Playlist item to end at. + playlistreverse: - Use playlist_items + Download playlist items in reverse order. + forceurl: - Use forceprint + Force printing final URL. + forcetitle: - Use forceprint + Force printing title. + forceid: - Use forceprint + Force printing ID. + forcethumbnail: - Use forceprint + Force printing thumbnail URL. + forcedescription: - Use forceprint + Force printing description. + forcefilename: - Use forceprint + Force printing final filename. + forceduration: - Use forceprint + Force printing duration. + allsubtitles: - Use subtitleslangs = ['all'] + Downloads all the subtitles of the video + (requires writesubtitles or writeautomaticsub) + include_ads: - Doesn't work + Download ads as well + call_home: - Not implemented + Boolean, true iff we are allowed to contact the + hypervideo servers for debugging. + post_hooks: - Register a custom postprocessor + A list of functions that get called as the final step + for each video file, after all postprocessors have been + called. The filename will be passed as the only argument. + hls_prefer_native: - Use external_downloader = {'m3u8': 'native'} or {'m3u8': 'ffmpeg'}. + Use the native HLS downloader instead of ffmpeg/avconv + if True, otherwise use ffmpeg/avconv if False, otherwise + use downloader suggested by extractor if None. + prefer_ffmpeg: - avconv support is deprecated + If False, use avconv instead of ffmpeg if both are available, + otherwise prefer ffmpeg. + youtube_include_dash_manifest: - Use extractor_args If True (default), DASH manifests and related data will be downloaded and processed by extractor. You can reduce network I/O by disabling it if you don't care about DASH. (only for youtube) - youtube_include_hls_manifest: Deprecated - Use extractor_args instead. + youtube_include_hls_manifest: - Use extractor_args If True (default), HLS manifests and related data will be downloaded and processed by extractor. You can reduce network I/O by disabling it if you don't care about HLS. 
(only for youtube) """ - _NUMERIC_FIELDS = set(( - 'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx', + _NUMERIC_FIELDS = { + 'width', 'height', 'asr', 'audio_channels', 'fps', + 'tbr', 'abr', 'vbr', 'filesize', 'filesize_approx', 'timestamp', 'release_timestamp', 'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count', 'average_rating', 'comment_count', 'age_limit', 'start_time', 'end_time', 'chapter_number', 'season_number', 'episode_number', 'track_number', 'disc_number', 'release_year', - )) + } _format_fields = { # NB: Keep in sync with the docstring of extractor/common.py 'url', 'manifest_url', 'manifest_stream_number', 'ext', 'format', 'format_id', 'format_note', - 'width', 'height', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', - 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', + 'width', 'height', 'aspect_ratio', 'resolution', 'dynamic_range', 'tbr', 'abr', 'acodec', 'asr', 'audio_channels', + 'vbr', 'fps', 'vcodec', 'container', 'filesize', 'filesize_approx', 'rows', 'columns', 'player_url', 'protocol', 'fragment_base_url', 'fragments', 'is_from_start', 'preference', 'language', 'language_preference', 'quality', 'source_preference', 'http_headers', 'stretched_ratio', 'no_resume', 'has_drm', 'downloader_options', 'page_url', 'app', 'play_path', 'tc_url', 'flash_version', 'rtmp_live', 'rtmp_conn', 'rtmp_protocol', 'rtmp_real_time' } _format_selection_exts = { - 'audio': {'m4a', 'mp3', 'ogg', 'aac'}, - 'video': {'mp4', 'flv', 'webm', '3gp'}, - 'storyboards': {'mhtml'}, + 'audio': set(MEDIA_EXTENSIONS.common_audio), + 'video': set(MEDIA_EXTENSIONS.common_video + ('3gp', )), + 'storyboards': set(MEDIA_EXTENSIONS.storyboards), } def __init__(self, params=None, auto_init=True): @@ -554,21 +583,30 @@ class YoutubeDL(object): self.cache = Cache(self) windows_enable_vt_mode() - self._out_files = { - 'error': sys.stderr, - 'print': sys.stderr if self.params.get('logtostderr') else sys.stdout, - 'console': None if compat_os_name == 'nt' else next( + stdout = sys.stderr if self.params.get('logtostderr') else sys.stdout + self._out_files = Namespace( + out=stdout, + error=sys.stderr, + screen=sys.stderr if self.params.get('quiet') else stdout, + console=None if compat_os_name == 'nt' else next( filter(supports_terminal_sequences, (sys.stderr, sys.stdout)), None) - } - self._out_files['screen'] = sys.stderr if self.params.get('quiet') else self._out_files['print'] - self._allow_colors = { - type_: not self.params.get('no_color') and supports_terminal_sequences(self._out_files[type_]) - for type_ in ('screen', 'error') - } - - if sys.version_info < (3, 6): - self.report_warning( - 'Python version %d.%d is not supported! Please update to Python 3.6 or above' % sys.version_info[:2]) + ) + self._allow_colors = Namespace(**{ + type_: not self.params.get('no_color') and supports_terminal_sequences(stream) + for type_, stream in self._out_files.items_ if type_ != 'console' + }) + + # The code is left like this to be reused for future deprecations + MIN_SUPPORTED, MIN_RECOMMENDED = (3, 7), (3, 7) + current_version = sys.version_info[:2] + if current_version < MIN_RECOMMENDED: + msg = ('Support for Python version %d.%d has been deprecated. ' + 'See https://github.com/hypervideo/hypervideo/issues/3764 for more details.' + '\n You will no longer receive updates on this version') + if current_version < MIN_SUPPORTED: + msg = 'Python version %d.%d is no longer supported' + self.deprecation_warning( + f'{msg}! 
Please update to Python %d.%d or above' % (*current_version, *MIN_RECOMMENDED)) if self.params.get('allow_unplayable_formats'): self.report_warning( @@ -577,9 +615,33 @@ class YoutubeDL(object): ' If you experience any issues while using this option, ' f'{self._format_err("DO NOT", self.Styles.ERROR)} open a bug report') + if self.params.get('bidi_workaround', False): + try: + import pty + master, slave = pty.openpty() + width = shutil.get_terminal_size().columns + width_args = [] if width is None else ['-w', str(width)] + sp_kwargs = {'stdin': subprocess.PIPE, 'stdout': slave, 'stderr': self._out_files.error} + try: + self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs) + except OSError: + self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) + self._output_channel = os.fdopen(master, 'rb') + except OSError as ose: + if ose.errno == errno.ENOENT: + self.report_warning( + 'Could not find fribidi executable, ignoring --bidi-workaround. ' + 'Make sure that fribidi is an executable file in one of the directories in your $PATH.') + else: + raise + + self.params['compat_opts'] = set(self.params.get('compat_opts', ())) + if auto_init and auto_init != 'no_verbose_header': + self.print_debug_header() + def check_deprecated(param, option, suggestion): if self.params.get(param) is not None: - self.report_warning('%s is deprecated. Use %s instead' % (option, suggestion)) + self.report_warning(f'{option} is deprecated. Use {suggestion} instead') return True return False @@ -594,9 +656,9 @@ class YoutubeDL(object): for msg in self.params.get('_warnings', []): self.report_warning(msg) for msg in self.params.get('_deprecation_warnings', []): - self.deprecation_warning(msg) + self.deprecated_feature(msg) - if 'list-formats' in self.params.get('compat_opts', []): + if 'list-formats' in self.params['compat_opts']: self.params['listformats_table'] = False if 'overwrites' not in self.params and self.params.get('nooverwrites') is not None: @@ -609,6 +671,13 @@ class YoutubeDL(object): else: self.params['nooverwrites'] = not self.params['overwrites'] + if self.params.get('simulate') is None and any(( + self.params.get('list_thumbnails'), + self.params.get('listformats'), + self.params.get('listsubtitles'), + )): + self.params['simulate'] = 'list_only' + self.params.setdefault('forceprint', {}) self.params.setdefault('print_to_file', {}) @@ -616,31 +685,8 @@ class YoutubeDL(object): if not isinstance(params['forceprint'], dict): self.params['forceprint'] = {'video': params['forceprint']} - if self.params.get('bidi_workaround', False): - try: - import pty - master, slave = pty.openpty() - width = compat_get_terminal_size().columns - if width is None: - width_args = [] - else: - width_args = ['-w', str(width)] - sp_kwargs = dict( - stdin=subprocess.PIPE, - stdout=slave, - stderr=self._out_files['error']) - try: - self._output_process = Popen(['bidiv'] + width_args, **sp_kwargs) - except OSError: - self._output_process = Popen(['fribidi', '-c', 'UTF-8'] + width_args, **sp_kwargs) - self._output_channel = os.fdopen(master, 'rb') - except OSError as ose: - if ose.errno == errno.ENOENT: - self.report_warning( - 'Could not find fribidi executable, ignoring --bidi-workaround. 
' - 'Make sure that fribidi is an executable file in one of the directories in your $PATH.') - else: - raise + if auto_init: + self.add_default_info_extractors() if (sys.platform != 'win32' and sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] @@ -652,7 +698,7 @@ class YoutubeDL(object): 'Set the LC_ALL environment variable to fix this.') self.params['restrictfilenames'] = True - self.outtmpl_dict = self.parse_outtmpl() + self._parse_outtmpl() # Creating format selector here allows us to catch syntax errors before the extraction self.format_selector = ( @@ -663,13 +709,6 @@ class YoutubeDL(object): # Set http_headers defaults according to std_headers self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {})) - self._setup_opener() - - if auto_init: - if auto_init != 'no_verbose_header': - self.print_debug_header() - self.add_default_info_extractors() - hooks = { 'post_hooks': self.add_post_hook, 'progress_hooks': self.add_progress_hook, @@ -683,28 +722,31 @@ class YoutubeDL(object): pp_def = dict(pp_def_raw) when = pp_def.pop('when', 'post_process') self.add_post_processor( - get_postprocessor(pp_def.pop('key'))(self, **compat_kwargs(pp_def)), + get_postprocessor(pp_def.pop('key'))(self, **pp_def), when=when) + self._setup_opener() register_socks_protocols() def preload_download_archive(fn): """Preload the archive, if any is specified""" + archive = set() if fn is None: - return False + return archive + elif not is_path_like(fn): + return fn + self.write_debug(f'Loading archive file {fn!r}') try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: - self.archive.add(line.strip()) - except IOError as ioe: + archive.add(line.strip()) + except OSError as ioe: if ioe.errno != errno.ENOENT: raise - return False - return True + return archive - self.archive = set() - preload_download_archive(self.params.get('download_archive')) + self.archive = preload_download_archive(self.params.get('download_archive')) def warn_if_short_id(self, argv): # short YouTube ID starting with dash? 
@@ -730,13 +772,6 @@ class YoutubeDL(object): self._ies_instances[ie_key] = ie ie.set_downloader(self) - def _get_info_extractor_class(self, ie_key): - ie = self._ies.get(ie_key) - if ie is None: - ie = get_info_extractor(ie_key) - self.add_info_extractor(ie) - return ie - def get_info_extractor(self, ie_key): """ Get an instance of an IE with name ie_key, it will try to get one from @@ -753,11 +788,23 @@ class YoutubeDL(object): """ Add the InfoExtractors returned by gen_extractors to the end of the list """ - for ie in gen_extractor_classes(): - self.add_info_extractor(ie) + all_ies = {ie.IE_NAME.lower(): ie for ie in gen_extractor_classes()} + all_ies['end'] = UnsupportedURLIE() + try: + ie_names = orderedSet_from_options( + self.params.get('allowed_extractors', ['default']), { + 'all': list(all_ies), + 'default': [name for name, ie in all_ies.items() if ie._ENABLED], + }, use_regex=True) + except re.error as e: + raise ValueError(f'Wrong regex for allowed_extractors: {e.pattern}') + for name in ie_names: + self.add_info_extractor(all_ies[name]) + self.write_debug(f'Loaded {len(ie_names)} extractors') def add_post_processor(self, pp, when='post_process'): """Add a PostProcessor object to the end of the chain.""" + assert when in POSTPROCESS_WHEN, f'Invalid when={when}' self._pps[when].append(pp) pp.set_downloader(self) @@ -781,11 +828,11 @@ class YoutubeDL(object): return message assert hasattr(self, '_output_process') - assert isinstance(message, compat_str) + assert isinstance(message, str) line_count = message.count('\n') + 1 - self._output_process.stdin.write((message + '\n').encode('utf-8')) + self._output_process.stdin.write((message + '\n').encode()) self._output_process.stdin.flush() - res = ''.join(self._output_channel.readline().decode('utf-8') + res = ''.join(self._output_channel.readline().decode() for _ in range(line_count)) return res[:-len('\n')] @@ -799,12 +846,14 @@ class YoutubeDL(object): def to_stdout(self, message, skip_eol=False, quiet=None): """Print message to stdout""" if quiet is not None: - self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. Use "YoutubeDL.to_screen" instead') - self._write_string( - '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), - self._out_files['print']) - - def to_screen(self, message, skip_eol=False, quiet=None): + self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument quiet. ' + 'Use "YoutubeDL.to_screen" instead') + if skip_eol is not False: + self.deprecation_warning('"YoutubeDL.to_stdout" no longer accepts the argument skip_eol. 
' + 'Use "YoutubeDL.to_screen" instead') + self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.out) + + def to_screen(self, message, skip_eol=False, quiet=None, only_once=False): """Print message to screen if not in quiet mode""" if self.params.get('logger'): self.params['logger'].debug(message) @@ -813,20 +862,20 @@ class YoutubeDL(object): return self._write_string( '%s%s' % (self._bidi_workaround(message), ('' if skip_eol else '\n')), - self._out_files['screen']) + self._out_files.screen, only_once=only_once) def to_stderr(self, message, only_once=False): """Print message to stderr""" - assert isinstance(message, compat_str) + assert isinstance(message, str) if self.params.get('logger'): self.params['logger'].error(message) else: - self._write_string('%s\n' % self._bidi_workaround(message), self._out_files['error'], only_once=only_once) + self._write_string(f'{self._bidi_workaround(message)}\n', self._out_files.error, only_once=only_once) def _send_console_code(self, code): - if compat_os_name == 'nt' or not self._out_files['console']: + if compat_os_name == 'nt' or not self._out_files.console: return - self._write_string(code, self._out_files['console']) + self._write_string(code, self._out_files.console) def to_console_title(self, message): if not self.params.get('consoletitle', False): @@ -894,16 +943,19 @@ class YoutubeDL(object): raise DownloadError(message, exc_info) self._download_retcode = 1 - class Styles(Enum): - HEADERS = 'yellow' - EMPHASIS = 'light blue' - ID = 'green' - DELIM = 'blue' - ERROR = 'red' - WARNING = 'yellow' - SUPPRESS = 'light black' + Styles = Namespace( + HEADERS='yellow', + EMPHASIS='light blue', + FILENAME='green', + ID='green', + DELIM='blue', + ERROR='red', + WARNING='yellow', + SUPPRESS='light black', + ) def _format_text(self, handle, allow_colors, text, f, fallback=None, *, test_encoding=False): + text = str(text) if test_encoding: original_text = text # handle.encoding can be None. 
See https://github.com/hypervideo/hypervideo/issues/2711 @@ -911,17 +963,16 @@ class YoutubeDL(object): text = text.encode(encoding, 'ignore').decode(encoding) if fallback is not None and text != original_text: text = fallback - if isinstance(f, self.Styles): - f = f.value return format_text(text, f) if allow_colors else text if fallback is None else fallback + def _format_out(self, *args, **kwargs): + return self._format_text(self._out_files.out, self._allow_colors.out, *args, **kwargs) + def _format_screen(self, *args, **kwargs): - return self._format_text( - self._out_files['screen'], self._allow_colors['screen'], *args, **kwargs) + return self._format_text(self._out_files.screen, self._allow_colors.screen, *args, **kwargs) def _format_err(self, *args, **kwargs): - return self._format_text( - self._out_files['error'], self._allow_colors['error'], *args, **kwargs) + return self._format_text(self._out_files.error, self._allow_colors.error, *args, **kwargs) def report_warning(self, message, only_once=False): ''' @@ -935,11 +986,14 @@ class YoutubeDL(object): return self.to_stderr(f'{self._format_err("WARNING:", self.Styles.WARNING)} {message}', only_once) - def deprecation_warning(self, message): + def deprecation_warning(self, message, *, stacklevel=0): + deprecation_warning( + message, stacklevel=stacklevel + 1, printer=self.report_error, is_error=False) + + def deprecated_feature(self, message): if self.params.get('logger') is not None: - self.params['logger'].warning(f'DeprecationWarning: {message}') - else: - self.to_stderr(f'{self._format_err("DeprecationWarning:", self.Styles.ERROR)} {message}', True) + self.params['logger'].warning(f'Deprecated Feature: {message}') + self.to_stderr(f'{self._format_err("Deprecated Feature:", self.Styles.ERROR)} {message}', True) def report_error(self, message, *args, **kwargs): ''' @@ -952,7 +1006,7 @@ class YoutubeDL(object): '''Log debug message or Print message to stderr''' if not self.params.get('verbose', False): return - message = '[debug] %s' % message + message = f'[debug] {message}' if self.params.get('logger'): self.params['logger'].debug(message) else: @@ -973,7 +1027,7 @@ class YoutubeDL(object): self.to_screen('Deleting existing file') def raise_no_formats(self, info, forced=False, *, msg=None): - has_drm = info.get('__has_drm') + has_drm = info.get('_has_drm') ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg) msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!' if forced or not ignored: @@ -983,37 +1037,27 @@ class YoutubeDL(object): self.report_warning(msg) def parse_outtmpl(self): - outtmpl_dict = self.params.get('outtmpl', {}) - if not isinstance(outtmpl_dict, dict): - outtmpl_dict = {'default': outtmpl_dict} - # Remove spaces in the default template - if self.params.get('restrictfilenames'): + self.deprecation_warning('"YoutubeDL.parse_outtmpl" is deprecated and may be removed in a future version') + self._parse_outtmpl() + return self.params['outtmpl'] + + def _parse_outtmpl(self): + sanitize = IDENTITY + if self.params.get('restrictfilenames'): # Remove spaces in the default template sanitize = lambda x: x.replace(' - ', ' ').replace(' ', '-') - else: - sanitize = lambda x: x - outtmpl_dict.update({ - k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() - if outtmpl_dict.get(k) is None}) - for key, val in outtmpl_dict.items(): - if isinstance(val, bytes): - self.report_warning( - 'Parameter outtmpl is bytes, but should be a unicode string. 
' - 'Put from __future__ import unicode_literals at the top of your code file or consider switching to Python 3.x.') - return outtmpl_dict + + outtmpl = self.params.setdefault('outtmpl', {}) + if not isinstance(outtmpl, dict): + self.params['outtmpl'] = outtmpl = {'default': outtmpl} + outtmpl.update({k: sanitize(v) for k, v in DEFAULT_OUTTMPL.items() if outtmpl.get(k) is None}) def get_output_path(self, dir_type='', filename=None): paths = self.params.get('paths', {}) - assert isinstance(paths, dict) + assert isinstance(paths, dict), '"paths" parameter must be a dictionary' path = os.path.join( expand_path(paths.get('home', '').strip()), expand_path(paths.get(dir_type, '').strip()) if dir_type else '', filename or '') - - # Temporary fix for #4787 - # 'Treat' all problem characters by passing filename through preferredencoding - # to workaround encoding issues with subprocess on python2 @ Windows - if sys.version_info < (3, 0) and sys.platform == 'win32': - path = encodeFilename(path, True).decode(preferredencoding()) return sanitize_path(path, force=self.params.get('windowsfilenames')) @staticmethod @@ -1023,11 +1067,11 @@ class YoutubeDL(object): # '%%' intact for template dict substitution step. Working around # with boundary-alike separator hack. sep = ''.join([random.choice(ascii_letters) for _ in range(32)]) - outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep)) + outtmpl = outtmpl.replace('%%', f'%{sep}%').replace('$$', f'${sep}$') # outtmpl should be expand_path'ed before template dict substitution # because meta fields may contain env variables we don't want to - # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and + # be expanded. E.g. for outtmpl "%(title)s.%(ext)s" and # title "Hello $PATH", we don't want `$PATH` to be expanded. 
return expand_path(outtmpl).replace(sep, '') @@ -1043,7 +1087,7 @@ class YoutubeDL(object): def validate_outtmpl(cls, outtmpl): ''' @return None or Exception object ''' outtmpl = re.sub( - STR_FORMAT_RE_TMPL.format('[^)]*', '[ljqBUDS]'), + STR_FORMAT_RE_TMPL.format('[^)]*', '[ljhqBUDS]'), lambda mobj: f'{mobj.group(0)[:-1]}s', cls._outtmpl_expandpath(outtmpl)) try: @@ -1056,6 +1100,7 @@ class YoutubeDL(object): def _copy_infodict(info_dict): info_dict = dict(info_dict) info_dict.pop('__postprocessors', None) + info_dict.pop('__pending_error', None) return info_dict def prepare_outtmpl(self, outtmpl, info_dict, sanitize=False): @@ -1071,7 +1116,7 @@ class YoutubeDL(object): formatSeconds(info_dict['duration'], '-' if sanitize else ':') if info_dict.get('duration', None) is not None else None) - info_dict['autonumber'] = self.params.get('autonumber_start', 1) - 1 + self._num_downloads + info_dict['autonumber'] = int(self.params.get('autonumber_start', 1) - 1 + self._num_downloads) info_dict['video_autonumber'] = self._num_videos if info_dict.get('resolution') is None: info_dict['resolution'] = self.format_resolution(info_dict, default=None) @@ -1079,38 +1124,51 @@ class YoutubeDL(object): # For fields playlist_index, playlist_autonumber and autonumber convert all occurrences # of %(field)s to %(field)0Nd for backward compatibility field_size_compat_map = { - 'playlist_index': number_of_digits(info_dict.get('_last_playlist_index') or 0), + 'playlist_index': number_of_digits(info_dict.get('__last_playlist_index') or 0), 'playlist_autonumber': number_of_digits(info_dict.get('n_entries') or 0), 'autonumber': self.params.get('autonumber_size') or 5, } TMPL_DICT = {} - EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljqBUDS]')) + EXTERNAL_FORMAT_RE = re.compile(STR_FORMAT_RE_TMPL.format('[^)]*', f'[{STR_FORMAT_TYPES}ljhqBUDS]')) MATH_FUNCTIONS = { '+': float.__add__, '-': float.__sub__, } # Field is of the form key1.key2... - # where keys (except first) can be string, int or slice - FIELD_RE = r'\w*(?:\.(?:\w+|{num}|{num}?(?::{num}?){{1,2}}))*'.format(num=r'(?:-?\d+)') - MATH_FIELD_RE = r'''(?:{field}|{num})'''.format(field=FIELD_RE, num=r'-?\d+(?:.\d+)?') + # where keys (except first) can be string, int, slice or "{field, ...}" + FIELD_INNER_RE = r'(?:\w+|%(num)s|%(num)s?(?::%(num)s?){1,2})' % {'num': r'(?:-?\d+)'} + FIELD_RE = r'\w*(?:\.(?:%(inner)s|{%(field)s(?:,%(field)s)*}))*' % { + 'inner': FIELD_INNER_RE, + 'field': rf'\w*(?:\.{FIELD_INNER_RE})*' + } + MATH_FIELD_RE = rf'(?:{FIELD_RE}|-?{NUMBER_RE})' MATH_OPERATORS_RE = r'(?:%s)' % '|'.join(map(re.escape, MATH_FUNCTIONS.keys())) - INTERNAL_FORMAT_RE = re.compile(r'''(?x) + INTERNAL_FORMAT_RE = re.compile(rf'''(?x) (?P<negate>-)? - (?P<fields>{field}) - (?P<maths>(?:{math_op}{math_field})*) + (?P<fields>{FIELD_RE}) + (?P<maths>(?:{MATH_OPERATORS_RE}{MATH_FIELD_RE})*) (?:>(?P<strf_format>.+?))? (?P<remaining> (?P<alternate>(?<!\\),[^|&)]+)? (?:&(?P<replacement>.*?))? (?:\|(?P<default>.*?))? 
- )$'''.format(field=FIELD_RE, math_op=MATH_OPERATORS_RE, math_field=MATH_FIELD_RE)) + )$''') + + def _traverse_infodict(fields): + fields = [f for x in re.split(r'\.({.+?})\.?', fields) + for f in ([x] if x.startswith('{') else x.split('.'))] + for i in (0, -1): + if fields and not fields[i]: + fields.pop(i) - def _traverse_infodict(k): - k = k.split('.') - if k[0] == '': - k.pop(0) - return traverse_obj(info_dict, k, is_user_input=True, traverse_string=True) + for i, f in enumerate(fields): + if not f.startswith('{'): + continue + assert f.endswith('}'), f'No closing brace for {f} in {fields}' + fields[i] = {k: k.split('.') for k in f[1:-1].split(',')} + + return traverse_obj(info_dict, fields, is_user_input=True, traverse_string=True) def get_value(mdict): # Object traversal @@ -1146,6 +1204,9 @@ class YoutubeDL(object): if mdict['strf_format']: value = strftime_or_none(value, mdict['strf_format'].replace('\\,', ',')) + # XXX: Workaround for https://github.com/hypervideo/hypervideo/issues/4485 + if sanitize and value == '': + value = None return value na = self.params.get('outtmpl_na_placeholder', 'NA') @@ -1153,7 +1214,7 @@ class YoutubeDL(object): def filename_sanitizer(key, value, restricted=self.params.get('restrictfilenames')): return sanitize_filename(str(value), restricted=restricted, is_id=( bool(re.search(r'(^|[_.])id(\.|$)', key)) - if 'filename-sanitization' in self.params.get('compat_opts', []) + if 'filename-sanitization' in self.params['compat_opts'] else NO_DEFAULT)) sanitizer = sanitize if callable(sanitize) else filename_sanitizer @@ -1183,7 +1244,7 @@ class YoutubeDL(object): fmt = outer_mobj.group('format') if fmt == 's' and value is not None and key in field_size_compat_map.keys(): - fmt = '0{:d}d'.format(field_size_compat_map[key]) + fmt = f'0{field_size_compat_map[key]:d}d' value = default if value is None else value if replacement is None else replacement @@ -1193,12 +1254,16 @@ class YoutubeDL(object): delim = '\n' if '#' in flags else ', ' value, fmt = delim.join(map(str, variadic(value, allowed_types=(str, bytes)))), str_fmt elif fmt[-1] == 'j': # json - value, fmt = json.dumps(value, default=_dumpjson_default, indent=4 if '#' in flags else None), str_fmt + value, fmt = json.dumps( + value, default=_dumpjson_default, + indent=4 if '#' in flags else None, ensure_ascii='+' not in flags), str_fmt + elif fmt[-1] == 'h': # html + value, fmt = escapeHTML(str(value)), str_fmt elif fmt[-1] == 'q': # quoted value = map(str, variadic(value) if '#' in flags else [value]) value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt elif fmt[-1] == 'B': # bytes - value = f'%{str_fmt}'.encode('utf-8') % str(value).encode('utf-8') + value = f'%{str_fmt}'.encode() % str(value).encode() value, fmt = value.decode('utf-8', 'ignore'), 's' elif fmt[-1] == 'U': # unicode normalized value, fmt = unicodedata.normalize( @@ -1242,7 +1307,7 @@ class YoutubeDL(object): def _prepare_filename(self, info_dict, *, outtmpl=None, tmpl_type=None): assert None in (outtmpl, tmpl_type), 'outtmpl and tmpl_type are mutually exclusive' if outtmpl is None: - outtmpl = self.outtmpl_dict.get(tmpl_type or 'default', self.outtmpl_dict['default']) + outtmpl = self.params['outtmpl'].get(tmpl_type or 'default', self.params['outtmpl']['default']) try: outtmpl = self._outtmpl_expandpath(outtmpl) filename = self.evaluate_outtmpl(outtmpl, info_dict, True) @@ -1291,11 +1356,19 @@ class YoutubeDL(object): return self.get_output_path(dir_type, filename) def _match_entry(self, info_dict, incomplete=False, 
silent=False): - """ Returns None if the file should be downloaded """ + """Returns None if the file should be downloaded""" + _type = info_dict.get('_type', 'video') + assert incomplete or _type == 'video', 'Only video result can be considered complete' - video_title = info_dict.get('title', info_dict.get('id', 'video')) + video_title = info_dict.get('title', info_dict.get('id', 'entry')) def check_filter(): + if _type in ('playlist', 'multi_video'): + return + elif _type in ('url', 'url_transparent') and not try_call( + lambda: self.get_info_extractor(info_dict['ie_key']).is_single_video(info_dict['url'])): + return + if 'title' in info_dict: # This can happen when we're just evaluating the playlist title = info_dict['title'] @@ -1307,11 +1380,12 @@ class YoutubeDL(object): if rejecttitle: if re.search(rejecttitle, title, re.IGNORECASE): return '"' + title + '" title matched reject pattern "' + rejecttitle + '"' + date = info_dict.get('upload_date') if date is not None: dateRange = self.params.get('daterange', DateRange()) if date not in dateRange: - return '%s upload date is not in range %s' % (date_from_str(date).isoformat(), dateRange) + return f'{date_from_str(date).isoformat()} upload date is not in range {dateRange}' view_count = info_dict.get('view_count') if view_count is not None: min_views = self.params.get('min_views') @@ -1330,7 +1404,16 @@ class YoutubeDL(object): except TypeError: # For backward compatibility ret = None if incomplete else match_filter(info_dict) - if ret is not None: + if ret is NO_DEFAULT: + while True: + filename = self._format_screen(self.prepare_filename(info_dict), self.Styles.FILENAME) + reply = input(self._format_screen( + f'Download "{filename}"? (Y/n): ', self.Styles.EMPHASIS)).lower().strip() + if reply in {'y', ''}: + return None + elif reply == 'n': + return f'Skipping {video_title}' + elif ret is not None: return ret return None @@ -1356,18 +1439,19 @@ class YoutubeDL(object): def extract_info(self, url, download=True, ie_key=None, extra_info=None, process=True, force_generic_extractor=False): """ - Return a list with a dictionary for each video extracted. + Extract and return the information dictionary of the URL Arguments: - url -- URL to extract + @param url URL to extract Keyword arguments: - download -- whether to download videos during extraction - ie_key -- extractor key hint - extra_info -- dictionary containing the extra values to add to each result - process -- whether to resolve all unresolved references (URLs, playlist items), - must be True for download to work. - force_generic_extractor -- force using the generic extractor + @param download Whether to download videos + @param process Whether to resolve all unresolved references (URLs, playlist items). 
+ Must be True for download to work + @param ie_key Use only the extractor with this key + + @param extra_info Dictionary containing the extra values to add to the info (For internal use only) + @force_generic_extractor Force using the generic extractor (Deprecated; use ie_key='Generic') """ if extra_info is None: @@ -1377,11 +1461,11 @@ class YoutubeDL(object): ie_key = 'Generic' if ie_key: - ies = {ie_key: self._get_info_extractor_class(ie_key)} + ies = {ie_key: self._ies[ie_key]} if ie_key in self._ies else {} else: ies = self._ies - for ie_key, ie in ies.items(): + for key, ie in ies.items(): if not ie.suitable(url): continue @@ -1390,16 +1474,18 @@ class YoutubeDL(object): 'and will probably not work.') temp_id = ie.get_temp_id(url) - if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}): - self.to_screen(f'[{ie_key}] {temp_id}: has already been recorded in the archive') + if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': key}): + self.to_screen(f'[{key}] {temp_id}: has already been recorded in the archive') if self.params.get('break_on_existing', False): raise ExistingVideoReached() break - return self.__extract_info(url, self.get_info_extractor(ie_key), download, extra_info, process) + return self.__extract_info(url, self.get_info_extractor(key), download, extra_info, process) else: - self.report_error('no suitable InfoExtractor for URL %s' % url) + extractors_restricted = self.params.get('allowed_extractors') not in (None, ['default']) + self.report_error(f'No suitable extractor{format_field(ie_key, None, " (%s)")} found for URL {url}', + tb=False if extractors_restricted else None) - def __handle_extraction_exceptions(func): + def _handle_extraction_exceptions(func): @functools.wraps(func) def wrapper(self, *args, **kwargs): while True: @@ -1431,7 +1517,7 @@ class YoutubeDL(object): break return wrapper - def _wait_for_video(self, ie_result): + def _wait_for_video(self, ie_result={}): if (not self.params.get('wait_for_video') or ie_result.get('_type', 'video') != 'video' or ie_result.get('formats') or ie_result.get('url')): @@ -1442,7 +1528,12 @@ class YoutubeDL(object): def progress(msg): nonlocal last_msg - self.to_screen(msg + ' ' * (len(last_msg) - len(msg)) + '\r', skip_eol=True) + full_msg = f'{msg}\n' + if not self.params.get('noprogress'): + full_msg = msg + ' ' * (len(last_msg) - len(msg)) + '\r' + elif last_msg: + return + self.to_screen(full_msg, skip_eol=True) last_msg = msg min_wait, max_wait = self.params.get('wait_for_video') @@ -1450,7 +1541,7 @@ class YoutubeDL(object): if diff is None and ie_result.get('live_status') == 'is_upcoming': diff = round(random.uniform(min_wait, max_wait) if (max_wait and min_wait) else (max_wait or min_wait), 0) self.report_warning('Release time of video is not known') - elif (diff or 0) <= 0: + elif ie_result and (diff or 0) <= 0: self.report_warning('Video should already be available according to extracted info') diff = min(max(diff or 0, min_wait or 0), max_wait or float('inf')) self.to_screen(f'[wait] Waiting for {format_dur(diff)} - Press Ctrl+C to try now') @@ -1472,10 +1563,18 @@ class YoutubeDL(object): self.to_screen('') raise - @__handle_extraction_exceptions + @_handle_extraction_exceptions def __extract_info(self, url, ie, download, extra_info, process): - ie_result = ie.extract(url) + try: + ie_result = ie.extract(url) + except UserNotLive as e: + if process: + if self.params.get('wait_for_video'): + self.report_warning(e) + self._wait_for_video() + raise if 
ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here) + self.report_warning(f'Extractor {ie.IE_NAME} returned nothing{bug_reports_message()}') return if isinstance(ie_result, list): # Backwards compatibility: old IE result format @@ -1523,7 +1622,8 @@ class YoutubeDL(object): result_type = ie_result.get('_type', 'video') if result_type in ('url', 'url_transparent'): - ie_result['url'] = sanitize_url(ie_result['url']) + ie_result['url'] = sanitize_url( + ie_result['url'], scheme='http' if self.params.get('prefer_insecure') else 'https') if ie_result.get('original_url'): extra_info.setdefault('original_url', ie_result['original_url']) @@ -1537,7 +1637,9 @@ class YoutubeDL(object): self.add_default_extra_info(info_copy, ie, ie_result['url']) self.add_extra_info(info_copy, extra_info) info_copy, _ = self.pre_process(info_copy) + self._fill_common_fields(info_copy, False) self.__forced_printings(info_copy, self.prepare_filename(info_copy), incomplete=True) + self._raise_pending_errors(info_copy) if self.params.get('force_write_download_archive', False): self.record_download_archive(info_copy) return ie_result @@ -1545,10 +1647,11 @@ class YoutubeDL(object): if result_type == 'video': self.add_extra_info(ie_result, extra_info) ie_result = self.process_video_result(ie_result, download=download) + self._raise_pending_errors(ie_result) additional_urls = (ie_result or {}).get('additional_urls') if additional_urls: # TODO: Improve MetadataParserPP to allow setting a list - if isinstance(additional_urls, compat_str): + if isinstance(additional_urls, str): additional_urls = [additional_urls] self.to_screen( '[info] %s: %d additional URL(s) requested' % (ie_result['id'], len(additional_urls))) @@ -1579,9 +1682,13 @@ class YoutubeDL(object): if not info: return info + exempted_fields = {'_type', 'url', 'ie_key'} + if not ie_result.get('section_end') and ie_result.get('section_start') is None: + # For video clips, the id etc of the clip extractor should be used + exempted_fields |= {'id', 'extractor', 'extractor_key'} + new_result = info.copy() - new_result.update(filter_dict(ie_result, lambda k, v: ( - v is not None and k not in {'_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'}))) + new_result.update(filter_dict(ie_result, lambda k, v: v is not None and k not in exempted_fields)) # Extracted info may not be a video result (i.e. 
# info.get('_type', 'video') != video) but rather an url or @@ -1597,8 +1704,8 @@ class YoutubeDL(object): elif result_type in ('playlist', 'multi_video'): # Protect from infinite recursion due to recursively nested playlists # (see https://github.com/ytdl-org/youtube-dl/issues/27833) - webpage_url = ie_result['webpage_url'] - if webpage_url in self._playlist_urls: + webpage_url = ie_result.get('webpage_url') # Playlists maynot have webpage_url + if webpage_url and webpage_url in self._playlist_urls: self.to_screen( '[download] Skipping already downloaded playlist: %s' % ie_result.get('title') or ie_result.get('id')) @@ -1640,124 +1747,65 @@ class YoutubeDL(object): return make_dir(path, self.report_error) @staticmethod - def _playlist_infodict(ie_result, **kwargs): - return { - **ie_result, + def _playlist_infodict(ie_result, strict=False, **kwargs): + info = { + 'playlist_count': ie_result.get('playlist_count'), 'playlist': ie_result.get('title') or ie_result.get('id'), 'playlist_id': ie_result.get('id'), 'playlist_title': ie_result.get('title'), 'playlist_uploader': ie_result.get('uploader'), 'playlist_uploader_id': ie_result.get('uploader_id'), - 'playlist_index': 0, **kwargs, } + if strict: + return info + if ie_result.get('webpage_url'): + info.update({ + 'webpage_url': ie_result['webpage_url'], + 'webpage_url_basename': url_basename(ie_result['webpage_url']), + 'webpage_url_domain': get_domain(ie_result['webpage_url']), + }) + return { + **info, + 'playlist_index': 0, + '__last_playlist_index': max(ie_result['requested_entries'] or (0, 0)), + 'extractor': ie_result['extractor'], + 'extractor_key': ie_result['extractor_key'], + } def __process_playlist(self, ie_result, download): - # We process each entry in the playlist - playlist = ie_result.get('title') or ie_result.get('id') - self.to_screen('[download] Downloading playlist: %s' % playlist) - - if 'entries' not in ie_result: - raise EntryNotInPlaylist('There are no entries') - - MissingEntry = object() - incomplete_entries = bool(ie_result.get('requested_entries')) - if incomplete_entries: - def fill_missing_entries(entries, indices): - ret = [MissingEntry] * max(indices) - for i, entry in zip(indices, entries): - ret[i - 1] = entry - return ret - ie_result['entries'] = fill_missing_entries(ie_result['entries'], ie_result['requested_entries']) - - playlist_results = [] - - playliststart = self.params.get('playliststart', 1) - playlistend = self.params.get('playlistend') - # For backwards compatibility, interpret -1 as whole list - if playlistend == -1: - playlistend = None - - playlistitems_str = self.params.get('playlist_items') - playlistitems = None - if playlistitems_str is not None: - def iter_playlistitems(format): - for string_segment in format.split(','): - if '-' in string_segment: - start, end = string_segment.split('-') - for item in range(int(start), int(end) + 1): - yield int(item) - else: - yield int(string_segment) - playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) + """Process each entry in the playlist""" + assert ie_result['_type'] in ('playlist', 'multi_video') - ie_entries = ie_result['entries'] - if isinstance(ie_entries, list): - playlist_count = len(ie_entries) - msg = f'Collected {playlist_count} videos; downloading %d of them' - ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count - - def get_entry(i): - return ie_entries[i - 1] - else: - msg = 'Downloading %d videos' - if not isinstance(ie_entries, (PagedList, LazyList)): - ie_entries = LazyList(ie_entries) - 
elif isinstance(ie_entries, InAdvancePagedList): - if ie_entries._pagesize == 1: - playlist_count = ie_entries._pagecount - - def get_entry(i): - return YoutubeDL.__handle_extraction_exceptions( - lambda self, i: ie_entries[i - 1] - )(self, i) - - entries, broken = [], False - items = playlistitems if playlistitems is not None else itertools.count(playliststart) - for i in items: - if i == 0: - continue - if playlistitems is None and playlistend is not None and playlistend < i: - break - entry = None - try: - entry = get_entry(i) - if entry is MissingEntry: - raise EntryNotInPlaylist() - except (IndexError, EntryNotInPlaylist): - if incomplete_entries: - raise EntryNotInPlaylist(f'Entry {i} cannot be found') - elif not playlistitems: - break - entries.append(entry) - try: - if entry is not None: - self._match_entry(entry, incomplete=True, silent=True) - except (ExistingVideoReached, RejectedVideoReached): - broken = True - break - ie_result['entries'] = entries + common_info = self._playlist_infodict(ie_result, strict=True) + title = common_info.get('playlist') or '<Untitled>' + if self._match_entry(common_info, incomplete=True) is not None: + return + self.to_screen(f'[download] Downloading {ie_result["_type"]}: {title}') - # Save playlist_index before re-ordering - entries = [ - ((playlistitems[i - 1] if playlistitems else i + playliststart - 1), entry) - for i, entry in enumerate(entries, 1) - if entry is not None] - n_entries = len(entries) + all_entries = PlaylistEntries(self, ie_result) + entries = orderedSet(all_entries.get_requested_items(), lazy=True) - if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend): - ie_result['playlist_count'] = n_entries + lazy = self.params.get('lazy_playlist') + if lazy: + resolved_entries, n_entries = [], 'N/A' + ie_result['requested_entries'], ie_result['entries'] = None, None + else: + entries = resolved_entries = list(entries) + n_entries = len(resolved_entries) + ie_result['requested_entries'], ie_result['entries'] = tuple(zip(*resolved_entries)) or ([], []) + if not ie_result.get('playlist_count'): + # Better to do this after potentially exhausting entries + ie_result['playlist_count'] = all_entries.get_full_count() - if not playlistitems and (playliststart != 1 or playlistend): - playlistitems = list(range(playliststart, playliststart + n_entries)) - ie_result['requested_entries'] = playlistitems + extra = self._playlist_infodict(ie_result, n_entries=int_or_none(n_entries)) + ie_copy = collections.ChainMap(ie_result, extra) _infojson_written = False write_playlist_files = self.params.get('allow_playlist_files', True) if write_playlist_files and self.params.get('list_thumbnails'): self.list_thumbnails(ie_result) if write_playlist_files and not self.params.get('simulate'): - ie_copy = self._playlist_infodict(ie_result, n_entries=n_entries) _infojson_written = self._write_info_json( 'playlist', ie_result, self.prepare_filename(ie_copy, 'pl_infojson')) if _infojson_written is None: @@ -1766,57 +1814,72 @@ class YoutubeDL(object): self.prepare_filename(ie_copy, 'pl_description')) is None: return # TODO: This should be passed to ThumbnailsConvertor if necessary - self._write_thumbnails('playlist', ie_copy, self.prepare_filename(ie_copy, 'pl_thumbnail')) - - if self.params.get('playlistreverse', False): - entries = entries[::-1] - if self.params.get('playlistrandom', False): + self._write_thumbnails('playlist', ie_result, self.prepare_filename(ie_copy, 'pl_thumbnail')) + + if lazy: + if self.params.get('playlistreverse') 
or self.params.get('playlistrandom'):
+ self.report_warning('playlistreverse and playlistrandom are not supported with lazy_playlist', only_once=True)
+ elif self.params.get('playlistreverse'):
+ entries.reverse()
+ elif self.params.get('playlistrandom'):
random.shuffle(entries)
- x_forwarded_for = ie_result.get('__x_forwarded_for_ip')
+ self.to_screen(f'[{ie_result["extractor"]}] Playlist {title}: Downloading {n_entries} items'
+ f'{format_field(ie_result, "playlist_count", " of %s")}')
+
+ keep_resolved_entries = self.params.get('extract_flat') != 'discard'
+ if self.params.get('extract_flat') == 'discard_in_playlist':
+ keep_resolved_entries = ie_result['_type'] != 'playlist'
+ if keep_resolved_entries:
+ self.write_debug('The information of all playlist entries will be held in memory')
- self.to_screen('[%s] playlist %s: %s' % (ie_result['extractor'], playlist, msg % n_entries))
failures = 0
max_failures = self.params.get('skip_playlist_after_errors') or float('inf')
- for i, entry_tuple in enumerate(entries, 1):
- playlist_index, entry = entry_tuple
- if 'playlist-index' in self.params.get('compat_opts', []):
- playlist_index = playlistitems[i - 1] if playlistitems else i + playliststart - 1
- self.to_screen('[download] Downloading video %s of %s' % (i, n_entries))
- # This __x_forwarded_for_ip thing is a bit ugly but requires
- # minimal changes
- if x_forwarded_for:
- entry['__x_forwarded_for_ip'] = x_forwarded_for
- extra = {
- 'n_entries': n_entries,
- '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
- 'playlist_count': ie_result.get('playlist_count'),
+ for i, (playlist_index, entry) in enumerate(entries):
+ if lazy:
+ resolved_entries.append((playlist_index, entry))
+ if not entry:
+ continue
+
+ entry['__x_forwarded_for_ip'] = ie_result.get('__x_forwarded_for_ip')
+ if not lazy and 'playlist-index' in self.params.get('compat_opts', []):
+ playlist_index = ie_result['requested_entries'][i]
+
+ entry_copy = collections.ChainMap(entry, {
+ **common_info,
+ 'n_entries': int_or_none(n_entries),
'playlist_index': playlist_index,
- 'playlist_autonumber': i,
- 'playlist': playlist,
- 'playlist_id': ie_result.get('id'),
- 'playlist_title': ie_result.get('title'),
- 'playlist_uploader': ie_result.get('uploader'),
- 'playlist_uploader_id': ie_result.get('uploader_id'),
- 'extractor': ie_result['extractor'],
- 'webpage_url': ie_result['webpage_url'],
- 'webpage_url_basename': url_basename(ie_result['webpage_url']),
- 'webpage_url_domain': get_domain(ie_result['webpage_url']),
- 'extractor_key': ie_result['extractor_key'],
- }
+ 'playlist_autonumber': i + 1,
+ })
- if self._match_entry(entry, incomplete=True) is not None:
+ if self._match_entry(entry_copy, incomplete=True) is not None:
+ # For compatibility with youtube-dl. 
See https://github.com/hypervideo/hypervideo/issues/4369 + resolved_entries[i] = (playlist_index, NO_DEFAULT) continue + self.to_screen('[download] Downloading item %s of %s' % ( + self._format_screen(i + 1, self.Styles.ID), self._format_screen(n_entries, self.Styles.EMPHASIS))) + + extra.update({ + 'playlist_index': playlist_index, + 'playlist_autonumber': i + 1, + }) entry_result = self.__process_iterable_entry(entry, download, extra) if not entry_result: failures += 1 if failures >= max_failures: self.report_error( - 'Skipping the remaining entries in playlist "%s" since %d items failed extraction' % (playlist, failures)) + f'Skipping the remaining entries in playlist "{title}" since {failures} items failed extraction') break - playlist_results.append(entry_result) - ie_result['entries'] = playlist_results + if keep_resolved_entries: + resolved_entries[i] = (playlist_index, entry_result) + + # Update with processed data + ie_result['entries'] = [e for _, e in resolved_entries if e is not NO_DEFAULT] + ie_result['requested_entries'] = [i for i, e in resolved_entries if e is not NO_DEFAULT] + if ie_result['requested_entries'] == try_call(lambda: list(range(1, ie_result['playlist_count'] + 1))): + # Do not set for full playlist + ie_result.pop('requested_entries') # Write the updated info to json if _infojson_written is True and self._write_info_json( @@ -1825,10 +1888,10 @@ class YoutubeDL(object): return ie_result = self.run_all_pps('playlist', ie_result) - self.to_screen(f'[download] Finished downloading playlist: {playlist}') + self.to_screen(f'[download] Finished downloading playlist: {title}') return ie_result - @__handle_extraction_exceptions + @_handle_extraction_exceptions def __process_iterable_entry(self, entry, download, extra_info): return self.process_ie_result( entry, download=download, extra_info=extra_info) @@ -1910,7 +1973,7 @@ class YoutubeDL(object): temp_file.close() try: success, _ = self.dl(temp_file.name, f, test=True) - except (DownloadError, IOError, OSError, ValueError) + network_exceptions: + except (DownloadError, OSError, ValueError) + network_exceptions: success = False finally: if os.path.exists(temp_file.name): @@ -1934,12 +1997,12 @@ class YoutubeDL(object): and download and ( not can_merge() - or info_dict.get('is_live', False) - or self.outtmpl_dict['default'] == '-')) + or info_dict.get('is_live') and not self.params.get('live_from_start') + or self.params['outtmpl']['default'] == '-')) compat = ( prefer_best or self.params.get('allow_multiple_audio_streams', False) - or 'format-spec' in self.params.get('compat_opts', [])) + or 'format-spec' in self.params['compat_opts']) return ( 'best/bestvideo+bestaudio' if prefer_best @@ -1950,7 +2013,7 @@ class YoutubeDL(object): def syntax_error(note, start): message = ( 'Invalid format specification: ' - '{0}\n\t{1}\n\t{2}^'.format(note, format_spec, ' ' * start[1])) + '{}\n\t{}\n\t{}^'.format(note, format_spec, ' ' * start[1])) return SyntaxError(message) PICKFIRST = 'PICKFIRST' @@ -1973,8 +2036,8 @@ class YoutubeDL(object): filter_parts.append(string) def _remove_unused_ops(tokens): - # Remove operators that we don't use and join them with the surrounding strings - # for example: 'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' + # Remove operators that we don't use and join them with the surrounding strings. + # E.g. 
'mp4' '-' 'baseline' '-' '16x9' is converted to 'mp4-baseline-16x9' ALLOWED_OPS = ('/', '+', ',', '(', ')') last_string, last_start, last_end, last_line = None, None, None, None for type, string, start, end, line in tokens: @@ -2054,7 +2117,7 @@ class YoutubeDL(object): raise syntax_error('Expected a selector', start) current_selector = FormatSelector(MERGE, (selector_1, selector_2), []) else: - raise syntax_error('Operator not recognized: "{0}"'.format(string), start) + raise syntax_error(f'Operator not recognized: "{string}"', start) elif type == tokenize.ENDMARKER: break if current_selector: @@ -2090,14 +2153,13 @@ class YoutubeDL(object): the_only_video = video_fmts[0] if len(video_fmts) == 1 else None the_only_audio = audio_fmts[0] if len(audio_fmts) == 1 else None - output_ext = self.params.get('merge_output_format') - if not output_ext: - if the_only_video: - output_ext = the_only_video['ext'] - elif the_only_audio and not video_fmts: - output_ext = the_only_audio['ext'] - else: - output_ext = 'mkv' + output_ext = get_compatible_ext( + vcodecs=[f.get('vcodec') for f in video_fmts], + acodecs=[f.get('acodec') for f in audio_fmts], + vexts=[f['ext'] for f in video_fmts], + aexts=[f['ext'] for f in audio_fmts], + preferences=(try_call(lambda: self.params['merge_output_format'].split('/')) + or self.params.get('prefer_free_formats') and ('webm', 'mkv'))) filtered = lambda *keys: filter(None, (traverse_obj(fmt, *keys) for fmt in formats_info)) @@ -2123,6 +2185,7 @@ class YoutubeDL(object): 'vcodec': the_only_video.get('vcodec'), 'vbr': the_only_video.get('vbr'), 'stretched_ratio': the_only_video.get('stretched_ratio'), + 'aspect_ratio': the_only_video.get('aspect_ratio'), }) if the_only_audio: @@ -2130,6 +2193,7 @@ class YoutubeDL(object): 'acodec': the_only_audio.get('acodec'), 'abr': the_only_audio.get('abr'), 'asr': the_only_audio.get('asr'), + 'audio_channels': the_only_audio.get('audio_channels') }) return new_dict @@ -2178,7 +2242,8 @@ class YoutubeDL(object): yield from _check_formats(ctx['formats'][::-1]) elif format_spec == 'mergeall': def selector_function(ctx): - formats = list(_check_formats(ctx['formats'])) + formats = list(_check_formats( + f for f in ctx['formats'] if f.get('vcodec') != 'none' or f.get('acodec') != 'none')) if not formats: return merged_format = formats[-1] @@ -2235,7 +2300,7 @@ class YoutubeDL(object): matches = LazyList(_check_formats(matches[::-1 if format_reverse else 1])) try: yield matches[format_idx - 1] - except IndexError: + except LazyList.IndexError: return filters = [self._build_format_filter(f) for f in selector.filters] @@ -2247,13 +2312,13 @@ class YoutubeDL(object): return selector_function(ctx_copy) return final_selector - stream = io.BytesIO(format_spec.encode('utf-8')) + stream = io.BytesIO(format_spec.encode()) try: - tokens = list(_remove_unused_ops(compat_tokenize_tokenize(stream.readline))) + tokens = list(_remove_unused_ops(tokenize.tokenize(stream.readline))) except tokenize.TokenError: raise syntax_error('Missing closing/opening brackets or parenthesis', (0, len(format_spec))) - class TokenIterator(object): + class TokenIterator: def __init__(self, tokens): self.tokens = tokens self.counter = 0 @@ -2279,7 +2344,7 @@ class YoutubeDL(object): def _calc_headers(self, info_dict): res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {}) - cookies = self._calc_cookies(info_dict) + cookies = self._calc_cookies(info_dict['url']) if cookies: res['Cookie'] = cookies @@ -2290,8 +2355,8 @@ class YoutubeDL(object): 
return res - def _calc_cookies(self, info_dict): - pr = sanitized_Request(info_dict['url']) + def _calc_cookies(self, url): + pr = sanitized_Request(url) self.cookiejar.add_cookie_header(pr) return pr.get_header('Cookie') @@ -2335,17 +2400,20 @@ class YoutubeDL(object): else: info_dict['thumbnails'] = thumbnails - def _fill_common_fields(self, info_dict, is_video=True): + def _fill_common_fields(self, info_dict, final=True): # TODO: move sanitization here - if is_video: - # playlists are allowed to lack "title" - info_dict['fulltitle'] = info_dict.get('title') - if 'title' not in info_dict: + if final: + title = info_dict.get('title', NO_DEFAULT) + if title is NO_DEFAULT: raise ExtractorError('Missing "title" field in extractor result', video_id=info_dict['id'], ie=info_dict['extractor']) - elif not info_dict.get('title'): - self.report_warning('Extractor failed to obtain "title". Creating a generic title instead') - info_dict['title'] = f'{info_dict["extractor"]} video #{info_dict["id"]}' + info_dict['fulltitle'] = title + if not title: + if title == '': + self.write_debug('Extractor gave empty title. Creating a generic title') + else: + self.report_warning('Extractor failed to obtain "title". Creating a generic title instead') + info_dict['title'] = f'{info_dict["extractor"].replace(":", "-")} video #{info_dict["id"]}' if info_dict.get('duration') is not None: info_dict['duration_string'] = formatSeconds(info_dict['duration']) @@ -2358,11 +2426,9 @@ class YoutubeDL(object): if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: # Working around out-of-range timestamp values (e.g. negative ones on Windows, # see http://bugs.python.org/issue1646728) - try: + with contextlib.suppress(ValueError, OverflowError, OSError): upload_date = datetime.datetime.utcfromtimestamp(info_dict[ts_key]) info_dict[date_key] = upload_date.strftime('%Y%m%d') - except (ValueError, OverflowError, OSError): - pass live_keys = ('is_live', 'was_live') live_status = info_dict.get('live_status') @@ -2380,13 +2446,32 @@ class YoutubeDL(object): for key in live_keys: if info_dict.get(key) is None: info_dict[key] = (live_status == key) + if live_status == 'post_live': + info_dict['was_live'] = True # Auto generate title fields corresponding to the *_number fields when missing # in order to always have clean titles. This is very common for TV series. 
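# (Illustrative only, with made-up values: an entry carrying {'episode_number': 3}
# but no 'episode' field would leave the loop below with episode == 'Episode 3',
# per the '%s %d' pattern.)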
for field in ('chapter', 'season', 'episode'): - if info_dict.get('%s_number' % field) is not None and not info_dict.get(field): + if final and info_dict.get('%s_number' % field) is not None and not info_dict.get(field): info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field]) + def _raise_pending_errors(self, info): + err = info.pop('__pending_error', None) + if err: + self.report_error(err, tb=False) + + def sort_formats(self, info_dict): + formats = self._get_formats(info_dict) + if not formats: + return + # Backward compatibility with InfoExtractor._sort_formats + field_preference = formats[0].pop('__sort_fields', None) + if field_preference: + info_dict['_format_sort_fields'] = field_preference + + formats.sort(key=FormatSorter( + self, info_dict.get('_format_sort_fields', [])).calculate_preference) + def process_video_result(self, info_dict, download=True): assert info_dict.get('_type', 'video') == 'video' self._num_videos += 1 @@ -2403,24 +2488,40 @@ class YoutubeDL(object): def sanitize_string_field(info, string_field): field = info.get(string_field) - if field is None or isinstance(field, compat_str): + if field is None or isinstance(field, str): return report_force_conversion(string_field, 'a string', 'string') - info[string_field] = compat_str(field) + info[string_field] = str(field) def sanitize_numeric_fields(info): for numeric_field in self._NUMERIC_FIELDS: field = info.get(numeric_field) - if field is None or isinstance(field, compat_numeric_types): + if field is None or isinstance(field, (int, float)): continue report_force_conversion(numeric_field, 'numeric', 'int') info[numeric_field] = int_or_none(field) sanitize_string_field(info_dict, 'id') sanitize_numeric_fields(info_dict) + if info_dict.get('section_end') and info_dict.get('section_start') is not None: + info_dict['duration'] = round(info_dict['section_end'] - info_dict['section_start'], 3) if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None): self.report_warning('"duration" field is negative, there is an error in extractor') + chapters = info_dict.get('chapters') or [] + if chapters and chapters[0].get('start_time'): + chapters.insert(0, {'start_time': 0}) + + dummy_chapter = {'end_time': 0, 'start_time': info_dict.get('duration')} + for idx, (prev, current, next_) in enumerate(zip( + (dummy_chapter, *chapters), chapters, (*chapters[1:], dummy_chapter)), 1): + if current.get('start_time') is None: + current['start_time'] = prev.get('end_time') + if not current.get('end_time'): + current['end_time'] = next_.get('start_time') + if not current.get('title'): + current['title'] = f'<Untitled Chapter {idx}>' + if 'playlist' not in info_dict: # It isn't part of a playlist info_dict['playlist'] = None @@ -2456,20 +2557,18 @@ class YoutubeDL(object): info_dict['requested_subtitles'] = self.process_subtitles( info_dict['id'], subtitles, automatic_captions) - if info_dict.get('formats') is None: - # There's only one format available - formats = [info_dict] - else: - formats = info_dict['formats'] + self.sort_formats(info_dict) + formats = self._get_formats(info_dict) - info_dict['__has_drm'] = any(f.get('has_drm') for f in formats) + # or None ensures --clean-infojson removes it + info_dict['_has_drm'] = any(f.get('has_drm') for f in formats) or None if not self.params.get('allow_unplayable_formats'): formats = [f for f in formats if not f.get('has_drm')] - if info_dict['__has_drm'] and all( - f.get('acodec') == f.get('vcodec') == 'none' for f in formats): - 
self.report_warning( - 'This video is DRM protected and only images are available for download. ' - 'Use --list-formats to see them') + + if formats and all(f.get('acodec') == f.get('vcodec') == 'none' for f in formats): + self.report_warning( + f'{"This video is DRM protected and " if info_dict["_has_drm"] else ""}' + 'only images are available for download. Use --list-formats to see them'.capitalize()) get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start')) if not get_from_start: @@ -2481,9 +2580,6 @@ class YoutubeDL(object): '--live-from-start is passed, but there are no formats that can be downloaded from the start. ' 'If you want to download from the current time, use --no-live-from-start')) - if not formats: - self.raise_no_formats(info_dict) - def is_wellformed(f): url = f.get('url') if not url: @@ -2496,7 +2592,10 @@ class YoutubeDL(object): return True # Filter out malformed formats for better extraction robustness - formats = list(filter(is_wellformed, formats)) + formats = list(filter(is_wellformed, formats or [])) + + if not formats: + self.raise_no_formats(info_dict) formats_dict = {} @@ -2506,7 +2605,7 @@ class YoutubeDL(object): sanitize_numeric_fields(format) format['url'] = sanitize_url(format['url']) if not format.get('format_id'): - format['format_id'] = compat_str(i) + format['format_id'] = str(i) else: # Sanitize format_id from characters used in format selector expression format['format_id'] = re.sub(r'[\s,/+\[\]()]', '_', format['format_id']) @@ -2542,9 +2641,11 @@ class YoutubeDL(object): format['resolution'] = self.format_resolution(format, default=None) if format.get('dynamic_range') is None and format.get('vcodec') != 'none': format['dynamic_range'] = 'SDR' + if format.get('aspect_ratio') is None: + format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2)) if (info_dict.get('duration') and format.get('tbr') and not format.get('filesize') and not format.get('filesize_approx')): - format['filesize_approx'] = info_dict['duration'] * format['tbr'] * (1024 / 8) + format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8)) # Add HTTP headers, so that external programs can use them from the # json output @@ -2574,10 +2675,9 @@ class YoutubeDL(object): info_dict, _ = self.pre_process(info_dict, 'after_filter') # The pre-processors may have modified the formats - formats = info_dict.get('formats', [info_dict]) + formats = self._get_formats(info_dict) - list_only = self.params.get('simulate') is None and ( - self.params.get('list_thumbnails') or self.params.get('listformats') or self.params.get('listsubtitles')) + list_only = self.params.get('simulate') == 'list_only' interactive_format_selection = not list_only and self.format_selector == '-' if self.params.get('list_thumbnails'): self.list_thumbnails(info_dict) @@ -2591,7 +2691,7 @@ class YoutubeDL(object): if list_only: # Without this printing, -F --print-json will not work self.__forced_printings(info_dict, self.prepare_filename(info_dict), incomplete=True) - return + return info_dict format_selector = self.format_selector if format_selector is None: @@ -2632,20 +2732,39 @@ class YoutubeDL(object): # Process what we can, even without any available formats. 
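# (A hedged reading of the line below: the single empty dict appears to act as a
# sentinel "format", so the per-format loop still runs once and forced printings
# and archive recording keep working even when no real format was selected.)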
formats_to_download = [{}] - best_format = formats_to_download[-1] + requested_ranges = tuple(self.params.get('download_ranges', lambda *_: [{}])(info_dict, self)) + best_format, downloaded_formats = formats_to_download[-1], [] if download: - if best_format: - self.to_screen( - f'[info] {info_dict["id"]}: Downloading {len(formats_to_download)} format(s): ' - + ', '.join([f['format_id'] for f in formats_to_download])) + if best_format and requested_ranges: + def to_screen(*msg): + self.to_screen(f'[info] {info_dict["id"]}: {" ".join(", ".join(variadic(m)) for m in msg)}') + + to_screen(f'Downloading {len(formats_to_download)} format(s):', + (f['format_id'] for f in formats_to_download)) + if requested_ranges != ({}, ): + to_screen(f'Downloading {len(requested_ranges)} time ranges:', + (f'{c["start_time"]:.1f}-{c["end_time"]:.1f}' for c in requested_ranges)) max_downloads_reached = False - for i, fmt in enumerate(formats_to_download): - formats_to_download[i] = new_info = self._copy_infodict(info_dict) + + for fmt, chapter in itertools.product(formats_to_download, requested_ranges): + new_info = self._copy_infodict(info_dict) new_info.update(fmt) + offset, duration = info_dict.get('section_start') or 0, info_dict.get('duration') or float('inf') + end_time = offset + min(chapter.get('end_time', duration), duration) + if chapter or offset: + new_info.update({ + 'section_start': offset + chapter.get('start_time', 0), + # duration may not be accurate. So allow deviations <1sec + 'section_end': end_time if end_time <= offset + duration + 1 else None, + 'section_title': chapter.get('title'), + 'section_number': chapter.get('index'), + }) + downloaded_formats.append(new_info) try: self.process_info(new_info) except MaxDownloadsReached: max_downloads_reached = True + self._raise_pending_errors(new_info) # Remove copied info for key, val in tuple(new_info.items()): if info_dict.get(key) == val: @@ -2653,12 +2772,12 @@ class YoutubeDL(object): if max_downloads_reached: break - write_archive = set(f.get('__write_download_archive', False) for f in formats_to_download) + write_archive = {f.get('__write_download_archive', False) for f in downloaded_formats} assert write_archive.issubset({True, False, 'ignore'}) if True in write_archive and False not in write_archive: self.record_download_archive(info_dict) - info_dict['requested_downloads'] = formats_to_download + info_dict['requested_downloads'] = downloaded_formats info_dict = self.run_all_pps('after_video', info_dict) if max_downloads_reached: raise MaxDownloadsReached() @@ -2669,50 +2788,35 @@ class YoutubeDL(object): def process_subtitles(self, video_id, normal_subtitles, automatic_captions): """Select the requested subtitles and their format""" - available_subs = {} + available_subs, normal_sub_langs = {}, [] if normal_subtitles and self.params.get('writesubtitles'): available_subs.update(normal_subtitles) + normal_sub_langs = tuple(normal_subtitles.keys()) if automatic_captions and self.params.get('writeautomaticsub'): for lang, cap_info in automatic_captions.items(): if lang not in available_subs: available_subs[lang] = cap_info - if (not self.params.get('writesubtitles') and not - self.params.get('writeautomaticsub') or not - available_subs): + if not available_subs or ( + not self.params.get('writesubtitles') + and not self.params.get('writeautomaticsub')): return None - all_sub_langs = available_subs.keys() + all_sub_langs = tuple(available_subs.keys()) if self.params.get('allsubtitles', False): requested_langs = all_sub_langs elif 
self.params.get('subtitleslangs', False):
- # A list is used so that the order of languages will be the same as
- # given in subtitleslangs. See https://github.com/hypervideo/hypervideo/issues/1041
- requested_langs = []
- for lang_re in self.params.get('subtitleslangs'):
- discard = lang_re[0] == '-'
- if discard:
- lang_re = lang_re[1:]
- if lang_re == 'all':
- if discard:
- requested_langs = []
- else:
- requested_langs.extend(all_sub_langs)
- continue
- current_langs = filter(re.compile(lang_re + '$').match, all_sub_langs)
- if discard:
- for lang in current_langs:
- while lang in requested_langs:
- requested_langs.remove(lang)
- else:
- requested_langs.extend(current_langs)
- requested_langs = orderedSet(requested_langs)
- elif 'en' in available_subs:
- requested_langs = ['en']
+ try:
+ requested_langs = orderedSet_from_options(
+ self.params.get('subtitleslangs'), {'all': all_sub_langs}, use_regex=True)
+ except re.error as e:
+ raise ValueError(f'Wrong regex for subtitlelangs: {e.pattern}')
+ elif normal_sub_langs:
+ requested_langs = ['en'] if 'en' in normal_sub_langs else normal_sub_langs[:1]
+ else:
+ requested_langs = ['en'] if 'en' in all_sub_langs else all_sub_langs[:1]
if requested_langs:
- self.write_debug('Downloading subtitles: %s' % ', '.join(requested_langs))
+ self.to_screen(f'[info] {video_id}: Downloading subtitles: {", ".join(requested_langs)}')
formats_query = self.params.get('subtitlesformat', 'best')
formats_preference = formats_query.split('/') if formats_query else []
@@ -2720,7 +2824,7 @@ class YoutubeDL(object):
for lang in requested_langs:
formats = available_subs.get(lang)
if formats is None:
- self.report_warning('%s subtitles not available for %s' % (lang, video_id))
+ self.report_warning(f'{lang} subtitles not available for {video_id}')
continue
for ext in formats_preference:
if ext == 'best':
@@ -2748,12 +2852,16 @@ class YoutubeDL(object):
info_copy['automatic_captions_table'] = self.render_subtitles_table(info_dict.get('id'), info_dict.get('automatic_captions'))
def format_tmpl(tmpl):
- mobj = re.match(r'\w+(=?)$', tmpl)
- if mobj and mobj.group(1):
- return f'{tmpl[:-1]} = %({tmpl[:-1]})r'
- elif mobj:
- return f'%({tmpl})s'
- return tmpl
+ mobj = re.fullmatch(r'([\w.:,]|-\d|(?P<dict>{([\w.:,]|-\d)+}))+=?', tmpl)
+ if not mobj:
+ return tmpl
+
+ fmt = '%({})s'
+ if tmpl.startswith('{'):
+ tmpl = f'.{tmpl}'
+ if tmpl.endswith('='):
+ tmpl, fmt = tmpl[:-1], '{0} = %({0})#j'
+ return '\n'.join(map(fmt.format, [tmpl] if mobj.group('dict') else tmpl.split(',')))
for tmpl in self.params['forceprint'].get(key, []):
self.to_stdout(self.evaluate_outtmpl(format_tmpl(tmpl), info_copy))
@@ -2763,7 +2871,7 @@ class YoutubeDL(object):
tmpl = format_tmpl(tmpl)
self.to_screen(f'[info] Writing {tmpl!r} to: {filename}')
if self._ensure_dir_exists(filename):
- with io.open(filename, 'a', encoding='utf-8') as f:
+ with open(filename, 'a', encoding='utf-8') as f:
f.write(self.evaluate_outtmpl(tmpl, info_copy) + '\n')
def __forced_printings(self, info_dict, filename, incomplete):
@@ -2833,7 +2941,7 @@ class YoutubeDL(object):
urls = '", "'.join(
(f['url'].split(',')[0] + ',<data>' if f['url'].startswith('data:') else f['url'])
for f in info.get('requested_formats', []) or [info])
- self.write_debug('Invoking downloader on "%s"' % urls)
+ self.write_debug(f'Invoking {fd.FD_NAME} downloader on "{urls}"')
# Note: Ideally info should be deep-copied so that hooks cannot modify it. 
# But it may contain objects that are not deep-copyable @@ -2861,8 +2969,6 @@ class YoutubeDL(object): if 'format' not in info_dict and 'ext' in info_dict: info_dict['format'] = info_dict['ext'] - # This is mostly just for backward compatibility of process_info - # As a side-effect, this allows for format-specific filters if self._match_entry(info_dict) is not None: info_dict['__write_download_archive'] = 'ignore' return @@ -2879,8 +2985,13 @@ class YoutubeDL(object): # Forced printings self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict)) + def check_max_downloads(): + if self._num_downloads >= float(self.params.get('max_downloads') or 'inf'): + raise MaxDownloadsReached() + if self.params.get('simulate'): info_dict['__write_download_archive'] = self.params.get('force_write_download_archive') + check_max_downloads() return if full_filename is None: @@ -2928,11 +3039,11 @@ class YoutubeDL(object): else: try: self.to_screen('[info] Writing video annotations to: ' + annofn) - with io.open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: + with open(encodeFilename(annofn), 'w', encoding='utf-8') as annofile: annofile.write(info_dict['annotations']) except (KeyError, TypeError): self.report_warning('There are no annotations to write.') - except (OSError, IOError): + except OSError: self.report_error('Cannot write annotations file: ' + annofn) return @@ -2951,13 +3062,13 @@ class YoutubeDL(object): return True try: self.to_screen(f'[info] Writing internet shortcut (.{link_type}) to: {linkfn}') - with io.open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', - newline='\r\n' if link_type == 'url' else '\n') as linkfile: + with open(encodeFilename(to_high_limit_path(linkfn)), 'w', encoding='utf-8', + newline='\r\n' if link_type == 'url' else '\n') as linkfile: template_vars = {'url': url} if link_type == 'desktop': template_vars['filename'] = linkfn[:-(len(link_type) + 1)] linkfile.write(LINK_TEMPLATES[link_type] % template_vars) - except (OSError, IOError): + except OSError: self.report_error(f'Cannot write internet shortcut {linkfn}') return False return True @@ -2984,12 +3095,8 @@ class YoutubeDL(object): info_dict.clear() info_dict.update(new_info) - try: - new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) - replace_info_dict(new_info) - except PostProcessingError as err: - self.report_error('Preprocessing: %s' % str(err)) - return + new_info, files_to_move = self.pre_process(info_dict, 'before_dl', files_to_move) + replace_info_dict(new_info) if self.params.get('skip_download'): info_dict['filepath'] = temp_filename @@ -3011,40 +3118,25 @@ class YoutubeDL(object): info_dict['ext'] = os.path.splitext(file)[1][1:] return file - success = True - if info_dict.get('requested_formats') is not None: - - def compatible_formats(formats): - # TODO: some formats actually allow this (mkv, webm, ogg, mp4), but not all of them. 
- video_formats = [format for format in formats if format.get('vcodec') != 'none']
- audio_formats = [format for format in formats if format.get('acodec') != 'none']
- if len(video_formats) > 2 or len(audio_formats) > 2:
- return False
-
- # Check extension
- exts = set(format.get('ext') for format in formats)
- COMPATIBLE_EXTS = (
- set(('mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma')),
- set(('webm',)),
- )
- for ext_sets in COMPATIBLE_EXTS:
- if ext_sets.issuperset(exts):
- return True
- # TODO: Check acodec/vcodec
- return False
+ fd, success = None, True
+ if info_dict.get('protocol') or info_dict.get('url'):
+ fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
+ if fd is not FFmpegFD and (
+ info_dict.get('section_start') or info_dict.get('section_end')):
+ msg = ('This format cannot be partially downloaded' if FFmpegFD.available()
+ else 'You have requested downloading the video partially, but ffmpeg is not installed')
+ self.report_error(f'{msg}. Aborting')
+ return
+
if info_dict.get('requested_formats') is not None:
requested_formats = info_dict['requested_formats']
old_ext = info_dict['ext']
if self.params.get('merge_output_format') is None:
- if not compatible_formats(requested_formats):
- info_dict['ext'] = 'mkv'
- self.report_warning(
- 'Requested formats are incompatible for merge and will be merged into mkv')
if (info_dict['ext'] == 'webm' and info_dict.get('thumbnails')
# check with type instead of pp_key, __name__, or isinstance
# since we don't want any custom PPs to trigger this
- and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])):
+ and any(type(pp) == EmbedThumbnailPP for pp in self._pps['post_process'])): # noqa: E721
info_dict['ext'] = 'mkv'
self.report_warning(
'webm doesn\'t support embedding a thumbnail, mkv will be used')
@@ -3058,7 +3150,7 @@ class YoutubeDL(object):
os.path.splitext(filename)[0]
if filename_real_ext in (old_ext, new_ext)
else filename)
- return '%s.%s' % (filename_wo_ext, ext)
+ return f'{filename_wo_ext}.{ext}'
# Ensure filename always has a correct extension for successful merge
full_filename = correct_ext(full_filename)
@@ -3066,10 +3158,8 @@ class YoutubeDL(object):
dl_filename = existing_video_file(full_filename, temp_filename)
info_dict['__real_download'] = False
- downloaded = []
merger = FFmpegMergerPP(self)
-
- fd = get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-')
+ downloaded = []
if dl_filename is not None:
self.report_file_already_downloaded(dl_filename)
elif fd:
@@ -3143,12 +3233,13 @@ class YoutubeDL(object):
except network_exceptions as err:
self.report_error('unable to download video data: %s' % error_to_compat_str(err))
return
- except (OSError, IOError) as err:
+ except OSError as err:
raise UnavailableVideoError(err)
except (ContentTooShortError, ) as err:
- self.report_error('content too short (expected %s bytes and served %s)' % (err.expected, err.downloaded))
+ self.report_error(f'content too short (expected {err.expected} bytes and served {err.downloaded})')
return
+ self._raise_pending_errors(info_dict)
if success and full_filename != '-':
def fixup():
@@ -3159,16 +3250,16 @@ class YoutubeDL(object):
if fixup_policy in ('ignore', 'never'):
return
elif fixup_policy == 'warn':
- do_fixup = False
+ do_fixup = 'warn'
elif fixup_policy != 'force':
assert fixup_policy in ('detect_or_warn', None)
if not info_dict.get('__real_download'):
do_fixup = False
def ffmpeg_fixup(cndn, msg, cls):
- if not cndn:
+ if not (do_fixup and 
cndn): return - if not do_fixup: + elif do_fixup == 'warn': self.report_warning(f'{vid}: {msg}') return pp = cls(self) @@ -3178,30 +3269,32 @@ class YoutubeDL(object): self.report_warning(f'{vid}: {msg}. Install ffmpeg to fix this automatically') stretched_ratio = info_dict.get('stretched_ratio') - ffmpeg_fixup( - stretched_ratio not in (1, None), - f'Non-uniform pixel ratio {stretched_ratio}', - FFmpegFixupStretchedPP) - - ffmpeg_fixup( - (info_dict.get('requested_formats') is None - and info_dict.get('container') == 'm4a_dash' - and info_dict.get('ext') == 'm4a'), - 'writing DASH m4a. Only some players support this container', - FFmpegFixupM4aPP) + ffmpeg_fixup(stretched_ratio not in (1, None), + f'Non-uniform pixel ratio {stretched_ratio}', + FFmpegFixupStretchedPP) downloader = get_suitable_downloader(info_dict, self.params) if 'protocol' in info_dict else None - downloader = downloader.__name__ if downloader else None + downloader = downloader.FD_NAME if downloader else None - if info_dict.get('requested_formats') is None: # Not necessary if doing merger - ffmpeg_fixup(downloader == 'HlsFD', + ext = info_dict.get('ext') + postprocessed_by_ffmpeg = info_dict.get('requested_formats') or any(( + isinstance(pp, FFmpegVideoConvertorPP) + and resolve_recode_mapping(ext, pp.mapping)[0] not in (ext, None) + ) for pp in self._pps['post_process']) + + if not postprocessed_by_ffmpeg: + ffmpeg_fixup(ext == 'm4a' and info_dict.get('container') == 'm4a_dash', + 'writing DASH m4a. Only some players support this container', + FFmpegFixupM4aPP) + ffmpeg_fixup(downloader == 'hlsnative' and not self.params.get('hls_use_mpegts') + or info_dict.get('is_live') and self.params.get('hls_use_mpegts') is None, 'Possible MPEG-TS in MP4 container or malformed AAC timestamps', FFmpegFixupM3u8PP) ffmpeg_fixup(info_dict.get('is_live') and downloader == 'DashSegmentsFD', 'Possible duplicate MOOV atoms', FFmpegFixupDuplicateMoovPP) - ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed timestamps detected', FFmpegFixupTimestampPP) - ffmpeg_fixup(downloader == 'WebSocketFragmentFD', 'Malformed duration detected', FFmpegFixupDurationPP) + ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed timestamps detected', FFmpegFixupTimestampPP) + ffmpeg_fixup(downloader == 'web_socket_fragment', 'Malformed duration detected', FFmpegFixupDurationPP) fixup() try: @@ -3217,15 +3310,10 @@ class YoutubeDL(object): return info_dict['__write_download_archive'] = True + assert info_dict is original_infodict # Make sure the info_dict was modified in-place if self.params.get('force_write_download_archive'): info_dict['__write_download_archive'] = True - - # Make sure the info_dict was modified in-place - assert info_dict is original_infodict - - max_downloads = self.params.get('max_downloads') - if max_downloads is not None and self._num_downloads >= int(max_downloads): - raise MaxDownloadsReached() + check_max_downloads() def __download_wrapper(self, func): @functools.wraps(func) @@ -3234,13 +3322,11 @@ class YoutubeDL(object): res = func(*args, **kwargs) except UnavailableVideoError as e: self.report_error(e) - except MaxDownloadsReached as e: - self.to_screen(f'[info] {e}') - raise except DownloadCancelled as e: self.to_screen(f'[info] {e}') if not self.params.get('break_per_url'): raise + self._num_downloads = 0 else: if self.params.get('dump_single_json', False): self.post_extract(res) @@ -3250,7 +3336,7 @@ class YoutubeDL(object): def download(self, url_list): """Download a given list of URLs.""" url_list = 
variadic(url_list) # Passing a single URL is a common mistake - outtmpl = self.outtmpl_dict['default'] + outtmpl = self.params['outtmpl']['default'] if (len(url_list) > 1 and outtmpl != '-' and '%' not in outtmpl @@ -3289,11 +3375,17 @@ class YoutubeDL(object): return info_dict info_dict.setdefault('epoch', int(time.time())) info_dict.setdefault('_type', 'video') + info_dict.setdefault('_version', { + 'version': __version__, + 'current_git_head': current_git_head(), + 'release_git_head': RELEASE_GIT_HEAD, + 'repository': REPOSITORY, + }) if remove_private_keys: - reject = lambda k, v: v is None or (k.startswith('_') and k != '_type') or k in { + reject = lambda k, v: v is None or k.startswith('__') or k in { 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries', - 'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber', + 'entries', 'filepath', '_filename', 'infojson_filename', 'original_url', 'playlist_autonumber', } else: reject = lambda k, v: False @@ -3315,6 +3407,17 @@ class YoutubeDL(object): ''' Alias of sanitize_info for backward compatibility ''' return YoutubeDL.sanitize_info(info_dict, actually_filter) + def _delete_downloaded_files(self, *files_to_delete, info={}, msg=None): + for filename in set(filter(None, files_to_delete)): + if msg: + self.to_screen(msg % filename) + try: + os.remove(filename) + except OSError: + self.report_warning(f'Unable to delete file {filename}') + if filename in info.get('__files_to_move', []): # NB: Delete even if None + del info['__files_to_move'][filename] + @staticmethod def post_extract(info_dict): def actual_post_extract(info_dict): @@ -3347,14 +3450,8 @@ class YoutubeDL(object): for f in files_to_delete: infodict['__files_to_move'].setdefault(f, '') else: - for old_filename in set(files_to_delete): - self.to_screen('Deleting original file %s (pass -k to keep)' % old_filename) - try: - os.remove(encodeFilename(old_filename)) - except (IOError, OSError): - self.report_warning('Unable to remove downloaded original file') - if old_filename in infodict['__files_to_move']: - del infodict['__files_to_move'][old_filename] + self._delete_downloaded_files( + *files_to_delete, info=infodict, msg='Deleting original file %s (pass -k to keep)') return infodict def run_all_pps(self, key, info, *, additional_pps=None): @@ -3366,7 +3463,12 @@ class YoutubeDL(object): def pre_process(self, ie_info, key='pre_process', files_to_move=None): info = dict(ie_info) info['__files_to_move'] = files_to_move or {} - info = self.run_all_pps(key, info) + try: + info = self.run_all_pps(key, info) + except PostProcessingError as err: + msg = f'Preprocessing: {err}' + info.setdefault('__pending_error', msg) + self.report_error(msg, is_error=False) return info, info.pop('__files_to_move', None) def post_process(self, filename, info, files_to_move=None): @@ -3396,18 +3498,15 @@ class YoutubeDL(object): break else: return - return '%s %s' % (extractor.lower(), video_id) + return make_archive_id(extractor, video_id) def in_download_archive(self, info_dict): - fn = self.params.get('download_archive') - if fn is None: + if not self.archive: return False - vid_id = self._make_archive_id(info_dict) - if not vid_id: - return False # Incomplete video information - - return vid_id in self.archive + vid_ids = [self._make_archive_id(info_dict)] + vid_ids.extend(info_dict.get('_old_archive_ids') or []) + return any(id_ in self.archive for id_ in vid_ids) def record_download_archive(self, info_dict): fn = 
self.params.get('download_archive') @@ -3415,9 +3514,11 @@ class YoutubeDL(object): return vid_id = self._make_archive_id(info_dict) assert vid_id + self.write_debug(f'Adding to archive: {vid_id}') - with locked_file(fn, 'a', encoding='utf-8') as archive_file: - archive_file.write(vid_id + '\n') + if is_path_like(fn): + with locked_file(fn, 'a', encoding='utf-8') as archive_file: + archive_file.write(vid_id + '\n') self.archive.add(vid_id) @staticmethod @@ -3436,7 +3537,7 @@ class YoutubeDL(object): def _list_format_headers(self, *headers): if self.params.get('listformats_table', True) is not False: - return [self._format_screen(header, self.Styles.HEADERS) for header in headers] + return [self._format_out(header, self.Styles.HEADERS) for header in headers] return headers def _format_note(self, fdict): @@ -3499,11 +3600,17 @@ class YoutubeDL(object): res += '~' + format_bytes(fdict['filesize_approx']) return res - def render_formats_table(self, info_dict): - if not info_dict.get('formats') and not info_dict.get('url'): - return None + def _get_formats(self, info_dict): + if info_dict.get('formats') is None: + if info_dict.get('url') and info_dict.get('_type', 'video') == 'video': + return [info_dict] + return [] + return info_dict['formats'] - formats = info_dict.get('formats', [info_dict]) + def render_formats_table(self, info_dict): + formats = self._get_formats(info_dict) + if not formats: + return if not self.params.get('listformats_table', True) is not False: table = [ [ @@ -3511,33 +3618,45 @@ class YoutubeDL(object): format_field(f, 'ext'), self.format_resolution(f), self._format_note(f) - ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] + ] for f in formats if (f.get('preference') or 0) >= -1000] return render_table(['format code', 'extension', 'resolution', 'note'], table, extra_gap=1) - delim = self._format_screen('\u2502', self.Styles.DELIM, '|', test_encoding=True) + def simplified_codec(f, field): + assert field in ('acodec', 'vcodec') + codec = f.get(field, 'unknown') + if not codec: + return 'unknown' + elif codec != 'none': + return '.'.join(codec.split('.')[:4]) + + if field == 'vcodec' and f.get('acodec') == 'none': + return 'images' + elif field == 'acodec' and f.get('vcodec') == 'none': + return '' + return self._format_out('audio only' if field == 'vcodec' else 'video only', + self.Styles.SUPPRESS) + + delim = self._format_out('\u2502', self.Styles.DELIM, '|', test_encoding=True) table = [ [ - self._format_screen(format_field(f, 'format_id'), self.Styles.ID), + self._format_out(format_field(f, 'format_id'), self.Styles.ID), format_field(f, 'ext'), format_field(f, func=self.format_resolution, ignore=('audio only', 'images')), - format_field(f, 'fps', '\t%d'), + format_field(f, 'fps', '\t%d', func=round), format_field(f, 'dynamic_range', '%s', ignore=(None, 'SDR')).replace('HDR', ''), + format_field(f, 'audio_channels', '\t%s'), delim, format_field(f, 'filesize', ' \t%s', func=format_bytes) + format_field(f, 'filesize_approx', '~\t%s', func=format_bytes), - format_field(f, 'tbr', '\t%dk'), + format_field(f, 'tbr', '\t%dk', func=round), shorten_protocol_name(f.get('protocol', '')), delim, - format_field(f, 'vcodec', default='unknown').replace( - 'none', 'images' if f.get('acodec') == 'none' - else self._format_screen('audio only', self.Styles.SUPPRESS)), - format_field(f, 'vbr', '\t%dk'), - format_field(f, 'acodec', default='unknown').replace( - 'none', '' if f.get('vcodec') == 'none' - else self._format_screen('video only', 
self.Styles.SUPPRESS)), - format_field(f, 'abr', '\t%dk'), - format_field(f, 'asr', '\t%dHz'), + simplified_codec(f, 'vcodec'), + format_field(f, 'vbr', '\t%dk', func=round), + simplified_codec(f, 'acodec'), + format_field(f, 'abr', '\t%dk', func=round), + format_field(f, 'asr', '\t%s', func=format_decimal_suffix), join_nonempty( - self._format_screen('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, + self._format_out('UNSUPPORTED', 'light red') if f.get('ext') in ('f4f', 'f4m') else None, format_field(f, 'language', '[%s]'), join_nonempty(format_field(f, 'format_note'), format_field(f, 'container', ignore=(None, f.get('ext'))), @@ -3545,12 +3664,12 @@ class YoutubeDL(object): delim=' '), ] for f in formats if f.get('preference') is None or f['preference'] >= -1000] header_line = self._list_format_headers( - 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', delim, '\tFILESIZE', '\tTBR', 'PROTO', + 'ID', 'EXT', 'RESOLUTION', '\tFPS', 'HDR', 'CH', delim, '\tFILESIZE', '\tTBR', 'PROTO', delim, 'VCODEC', '\tVBR', 'ACODEC', '\tABR', '\tASR', 'MORE INFO') return render_table( header_line, table, hide_empty=True, - delim=self._format_screen('\u2500', self.Styles.DELIM, '-', test_encoding=True)) + delim=self._format_out('\u2500', self.Styles.DELIM, '-', test_encoding=True)) def render_thumbnails_table(self, info_dict): thumbnails = list(info_dict.get('thumbnails') or []) @@ -3558,7 +3677,7 @@ class YoutubeDL(object): return None return render_table( self._list_format_headers('ID', 'Width', 'Height', 'URL'), - [[t.get('id'), t.get('width', 'unknown'), t.get('height', 'unknown'), t['url']] for t in thumbnails]) + [[t.get('id'), t.get('width') or 'unknown', t.get('height') or 'unknown', t['url']] for t in thumbnails]) def render_subtitles_table(self, video_id, subtitles): def _row(lang, formats): @@ -3593,7 +3712,7 @@ class YoutubeDL(object): def urlopen(self, req): """ Start an HTTP download """ - if isinstance(req, compat_basestring): + if isinstance(req, str): req = sanitized_Request(req) return self._opener.open(req, timeout=self._socket_timeout) @@ -3601,18 +3720,27 @@ class YoutubeDL(object): if not self.params.get('verbose'): return + from . import _IN_CLI # Must be delayed import + + # These imports can be slow. 
So import them only as needed + from .extractor.extractors import _LAZY_LOADER + from .extractor.extractors import _PLUGIN_CLASSES as plugin_extractors + def get_encoding(stream): ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)) if not supports_terminal_sequences(stream): - from .compat import WINDOWS_VT_MODE + from .utils import WINDOWS_VT_MODE # Must be imported locally ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)' return ret - encoding_str = 'Encodings: locale %s, fs %s, out %s, err %s, pref %s' % ( + encoding_str = 'Encodings: locale %s, fs %s, pref %s, %s' % ( locale.getpreferredencoding(), sys.getfilesystemencoding(), - get_encoding(self._out_files['screen']), get_encoding(self._out_files['error']), - self.get_encoding()) + self.get_encoding(), + ', '.join( + f'{key} {get_encoding(stream)}' for key, stream in self._out_files.items_ + if stream is not None and key != 'console') + ) logger = self.params.get('logger') if logger: @@ -3623,11 +3751,19 @@ class YoutubeDL(object): write_debug = lambda msg: self._write_string(f'[debug] {msg}\n') source = detect_variant() + if VARIANT not in (None, 'pip'): + source += '*' write_debug(join_nonempty( - 'hypervideo version', __version__, + f'{"hypervideo" if REPOSITORY == "hypervideo/hypervideo" else REPOSITORY} version', + __version__, f'[{RELEASE_GIT_HEAD}]' if RELEASE_GIT_HEAD else '', '' if source == 'unknown' else f'({source})', + '' if _IN_CLI else 'API', delim=' ')) + + if not _IN_CLI: + write_debug(f'params: {self.params}') + if not _LAZY_LOADER: if os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): write_debug('Lazy loading extractors is forcibly disabled') @@ -3637,41 +3773,17 @@ class YoutubeDL(object): write_debug('Plugins: %s' % [ '%s%s' % (klass.__name__, '' if klass.__name__ == name else f' as {name}') for name, klass in itertools.chain(plugin_extractors.items(), plugin_postprocessors.items())]) - if self.params.get('compat_opts'): - write_debug('Compatibility options: %s' % ', '.join(self.params.get('compat_opts'))) + if self.params['compat_opts']: + write_debug('Compatibility options: %s' % ', '.join(self.params['compat_opts'])) - if source == 'source': - try: - sp = Popen( - ['git', 'rev-parse', '--short', 'HEAD'], - stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd=os.path.dirname(os.path.abspath(__file__))) - out, err = sp.communicate_or_kill() - out = out.decode().strip() - if re.match('[0-9a-f]+', out): - write_debug('Git HEAD: %s' % out) - except Exception: - try: - sys.exc_clear() - except Exception: - pass - - def python_implementation(): - impl_name = platform.python_implementation() - if impl_name == 'PyPy' and hasattr(sys, 'pypy_version_info'): - return impl_name + ' version %d.%d.%d' % sys.pypy_version_info[:3] - return impl_name - - write_debug('Python version %s (%s %s) - %s' % ( - platform.python_version(), - python_implementation(), - platform.architecture()[0], - platform_name())) + if current_git_head(): + write_debug(f'Git HEAD: {current_git_head()}') + write_debug(system_identifier()) exe_versions, ffmpeg_features = FFmpegPostProcessor.get_versions_and_features(self) ffmpeg_features = {key for key, val in ffmpeg_features.items() if val} if ffmpeg_features: - exe_versions['ffmpeg'] += ' (%s)' % ','.join(ffmpeg_features) + exe_versions['ffmpeg'] += ' (%s)' % ','.join(sorted(ffmpeg_features)) exe_versions['rtmpdump'] = rtmpdump_version() exe_versions['phantomjs'] = PhantomJSwrapper._version() @@ -3680,21 +3792,14 @@ class YoutubeDL(object): ) or 'none' 
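# (Hypothetical sample of the resulting debug line, for orientation only:
# 'exe versions: ffmpeg 5.1.2 (setts), ffprobe 5.1.2, rtmpdump 2.4')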
write_debug('exe versions: %s' % exe_str) - from .downloader.websocket import has_websockets - from .postprocessor.embedthumbnail import has_mutagen - from .cookies import SQLITE_AVAILABLE, SECRETSTORAGE_AVAILABLE - - lib_str = join_nonempty( - compat_brotli and compat_brotli.__name__, - has_certifi and 'certifi', - compat_pycrypto_AES and compat_pycrypto_AES.__name__.split('.')[0], - SECRETSTORAGE_AVAILABLE and 'secretstorage', - has_mutagen and 'mutagen', - SQLITE_AVAILABLE and 'sqlite', - has_websockets and 'websockets', - delim=', ') or 'none' - write_debug('Optional libraries: %s' % lib_str) + from .compat.compat_utils import get_package_info + from .dependencies import available_dependencies + + write_debug('Optional libraries: %s' % (', '.join(sorted({ + join_nonempty(*get_package_info(m)) for m in available_dependencies.values() + })) or 'none')) + self._setup_opener() proxy_map = {} for handler in self._opener.handlers: if hasattr(handler, 'proxies'): @@ -3703,10 +3808,10 @@ class YoutubeDL(object): # Not implemented if False and self.params.get('call_home'): - ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode('utf-8') + ipaddr = self.urlopen('https://yt-dl.org/ip').read().decode() write_debug('Public IP address: %s' % ipaddr) latest_version = self.urlopen( - 'https://yt-dl.org/latest/version').read().decode('utf-8') + 'https://yt-dl.org/latest/version').read().decode() if version_tuple(latest_version) > version_tuple(__version__): self.report_warning( 'You are using an outdated version (newest version: %s)! ' @@ -3714,6 +3819,8 @@ class YoutubeDL(object): latest_version) def _setup_opener(self): + if hasattr(self, '_opener'): + return timeout_val = self.params.get('socket_timeout') self._socket_timeout = 20 if timeout_val is None else float(timeout_val) @@ -3730,7 +3837,7 @@ class YoutubeDL(object): else: proxies = {'http': opts_proxy, 'https': opts_proxy} else: - proxies = compat_urllib_request.getproxies() + proxies = urllib.request.getproxies() # Set HTTPS proxy to HTTP one if given (https://github.com/ytdl-org/youtube-dl/issues/805) if 'http' in proxies and 'https' not in proxies: proxies['https'] = proxies['http'] @@ -3740,19 +3847,19 @@ class YoutubeDL(object): https_handler = make_HTTPS_handler(self.params, debuglevel=debuglevel) ydlh = YoutubeDLHandler(self.params, debuglevel=debuglevel) redirect_handler = YoutubeDLRedirectHandler() - data_handler = compat_urllib_request_DataHandler() + data_handler = urllib.request.DataHandler() # When passing our own FileHandler instance, build_opener won't add the # default FileHandler and allows us to disable the file protocol, which # can be used for malicious purposes (see # https://github.com/ytdl-org/youtube-dl/issues/8227) - file_handler = compat_urllib_request.FileHandler() + file_handler = urllib.request.FileHandler() def file_open(*args, **kwargs): - raise compat_urllib_error.URLError('file:// scheme is explicitly disabled in hypervideo for security reasons') + raise urllib.error.URLError('file:// scheme is explicitly disabled in hypervideo for security reasons') file_handler.file_open = file_open - opener = compat_urllib_request.build_opener( + opener = urllib.request.build_opener( proxy_handler, https_handler, cookie_processor, ydlh, redirect_handler, data_handler, file_handler) # Delete the default user-agent header, which would otherwise apply in @@ -3796,7 +3903,7 @@ class YoutubeDL(object): try: write_json_file(self.sanitize_info(ie_result, self.params.get('clean_infojson', True)), infofn) return True - except 
(OSError, IOError): + except OSError: self.report_error(f'Cannot write {label} metadata to JSON file {infofn}') return None @@ -3817,9 +3924,9 @@ class YoutubeDL(object): else: try: self.to_screen(f'[info] Writing {label} description to: {descfn}') - with io.open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: + with open(encodeFilename(descfn), 'w', encoding='utf-8') as descfile: descfile.write(ie_result['description']) - except (OSError, IOError): + except OSError: self.report_error(f'Cannot write {label} description file {descfn}') return None return True @@ -3853,12 +3960,12 @@ class YoutubeDL(object): try: # Use newline='' to prevent conversion of newline characters # See https://github.com/ytdl-org/youtube-dl/issues/10268 - with io.open(sub_filename, 'w', encoding='utf-8', newline='') as subfile: + with open(sub_filename, 'w', encoding='utf-8', newline='') as subfile: subfile.write(sub_info['data']) sub_info['filepath'] = sub_filename ret.append((sub_filename, sub_filename_final)) continue - except (OSError, IOError): + except OSError: self.report_error(f'Cannot write video subtitles file {sub_filename}') return None diff --git a/hypervideo_dl/__init__.py b/hypervideo_dl/__init__.py index dc53a9e..8ac1c0c 100644 --- a/hypervideo_dl/__init__.py +++ b/hypervideo_dl/__init__.py @@ -1,81 +1,80 @@ #!/usr/bin/python -# coding: utf-8 +f'You are using an unsupported version of Python. Only Python versions 3.6 and above are supported by hypervideo' # noqa: F541 __license__ = 'CC0-1.0' -import codecs -import io +import getpass import itertools +import optparse import os -import random import re import sys +from .compat import compat_shlex_quote, workaround_optparse_bug9161 +from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS +from .downloader import FileDownloader +from .downloader.external import get_external_downloader +from .extractor import list_extractor_classes +from .extractor.adobepass import MSO_INFO +from .extractor.common import InfoExtractor from .options import parseOpts -from .compat import ( - compat_getpass, - compat_os_name, - compat_shlex_quote, - workaround_optparse_bug9161, +from .postprocessor import ( + FFmpegExtractAudioPP, + FFmpegSubtitlesConvertorPP, + FFmpegThumbnailsConvertorPP, + FFmpegVideoConvertorPP, + FFmpegVideoRemuxerPP, + MetadataFromFieldPP, + MetadataParserPP, ) -from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS from .utils import ( + NO_DEFAULT, + POSTPROCESS_WHEN, DateRange, - decodeOption, DownloadCancelled, DownloadError, + GeoUtils, + PlaylistEntries, + SameFileError, + decodeOption, + download_range_func, expand_path, float_or_none, - GeoUtils, + format_field, int_or_none, match_filter_func, - NO_DEFAULT, parse_duration, preferredencoding, read_batch_urls, + read_stdin, render_table, - SameFileError, setproctitle, std_headers, traverse_obj, + variadic, write_string, ) -from .downloader import ( - FileDownloader, -) -from .extractor import gen_extractors, list_extractors -from .extractor.common import InfoExtractor -from .extractor.adobepass import MSO_INFO -from .postprocessor import ( - FFmpegExtractAudioPP, - FFmpegSubtitlesConvertorPP, - FFmpegThumbnailsConvertorPP, - FFmpegVideoConvertorPP, - FFmpegVideoRemuxerPP, - MetadataFromFieldPP, - MetadataParserPP, -) from .YoutubeDL import YoutubeDL +def _exit(status=0, *args): + for msg in args: + sys.stderr.write(msg) + raise SystemExit(status) + + def get_urls(urls, batchfile, verbose): # Batch file verification batch_urls = [] if batchfile is not None: try: - if 
batchfile == '-': - write_string('Reading URLs from stdin - EOF (%s) to end:\n' % ( - 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D')) - batchfd = sys.stdin - else: - batchfd = io.open( - expand_path(batchfile), - 'r', encoding='utf-8', errors='ignore') - batch_urls = read_batch_urls(batchfd) + batch_urls = read_batch_urls( + read_stdin('URLs') if batchfile == '-' + else open(expand_path(batchfile), encoding='utf-8', errors='ignore')) if verbose: write_string('[debug] Batch file urls: ' + repr(batch_urls) + '\n') - except IOError: - sys.exit('ERROR: batch file %s could not be read' % batchfile) + except OSError: + _exit(f'ERROR: batch file {batchfile} could not be read') _enc = preferredencoding() return [ url.strip().decode(_enc, 'ignore') if isinstance(url, bytes) else url.strip() @@ -83,6 +82,11 @@ def get_urls(urls, batchfile, verbose): def print_extractor_information(opts, urls): + # Importing GenericIE is currently slow since it imports other extractors + # TODO: Move this back to module level after generalization of embed detection + from .extractor.generic import GenericIE + + out = '' if opts.list_extractors: for ie in list_extractors(opts.age_limit): write_string(ie.IE_NAME + (' (CURRENTLY BROKEN)' if not ie.working() else '') + '\n', out=sys.stdout) @@ -218,15 +222,11 @@ def validate_options(opts): validate_regex('format sorting', f, InfoExtractor.FormatSort.regex) # Postprocessor formats - validate_in('audio format', opts.audioformat, ['best'] + list(FFmpegExtractAudioPP.SUPPORTED_EXTS)) + validate_regex('audio format', opts.audioformat, FFmpegExtractAudioPP.FORMAT_RE) validate_in('subtitle format', opts.convertsubtitles, FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS) - validate_in('thumbnail format', opts.convertthumbnails, FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS) - if opts.recodevideo is not None: - opts.recodevideo = opts.recodevideo.replace(' ', '') - validate_regex('video recode format', opts.recodevideo, FFmpegVideoConvertorPP.FORMAT_RE) - if opts.remuxvideo is not None: - opts.remuxvideo = opts.remuxvideo.replace(' ', '') - validate_regex('video remux format', opts.remuxvideo, FFmpegVideoRemuxerPP.FORMAT_RE) + validate_regex('thumbnail format', opts.convertthumbnails, FFmpegThumbnailsConvertorPP.FORMAT_RE) + validate_regex('recode video format', opts.recodevideo, FFmpegVideoConvertorPP.FORMAT_RE) + validate_regex('remux video format', opts.remuxvideo, FFmpegVideoRemuxerPP.FORMAT_RE) if opts.audioquality: opts.audioquality = opts.audioquality.strip('k').strip('K') # int_or_none prevents inf, nan @@ -248,6 +248,28 @@ def validate_options(opts): opts.extractor_retries = parse_retries('extractor', opts.extractor_retries) opts.file_access_retries = parse_retries('file access', opts.file_access_retries) + # Retry sleep function + def parse_sleep_func(expr): + NUMBER_RE = r'\d+(?:\.\d+)?' 
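        # Worked examples of the expression grammar matched just below (derived
        # from this parsing code; retry-type keys in opts.retry_sleep such as
        # 'http' or 'fragment' are assumed, not shown in this hunk):
        #   '3'            -> op=None,     no step/limit -> min(3 + 0*n, inf), a constant 3s sleep
        #   '1:60'         -> op=None,     limit='60'    -> step defaults to start: min(1 + 1*n, 60)
        #   'linear=1::2'  -> op='linear', step='2'      -> min(1 + 2*n, inf)
        #   'exp=1:120'    -> op='exp',    limit='120'   -> step defaults to 2: min(1 * 2**n, 120)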
+ op, start, limit, step, *_ = tuple(re.fullmatch( + rf'(?:(linear|exp)=)?({NUMBER_RE})(?::({NUMBER_RE})?)?(?::({NUMBER_RE}))?', + expr.strip()).groups()) + (None, None) + + if op == 'exp': + return lambda n: min(float(start) * (float(step or 2) ** n), float(limit or 'inf')) + else: + default_step = start if op or limit else 0 + return lambda n: min(float(start) + float(step or default_step) * n, float(limit or 'inf')) + + for key, expr in opts.retry_sleep.items(): + if not expr: + del opts.retry_sleep[key] + continue + try: + opts.retry_sleep[key] = parse_sleep_func(expr) + except AttributeError: + raise ValueError(f'invalid {key} retry sleep expression {expr!r}') + # Bytes def parse_bytes(name, value): if value is None: @@ -292,20 +314,25 @@ def validate_options(opts): 'Cannot download a video and extract audio into the same file! ' f'Use "{outtmpl_default}.%(ext)s" instead of "{outtmpl_default}" as the output template') - # Remove chapters - remove_chapters_patterns, opts.remove_ranges = [], [] - for regex in opts.remove_chapters or []: - if regex.startswith('*'): - dur = list(map(parse_duration, regex[1:].split('-'))) - if len(dur) == 2 and all(t is not None for t in dur): - opts.remove_ranges.append(tuple(dur)) + def parse_chapters(name, value): + chapters, ranges = [], [] + for regex in value or []: + if regex.startswith('*'): + for range in regex[1:].split(','): + dur = tuple(map(parse_duration, range.strip().split('-'))) + if len(dur) == 2 and all(t is not None for t in dur): + ranges.append(dur) + else: + raise ValueError(f'invalid {name} time range "{regex}". Must be of the form *start-end') continue - raise ValueError(f'invalid --remove-chapters time range "{regex}". Must be of the form *start-end') - try: - remove_chapters_patterns.append(re.compile(regex)) - except re.error as err: - raise ValueError(f'invalid --remove-chapters regex "{regex}" - {err}') - opts.remove_chapters = remove_chapters_patterns + try: + chapters.append(re.compile(regex)) + except re.error as err: + raise ValueError(f'invalid {name} regex "{regex}" - {err}') + return chapters, ranges + + opts.remove_chapters, opts.remove_ranges = parse_chapters('--remove-chapters', opts.remove_chapters) + opts.download_ranges = download_range_func(*parse_chapters('--download-sections', opts.download_ranges)) # Cookies from browser if opts.cookiesfrombrowser: @@ -349,6 +376,12 @@ def validate_options(opts): opts.parse_metadata = list(itertools.chain(*map(metadataparser_actions, parse_metadata))) # Other options + if opts.playlist_items is not None: + try: + tuple(PlaylistEntries.parse_playlist_items(opts.playlist_items)) + except Exception as err: + raise ValueError(f'Invalid playlist-items {opts.playlist_items!r}: {err}') + geo_bypass_code = opts.geo_bypass_ip_block or opts.geo_bypass_country if geo_bypass_code is not None: try: @@ -369,6 +402,17 @@ def validate_options(opts): if opts.no_sponsorblock: opts.sponsorblock_mark = opts.sponsorblock_remove = set() + default_downloader = None + for proto, path in opts.external_downloader.items(): + if path == 'native': + continue + ed = get_external_downloader(path) + if ed is None: + raise ValueError( + f'No such {format_field(proto, None, "%s ", ignore="default")}external downloader "{path}"') + elif ed and proto == 'default': + default_downloader = ed.get_basename() + warnings, deprecation_warnings = [], [] # Common mistake: -f best @@ -379,13 +423,18 @@ def validate_options(opts): 'If you know what you are doing and want only the best pre-merged format, use "-f b" instead 
to suppress this warning'))) # --(postprocessor/downloader)-args without name - def report_args_compat(name, value, key1, key2=None): + def report_args_compat(name, value, key1, key2=None, where=None): if key1 in value and key2 not in value: - warnings.append(f'{name} arguments given without specifying name. The arguments will be given to all {name}s') + warnings.append(f'{name.title()} arguments given without specifying name. ' + f'The arguments will be given to {where or f"all {name}s"}') return True return False - report_args_compat('external downloader', opts.external_downloader_args, 'default') + if report_args_compat('external downloader', opts.external_downloader_args, + 'default', where=default_downloader) and default_downloader: + # Compat with youtube-dl's behavior. See https://github.com/ytdl-org/youtube-dl/commit/49c5293014bc11ec8c009856cd63cffa6296c1e1 + opts.external_downloader_args.setdefault(default_downloader, opts.external_downloader_args.pop('default')) + if report_args_compat('post-processor', opts.postprocessor_args, 'default-compat', 'default'): opts.postprocessor_args['default'] = opts.postprocessor_args.pop('default-compat') opts.postprocessor_args.setdefault('sponskrub', []) @@ -404,6 +453,9 @@ def validate_options(opts): setattr(opts, opt1, default) # Conflicting options + report_conflict('--playlist-reverse', 'playlist_reverse', '--playlist-random', 'playlist_random') + report_conflict('--playlist-reverse', 'playlist_reverse', '--lazy-playlist', 'lazy_playlist') + report_conflict('--playlist-random', 'playlist_random', '--lazy-playlist', 'lazy_playlist') report_conflict('--dateafter', 'dateafter', '--date', 'date', default=None) report_conflict('--datebefore', 'datebefore', '--date', 'date', default=None) report_conflict('--exec-before-download', 'exec_before_dl_cmd', '"--exec before_dl:"', 'exec_cmd', opts.exec_cmd.get('before_dl')) @@ -478,9 +530,9 @@ def validate_options(opts): # Ask for passwords if opts.username is not None and opts.password is None: - opts.password = compat_getpass('Type account password and press [Return]: ') + opts.password = getpass.getpass('Type account password and press [Return]: ') if opts.ap_username is not None and opts.ap_password is None: - opts.ap_password = compat_getpass('Type TV provider account password and press [Return]: ') + opts.ap_password = getpass.getpass('Type TV provider account password and press [Return]: ') return warnings, deprecation_warnings @@ -634,7 +686,7 @@ def parse_options(argv=None): final_ext = ( opts.recodevideo if opts.recodevideo in FFmpegVideoConvertorPP.SUPPORTED_EXTS else opts.remuxvideo if opts.remuxvideo in FFmpegVideoRemuxerPP.SUPPORTED_EXTS - else opts.audioformat if (opts.extractaudio and opts.audioformat != 'best') + else opts.audioformat if (opts.extractaudio and opts.audioformat in FFmpegExtractAudioPP.SUPPORTED_EXTS) else None) return parser, opts, urls, { @@ -690,6 +742,7 @@ def parse_options(argv=None): 'file_access_retries': opts.file_access_retries, 'fragment_retries': opts.fragment_retries, 'extractor_retries': opts.extractor_retries, + 'retry_sleep_functions': opts.retry_sleep, 'skip_unavailable_fragments': opts.skip_unavailable_fragments, 'keep_fragments': opts.keep_fragments, 'concurrent_fragment_downloads': opts.concurrent_fragment_downloads, @@ -704,6 +757,7 @@ def parse_options(argv=None): 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, 'playlistrandom': opts.playlist_random, + 'lazy_playlist': opts.lazy_playlist, 'noplaylist': opts.noplaylist, 
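    # Aside: these keys form the params dict consumed by YoutubeDL. A minimal
    # embedding sketch (assumed example values; the retry-type key 'http' and
    # the URL are illustrative only), mirroring the `with YoutubeDL(ydl_opts) as ydl`
    # pattern used in _real_main below:
    #     from hypervideo_dl import YoutubeDL
    #     with YoutubeDL({'retry_sleep_functions': {'http': lambda n: min(2 ** n, 60)},
    #                     'lazy_playlist': True}) as ydl:
    #         ydl.download(['https://example.com/playlist'])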
'logtostderr': opts.outtmpl.get('default') == '-', 'consoletitle': opts.consoletitle, @@ -735,6 +789,7 @@ def parse_options(argv=None): 'verbose': opts.verbose, 'dump_intermediate_pages': opts.dump_intermediate_pages, 'write_pages': opts.write_pages, + 'load_pages': opts.load_pages, 'test': opts.test, 'keepvideo': opts.keepvideo, 'min_filesize': opts.min_filesize, @@ -783,6 +838,8 @@ def parse_options(argv=None): 'max_sleep_interval': opts.max_sleep_interval, 'sleep_interval_subtitles': opts.sleep_interval_subtitles, 'external_downloader': opts.external_downloader, + 'download_ranges': opts.download_ranges, + 'force_keyframes_at_cuts': opts.force_keyframes_at_cuts, 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, 'xattr_set_filesize': opts.xattr_set_filesize, @@ -821,52 +878,66 @@ def _real_main(argv=None): if opts.dump_user_agent: ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent']) write_string(f'{ua}\n', out=sys.stdout) - sys.exit(0) + return if print_extractor_information(opts, all_urls): - sys.exit(0) + return with YoutubeDL(ydl_opts) as ydl: + pre_process = opts.update_self or opts.rm_cachedir actual_use = all_urls or opts.load_info_filename - # Remove cache dir if opts.rm_cachedir: ydl.cache.remove() - # Maybe do nothing + updater = Updater(ydl) + if opts.update_self and updater.update() and actual_use: + if updater.cmd: + return updater.restart() + # This code is reachable only for zip variant in py < 3.10 + # It makes sense to exit here, but the old behavior is to continue + ydl.report_warning('Restart hypervideo to use the updated version') + # return 100, 'ERROR: The program must exit for the update to complete' + if not actual_use: + if pre_process: + return ydl._download_retcode + ydl.warn_if_short_id(sys.argv[1:] if argv is None else argv) parser.error( 'You must provide at least one URL.\n' 'Type hypervideo --help to see a list of all options.') + parser.destroy() try: if opts.load_info_filename is not None: - retcode = ydl.download_with_info_file(expand_path(opts.load_info_filename)) + return ydl.download_with_info_file(expand_path(opts.load_info_filename)) else: - retcode = ydl.download(all_urls) + return ydl.download(all_urls) except DownloadCancelled: ydl.to_screen('Aborting remaining downloads') - retcode = 101 - - sys.exit(retcode) + return 101 def main(argv=None): try: - _real_main(argv) + _exit(*variadic(_real_main(argv))) except DownloadError: - sys.exit(1) + _exit(1) except SameFileError as e: - sys.exit(f'ERROR: {e}') + _exit(f'ERROR: {e}') except KeyboardInterrupt: - sys.exit('\nERROR: Interrupted by user') + _exit('\nERROR: Interrupted by user') except BrokenPipeError as e: # https://docs.python.org/3/library/signal.html#note-on-sigpipe devnull = os.open(os.devnull, os.O_WRONLY) os.dup2(devnull, sys.stdout.fileno()) - sys.exit(f'\nERROR: {e}') + _exit(f'\nERROR: {e}') + except optparse.OptParseError as e: + _exit(2, f'\n{e}') + +from .extractor import gen_extractors, list_extractors __all__ = [ 'main', diff --git a/hypervideo_dl/__main__.py b/hypervideo_dl/__main__.py index 49765e4..c45082e 100644 --- a/hypervideo_dl/__main__.py +++ b/hypervideo_dl/__main__.py @@ -1,13 +1,11 @@ #!/usr/bin/env python3 -from __future__ import unicode_literals # Execute with -# $ python hypervideo_dl/__main__.py (2.6+) -# $ python -m hypervideo_dl (2.7+) +# $ python -m hypervideo_dl import sys -if __package__ is None and not hasattr(sys, 'frozen'): +if __package__ is None and not getattr(sys, 'frozen', 
False): # direct call of __main__.py import os.path path = os.path.realpath(os.path.abspath(__file__)) diff --git a/hypervideo_dl/aes.py b/hypervideo_dl/aes.py index b37f0dd..60ce99c 100644 --- a/hypervideo_dl/aes.py +++ b/hypervideo_dl/aes.py @@ -1,26 +1,18 @@ -from __future__ import unicode_literals - +import base64 from math import ceil -from .compat import ( - compat_b64decode, - compat_ord, - compat_pycrypto_AES, -) -from .utils import ( - bytes_to_intlist, - intlist_to_bytes, -) - +from .compat import compat_ord +from .dependencies import Cryptodome_AES +from .utils import bytes_to_intlist, intlist_to_bytes -if compat_pycrypto_AES: +if Cryptodome_AES: def aes_cbc_decrypt_bytes(data, key, iv): """ Decrypt bytes with AES-CBC using pycryptodome """ - return compat_pycrypto_AES.new(key, compat_pycrypto_AES.MODE_CBC, iv).decrypt(data) + return Cryptodome_AES.new(key, Cryptodome_AES.MODE_CBC, iv).decrypt(data) def aes_gcm_decrypt_and_verify_bytes(data, key, tag, nonce): """ Decrypt bytes with AES-GCM using pycryptodome """ - return compat_pycrypto_AES.new(key, compat_pycrypto_AES.MODE_GCM, nonce).decrypt_and_verify(data, tag) + return Cryptodome_AES.new(key, Cryptodome_AES.MODE_GCM, nonce).decrypt_and_verify(data, tag) else: def aes_cbc_decrypt_bytes(data, key, iv): @@ -32,16 +24,59 @@ else: return intlist_to_bytes(aes_gcm_decrypt_and_verify(*map(bytes_to_intlist, (data, key, tag, nonce)))) +def aes_cbc_encrypt_bytes(data, key, iv, **kwargs): + return intlist_to_bytes(aes_cbc_encrypt(*map(bytes_to_intlist, (data, key, iv)), **kwargs)) + + +BLOCK_SIZE_BYTES = 16 + + def unpad_pkcs7(data): return data[:-compat_ord(data[-1])] -BLOCK_SIZE_BYTES = 16 +def pkcs7_padding(data): + """ + PKCS#7 padding + + @param {int[]} data cleartext + @returns {int[]} padding data + """ + + remaining_length = BLOCK_SIZE_BYTES - len(data) % BLOCK_SIZE_BYTES + return data + [remaining_length] * remaining_length + + +def pad_block(block, padding_mode): + """ + Pad a block with the given padding mode + @param {int[]} block block to pad + @param padding_mode padding mode + """ + padding_size = BLOCK_SIZE_BYTES - len(block) + + PADDING_BYTE = { + 'pkcs7': padding_size, + 'iso7816': 0x0, + 'whitespace': 0x20, + 'zero': 0x0, + } + + if padding_size < 0: + raise ValueError('Block size exceeded') + elif padding_mode not in PADDING_BYTE: + raise NotImplementedError(f'Padding mode {padding_mode} is not implemented') + + if padding_mode == 'iso7816' and padding_size: + block = block + [0x80] # NB: += mutates list + padding_size -= 1 + + return block + [PADDING_BYTE[padding_mode]] * padding_size def aes_ecb_encrypt(data, key, iv=None): """ - Encrypt with aes in ECB mode + Encrypt with aes in ECB mode. Using PKCS#7 padding @param {int[]} data cleartext @param {int[]} key 16/24/32-Byte cipher key @@ -54,8 +89,7 @@ def aes_ecb_encrypt(data, key, iv=None): encrypted_data = [] for i in range(block_count): block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - encrypted_data += aes_encrypt(block, expanded_key) - encrypted_data = encrypted_data[:len(data)] + encrypted_data += aes_encrypt(pkcs7_padding(block), expanded_key) return encrypted_data @@ -145,13 +179,14 @@ def aes_cbc_decrypt(data, key, iv): return decrypted_data -def aes_cbc_encrypt(data, key, iv): +def aes_cbc_encrypt(data, key, iv, *, padding_mode='pkcs7'): """ - Encrypt with aes in CBC mode. 
Using PKCS#7 padding + Encrypt with aes in CBC mode @param {int[]} data cleartext @param {int[]} key 16/24/32-Byte cipher key @param {int[]} iv 16-Byte IV + @param padding_mode Padding mode to use @returns {int[]} encrypted data """ expanded_key = key_expansion(key) @@ -161,8 +196,8 @@ def aes_cbc_encrypt(data, key, iv): previous_cipher_block = iv for i in range(block_count): block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] - remaining_length = BLOCK_SIZE_BYTES - len(block) - block += [remaining_length] * remaining_length + block = pad_block(block, padding_mode) + mixed_block = xor(block, previous_cipher_block) encrypted_block = aes_encrypt(mixed_block, expanded_key) @@ -273,8 +308,8 @@ def aes_decrypt_text(data, password, key_size_bytes): """ NONCE_LENGTH_BYTES = 8 - data = bytes_to_intlist(compat_b64decode(data)) - password = bytes_to_intlist(password.encode('utf-8')) + data = bytes_to_intlist(base64.b64decode(data)) + password = bytes_to_intlist(password.encode()) key = password[:key_size_bytes] + [0] * (key_size_bytes - len(password)) key = aes_encrypt(key[:BLOCK_SIZE_BYTES], key_expansion(key)) * (key_size_bytes // BLOCK_SIZE_BYTES) @@ -503,20 +538,30 @@ def ghash(subkey, data): last_y = [0] * BLOCK_SIZE_BYTES for i in range(0, len(data), BLOCK_SIZE_BYTES): - block = data[i : i + BLOCK_SIZE_BYTES] # noqa: E203 + block = data[i: i + BLOCK_SIZE_BYTES] last_y = block_product(xor(last_y, block), subkey) return last_y __all__ = [ - 'aes_ctr_decrypt', 'aes_cbc_decrypt', 'aes_cbc_decrypt_bytes', + 'aes_ctr_decrypt', 'aes_decrypt_text', - 'aes_encrypt', + 'aes_decrypt', + 'aes_ecb_decrypt', 'aes_gcm_decrypt_and_verify', 'aes_gcm_decrypt_and_verify_bytes', + + 'aes_cbc_encrypt', + 'aes_cbc_encrypt_bytes', + 'aes_ctr_encrypt', + 'aes_ecb_encrypt', + 'aes_encrypt', + 'key_expansion', + 'pad_block', + 'pkcs7_padding', 'unpad_pkcs7', ] diff --git a/hypervideo_dl/cache.py b/hypervideo_dl/cache.py index 24acb1b..2e9c1ef 100644 --- a/hypervideo_dl/cache.py +++ b/hypervideo_dl/cache.py @@ -1,28 +1,23 @@ -from __future__ import unicode_literals - +import contextlib import errno -import io import json import os import re import shutil import traceback -from .compat import compat_getenv -from .utils import ( - expand_path, - write_json_file, -) +from .utils import expand_path, traverse_obj, version_tuple, write_json_file +from .version import __version__ -class Cache(object): +class Cache: def __init__(self, ydl): self._ydl = ydl def _get_root_dir(self): res = self._ydl.params.get('cachedir') if res is None: - cache_root = compat_getenv('XDG_CACHE_HOME', '~/.cache') + cache_root = os.getenv('XDG_CACHE_HOME', '~/.cache') res = os.path.join(cache_root, 'hypervideo') return expand_path(res) @@ -31,7 +26,7 @@ class Cache(object): 'invalid section %r' % section assert re.match(r'^[a-zA-Z0-9_.-]+$', key), 'invalid key %r' % key return os.path.join( - self._get_root_dir(), section, '%s.%s' % (key, dtype)) + self._get_root_dir(), section, f'{key}.{dtype}') @property def enabled(self): @@ -51,33 +46,37 @@ class Cache(object): if ose.errno != errno.EEXIST: raise self._ydl.write_debug(f'Saving {section}.{key} to cache') - write_json_file(data, fn) + write_json_file({'hypervideo_version': __version__, 'data': data}, fn) except Exception: tb = traceback.format_exc() - self._ydl.report_warning( - 'Writing cache to %r failed: %s' % (fn, tb)) + self._ydl.report_warning(f'Writing cache to {fn!r} failed: {tb}') + + def _validate(self, data, min_ver): + version = traverse_obj(data, 'hypervideo_version') + if 
not version: # Backward compatibility + data, version = {'data': data}, '2022.08.19' + if not min_ver or version_tuple(version) >= version_tuple(min_ver): + return data['data'] + self._ydl.write_debug(f'Discarding old cache from version {version} (needs {min_ver})') - def load(self, section, key, dtype='json', default=None): + def load(self, section, key, dtype='json', default=None, *, min_ver=None): assert dtype in ('json',) if not self.enabled: return default cache_fn = self._get_cache_fn(section, key, dtype) - try: + with contextlib.suppress(OSError): try: - with io.open(cache_fn, 'r', encoding='utf-8') as cachef: + with open(cache_fn, encoding='utf-8') as cachef: self._ydl.write_debug(f'Loading {section}.{key} from cache') - return json.load(cachef) - except ValueError: + return self._validate(json.load(cachef), min_ver) + except (ValueError, KeyError): try: file_size = os.path.getsize(cache_fn) - except (OSError, IOError) as oe: + except OSError as oe: file_size = str(oe) - self._ydl.report_warning( - 'Cache retrieval from %s failed (%s)' % (cache_fn, file_size)) - except IOError: - pass # No cache available + self._ydl.report_warning(f'Cache retrieval from {cache_fn} failed ({file_size})') return default diff --git a/hypervideo_dl/compat.py b/hypervideo_dl/compat.py deleted file mode 100644 index bdea14c..0000000 --- a/hypervideo_dl/compat.py +++ /dev/null @@ -1,330 +0,0 @@ -# coding: utf-8 - -import asyncio -import base64 -import collections -import ctypes -import getpass -import html -import html.parser -import http -import http.client -import http.cookiejar -import http.cookies -import http.server -import itertools -import optparse -import os -import re -import shlex -import shutil -import socket -import struct -import subprocess -import sys -import tokenize -import urllib -import xml.etree.ElementTree as etree -from subprocess import DEVNULL - - -# HTMLParseError has been deprecated in Python 3.3 and removed in -# Python 3.5. 
Introducing dummy exception for Python >3.5 for compatible -# and uniform cross-version exception handling -class compat_HTMLParseError(Exception): - pass - - -# compat_ctypes_WINFUNCTYPE = ctypes.WINFUNCTYPE -# will not work since ctypes.WINFUNCTYPE does not exist in UNIX machines -def compat_ctypes_WINFUNCTYPE(*args, **kwargs): - return ctypes.WINFUNCTYPE(*args, **kwargs) - - -class _TreeBuilder(etree.TreeBuilder): - def doctype(self, name, pubid, system): - pass - - -def compat_etree_fromstring(text): - return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) - - -compat_os_name = os._name if os.name == 'java' else os.name - - -if compat_os_name == 'nt': - def compat_shlex_quote(s): - return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') -else: - from shlex import quote as compat_shlex_quote - - -def compat_ord(c): - if type(c) is int: - return c - else: - return ord(c) - - -def compat_setenv(key, value, env=os.environ): - env[key] = value - - -if compat_os_name == 'nt' and sys.version_info < (3, 8): - # os.path.realpath on Windows does not follow symbolic links - # prior to Python 3.8 (see https://bugs.python.org/issue9949) - def compat_realpath(path): - while os.path.islink(path): - path = os.path.abspath(os.readlink(path)) - return path -else: - compat_realpath = os.path.realpath - - -def compat_print(s): - assert isinstance(s, compat_str) - print(s) - - -# Fix https://github.com/ytdl-org/youtube-dl/issues/4223 -# See http://bugs.python.org/issue9161 for what is broken -def workaround_optparse_bug9161(): - op = optparse.OptionParser() - og = optparse.OptionGroup(op, 'foo') - try: - og.add_option('-t') - except TypeError: - real_add_option = optparse.OptionGroup.add_option - - def _compat_add_option(self, *args, **kwargs): - enc = lambda v: ( - v.encode('ascii', 'replace') if isinstance(v, compat_str) - else v) - bargs = [enc(a) for a in args] - bkwargs = dict( - (k, enc(v)) for k, v in kwargs.items()) - return real_add_option(self, *bargs, **bkwargs) - optparse.OptionGroup.add_option = _compat_add_option - - -try: - compat_Pattern = re.Pattern -except AttributeError: - compat_Pattern = type(re.compile('')) - - -try: - compat_Match = re.Match -except AttributeError: - compat_Match = type(re.compile('').match('')) - - -try: - compat_asyncio_run = asyncio.run # >= 3.7 -except AttributeError: - def compat_asyncio_run(coro): - try: - loop = asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) - loop.run_until_complete(coro) - - asyncio.run = compat_asyncio_run - - -try: # >= 3.7 - asyncio.tasks.all_tasks -except AttributeError: - asyncio.tasks.all_tasks = asyncio.tasks.Task.all_tasks - -try: - import websockets as compat_websockets -except ImportError: - compat_websockets = None - -# Python 3.8+ does not honor %HOME% on windows, but this breaks compatibility with youtube-dl -# See https://github.com/hypervideo/hypervideo/issues/792 -# https://docs.python.org/3/library/os.path.html#os.path.expanduser -if compat_os_name in ('nt', 'ce') and 'HOME' in os.environ: - _userhome = os.environ['HOME'] - - def compat_expanduser(path): - if not path.startswith('~'): - return path - i = path.replace('\\', '/', 1).find('/') # ~user - if i < 0: - i = len(path) - userhome = os.path.join(os.path.dirname(_userhome), path[1:i]) if i > 1 else _userhome - return userhome + path[i:] -else: - compat_expanduser = os.path.expanduser - - -try: - from Cryptodome.Cipher import AES as compat_pycrypto_AES -except 
ImportError: - try: - from Crypto.Cipher import AES as compat_pycrypto_AES - except ImportError: - compat_pycrypto_AES = None - -try: - import brotlicffi as compat_brotli -except ImportError: - try: - import brotli as compat_brotli - except ImportError: - compat_brotli = None - -WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None - - -def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075 - if compat_os_name != 'nt': - return - global WINDOWS_VT_MODE - startupinfo = subprocess.STARTUPINFO() - startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW - try: - subprocess.Popen('', shell=True, startupinfo=startupinfo) - WINDOWS_VT_MODE = True - except Exception: - pass - - -# Deprecated - -compat_basestring = str -compat_chr = chr -compat_filter = filter -compat_input = input -compat_integer_types = (int, ) -compat_kwargs = lambda kwargs: kwargs -compat_map = map -compat_numeric_types = (int, float, complex) -compat_str = str -compat_xpath = lambda xpath: xpath -compat_zip = zip - -compat_collections_abc = collections.abc -compat_HTMLParser = html.parser.HTMLParser -compat_HTTPError = urllib.error.HTTPError -compat_Struct = struct.Struct -compat_b64decode = base64.b64decode -compat_cookiejar = http.cookiejar -compat_cookiejar_Cookie = compat_cookiejar.Cookie -compat_cookies = http.cookies -compat_cookies_SimpleCookie = compat_cookies.SimpleCookie -compat_etree_Element = etree.Element -compat_etree_register_namespace = etree.register_namespace -compat_get_terminal_size = shutil.get_terminal_size -compat_getenv = os.getenv -compat_getpass = getpass.getpass -compat_html_entities = html.entities -compat_html_entities_html5 = compat_html_entities.html5 -compat_http_client = http.client -compat_http_server = http.server -compat_itertools_count = itertools.count -compat_parse_qs = urllib.parse.parse_qs -compat_shlex_split = shlex.split -compat_socket_create_connection = socket.create_connection -compat_struct_pack = struct.pack -compat_struct_unpack = struct.unpack -compat_subprocess_get_DEVNULL = lambda: DEVNULL -compat_tokenize_tokenize = tokenize.tokenize -compat_urllib_error = urllib.error -compat_urllib_parse = urllib.parse -compat_urllib_parse_quote = urllib.parse.quote -compat_urllib_parse_quote_plus = urllib.parse.quote_plus -compat_urllib_parse_unquote = urllib.parse.unquote -compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus -compat_urllib_parse_unquote_to_bytes = urllib.parse.unquote_to_bytes -compat_urllib_parse_urlencode = urllib.parse.urlencode -compat_urllib_parse_urlparse = urllib.parse.urlparse -compat_urllib_parse_urlunparse = urllib.parse.urlunparse -compat_urllib_request = urllib.request -compat_urllib_request_DataHandler = urllib.request.DataHandler -compat_urllib_response = urllib.response -compat_urlparse = urllib.parse -compat_urlretrieve = urllib.request.urlretrieve -compat_xml_parse_error = etree.ParseError - - -# Set public objects - -__all__ = [ - 'WINDOWS_VT_MODE', - 'compat_HTMLParseError', - 'compat_HTMLParser', - 'compat_HTTPError', - 'compat_Match', - 'compat_Pattern', - 'compat_Struct', - 'compat_asyncio_run', - 'compat_b64decode', - 'compat_basestring', - 'compat_brotli', - 'compat_chr', - 'compat_collections_abc', - 'compat_cookiejar', - 'compat_cookiejar_Cookie', - 'compat_cookies', - 'compat_cookies_SimpleCookie', - 'compat_ctypes_WINFUNCTYPE', - 'compat_etree_Element', - 'compat_etree_fromstring', - 'compat_etree_register_namespace', - 'compat_expanduser', - 'compat_filter', - 'compat_get_terminal_size', - 
'compat_getenv', - 'compat_getpass', - 'compat_html_entities', - 'compat_html_entities_html5', - 'compat_http_client', - 'compat_http_server', - 'compat_input', - 'compat_integer_types', - 'compat_itertools_count', - 'compat_kwargs', - 'compat_map', - 'compat_numeric_types', - 'compat_ord', - 'compat_os_name', - 'compat_parse_qs', - 'compat_print', - 'compat_pycrypto_AES', - 'compat_realpath', - 'compat_setenv', - 'compat_shlex_quote', - 'compat_shlex_split', - 'compat_socket_create_connection', - 'compat_str', - 'compat_struct_pack', - 'compat_struct_unpack', - 'compat_subprocess_get_DEVNULL', - 'compat_tokenize_tokenize', - 'compat_urllib_error', - 'compat_urllib_parse', - 'compat_urllib_parse_quote', - 'compat_urllib_parse_quote_plus', - 'compat_urllib_parse_unquote', - 'compat_urllib_parse_unquote_plus', - 'compat_urllib_parse_unquote_to_bytes', - 'compat_urllib_parse_urlencode', - 'compat_urllib_parse_urlparse', - 'compat_urllib_parse_urlunparse', - 'compat_urllib_request', - 'compat_urllib_request_DataHandler', - 'compat_urllib_response', - 'compat_urlparse', - 'compat_urlretrieve', - 'compat_websockets', - 'compat_xml_parse_error', - 'compat_xpath', - 'compat_zip', - 'windows_enable_vt_mode', - 'workaround_optparse_bug9161', -] diff --git a/hypervideo_dl/compat/__init__.py b/hypervideo_dl/compat/__init__.py new file mode 100644 index 0000000..2f2621b --- /dev/null +++ b/hypervideo_dl/compat/__init__.py @@ -0,0 +1,78 @@ +import os +import sys +import warnings +import xml.etree.ElementTree as etree + +from ._deprecated import * # noqa: F401, F403 +from .compat_utils import passthrough_module + +# XXX: Implement this the same way as other DeprecationWarnings without circular import +passthrough_module(__name__, '._legacy', callback=lambda attr: warnings.warn( + DeprecationWarning(f'{__name__}.{attr} is deprecated'), stacklevel=3)) + + +# HTMLParseError has been deprecated in Python 3.3 and removed in +# Python 3.5. 
Introducing dummy exception for Python >3.5 for compatible +# and uniform cross-version exception handling +class compat_HTMLParseError(ValueError): + pass + + +class _TreeBuilder(etree.TreeBuilder): + def doctype(self, name, pubid, system): + pass + + +def compat_etree_fromstring(text): + return etree.XML(text, parser=etree.XMLParser(target=_TreeBuilder())) + + +compat_os_name = os._name if os.name == 'java' else os.name + + +if compat_os_name == 'nt': + def compat_shlex_quote(s): + import re + return s if re.match(r'^[-_\w./]+$', s) else '"%s"' % s.replace('"', '\\"') +else: + from shlex import quote as compat_shlex_quote # noqa: F401 + + +def compat_ord(c): + return c if isinstance(c, int) else ord(c) + + +if compat_os_name == 'nt' and sys.version_info < (3, 8): + # os.path.realpath on Windows does not follow symbolic links + # prior to Python 3.8 (see https://bugs.python.org/issue9949) + def compat_realpath(path): + while os.path.islink(path): + path = os.path.abspath(os.readlink(path)) + return os.path.realpath(path) +else: + compat_realpath = os.path.realpath + + +# Python 3.8+ does not honor %HOME% on windows, but this breaks compatibility with youtube-dl +# See https://github.com/hypervideo/hypervideo/issues/792 +# https://docs.python.org/3/library/os.path.html#os.path.expanduser +if compat_os_name in ('nt', 'ce'): + def compat_expanduser(path): + HOME = os.environ.get('HOME') + if not HOME: + return os.path.expanduser(path) + elif not path.startswith('~'): + return path + i = path.replace('\\', '/', 1).find('/') # ~user + if i < 0: + i = len(path) + userhome = os.path.join(os.path.dirname(HOME), path[1:i]) if i > 1 else HOME + return userhome + path[i:] +else: + compat_expanduser = os.path.expanduser + + +# NB: Add modules that are imported dynamically here so that PyInstaller can find them +# See https://github.com/pyinstaller/pyinstaller-hooks-contrib/issues/438 +if False: + from . import _legacy # noqa: F401 diff --git a/hypervideo_dl/compat/_deprecated.py b/hypervideo_dl/compat/_deprecated.py new file mode 100644 index 0000000..342f1f8 --- /dev/null +++ b/hypervideo_dl/compat/_deprecated.py @@ -0,0 +1,16 @@ +"""Deprecated - New code should avoid these""" + +import base64 +import urllib.error +import urllib.parse + +compat_str = str + +compat_b64decode = base64.b64decode + +compat_HTTPError = urllib.error.HTTPError +compat_urlparse = urllib.parse +compat_parse_qs = urllib.parse.parse_qs +compat_urllib_parse_unquote = urllib.parse.unquote +compat_urllib_parse_urlencode = urllib.parse.urlencode +compat_urllib_parse_urlparse = urllib.parse.urlparse diff --git a/hypervideo_dl/compat/_legacy.py b/hypervideo_dl/compat/_legacy.py new file mode 100644 index 0000000..d19333d --- /dev/null +++ b/hypervideo_dl/compat/_legacy.py @@ -0,0 +1,97 @@ +""" Do not use! 
""" + +import collections +import ctypes +import getpass +import html.entities +import html.parser +import http.client +import http.cookiejar +import http.cookies +import http.server +import itertools +import os +import shlex +import shutil +import socket +import struct +import tokenize +import urllib.error +import urllib.parse +import urllib.request +import xml.etree.ElementTree as etree +from subprocess import DEVNULL + +# isort: split +import asyncio # noqa: F401 +import re # noqa: F401 +from asyncio import run as compat_asyncio_run # noqa: F401 +from re import Pattern as compat_Pattern # noqa: F401 +from re import match as compat_Match # noqa: F401 + +from .compat_utils import passthrough_module +from ..dependencies import Cryptodome_AES as compat_pycrypto_AES # noqa: F401 +from ..dependencies import brotli as compat_brotli # noqa: F401 +from ..dependencies import websockets as compat_websockets # noqa: F401 + +passthrough_module(__name__, '...utils', ('WINDOWS_VT_MODE', 'windows_enable_vt_mode')) + + +# compat_ctypes_WINFUNCTYPE = ctypes.WINFUNCTYPE +# will not work since ctypes.WINFUNCTYPE does not exist in UNIX machines +def compat_ctypes_WINFUNCTYPE(*args, **kwargs): + return ctypes.WINFUNCTYPE(*args, **kwargs) + + +def compat_setenv(key, value, env=os.environ): + env[key] = value + + +compat_basestring = str +compat_casefold = str.casefold +compat_chr = chr +compat_collections_abc = collections.abc +compat_cookiejar = http.cookiejar +compat_cookiejar_Cookie = http.cookiejar.Cookie +compat_cookies = http.cookies +compat_cookies_SimpleCookie = http.cookies.SimpleCookie +compat_etree_Element = etree.Element +compat_etree_register_namespace = etree.register_namespace +compat_filter = filter +compat_get_terminal_size = shutil.get_terminal_size +compat_getenv = os.getenv +compat_getpass = getpass.getpass +compat_html_entities = html.entities +compat_html_entities_html5 = html.entities.html5 +compat_HTMLParser = html.parser.HTMLParser +compat_http_client = http.client +compat_http_server = http.server +compat_input = input +compat_integer_types = (int, ) +compat_itertools_count = itertools.count +compat_kwargs = lambda kwargs: kwargs +compat_map = map +compat_numeric_types = (int, float, complex) +compat_print = print +compat_shlex_split = shlex.split +compat_socket_create_connection = socket.create_connection +compat_Struct = struct.Struct +compat_struct_pack = struct.pack +compat_struct_unpack = struct.unpack +compat_subprocess_get_DEVNULL = lambda: DEVNULL +compat_tokenize_tokenize = tokenize.tokenize +compat_urllib_error = urllib.error +compat_urllib_parse = urllib.parse +compat_urllib_parse_quote = urllib.parse.quote +compat_urllib_parse_quote_plus = urllib.parse.quote_plus +compat_urllib_parse_unquote_plus = urllib.parse.unquote_plus +compat_urllib_parse_unquote_to_bytes = urllib.parse.unquote_to_bytes +compat_urllib_parse_urlunparse = urllib.parse.urlunparse +compat_urllib_request = urllib.request +compat_urllib_request_DataHandler = urllib.request.DataHandler +compat_urllib_response = urllib.response +compat_urlretrieve = urllib.request.urlretrieve +compat_xml_parse_error = etree.ParseError +compat_xpath = lambda xpath: xpath +compat_zip = zip +workaround_optparse_bug9161 = lambda: None diff --git a/hypervideo_dl/compat/compat_utils.py b/hypervideo_dl/compat/compat_utils.py new file mode 100644 index 0000000..1bf6566 --- /dev/null +++ b/hypervideo_dl/compat/compat_utils.py @@ -0,0 +1,70 @@ +import collections +import contextlib +import importlib +import sys +import types + 
+_NO_ATTRIBUTE = object() + +_Package = collections.namedtuple('Package', ('name', 'version')) + + +def get_package_info(module): + parent = module.__name__.split('.')[0] + parent_module = None + with contextlib.suppress(ImportError): + parent_module = importlib.import_module(parent) + + for attr in ('__version__', 'version_string', 'version'): + version = getattr(parent_module, attr, None) + if version is not None: + break + return _Package(getattr(module, '_hypervideo_dl__identifier', parent), str(version)) + + +def _is_package(module): + try: + module.__getattribute__('__path__') + except AttributeError: + return False + return True + + +def passthrough_module(parent, child, allowed_attributes=None, *, callback=lambda _: None): + parent_module = importlib.import_module(parent) + child_module = None # Import child module only as needed + + class PassthroughModule(types.ModuleType): + def __getattr__(self, attr): + if _is_package(parent_module): + with contextlib.suppress(ImportError): + return importlib.import_module(f'.{attr}', parent) + + ret = self.__from_child(attr) + if ret is _NO_ATTRIBUTE: + raise AttributeError(f'module {parent} has no attribute {attr}') + callback(attr) + return ret + + def __from_child(self, attr): + if allowed_attributes is None: + if attr.startswith('__') and attr.endswith('__'): + return _NO_ATTRIBUTE + elif attr not in allowed_attributes: + return _NO_ATTRIBUTE + + nonlocal child_module + child_module = child_module or importlib.import_module(child, parent) + + with contextlib.suppress(AttributeError): + return getattr(child_module, attr) + + if _is_package(child_module): + with contextlib.suppress(ImportError): + return importlib.import_module(f'.{attr}', child) + + return _NO_ATTRIBUTE + + # Python 3.6 does not have module level __getattr__ + # https://peps.python.org/pep-0562/ + sys.modules[parent].__class__ = PassthroughModule diff --git a/hypervideo_dl/compat/functools.py b/hypervideo_dl/compat/functools.py new file mode 100644 index 0000000..ec003ea --- /dev/null +++ b/hypervideo_dl/compat/functools.py @@ -0,0 +1,26 @@ +# flake8: noqa: F405 +from functools import * # noqa: F403 + +from .compat_utils import passthrough_module + +passthrough_module(__name__, 'functools') +del passthrough_module + +try: + cache # >= 3.9 +except NameError: + cache = lru_cache(maxsize=None) + +try: + cached_property # >= 3.8 +except NameError: + class cached_property: + def __init__(self, func): + update_wrapper(self, func) + self.func = func + + def __get__(self, instance, _): + if instance is None: + return self + setattr(instance, self.func.__name__, self.func(instance)) + return getattr(instance, self.func.__name__) diff --git a/hypervideo_dl/compat/imghdr.py b/hypervideo_dl/compat/imghdr.py new file mode 100644 index 0000000..5d64ab0 --- /dev/null +++ b/hypervideo_dl/compat/imghdr.py @@ -0,0 +1,16 @@ +tests = { + 'webp': lambda h: h[0:4] == b'RIFF' and h[8:] == b'WEBP', + 'png': lambda h: h[:8] == b'\211PNG\r\n\032\n', + 'jpeg': lambda h: h[6:10] in (b'JFIF', b'Exif'), + 'gif': lambda h: h[:6] in (b'GIF87a', b'GIF89a'), +} + + +def what(file=None, h=None): + """Detect format of image (Currently supports jpeg, png, webp, gif only) + Ref: https://github.com/python/cpython/blob/3.10/Lib/imghdr.py + """ + if h is None: + with open(file, 'rb') as f: + h = f.read(12) + return next((type_ for type_, test in tests.items() if test(h)), None) diff --git a/hypervideo_dl/compat/shutil.py b/hypervideo_dl/compat/shutil.py new file mode 100644 index 0000000..23239d5 --- /dev/null 
+++ b/hypervideo_dl/compat/shutil.py @@ -0,0 +1,30 @@ +# flake8: noqa: F405 +from shutil import * # noqa: F403 + +from .compat_utils import passthrough_module + +passthrough_module(__name__, 'shutil') +del passthrough_module + + +import sys + +if sys.platform.startswith('freebsd'): + import errno + import os + import shutil + + # Workaround for PermissionError when using restricted ACL mode on FreeBSD + def copy2(src, dst, *args, **kwargs): + if os.path.isdir(dst): + dst = os.path.join(dst, os.path.basename(src)) + shutil.copyfile(src, dst, *args, **kwargs) + try: + shutil.copystat(src, dst, *args, **kwargs) + except PermissionError as e: + if e.errno != getattr(errno, 'EPERM', None): + raise + return dst + + def move(*args, copy_function=copy2, **kwargs): + return shutil.move(*args, copy_function=copy_function, **kwargs) diff --git a/hypervideo_dl/cookies.py b/hypervideo_dl/cookies.py index f963729..97457a1 100644 --- a/hypervideo_dl/cookies.py +++ b/hypervideo_dl/cookies.py @@ -1,12 +1,16 @@ +import base64 import contextlib -import ctypes +import http.cookiejar +import http.cookies import json import os +import re import shutil import struct import subprocess import sys import tempfile +import time from datetime import datetime, timedelta, timezone from enum import Enum, auto from hashlib import pbkdf2_hmac @@ -16,39 +20,21 @@ from .aes import ( aes_gcm_decrypt_and_verify_bytes, unpad_pkcs7, ) -from .compat import ( - compat_b64decode, - compat_cookiejar_Cookie, +from .dependencies import ( + _SECRETSTORAGE_UNAVAILABLE_REASON, + secretstorage, + sqlite3, ) +from .minicurses import MultilinePrinter, QuietMultilinePrinter from .utils import ( - error_to_str, - expand_path, Popen, YoutubeDLCookieJar, + error_to_str, + expand_path, + is_path_like, + try_call, ) -try: - import sqlite3 - SQLITE_AVAILABLE = True -except ImportError: - # although sqlite3 is part of the standard library, it is possible to compile python without - # sqlite support. See: https://github.com/hypervideo/hypervideo/issues/544 - SQLITE_AVAILABLE = False - - -try: - import secretstorage - SECRETSTORAGE_AVAILABLE = True -except ImportError: - SECRETSTORAGE_AVAILABLE = False - SECRETSTORAGE_UNAVAILABLE_REASON = ( - 'as the `secretstorage` module is not installed. ' - 'Please install by running `python3 -m pip install secretstorage`.') -except Exception as _err: - SECRETSTORAGE_AVAILABLE = False - SECRETSTORAGE_UNAVAILABLE_REASON = f'as the `secretstorage` module could not be initialized. {_err}' - - CHROMIUM_BASED_BROWSERS = {'brave', 'chrome', 'chromium', 'edge', 'opera', 'vivaldi'} SUPPORTED_BROWSERS = CHROMIUM_BASED_BROWSERS | {'firefox', 'safari'} @@ -73,37 +59,72 @@ class YDLLogger: if self._ydl: self._ydl.report_error(message) + class ProgressBar(MultilinePrinter): + _DELAY, _timer = 0.1, 0 + + def print(self, message): + if time.time() - self._timer > self._DELAY: + self.print_at_line(f'[Cookies] {message}', 0) + self._timer = time.time() + + def progress_bar(self): + """Return a context manager with a print method. 
(Optional)""" + # Do not print to files/pipes, loggers, or when --no-progress is used + if not self._ydl or self._ydl.params.get('noprogress') or self._ydl.params.get('logger'): + return + file = self._ydl._out_files.error + try: + if not file.isatty(): + return + except BaseException: + return + return self.ProgressBar(file, preserve_output=False) + + +def _create_progress_bar(logger): + if hasattr(logger, 'progress_bar'): + printer = logger.progress_bar() + if printer: + return printer + printer = QuietMultilinePrinter() + printer.print = lambda _: None + return printer + def load_cookies(cookie_file, browser_specification, ydl): cookie_jars = [] if browser_specification is not None: - browser_name, profile, keyring = _parse_browser_specification(*browser_specification) - cookie_jars.append(extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring)) + browser_name, profile, keyring, container = _parse_browser_specification(*browser_specification) + cookie_jars.append( + extract_cookies_from_browser(browser_name, profile, YDLLogger(ydl), keyring=keyring, container=container)) if cookie_file is not None: - cookie_file = expand_path(cookie_file) + is_filename = is_path_like(cookie_file) + if is_filename: + cookie_file = expand_path(cookie_file) + jar = YoutubeDLCookieJar(cookie_file) - if os.access(cookie_file, os.R_OK): + if not is_filename or os.access(cookie_file, os.R_OK): jar.load(ignore_discard=True, ignore_expires=True) cookie_jars.append(jar) return _merge_cookie_jars(cookie_jars) -def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None): +def extract_cookies_from_browser(browser_name, profile=None, logger=YDLLogger(), *, keyring=None, container=None): if browser_name == 'firefox': - return _extract_firefox_cookies(profile, logger) + return _extract_firefox_cookies(profile, container, logger) elif browser_name == 'safari': return _extract_safari_cookies(profile, logger) elif browser_name in CHROMIUM_BASED_BROWSERS: return _extract_chrome_cookies(browser_name, profile, keyring, logger) else: - raise ValueError('unknown browser: {}'.format(browser_name)) + raise ValueError(f'unknown browser: {browser_name}') -def _extract_firefox_cookies(profile, logger): +def _extract_firefox_cookies(profile, container, logger): logger.info('Extracting cookies from firefox') - if not SQLITE_AVAILABLE: + if not sqlite3: logger.warning('Cannot extract cookies from firefox without sqlite3 support. 
' 'Please use a python interpreter compiled with sqlite3 support') return YoutubeDLCookieJar() @@ -115,25 +136,54 @@ def _extract_firefox_cookies(profile, logger): else: search_root = os.path.join(_firefox_browser_dir(), profile) - cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite') + cookie_database_path = _find_most_recently_used_file(search_root, 'cookies.sqlite', logger) if cookie_database_path is None: - raise FileNotFoundError('could not find firefox cookies database in {}'.format(search_root)) - logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) + raise FileNotFoundError(f'could not find firefox cookies database in {search_root}') + logger.debug(f'Extracting cookies from: "{cookie_database_path}"') + + container_id = None + if container not in (None, 'none'): + containers_path = os.path.join(os.path.dirname(cookie_database_path), 'containers.json') + if not os.path.isfile(containers_path) or not os.access(containers_path, os.R_OK): + raise FileNotFoundError(f'could not read containers.json in {search_root}') + with open(containers_path) as containers: + identities = json.load(containers).get('identities', []) + container_id = next((context.get('userContextId') for context in identities if container in ( + context.get('name'), + try_call(lambda: re.fullmatch(r'userContext([^\.]+)\.label', context['l10nID']).group()) + )), None) + if not isinstance(container_id, int): + raise ValueError(f'could not find firefox container "{container}" in containers.json') with tempfile.TemporaryDirectory(prefix='hypervideo_dl') as tmpdir: cursor = None try: cursor = _open_database_copy(cookie_database_path, tmpdir) - cursor.execute('SELECT host, name, value, path, expiry, isSecure FROM moz_cookies') + if isinstance(container_id, int): + logger.debug( + f'Only loading cookies from firefox container "{container}", ID {container_id}') + cursor.execute( + 'SELECT host, name, value, path, expiry, isSecure FROM moz_cookies WHERE originAttributes LIKE ? 
OR originAttributes LIKE ?', + (f'%userContextId={container_id}', f'%userContextId={container_id}&%')) + elif container == 'none': + logger.debug('Only loading cookies not belonging to any container') + cursor.execute( + 'SELECT host, name, value, path, expiry, isSecure FROM moz_cookies WHERE NOT INSTR(originAttributes,"userContextId=")') + else: + cursor.execute('SELECT host, name, value, path, expiry, isSecure FROM moz_cookies') jar = YoutubeDLCookieJar() - for host, name, value, path, expiry, is_secure in cursor.fetchall(): - cookie = compat_cookiejar_Cookie( - version=0, name=name, value=value, port=None, port_specified=False, - domain=host, domain_specified=bool(host), domain_initial_dot=host.startswith('.'), - path=path, path_specified=bool(path), secure=is_secure, expires=expiry, discard=False, - comment=None, comment_url=None, rest={}) - jar.set_cookie(cookie) - logger.info('Extracted {} cookies from firefox'.format(len(jar))) + with _create_progress_bar(logger) as progress_bar: + table = cursor.fetchall() + total_cookie_count = len(table) + for i, (host, name, value, path, expiry, is_secure) in enumerate(table): + progress_bar.print(f'Loading cookie {i: 6d}/{total_cookie_count: 6d}') + cookie = http.cookiejar.Cookie( + version=0, name=name, value=value, port=None, port_specified=False, + domain=host, domain_specified=bool(host), domain_initial_dot=host.startswith('.'), + path=path, path_specified=bool(path), secure=is_secure, expires=expiry, discard=False, + comment=None, comment_url=None, rest={}) + jar.set_cookie(cookie) + logger.info(f'Extracted {len(jar)} cookies from firefox') return jar finally: if cursor is not None: @@ -141,39 +191,25 @@ def _extract_firefox_cookies(profile, logger): def _firefox_browser_dir(): - if sys.platform in ('linux', 'linux2'): - return os.path.expanduser('~/.mozilla/firefox') - elif sys.platform == 'win32': - return os.path.expandvars(r'%APPDATA%\Mozilla\Firefox\Profiles') + if sys.platform in ('cygwin', 'win32'): + return os.path.expandvars(R'%APPDATA%\Mozilla\Firefox\Profiles') elif sys.platform == 'darwin': return os.path.expanduser('~/Library/Application Support/Firefox') - else: - raise ValueError('unsupported platform: {}'.format(sys.platform)) + return os.path.expanduser('~/.mozilla/firefox') def _get_chromium_based_browser_settings(browser_name): # https://chromium.googlesource.com/chromium/src/+/HEAD/docs/user_data_dir.md - if sys.platform in ('linux', 'linux2'): - config = _config_home() - browser_dir = { - 'brave': os.path.join(config, 'BraveSoftware/Brave-Browser'), - 'chrome': os.path.join(config, 'google-chrome'), - 'chromium': os.path.join(config, 'chromium'), - 'edge': os.path.join(config, 'microsoft-edge'), - 'opera': os.path.join(config, 'opera'), - 'vivaldi': os.path.join(config, 'vivaldi'), - }[browser_name] - - elif sys.platform == 'win32': + if sys.platform in ('cygwin', 'win32'): appdata_local = os.path.expandvars('%LOCALAPPDATA%') appdata_roaming = os.path.expandvars('%APPDATA%') browser_dir = { - 'brave': os.path.join(appdata_local, r'BraveSoftware\Brave-Browser\User Data'), - 'chrome': os.path.join(appdata_local, r'Google\Chrome\User Data'), - 'chromium': os.path.join(appdata_local, r'Chromium\User Data'), - 'edge': os.path.join(appdata_local, r'Microsoft\Edge\User Data'), - 'opera': os.path.join(appdata_roaming, r'Opera Software\Opera Stable'), - 'vivaldi': os.path.join(appdata_local, r'Vivaldi\User Data'), + 'brave': os.path.join(appdata_local, R'BraveSoftware\Brave-Browser\User Data'), + 'chrome': 
os.path.join(appdata_local, R'Google\Chrome\User Data'), + 'chromium': os.path.join(appdata_local, R'Chromium\User Data'), + 'edge': os.path.join(appdata_local, R'Microsoft\Edge\User Data'), + 'opera': os.path.join(appdata_roaming, R'Opera Software\Opera Stable'), + 'vivaldi': os.path.join(appdata_local, R'Vivaldi\User Data'), }[browser_name] elif sys.platform == 'darwin': @@ -188,7 +224,15 @@ def _get_chromium_based_browser_settings(browser_name): }[browser_name] else: - raise ValueError('unsupported platform: {}'.format(sys.platform)) + config = _config_home() + browser_dir = { + 'brave': os.path.join(config, 'BraveSoftware/Brave-Browser'), + 'chrome': os.path.join(config, 'google-chrome'), + 'chromium': os.path.join(config, 'chromium'), + 'edge': os.path.join(config, 'microsoft-edge'), + 'opera': os.path.join(config, 'opera'), + 'vivaldi': os.path.join(config, 'vivaldi'), + }[browser_name] # Linux keyring names can be determined by snooping on dbus while opening the browser in KDE: # dbus-monitor "interface='org.kde.KWallet'" "type=method_return" @@ -211,11 +255,11 @@ def _get_chromium_based_browser_settings(browser_name): def _extract_chrome_cookies(browser_name, profile, keyring, logger): - logger.info('Extracting cookies from {}'.format(browser_name)) + logger.info(f'Extracting cookies from {browser_name}') - if not SQLITE_AVAILABLE: - logger.warning(('Cannot extract cookies from {} without sqlite3 support. ' - 'Please use a python interpreter compiled with sqlite3 support').format(browser_name)) + if not sqlite3: + logger.warning(f'Cannot extract cookies from {browser_name} without sqlite3 support. ' + 'Please use a python interpreter compiled with sqlite3 support') return YoutubeDLCookieJar() config = _get_chromium_based_browser_settings(browser_name) @@ -229,13 +273,13 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger): if config['supports_profiles']: search_root = os.path.join(config['browser_dir'], profile) else: - logger.error('{} does not support profiles'.format(browser_name)) + logger.error(f'{browser_name} does not support profiles') search_root = config['browser_dir'] - cookie_database_path = _find_most_recently_used_file(search_root, 'Cookies') + cookie_database_path = _find_most_recently_used_file(search_root, 'Cookies', logger) if cookie_database_path is None: - raise FileNotFoundError('could not find {} cookies database in "{}"'.format(browser_name, search_root)) - logger.debug('Extracting cookies from: "{}"'.format(cookie_database_path)) + raise FileNotFoundError(f'could not find {browser_name} cookies database in "{search_root}"') + logger.debug(f'Extracting cookies from: "{cookie_database_path}"') decryptor = get_cookie_decryptor(config['browser_dir'], config['keyring_name'], logger, keyring=keyring) @@ -246,45 +290,55 @@ def _extract_chrome_cookies(browser_name, profile, keyring, logger): cursor.connection.text_factory = bytes column_names = _get_column_names(cursor, 'cookies') secure_column = 'is_secure' if 'is_secure' in column_names else 'secure' - cursor.execute('SELECT host_key, name, value, encrypted_value, path, ' - 'expires_utc, {} FROM cookies'.format(secure_column)) + cursor.execute(f'SELECT host_key, name, value, encrypted_value, path, expires_utc, {secure_column} FROM cookies') jar = YoutubeDLCookieJar() failed_cookies = 0 unencrypted_cookies = 0 - for host_key, name, value, encrypted_value, path, expires_utc, is_secure in cursor.fetchall(): - host_key = host_key.decode('utf-8') - name = name.decode('utf-8') - value = 
value.decode('utf-8') - path = path.decode('utf-8') - - if not value and encrypted_value: - value = decryptor.decrypt(encrypted_value) - if value is None: + with _create_progress_bar(logger) as progress_bar: + table = cursor.fetchall() + total_cookie_count = len(table) + for i, line in enumerate(table): + progress_bar.print(f'Loading cookie {i: 6d}/{total_cookie_count: 6d}') + is_encrypted, cookie = _process_chrome_cookie(decryptor, *line) + if not cookie: failed_cookies += 1 continue - else: - unencrypted_cookies += 1 - - cookie = compat_cookiejar_Cookie( - version=0, name=name, value=value, port=None, port_specified=False, - domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'), - path=path, path_specified=bool(path), secure=is_secure, expires=expires_utc, discard=False, - comment=None, comment_url=None, rest={}) - jar.set_cookie(cookie) + elif not is_encrypted: + unencrypted_cookies += 1 + jar.set_cookie(cookie) if failed_cookies > 0: - failed_message = ' ({} could not be decrypted)'.format(failed_cookies) + failed_message = f' ({failed_cookies} could not be decrypted)' else: failed_message = '' - logger.info('Extracted {} cookies from {}{}'.format(len(jar), browser_name, failed_message)) - counts = decryptor.cookie_counts.copy() + logger.info(f'Extracted {len(jar)} cookies from {browser_name}{failed_message}') + counts = decryptor._cookie_counts.copy() counts['unencrypted'] = unencrypted_cookies - logger.debug('cookie version breakdown: {}'.format(counts)) + logger.debug(f'cookie version breakdown: {counts}') return jar finally: if cursor is not None: cursor.connection.close() +def _process_chrome_cookie(decryptor, host_key, name, value, encrypted_value, path, expires_utc, is_secure): + host_key = host_key.decode() + name = name.decode() + value = value.decode() + path = path.decode() + is_encrypted = not value and encrypted_value + + if is_encrypted: + value = decryptor.decrypt(encrypted_value) + if value is None: + return is_encrypted, None + + return is_encrypted, http.cookiejar.Cookie( + version=0, name=name, value=value, port=None, port_specified=False, + domain=host_key, domain_specified=bool(host_key), domain_initial_dot=host_key.startswith('.'), + path=path, path_specified=bool(path), secure=is_secure, expires=expires_utc, discard=False, + comment=None, comment_url=None, rest={}) + + class ChromeCookieDecryptor: """ Overview: @@ -311,24 +365,18 @@ class ChromeCookieDecryptor: - KeyStorageLinux::CreateService """ - def decrypt(self, encrypted_value): - raise NotImplementedError + _cookie_counts = {} - @property - def cookie_counts(self): - raise NotImplementedError + def decrypt(self, encrypted_value): + raise NotImplementedError('Must be implemented by sub classes') def get_cookie_decryptor(browser_root, browser_keyring_name, logger, *, keyring=None): - if sys.platform in ('linux', 'linux2'): - return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) - elif sys.platform == 'darwin': + if sys.platform == 'darwin': return MacChromeCookieDecryptor(browser_keyring_name, logger) - elif sys.platform == 'win32': + elif sys.platform in ('win32', 'cygwin'): return WindowsChromeCookieDecryptor(browser_root, logger) - else: - raise NotImplementedError('Chrome cookie decryption is not supported ' - 'on this platform: {}'.format(sys.platform)) + return LinuxChromeCookieDecryptor(browser_keyring_name, logger, keyring=keyring) class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): @@ -345,10 +393,6 @@ class 
LinuxChromeCookieDecryptor(ChromeCookieDecryptor): # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_linux.cc return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1, key_length=16) - @property - def cookie_counts(self): - return self._cookie_counts - def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] @@ -382,10 +426,6 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_mac.mm return pbkdf2_sha1(password, salt=b'saltysalt', iterations=1003, key_length=16) - @property - def cookie_counts(self): - return self._cookie_counts - def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] @@ -411,10 +451,6 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): self._v10_key = _get_windows_v10_key(browser_root, logger) self._cookie_counts = {'v10': 0, 'other': 0} - @property - def cookie_counts(self): - return self._cookie_counts - def decrypt(self, encrypted_value): version = encrypted_value[:3] ciphertext = encrypted_value[3:] @@ -443,14 +479,14 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): self._cookie_counts['other'] += 1 # any other prefix means the data is DPAPI encrypted # https://chromium.googlesource.com/chromium/src/+/refs/heads/main/components/os_crypt/os_crypt_win.cc - return _decrypt_windows_dpapi(encrypted_value, self._logger).decode('utf-8') + return _decrypt_windows_dpapi(encrypted_value, self._logger).decode() def _extract_safari_cookies(profile, logger): if profile is not None: logger.error('safari does not support profiles') if sys.platform != 'darwin': - raise ValueError('unsupported platform: {}'.format(sys.platform)) + raise ValueError(f'unsupported platform: {sys.platform}') cookies_path = os.path.expanduser('~/Library/Cookies/Cookies.binarycookies') @@ -464,7 +500,7 @@ def _extract_safari_cookies(profile, logger): cookies_data = f.read() jar = parse_safari_cookies(cookies_data, logger=logger) - logger.info('Extracted {} cookies from safari'.format(len(jar))) + logger.info(f'Extracted {len(jar)} cookies from safari') return jar @@ -480,7 +516,7 @@ class DataParser: def read_bytes(self, num_bytes): if num_bytes < 0: - raise ParserError('invalid read of {} bytes'.format(num_bytes)) + raise ParserError(f'invalid read of {num_bytes} bytes') end = self.cursor + num_bytes if end > len(self._data): raise ParserError('reached end of input') @@ -491,7 +527,7 @@ class DataParser: def expect_bytes(self, expected_value, message): value = self.read_bytes(len(expected_value)) if value != expected_value: - raise ParserError('unexpected value: {} != {} ({})'.format(value, expected_value, message)) + raise ParserError(f'unexpected value: {value} != {expected_value} ({message})') def read_uint(self, big_endian=False): data_format = '>I' if big_endian else '<I' @@ -506,16 +542,15 @@ class DataParser: while True: c = self.read_bytes(1) if c == b'\x00': - return b''.join(buffer).decode('utf-8') + return b''.join(buffer).decode() else: buffer.append(c) def skip(self, num_bytes, description='unknown'): if num_bytes > 0: - self._logger.debug('skipping {} bytes ({}): {}'.format( - num_bytes, description, self.read_bytes(num_bytes))) + self._logger.debug(f'skipping {num_bytes} bytes ({description}): {self.read_bytes(num_bytes)!r}') elif num_bytes < 0: - raise ParserError('invalid skip of {} bytes'.format(num_bytes)) + raise 
ParserError(f'invalid skip of {num_bytes} bytes') def skip_to(self, offset, description='unknown'): self.skip(offset - self.cursor, description) @@ -542,15 +577,17 @@ def _parse_safari_cookies_page(data, jar, logger): number_of_cookies = p.read_uint() record_offsets = [p.read_uint() for _ in range(number_of_cookies)] if number_of_cookies == 0: - logger.debug('a cookies page of size {} has no cookies'.format(len(data))) + logger.debug(f'a cookies page of size {len(data)} has no cookies') return p.skip_to(record_offsets[0], 'unknown page header field') - for record_offset in record_offsets: - p.skip_to(record_offset, 'space between records') - record_length = _parse_safari_cookies_record(data[record_offset:], jar, logger) - p.read_bytes(record_length) + with _create_progress_bar(logger) as progress_bar: + for i, record_offset in enumerate(record_offsets): + progress_bar.print(f'Loading cookie {i: 6d}/{number_of_cookies: 6d}') + p.skip_to(record_offset, 'space between records') + record_length = _parse_safari_cookies_record(data[record_offset:], jar, logger) + p.read_bytes(record_length) p.skip_to_end('space in between pages') @@ -587,7 +624,7 @@ def _parse_safari_cookies_record(data, jar, logger): p.skip_to(record_size, 'space at the end of the record') - cookie = compat_cookiejar_Cookie( + cookie = http.cookiejar.Cookie( version=0, name=name, value=value, port=None, port_specified=False, domain=domain, domain_specified=bool(domain), domain_initial_dot=domain.startswith('.'), path=path, path_specified=bool(path), secure=is_secure, expires=expiration_date, discard=False, @@ -686,7 +723,7 @@ def _choose_linux_keyring(logger): SelectBackend """ desktop_environment = _get_linux_desktop_environment(os.environ) - logger.debug('detected desktop environment: {}'.format(desktop_environment.name)) + logger.debug(f'detected desktop environment: {desktop_environment.name}') if desktop_environment == _LinuxDesktopEnvironment.KDE: linux_keyring = _LinuxKeyring.KWALLET elif desktop_environment == _LinuxDesktopEnvironment.OTHER: @@ -707,23 +744,21 @@ def _get_kwallet_network_wallet(logger): """ default_wallet = 'kdewallet' try: - proc = Popen([ + stdout, _, returncode = Popen.run([ 'dbus-send', '--session', '--print-reply=literal', '--dest=org.kde.kwalletd5', '/modules/kwalletd5', 'org.kde.KWallet.networkWallet' - ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + ], text=True, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - stdout, stderr = proc.communicate_or_kill() - if proc.returncode != 0: + if returncode: logger.warning('failed to read NetworkWallet') return default_wallet else: - network_wallet = stdout.decode('utf-8').strip() - logger.debug('NetworkWallet = "{}"'.format(network_wallet)) - return network_wallet + logger.debug(f'NetworkWallet = "{stdout.strip()}"') + return stdout.strip() except Exception as e: - logger.warning('exception while obtaining NetworkWallet: {}'.format(e)) + logger.warning(f'exception while obtaining NetworkWallet: {e}') return default_wallet @@ -739,17 +774,16 @@ def _get_kwallet_password(browser_keyring_name, logger): network_wallet = _get_kwallet_network_wallet(logger) try: - proc = Popen([ + stdout, _, returncode = Popen.run([ 'kwallet-query', - '--read-password', '{} Safe Storage'.format(browser_keyring_name), - '--folder', '{} Keys'.format(browser_keyring_name), + '--read-password', f'{browser_keyring_name} Safe Storage', + '--folder', f'{browser_keyring_name} Keys', network_wallet ], stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - stdout, stderr = 
proc.communicate_or_kill() - if proc.returncode != 0: - logger.error('kwallet-query failed with return code {}. Please consult ' - 'the kwallet-query man page for details'.format(proc.returncode)) + if returncode: + logger.error(f'kwallet-query failed with return code {returncode}. ' + 'Please consult the kwallet-query man page for details') return b'' else: if stdout.lower().startswith(b'failed to read'): @@ -764,17 +798,15 @@ def _get_kwallet_password(browser_keyring_name, logger): return b'' else: logger.debug('password found') - if stdout[-1:] == b'\n': - stdout = stdout[:-1] - return stdout + return stdout.rstrip(b'\n') except Exception as e: logger.warning(f'exception running kwallet-query: {error_to_str(e)}') return b'' def _get_gnome_keyring_password(browser_keyring_name, logger): - if not SECRETSTORAGE_AVAILABLE: - logger.error('secretstorage not available {}'.format(SECRETSTORAGE_UNAVAILABLE_REASON)) + if not secretstorage: + logger.error(f'secretstorage not available {_SECRETSTORAGE_UNAVAILABLE_REASON}') return b'' # the Gnome keyring does not seem to organise keys in the same way as KWallet, # using `dbus-monitor` during startup, it can be observed that chromium lists all keys @@ -783,7 +815,7 @@ def _get_gnome_keyring_password(browser_keyring_name, logger): with contextlib.closing(secretstorage.dbus_init()) as con: col = secretstorage.get_default_collection(con) for item in col.get_all_items(): - if item.get_label() == '{} Safe Storage'.format(browser_keyring_name): + if item.get_label() == f'{browser_keyring_name} Safe Storage': return item.get_secret() else: logger.error('failed to read from keyring') @@ -813,35 +845,35 @@ def _get_linux_keyring_password(browser_keyring_name, keyring, logger): def _get_mac_keyring_password(browser_keyring_name, logger): logger.debug('using find-generic-password to obtain password from OSX keychain') try: - proc = Popen( + stdout, _, returncode = Popen.run( ['security', 'find-generic-password', '-w', # write password to stdout '-a', browser_keyring_name, # match 'account' - '-s', '{} Safe Storage'.format(browser_keyring_name)], # match 'service' + '-s', f'{browser_keyring_name} Safe Storage'], # match 'service' stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - - stdout, stderr = proc.communicate_or_kill() - if stdout[-1:] == b'\n': - stdout = stdout[:-1] - return stdout + if returncode: + logger.warning('find-generic-password failed') + return None + return stdout.rstrip(b'\n') except Exception as e: logger.warning(f'exception running find-generic-password: {error_to_str(e)}') return None def _get_windows_v10_key(browser_root, logger): - path = _find_most_recently_used_file(browser_root, 'Local State') + path = _find_most_recently_used_file(browser_root, 'Local State', logger) if path is None: logger.error('could not find local state file') return None - with open(path, 'r', encoding='utf8') as f: + logger.debug(f'Found local state file at "{path}"') + with open(path, encoding='utf8') as f: data = json.load(f) try: base64_key = data['os_crypt']['encrypted_key'] except KeyError: logger.error('no encrypted key in Local State') return None - encrypted_key = compat_b64decode(base64_key) + encrypted_key = base64.b64decode(base64_key) prefix = b'DPAPI' if not encrypted_key.startswith(prefix): logger.error('invalid key') @@ -856,7 +888,7 @@ def pbkdf2_sha1(password, salt, iterations, key_length): def _decrypt_aes_cbc(ciphertext, key, logger, initialization_vector=b' ' * 16): plaintext = unpad_pkcs7(aes_cbc_decrypt_bytes(ciphertext, key, 
initialization_vector)) try: - return plaintext.decode('utf-8') + return plaintext.decode() except UnicodeDecodeError: logger.warning('failed to decrypt cookie (AES-CBC) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) return None @@ -870,7 +902,7 @@ def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag, logger): return None try: - return plaintext.decode('utf-8') + return plaintext.decode() except UnicodeDecodeError: logger.warning('failed to decrypt cookie (AES-GCM) because UTF-8 decoding failed. Possibly the key is wrong?', only_once=True) return None @@ -881,10 +913,12 @@ def _decrypt_windows_dpapi(ciphertext, logger): References: - https://docs.microsoft.com/en-us/windows/win32/api/dpapi/nf-dpapi-cryptunprotectdata """ - from ctypes.wintypes import DWORD + + import ctypes + import ctypes.wintypes class DATA_BLOB(ctypes.Structure): - _fields_ = [('cbData', DWORD), + _fields_ = [('cbData', ctypes.wintypes.DWORD), ('pbData', ctypes.POINTER(ctypes.c_char))] buffer = ctypes.create_string_buffer(ciphertext) @@ -921,17 +955,20 @@ def _open_database_copy(database_path, tmpdir): def _get_column_names(cursor, table_name): - table_info = cursor.execute('PRAGMA table_info({})'.format(table_name)).fetchall() - return [row[1].decode('utf-8') for row in table_info] + table_info = cursor.execute(f'PRAGMA table_info({table_name})').fetchall() + return [row[1].decode() for row in table_info] -def _find_most_recently_used_file(root, filename): +def _find_most_recently_used_file(root, filename, logger): # if there are multiple browser profiles, take the most recently used one - paths = [] - for root, dirs, files in os.walk(root): - for file in files: - if file == filename: - paths.append(os.path.join(root, file)) + i, paths = 0, [] + with _create_progress_bar(logger) as progress_bar: + for curr_root, dirs, files in os.walk(root): + for file in files: + i += 1 + progress_bar.print(f'Searching for "{filename}": {i: 6d} files searched') + if file == filename: + paths.append(os.path.join(curr_root, file)) return None if not paths else max(paths, key=lambda path: os.lstat(path).st_mtime) @@ -949,11 +986,102 @@ def _is_path(value): return os.path.sep in value -def _parse_browser_specification(browser_name, profile=None, keyring=None): +def _parse_browser_specification(browser_name, profile=None, keyring=None, container=None): if browser_name not in SUPPORTED_BROWSERS: raise ValueError(f'unsupported browser: "{browser_name}"') if keyring not in (None, *SUPPORTED_KEYRINGS): raise ValueError(f'unsupported keyring: "{keyring}"') - if profile is not None and _is_path(profile): - profile = os.path.expanduser(profile) - return browser_name, profile, keyring + if profile is not None and _is_path(expand_path(profile)): + profile = expand_path(profile) + return browser_name, profile, keyring, container + + +class LenientSimpleCookie(http.cookies.SimpleCookie): + """More lenient version of http.cookies.SimpleCookie""" + # From https://github.com/python/cpython/blob/v3.10.7/Lib/http/cookies.py + # We use Morsel's legal key chars to avoid errors on setting values + _LEGAL_KEY_CHARS = r'\w\d' + re.escape('!#$%&\'*+-.:^_`|~') + _LEGAL_VALUE_CHARS = _LEGAL_KEY_CHARS + re.escape('(),/<=>?@[]{}') + + _RESERVED = { + "expires", + "path", + "comment", + "domain", + "max-age", + "secure", + "httponly", + "version", + "samesite", + } + + _FLAGS = {"secure", "httponly"} + + # Added 'bad' group to catch the remaining value + _COOKIE_PATTERN = re.compile(r""" + \s* # Optional whitespace at start of 
cookie + (?P<key> # Start of group 'key' + [""" + _LEGAL_KEY_CHARS + r"""]+?# Any word of at least one letter + ) # End of group 'key' + ( # Optional group: there may not be a value. + \s*=\s* # Equal Sign + ( # Start of potential value + (?P<val> # Start of group 'val' + "(?:[^\\"]|\\.)*" # Any doublequoted string + | # or + \w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr + | # or + [""" + _LEGAL_VALUE_CHARS + r"""]* # Any word or empty string + ) # End of group 'val' + | # or + (?P<bad>(?:\\;|[^;])*?) # 'bad' group fallback for invalid values + ) # End of potential value + )? # End of optional value group + \s* # Any number of spaces. + (\s+|;|$) # Ending either at space, semicolon, or EOS. + """, re.ASCII | re.VERBOSE) + + def load(self, data): + # Workaround for https://github.com/hypervideo/hypervideo/issues/4776 + if not isinstance(data, str): + return super().load(data) + + morsel = None + for match in self._COOKIE_PATTERN.finditer(data): + if match.group('bad'): + morsel = None + continue + + key, value = match.group('key', 'val') + + is_attribute = False + if key.startswith('$'): + key = key[1:] + is_attribute = True + + lower_key = key.lower() + if lower_key in self._RESERVED: + if morsel is None: + continue + + if value is None: + if lower_key not in self._FLAGS: + morsel = None + continue + value = True + else: + value, _ = self.value_decode(value) + + morsel[key] = value + + elif is_attribute: + morsel = None + + elif value is not None: + morsel = self.get(key, http.cookies.Morsel()) + real_value, coded_value = self.value_decode(value) + morsel.set(key, real_value, coded_value) + self[key] = morsel + + else: + morsel = None diff --git a/hypervideo_dl/dependencies.py b/hypervideo_dl/dependencies.py new file mode 100644 index 0000000..a913169 --- /dev/null +++ b/hypervideo_dl/dependencies.py @@ -0,0 +1,97 @@ +# flake8: noqa: F401 +"""Imports all optional dependencies for the project. +An attribute "_hypervideo_dl__identifier" may be inserted into the module if it uses an ambiguous namespace""" + +try: + import brotlicffi as brotli +except ImportError: + try: + import brotli + except ImportError: + brotli = None + + +try: + import certifi +except ImportError: + certifi = None +else: + from os.path import exists as _path_exists + + # The certificate may not be bundled in executable + if not _path_exists(certifi.where()): + certifi = None + + +try: + from Cryptodome.Cipher import AES as Cryptodome_AES +except ImportError: + try: + from Crypto.Cipher import AES as Cryptodome_AES + except (ImportError, SyntaxError): # Old Crypto gives SyntaxError in newer Python + Cryptodome_AES = None + else: + try: + # In pycrypto, mode defaults to ECB. See: + # https://www.pycryptodome.org/en/latest/src/vs_pycrypto.html#:~:text=not%20have%20ECB%20as%20default%20mode + Cryptodome_AES.new(b'abcdefghijklmnop') + except TypeError: + pass + else: + Cryptodome_AES._hypervideo_dl__identifier = 'pycrypto' + + +try: + import mutagen +except ImportError: + mutagen = None + + +secretstorage = None +try: + import secretstorage + _SECRETSTORAGE_UNAVAILABLE_REASON = None +except ImportError: + _SECRETSTORAGE_UNAVAILABLE_REASON = ( + 'as the `secretstorage` module is not installed. ' + 'Please install by running `python3 -m pip install secretstorage`') +except Exception as _err: + _SECRETSTORAGE_UNAVAILABLE_REASON = f'as the `secretstorage` module could not be initialized. 
{_err}' + + +try: + import sqlite3 +except ImportError: + # although sqlite3 is part of the standard library, it is possible to compile python without + # sqlite support. See: https://github.com/hypervideo/hypervideo/issues/544 + sqlite3 = None + + +try: + import websockets +except (ImportError, SyntaxError): + # websockets 3.10 on python 3.6 causes SyntaxError + # See https://github.com/hypervideo/hypervideo/issues/2633 + websockets = None + + +try: + import xattr # xattr or pyxattr +except ImportError: + xattr = None +else: + if hasattr(xattr, 'set'): # pyxattr + xattr._hypervideo_dl__identifier = 'pyxattr' + + +all_dependencies = {k: v for k, v in globals().items() if not k.startswith('_')} + + +available_dependencies = {k: v for k, v in all_dependencies.items() if v} + + +__all__ = [ + 'all_dependencies', + 'available_dependencies', + *all_dependencies.keys(), +] diff --git a/hypervideo_dl/downloader/__init__.py b/hypervideo_dl/downloader/__init__.py index 96d484d..c34dbce 100644 --- a/hypervideo_dl/downloader/__init__.py +++ b/hypervideo_dl/downloader/__init__.py @@ -1,10 +1,4 @@ -from __future__ import unicode_literals - -from ..compat import compat_str -from ..utils import ( - determine_protocol, - NO_DEFAULT -) +from ..utils import NO_DEFAULT, determine_protocol def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=None, to_stdout=False): @@ -29,21 +23,18 @@ def get_suitable_downloader(info_dict, params={}, default=NO_DEFAULT, protocol=N # Some of these require get_suitable_downloader from .common import FileDownloader from .dash import DashSegmentsFD +from .external import FFmpegFD, get_external_downloader from .f4m import F4mFD from .fc2 import FC2LiveFD from .hls import HlsFD from .http import HttpFD -from .rtmp import RtmpFD -from .rtsp import RtspFD from .ism import IsmFD from .mhtml import MhtmlFD from .niconico import NiconicoDmcFD +from .rtmp import RtmpFD +from .rtsp import RtspFD from .websocket import WebSocketFragmentFD from .youtube_live_chat import YoutubeLiveChatFD -from .external import ( - get_external_downloader, - FFmpegFD, -) PROTOCOL_MAP = { 'rtmp': RtmpFD, @@ -68,10 +59,11 @@ PROTOCOL_MAP = { def shorten_protocol_name(proto, simplify=False): short_protocol_names = { - 'm3u8_native': 'm3u8_n', - 'rtmp_ffmpeg': 'rtmp_f', + 'm3u8_native': 'm3u8', + 'm3u8': 'm3u8F', + 'rtmp_ffmpeg': 'rtmpF', 'http_dash_segments': 'dash', - 'http_dash_segments_generator': 'dash_g', + 'http_dash_segments_generator': 'dashG', 'niconico_dmc': 'dmc', 'websocket_frag': 'WSfrag', } @@ -79,6 +71,7 @@ def shorten_protocol_name(proto, simplify=False): short_protocol_names.update({ 'https': 'http', 'ftps': 'ftp', + 'm3u8': 'm3u8', # Reverse above m3u8 mapping 'm3u8_native': 'm3u8', 'http_dash_segments_generator': 'dash', 'rtmp_ffmpeg': 'rtmp', @@ -93,13 +86,13 @@ def _get_suitable_downloader(info_dict, protocol, params, default): if default is NO_DEFAULT: default = HttpFD - # if (info_dict.get('start_time') or info_dict.get('end_time')) and not info_dict.get('requested_formats') and FFmpegFD.can_download(info_dict): - # return FFmpegFD + if (info_dict.get('section_start') or info_dict.get('section_end')) and FFmpegFD.can_download(info_dict): + return FFmpegFD info_dict['protocol'] = protocol downloaders = params.get('external_downloader') external_downloader = ( - downloaders if isinstance(downloaders, compat_str) or downloaders is None + downloaders if isinstance(downloaders, str) or downloaders is None else downloaders.get(shorten_protocol_name(protocol, True), 
downloaders.get('default'))) if external_downloader is None: diff --git a/hypervideo_dl/downloader/common.py b/hypervideo_dl/downloader/common.py index 7cef3e8..72d4822 100644 --- a/hypervideo_dl/downloader/common.py +++ b/hypervideo_dl/downloader/common.py @@ -1,30 +1,39 @@ -from __future__ import division, unicode_literals - +import contextlib +import errno +import functools import os +import random import re import time -import random -import errno +from ..minicurses import ( + BreaklineStatusPrinter, + MultilineLogger, + MultilinePrinter, + QuietMultilinePrinter, +) from ..utils import ( + IDENTITY, + NO_DEFAULT, + LockingUnsupportedError, + Namespace, + RetryManager, + classproperty, decodeArgument, encodeFilename, - error_to_compat_str, format_bytes, + join_nonempty, + parse_bytes, + remove_start, sanitize_open, shell_quote, timeconvert, timetuple_from_msec, -) -from ..minicurses import ( - MultilineLogger, - MultilinePrinter, - QuietMultilinePrinter, - BreaklineStatusPrinter + try_call, ) -class FileDownloader(object): +class FileDownloader: """File Downloader class. File downloader objects are the ones responsible of downloading the @@ -39,6 +48,7 @@ class FileDownloader(object): verbose: Print additional info to stdout. quiet: Do not print messages to stdout. ratelimit: Download speed limit, in bytes/sec. + continuedl: Attempt to continue downloads if possible throttledratelimit: Assume the download is being throttled below this speed (bytes/sec) retries: Number of times to retry for HTTP error 5xx file_access_retries: Number of times to retry on file access error @@ -62,6 +72,7 @@ class FileDownloader(object): useful for bypassing bandwidth throttling imposed by a webserver (experimental) progress_template: See YoutubeDL.py + retry_sleep_functions: See YoutubeDL.py Subclasses of this one must re-define the real_download method. 
""" @@ -71,21 +82,51 @@ class FileDownloader(object): def __init__(self, ydl, params): """Create a FileDownloader object with the given options.""" - self.ydl = ydl + self._set_ydl(ydl) self._progress_hooks = [] self.params = params self._prepare_multiline_status() self.add_progress_hook(self.report_progress) + def _set_ydl(self, ydl): + self.ydl = ydl + + for func in ( + 'deprecation_warning', + 'deprecated_feature', + 'report_error', + 'report_file_already_downloaded', + 'report_warning', + 'to_console_title', + 'to_stderr', + 'trouble', + 'write_debug', + ): + if not hasattr(self, func): + setattr(self, func, getattr(ydl, func)) + + def to_screen(self, *args, **kargs): + self.ydl.to_screen(*args, quiet=self.params.get('quiet'), **kargs) + + __to_screen = to_screen + + @classproperty + def FD_NAME(cls): + return re.sub(r'(?<=[a-z])(?=[A-Z])', '_', cls.__name__[:-2]).lower() + @staticmethod def format_seconds(seconds): + if seconds is None: + return ' Unknown' time = timetuple_from_msec(seconds * 1000) if time.hours > 99: return '--:--:--' - if not time.hours: - return '%02d:%02d' % time[1:-1] return '%02d:%02d:%02d' % time[:-1] + @classmethod + def format_eta(cls, seconds): + return f'{remove_start(cls.format_seconds(seconds), "00:"):>8s}' + @staticmethod def calc_percent(byte_counter, data_len): if data_len is None: @@ -94,11 +135,7 @@ class FileDownloader(object): @staticmethod def format_percent(percent): - if percent is None: - return '---.-%' - elif percent == 100: - return '100%' - return '%6s' % ('%3.1f%%' % percent) + return ' N/A%' if percent is None else f'{percent:>5.1f}%' @staticmethod def calc_eta(start, now, total, current): @@ -113,12 +150,6 @@ class FileDownloader(object): return int((float(total) - float(current)) / rate) @staticmethod - def format_eta(eta): - if eta is None: - return '--:--' - return FileDownloader.format_seconds(eta) - - @staticmethod def calc_speed(start, now, bytes): dif = now - start if bytes == 0 or dif < 0.001: # One millisecond @@ -127,13 +158,11 @@ class FileDownloader(object): @staticmethod def format_speed(speed): - if speed is None: - return '%10s' % '---b/s' - return '%10s' % ('%s/s' % format_bytes(speed)) + return ' Unknown B/s' if speed is None else f'{format_bytes(speed):>10s}/s' @staticmethod def format_retries(retries): - return 'inf' if retries == float('inf') else '%.0f' % retries + return 'inf' if retries == float('inf') else int(retries) @staticmethod def best_block_size(elapsed_time, bytes): @@ -151,33 +180,7 @@ class FileDownloader(object): @staticmethod def parse_bytes(bytestr): """Parse a string indicating a byte quantity into an integer.""" - matchobj = re.match(r'(?i)^(\d+(?:\.\d+)?)([kMGTPEZY]?)$', bytestr) - if matchobj is None: - return None - number = float(matchobj.group(1)) - multiplier = 1024.0 ** 'bkmgtpezy'.index(matchobj.group(2).lower()) - return int(round(number * multiplier)) - - def to_screen(self, *args, **kargs): - self.ydl.to_screen(*args, quiet=self.params.get('quiet'), **kargs) - - def to_stderr(self, message): - self.ydl.to_stderr(message) - - def to_console_title(self, message): - self.ydl.to_console_title(message) - - def trouble(self, *args, **kargs): - self.ydl.trouble(*args, **kargs) - - def report_warning(self, *args, **kargs): - self.ydl.report_warning(*args, **kargs) - - def report_error(self, *args, **kargs): - self.ydl.report_error(*args, **kargs) - - def write_debug(self, *args, **kargs): - self.ydl.write_debug(*args, **kargs) + parse_bytes(bytestr) def slow_down(self, start_time, now, 
byte_counter): """Sleep if the download speed is over the rate limit.""" @@ -211,30 +214,31 @@ class FileDownloader(object): return filename + '.ytdl' def wrap_file_access(action, *, fatal=False): - def outer(func): - def inner(self, *args, **kwargs): - file_access_retries = self.params.get('file_access_retries', 0) - retry = 0 - while True: - try: - return func(self, *args, **kwargs) - except (IOError, OSError) as err: - retry = retry + 1 - if retry > file_access_retries or err.errno not in (errno.EACCES, errno.EINVAL): - if not fatal: - self.report_error(f'unable to {action} file: {err}') - return - raise - self.to_screen( - f'[download] Unable to {action} file due to file access error. ' - f'Retrying (attempt {retry} of {self.format_retries(file_access_retries)}) ...') - time.sleep(0.01) - return inner - return outer + def error_callback(err, count, retries, *, fd): + return RetryManager.report_retry( + err, count, retries, info=fd.__to_screen, + warn=lambda e: (time.sleep(0.01), fd.to_screen(f'[download] Unable to {action} file: {e}')), + error=None if fatal else lambda e: fd.report_error(f'Unable to {action} file: {e}'), + sleep_func=fd.params.get('retry_sleep_functions', {}).get('file_access')) + + def wrapper(self, func, *args, **kwargs): + for retry in RetryManager(self.params.get('file_access_retries'), error_callback, fd=self): + try: + return func(self, *args, **kwargs) + except OSError as err: + if err.errno in (errno.EACCES, errno.EINVAL): + retry.error = err + continue + retry.error_callback(err, 1, 0) + + return functools.partial(functools.partialmethod, wrapper) @wrap_file_access('open', fatal=True) def sanitize_open(self, filename, open_mode): - return sanitize_open(filename, open_mode) + f, filename = sanitize_open(filename, open_mode) + if not getattr(f, 'locked', None): + self.write_debug(f'{LockingUnsupportedError.msg}. 
Proceeding without locking', only_once=True) + return f, filename @wrap_file_access('remove') def try_remove(self, filename): @@ -261,10 +265,8 @@ class FileDownloader(object): # Ignore obviously invalid dates if filetime == 0: return - try: + with contextlib.suppress(Exception): os.utime(filename, (time.time(), filetime)) - except Exception: - pass return filetime def report_destination(self, filename): @@ -277,26 +279,26 @@ class FileDownloader(object): elif self.ydl.params.get('logger'): self._multiline = MultilineLogger(self.ydl.params['logger'], lines) elif self.params.get('progress_with_newline'): - self._multiline = BreaklineStatusPrinter(self.ydl._out_files['screen'], lines) + self._multiline = BreaklineStatusPrinter(self.ydl._out_files.out, lines) else: - self._multiline = MultilinePrinter(self.ydl._out_files['screen'], lines, not self.params.get('quiet')) + self._multiline = MultilinePrinter(self.ydl._out_files.out, lines, not self.params.get('quiet')) self._multiline.allow_colors = self._multiline._HAVE_FULLCAP and not self.params.get('no_color') def _finish_multiline_status(self): self._multiline.end() - _progress_styles = { - 'downloaded_bytes': 'light blue', - 'percent': 'light blue', - 'eta': 'yellow', - 'speed': 'green', - 'elapsed': 'bold white', - 'total_bytes': '', - 'total_bytes_estimate': '', - } + ProgressStyles = Namespace( + downloaded_bytes='light blue', + percent='light blue', + eta='yellow', + speed='green', + elapsed='bold white', + total_bytes='', + total_bytes_estimate='', + ) def _report_progress_status(self, s, default_template): - for name, style in self._progress_styles.items(): + for name, style in self.ProgressStyles.items_: name = f'_{name}_str' if name not in s: continue @@ -320,78 +322,73 @@ class FileDownloader(object): self._multiline.stream, self._multiline.allow_colors, *args, **kwargs) def report_progress(self, s): + def with_fields(*tups, default=''): + for *fields, tmpl in tups: + if all(s.get(f) is not None for f in fields): + return tmpl + return default + + _format_bytes = lambda k: f'{format_bytes(s.get(k)):>10s}' + if s['status'] == 'finished': if self.params.get('noprogress'): self.to_screen('[download] Download completed') - msg_template = '100%%' - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template += ' of %(_total_bytes_str)s' - if s.get('elapsed') is not None: - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template += ' in %(_elapsed_str)s' - s['_percent_str'] = self.format_percent(100) - self._report_progress_status(s, msg_template) - return + speed = try_call(lambda: s['total_bytes'] / s['elapsed']) + s.update({ + 'speed': speed, + '_speed_str': self.format_speed(speed).strip(), + '_total_bytes_str': _format_bytes('total_bytes'), + '_elapsed_str': self.format_seconds(s.get('elapsed')), + '_percent_str': self.format_percent(100), + }) + self._report_progress_status(s, join_nonempty( + '100%%', + with_fields(('total_bytes', 'of %(_total_bytes_str)s')), + with_fields(('elapsed', 'in %(_elapsed_str)s')), + with_fields(('speed', 'at %(_speed_str)s')), + delim=' ')) if s['status'] != 'downloading': return - if s.get('eta') is not None: - s['_eta_str'] = self.format_eta(s['eta']) - else: - s['_eta_str'] = 'Unknown' - - if s.get('total_bytes') and s.get('downloaded_bytes') is not None: - s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes']) - elif s.get('total_bytes_estimate') and s.get('downloaded_bytes') is not None: - 
s['_percent_str'] = self.format_percent(100 * s['downloaded_bytes'] / s['total_bytes_estimate']) - else: - if s.get('downloaded_bytes') == 0: - s['_percent_str'] = self.format_percent(0) - else: - s['_percent_str'] = 'Unknown %' - - if s.get('speed') is not None: - s['_speed_str'] = self.format_speed(s['speed']) - else: - s['_speed_str'] = 'Unknown speed' - - if s.get('total_bytes') is not None: - s['_total_bytes_str'] = format_bytes(s['total_bytes']) - msg_template = '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s' - elif s.get('total_bytes_estimate') is not None: - s['_total_bytes_estimate_str'] = format_bytes(s['total_bytes_estimate']) - msg_template = '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s' - else: - if s.get('downloaded_bytes') is not None: - s['_downloaded_bytes_str'] = format_bytes(s['downloaded_bytes']) - if s.get('elapsed'): - s['_elapsed_str'] = self.format_seconds(s['elapsed']) - msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)' - else: - msg_template = '%(_downloaded_bytes_str)s at %(_speed_str)s' - else: - msg_template = '%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s' - if s.get('fragment_index') and s.get('fragment_count'): - msg_template += ' (frag %(fragment_index)s/%(fragment_count)s)' - elif s.get('fragment_index'): - msg_template += ' (frag %(fragment_index)s)' + s.update({ + '_eta_str': self.format_eta(s.get('eta')).strip(), + '_speed_str': self.format_speed(s.get('speed')), + '_percent_str': self.format_percent(try_call( + lambda: 100 * s['downloaded_bytes'] / s['total_bytes'], + lambda: 100 * s['downloaded_bytes'] / s['total_bytes_estimate'], + lambda: s['downloaded_bytes'] == 0 and 0)), + '_total_bytes_str': _format_bytes('total_bytes'), + '_total_bytes_estimate_str': _format_bytes('total_bytes_estimate'), + '_downloaded_bytes_str': _format_bytes('downloaded_bytes'), + '_elapsed_str': self.format_seconds(s.get('elapsed')), + }) + + msg_template = with_fields( + ('total_bytes', '%(_percent_str)s of %(_total_bytes_str)s at %(_speed_str)s ETA %(_eta_str)s'), + ('total_bytes_estimate', '%(_percent_str)s of ~%(_total_bytes_estimate_str)s at %(_speed_str)s ETA %(_eta_str)s'), + ('downloaded_bytes', 'elapsed', '%(_downloaded_bytes_str)s at %(_speed_str)s (%(_elapsed_str)s)'), + ('downloaded_bytes', '%(_downloaded_bytes_str)s at %(_speed_str)s'), + default='%(_percent_str)s at %(_speed_str)s ETA %(_eta_str)s') + + msg_template += with_fields( + ('fragment_index', 'fragment_count', ' (frag %(fragment_index)s/%(fragment_count)s)'), + ('fragment_index', ' (frag %(fragment_index)s)')) self._report_progress_status(s, msg_template) def report_resuming_byte(self, resume_len): """Report attempt to resume at given byte.""" self.to_screen('[download] Resuming download at byte %s' % resume_len) - def report_retry(self, err, count, retries): - """Report retry in case of HTTP error 5xx""" - self.to_screen( - '[download] Got server HTTP error: %s. Retrying (attempt %d of %s) ...' 
- % (error_to_compat_str(err), count, self.format_retries(retries))) - - def report_file_already_downloaded(self, *args, **kwargs): - """Report file has already been fully downloaded.""" - return self.ydl.report_file_already_downloaded(*args, **kwargs) + def report_retry(self, err, count, retries, frag_index=NO_DEFAULT, fatal=True): + """Report retry""" + is_frag = False if frag_index is NO_DEFAULT else 'fragment' + RetryManager.report_retry( + err, count, retries, info=self.__to_screen, + warn=lambda msg: self.__to_screen(f'[download] Got error: {msg}'), + error=IDENTITY if not fatal else lambda e: self.report_error(f'\r[download] Got error: {e}'), + sleep_func=self.params.get('retry_sleep_functions', {}).get(is_frag or 'http'), + suffix=f'fragment{"s" if frag_index is None else f" {frag_index}"}' if is_frag else None) def report_unable_to_resume(self): """Report it was impossible to resume download.""" @@ -431,25 +428,16 @@ class FileDownloader(object): self._finish_multiline_status() return True, False - if subtitle is False: - min_sleep_interval = self.params.get('sleep_interval') - if min_sleep_interval: - max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) - sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) - self.to_screen( - '[download] Sleeping %s seconds ...' % ( - int(sleep_interval) if sleep_interval.is_integer() - else '%.2f' % sleep_interval)) - time.sleep(sleep_interval) + if subtitle: + sleep_interval = self.params.get('sleep_interval_subtitles') or 0 else: - sleep_interval_sub = 0 - if type(self.params.get('sleep_interval_subtitles')) is int: - sleep_interval_sub = self.params.get('sleep_interval_subtitles') - if sleep_interval_sub > 0: - self.to_screen( - '[download] Sleeping %s seconds ...' % ( - sleep_interval_sub)) - time.sleep(sleep_interval_sub) + min_sleep_interval = self.params.get('sleep_interval') or 0 + sleep_interval = random.uniform( + min_sleep_interval, self.params.get('max_sleep_interval') or min_sleep_interval) + if sleep_interval > 0: + self.to_screen(f'[download] Sleeping {sleep_interval:.2f} seconds ...') + time.sleep(sleep_interval) + ret = self.real_download(filename, info_dict) self._finish_multiline_status() return ret, True @@ -459,8 +447,7 @@ class FileDownloader(object): raise NotImplementedError('This method must be implemented by subclasses') def _hook_progress(self, status, info_dict): - if not self._progress_hooks: - return + # Ideally we want to make a copy of the dict, but that is too slow status['info_dict'] = info_dict # youtube-dl passes the same status object to all the hooks. # Some third party scripts seems to be relying on this. @@ -482,4 +469,4 @@ class FileDownloader(object): if exe is None: exe = os.path.basename(str_args[0]) - self.write_debug('%s command line: %s' % (exe, shell_quote(str_args))) + self.write_debug(f'{exe} command line: {shell_quote(str_args)}') diff --git a/hypervideo_dl/downloader/dash.py b/hypervideo_dl/downloader/dash.py index a845ee7..4328d73 100644 --- a/hypervideo_dl/downloader/dash.py +++ b/hypervideo_dl/downloader/dash.py @@ -1,10 +1,9 @@ -from __future__ import unicode_literals import time +import urllib.parse -from ..downloader import get_suitable_downloader +from . 
import get_suitable_downloader from .fragment import FragmentFD - -from ..utils import urljoin +from ..utils import update_url_query, urljoin class DashSegmentsFD(FragmentFD): @@ -42,24 +41,29 @@ class DashSegmentsFD(FragmentFD): self._prepare_and_start_frag_download(ctx, fmt) ctx['start'] = real_start - fragments_to_download = self._get_fragments(fmt, ctx) + extra_query = None + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') + if extra_param_to_segment_url: + extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) + + fragments_to_download = self._get_fragments(fmt, ctx, extra_query) if real_downloader: self.to_screen( - '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename())) + f'[{self.FD_NAME}] Fragment downloads will be delegated to {real_downloader.get_basename()}') info_dict['fragments'] = list(fragments_to_download) fd = real_downloader(self.ydl, self.params) return fd.real_download(filename, info_dict) args.append([ctx, fragments_to_download, fmt]) - return self.download_and_append_fragments_multiple(*args) + return self.download_and_append_fragments_multiple(*args, is_fatal=lambda idx: idx == 0) def _resolve_fragments(self, fragments, ctx): fragments = fragments(ctx) if callable(fragments) else fragments return [next(iter(fragments))] if self.params.get('test') else fragments - def _get_fragments(self, fmt, ctx): + def _get_fragments(self, fmt, ctx, extra_query): fragment_base_url = fmt.get('fragment_base_url') fragments = self._resolve_fragments(fmt['fragments'], ctx) @@ -72,9 +76,12 @@ class DashSegmentsFD(FragmentFD): if not fragment_url: assert fragment_base_url fragment_url = urljoin(fragment_base_url, fragment['path']) + if extra_query: + fragment_url = update_url_query(fragment_url, extra_query) yield { 'frag_index': frag_index, + 'fragment_count': fragment.get('fragment_count'), 'index': i, 'url': fragment_url, } diff --git a/hypervideo_dl/downloader/external.py b/hypervideo_dl/downloader/external.py index b99dc37..75257a7 100644 --- a/hypervideo_dl/downloader/external.py +++ b/hypervideo_dl/downloader/external.py @@ -1,5 +1,4 @@ -from __future__ import unicode_literals - +import enum import os.path import re import subprocess @@ -7,30 +6,35 @@ import sys import time from .fragment import FragmentFD -from ..compat import ( - compat_setenv, - compat_str, -) -from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS +from ..compat import functools +from ..postprocessor.ffmpeg import EXT_TO_OUT_FORMATS, FFmpegPostProcessor from ..utils import ( + Popen, + RetryManager, + _configuration_args, + check_executable, classproperty, + cli_bool_option, cli_option, cli_valueless_option, - cli_bool_option, - _configuration_args, determine_ext, - encodeFilename, encodeArgument, + encodeFilename, handle_youtubedl_headers, - check_executable, - Popen, remove_end, + traverse_obj, ) +class Features(enum.Enum): + TO_STDOUT = enum.auto() + MULTIPLE_FORMATS = enum.auto() + + class ExternalFD(FragmentFD): SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps') - can_download_to_stdout = False + SUPPORTED_FEATURES = () + _CAPTURE_STDERR = True def real_download(self, filename, info_dict): self.report_destination(filename) @@ -56,7 +60,7 @@ class ExternalFD(FragmentFD): } if filename != '-': fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize)) + self.to_screen(f'\r[{self.get_basename()}] Downloaded {fsize} bytes') 
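# [Editor's note] Not part of the patch: a minimal sketch of how the ExternalFD
# machinery above is typically extended. A subclass only declares the flag that
# available() uses to probe for the binary and how to build its command line,
# just as the curl/wget/axel downloaders below do. The class name here is
# hypothetical.
class ExampleFD(ExternalFD):
    AVAILABLE_OPT = '-V'  # passed to the executable when probing for availability

    def _make_cmd(self, tmpfilename, info_dict):
        # real subclasses also append header, proxy and rate-limit options here
        return [self.exe, '-o', tmpfilename, '--', info_dict['url']]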
self.try_rename(tmpfilename, filename) status.update({ 'downloaded_bytes': fsize, @@ -78,7 +82,7 @@ class ExternalFD(FragmentFD): def EXE_NAME(cls): return cls.get_basename() - @property + @functools.cached_property def exe(self): return self.EXE_NAME @@ -94,9 +98,11 @@ class ExternalFD(FragmentFD): @classmethod def supports(cls, info_dict): - return ( - (cls.can_download_to_stdout or not info_dict.get('to_stdout')) - and info_dict['protocol'] in cls.SUPPORTED_PROTOCOLS) + return all(( + not info_dict.get('to_stdout') or Features.TO_STDOUT in cls.SUPPORTED_FEATURES, + '+' not in info_dict['protocol'] or Features.MULTIPLE_FORMATS in cls.SUPPORTED_FEATURES, + all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+')), + )) @classmethod def can_download(cls, info_dict, path=None): @@ -123,33 +129,28 @@ class ExternalFD(FragmentFD): self._debug_cmd(cmd) if 'fragments' not in info_dict: - p = Popen(cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate_or_kill() - if p.returncode != 0: - self.to_stderr(stderr.decode('utf-8', 'replace')) - return p.returncode + _, stderr, returncode = Popen.run( + cmd, text=True, stderr=subprocess.PIPE if self._CAPTURE_STDERR else None) + if returncode and stderr: + self.to_stderr(stderr) + return returncode - fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) - count = 0 - while count <= fragment_retries: - p = Popen(cmd, stderr=subprocess.PIPE) - _, stderr = p.communicate_or_kill() - if p.returncode == 0: + retry_manager = RetryManager(self.params.get('fragment_retries'), self.report_retry, + frag_index=None, fatal=not skip_unavailable_fragments) + for retry in retry_manager: + _, stderr, returncode = Popen.run(cmd, text=True, stderr=subprocess.PIPE) + if not returncode: break # TODO: Decide whether to retry based on error code # https://aria2.github.io/manual/en/html/aria2c.html#exit-status - self.to_stderr(stderr.decode('utf-8', 'replace')) - count += 1 - if count <= fragment_retries: - self.to_screen( - '[%s] Got error. Retrying fragments (attempt %d of %s)...' 
- % (self.get_basename(), count, self.format_retries(fragment_retries))) - if count > fragment_retries: - if not skip_unavailable_fragments: - self.report_error('Giving up after %s fragment retries' % fragment_retries) - return -1 + if stderr: + self.to_stderr(stderr) + retry.error = Exception() + continue + if not skip_unavailable_fragments and retry_manager.error: + return -1 decrypt_fragment = self.decrypter(info_dict) dest, _ = self.sanitize_open(tmpfilename, 'wb') @@ -157,7 +158,7 @@ class ExternalFD(FragmentFD): fragment_filename = '%s-Frag%d' % (tmpfilename, frag_index) try: src, _ = self.sanitize_open(fragment_filename, 'rb') - except IOError as err: + except OSError as err: if skip_unavailable_fragments and frag_index > 1: self.report_skip_fragment(frag_index, err) continue @@ -174,12 +175,13 @@ class ExternalFD(FragmentFD): class CurlFD(ExternalFD): AVAILABLE_OPT = '-V' + _CAPTURE_STDERR = False # curl writes the progress to stderr def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '--location', '-o', tmpfilename, '--compressed'] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + cmd += ['--header', f'{key}: {val}'] cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') cmd += self._valueless_option('--silent', 'noprogress') @@ -198,16 +200,6 @@ class CurlFD(ExternalFD): cmd += ['--', info_dict['url']] return cmd - def _call_downloader(self, tmpfilename, info_dict): - cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] - - self._debug_cmd(cmd) - - # curl writes the progress to stderr so don't capture it. - p = Popen(cmd) - p.communicate_or_kill() - return p.returncode - class AxelFD(ExternalFD): AVAILABLE_OPT = '-V' @@ -216,7 +208,7 @@ class AxelFD(ExternalFD): cmd = [self.exe, '-o', tmpfilename] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): - cmd += ['-H', '%s: %s' % (key, val)] + cmd += ['-H', f'{key}: {val}'] cmd += self._configuration_args() cmd += ['--', info_dict['url']] return cmd @@ -229,7 +221,7 @@ class WgetFD(ExternalFD): cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies', '--compression=auto'] if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + cmd += ['--header', f'{key}: {val}'] cmd += self._option('--limit-rate', 'ratelimit') retry = self._option('--tries', 'retries') if len(retry) == 2: @@ -240,7 +232,7 @@ class WgetFD(ExternalFD): proxy = self.params.get('proxy') if proxy: for var in ('http_proxy', 'https_proxy'): - cmd += ['--execute', '%s=%s' % (var, proxy)] + cmd += ['--execute', f'{var}={proxy}'] cmd += self._valueless_option('--no-check-certificate', 'nocheckcertificate') cmd += self._configuration_args() cmd += ['--', info_dict['url']] @@ -260,6 +252,10 @@ class Aria2cFD(ExternalFD): check_results = (not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES) return all(check_results) + @staticmethod + def _aria2c_filename(fn): + return fn if os.path.isabs(fn) else f'.{os.path.sep}{fn}' + def _make_cmd(self, tmpfilename, info_dict): cmd = [self.exe, '-c', '--console-log-level=warn', '--summary-interval=0', '--download-result=hide', @@ -271,7 +267,7 @@ class Aria2cFD(ExternalFD): if info_dict.get('http_headers') is not None: for key, val in info_dict['http_headers'].items(): - cmd += ['--header', '%s: %s' % (key, val)] + cmd += ['--header', f'{key}: {val}'] cmd += 
self._option('--max-overall-download-limit', 'ratelimit')
         cmd += self._option('--interface', 'source_address')
         cmd += self._option('--all-proxy', 'proxy')
@@ -288,11 +284,9 @@ class Aria2cFD(ExternalFD):
         # https://github.com/aria2/aria2/issues/1373
         dn = os.path.dirname(tmpfilename)
         if dn:
-            if not os.path.isabs(dn):
-                dn = '.%s%s' % (os.path.sep, dn)
-            cmd += ['--dir', dn + os.path.sep]
+            cmd += ['--dir', self._aria2c_filename(dn) + os.path.sep]
         if 'fragments' not in info_dict:
-            cmd += ['--out', '.%s%s' % (os.path.sep, os.path.basename(tmpfilename))]
+            cmd += ['--out', self._aria2c_filename(os.path.basename(tmpfilename))]
         cmd += ['--auto-file-renaming=false']

         if 'fragments' in info_dict:
@@ -301,11 +295,11 @@ class Aria2cFD(ExternalFD):
             url_list = []
             for frag_index, fragment in enumerate(info_dict['fragments']):
                 fragment_filename = '%s-Frag%d' % (os.path.basename(tmpfilename), frag_index)
-                url_list.append('%s\n\tout=%s' % (fragment['url'], fragment_filename))
+                url_list.append('%s\n\tout=%s' % (fragment['url'], self._aria2c_filename(fragment_filename)))
             stream, _ = self.sanitize_open(url_list_file, 'wb')
-            stream.write('\n'.join(url_list).encode('utf-8'))
+            stream.write('\n'.join(url_list).encode())
             stream.close()
-            cmd += ['-i', url_list_file]
+            cmd += ['-i', self._aria2c_filename(url_list_file)]
         else:
             cmd += ['--', info_dict['url']]
         return cmd
@@ -320,13 +314,13 @@ class HttpieFD(ExternalFD):
         if info_dict.get('http_headers') is not None:
             for key, val in info_dict['http_headers'].items():
-                cmd += ['%s:%s' % (key, val)]
+                cmd += [f'{key}:{val}']
         return cmd


 class FFmpegFD(ExternalFD):
     SUPPORTED_PROTOCOLS = ('http', 'https', 'ftp', 'ftps', 'm3u8', 'm3u8_native', 'rtsp', 'rtmp', 'rtmp_ffmpeg', 'mms', 'http_dash_segments')
-    can_download_to_stdout = True
+    SUPPORTED_FEATURES = (Features.TO_STDOUT, Features.MULTIPLE_FORMATS)

     @classmethod
     def available(cls, path=None):
@@ -334,10 +328,6 @@ class FFmpegFD(ExternalFD):
         # Fixme: This may be wrong when --ffmpeg-location is used
         return FFmpegPostProcessor().available

-    @classmethod
-    def supports(cls, info_dict):
-        return all(proto in cls.SUPPORTED_PROTOCOLS for proto in info_dict['protocol'].split('+'))
-
     def on_process_started(self, proc, stdin):
         """ Override this in subclasses """
         pass
@@ -368,9 +358,11 @@ class FFmpegFD(ExternalFD):
         if not self.params.get('verbose'):
             args += ['-hide_banner']

-        args += info_dict.get('_ffmpeg_args', [])
+        args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args'), default=[])

-        # This option exists only for compatibility. Extractors should use `_ffmpeg_args` instead
+        # These exist only for compatibility. Extractors should use
+        # info_dict['downloader_options']['ffmpeg_args'] instead
+        args += info_dict.get('_ffmpeg_args') or []
         seekable = info_dict.get('_seekable')
         if seekable is not None:
             # setting -seekable prevents ffmpeg from guessing if the server
@@ -380,20 +372,15 @@ class FFmpegFD(ExternalFD):
             # http://trac.ffmpeg.org/ticket/6125#comment:10
             args += ['-seekable', '1' if seekable else '0']

-        # start_time = info_dict.get('start_time') or 0
-        # if start_time:
-        #     args += ['-ss', compat_str(start_time)]
-        # end_time = info_dict.get('end_time')
-        # if end_time:
-        #     args += ['-t', compat_str(end_time - start_time)]
-
-        if info_dict.get('http_headers') is not None and re.match(r'^https?://', urls[0]):
-            # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
-            # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
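# [Editor's note] Not part of the patch: the CRLF convention referenced in the
# comment above, spelled out. ffmpeg's -headers option takes a single string in
# which every "Key: value" pair must end with \r\n; without the trailing CRLF,
# ffmpeg emits the warning quoted above. The header values are made-up examples.
example_headers = {'User-Agent': 'Mozilla/5.0', 'Referer': 'https://example.com/'}
headers_args = ['-headers', ''.join(f'{key}: {val}\r\n' for key, val in example_headers.items())]
# yields: ['-headers', 'User-Agent: Mozilla/5.0\r\nReferer: https://example.com/\r\n']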
-            headers = handle_youtubedl_headers(info_dict['http_headers'])
-            args += [
+        http_headers = None
+        if info_dict.get('http_headers'):
+            youtubedl_headers = handle_youtubedl_headers(info_dict['http_headers'])
+            http_headers = [
+                # Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
+                # [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
                 '-headers',
-                ''.join('%s: %s\r\n' % (key, val) for key, val in headers.items())]
+                ''.join(f'{key}: {val}\r\n' for key, val in youtubedl_headers.items())
+            ]

         env = None
         proxy = self.params.get('proxy')
@@ -411,8 +398,8 @@ class FFmpegFD(ExternalFD):
                 # We could switch to the following code if we are able to detect version properly
                 # args += ['-http_proxy', proxy]
                 env = os.environ.copy()
-                compat_setenv('HTTP_PROXY', proxy, env=env)
-                compat_setenv('http_proxy', proxy, env=env)
+                env['HTTP_PROXY'] = proxy
+                env['http_proxy'] = proxy

         protocol = info_dict.get('protocol')

@@ -442,20 +429,31 @@ class FFmpegFD(ExternalFD):
                 if isinstance(conn, list):
                     for entry in conn:
                         args += ['-rtmp_conn', entry]
-                elif isinstance(conn, compat_str):
+                elif isinstance(conn, str):
                     args += ['-rtmp_conn', conn]

+        start_time, end_time = info_dict.get('section_start') or 0, info_dict.get('section_end')
+
         for i, url in enumerate(urls):
+            if http_headers is not None and re.match(r'^https?://', url):
+                args += http_headers
+            if start_time:
+                args += ['-ss', str(start_time)]
+            if end_time:
+                args += ['-t', str(end_time - start_time)]
+
             args += self._configuration_args((f'_i{i + 1}', '_i')) + ['-i', url]

-        args += ['-c', 'copy']
+        if not (start_time or end_time) or not self.params.get('force_keyframes_at_cuts'):
+            args += ['-c', 'copy']
+
         if info_dict.get('requested_formats') or protocol == 'http_dash_segments':
             for (i, fmt) in enumerate(info_dict.get('requested_formats') or [info_dict]):
                 stream_number = fmt.get('manifest_stream_number', 0)
                 args.extend(['-map', f'{i}:{stream_number}'])

         if self.params.get('test', False):
-            args += ['-fs', compat_str(self._TEST_FILE_SIZE)]
+            args += ['-fs', str(self._TEST_FILE_SIZE)]

         ext = info_dict['ext']
         if protocol in ('m3u8', 'm3u8_native'):
@@ -490,24 +488,23 @@ class FFmpegFD(ExternalFD):
         args.append(encodeFilename(ffpp._ffmpeg_filename_argument(tmpfilename), True))
         self._debug_cmd(args)

-        proc = Popen(args, stdin=subprocess.PIPE, env=env)
-        if url in ('-', 'pipe:'):
-            self.on_process_started(proc, proc.stdin)
-        try:
-            retval = proc.wait()
-        except BaseException as e:
-            # subprocces.run would send the SIGKILL signal to ffmpeg and the
-            # mp4 file couldn't be played, but if we ask ffmpeg to quit it
-            # produces a file that is playable (this is mostly useful for live
-            # streams). Note that Windows is not affected and produces playable
-            # files (see https://github.com/ytdl-org/youtube-dl/issues/8300).
-            if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'):
-                proc.communicate_or_kill(b'q')
-            else:
-                proc.kill()
-                proc.wait()
-            raise
-        return retval
+        with Popen(args, stdin=subprocess.PIPE, env=env) as proc:
+            if url in ('-', 'pipe:'):
+                self.on_process_started(proc, proc.stdin)
+            try:
+                retval = proc.wait()
+            except BaseException as e:
+                # subprocess.run would send the SIGKILL signal to ffmpeg and the
+                # mp4 file couldn't be played, but if we ask ffmpeg to quit it
+                # produces a file that is playable (this is mostly useful for live
+                # streams).
Note that Windows is not affected and produces playable + # files (see https://github.com/ytdl-org/youtube-dl/issues/8300). + if isinstance(e, KeyboardInterrupt) and sys.platform != 'win32' and url not in ('-', 'pipe:'): + proc.communicate_or_kill(b'q') + else: + proc.kill(timeout=None) + raise + return retval class AVconvFD(FFmpegFD): @@ -520,16 +517,14 @@ _BY_NAME = { if name.endswith('FD') and name not in ('ExternalFD', 'FragmentFD') } -_BY_EXE = {klass.EXE_NAME: klass for klass in _BY_NAME.values()} - def list_external_downloaders(): return sorted(_BY_NAME.keys()) def get_external_downloader(external_downloader): - """ Given the name of the executable, see whether we support the given - downloader . """ - # Drop .exe extension on Windows + """ Given the name of the executable, see whether we support the given downloader """ bn = os.path.splitext(os.path.basename(external_downloader))[0] - return _BY_NAME.get(bn, _BY_EXE.get(bn)) + return _BY_NAME.get(bn) or next(( + klass for klass in _BY_NAME.values() if klass.EXE_NAME in bn + ), None) diff --git a/hypervideo_dl/downloader/f4m.py b/hypervideo_dl/downloader/f4m.py index 0008b7c..306f921 100644 --- a/hypervideo_dl/downloader/f4m.py +++ b/hypervideo_dl/downloader/f4m.py @@ -1,23 +1,14 @@ -from __future__ import division, unicode_literals - +import base64 import io import itertools +import struct import time +import urllib.error +import urllib.parse from .fragment import FragmentFD -from ..compat import ( - compat_b64decode, - compat_etree_fromstring, - compat_urlparse, - compat_urllib_error, - compat_urllib_parse_urlparse, - compat_struct_pack, - compat_struct_unpack, -) -from ..utils import ( - fix_xml_ampersands, - xpath_text, -) +from ..compat import compat_etree_fromstring +from ..utils import fix_xml_ampersands, xpath_text class DataTruncatedError(Exception): @@ -40,13 +31,13 @@ class FlvReader(io.BytesIO): # Utility functions for reading numbers and strings def read_unsigned_long_long(self): - return compat_struct_unpack('!Q', self.read_bytes(8))[0] + return struct.unpack('!Q', self.read_bytes(8))[0] def read_unsigned_int(self): - return compat_struct_unpack('!I', self.read_bytes(4))[0] + return struct.unpack('!I', self.read_bytes(4))[0] def read_unsigned_char(self): - return compat_struct_unpack('!B', self.read_bytes(1))[0] + return struct.unpack('!B', self.read_bytes(1))[0] def read_string(self): res = b'' @@ -193,7 +184,7 @@ def build_fragments_list(boot_info): first_frag_number = fragment_run_entry_table[0]['first'] fragments_counter = itertools.count(first_frag_number) for segment, fragments_count in segment_run_table['segment_run']: - # In some live HDS streams (for example Rai), `fragments_count` is + # In some live HDS streams (e.g. Rai), `fragments_count` is # abnormal and causing out-of-memory errors. It's OK to change the # number of fragments for live streams as they are updated periodically if fragments_count == 4294967295 and boot_info['live']: @@ -208,11 +199,11 @@ def build_fragments_list(boot_info): def write_unsigned_int(stream, val): - stream.write(compat_struct_pack('!I', val)) + stream.write(struct.pack('!I', val)) def write_unsigned_int_24(stream, val): - stream.write(compat_struct_pack('!I', val)[1:]) + stream.write(struct.pack('!I', val)[1:]) def write_flv_header(stream): @@ -261,8 +252,6 @@ class F4mFD(FragmentFD): A downloader for f4m manifests or AdobeHDS. 
""" - FD_NAME = 'f4m' - def _get_unencrypted_media(self, doc): media = doc.findall(_add_ns('media')) if not media: @@ -308,12 +297,12 @@ class F4mFD(FragmentFD): # 1. http://live-1-1.rutube.ru/stream/1024/HDS/SD/C2NKsS85HQNckgn5HdEmOQ/1454167650/S-s604419906/move/four/dirs/upper/1024-576p.f4m bootstrap_url = node.get('url') if bootstrap_url: - bootstrap_url = compat_urlparse.urljoin( + bootstrap_url = urllib.parse.urljoin( base_url, bootstrap_url) boot_info = self._get_bootstrap_from_url(bootstrap_url) else: bootstrap_url = None - bootstrap = compat_b64decode(node.text) + bootstrap = base64.b64decode(node.text) boot_info = read_bootstrap_info(bootstrap) return boot_info, bootstrap_url @@ -343,14 +332,14 @@ class F4mFD(FragmentFD): # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. man_base_url = get_base_url(doc) or man_url - base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) + base_url = urllib.parse.urljoin(man_base_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) boot_info, bootstrap_url = self._parse_bootstrap_node( bootstrap_node, man_base_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: - metadata = compat_b64decode(metadata_node.text) + metadata = base64.b64decode(metadata_node.text) else: metadata = None @@ -378,7 +367,7 @@ class F4mFD(FragmentFD): if not live: write_metadata_tag(dest_stream, metadata) - base_url_parsed = compat_urllib_parse_urlparse(base_url) + base_url_parsed = urllib.parse.urlparse(base_url) self._start_frag_download(ctx, info_dict) @@ -398,9 +387,10 @@ class F4mFD(FragmentFD): query.append(info_dict['extra_param_to_segment_url']) url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query)) try: - success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict) + success = self._download_fragment(ctx, url_parsed.geturl(), info_dict) if not success: return False + down_data = self._read_fragment(ctx) reader = FlvReader(down_data) while True: try: @@ -417,7 +407,7 @@ class F4mFD(FragmentFD): if box_type == b'mdat': self._append_fragment(ctx, box_data) break - except (compat_urllib_error.HTTPError, ) as err: + except urllib.error.HTTPError as err: if live and (err.code == 404 or err.code == 410): # We didn't keep up with the live window. Continue # with the next available fragment. 
@@ -434,6 +424,4 @@ class F4mFD(FragmentFD): msg = 'Missed %d fragments' % (fragments_list[0][1] - (frag_i + 1)) self.report_warning(msg) - self._finish_frag_download(ctx, info_dict) - - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/hypervideo_dl/downloader/fc2.py b/hypervideo_dl/downloader/fc2.py index 157bcf2..f9763de 100644 --- a/hypervideo_dl/downloader/fc2.py +++ b/hypervideo_dl/downloader/fc2.py @@ -1,5 +1,3 @@ -from __future__ import division, unicode_literals - import threading from .common import FileDownloader @@ -20,6 +18,9 @@ class FC2LiveFD(FileDownloader): heartbeat_state = [None, 1] def heartbeat(): + if heartbeat_state[1] < 0: + return + try: heartbeat_state[1] += 1 ws.send('{"name":"heartbeat","arguments":{},"id":%d}' % heartbeat_state[1]) @@ -38,4 +39,8 @@ class FC2LiveFD(FileDownloader): 'ws': None, 'protocol': 'live_ffmpeg', }) - return FFmpegFD(self.ydl, self.params or {}).download(filename, new_info_dict) + try: + return FFmpegFD(self.ydl, self.params or {}).download(filename, new_info_dict) + finally: + # stop heartbeating + heartbeat_state[1] = -1 diff --git a/hypervideo_dl/downloader/fragment.py b/hypervideo_dl/downloader/fragment.py index a991c6d..e61bd0e 100644 --- a/hypervideo_dl/downloader/fragment.py +++ b/hypervideo_dl/downloader/fragment.py @@ -1,28 +1,20 @@ -from __future__ import division, unicode_literals - +import concurrent.futures +import contextlib import http.client import json import math import os +import struct import time - -try: - import concurrent.futures - can_threaded_download = True -except ImportError: - can_threaded_download = False +import urllib.error from .common import FileDownloader from .http import HttpFD from ..aes import aes_cbc_decrypt_bytes, unpad_pkcs7 -from ..compat import ( - compat_os_name, - compat_urllib_error, - compat_struct_pack, -) +from ..compat import compat_os_name from ..utils import ( DownloadError, - error_to_compat_str, + RetryManager, encodeFilename, sanitized_Request, traverse_obj, @@ -33,9 +25,7 @@ class HttpQuietDownloader(HttpFD): def to_screen(self, *args, **kargs): pass - def report_retry(self, err, count, retries): - super().to_screen( - f'[download] Got server HTTP error: {err}. Retrying (attempt {count} of {self.format_retries(retries)}) ...') + to_console_title = to_screen class FragmentFD(FileDownloader): @@ -75,9 +65,9 @@ class FragmentFD(FileDownloader): """ def report_retry_fragment(self, err, frag_index, count, retries): - self.to_screen( - '\r[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s) ...' - % (error_to_compat_str(err), frag_index, count, self.format_retries(retries))) + self.deprecation_warning('hypervideo_dl.downloader.FragmentFD.report_retry_fragment is deprecated. 
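
In the fc2.py hunk above, the keep-alive loop is stopped by flipping the shared counter negative once the download returns. A sketch of that sentinel pattern, with `send` standing in for the real websocket call:

import threading

# Sentinel pattern from the fc2.py hunk above; state[1] < 0 means "stop".
def make_heartbeat(send, interval=30):
    state = [None, 1]   # [pending timer, heartbeat sequence number]

    def heartbeat():
        if state[1] < 0:          # set by the finally: block below
            return
        state[1] += 1
        send('{"name":"heartbeat","arguments":{},"id":%d}' % state[1])
        state[0] = threading.Timer(interval, heartbeat)
        state[0].start()

    return state, heartbeat

state, heartbeat = make_heartbeat(print)
heartbeat()
try:
    pass                          # the real FFmpegFD download would run here
finally:
    state[1] = -1                 # stop future beats, even if a timer already fired
    if state[0]:
        state[0].cancel()
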
' + 'Use hypervideo_dl.downloader.FileDownloader.report_retry instead') + return self.report_retry(err, count, retries, frag_index) def report_skip_fragment(self, frag_index, err=None): err = f' {err};' if err else '' @@ -131,7 +121,7 @@ class FragmentFD(FileDownloader): 'request_data': request_data, 'ctx_id': ctx.get('ctx_id'), } - success = ctx['dl'].download(fragment_filename, fragment_info_dict) + success, _ = ctx['dl'].download(fragment_filename, fragment_info_dict) if not success: return False if fragment_info_dict.get('filetime'): @@ -140,6 +130,8 @@ class FragmentFD(FileDownloader): return True def _read_fragment(self, ctx): + if not ctx.get('fragment_filename_sanitized'): + return None try: down, frag_sanitized = self.sanitize_open(ctx['fragment_filename_sanitized'], 'rb') except FileNotFoundError: @@ -172,21 +164,13 @@ class FragmentFD(FileDownloader): total_frags_str += ' (not including %d ad)' % ad_frags else: total_frags_str = 'unknown (live)' - self.to_screen( - '[%s] Total fragments: %s' % (self.FD_NAME, total_frags_str)) + self.to_screen(f'[{self.FD_NAME}] Total fragments: {total_frags_str}') self.report_destination(ctx['filename']) - dl = HttpQuietDownloader( - self.ydl, - { - 'continuedl': self.params.get('continuedl', True), - 'quiet': self.params.get('quiet'), - 'noprogress': True, - 'ratelimit': self.params.get('ratelimit'), - 'retries': self.params.get('retries', 0), - 'nopart': self.params.get('nopart', False), - 'test': self.params.get('test', False), - } - ) + dl = HttpQuietDownloader(self.ydl, { + **self.params, + 'noprogress': True, + 'test': False, + }) tmpfilename = self.temp_name(ctx['filename']) open_mode = 'wb' resume_len = 0 @@ -259,6 +243,9 @@ class FragmentFD(FileDownloader): if s['status'] not in ('downloading', 'finished'): return + if not total_frags and ctx.get('fragment_count'): + state['fragment_count'] = ctx['fragment_count'] + if ctx_id is not None and s.get('ctx_id') != ctx_id: return @@ -308,18 +295,23 @@ class FragmentFD(FileDownloader): self.try_remove(ytdl_filename) elapsed = time.time() - ctx['started'] - if ctx['tmpfilename'] == '-': - downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + to_file = ctx['tmpfilename'] != '-' + if to_file: + downloaded_bytes = os.path.getsize(encodeFilename(ctx['tmpfilename'])) else: + downloaded_bytes = ctx['complete_frags_downloaded_bytes'] + + if not downloaded_bytes: + if to_file: + self.try_remove(ctx['tmpfilename']) + self.report_error('The downloaded file is empty') + return False + elif to_file: self.try_rename(ctx['tmpfilename'], ctx['filename']) - if self.params.get('updatetime', True): - filetime = ctx.get('fragment_filetime') - if filetime: - try: - os.utime(ctx['filename'], (time.time(), filetime)) - except Exception: - pass - downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename'])) + filetime = ctx.get('fragment_filetime') + if self.params.get('updatetime', True) and filetime: + with contextlib.suppress(Exception): + os.utime(ctx['filename'], (time.time(), filetime)) self._hook_progress({ 'downloaded_bytes': downloaded_bytes, @@ -331,6 +323,7 @@ class FragmentFD(FileDownloader): 'max_progress': ctx.get('max_progress'), 'progress_idx': ctx.get('progress_idx'), }, info_dict) + return True def _prepare_external_frag_download(self, ctx): if 'live' not in ctx: @@ -342,8 +335,7 @@ class FragmentFD(FileDownloader): total_frags_str += ' (not including %d ad)' % ad_frags else: total_frags_str = 'unknown (live)' - self.to_screen( - '[%s] Total fragments: %s' % (self.FD_NAME, 
total_frags_str)) + self.to_screen(f'[{self.FD_NAME}] Total fragments: {total_frags_str}') tmpfilename = self.temp_name(ctx['filename']) @@ -362,10 +354,12 @@ class FragmentFD(FileDownloader): return _key_cache[url] def decrypt_fragment(fragment, frag_content): + if frag_content is None: + return decrypt_info = fragment.get('decrypt_info') if not decrypt_info or decrypt_info['METHOD'] != 'AES-128': return frag_content - iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', fragment['media_sequence']) + iv = decrypt_info.get('IV') or struct.pack('>8xq', fragment['media_sequence']) decrypt_info['KEY'] = decrypt_info.get('KEY') or _get_key(info_dict.get('_decryption_key_url') or decrypt_info['URI']) # Don't decrypt the content in tests since the data is explicitly truncated and it's not to a valid block # size (see https://github.com/ytdl-org/youtube-dl/pull/27660). Tests only care that the correct data downloaded, @@ -376,7 +370,7 @@ class FragmentFD(FileDownloader): return decrypt_fragment - def download_and_append_fragments_multiple(self, *args, pack_func=None, finish_func=None): + def download_and_append_fragments_multiple(self, *args, **kwargs): ''' @params (ctx1, fragments1, info_dict1), (ctx2, fragments2, info_dict2), ... all args must be either tuple or list @@ -384,7 +378,7 @@ class FragmentFD(FileDownloader): interrupt_trigger = [True] max_progress = len(args) if max_progress == 1: - return self.download_and_append_fragments(*args[0], pack_func=pack_func, finish_func=finish_func) + return self.download_and_append_fragments(*args[0], **kwargs) max_workers = self.params.get('concurrent_fragment_downloads', 1) if max_progress > 1: self._prepare_multiline_status(max_progress) @@ -394,8 +388,7 @@ class FragmentFD(FileDownloader): ctx['max_progress'] = max_progress ctx['progress_idx'] = idx return self.download_and_append_fragments( - ctx, fragments, info_dict, pack_func=pack_func, finish_func=finish_func, - tpe=tpe, interrupt_trigger=interrupt_trigger) + ctx, fragments, info_dict, **kwargs, tpe=tpe, interrupt_trigger=interrupt_trigger) class FTPE(concurrent.futures.ThreadPoolExecutor): # has to stop this or it's going to wait on the worker thread itself @@ -442,18 +435,12 @@ class FragmentFD(FileDownloader): return result def download_and_append_fragments( - self, ctx, fragments, info_dict, *, pack_func=None, finish_func=None, - tpe=None, interrupt_trigger=None): - if not interrupt_trigger: - interrupt_trigger = (True, ) - - fragment_retries = self.params.get('fragment_retries', 0) - is_fatal = ( - ((lambda _: False) if info_dict.get('is_live') else (lambda idx: idx == 0)) - if self.params.get('skip_unavailable_fragments', True) else (lambda _: True)) + self, ctx, fragments, info_dict, *, is_fatal=(lambda idx: False), + pack_func=(lambda content, idx: content), finish_func=None, + tpe=None, interrupt_trigger=(True, )): - if not pack_func: - pack_func = lambda frag_content, _: frag_content + if not self.params.get('skip_unavailable_fragments', True): + is_fatal = lambda _: True def download_fragment(fragment, ctx): if not interrupt_trigger[0]: @@ -467,31 +454,25 @@ class FragmentFD(FileDownloader): headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'] - 1) # Never skip the first fragment - fatal, count = is_fatal(fragment.get('index') or (frag_index - 1)), 0 - while count <= fragment_retries: - try: - if self._download_fragment(ctx, fragment['url'], info_dict, headers): - break - return - except (compat_urllib_error.HTTPError, http.client.IncompleteRead) as err: - 
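
The hand-rolled `while count <= fragment_retries` loops being removed here (and in the ism, http and youtube_live_chat hunks further down) are replaced by `RetryManager` from hypervideo_dl.utils. A simplified stand-in shows the shape of the pattern: assigning `retry.error` inside the loop requests another attempt, and a callback does the reporting:

# Simplified stand-in for the RetryManager pattern adopted in these hunks;
# the real implementation lives in hypervideo_dl.utils.
class RetryManager:
    def __init__(self, retries, report):
        self.retries, self.report, self.error = retries or 0, report, None

    def __iter__(self):
        for attempt in range(self.retries + 1):
            self.error = None
            yield self
            if self.error is None:
                return                      # success: stop retrying
            self.report(self.error, attempt + 1, self.retries)

def flaky(tries=[0]):                       # mutable default: demo-only counter
    tries[0] += 1
    if tries[0] < 3:
        raise OSError('transient')
    return 'ok'

for retry in RetryManager(5, lambda err, n, total: print(f'retry {n}/{total}: {err}')):
    try:
        result = flaky()
    except OSError as err:
        retry.error = err
print(result)                               # -> 'ok' after two reported retries
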
# Unavailable (possibly temporary) fragments may be served. - # First we try to retry then either skip or abort. - # See https://github.com/ytdl-org/youtube-dl/issues/10165, - # https://github.com/ytdl-org/youtube-dl/issues/10448). - count += 1 - ctx['last_error'] = err - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) - except DownloadError: - # Don't retry fragment if error occurred during HTTP downloading - # itself since it has own retry settings - if not fatal: - break - raise + fatal = is_fatal(fragment.get('index') or (frag_index - 1)) - if count > fragment_retries and fatal: - ctx['dest_stream'].close() - self.report_error('Giving up after %s fragment retries' % fragment_retries) + def error_callback(err, count, retries): + if fatal and count > retries: + ctx['dest_stream'].close() + self.report_retry(err, count, retries, frag_index, fatal) + ctx['last_error'] = err + + for retry in RetryManager(self.params.get('fragment_retries'), error_callback): + try: + ctx['fragment_count'] = fragment.get('fragment_count') + if not self._download_fragment(ctx, fragment['url'], info_dict, headers): + return + except (urllib.error.HTTPError, http.client.IncompleteRead) as err: + retry.error = err + continue + except DownloadError: # has own retry settings + if fatal: + raise def append_fragment(frag_content, frag_index, ctx): if frag_content: @@ -508,8 +489,7 @@ class FragmentFD(FileDownloader): max_workers = math.ceil( self.params.get('concurrent_fragment_downloads', 1) / ctx.get('max_progress', 1)) - if can_threaded_download and max_workers > 1: - + if max_workers > 1: def _download_fragment(fragment): ctx_copy = ctx.copy() download_fragment(fragment, ctx_copy) @@ -517,23 +497,36 @@ class FragmentFD(FileDownloader): self.report_warning('The download speed shown is only of one thread. This is a known issue and patches are welcome') with tpe or concurrent.futures.ThreadPoolExecutor(max_workers) as pool: - for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): - ctx['fragment_filename_sanitized'] = frag_filename - ctx['fragment_index'] = frag_index - result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), frag_index, ctx) - if not result: - return False + try: + for fragment, frag_index, frag_filename in pool.map(_download_fragment, fragments): + ctx.update({ + 'fragment_filename_sanitized': frag_filename, + 'fragment_index': frag_index, + }) + if not append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), frag_index, ctx): + return False + except KeyboardInterrupt: + self._finish_multiline_status() + self.report_error( + 'Interrupted by user. 
Waiting for all threads to shutdown...', is_error=False, tb=False) + pool.shutdown(wait=False) + raise else: for fragment in fragments: if not interrupt_trigger[0]: break - download_fragment(fragment, ctx) - result = append_fragment(decrypt_fragment(fragment, self._read_fragment(ctx)), fragment['frag_index'], ctx) + try: + download_fragment(fragment, ctx) + result = append_fragment( + decrypt_fragment(fragment, self._read_fragment(ctx)), fragment['frag_index'], ctx) + except KeyboardInterrupt: + if info_dict.get('is_live'): + break + raise if not result: return False if finish_func is not None: ctx['dest_stream'].write(finish_func()) ctx['dest_stream'].flush() - self._finish_frag_download(ctx, info_dict) - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/hypervideo_dl/downloader/hls.py b/hypervideo_dl/downloader/hls.py index f3f32b5..4520edc 100644 --- a/hypervideo_dl/downloader/hls.py +++ b/hypervideo_dl/downloader/hls.py @@ -1,23 +1,14 @@ -from __future__ import unicode_literals - -import re -import io import binascii +import io +import re +import urllib.parse -from ..downloader import get_suitable_downloader -from .fragment import FragmentFD +from . import get_suitable_downloader from .external import FFmpegFD - -from ..compat import ( - compat_pycrypto_AES, - compat_urlparse, -) -from ..utils import ( - parse_m3u8_attributes, - update_url_query, - bug_reports_message, -) +from .fragment import FragmentFD from .. import webvtt +from ..dependencies import Cryptodome_AES +from ..utils import bug_reports_message, parse_m3u8_attributes, update_url_query class HlsFD(FragmentFD): @@ -70,12 +61,18 @@ class HlsFD(FragmentFD): s = urlh.read().decode('utf-8', 'ignore') can_download, message = self.can_download(s, info_dict, self.params.get('allow_unplayable_formats')), None - if can_download and not compat_pycrypto_AES and '#EXT-X-KEY:METHOD=AES-128' in s: - if FFmpegFD.available(): + if can_download: + has_ffmpeg = FFmpegFD.available() + no_crypto = not Cryptodome_AES and '#EXT-X-KEY:METHOD=AES-128' in s + if no_crypto and has_ffmpeg: can_download, message = False, 'The stream has AES-128 encryption and pycryptodome is not available' - else: + elif no_crypto: message = ('The stream has AES-128 encryption and neither ffmpeg nor pycryptodome are available; ' 'Decryption will be performed natively, but will be extremely slow') + elif info_dict.get('extractor_key') == 'Generic' and re.search(r'(?m)#EXT-X-MEDIA-SEQUENCE:(?!0$)', s): + install_ffmpeg = '' if has_ffmpeg else 'install ffmpeg and ' + message = ('Live HLS streams are not supported by the native downloader. 
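
The new heuristic above targets generic extractions: an HLS playlist whose `#EXT-X-MEDIA-SEQUENCE` is non-zero has likely already rotated fragments out of its live window, so the native downloader is the wrong tool. The check is a single negative-lookahead regex:

import re

# The live-window check added above: a non-zero media sequence number
# suggests a sliding live playlist rather than a complete VOD one.
manifest = '#EXTM3U\n#EXT-X-MEDIA-SEQUENCE:12345\n#EXTINF:4.0,\nseg12345.ts\n'

print(bool(re.search(r'(?m)#EXT-X-MEDIA-SEQUENCE:(?!0$)', manifest)))   # -> True
print(bool(re.search(r'(?m)#EXT-X-MEDIA-SEQUENCE:(?!0$)',
                     '#EXT-X-MEDIA-SEQUENCE:0\n')))                     # -> False
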
If this is a livestream, ' + f'please {install_ffmpeg}add "--downloader ffmpeg --hls-use-mpegts" to your command') if not can_download: has_drm = re.search('|'.join([ r'#EXT-X-FAXS-CM:', # Adobe Flash Access @@ -102,8 +99,7 @@ class HlsFD(FragmentFD): if real_downloader and not real_downloader.supports_manifest(s): real_downloader = None if real_downloader: - self.to_screen( - '[%s] Fragment downloads will be delegated to %s' % (self.FD_NAME, real_downloader.get_basename())) + self.to_screen(f'[{self.FD_NAME}] Fragment downloads will be delegated to {real_downloader.get_basename()}') def is_ad_fragment_start(s): return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s @@ -150,7 +146,7 @@ class HlsFD(FragmentFD): extra_query = None extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') if extra_param_to_segment_url: - extra_query = compat_urlparse.parse_qs(extra_param_to_segment_url) + extra_query = urllib.parse.parse_qs(extra_param_to_segment_url) i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -172,7 +168,7 @@ class HlsFD(FragmentFD): frag_url = ( line if re.match(r'^https?://', line) - else compat_urlparse.urljoin(man_url, line)) + else urllib.parse.urljoin(man_url, line)) if extra_query: frag_url = update_url_query(frag_url, extra_query) @@ -197,10 +193,18 @@ class HlsFD(FragmentFD): frag_url = ( map_info.get('URI') if re.match(r'^https?://', map_info.get('URI')) - else compat_urlparse.urljoin(man_url, map_info.get('URI'))) + else urllib.parse.urljoin(man_url, map_info.get('URI'))) if extra_query: frag_url = update_url_query(frag_url, extra_query) + if map_info.get('BYTERANGE'): + splitted_byte_range = map_info.get('BYTERANGE').split('@') + sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] + byte_range = { + 'start': sub_range_start, + 'end': sub_range_start + int(splitted_byte_range[0]), + } + fragments.append({ 'frag_index': frag_index, 'url': frag_url, @@ -210,14 +214,6 @@ class HlsFD(FragmentFD): }) media_sequence += 1 - if map_info.get('BYTERANGE'): - splitted_byte_range = map_info.get('BYTERANGE').split('@') - sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end'] - byte_range = { - 'start': sub_range_start, - 'end': sub_range_start + int(splitted_byte_range[0]), - } - elif line.startswith('#EXT-X-KEY'): decrypt_url = decrypt_info.get('URI') decrypt_info = parse_m3u8_attributes(line[11:]) @@ -225,7 +221,7 @@ class HlsFD(FragmentFD): if 'IV' in decrypt_info: decrypt_info['IV'] = binascii.unhexlify(decrypt_info['IV'][2:].zfill(32)) if not re.match(r'^https?://', decrypt_info['URI']): - decrypt_info['URI'] = compat_urlparse.urljoin( + decrypt_info['URI'] = urllib.parse.urljoin( man_url, decrypt_info['URI']) if extra_query: decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query) @@ -339,7 +335,7 @@ class HlsFD(FragmentFD): continue block.write_into(output) - return output.getvalue().encode('utf-8') + return output.getvalue().encode() def fin_fragments(): dedup_window = extra_state.get('webvtt_dedup_window') @@ -350,7 +346,7 @@ class HlsFD(FragmentFD): for cue in dedup_window: webvtt.CueBlock.from_json(cue).write_into(output) - return output.getvalue().encode('utf-8') + return output.getvalue().encode() self.download_and_append_fragments( ctx, fragments, info_dict, pack_func=pack_fragment, finish_func=fin_fragments) diff --git a/hypervideo_dl/downloader/http.py b/hypervideo_dl/downloader/http.py index 591a9b0..95c870e 100644 --- 
a/hypervideo_dl/downloader/http.py +++ b/hypervideo_dl/downloader/http.py @@ -1,29 +1,33 @@ -from __future__ import unicode_literals - +import http.client import os +import random +import socket import ssl import time -import random +import urllib.error from .common import FileDownloader -from ..compat import ( - compat_urllib_error, - compat_http_client -) from ..utils import ( ContentTooShortError, + RetryManager, + ThrottledDownload, + XAttrMetadataError, + XAttrUnavailableError, encodeFilename, int_or_none, parse_http_range, sanitized_Request, - ThrottledDownload, try_call, write_xattr, - XAttrMetadataError, - XAttrUnavailableError, ) -RESPONSE_READ_EXCEPTIONS = (TimeoutError, ConnectionError, ssl.SSLError, compat_http_client.HTTPException) +RESPONSE_READ_EXCEPTIONS = ( + TimeoutError, + socket.timeout, # compat: py < 3.10 + ConnectionError, + ssl.SSLError, + http.client.HTTPException +) class HttpFD(FileDownloader): @@ -69,9 +73,6 @@ class HttpFD(FileDownloader): ctx.is_resume = ctx.resume_len > 0 - count = 0 - retries = self.params.get('retries', 0) - class SucceedDownload(Exception): pass @@ -134,19 +135,18 @@ class HttpFD(FileDownloader): if has_range: content_range = ctx.data.headers.get('Content-Range') content_range_start, content_range_end, content_len = parse_http_range(content_range) - if content_range_start is not None and range_start == content_range_start: - # Content-Range is present and matches requested Range, resume is possible - accept_content_len = ( + # Content-Range is present and matches requested Range, resume is possible + if range_start == content_range_start and ( # Non-chunked download not ctx.chunk_size # Chunked download and requested piece or # its part is promised to be served or content_range_end == range_end - or content_len < range_end) - if accept_content_len: - ctx.content_len = content_len - ctx.data_len = min(content_len, req_end or content_len) - (req_start or 0) - return + or content_len < range_end): + ctx.content_len = content_len + if content_len or req_end: + ctx.data_len = min(content_len or req_end, req_end or content_len) - (req_start or 0) + return # Content-Range is either not present or invalid. 
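
The resume logic in this hunk only trusts a 206 response when the echoed `Content-Range` start matches the requested offset; anything else wipes the partial file and restarts. A sketch with a simplified stand-in for `hypervideo_dl.utils.parse_http_range`:

import re

# parse_http_range here is a simplified stand-in for the utils helper
# used in the hunk above.
def parse_http_range(value):
    m = re.fullmatch(r'bytes (\d+)-(\d+)/(\d+|\*)', value or '')
    if not m:
        return None, None, None
    start, end, total = m.groups()
    return int(start), int(end), None if total == '*' else int(total)

requested_start = 4096
start, end, total = parse_http_range('bytes 4096-999999/1000000')

# Resume only if the server honoured the requested offset; otherwise the
# downloader truncates the .part file and starts over.
can_resume = start is not None and start == requested_start
print(can_resume, end, total)   # -> True 999999 1000000
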
Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload @@ -154,7 +154,7 @@ class HttpFD(FileDownloader): ctx.resume_len = 0 ctx.open_mode = 'wb' ctx.data_len = ctx.content_len = int_or_none(ctx.data.info().get('Content-length', None)) - except (compat_urllib_error.HTTPError, ) as err: + except urllib.error.HTTPError as err: if err.code == 416: # Unable to resume (requested range not satisfiable) try: @@ -162,7 +162,7 @@ class HttpFD(FileDownloader): ctx.data = self.ydl.urlopen( sanitized_Request(url, request_data, headers)) content_length = ctx.data.info()['Content-Length'] - except (compat_urllib_error.HTTPError, ) as err: + except urllib.error.HTTPError as err: if err.code < 500 or err.code >= 600: raise else: @@ -195,7 +195,7 @@ class HttpFD(FileDownloader): # Unexpected HTTP error raise raise RetryDownload(err) - except compat_urllib_error.URLError as err: + except urllib.error.URLError as err: if isinstance(err.reason, ssl.CertificateError): raise raise RetryDownload(err) @@ -204,6 +204,12 @@ class HttpFD(FileDownloader): except RESPONSE_READ_EXCEPTIONS as err: raise RetryDownload(err) + def close_stream(): + if ctx.stream is not None: + if not ctx.tmpfilename == '-': + ctx.stream.close() + ctx.stream = None + def download(): data_len = ctx.data.info().get('Content-length', None) @@ -220,10 +226,12 @@ class HttpFD(FileDownloader): min_data_len = self.params.get('min_filesize') max_data_len = self.params.get('max_filesize') if min_data_len is not None and data_len < min_data_len: - self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len)) + self.to_screen( + f'\r[download] File is smaller than min-filesize ({data_len} bytes < {min_data_len} bytes). Aborting.') return False if max_data_len is not None and data_len > max_data_len: - self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len)) + self.to_screen( + f'\r[download] File is larger than max-filesize ({data_len} bytes > {max_data_len} bytes). 
Aborting.') return False byte_counter = 0 + ctx.resume_len @@ -235,12 +243,9 @@ class HttpFD(FileDownloader): before = start # start measuring def retry(e): - to_stdout = ctx.tmpfilename == '-' - if ctx.stream is not None: - if not to_stdout: - ctx.stream.close() - ctx.stream = None - ctx.resume_len = byte_counter if to_stdout else os.path.getsize(encodeFilename(ctx.tmpfilename)) + close_stream() + ctx.resume_len = (byte_counter if ctx.tmpfilename == '-' + else os.path.getsize(encodeFilename(ctx.tmpfilename))) raise RetryDownload(e) while True: @@ -264,19 +269,19 @@ class HttpFD(FileDownloader): assert ctx.stream is not None ctx.filename = self.undo_temp_name(ctx.tmpfilename) self.report_destination(ctx.filename) - except (OSError, IOError) as err: + except OSError as err: self.report_error('unable to open for writing: %s' % str(err)) return False if self.params.get('xattr_set_filesize', False) and data_len is not None: try: - write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8')) + write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode()) except (XAttrUnavailableError, XAttrMetadataError) as err: self.report_error('unable to set filesize xattr: %s' % str(err)) try: ctx.stream.write(data_block) - except (IOError, OSError) as err: + except OSError as err: self.to_stderr('\n') self.report_error('unable to write data: %s' % str(err)) return False @@ -342,9 +347,7 @@ class HttpFD(FileDownloader): if data_len is not None and byte_counter != data_len: err = ContentTooShortError(byte_counter, int(data_len)) - if count <= retries: - retry(err) - raise err + retry(err) self.try_rename(ctx.tmpfilename, ctx.filename) @@ -363,21 +366,20 @@ class HttpFD(FileDownloader): return True - while count <= retries: + for retry in RetryManager(self.params.get('retries'), self.report_retry): try: establish_connection() return download() - except RetryDownload as e: - count += 1 - if count <= retries: - self.report_retry(e.source_error, count, retries) - else: - self.to_screen(f'[download] Got server HTTP error: {e.source_error}') + except RetryDownload as err: + retry.error = err.source_error continue except NextFragment: + retry.error = None + retry.attempt -= 1 continue except SucceedDownload: return True - - self.report_error('giving up after %s retries' % retries) + except: # noqa: E722 + close_stream() + raise return False diff --git a/hypervideo_dl/downloader/ism.py b/hypervideo_dl/downloader/ism.py index 4d5618c..a157a8a 100644 --- a/hypervideo_dl/downloader/ism.py +++ b/hypervideo_dl/downloader/ism.py @@ -1,27 +1,23 @@ -from __future__ import unicode_literals - -import time import binascii import io +import struct +import time +import urllib.error from .fragment import FragmentFD -from ..compat import ( - compat_Struct, - compat_urllib_error, -) +from ..utils import RetryManager +u8 = struct.Struct('>B') +u88 = struct.Struct('>Bx') +u16 = struct.Struct('>H') +u1616 = struct.Struct('>Hxx') +u32 = struct.Struct('>I') +u64 = struct.Struct('>Q') -u8 = compat_Struct('>B') -u88 = compat_Struct('>Bx') -u16 = compat_Struct('>H') -u1616 = compat_Struct('>Hxx') -u32 = compat_Struct('>I') -u64 = compat_Struct('>Q') - -s88 = compat_Struct('>bx') -s16 = compat_Struct('>h') -s1616 = compat_Struct('>hxx') -s32 = compat_Struct('>i') +s88 = struct.Struct('>bx') +s16 = struct.Struct('>h') +s1616 = struct.Struct('>hxx') +s32 = struct.Struct('>i') unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000) @@ -142,6 +138,8 @@ def write_piff_header(stream, 
params): if fourcc == 'AACL': sample_entry_box = box(b'mp4a', sample_entry_payload) + if fourcc == 'EC-3': + sample_entry_box = box(b'ec-3', sample_entry_payload) elif stream_type == 'video': sample_entry_payload += u16.pack(0) # pre defined sample_entry_payload += u16.pack(0) # reserved @@ -156,7 +154,7 @@ def write_piff_header(stream, params): sample_entry_payload += u16.pack(0x18) # depth sample_entry_payload += s16.pack(-1) # pre defined - codec_private_data = binascii.unhexlify(params['codec_private_data'].encode('utf-8')) + codec_private_data = binascii.unhexlify(params['codec_private_data'].encode()) if fourcc in ('H264', 'AVC1'): sps, pps = codec_private_data.split(u32.pack(1))[1:] avcc_payload = u8.pack(1) # configuration version @@ -235,8 +233,6 @@ class IsmFD(FragmentFD): Download segments in a ISM manifest """ - FD_NAME = 'ism' - def real_download(self, filename, info_dict): segments = info_dict['fragments'][:1] if self.params.get( 'test', False) else info_dict['fragments'] @@ -252,7 +248,6 @@ class IsmFD(FragmentFD): 'ism_track_written': False, }) - fragment_retries = self.params.get('fragment_retries', 0) skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True) frag_index = 0 @@ -260,8 +255,10 @@ class IsmFD(FragmentFD): frag_index += 1 if frag_index <= ctx['fragment_index']: continue - count = 0 - while count <= fragment_retries: + + retry_manager = RetryManager(self.params.get('fragment_retries'), self.report_retry, + frag_index=frag_index, fatal=not skip_unavailable_fragments) + for retry in retry_manager: try: success = self._download_fragment(ctx, segment['url'], info_dict) if not success: @@ -274,18 +271,13 @@ class IsmFD(FragmentFD): write_piff_header(ctx['dest_stream'], info_dict['_download_params']) extra_state['ism_track_written'] = True self._append_fragment(ctx, frag_content) - break - except compat_urllib_error.HTTPError as err: - count += 1 - if count <= fragment_retries: - self.report_retry_fragment(err, frag_index, count, fragment_retries) - if count > fragment_retries: - if skip_unavailable_fragments: - self.report_skip_fragment(frag_index) + except urllib.error.HTTPError as err: + retry.error = err continue - self.report_error('giving up after %s fragment retries' % fragment_retries) - return False - self._finish_frag_download(ctx, info_dict) + if retry_manager.error: + if not skip_unavailable_fragments: + return False + self.report_skip_fragment(frag_index) - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/hypervideo_dl/downloader/mhtml.py b/hypervideo_dl/downloader/mhtml.py index c8332c0..170a78d 100644 --- a/hypervideo_dl/downloader/mhtml.py +++ b/hypervideo_dl/downloader/mhtml.py @@ -1,24 +1,15 @@ -# coding: utf-8 -from __future__ import unicode_literals - import io import quopri import re import uuid from .fragment import FragmentFD -from ..utils import ( - escapeHTML, - formatSeconds, - srt_subtitles_timecode, - urljoin, -) +from ..compat import imghdr +from ..utils import escapeHTML, formatSeconds, srt_subtitles_timecode, urljoin from ..version import __version__ as YT_DLP_VERSION class MhtmlFD(FragmentFD): - FD_NAME = 'mhtml' - _STYLESHEET = """\ html, body { margin: 0; @@ -62,7 +53,7 @@ body > figure > img { def _escape_mime(s): return '=?utf-8?Q?' 
+ (b''.join( bytes((b,)) if b >= 0x20 else b'=%02X' % b - for b in quopri.encodestring(s.encode('utf-8'), header=True) + for b in quopri.encodestring(s.encode(), header=True) )).decode('us-ascii') + '?=' def _gen_cid(self, i, fragment, frag_boundary): @@ -159,7 +150,7 @@ body > figure > img { length=len(stub), title=self._escape_mime(title), stub=stub - ).encode('utf-8')) + ).encode()) extra_state['header_written'] = True for i, fragment in enumerate(fragments): @@ -176,21 +167,13 @@ body > figure > img { continue frag_content = self._read_fragment(ctx) - mime_type = b'image/jpeg' - if frag_content.startswith(b'\x89PNG\r\n\x1a\n'): - mime_type = b'image/png' - if frag_content.startswith((b'GIF87a', b'GIF89a')): - mime_type = b'image/gif' - if frag_content.startswith(b'RIFF') and frag_content[8:12] == 'WEBP': - mime_type = b'image/webp' - frag_header = io.BytesIO() frag_header.write( b'--%b\r\n' % frag_boundary.encode('us-ascii')) frag_header.write( b'Content-ID: <%b>\r\n' % self._gen_cid(i, fragment, frag_boundary).encode('us-ascii')) frag_header.write( - b'Content-type: %b\r\n' % mime_type) + b'Content-type: %b\r\n' % f'image/{imghdr.what(h=frag_content) or "jpeg"}'.encode()) frag_header.write( b'Content-length: %u\r\n' % len(frag_content)) frag_header.write( @@ -203,5 +186,4 @@ body > figure > img { ctx['dest_stream'].write( b'--%b--\r\n\r\n' % frag_boundary.encode('us-ascii')) - self._finish_frag_download(ctx, info_dict) - return True + return self._finish_frag_download(ctx, info_dict) diff --git a/hypervideo_dl/downloader/niconico.py b/hypervideo_dl/downloader/niconico.py index 521dfec..77ed39e 100644 --- a/hypervideo_dl/downloader/niconico.py +++ b/hypervideo_dl/downloader/niconico.py @@ -1,22 +1,17 @@ -# coding: utf-8 -from __future__ import unicode_literals - import threading +from . 
import get_suitable_downloader from .common import FileDownloader -from ..downloader import get_suitable_downloader -from ..extractor.niconico import NiconicoIE from ..utils import sanitized_Request class NiconicoDmcFD(FileDownloader): """ Downloading niconico douga from DMC with heartbeat """ - FD_NAME = 'niconico_dmc' - def real_download(self, filename, info_dict): - self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) + from ..extractor.niconico import NiconicoIE + self.to_screen('[%s] Downloading from DMC' % self.FD_NAME) ie = NiconicoIE(self.ydl) info_dict, heartbeat_info_dict = ie._get_heartbeat_info(info_dict) @@ -54,4 +49,4 @@ class NiconicoDmcFD(FileDownloader): with heartbeat_lock: timer[0].cancel() download_complete = True - return success + return success diff --git a/hypervideo_dl/downloader/rtmp.py b/hypervideo_dl/downloader/rtmp.py index 90f1acf..0e09525 100644 --- a/hypervideo_dl/downloader/rtmp.py +++ b/hypervideo_dl/downloader/rtmp.py @@ -1,18 +1,15 @@ -from __future__ import unicode_literals - import os import re import subprocess import time from .common import FileDownloader -from ..compat import compat_str from ..utils import ( + Popen, check_executable, - encodeFilename, encodeArgument, + encodeFilename, get_exe_version, - Popen, ) @@ -94,8 +91,7 @@ class RtmpFD(FileDownloader): self.to_screen('') return proc.wait() except BaseException: # Including KeyboardInterrupt - proc.kill() - proc.wait() + proc.kill(timeout=None) raise url = info_dict['url'] @@ -146,7 +142,7 @@ class RtmpFD(FileDownloader): if isinstance(conn, list): for entry in conn: basic_args += ['--conn', entry] - elif isinstance(conn, compat_str): + elif isinstance(conn, str): basic_args += ['--conn', conn] if protocol is not None: basic_args += ['--protocol', protocol] diff --git a/hypervideo_dl/downloader/rtsp.py b/hypervideo_dl/downloader/rtsp.py index 7815d59..e89269f 100644 --- a/hypervideo_dl/downloader/rtsp.py +++ b/hypervideo_dl/downloader/rtsp.py @@ -1,13 +1,8 @@ -from __future__ import unicode_literals - import os import subprocess from .common import FileDownloader -from ..utils import ( - check_executable, - encodeFilename, -) +from ..utils import check_executable, encodeFilename class RtspFD(FileDownloader): @@ -32,7 +27,7 @@ class RtspFD(FileDownloader): retval = subprocess.call(args) if retval == 0: fsize = os.path.getsize(encodeFilename(tmpfilename)) - self.to_screen('\r[%s] %s bytes' % (args[0], fsize)) + self.to_screen(f'\r[{args[0]}] {fsize} bytes') self.try_rename(tmpfilename, filename) self._hook_progress({ 'downloaded_bytes': fsize, diff --git a/hypervideo_dl/downloader/websocket.py b/hypervideo_dl/downloader/websocket.py index 58e2bce..6837ff1 100644 --- a/hypervideo_dl/downloader/websocket.py +++ b/hypervideo_dl/downloader/websocket.py @@ -1,19 +1,12 @@ +import asyncio +import contextlib import os import signal -import asyncio import threading -try: - import websockets -except (ImportError, SyntaxError): - # websockets 3.10 on python 3.6 causes SyntaxError - # See https://github.com/hypervideo/hypervideo/issues/2633 - has_websockets = False -else: - has_websockets = True - from .common import FileDownloader from .external import FFmpegFD +from ..dependencies import websockets class FFmpegSinkFD(FileDownloader): @@ -26,14 +19,12 @@ class FFmpegSinkFD(FileDownloader): async def call_conn(proc, stdin): try: await self.real_connection(stdin, info_dict) - except (BrokenPipeError, OSError): + except OSError: pass finally: - try: + with contextlib.suppress(OSError): stdin.flush() 
stdin.close() - except OSError: - pass os.kill(os.getpid(), signal.SIGINT) class FFmpegStdinFD(FFmpegFD): diff --git a/hypervideo_dl/downloader/youtube_live_chat.py b/hypervideo_dl/downloader/youtube_live_chat.py index dd21ac8..dfd290a 100644 --- a/hypervideo_dl/downloader/youtube_live_chat.py +++ b/hypervideo_dl/downloader/youtube_live_chat.py @@ -1,24 +1,20 @@ -from __future__ import division, unicode_literals - import json import time +import urllib.error from .fragment import FragmentFD -from ..compat import compat_urllib_error from ..utils import ( - try_get, + RegexNotFoundError, + RetryManager, dict_get, int_or_none, - RegexNotFoundError, + try_get, ) -from ..extractor.youtube import YoutubeBaseInfoExtractor as YT_BaseIE class YoutubeLiveChatFD(FragmentFD): """ Downloads YouTube live chats fragment by fragment """ - FD_NAME = 'youtube_live_chat' - def real_download(self, filename, info_dict): video_id = info_dict['video_id'] self.to_screen('[%s] Downloading live chat' % self.FD_NAME) @@ -26,7 +22,6 @@ class YoutubeLiveChatFD(FragmentFD): self.report_warning('Live chat download runs until the livestream ends. ' 'If you wish to download the video simultaneously, run a separate hypervideo instance') - fragment_retries = self.params.get('fragment_retries', 0) test = self.params.get('test', False) ctx = { @@ -35,7 +30,9 @@ class YoutubeLiveChatFD(FragmentFD): 'total_frags': None, } - ie = YT_BaseIE(self.ydl) + from ..extractor.youtube import YoutubeBaseInfoExtractor + + ie = YoutubeBaseInfoExtractor(self.ydl) start_time = int(time.time() * 1000) @@ -54,7 +51,7 @@ class YoutubeLiveChatFD(FragmentFD): replay_chat_item_action = action['replayChatItemAction'] offset = int(replay_chat_item_action['videoOffsetTimeMsec']) processed_fragment.extend( - json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n') + json.dumps(action, ensure_ascii=False).encode() + b'\n') if offset is not None: continuation = try_get( live_chat_continuation, @@ -96,7 +93,7 @@ class YoutubeLiveChatFD(FragmentFD): 'isLive': True, } processed_fragment.extend( - json.dumps(pseudo_action, ensure_ascii=False).encode('utf-8') + b'\n') + json.dumps(pseudo_action, ensure_ascii=False).encode() + b'\n') continuation_data_getters = [ lambda x: x['continuations'][0]['invalidationContinuationData'], lambda x: x['continuations'][0]['timedContinuationData'], @@ -112,8 +109,7 @@ class YoutubeLiveChatFD(FragmentFD): return continuation_id, live_offset, click_tracking_params def download_and_parse_fragment(url, frag_index, request_data=None, headers=None): - count = 0 - while count <= fragment_retries: + for retry in RetryManager(self.params.get('fragment_retries'), self.report_retry, frag_index=frag_index): try: success = dl_fragment(url, request_data, headers) if not success: @@ -128,21 +124,15 @@ class YoutubeLiveChatFD(FragmentFD): live_chat_continuation = try_get( data, lambda x: x['continuationContents']['liveChatContinuation'], dict) or {} - if info_dict['protocol'] == 'youtube_live_chat_replay': - if frag_index == 1: - continuation_id, offset, click_tracking_params = try_refresh_replay_beginning(live_chat_continuation) - else: - continuation_id, offset, click_tracking_params = parse_actions_replay(live_chat_continuation) - elif info_dict['protocol'] == 'youtube_live_chat': - continuation_id, offset, click_tracking_params = parse_actions_live(live_chat_continuation) - return True, continuation_id, offset, click_tracking_params - except compat_urllib_error.HTTPError as err: - count += 1 - if count <= fragment_retries: - 
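
The rewritten `download_and_parse_fragment` in the youtube_live_chat hunk below folds the protocol branching into one `and`/`or` chain that selects a parser function, relying on functions being truthy. A sketch of how that idiom resolves, with trivial stand-ins for the three parsers:

# Sketch of the and/or dispatch used in the hunk below: the first truthy
# clause wins, so live chats pick the live parser, the first replay
# fragment refreshes the beginning, and later fragments use the replay parser.
def pick_parser(protocol, frag_index,
                parse_live=lambda: 'live',
                refresh=lambda: 'refresh',
                parse_replay=lambda: 'replay'):
    func = (protocol == 'youtube_live_chat' and parse_live
            or frag_index == 1 and refresh
            or parse_replay)
    return func()

print(pick_parser('youtube_live_chat', 5))         # -> live
print(pick_parser('youtube_live_chat_replay', 1))  # -> refresh
print(pick_parser('youtube_live_chat_replay', 7))  # -> replay
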
self.report_retry_fragment(err, frag_index, count, fragment_retries) - if count > fragment_retries: - self.report_error('giving up after %s fragment retries' % fragment_retries) - return False, None, None, None + + func = (info_dict['protocol'] == 'youtube_live_chat' and parse_actions_live + or frag_index == 1 and try_refresh_replay_beginning + or parse_actions_replay) + return (True, *func(live_chat_continuation)) + except urllib.error.HTTPError as err: + retry.error = err + continue + return False, None, None, None self._prepare_and_start_frag_download(ctx, info_dict) @@ -190,7 +180,7 @@ class YoutubeLiveChatFD(FragmentFD): request_data['context']['clickTracking'] = {'clickTrackingParams': click_tracking_params} headers = ie.generate_api_headers(ytcfg=ytcfg, visitor_data=visitor_data) headers.update({'content-type': 'application/json'}) - fragment_request_data = json.dumps(request_data, ensure_ascii=False).encode('utf-8') + b'\n' + fragment_request_data = json.dumps(request_data, ensure_ascii=False).encode() + b'\n' success, continuation_id, offset, click_tracking_params = download_and_parse_fragment( url, frag_index, fragment_request_data, headers) else: @@ -201,8 +191,7 @@ class YoutubeLiveChatFD(FragmentFD): if test: break - self._finish_frag_download(ctx, info_dict) - return True + return self._finish_frag_download(ctx, info_dict) @staticmethod def parse_live_timestamp(action): diff --git a/hypervideo_dl/extractor/__init__.py b/hypervideo_dl/extractor/__init__.py index b354842..6bfa4bd 100644 --- a/hypervideo_dl/extractor/__init__.py +++ b/hypervideo_dl/extractor/__init__.py @@ -1,33 +1,15 @@ -import os +from ..compat.compat_utils import passthrough_module -from ..utils import load_plugins - -_LAZY_LOADER = False -if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): - try: - from .lazy_extractors import * - from .lazy_extractors import _ALL_CLASSES - _LAZY_LOADER = True - except ImportError: - pass - -if not _LAZY_LOADER: - from .extractors import * - _ALL_CLASSES = [ - klass - for name, klass in globals().items() - if name.endswith('IE') and name != 'GenericIE' - ] - _ALL_CLASSES.append(GenericIE) - -_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals()) -_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES +passthrough_module(__name__, '.extractors') +del passthrough_module def gen_extractor_classes(): """ Return a list of supported extractors. The order does matter; the first extractor matched is the one handling the URL. """ + from .extractors import _ALL_CLASSES + return _ALL_CLASSES @@ -38,17 +20,23 @@ def gen_extractors(): return [klass() for klass in gen_extractor_classes()] -def list_extractors(age_limit): - """ - Return a list of extractors that are suitable for the given age, - sorted by extractor ID. 
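
The replacement `list_extractor_classes` in the hunk that follows keeps one invariant worth noting: extractors are yielded sorted by name, except the catch-all generic extractor, which must come last so it only claims URLs nothing else matched. A minimal sketch of that ordering contract:

# Toy classes standing in for real extractors; only IE_NAME matters here.
class FooIE: IE_NAME = 'foo'
class BarIE: IE_NAME = 'bar'
class GenericIE: IE_NAME = 'generic'

def list_classes(classes, generic):
    yield from sorted((c for c in classes if c is not generic),
                      key=lambda c: c.IE_NAME.lower())
    yield generic   # the fallback matcher always goes last

print([c.IE_NAME for c in list_classes([FooIE, GenericIE, BarIE], GenericIE)])
# -> ['bar', 'foo', 'generic']
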
- """ +def list_extractor_classes(age_limit=None): + """Return a list of extractors that are suitable for the given age, sorted by extractor name""" + from .generic import GenericIE + + yield from sorted(filter( + lambda ie: ie.is_suitable(age_limit) and ie != GenericIE, + gen_extractor_classes()), key=lambda ie: ie.IE_NAME.lower()) + yield GenericIE - return sorted( - filter(lambda ie: ie.is_suitable(age_limit), gen_extractors()), - key=lambda ie: ie.IE_NAME.lower()) + +def list_extractors(age_limit=None): + """Return a list of extractor instances that are suitable for the given age, sorted by extractor name""" + return [ie() for ie in list_extractor_classes(age_limit)] def get_info_extractor(ie_name): """Returns the info extractor class with the given ie_name""" - return globals()[ie_name + 'IE'] + from . import extractors + + return getattr(extractors, f'{ie_name}IE') diff --git a/hypervideo_dl/extractor/_extractors.py b/hypervideo_dl/extractor/_extractors.py new file mode 100644 index 0000000..2fe15f6 --- /dev/null +++ b/hypervideo_dl/extractor/_extractors.py @@ -0,0 +1,2354 @@ +# flake8: noqa: F401 + +from .youtube import ( # Youtube is moved to the top to improve performance + YoutubeIE, + YoutubeClipIE, + YoutubeFavouritesIE, + YoutubeNotificationsIE, + YoutubeHistoryIE, + YoutubeTabIE, + YoutubeLivestreamEmbedIE, + YoutubePlaylistIE, + YoutubeRecommendedIE, + YoutubeSearchDateIE, + YoutubeSearchIE, + YoutubeSearchURLIE, + YoutubeMusicSearchURLIE, + YoutubeSubscriptionsIE, + YoutubeStoriesIE, + YoutubeTruncatedIDIE, + YoutubeTruncatedURLIE, + YoutubeYtBeIE, + YoutubeYtUserIE, + YoutubeWatchLaterIE, + YoutubeShortsAudioPivotIE +) + +from .abc import ( + ABCIE, + ABCIViewIE, + ABCIViewShowSeriesIE, +) +from .abcnews import ( + AbcNewsIE, + AbcNewsVideoIE, +) +from .abcotvs import ( + ABCOTVSIE, + ABCOTVSClipsIE, +) +from .abematv import ( + AbemaTVIE, + AbemaTVTitleIE, +) +from .academicearth import AcademicEarthCourseIE +from .acast import ( + ACastIE, + ACastChannelIE, +) +from .acfun import AcFunVideoIE, AcFunBangumiIE +from .adn import ADNIE +from .adobeconnect import AdobeConnectIE +from .adobetv import ( + AdobeTVEmbedIE, + AdobeTVIE, + AdobeTVShowIE, + AdobeTVChannelIE, + AdobeTVVideoIE, +) +from .adultswim import AdultSwimIE +from .aenetworks import ( + AENetworksIE, + AENetworksCollectionIE, + AENetworksShowIE, + HistoryTopicIE, + HistoryPlayerIE, + BiographyIE, +) +from .aeonco import AeonCoIE +from .afreecatv import ( + AfreecaTVIE, + AfreecaTVLiveIE, + AfreecaTVUserIE, +) +from .agora import ( + TokFMAuditionIE, + TokFMPodcastIE, + WyborczaPodcastIE, + WyborczaVideoIE, +) +from .airmozilla import AirMozillaIE +from .aljazeera import AlJazeeraIE +from .alphaporno import AlphaPornoIE +from .amara import AmaraIE +from .alura import ( + AluraIE, + AluraCourseIE +) +from .amcnetworks import AMCNetworksIE +from .amazon import AmazonStoreIE +from .amazonminitv import ( + AmazonMiniTVIE, + AmazonMiniTVSeasonIE, + AmazonMiniTVSeriesIE, +) +from .americastestkitchen import ( + AmericasTestKitchenIE, + AmericasTestKitchenSeasonIE, +) +from .angel import AngelIE +from .anvato import AnvatoIE +from .aol import AolIE +from .allocine import AllocineIE +from .aliexpress import AliExpressLiveIE +from .alsace20tv import ( + Alsace20TVIE, + Alsace20TVEmbedIE, +) +from .apa import APAIE +from .aparat import AparatIE +from .appleconnect import AppleConnectIE +from .appletrailers import ( + AppleTrailersIE, + AppleTrailersSectionIE, +) +from .applepodcasts import ApplePodcastsIE +from 
.archiveorg import ( + ArchiveOrgIE, + YoutubeWebArchiveIE, +) +from .arcpublishing import ArcPublishingIE +from .arkena import ArkenaIE +from .ard import ( + ARDBetaMediathekIE, + ARDIE, + ARDMediathekIE, +) +from .arte import ( + ArteTVIE, + ArteTVEmbedIE, + ArteTVPlaylistIE, + ArteTVCategoryIE, +) +from .arnes import ArnesIE +from .asiancrush import ( + AsianCrushIE, + AsianCrushPlaylistIE, +) +from .atresplayer import AtresPlayerIE +from .atscaleconf import AtScaleConfEventIE +from .atttechchannel import ATTTechChannelIE +from .atvat import ATVAtIE +from .audimedia import AudiMediaIE +from .audioboom import AudioBoomIE +from .audiodraft import ( + AudiodraftCustomIE, + AudiodraftGenericIE, +) +from .audiomack import AudiomackIE, AudiomackAlbumIE +from .audius import ( + AudiusIE, + AudiusTrackIE, + AudiusPlaylistIE, + AudiusProfileIE, +) +from .awaan import ( + AWAANIE, + AWAANVideoIE, + AWAANLiveIE, + AWAANSeasonIE, +) +from .azmedien import AZMedienIE +from .baidu import BaiduVideoIE +from .banbye import ( + BanByeIE, + BanByeChannelIE, +) +from .bandaichannel import BandaiChannelIE +from .bandcamp import ( + BandcampIE, + BandcampAlbumIE, + BandcampWeeklyIE, + BandcampUserIE, +) +from .bannedvideo import BannedVideoIE +from .bbc import ( + BBCCoUkIE, + BBCCoUkArticleIE, + BBCCoUkIPlayerEpisodesIE, + BBCCoUkIPlayerGroupIE, + BBCCoUkPlaylistIE, + BBCIE, +) +from .beeg import BeegIE +from .behindkink import BehindKinkIE +from .bellmedia import BellMediaIE +from .beatport import BeatportIE +from .berufetv import BerufeTVIE +from .bet import BetIE +from .bfi import BFIPlayerIE +from .bfmtv import ( + BFMTVIE, + BFMTVLiveIE, + BFMTVArticleIE, +) +from .bibeltv import BibelTVIE +from .bigflix import BigflixIE +from .bigo import BigoIE +from .bild import BildIE +from .bilibili import ( + BiliBiliIE, + BiliBiliBangumiIE, + BiliBiliBangumiMediaIE, + BiliBiliSearchIE, + BilibiliCategoryIE, + BilibiliAudioIE, + BilibiliAudioAlbumIE, + BiliBiliPlayerIE, + BilibiliSpaceVideoIE, + BilibiliSpaceAudioIE, + BilibiliSpacePlaylistIE, + BiliIntlIE, + BiliIntlSeriesIE, + BiliLiveIE, +) +from .biobiochiletv import BioBioChileTVIE +from .bitchute import ( + BitChuteIE, + BitChuteChannelIE, +) +from .bitwave import ( + BitwaveReplayIE, + BitwaveStreamIE, +) +from .biqle import BIQLEIE +from .blackboardcollaborate import BlackboardCollaborateIE +from .bleacherreport import ( + BleacherReportIE, + BleacherReportCMSIE, +) +from .blogger import BloggerIE +from .bloomberg import BloombergIE +from .bokecc import BokeCCIE +from .bongacams import BongaCamsIE +from .bostonglobe import BostonGlobeIE +from .box import BoxIE +from .booyah import BooyahClipsIE +from .bpb import BpbIE +from .br import ( + BRIE, + BRMediathekIE, +) +from .bravotv import BravoTVIE +from .breakcom import BreakIE +from .breitbart import BreitBartIE +from .brightcove import ( + BrightcoveLegacyIE, + BrightcoveNewIE, +) +from .businessinsider import BusinessInsiderIE +from .bundesliga import BundesligaIE +from .buzzfeed import BuzzFeedIE +from .byutv import BYUtvIE +from .c56 import C56IE +from .cableav import CableAVIE +from .callin import CallinIE +from .caltrans import CaltransIE +from .cam4 import CAM4IE +from .camdemy import ( + CamdemyIE, + CamdemyFolderIE +) +from .cammodels import CamModelsIE +from .camsoda import CamsodaIE +from .camtasia import CamtasiaEmbedIE +from .camwithher import CamWithHerIE +from .canalalpha import CanalAlphaIE +from .canalplus import CanalplusIE +from .canalc2 import Canalc2IE +from .canvas import ( + 
CanvasIE, + CanvasEenIE, + VrtNUIE, + DagelijkseKostIE, +) +from .carambatv import ( + CarambaTVIE, + CarambaTVPageIE, +) +from .cartoonnetwork import CartoonNetworkIE +from .cbc import ( + CBCIE, + CBCPlayerIE, + CBCGemIE, + CBCGemPlaylistIE, + CBCGemLiveIE, +) +from .cbs import CBSIE +from .cbslocal import ( + CBSLocalIE, + CBSLocalArticleIE, +) +from .cbsinteractive import CBSInteractiveIE +from .cbsnews import ( + CBSNewsEmbedIE, + CBSNewsIE, + CBSNewsLiveVideoIE, +) +from .cbssports import ( + CBSSportsEmbedIE, + CBSSportsIE, + TwentyFourSevenSportsIE, +) +from .ccc import ( + CCCIE, + CCCPlaylistIE, +) +from .ccma import CCMAIE +from .cctv import CCTVIE +from .cda import CDAIE +from .cellebrite import CellebriteIE +from .ceskatelevize import CeskaTelevizeIE +from .cgtn import CGTNIE +from .channel9 import Channel9IE +from .charlierose import CharlieRoseIE +from .chaturbate import ChaturbateIE +from .chilloutzone import ChilloutzoneIE +from .chingari import ( + ChingariIE, + ChingariUserIE, +) +from .chirbit import ( + ChirbitIE, + ChirbitProfileIE, +) +from .cinchcast import CinchcastIE +from .cinemax import CinemaxIE +from .cinetecamilano import CinetecaMilanoIE +from .ciscolive import ( + CiscoLiveSessionIE, + CiscoLiveSearchIE, +) +from .ciscowebex import CiscoWebexIE +from .cjsw import CJSWIE +from .cliphunter import CliphunterIE +from .clippit import ClippitIE +from .cliprs import ClipRsIE +from .clipsyndicate import ClipsyndicateIE +from .closertotruth import CloserToTruthIE +from .cloudflarestream import CloudflareStreamIE +from .cloudy import CloudyIE +from .clubic import ClubicIE +from .clyp import ClypIE +from .cmt import CMTIE +from .cnbc import ( + CNBCIE, + CNBCVideoIE, +) +from .cnn import ( + CNNIE, + CNNBlogsIE, + CNNArticleIE, + CNNIndonesiaIE, +) +from .coub import CoubIE +from .comedycentral import ( + ComedyCentralIE, + ComedyCentralTVIE, +) +from .commonmistakes import CommonMistakesIE, UnicodeBOMIE +from .commonprotocols import ( + MmsIE, + RtmpIE, + ViewSourceIE, +) +from .condenast import CondeNastIE +from .contv import CONtvIE +from .corus import CorusIE +from .cpac import ( + CPACIE, + CPACPlaylistIE, +) +from .cozytv import CozyTVIE +from .cracked import CrackedIE +from .crackle import CrackleIE +from .craftsy import CraftsyIE +from .crooksandliars import CrooksAndLiarsIE +from .crowdbunker import ( + CrowdBunkerIE, + CrowdBunkerChannelIE, +) +from .crunchyroll import ( + CrunchyrollBetaIE, + CrunchyrollBetaShowIE, +) +from .cspan import CSpanIE, CSpanCongressIE +from .ctsnews import CtsNewsIE +from .ctv import CTVIE +from .ctvnews import CTVNewsIE +from .cultureunplugged import CultureUnpluggedIE +from .curiositystream import ( + CuriosityStreamIE, + CuriosityStreamCollectionsIE, + CuriosityStreamSeriesIE, +) +from .cwtv import CWTVIE +from .cybrary import ( + CybraryIE, + CybraryCourseIE +) +from .daftsex import DaftsexIE +from .dailymail import DailyMailIE +from .dailymotion import ( + DailymotionIE, + DailymotionPlaylistIE, + DailymotionUserIE, +) +from .dailywire import ( + DailyWireIE, + DailyWirePodcastIE, +) +from .damtomo import ( + DamtomoRecordIE, + DamtomoVideoIE, +) +from .daum import ( + DaumIE, + DaumClipIE, + DaumPlaylistIE, + DaumUserIE, +) +from .daystar import DaystarClipIE +from .dbtv import DBTVIE +from .dctp import DctpTvIE +from .deezer import ( + DeezerPlaylistIE, + DeezerAlbumIE, +) +from .democracynow import DemocracynowIE +from .detik import DetikEmbedIE +from .dfb import DFBIE +from .dhm import DHMIE +from .digg import DiggIE 
+from .dotsub import DotsubIE +from .douyutv import ( + DouyuShowIE, + DouyuTVIE, +) +from .dplay import ( + DPlayIE, + DiscoveryPlusIE, + HGTVDeIE, + GoDiscoveryIE, + TravelChannelIE, + CookingChannelIE, + HGTVUsaIE, + FoodNetworkIE, + InvestigationDiscoveryIE, + DestinationAmericaIE, + AmHistoryChannelIE, + ScienceChannelIE, + DIYNetworkIE, + DiscoveryLifeIE, + AnimalPlanetIE, + TLCIE, + MotorTrendIE, + MotorTrendOnDemandIE, + DiscoveryPlusIndiaIE, + DiscoveryNetworksDeIE, + DiscoveryPlusItalyIE, + DiscoveryPlusItalyShowIE, + DiscoveryPlusIndiaShowIE, +) +from .dreisat import DreiSatIE +from .drbonanza import DRBonanzaIE +from .drtuber import DrTuberIE +from .drtv import ( + DRTVIE, + DRTVLiveIE, +) +from .dtube import DTubeIE +from .dvtv import DVTVIE +from .duboku import ( + DubokuIE, + DubokuPlaylistIE +) +from .dumpert import DumpertIE +from .defense import DefenseGouvFrIE +from .deuxm import ( + DeuxMIE, + DeuxMNewsIE +) +from .digitalconcerthall import DigitalConcertHallIE +from .discovery import DiscoveryIE +from .disney import DisneyIE +from .dispeak import DigitallySpeakingIE +from .dropbox import DropboxIE +from .dropout import ( + DropoutSeasonIE, + DropoutIE +) +from .dw import ( + DWIE, + DWArticleIE, +) +from .eagleplatform import EaglePlatformIE, ClipYouEmbedIE +from .ebaumsworld import EbaumsWorldIE +from .echomsk import EchoMskIE +from .egghead import ( + EggheadCourseIE, + EggheadLessonIE, +) +from .ehow import EHowIE +from .eighttracks import EightTracksIE +from .einthusan import EinthusanIE +from .eitb import EitbIE +from .ellentube import ( + EllenTubeIE, + EllenTubeVideoIE, + EllenTubePlaylistIE, +) +from .elonet import ElonetIE +from .elpais import ElPaisIE +from .embedly import EmbedlyIE +from .engadget import EngadgetIE +from .epicon import ( + EpiconIE, + EpiconSeriesIE, +) +from .epoch import EpochIE +from .eporner import EpornerIE +from .eroprofile import ( + EroProfileIE, + EroProfileAlbumIE, +) +from .ertgr import ( + ERTFlixCodenameIE, + ERTFlixIE, + ERTWebtvEmbedIE, +) +from .escapist import EscapistIE +from .espn import ( + ESPNIE, + WatchESPNIE, + ESPNArticleIE, + FiveThirtyEightIE, + ESPNCricInfoIE, +) +from .esri import EsriVideoIE +from .europa import EuropaIE +from .europeantour import EuropeanTourIE +from .eurosport import EurosportIE +from .euscreen import EUScreenIE +from .expotv import ExpoTVIE +from .expressen import ExpressenIE +from .extremetube import ExtremeTubeIE +from .eyedotv import EyedoTVIE +from .facebook import ( + FacebookIE, + FacebookPluginsVideoIE, + FacebookRedirectURLIE, + FacebookReelIE, +) +from .fancode import ( + FancodeVodIE, + FancodeLiveIE +) + +from .faz import FazIE +from .fc2 import ( + FC2IE, + FC2EmbedIE, + FC2LiveIE, +) +from .fczenit import FczenitIE +from .fifa import FifaIE +from .filmmodu import FilmmoduIE +from .filmon import ( + FilmOnIE, + FilmOnChannelIE, +) +from .filmweb import FilmwebIE +from .firsttv import FirstTVIE +from .fivetv import FiveTVIE +from .flickr import FlickrIE +from .folketinget import FolketingetIE +from .footyroom import FootyRoomIE +from .formula1 import Formula1IE +from .fourtube import ( + FourTubeIE, + PornTubeIE, + PornerBrosIE, + FuxIE, +) +from .fourzerostudio import ( + FourZeroStudioArchiveIE, + FourZeroStudioClipIE, +) +from .fox import FOXIE +from .fox9 import ( + FOX9IE, + FOX9NewsIE, +) +from .foxgay import FoxgayIE +from .foxnews import ( + FoxNewsIE, + FoxNewsArticleIE, + FoxNewsVideoIE, +) +from .foxsports import FoxSportsIE +from .fptplay import FptplayIE +from 
.franceinter import FranceInterIE +from .francetv import ( + FranceTVIE, + FranceTVSiteIE, + FranceTVInfoIE, +) +from .freesound import FreesoundIE +from .freespeech import FreespeechIE +from .frontendmasters import ( + FrontendMastersIE, + FrontendMastersLessonIE, + FrontendMastersCourseIE +) +from .freetv import ( + FreeTvIE, + FreeTvMoviesIE, +) +from .fujitv import FujiTVFODPlus7IE +from .funimation import ( + FunimationIE, + FunimationPageIE, + FunimationShowIE, +) +from .funk import FunkIE +from .fusion import FusionIE +from .fuyintv import FuyinTVIE +from .gab import ( + GabTVIE, + GabIE, +) +from .gaia import GaiaIE +from .gameinformer import GameInformerIE +from .gamejolt import ( + GameJoltIE, + GameJoltUserIE, + GameJoltGameIE, + GameJoltGameSoundtrackIE, + GameJoltCommunityIE, + GameJoltSearchIE, +) +from .gamespot import GameSpotIE +from .gamestar import GameStarIE +from .gaskrank import GaskrankIE +from .gazeta import GazetaIE +from .gdcvault import GDCVaultIE +from .gedidigital import GediDigitalIE +from .generic import GenericIE +from .genius import ( + GeniusIE, + GeniusLyricsIE, +) +from .gettr import ( + GettrIE, + GettrStreamingIE, +) +from .gfycat import GfycatIE +from .giantbomb import GiantBombIE +from .giga import GigaIE +from .glide import GlideIE +from .globo import ( + GloboIE, + GloboArticleIE, +) +from .go import GoIE +from .godtube import GodTubeIE +from .gofile import GofileIE +from .golem import GolemIE +from .goodgame import GoodGameIE +from .googledrive import ( + GoogleDriveIE, + GoogleDriveFolderIE, +) +from .googlepodcasts import ( + GooglePodcastsIE, + GooglePodcastsFeedIE, +) +from .googlesearch import GoogleSearchIE +from .gopro import GoProIE +from .goplay import GoPlayIE +from .goshgay import GoshgayIE +from .gotostage import GoToStageIE +from .gputechconf import GPUTechConfIE +from .gronkh import ( + GronkhIE, + GronkhFeedIE, + GronkhVodsIE +) +from .groupon import GrouponIE +from .harpodeon import HarpodeonIE +from .hbo import HBOIE +from .hearthisat import HearThisAtIE +from .heise import HeiseIE +from .hellporno import HellPornoIE +from .helsinki import HelsinkiIE +from .hentaistigma import HentaiStigmaIE +from .hgtv import HGTVComShowIE +from .hketv import HKETVIE +from .hidive import HiDiveIE +from .historicfilms import HistoricFilmsIE +from .hitbox import HitboxIE, HitboxLiveIE +from .hitrecord import HitRecordIE +from .holodex import HolodexIE +from .hotnewhiphop import HotNewHipHopIE +from .hotstar import ( + HotStarIE, + HotStarPrefixIE, + HotStarPlaylistIE, + HotStarSeasonIE, + HotStarSeriesIE, +) +from .howcast import HowcastIE +from .howstuffworks import HowStuffWorksIE +from .hrfensehen import HRFernsehenIE +from .hrti import ( + HRTiIE, + HRTiPlaylistIE, +) +from .hse import ( + HSEShowIE, + HSEProductIE, +) +from .genericembeds import ( + HTML5MediaEmbedIE, + QuotedHTMLIE, +) +from .huajiao import HuajiaoIE +from .huya import HuyaLiveIE +from .huffpost import HuffPostIE +from .hungama import ( + HungamaIE, + HungamaSongIE, + HungamaAlbumPlaylistIE, +) +from .hypem import HypemIE +from .hytale import HytaleIE +from .icareus import IcareusIE +from .ichinanalive import ( + IchinanaLiveIE, + IchinanaLiveClipIE, +) +from .ign import ( + IGNIE, + IGNVideoIE, + IGNArticleIE, +) +from .iheart import ( + IHeartRadioIE, + IHeartRadioPodcastIE, +) +from .iltalehti import IltalehtiIE +from .imdb import ( + ImdbIE, + ImdbListIE +) +from .imgur import ( + ImgurIE, + ImgurAlbumIE, + ImgurGalleryIE, +) +from .ina import InaIE +from .inc import 
IncIE +from .indavideo import IndavideoEmbedIE +from .infoq import InfoQIE +from .instagram import ( + InstagramIE, + InstagramIOSIE, + InstagramUserIE, + InstagramTagIE, + InstagramStoryIE, +) +from .internazionale import InternazionaleIE +from .internetvideoarchive import InternetVideoArchiveIE +from .iprima import ( + IPrimaIE, + IPrimaCNNIE +) +from .iqiyi import ( + IqiyiIE, + IqIE, + IqAlbumIE +) +from .islamchannel import ( + IslamChannelIE, + IslamChannelSeriesIE, +) +from .israelnationalnews import IsraelNationalNewsIE +from .itprotv import ( + ITProTVIE, + ITProTVCourseIE +) +from .itv import ( + ITVIE, + ITVBTCCIE, +) +from .ivi import ( + IviIE, + IviCompilationIE +) +from .ivideon import IvideonIE +from .iwara import ( + IwaraIE, + IwaraPlaylistIE, + IwaraUserIE, +) +from .ixigua import IxiguaIE +from .izlesene import IzleseneIE +from .jable import ( + JableIE, + JablePlaylistIE, +) +from .jamendo import ( + JamendoIE, + JamendoAlbumIE, +) +from .japandiet import ( + ShugiinItvLiveIE, + ShugiinItvLiveRoomIE, + ShugiinItvVodIE, + SangiinInstructionIE, + SangiinIE, +) +from .jeuxvideo import JeuxVideoIE +from .jove import JoveIE +from .joj import JojIE +from .jwplatform import JWPlatformIE +from .kakao import KakaoIE +from .kaltura import KalturaIE +from .kanal2 import Kanal2IE +from .karaoketv import KaraoketvIE +from .karrierevideos import KarriereVideosIE +from .keezmovies import KeezMoviesIE +from .kelbyone import KelbyOneIE +from .ketnet import KetnetIE +from .khanacademy import ( + KhanAcademyIE, + KhanAcademyUnitIE, +) +from .kicker import KickerIE +from .kickstarter import KickStarterIE +from .kinja import KinjaEmbedIE +from .kinopoisk import KinoPoiskIE +from .kompas import KompasVideoIE +from .konserthusetplay import KonserthusetPlayIE +from .koo import KooIE +from .kth import KTHIE +from .krasview import KrasViewIE +from .ku6 import Ku6IE +from .kusi import KUSIIE +from .kuwo import ( + KuwoIE, + KuwoAlbumIE, + KuwoChartIE, + KuwoSingerIE, + KuwoCategoryIE, + KuwoMvIE, +) +from .la7 import ( + LA7IE, + LA7PodcastEpisodeIE, + LA7PodcastIE, +) +from .laola1tv import ( + Laola1TvEmbedIE, + Laola1TvIE, + EHFTVIE, + ITTFIE, +) +from .lastfm import ( + LastFMIE, + LastFMPlaylistIE, + LastFMUserIE, +) +from .lbry import ( + LBRYIE, + LBRYChannelIE, +) +from .lci import LCIIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) +from .lecture2go import Lecture2GoIE +from .lecturio import ( + LecturioIE, + LecturioCourseIE, + LecturioDeCourseIE, +) +from .leeco import ( + LeIE, + LePlaylistIE, + LetvCloudIE, +) +from .lego import LEGOIE +from .lemonde import LemondeIE +from .lenta import LentaIE +from .libraryofcongress import LibraryOfCongressIE +from .libsyn import LibsynIE +from .lifenews import ( + LifeNewsIE, + LifeEmbedIE, +) +from .likee import ( + LikeeIE, + LikeeUserIE +) +from .limelight import ( + LimelightMediaIE, + LimelightChannelIE, + LimelightChannelListIE, +) +from .line import ( + LineLiveIE, + LineLiveChannelIE, +) +from .linkedin import ( + LinkedInIE, + LinkedInLearningIE, + LinkedInLearningCourseIE, +) +from .linuxacademy import LinuxAcademyIE +from .liputan6 import Liputan6IE +from .listennotes import ListenNotesIE +from .litv import LiTVIE +from .livejournal import LiveJournalIE +from .livestream import ( + LivestreamIE, + LivestreamOriginalIE, + LivestreamShortenerIE, +) +from .livestreamfails import LivestreamfailsIE +from .lnkgo import ( + LnkGoIE, + LnkIE, +) +from .localnews8 import LocalNews8IE +from .lovehomeporn import LoveHomePornIE +from .lrt import ( 
+ LRTVODIE, + LRTStreamIE +) +from .lynda import ( + LyndaIE, + LyndaCourseIE +) +from .m6 import M6IE +from .magentamusik360 import MagentaMusik360IE +from .mailru import ( + MailRuIE, + MailRuMusicIE, + MailRuMusicSearchIE, +) +from .mainstreaming import MainStreamingIE +from .malltv import MallTVIE +from .mangomolo import ( + MangomoloVideoIE, + MangomoloLiveIE, +) +from .manoto import ( + ManotoTVIE, + ManotoTVShowIE, + ManotoTVLiveIE, +) +from .manyvids import ManyVidsIE +from .maoritv import MaoriTVIE +from .markiza import ( + MarkizaIE, + MarkizaPageIE, +) +from .massengeschmacktv import MassengeschmackTVIE +from .masters import MastersIE +from .matchtv import MatchTVIE +from .mdr import MDRIE +from .medaltv import MedalTVIE +from .mediaite import MediaiteIE +from .mediaklikk import MediaKlikkIE +from .mediaset import ( + MediasetIE, + MediasetShowIE, +) +from .mediasite import ( + MediasiteIE, + MediasiteCatalogIE, + MediasiteNamedCatalogIE, +) +from .mediaworksnz import MediaWorksNZVODIE +from .medici import MediciIE +from .megaphone import MegaphoneIE +from .meipai import MeipaiIE +from .melonvod import MelonVODIE +from .meta import METAIE +from .metacafe import MetacafeIE +from .metacritic import MetacriticIE +from .mgoon import MgoonIE +from .mgtv import MGTVIE +from .miaopai import MiaoPaiIE +from .microsoftstream import MicrosoftStreamIE +from .microsoftvirtualacademy import ( + MicrosoftVirtualAcademyIE, + MicrosoftVirtualAcademyCourseIE, +) +from .microsoftembed import MicrosoftEmbedIE +from .mildom import ( + MildomIE, + MildomVodIE, + MildomClipIE, + MildomUserVodIE, +) +from .minds import ( + MindsIE, + MindsChannelIE, + MindsGroupIE, +) +from .ministrygrid import MinistryGridIE +from .minoto import MinotoIE +from .miomio import MioMioIE +from .mirrativ import ( + MirrativIE, + MirrativUserIE, +) +from .mirrorcouk import MirrorCoUKIE +from .mit import TechTVMITIE, OCWMITIE +from .mitele import MiTeleIE +from .mixch import ( + MixchIE, + MixchArchiveIE, +) +from .mixcloud import ( + MixcloudIE, + MixcloudUserIE, + MixcloudPlaylistIE, +) +from .mlb import ( + MLBIE, + MLBVideoIE, + MLBTVIE, + MLBArticleIE, +) +from .mlssoccer import MLSSoccerIE +from .mnet import MnetIE +from .mocha import MochaVideoIE +from .moevideo import MoeVideoIE +from .mofosex import ( + MofosexIE, + MofosexEmbedIE, +) +from .mojvideo import MojvideoIE +from .morningstar import MorningstarIE +from .motherless import ( + MotherlessIE, + MotherlessGroupIE +) +from .motorsport import MotorsportIE +from .movieclips import MovieClipsIE +from .moviepilot import MoviepilotIE +from .moview import MoviewPlayIE +from .moviezine import MoviezineIE +from .movingimage import MovingImageIE +from .msn import MSNIE +from .mtv import ( + MTVIE, + MTVVideoIE, + MTVServicesEmbeddedIE, + MTVDEIE, + MTVJapanIE, + MTVItaliaIE, + MTVItaliaProgrammaIE, +) +from .muenchentv import MuenchenTVIE +from .murrtube import MurrtubeIE, MurrtubeUserIE +from .musescore import MuseScoreIE +from .musicdex import ( + MusicdexSongIE, + MusicdexAlbumIE, + MusicdexArtistIE, + MusicdexPlaylistIE, +) +from .mwave import MwaveIE, MwaveMeetGreetIE +from .mxplayer import ( + MxplayerIE, + MxplayerShowIE, +) +from .mychannels import MyChannelsIE +from .myspace import MySpaceIE, MySpaceAlbumIE +from .myspass import MySpassIE +from .myvi import ( + MyviIE, + MyviEmbedIE, +) +from .myvideoge import MyVideoGeIE +from .myvidster import MyVidsterIE +from .n1 import ( + N1InfoAssetIE, + N1InfoIIE, +) +from .nate import ( + NateIE, + NateProgramIE, +) 
+from .nationalgeographic import ( + NationalGeographicVideoIE, + NationalGeographicTVIE, +) +from .naver import ( + NaverIE, + NaverLiveIE, + NaverNowIE, +) +from .nba import ( + NBAWatchEmbedIE, + NBAWatchIE, + NBAWatchCollectionIE, + NBAEmbedIE, + NBAIE, + NBAChannelIE, +) +from .nbc import ( + NBCIE, + NBCNewsIE, + NBCOlympicsIE, + NBCOlympicsStreamIE, + NBCSportsIE, + NBCSportsStreamIE, + NBCSportsVPlayerIE, + NBCStationsIE, +) +from .ndr import ( + NDRIE, + NJoyIE, + NDREmbedBaseIE, + NDREmbedIE, + NJoyEmbedIE, +) +from .ndtv import NDTVIE +from .nebula import ( + NebulaIE, + NebulaSubscriptionsIE, + NebulaChannelIE, +) +from .nerdcubed import NerdCubedFeedIE +from .netzkino import NetzkinoIE +from .neteasemusic import ( + NetEaseMusicIE, + NetEaseMusicAlbumIE, + NetEaseMusicSingerIE, + NetEaseMusicListIE, + NetEaseMusicMvIE, + NetEaseMusicProgramIE, + NetEaseMusicDjRadioIE, +) +from .netverse import ( + NetverseIE, + NetversePlaylistIE, +) +from .newgrounds import ( + NewgroundsIE, + NewgroundsPlaylistIE, + NewgroundsUserIE, +) +from .newspicks import NewsPicksIE +from .newstube import NewstubeIE +from .newsy import NewsyIE +from .nextmedia import ( + NextMediaIE, + NextMediaActionNewsIE, + AppleDailyIE, + NextTVIE, +) +from .nexx import ( + NexxIE, + NexxEmbedIE, +) +from .nfb import NFBIE +from .nfhsnetwork import NFHSNetworkIE +from .nfl import ( + NFLIE, + NFLArticleIE, +) +from .nhk import ( + NhkVodIE, + NhkVodProgramIE, + NhkForSchoolBangumiIE, + NhkForSchoolSubjectIE, + NhkForSchoolProgramListIE, +) +from .nhl import NHLIE +from .nick import ( + NickIE, + NickBrIE, + NickDeIE, + NickNightIE, + NickRuIE, +) +from .niconico import ( + NiconicoIE, + NiconicoPlaylistIE, + NiconicoUserIE, + NiconicoSeriesIE, + NiconicoHistoryIE, + NicovideoSearchDateIE, + NicovideoSearchIE, + NicovideoSearchURLIE, + NicovideoTagURLIE, +) +from .ninecninemedia import ( + NineCNineMediaIE, + CPTwentyFourIE, +) +from .ninegag import NineGagIE +from .ninenow import NineNowIE +from .nintendo import NintendoIE +from .nitter import NitterIE +from .njpwworld import NJPWWorldIE +from .nobelprize import NobelPrizeIE +from .nonktube import NonkTubeIE +from .noodlemagazine import NoodleMagazineIE +from .noovo import NoovoIE +from .normalboots import NormalbootsIE +from .nosvideo import NosVideoIE +from .nosnl import NOSNLArticleIE +from .nova import ( + NovaEmbedIE, + NovaIE, +) +from .novaplay import NovaPlayIE +from .nowness import ( + NownessIE, + NownessPlaylistIE, + NownessSeriesIE, +) +from .noz import NozIE +from .npo import ( + AndereTijdenIE, + NPOIE, + NPOLiveIE, + NPORadioIE, + NPORadioFragmentIE, + SchoolTVIE, + HetKlokhuisIE, + VPROIE, + WNLIE, +) +from .npr import NprIE +from .nrk import ( + NRKIE, + NRKPlaylistIE, + NRKSkoleIE, + NRKTVIE, + NRKTVDirekteIE, + NRKRadioPodkastIE, + NRKTVEpisodeIE, + NRKTVEpisodesIE, + NRKTVSeasonIE, + NRKTVSeriesIE, +) +from .nrl import NRLTVIE +from .ntvcojp import NTVCoJpCUIE +from .ntvde import NTVDeIE +from .ntvru import NTVRuIE +from .nytimes import ( + NYTimesIE, + NYTimesArticleIE, + NYTimesCookingIE, +) +from .nuvid import NuvidIE +from .nzherald import NZHeraldIE +from .nzz import NZZIE +from .odatv import OdaTVIE +from .odnoklassniki import OdnoklassnikiIE +from .oftv import ( + OfTVIE, + OfTVPlaylistIE +) +from .oktoberfesttv import OktoberfestTVIE +from .olympics import OlympicsReplayIE +from .on24 import On24IE +from .ondemandkorea import OnDemandKoreaIE +from .onefootball import OneFootballIE +from .onenewsnz import OneNewsNZIE +from .onet import ( 
+ OnetIE, + OnetChannelIE, + OnetMVPIE, + OnetPlIE, +) +from .onionstudios import OnionStudiosIE +from .ooyala import ( + OoyalaIE, + OoyalaExternalIE, +) +from .opencast import ( + OpencastIE, + OpencastPlaylistIE, +) +from .openrec import ( + OpenRecIE, + OpenRecCaptureIE, + OpenRecMovieIE, +) +from .ora import OraTVIE +from .orf import ( + ORFTVthekIE, + ORFFM4StoryIE, + ORFRadioIE, + ORFIPTVIE, +) +from .outsidetv import OutsideTVIE +from .packtpub import ( + PacktPubIE, + PacktPubCourseIE, +) +from .palcomp3 import ( + PalcoMP3IE, + PalcoMP3ArtistIE, + PalcoMP3VideoIE, +) +from .pandoratv import PandoraTVIE +from .panopto import ( + PanoptoIE, + PanoptoListIE, + PanoptoPlaylistIE +) +from .paramountplus import ( + ParamountPlusIE, + ParamountPlusSeriesIE, +) +from .parler import ParlerIE +from .parlview import ParlviewIE +from .patreon import ( + PatreonIE, + PatreonCampaignIE +) +from .pbs import PBSIE +from .pearvideo import PearVideoIE +from .peekvids import PeekVidsIE, PlayVidsIE +from .peertube import ( + PeerTubeIE, + PeerTubePlaylistIE, +) +from .peertv import PeerTVIE +from .peloton import ( + PelotonIE, + PelotonLiveIE +) +from .people import PeopleIE +from .performgroup import PerformGroupIE +from .periscope import ( + PeriscopeIE, + PeriscopeUserIE, +) +from .philharmoniedeparis import PhilharmonieDeParisIE +from .phoenix import PhoenixIE +from .photobucket import PhotobucketIE +from .piapro import PiaproIE +from .picarto import ( + PicartoIE, + PicartoVodIE, +) +from .piksel import PikselIE +from .pinkbike import PinkbikeIE +from .pinterest import ( + PinterestIE, + PinterestCollectionIE, +) +from .pixivsketch import ( + PixivSketchIE, + PixivSketchUserIE, +) +from .pladform import PladformIE +from .planetmarathi import PlanetMarathiIE +from .platzi import ( + PlatziIE, + PlatziCourseIE, +) +from .playfm import PlayFMIE +from .playplustv import PlayPlusTVIE +from .plays import PlaysTVIE +from .playstuff import PlayStuffIE +from .playsuisse import PlaySuisseIE +from .playtvak import PlaytvakIE +from .playvid import PlayvidIE +from .playwire import PlaywireIE +from .plutotv import PlutoTVIE +from .pluralsight import ( + PluralsightIE, + PluralsightCourseIE, +) +from .podbayfm import PodbayFMIE, PodbayFMChannelIE +from .podchaser import PodchaserIE +from .podomatic import PodomaticIE +from .pokemon import ( + PokemonIE, + PokemonWatchIE, +) +from .pokergo import ( + PokerGoIE, + PokerGoCollectionIE, +) +from .polsatgo import PolsatGoIE +from .polskieradio import ( + PolskieRadioIE, + PolskieRadioCategoryIE, + PolskieRadioPlayerIE, + PolskieRadioPodcastIE, + PolskieRadioPodcastListIE, + PolskieRadioRadioKierowcowIE, +) +from .popcorntimes import PopcorntimesIE +from .popcorntv import PopcornTVIE +from .porn91 import Porn91IE +from .porncom import PornComIE +from .pornflip import PornFlipIE +from .pornhd import PornHdIE +from .pornhub import ( + PornHubIE, + PornHubUserIE, + PornHubPlaylistIE, + PornHubPagedVideoListIE, + PornHubUserVideosUploadIE, +) +from .pornotube import PornotubeIE +from .pornovoisines import PornoVoisinesIE +from .pornoxo import PornoXOIE +from .pornez import PornezIE +from .puhutv import ( + PuhuTVIE, + PuhuTVSerieIE, +) +from .prankcast import PrankCastIE +from .premiershiprugby import PremiershipRugbyIE +from .presstv import PressTVIE +from .projectveritas import ProjectVeritasIE +from .prosiebensat1 import ProSiebenSat1IE +from .prx import ( + PRXStoryIE, + PRXSeriesIE, + PRXAccountIE, + PRXStoriesSearchIE, + PRXSeriesSearchIE +) +from .puls4 import 
Puls4IE +from .pyvideo import PyvideoIE +from .qingting import QingTingIE +from .qqmusic import ( + QQMusicIE, + QQMusicSingerIE, + QQMusicAlbumIE, + QQMusicToplistIE, + QQMusicPlaylistIE, +) +from .r7 import ( + R7IE, + R7ArticleIE, +) +from .radiko import RadikoIE, RadikoRadioIE +from .radiocanada import ( + RadioCanadaIE, + RadioCanadaAudioVideoIE, +) +from .radiode import RadioDeIE +from .radiojavan import RadioJavanIE +from .radiobremen import RadioBremenIE +from .radiofrance import FranceCultureIE, RadioFranceIE +from .radiozet import RadioZetPodcastIE +from .radiokapital import ( + RadioKapitalIE, + RadioKapitalShowIE, +) +from .radlive import ( + RadLiveIE, + RadLiveChannelIE, + RadLiveSeasonIE, +) +from .rai import ( + RaiPlayIE, + RaiPlayLiveIE, + RaiPlayPlaylistIE, + RaiPlaySoundIE, + RaiPlaySoundLiveIE, + RaiPlaySoundPlaylistIE, + RaiNewsIE, + RaiSudtirolIE, + RaiIE, +) +from .raywenderlich import ( + RayWenderlichIE, + RayWenderlichCourseIE, +) +from .rbmaradio import RBMARadioIE +from .rcs import ( + RCSIE, + RCSEmbedsIE, + RCSVariousIE, +) +from .rcti import ( + RCTIPlusIE, + RCTIPlusSeriesIE, + RCTIPlusTVIE, +) +from .rds import RDSIE +from .redbee import ParliamentLiveUKIE, RTBFIE +from .redbulltv import ( + RedBullTVIE, + RedBullEmbedIE, + RedBullTVRrnContentIE, + RedBullIE, +) +from .reddit import RedditIE +from .redgifs import ( + RedGifsIE, + RedGifsSearchIE, + RedGifsUserIE, +) +from .redtube import RedTubeIE +from .regiotv import RegioTVIE +from .rentv import ( + RENTVIE, + RENTVArticleIE, +) +from .restudy import RestudyIE +from .reuters import ReutersIE +from .reverbnation import ReverbNationIE +from .rice import RICEIE +from .rmcdecouverte import RMCDecouverteIE +from .rockstargames import RockstarGamesIE +from .rokfin import ( + RokfinIE, + RokfinStackIE, + RokfinChannelIE, + RokfinSearchIE, +) +from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE +from .rottentomatoes import RottenTomatoesIE +from .rozhlas import RozhlasIE +from .rte import RteIE, RteRadioIE +from .rtlnl import ( + RtlNlIE, + RTLLuTeleVODIE, + RTLLuArticleIE, + RTLLuLiveIE, + RTLLuRadioIE, +) +from .rtl2 import ( + RTL2IE, + RTL2YouIE, + RTL2YouSeriesIE, +) +from .rtnews import ( + RTNewsIE, + RTDocumentryIE, + RTDocumentryPlaylistIE, + RuptlyIE, +) +from .rtp import RTPIE +from .rtrfm import RTRFMIE +from .rts import RTSIE +from .rtve import ( + RTVEALaCartaIE, + RTVEAudioIE, + RTVELiveIE, + RTVEInfantilIE, + RTVETelevisionIE, +) +from .rtvnh import RTVNHIE +from .rtvs import RTVSIE +from .rtvslo import RTVSLOIE +from .ruhd import RUHDIE +from .rule34video import Rule34VideoIE +from .rumble import ( + RumbleEmbedIE, + RumbleChannelIE, +) +from .rutube import ( + RutubeIE, + RutubeChannelIE, + RutubeEmbedIE, + RutubeMovieIE, + RutubePersonIE, + RutubePlaylistIE, + RutubeTagsIE, +) +from .glomex import ( + GlomexIE, + GlomexEmbedIE, +) +from .megatvcom import ( + MegaTVComIE, + MegaTVComEmbedIE, +) +from .ant1newsgr import ( + Ant1NewsGrWatchIE, + Ant1NewsGrArticleIE, + Ant1NewsGrEmbedIE, +) +from .rutv import RUTVIE +from .ruutu import RuutuIE +from .ruv import ( + RuvIE, + RuvSpilaIE +) +from .safari import ( + SafariIE, + SafariApiIE, + SafariCourseIE, +) +from .saitosan import SaitosanIE +from .samplefocus import SampleFocusIE +from .sapo import SapoIE +from .savefrom import SaveFromIE +from .sbs import SBSIE +from .screen9 import Screen9IE +from .screencast import ScreencastIE +from .screencastify import ScreencastifyIE +from .screencastomatic import ScreencastOMaticIE +from 
.scrippsnetworks import ( + ScrippsNetworksWatchIE, + ScrippsNetworksIE, +) +from .scte import ( + SCTEIE, + SCTECourseIE, +) +from .scrolller import ScrolllerIE +from .seeker import SeekerIE +from .senategov import SenateISVPIE, SenateGovIE +from .sendtonews import SendtoNewsIE +from .servus import ServusIE +from .sevenplus import SevenPlusIE +from .sexu import SexuIE +from .seznamzpravy import ( + SeznamZpravyIE, + SeznamZpravyArticleIE, +) +from .shahid import ( + ShahidIE, + ShahidShowIE, +) +from .shared import ( + SharedIE, + VivoIE, +) +from .sharevideos import ShareVideosEmbedIE +from .shemaroome import ShemarooMeIE +from .showroomlive import ShowRoomLiveIE +from .simplecast import ( + SimplecastIE, + SimplecastEpisodeIE, + SimplecastPodcastIE, +) +from .sina import SinaIE +from .sixplay import SixPlayIE +from .skeb import SkebIE +from .skyit import ( + SkyItPlayerIE, + SkyItVideoIE, + SkyItVideoLiveIE, + SkyItIE, + SkyItArteIE, + CieloTVItIE, + TV8ItIE, +) +from .skylinewebcams import SkylineWebcamsIE +from .skynewsarabia import ( + SkyNewsArabiaIE, + SkyNewsArabiaArticleIE, +) +from .skynewsau import SkyNewsAUIE +from .sky import ( + SkyNewsIE, + SkyNewsStoryIE, + SkySportsIE, + SkySportsNewsIE, +) +from .slideshare import SlideshareIE +from .slideslive import SlidesLiveIE +from .slutload import SlutloadIE +from .smotrim import SmotrimIE +from .snotr import SnotrIE +from .sohu import SohuIE +from .sonyliv import ( + SonyLIVIE, + SonyLIVSeriesIE, +) +from .soundcloud import ( + SoundcloudEmbedIE, + SoundcloudIE, + SoundcloudSetIE, + SoundcloudRelatedIE, + SoundcloudUserIE, + SoundcloudTrackStationIE, + SoundcloudPlaylistIE, + SoundcloudSearchIE, +) +from .soundgasm import ( + SoundgasmIE, + SoundgasmProfileIE +) +from .southpark import ( + SouthParkIE, + SouthParkDeIE, + SouthParkDkIE, + SouthParkEsIE, + SouthParkLatIE, + SouthParkNlIE +) +from .sovietscloset import ( + SovietsClosetIE, + SovietsClosetPlaylistIE +) +from .spankbang import ( + SpankBangIE, + SpankBangPlaylistIE, +) +from .spankwire import SpankwireIE +from .spiegel import SpiegelIE +from .spike import ( + BellatorIE, + ParamountNetworkIE, +) +from .startrek import StarTrekIE +from .stitcher import ( + StitcherIE, + StitcherShowIE, +) +from .sport5 import Sport5IE +from .sportbox import SportBoxIE +from .sportdeutschland import SportDeutschlandIE +from .spotify import ( + SpotifyIE, + SpotifyShowIE, +) +from .spreaker import ( + SpreakerIE, + SpreakerPageIE, + SpreakerShowIE, + SpreakerShowPageIE, +) +from .springboardplatform import SpringboardPlatformIE +from .sprout import SproutIE +from .srgssr import ( + SRGSSRIE, + SRGSSRPlayIE, +) +from .srmediathek import SRMediathekIE +from .stanfordoc import StanfordOpenClassroomIE +from .startv import StarTVIE +from .steam import ( + SteamIE, + SteamCommunityBroadcastIE, +) +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) +from .streamable import StreamableIE +from .streamanity import StreamanityIE +from .streamcloud import StreamcloudIE +from .streamcz import StreamCZIE +from .streamff import StreamFFIE +from .streetvoice import StreetVoiceIE +from .stretchinternet import StretchInternetIE +from .stripchat import StripchatIE +from .stv import STVPlayerIE +from .substack import SubstackIE +from .sunporno import SunPornoIE +from .sverigesradio import ( + SverigesRadioEpisodeIE, + SverigesRadioPublicationIE, +) +from .svt import ( + SVTIE, + SVTPageIE, + SVTPlayIE, + SVTSeriesIE, +) +from .swearnet import SwearnetEpisodeIE +from 
.swrmediathek import SWRMediathekIE +from .syvdk import SYVDKIE +from .syfy import SyfyIE +from .sztvhu import SztvHuIE +from .tagesschau import TagesschauIE +from .tass import TassIE +from .tbs import TBSIE +from .tdslifeway import TDSLifewayIE +from .teachable import ( + TeachableIE, + TeachableCourseIE, +) +from .teachertube import ( + TeacherTubeIE, + TeacherTubeUserIE, +) +from .teachingchannel import TeachingChannelIE +from .teamcoco import TeamcocoIE +from .teamtreehouse import TeamTreeHouseIE +from .techtalks import TechTalksIE +from .ted import ( + TedEmbedIE, + TedPlaylistIE, + TedSeriesIE, + TedTalkIE, +) +from .tele5 import Tele5IE +from .tele13 import Tele13IE +from .telebruxelles import TeleBruxellesIE +from .telecinco import TelecincoIE +from .telegraaf import TelegraafIE +from .telegram import TelegramEmbedIE +from .telemb import TeleMBIE +from .telemundo import TelemundoIE +from .telequebec import ( + TeleQuebecIE, + TeleQuebecSquatIE, + TeleQuebecEmissionIE, + TeleQuebecLiveIE, + TeleQuebecVideoIE, +) +from .teletask import TeleTaskIE +from .telewebion import TelewebionIE +from .tempo import TempoIE +from .tencent import ( + IflixEpisodeIE, + IflixSeriesIE, + VQQSeriesIE, + VQQVideoIE, + WeTvEpisodeIE, + WeTvSeriesIE, +) +from .tennistv import TennisTVIE +from .tenplay import TenPlayIE +from .testurl import TestURLIE +from .tf1 import TF1IE +from .tfo import TFOIE +from .theholetv import TheHoleTvIE +from .theintercept import TheInterceptIE +from .theplatform import ( + ThePlatformIE, + ThePlatformFeedIE, +) +from .thestar import TheStarIE +from .thesun import TheSunIE +from .theta import ( + ThetaVideoIE, + ThetaStreamIE, +) +from .theweatherchannel import TheWeatherChannelIE +from .thisamericanlife import ThisAmericanLifeIE +from .thisav import ThisAVIE +from .thisoldhouse import ThisOldHouseIE +from .threespeak import ( + ThreeSpeakIE, + ThreeSpeakUserIE, +) +from .threeqsdn import ThreeQSDNIE +from .tiktok import ( + TikTokIE, + TikTokUserIE, + TikTokSoundIE, + TikTokEffectIE, + TikTokTagIE, + TikTokVMIE, + DouyinIE, +) +from .tinypic import TinyPicIE +from .tmz import TMZIE +from .tnaflix import ( + TNAFlixNetworkEmbedIE, + TNAFlixIE, + EMPFlixIE, + MovieFapIE, +) +from .toggle import ( + ToggleIE, + MeWatchIE, +) +from .toggo import ( + ToggoIE, +) +from .tokentube import ( + TokentubeIE, + TokentubeChannelIE +) +from .tonline import TOnlineIE +from .toongoggles import ToonGogglesIE +from .toutv import TouTvIE +from .toypics import ToypicsUserIE, ToypicsIE +from .traileraddict import TrailerAddictIE +from .triller import ( + TrillerIE, + TrillerUserIE, +) +from .trilulilu import TriluliluIE +from .trovo import ( + TrovoIE, + TrovoVodIE, + TrovoChannelVodIE, + TrovoChannelClipIE, +) +from .trueid import TrueIDIE +from .trunews import TruNewsIE +from .truth import TruthIE +from .trutv import TruTVIE +from .tube8 import Tube8IE +from .tubetugraz import TubeTuGrazIE, TubeTuGrazSeriesIE +from .tubitv import ( + TubiTvIE, + TubiTvShowIE, +) +from .tumblr import TumblrIE +from .tunein import ( + TuneInClipIE, + TuneInStationIE, + TuneInProgramIE, + TuneInTopicIE, + TuneInShortenerIE, +) +from .tunepk import TunePkIE +from .turbo import TurboIE +from .tv2 import ( + TV2IE, + TV2ArticleIE, + KatsomoIE, + MTVUutisetArticleIE, +) +from .tv24ua import ( + TV24UAVideoIE, +) +from .tv2dk import ( + TV2DKIE, + TV2DKBornholmPlayIE, +) +from .tv2hu import ( + TV2HuIE, + TV2HuSeriesIE, +) +from .tv4 import TV4IE +from .tv5mondeplus import TV5MondePlusIE +from .tv5unis import ( + 
TV5UnisVideoIE, + TV5UnisIE, +) +from .tva import ( + TVAIE, + QubIE, +) +from .tvanouvelles import ( + TVANouvellesIE, + TVANouvellesArticleIE, +) +from .tvc import ( + TVCIE, + TVCArticleIE, +) +from .tver import TVerIE +from .tvigle import TvigleIE +from .tviplayer import TVIPlayerIE +from .tvland import TVLandIE +from .tvn24 import TVN24IE +from .tvnet import TVNetIE +from .tvnoe import TVNoeIE +from .tvnow import ( + TVNowIE, + TVNowFilmIE, + TVNowNewIE, + TVNowSeasonIE, + TVNowAnnualIE, + TVNowShowIE, +) +from .tvopengr import ( + TVOpenGrWatchIE, + TVOpenGrEmbedIE, +) +from .tvp import ( + TVPEmbedIE, + TVPIE, + TVPStreamIE, + TVPVODSeriesIE, + TVPVODVideoIE, +) +from .tvplay import ( + TVPlayIE, + ViafreeIE, + TVPlayHomeIE, +) +from .tvplayer import TVPlayerIE +from .tweakers import TweakersIE +from .twentyfourvideo import TwentyFourVideoIE +from .twentymin import TwentyMinutenIE +from .twentythreevideo import TwentyThreeVideoIE +from .twitcasting import ( + TwitCastingIE, + TwitCastingLiveIE, + TwitCastingUserIE, +) +from .twitch import ( + TwitchVodIE, + TwitchCollectionIE, + TwitchVideosIE, + TwitchVideosClipsIE, + TwitchVideosCollectionsIE, + TwitchStreamIE, + TwitchClipsIE, +) +from .twitter import ( + TwitterCardIE, + TwitterIE, + TwitterAmplifyIE, + TwitterBroadcastIE, + TwitterSpacesIE, + TwitterShortenerIE, +) +from .udemy import ( + UdemyIE, + UdemyCourseIE +) +from .udn import UDNEmbedIE +from .ufctv import ( + UFCTVIE, + UFCArabiaIE, +) +from .ukcolumn import UkColumnIE +from .uktvplay import UKTVPlayIE +from .digiteka import DigitekaIE +from .dlive import ( + DLiveVODIE, + DLiveStreamIE, +) +from .drooble import DroobleIE +from .umg import UMGDeIE +from .unistra import UnistraIE +from .unity import UnityIE +from .unscripted import UnscriptedNewsVideoIE +from .unsupported import KnownDRMIE, KnownPiracyIE +from .uol import UOLIE +from .uplynk import ( + UplynkIE, + UplynkPreplayIE, +) +from .urort import UrortIE +from .urplay import URPlayIE +from .usanetwork import USANetworkIE +from .usatoday import USATodayIE +from .ustream import UstreamIE, UstreamChannelIE +from .ustudio import ( + UstudioIE, + UstudioEmbedIE, +) +from .utreon import UtreonIE +from .varzesh3 import Varzesh3IE +from .vbox7 import Vbox7IE +from .veehd import VeeHDIE +from .veo import VeoIE +from .veoh import ( + VeohIE, + VeohUserIE +) +from .vesti import VestiIE +from .vevo import ( + VevoIE, + VevoPlaylistIE, +) +from .vgtv import ( + BTArticleIE, + BTVestlendingenIE, + VGTVIE, +) +from .vh1 import VH1IE +from .vice import ( + ViceIE, + ViceArticleIE, + ViceShowIE, +) +from .vidbit import VidbitIE +from .viddler import ViddlerIE +from .videa import VideaIE +from .videocampus_sachsen import ( + VideocampusSachsenIE, + ViMPPlaylistIE, +) +from .videodetective import VideoDetectiveIE +from .videofyme import VideofyMeIE +from .videomore import ( + VideomoreIE, + VideomoreVideoIE, + VideomoreSeasonIE, +) +from .videopress import VideoPressIE +from .vidio import ( + VidioIE, + VidioPremierIE, + VidioLiveIE +) +from .vidlii import VidLiiIE +from .viewlift import ( + ViewLiftIE, + ViewLiftEmbedIE, +) +from .viidea import ViideaIE +from .vimeo import ( + VimeoIE, + VimeoAlbumIE, + VimeoChannelIE, + VimeoGroupsIE, + VimeoLikesIE, + VimeoOndemandIE, + VimeoProIE, + VimeoReviewIE, + VimeoUserIE, + VimeoWatchLaterIE, + VHXEmbedIE, +) +from .vimm import ( + VimmIE, + VimmRecordingIE, +) +from .vimple import VimpleIE +from .vine import ( + VineIE, + VineUserIE, +) +from .viki import ( + VikiIE, + VikiChannelIE, +) 
+from .viqeo import ViqeoIE +from .viu import ( + ViuIE, + ViuPlaylistIE, + ViuOTTIE, +) +from .vk import ( + VKIE, + VKUserVideosIE, + VKWallPostIE, +) +from .vlive import ( + VLiveIE, + VLivePostIE, + VLiveChannelIE, +) +from .vodlocker import VodlockerIE +from .vodpl import VODPlIE +from .vodplatform import VODPlatformIE +from .voicerepublic import VoiceRepublicIE +from .voicy import ( + VoicyIE, + VoicyChannelIE, +) +from .voot import ( + VootIE, + VootSeriesIE, +) +from .voxmedia import ( + VoxMediaVolumeIE, + VoxMediaIE, +) +from .vrt import VRTIE +from .vrak import VrakIE +from .vrv import ( + VRVIE, + VRVSeriesIE, +) +from .vshare import VShareIE +from .vtm import VTMIE +from .medialaan import MedialaanIE +from .vuclip import VuClipIE +from .vupload import VuploadIE +from .vvvvid import ( + VVVVIDIE, + VVVVIDShowIE, +) +from .vyborymos import VyboryMosIE +from .vzaar import VzaarIE +from .wakanim import WakanimIE +from .walla import WallaIE +from .washingtonpost import ( + WashingtonPostIE, + WashingtonPostArticleIE, +) +from .wasdtv import ( + WASDTVStreamIE, + WASDTVRecordIE, + WASDTVClipIE, +) +from .wat import WatIE +from .watchbox import WatchBoxIE +from .watchindianporn import WatchIndianPornIE +from .wdr import ( + WDRIE, + WDRPageIE, + WDRElefantIE, + WDRMobileIE, +) +from .webcaster import ( + WebcasterIE, + WebcasterFeedIE, +) +from .webofstories import ( + WebOfStoriesIE, + WebOfStoriesPlaylistIE, +) +from .weibo import ( + WeiboIE, + WeiboMobileIE +) +from .weiqitv import WeiqiTVIE +from .wikimedia import WikimediaIE +from .willow import WillowIE +from .wimtv import WimTVIE +from .whowatch import WhoWatchIE +from .wistia import ( + WistiaIE, + WistiaPlaylistIE, + WistiaChannelIE, +) +from .wordpress import ( + WordpressPlaylistEmbedIE, + WordpressMiniAudioPlayerEmbedIE, +) +from .worldstarhiphop import WorldStarHipHopIE +from .wppilot import ( + WPPilotIE, + WPPilotChannelsIE, +) +from .wsj import ( + WSJIE, + WSJArticleIE, +) +from .wwe import WWEIE +from .xbef import XBefIE +from .xboxclips import XboxClipsIE +from .xfileshare import XFileShareIE +from .xhamster import ( + XHamsterIE, + XHamsterEmbedIE, + XHamsterUserIE, +) +from .xiami import ( + XiamiSongIE, + XiamiAlbumIE, + XiamiArtistIE, + XiamiCollectionIE +) +from .ximalaya import ( + XimalayaIE, + XimalayaAlbumIE +) +from .xinpianchang import XinpianchangIE +from .xminus import XMinusIE +from .xnxx import XNXXIE +from .xstream import XstreamIE +from .xtube import XTubeUserIE, XTubeIE +from .xuite import XuiteIE +from .xvideos import XVideosIE +from .xxxymovies import XXXYMoviesIE +from .yahoo import ( + YahooIE, + YahooSearchIE, + YahooGyaOPlayerIE, + YahooGyaOIE, + YahooJapanNewsIE, +) +from .yandexdisk import YandexDiskIE +from .yandexmusic import ( + YandexMusicTrackIE, + YandexMusicAlbumIE, + YandexMusicPlaylistIE, + YandexMusicArtistTracksIE, + YandexMusicArtistAlbumsIE, +) +from .yandexvideo import ( + YandexVideoIE, + YandexVideoPreviewIE, + ZenYandexIE, + ZenYandexChannelIE, +) +from .yapfiles import YapFilesIE +from .yesjapan import YesJapanIE +from .yinyuetai import YinYueTaiIE +from .yle_areena import YleAreenaIE +from .ynet import YnetIE +from .youjizz import YouJizzIE +from .youku import ( + YoukuIE, + YoukuShowIE, +) +from .younow import ( + YouNowLiveIE, + YouNowChannelIE, + YouNowMomentIE, +) +from .youporn import YouPornIE +from .yourporn import YourPornIE +from .yourupload import YourUploadIE +from .zapiks import ZapiksIE +from .zattoo import ( + BBVTVIE, + BBVTVLiveIE, + BBVTVRecordingsIE, + 
EinsUndEinsTVIE, + EinsUndEinsTVLiveIE, + EinsUndEinsTVRecordingsIE, + EWETVIE, + EWETVLiveIE, + EWETVRecordingsIE, + GlattvisionTVIE, + GlattvisionTVLiveIE, + GlattvisionTVRecordingsIE, + MNetTVIE, + MNetTVLiveIE, + MNetTVRecordingsIE, + NetPlusTVIE, + NetPlusTVLiveIE, + NetPlusTVRecordingsIE, + OsnatelTVIE, + OsnatelTVLiveIE, + OsnatelTVRecordingsIE, + QuantumTVIE, + QuantumTVLiveIE, + QuantumTVRecordingsIE, + SaltTVIE, + SaltTVLiveIE, + SaltTVRecordingsIE, + SAKTVIE, + SAKTVLiveIE, + SAKTVRecordingsIE, + VTXTVIE, + VTXTVLiveIE, + VTXTVRecordingsIE, + WalyTVIE, + WalyTVLiveIE, + WalyTVRecordingsIE, + ZattooIE, + ZattooLiveIE, + ZattooMoviesIE, + ZattooRecordingsIE, +) +from .zdf import ZDFIE, ZDFChannelIE +from .zee5 import ( + Zee5IE, + Zee5SeriesIE, +) +from .zeenews import ZeeNewsIE +from .zhihu import ZhihuIE +from .zingmp3 import ( + ZingMp3IE, + ZingMp3AlbumIE, + ZingMp3ChartHomeIE, + ZingMp3WeekChartIE, + ZingMp3ChartMusicVideoIE, + ZingMp3UserIE, +) +from .zoom import ZoomIE +from .zype import ZypeIE diff --git a/hypervideo_dl/extractor/abc.py b/hypervideo_dl/extractor/abc.py index 6fe195e..0ca76b8 100644 --- a/hypervideo_dl/extractor/abc.py +++ b/hypervideo_dl/extractor/abc.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import hashlib import hmac import re @@ -157,8 +155,6 @@ class ABCIE(InfoExtractor): 'format_id': format_id }) - self._sort_formats(formats) - return { 'id': video_id, 'title': self._og_search_title(webpage), @@ -223,7 +219,6 @@ class ABCIViewIE(InfoExtractor): entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if formats: break - self._sort_formats(formats) subtitles = {} src_vtt = stream.get('captions', {}).get('src-vtt') diff --git a/hypervideo_dl/extractor/abcnews.py b/hypervideo_dl/extractor/abcnews.py index 296b8ce..a57295b 100644 --- a/hypervideo_dl/extractor/abcnews.py +++ b/hypervideo_dl/extractor/abcnews.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .amp import AMPIE from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/abcotvs.py b/hypervideo_dl/extractor/abcotvs.py index 5bff466..6dca19d 100644 --- a/hypervideo_dl/extractor/abcotvs.py +++ b/hypervideo_dl/extractor/abcotvs.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -82,7 +78,6 @@ class ABCOTVSIE(InfoExtractor): 'url': mp4_url, 'width': 640, }) - self._sort_formats(formats) image = video.get('image') or {} @@ -123,7 +118,6 @@ class ABCOTVSClipsIE(InfoExtractor): title = video_data['title'] formats = self._extract_m3u8_formats( video_data['videoURL'].split('?')[0], video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/abematv.py b/hypervideo_dl/extractor/abematv.py index 27b7d86..80046af 100644 --- a/hypervideo_dl/extractor/abematv.py +++ b/hypervideo_dl/extractor/abematv.py @@ -1,42 +1,41 @@ -import io -import json -import time +import base64 +import binascii +import functools import hashlib import hmac +import io +import json import re import struct -from base64 import urlsafe_b64encode -from binascii import unhexlify +import time +import urllib.parse +import urllib.request +import urllib.response +import uuid from .common import InfoExtractor from ..aes import aes_ecb_decrypt -from ..compat import ( - compat_urllib_response, - compat_urllib_parse_urlparse, - compat_urllib_request, -) from ..utils import ( ExtractorError, 
- decode_base, + bytes_to_intlist, + decode_base_n, int_or_none, - random_uuidv4, + intlist_to_bytes, + OnDemandPagedList, request_to_url, time_seconds, - update_url_query, traverse_obj, - intlist_to_bytes, - bytes_to_intlist, - urljoin, + update_url_query, ) - # NOTE: network handler related code is temporary thing until network stack overhaul PRs are merged (#2861/#2862) + def add_opener(ydl, handler): ''' Add a handler for opening URLs, like _download_webpage ''' # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 - assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) + assert isinstance(ydl._opener, urllib.request.OpenerDirector) ydl._opener.add_handler(handler) @@ -49,7 +48,7 @@ def remove_opener(ydl, handler): # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L426 # https://github.com/python/cpython/blob/main/Lib/urllib/request.py#L605 opener = ydl._opener - assert isinstance(ydl._opener, compat_urllib_request.OpenerDirector) + assert isinstance(ydl._opener, urllib.request.OpenerDirector) if isinstance(handler, (type, tuple)): find_cp = lambda x: isinstance(x, handler) else: @@ -99,20 +98,20 @@ def remove_opener(ydl, handler): opener.handlers[:] = [x for x in opener.handlers if not find_cp(x)] -class AbemaLicenseHandler(compat_urllib_request.BaseHandler): +class AbemaLicenseHandler(urllib.request.BaseHandler): handler_order = 499 STRTABLE = '123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz' HKEY = b'3AF0298C219469522A313570E8583005A642E73EDD58E3EA2FB7339D3DF1597E' def __init__(self, ie: 'AbemaTVIE'): - # the protcol that this should really handle is 'abematv-license://' + # the protocol that this should really handle is 'abematv-license://' # abematv_license_open is just a placeholder for development purposes # ref. 
https://github.com/python/cpython/blob/f4c03484da59049eb62a9bf7777b963e2267d187/Lib/urllib/request.py#L510 setattr(self, 'abematv-license_open', getattr(self, 'abematv_license_open')) self.ie = ie def _get_videokey_from_ticket(self, ticket): - to_show = self.ie._downloader.params.get('verbose', False) + to_show = self.ie.get_param('verbose', False) media_token = self.ie._get_media_token(to_show=to_show) license_response = self.ie._download_json( @@ -126,11 +125,11 @@ class AbemaLicenseHandler(compat_urllib_request.BaseHandler): 'Content-Type': 'application/json', }) - res = decode_base(license_response['k'], self.STRTABLE) + res = decode_base_n(license_response['k'], table=self.STRTABLE) encvideokey = bytes_to_intlist(struct.pack('>QQ', res >> 64, res & 0xffffffffffffffff)) h = hmac.new( - unhexlify(self.HKEY), + binascii.unhexlify(self.HKEY), (license_response['cid'] + self.ie._DEVICE_ID).encode('utf-8'), digestmod=hashlib.sha256) enckey = bytes_to_intlist(h.digest()) @@ -139,84 +138,22 @@ class AbemaLicenseHandler(compat_urllib_request.BaseHandler): def abematv_license_open(self, url): url = request_to_url(url) - ticket = compat_urllib_parse_urlparse(url).netloc + ticket = urllib.parse.urlparse(url).netloc response_data = self._get_videokey_from_ticket(ticket) - return compat_urllib_response.addinfourl(io.BytesIO(response_data), headers={ + return urllib.response.addinfourl(io.BytesIO(response_data), headers={ 'Content-Length': len(response_data), }, url=url, code=200) class AbemaTVBaseIE(InfoExtractor): - def _extract_breadcrumb_list(self, webpage, video_id): - for jld in re.finditer( - r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', - webpage): - jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False) - if jsonld: - if jsonld.get('@type') != 'BreadcrumbList': - continue - trav = traverse_obj(jsonld, ('itemListElement', ..., 'name')) - if trav: - return trav - return [] - - -class AbemaTVIE(AbemaTVBaseIE): - _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)' - _NETRC_MACHINE = 'abematv' - _TESTS = [{ - 'url': 'https://abema.tv/video/episode/194-25_s2_p1', - 'info_dict': { - 'id': '194-25_s2_p1', - 'title': '第1話 「チーズケーキ」 「モーニング再び」', - 'series': '異世界食堂2', - 'series_number': 2, - 'episode': '第1話 「チーズケーキ」 「モーニング再び」', - 'episode_number': 1, - }, - 'skip': 'expired', - }, { - 'url': 'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d', - 'info_dict': { - 'id': 'E8tvAnMJ7a9a5d', - 'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', - 'series': 'ゆるキャン△ SEASON2', - 'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', - 'series_number': 2, - 'episode_number': 1, - 'description': 'md5:9c5a3172ae763278f9303922f0ea5b17', - }, - 'skip': 'expired', - }, { - 'url': 'https://abema.tv/video/episode/87-877_s1282_p31047', - 'info_dict': { - 'id': 'E8tvAnMJ7a9a5d', - 'title': '第5話『光射す』', - 'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d', - 'thumbnail': r're:https://hayabusa\.io/.+', - 'series': '相棒', - 'episode': '第5話『光射す』', - }, - 'skip': 'expired', - }, { - 'url': 'https://abema.tv/now-on-air/abema-anime', - 'info_dict': { - 'id': 'abema-anime', - # this varies - # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】', - 'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f', - 'is_live': True, - }, - 'skip': 'Not supported until hypervideo implements native live downloader OR AbemaTV can start a local HTTP server', - }] _USERTOKEN = None _DEVICE_ID = None - _TIMETABLE = None _MEDIATOKEN = None 
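# [editor's sketch, not part of the patch] The hunks below keep replacing
# assignments like `self._USERTOKEN = ...` with
# `AbemaTVBaseIE._USERTOKEN = ...`. The reason: `self.attr = value` creates a
# per-instance attribute that shadows the class-level cache declared just
# above, so a token fetched by one extractor instance would stay invisible to
# every other instance. Assigning through the base class updates the shared
# cache that all instances (AbemaTVIE, AbemaTVTitleIE, ...) read. A minimal,
# runnable illustration of the Python attribute semantics involved:
class Base:
    _TOKEN = None  # class-level cache, shared by all subclasses/instances

class Child(Base):
    def login_instance_only(self):
        self._TOKEN = 'tok'  # shadows the cache on this one instance only

    def login_shared(self):
        Base._TOKEN = 'tok'  # updates the cache that every instance reads

a, b = Child(), Child()
a.login_instance_only()
assert b._TOKEN is None   # the token was not shared with b
a.login_shared()
assert b._TOKEN == 'tok'  # now both instances see the cached token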
_SECRETKEY = b'v+Gjs=25Aw5erR!J8ZuvRrCx*rGswhB&qdHd_SYerEWdU&a?3DzN9BRbp5KwY4hEmcj5#fykMjJ=AuWz5GSMY-d@H7DMEh3M@9n2G552Us$$k9cD=3TxwWe86!x#Zyhe' - def _generate_aks(self, deviceid): + @classmethod + def _generate_aks(cls, deviceid): deviceid = deviceid.encode('utf-8') # add 1 hour and then drop minute and secs ts_1hour = int((time_seconds(hours=9) // 3600 + 1) * 3600) @@ -227,7 +164,7 @@ class AbemaTVIE(AbemaTVBaseIE): def mix_once(nonce): nonlocal tmp - h = hmac.new(self._SECRETKEY, digestmod=hashlib.sha256) + h = hmac.new(cls._SECRETKEY, digestmod=hashlib.sha256) h.update(nonce) tmp = h.digest() @@ -238,22 +175,22 @@ class AbemaTVIE(AbemaTVBaseIE): def mix_twist(nonce): nonlocal tmp - mix_once(urlsafe_b64encode(tmp).rstrip(b'=') + nonce) + mix_once(base64.urlsafe_b64encode(tmp).rstrip(b'=') + nonce) - mix_once(self._SECRETKEY) + mix_once(cls._SECRETKEY) mix_tmp(time_struct.tm_mon) mix_twist(deviceid) mix_tmp(time_struct.tm_mday % 5) mix_twist(ts_1hour_str) mix_tmp(time_struct.tm_hour % 5) - return urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8') + return base64.urlsafe_b64encode(tmp).rstrip(b'=').decode('utf-8') def _get_device_token(self): if self._USERTOKEN: return self._USERTOKEN - self._DEVICE_ID = random_uuidv4() + AbemaTVBaseIE._DEVICE_ID = str(uuid.uuid4()) aks = self._generate_aks(self._DEVICE_ID) user_data = self._download_json( 'https://api.abema.io/v1/users', None, note='Authorizing', @@ -264,7 +201,7 @@ class AbemaTVIE(AbemaTVBaseIE): headers={ 'Content-Type': 'application/json', }) - self._USERTOKEN = user_data['token'] + AbemaTVBaseIE._USERTOKEN = user_data['token'] # don't allow adding it 2 times or more, though it's guarded remove_opener(self._downloader, AbemaLicenseHandler) @@ -276,7 +213,7 @@ class AbemaTVIE(AbemaTVBaseIE): if not invalidate and self._MEDIATOKEN: return self._MEDIATOKEN - self._MEDIATOKEN = self._download_json( + AbemaTVBaseIE._MEDIATOKEN = self._download_json( 'https://api.abema.io/v1/media/token', None, note='Fetching media token' if to_show else False, query={ 'osName': 'android', @@ -286,11 +223,82 @@ class AbemaTVIE(AbemaTVBaseIE): 'appId': 'tv.abema', 'appVersion': '3.27.1' }, headers={ - 'Authorization': 'bearer ' + self._get_device_token() + 'Authorization': f'bearer {self._get_device_token()}', })['token'] return self._MEDIATOKEN + def _call_api(self, endpoint, video_id, query=None, note='Downloading JSON metadata'): + return self._download_json( + f'https://api.abema.io/{endpoint}', video_id, query=query or {}, + note=note, + headers={ + 'Authorization': f'bearer {self._get_device_token()}', + }) + + def _extract_breadcrumb_list(self, webpage, video_id): + for jld in re.finditer( + r'(?is)</span></li></ul><script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', + webpage): + jsonld = self._parse_json(jld.group('json_ld'), video_id, fatal=False) + if traverse_obj(jsonld, '@type') != 'BreadcrumbList': + continue + items = traverse_obj(jsonld, ('itemListElement', ..., 'name')) + if items: + return items + return [] + + +class AbemaTVIE(AbemaTVBaseIE): + _VALID_URL = r'https?://abema\.tv/(?P<type>now-on-air|video/episode|channels/.+?/slots)/(?P<id>[^?/]+)' + _NETRC_MACHINE = 'abematv' + _TESTS = [{ + 'url': 'https://abema.tv/video/episode/194-25_s2_p1', + 'info_dict': { + 'id': '194-25_s2_p1', + 'title': '第1話 「チーズケーキ」 「モーニング再び」', + 'series': '異世界食堂2', + 'series_number': 2, + 'episode': '第1話 「チーズケーキ」 「モーニング再び」', + 'episode_number': 1, + }, + 'skip': 'expired', + }, { + 'url': 
'https://abema.tv/channels/anime-live2/slots/E8tvAnMJ7a9a5d', + 'info_dict': { + 'id': 'E8tvAnMJ7a9a5d', + 'title': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', + 'series': 'ゆるキャン△ SEASON2', + 'episode': 'ゆるキャン△ SEASON2 全話一挙【無料ビデオ72時間】', + 'series_number': 2, + 'episode_number': 1, + 'description': 'md5:9c5a3172ae763278f9303922f0ea5b17', + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/video/episode/87-877_s1282_p31047', + 'info_dict': { + 'id': 'E8tvAnMJ7a9a5d', + 'title': '第5話『光射す』', + 'description': 'md5:56d4fc1b4f7769ded5f923c55bb4695d', + 'thumbnail': r're:https://hayabusa\.io/.+', + 'series': '相棒', + 'episode': '第5話『光射す』', + }, + 'skip': 'expired', + }, { + 'url': 'https://abema.tv/now-on-air/abema-anime', + 'info_dict': { + 'id': 'abema-anime', + # this varies + # 'title': '女子高生の無駄づかい 全話一挙【無料ビデオ72時間】', + 'description': 'md5:55f2e61f46a17e9230802d7bcc913d5f', + 'is_live': True, + }, + 'skip': 'Not supported until hypervideo implements native live downloader OR AbemaTV can start a local HTTP server', + }] + _TIMETABLE = None + def _perform_login(self, username, password): if '@' in username: # don't strictly check if it's email address or not ep, method = 'user/email', 'email' @@ -303,18 +311,18 @@ class AbemaTVIE(AbemaTVBaseIE): method: username, 'password': password }).encode('utf-8'), headers={ - 'Authorization': 'bearer ' + self._get_device_token(), + 'Authorization': f'bearer {self._get_device_token()}', 'Origin': 'https://abema.tv', 'Referer': 'https://abema.tv/', 'Content-Type': 'application/json', }) - self._USERTOKEN = login_response['token'] + AbemaTVBaseIE._USERTOKEN = login_response['token'] self._get_media_token(True) def _real_extract(self, url): # starting download using infojson from this extractor is undefined behavior, - # and never be fixed in the future; you must trigger downloads by directly specifing URL. + # and never be fixed in the future; you must trigger downloads by directly specifying URL. # (unless there's a way to hook before downloading by extractor) video_id, video_type = self._match_valid_url(url).group('id', 'type') headers = { @@ -357,7 +365,7 @@ class AbemaTVIE(AbemaTVBaseIE): # read breadcrumb on top of page breadcrumb = self._extract_breadcrumb_list(webpage, video_id) if breadcrumb: - # breadcrumb list translates to: (example is 1st test for this IE) + # breadcrumb list translates to: (e.g. 
1st test for this IE) # Home > Anime (genre) > Isekai Shokudo 2 (series name) > Episode 1 "Cheese cakes" "Morning again" (episode title) # hence this works info['series'] = breadcrumb[-2] @@ -444,6 +452,7 @@ class AbemaTVIE(AbemaTVBaseIE): class AbemaTVTitleIE(AbemaTVBaseIE): _VALID_URL = r'https?://abema\.tv/video/title/(?P<id>[^?/]+)' + _PAGE_SIZE = 25 _TESTS = [{ 'url': 'https://abema.tv/video/title/90-1597', @@ -459,18 +468,39 @@ class AbemaTVTitleIE(AbemaTVBaseIE): 'title': '真心が届く~僕とスターのオフィス・ラブ!?~', }, 'playlist_mincount': 16, + }, { + 'url': 'https://abema.tv/video/title/25-102', + 'info_dict': { + 'id': '25-102', + 'title': 'ソードアート・オンライン アリシゼーション', + }, + 'playlist_mincount': 24, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + def _fetch_page(self, playlist_id, series_version, page): + programs = self._call_api( + f'v1/video/series/{playlist_id}/programs', playlist_id, + note=f'Downloading page {page + 1}', + query={ + 'seriesVersion': series_version, + 'offset': str(page * self._PAGE_SIZE), + 'order': 'seq', + 'limit': str(self._PAGE_SIZE), + }) + yield from ( + self.url_result(f'https://abema.tv/video/episode/{x}') + for x in traverse_obj(programs, ('programs', ..., 'id'), default=[])) - playlist_title, breadcrumb = None, self._extract_breadcrumb_list(webpage, video_id) - if breadcrumb: - playlist_title = breadcrumb[-1] + def _entries(self, playlist_id, series_version): + return OnDemandPagedList( + functools.partial(self._fetch_page, playlist_id, series_version), + self._PAGE_SIZE) - playlist = [ - self.url_result(urljoin('https://abema.tv/', mobj.group(1))) - for mobj in re.finditer(r'<li\s*class=".+?EpisodeList.+?"><a\s*href="(/[^"]+?)"', webpage)] + def _real_extract(self, url): + playlist_id = self._match_id(url) + series_info = self._call_api(f'v1/video/series/{playlist_id}', playlist_id) - return self.playlist_result(playlist, playlist_title=playlist_title, playlist_id=video_id) + return self.playlist_result( + self._entries(playlist_id, series_info['version']), playlist_id=playlist_id, + playlist_title=series_info.get('title'), + playlist_description=series_info.get('content')) diff --git a/hypervideo_dl/extractor/academicearth.py b/hypervideo_dl/extractor/academicearth.py index 3409550..d9691cb 100644 --- a/hypervideo_dl/extractor/academicearth.py +++ b/hypervideo_dl/extractor/academicearth.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/acast.py b/hypervideo_dl/extractor/acast.py index 63587c5..f2f828f 100644 --- a/hypervideo_dl/extractor/acast.py +++ b/hypervideo_dl/extractor/acast.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/acfun.py b/hypervideo_dl/extractor/acfun.py new file mode 100644 index 0000000..dc57929 --- /dev/null +++ b/hypervideo_dl/extractor/acfun.py @@ -0,0 +1,199 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + format_field, + int_or_none, + traverse_obj, + parse_codecs, + parse_qs, +) + + +class AcFunVideoBaseIE(InfoExtractor): + def _extract_metadata(self, video_id, video_info): + playjson = self._parse_json(video_info['ksPlayJson'], video_id) + + formats, subtitles = [], {} + for video in traverse_obj(playjson, ('adaptationSet', 0, 'representation')): + fmts, subs = 
self._extract_m3u8_formats_and_subtitles(video['url'], video_id, 'mp4', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + for f in fmts: + f.update({ + 'fps': float_or_none(video.get('frameRate')), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'tbr': float_or_none(video.get('avgBitrate')), + **parse_codecs(video.get('codecs', '')) + }) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'duration': float_or_none(video_info.get('durationMillis'), 1000), + 'timestamp': int_or_none(video_info.get('uploadTime'), 1000), + 'http_headers': {'Referer': 'https://www.acfun.cn/'}, + } + + +class AcFunVideoIE(AcFunVideoBaseIE): + _VALID_URL = r'https?://www\.acfun\.cn/v/ac(?P<id>[_\d]+)' + + _TESTS = [{ + 'url': 'https://www.acfun.cn/v/ac35457073', + 'info_dict': { + 'id': '35457073', + 'ext': 'mp4', + 'duration': 174.208, + 'timestamp': 1656403967, + 'title': '1 8 岁 现 状', + 'description': '“赶紧回去!班主任查班了!”', + 'uploader': '锤子game', + 'uploader_id': '51246077', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg)', + 'upload_date': '20220628', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'tags': list, + }, + }, { + # example for len(video_list) > 1 + 'url': 'https://www.acfun.cn/v/ac35468952_2', + 'info_dict': { + 'id': '35468952_2', + 'ext': 'mp4', + 'title': '【动画剧集】Rocket & Groot Season 1(2022)/火箭浣熊与格鲁特第1季 P02 S01E02 十拿九穩', + 'duration': 90.459, + 'uploader': '比令', + 'uploader_id': '37259967', + 'upload_date': '20220629', + 'timestamp': 1656479962, + 'tags': list, + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg)', + 'description': 'md5:67583aaf3a0f933bd606bc8a2d3ebb17', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + json_all = self._search_json(r'window.videoInfo\s*=', webpage, 'videoInfo', video_id) + + title = json_all.get('title') + video_list = json_all.get('videoList') or [] + video_internal_id = traverse_obj(json_all, ('currentVideoInfo', 'id')) + if video_internal_id and len(video_list) > 1: + part_idx, part_video_info = next( + (idx + 1, v) for (idx, v) in enumerate(video_list) + if v['id'] == video_internal_id) + title = f'{title} P{part_idx:02d} {part_video_info["title"]}' + + return { + **self._extract_metadata(video_id, json_all['currentVideoInfo']), + 'title': title, + 'thumbnail': json_all.get('coverUrl'), + 'description': json_all.get('description'), + 'uploader': traverse_obj(json_all, ('user', 'name')), + 'uploader_id': traverse_obj(json_all, ('user', 'href')), + 'tags': traverse_obj(json_all, ('tagList', ..., 'name')), + 'view_count': int_or_none(json_all.get('viewCount')), + 'like_count': int_or_none(json_all.get('likeCountShow')), + 'comment_count': int_or_none(json_all.get('commentCountShow')), + } + + +class AcFunBangumiIE(AcFunVideoBaseIE): + _VALID_URL = r'https?://www\.acfun\.cn/bangumi/(?P<id>aa[_\d]+)' + + _TESTS = [{ + 'url': 'https://www.acfun.cn/bangumi/aa6002917_36188_1745457?ac=2', + 'info_dict': { + 'id': 'aa6002917_36188_1745457__2', + 'ext': 'mp4', + 'title': '【7月】租借女友 水原千鹤角色曲『DATE』特别PV', + 'upload_date': '20200916', + 'timestamp': 1600243813, + 'duration': 92.091, + }, + }, { + 'url': 'https://www.acfun.cn/bangumi/aa5023171_36188_1750645', + 'info_dict': { + 'id': 'aa5023171_36188_1750645', + 'ext': 'mp4', + 'title': '红孩儿之趴趴蛙寻石记 第5话 ', + 'duration': 760.0, + 'season': '红孩儿之趴趴蛙寻石记', + 'season_id': 
5023171, + 'season_number': 1, # series has only 1 season + 'episode': 'Episode 5', + 'episode_number': 5, + 'upload_date': '20181223', + 'timestamp': 1545552185, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'comment_count': int, + }, + }, { + 'url': 'https://www.acfun.cn/bangumi/aa6065485_36188_1885061', + 'info_dict': { + 'id': 'aa6065485_36188_1885061', + 'ext': 'mp4', + 'title': '叽歪老表(第二季) 第5话 坚不可摧', + 'season': '叽歪老表(第二季)', + 'season_number': 2, + 'season_id': 6065485, + 'episode': '坚不可摧', + 'episode_number': 5, + 'upload_date': '20220324', + 'timestamp': 1648082786, + 'duration': 105.002, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)', + 'comment_count': int, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + ac_idx = parse_qs(url).get('ac', [None])[-1] + video_id = f'{video_id}{format_field(ac_idx, None, "__%s")}' + + webpage = self._download_webpage(url, video_id) + json_bangumi_data = self._search_json(r'window.bangumiData\s*=', webpage, 'bangumiData', video_id) + + if ac_idx: + video_info = json_bangumi_data['hlVideoInfo'] + return { + **self._extract_metadata(video_id, video_info), + 'title': video_info.get('title'), + } + + video_info = json_bangumi_data['currentVideoInfo'] + + season_id = json_bangumi_data.get('bangumiId') + season_number = season_id and next(( + idx for idx, v in enumerate(json_bangumi_data.get('relatedBangumis') or [], 1) + if v.get('id') == season_id), 1) + + json_bangumi_list = self._search_json( + r'window\.bangumiList\s*=', webpage, 'bangumiList', video_id, fatal=False) + video_internal_id = int_or_none(traverse_obj(json_bangumi_data, ('currentVideoInfo', 'id'))) + episode_number = video_internal_id and next(( + idx for idx, v in enumerate(json_bangumi_list.get('items') or [], 1) + if v.get('videoId') == video_internal_id), None) + + return { + **self._extract_metadata(video_id, video_info), + 'title': json_bangumi_data.get('showTitle'), + 'thumbnail': json_bangumi_data.get('image'), + 'season': json_bangumi_data.get('bangumiTitle'), + 'season_id': season_id, + 'season_number': season_number, + 'episode': json_bangumi_data.get('title'), + 'episode_number': episode_number, + 'comment_count': int_or_none(json_bangumi_data.get('commentCount')), + } diff --git a/hypervideo_dl/extractor/adn.py b/hypervideo_dl/extractor/adn.py index fca6e60..e0c18c8 100644 --- a/hypervideo_dl/extractor/adn.py +++ b/hypervideo_dl/extractor/adn.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import binascii import json @@ -31,30 +28,34 @@ from ..utils import ( class ADNIE(InfoExtractor): - IE_DESC = 'Anime Digital Network' - _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', - 'md5': '0319c99885ff5547565cacb4f3f9348d', + IE_DESC = 'Animation Digital Network' + _VALID_URL = r'https?://(?:www\.)?(?:animation|anime)digitalnetwork\.fr/video/[^/]+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://animationdigitalnetwork.fr/video/fruits-basket/9841-episode-1-a-ce-soir', + 'md5': '1c9ef066ceb302c86f80c2b371615261', 'info_dict': { - 'id': '7778', + 'id': '9841', 'ext': 'mp4', - 'title': 'Blue Exorcist - Kyôto Saga - Episode 1', - 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', - 'series': 'Blue Exorcist - Kyôto Saga', - 'duration': 1467, - 'release_date': '20170106', + 'title': 'Fruits Basket - Episode 1', + 'description': 
'md5:14be2f72c3c96809b0ca424b0097d336', + 'series': 'Fruits Basket', + 'duration': 1437, + 'release_date': '20190405', 'comment_count': int, 'average_rating': float, - 'season_number': 2, - 'episode': 'Début des hostilités', + 'season_number': 1, + 'episode': 'À ce soir !', 'episode_number': 1, - } - } + }, + 'skip': 'Only available in region (FR, ...)', + }, { + 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', + 'only_matching': True, + }] - _NETRC_MACHINE = 'animedigitalnetwork' - _BASE_URL = 'http://animedigitalnetwork.fr' - _API_BASE_URL = 'https://gw.api.animedigitalnetwork.fr/' + _NETRC_MACHINE = 'animationdigitalnetwork' + _BASE = 'animationdigitalnetwork.fr' + _API_BASE_URL = 'https://gw.api.' + _BASE + '/' _PLAYER_BASE_URL = _API_BASE_URL + 'player/' _HEADERS = {} _LOGIN_ERR_MESSAGE = 'Unable to log in' @@ -78,14 +79,14 @@ class ADNIE(InfoExtractor): if subtitle_location: enc_subtitles = self._download_webpage( subtitle_location, video_id, 'Downloading subtitles data', - fatal=False, headers={'Origin': 'https://animedigitalnetwork.fr'}) + fatal=False, headers={'Origin': 'https://' + self._BASE}) if not enc_subtitles: return None - # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js + # http://animationdigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = unpad_pkcs7(aes_cbc_decrypt_bytes( compat_b64decode(enc_subtitles[24:]), - binascii.unhexlify(self._K + 'ab9f52f5baae7c72'), + binascii.unhexlify(self._K + '7fac1178830cfe0c'), compat_b64decode(enc_subtitles[:24]))) subtitles_json = self._parse_json(dec_subtitles.decode(), None, fatal=False) if not subtitles_json: @@ -234,7 +235,6 @@ Format: Marked,Start,End,Style,Name,MarginL,MarginR,MarginV,Effect,Text''' for f in m3u8_formats: f['language'] = 'fr' formats.extend(m3u8_formats) - self._sort_formats(formats) video = (self._download_json( self._API_BASE_URL + 'video/%s' % video_id, video_id, diff --git a/hypervideo_dl/extractor/adobeconnect.py b/hypervideo_dl/extractor/adobeconnect.py index e2e6f93..8963b12 100644 --- a/hypervideo_dl/extractor/adobeconnect.py +++ b/hypervideo_dl/extractor/adobeconnect.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_parse_qs, diff --git a/hypervideo_dl/extractor/adobepass.py b/hypervideo_dl/extractor/adobepass.py index 5d98301..e5944f7 100644 --- a/hypervideo_dl/extractor/adobepass.py +++ b/hypervideo_dl/extractor/adobepass.py @@ -1,26 +1,20 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import getpass import json import re import time +import urllib.error import xml.etree.ElementTree as etree from .common import InfoExtractor -from ..compat import ( - compat_kwargs, - compat_urlparse, - compat_getpass -) +from ..compat import compat_urlparse from ..utils import ( + NO_DEFAULT, + ExtractorError, unescapeHTML, - urlencode_postdata, unified_timestamp, - ExtractorError, - NO_DEFAULT, + urlencode_postdata, ) - MSO_INFO = { 'DTV': { 'name': 'DIRECTV', @@ -1350,10 +1344,15 @@ MSO_INFO = { 'username_field': 'username', 'password_field': 'password', }, + 'AlticeOne': { + 'name': 'Optimum TV', + 'username_field': 'j_username', + 'password_field': 'j_password', + }, } -class AdobePassIE(InfoExtractor): +class AdobePassIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' 
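# Note on MSO_INFO (illustrative sketch, mirroring the login hunk further down
# in this file's diff): an entry only has to name the provider's login-form
# fields, and the shared login step builds its POST payload generically:
#
#   mso_info = MSO_INFO[mso_id]  # e.g. the new 'AlticeOne' entry above
#   form_data = {
#       mso_info.get('username_field', 'username'): username,  # -> 'j_username'
#       mso_info.get('password_field', 'password'): password,  # -> 'j_password'
#   }
#
# so adding a provider is usually just a new MSO_INFO entry plus any
# flow-specific quirks (cf. the Cablevision/AlticeOne '_eventId_proceed' case
# handled later in this diff).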
_USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' _MVPD_CACHE = 'ap-mvpd' @@ -1365,7 +1364,7 @@ class AdobePassIE(InfoExtractor): headers.update(kwargs.get('headers', {})) kwargs['headers'] = headers return super(AdobePassIE, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) + *args, **kwargs) @staticmethod def _get_mvpd_resource(provider_id, title, guid, rating): @@ -1434,32 +1433,34 @@ class AdobePassIE(InfoExtractor): guid = xml_text(resource, 'guid') if '<' in resource else resource count = 0 while count < 2: - requestor_info = self._downloader.cache.load(self._MVPD_CACHE, requestor_id) or {} + requestor_info = self.cache.load(self._MVPD_CACHE, requestor_id) or {} authn_token = requestor_info.get('authn_token') if authn_token and is_expired(authn_token, 'simpleTokenExpires'): authn_token = None if not authn_token: - # TODO add support for other TV Providers mso_id = self.get_param('ap_mso') - if not mso_id: - raise_mvpd_required() - username, password = self._get_login_info('ap_username', 'ap_password', mso_id) - if not username or not password: - raise_mvpd_required() - mso_info = MSO_INFO[mso_id] + if mso_id: + username, password = self._get_login_info('ap_username', 'ap_password', mso_id) + if not username or not password: + raise_mvpd_required() + mso_info = MSO_INFO[mso_id] - provider_redirect_page_res = self._download_webpage_handle( - self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, - 'Downloading Provider Redirect Page', query={ - 'noflash': 'true', - 'mso_id': mso_id, - 'requestor_id': requestor_id, - 'no_iframe': 'false', - 'domain_name': 'adobe.com', - 'redirect_url': url, - }) + provider_redirect_page_res = self._download_webpage_handle( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + }) + elif not self._cookies_passed: + raise_mvpd_required() - if mso_id == 'Comcast_SSO': + if not mso_id: + pass + elif mso_id == 'Comcast_SSO': # Comcast page flow varies by video site and whether you # are on Comcast's network. 
provider_redirect_page, urlh = provider_redirect_page_res @@ -1507,7 +1508,7 @@ class AdobePassIE(InfoExtractor): 'send_confirm_link': False, 'send_token': True })) - philo_code = compat_getpass('Type auth code you have received [Return]: ') + philo_code = getpass.getpass('Type auth code you have received [Return]: ') self._download_webpage( 'https://idp.philo.com/auth/update/login_code', video_id, 'Submitting token', data=urlencode_postdata({ 'token': philo_code @@ -1709,25 +1710,30 @@ class AdobePassIE(InfoExtractor): mso_info.get('username_field', 'username'): username, mso_info.get('password_field', 'password'): password } - if mso_id == 'Cablevision': + if mso_id in ('Cablevision', 'AlticeOne'): form_data['_eventId_proceed'] = '' mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', form_data) if mso_id != 'Rogers': post_form(mvpd_confirm_page_res, 'Confirming Login') - session = self._download_webpage( - self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, - 'Retrieving Session', data=urlencode_postdata({ - '_method': 'GET', - 'requestor_id': requestor_id, - }), headers=mvpd_headers) + try: + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + except ExtractorError as e: + if not mso_id and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + raise_mvpd_required() + raise if '<pendingLogout' in session: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) + self.cache.store(self._MVPD_CACHE, requestor_id, {}) count += 1 continue authn_token = unescapeHTML(xml_text(session, 'authnToken')) requestor_info['authn_token'] = authn_token - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) + self.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) authz_token = requestor_info.get(guid) if authz_token and is_expired(authz_token, 'simpleTokenTTL'): @@ -1743,14 +1749,14 @@ class AdobePassIE(InfoExtractor): 'userMeta': '1', }), headers=mvpd_headers) if '<pendingLogout' in authorize: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) + self.cache.store(self._MVPD_CACHE, requestor_id, {}) count += 1 continue if '<error' in authorize: raise ExtractorError(xml_text(authorize, 'details'), expected=True) authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) requestor_info[guid] = authz_token - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) + self.cache.store(self._MVPD_CACHE, requestor_id, requestor_info) mvpd_headers.update({ 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), @@ -1766,7 +1772,7 @@ class AdobePassIE(InfoExtractor): 'hashed_guid': 'false', }), headers=mvpd_headers) if '<pendingLogout' in short_authorize: - self._downloader.cache.store(self._MVPD_CACHE, requestor_id, {}) + self.cache.store(self._MVPD_CACHE, requestor_id, {}) count += 1 continue return short_authorize diff --git a/hypervideo_dl/extractor/adobetv.py b/hypervideo_dl/extractor/adobetv.py index 3cfa1ff..d1525a1 100644 --- a/hypervideo_dl/extractor/adobetv.py +++ b/hypervideo_dl/extractor/adobetv.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import functools import re @@ -72,7 +70,6 @@ class AdobeTVBaseIE(InfoExtractor): }) s3_extracted = True formats.append(f) - self._sort_formats(formats) return { 'id': video_id, @@ -234,6 +231,7 @@ class AdobeTVChannelIE(AdobeTVPlaylistBaseIE): class 
AdobeTVVideoIE(AdobeTVBaseIE): IE_NAME = 'adobetv:video' _VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]'] _TEST = { # From https://helpx.adobe.com/acrobat/how-to/new-experience-acrobat-dc.html?set=acrobat--get-started--essential-beginners @@ -270,7 +268,6 @@ class AdobeTVVideoIE(AdobeTVBaseIE): 'width': int_or_none(source.get('width') or None), 'url': source_src, }) - self._sort_formats(formats) # For both metadata and downloaded files the duration varies among # formats. I just pick the max one diff --git a/hypervideo_dl/extractor/adultswim.py b/hypervideo_dl/extractor/adultswim.py index c97cfc1..bd29eb4 100644 --- a/hypervideo_dl/extractor/adultswim.py +++ b/hypervideo_dl/extractor/adultswim.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .turner import TurnerBaseIE @@ -183,7 +180,6 @@ class AdultSwimIE(TurnerBaseIE): info['subtitles'].setdefault('en', []).append({ 'url': asset_url, }) - self._sort_formats(info['formats']) return info else: diff --git a/hypervideo_dl/extractor/aenetworks.py b/hypervideo_dl/extractor/aenetworks.py index 8025de5..d7c4010 100644 --- a/hypervideo_dl/extractor/aenetworks.py +++ b/hypervideo_dl/extractor/aenetworks.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .theplatform import ThePlatformIE from ..utils import ( ExtractorError, @@ -12,7 +8,7 @@ from ..utils import ( ) -class AENetworksBaseIE(ThePlatformIE): +class AENetworksBaseIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _BASE_URL_REGEX = r'''(?x)https?:// (?:(?:www|play|watch)\.)? (?P<domain> @@ -32,14 +28,17 @@ class AENetworksBaseIE(ThePlatformIE): } def _extract_aen_smil(self, smil_url, video_id, auth=None): - query = {'mbr': 'true'} + query = { + 'mbr': 'true', + 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', + } if auth: query['auth'] = auth TP_SMIL_QUERY = [{ 'assetTypes': 'high_video_ak', - 'switch': 'hls_high_ak' + 'switch': 'hls_high_ak', }, { - 'assetTypes': 'high_video_s3' + 'assetTypes': 'high_video_s3', }, { 'assetTypes': 'high_video_s3', 'switch': 'hls_high_fastly', @@ -63,7 +62,6 @@ class AENetworksBaseIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) if last_e and not formats: raise last_e - self._sort_formats(formats) return { 'id': video_id, 'formats': formats, @@ -305,7 +303,6 @@ class HistoryTopicIE(AENetworksBaseIE): class HistoryPlayerIE(AENetworksBaseIE): IE_NAME = 'history:player' _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|biography)\.com)/player/(?P<id>\d+)' - _TESTS = [] def _real_extract(self, url): domain, video_id = self._match_valid_url(url).groups() diff --git a/hypervideo_dl/extractor/aeonco.py b/hypervideo_dl/extractor/aeonco.py new file mode 100644 index 0000000..4655862 --- /dev/null +++ b/hypervideo_dl/extractor/aeonco.py @@ -0,0 +1,40 @@ +from .common import InfoExtractor +from .vimeo import VimeoIE + + +class AeonCoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?aeon\.co/videos/(?P<id>[^/?]+)' + _TESTS = [{ + 'url': 'https://aeon.co/videos/raw-solar-storm-footage-is-the-punk-rock-antidote-to-sleek-james-webb-imagery', + 'md5': 'e5884d80552c9b6ea8d268a258753362', + 'info_dict': { + 'id': '1284717', + 'ext': 'mp4', + 'title': 'Brilliant Noise', + 'thumbnail': 'https://i.vimeocdn.com/video/21006315-1a1e49da8b07fd908384a982b4ba9ff0268c509a474576ebdf7b1392f4acae3b-d_960', + 'uploader': 'Semiconductor', + 
'uploader_id': 'semiconductor', + 'uploader_url': 'https://vimeo.com/semiconductor', + 'duration': 348 + } + }, { + 'url': 'https://aeon.co/videos/dazzling-timelapse-shows-how-microbes-spoil-our-food-and-sometimes-enrich-it', + 'md5': '4e5f3dad9dbda0dbfa2da41a851e631e', + 'info_dict': { + 'id': '728595228', + 'ext': 'mp4', + 'title': 'Wrought', + 'thumbnail': 'https://i.vimeocdn.com/video/1484618528-c91452611f9a4e4497735a533da60d45b2fe472deb0c880f0afaab0cd2efb22a-d_1280', + 'uploader': 'Biofilm Productions', + 'uploader_id': 'user140352216', + 'uploader_url': 'https://vimeo.com/user140352216', + 'duration': 1344 + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + vimeo_id = self._search_regex(r'hosterId":\s*"(?P<id>[0-9]+)', webpage, 'vimeo id') + vimeo_url = VimeoIE._smuggle_referrer(f'https://player.vimeo.com/video/{vimeo_id}', 'https://aeon.co') + return self.url_result(vimeo_url, VimeoIE) diff --git a/hypervideo_dl/extractor/afreecatv.py b/hypervideo_dl/extractor/afreecatv.py index 77f0e3c..9276fe7 100644 --- a/hypervideo_dl/extractor/afreecatv.py +++ b/hypervideo_dl/extractor/afreecatv.py @@ -1,14 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import functools import re from .common import InfoExtractor -from ..compat import compat_xpath from ..utils import ( + ExtractorError, + OnDemandPagedList, date_from_str, determine_ext, - ExtractorError, int_or_none, qualities, traverse_obj, @@ -280,7 +278,7 @@ class AfreecaTVIE(InfoExtractor): else: raise ExtractorError('Unable to download video info') - video_element = video_xml.findall(compat_xpath('./track/video'))[-1] + video_element = video_xml.findall('./track/video')[-1] if video_element is None or video_element.text is None: raise ExtractorError( 'Video %s does not exist' % video_id, expected=True) @@ -310,7 +308,7 @@ class AfreecaTVIE(InfoExtractor): if not video_url: entries = [] - file_elements = video_element.findall(compat_xpath('./file')) + file_elements = video_element.findall('./file') one = len(file_elements) == 1 for file_num, file_element in enumerate(file_elements, start=1): file_url = url_or_none(file_element.text) @@ -340,7 +338,6 @@ class AfreecaTVIE(InfoExtractor): }] if not formats and not self.get_param('ignore_no_formats'): continue - self._sort_formats(formats) file_info = common_entry.copy() file_info.update({ 'id': format_id, @@ -382,7 +379,7 @@ class AfreecaTVIE(InfoExtractor): return info -class AfreecaTVLiveIE(AfreecaTVIE): +class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE IE_NAME = 'afreecatv:live' _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P<id>[^/]+)(?:/(?P<bno>\d+))?' @@ -466,8 +463,6 @@ class AfreecaTVLiveIE(AfreecaTVIE): 'quality': quality_key(quality_str), }) - self._sort_formats(formats) - station_info = self._download_json( 'https://st.afreecatv.com/api/get_station_status.php', broadcast_no, query={'szBjId': broadcaster_id}, fatal=False, @@ -482,3 +477,57 @@ class AfreecaTVLiveIE(AfreecaTVIE): 'formats': formats, 'is_live': True, } + + +class AfreecaTVUserIE(InfoExtractor): + IE_NAME = 'afreecatv:user' + _VALID_URL = r'https?://bj\.afreeca(?:tv)?\.com/(?P<id>[^/]+)/vods/?(?P<slug_type>[^/]+)?' 
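# Note on the paging pattern used by this extractor (sketch mirroring the code
# below): OnDemandPagedList calls the fetcher lazily with a 0-based page index,
# so only the pages a user actually requests are downloaded; _fetch_page does
# `page += 1` because the bjapi endpoint counts pages from 1.
#
#   entries = OnDemandPagedList(
#       functools.partial(self._fetch_page, user_id, user_type),
#       self._PER_PAGE)  # 60 items per page here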
+ _TESTS = [{ + 'url': 'https://bj.afreecatv.com/ryuryu24/vods/review', + 'info_dict': { + '_type': 'playlist', + 'id': 'ryuryu24', + 'title': 'ryuryu24 - review', + }, + 'playlist_count': 218, + }, { + 'url': 'https://bj.afreecatv.com/parang1995/vods/highlight', + 'info_dict': { + '_type': 'playlist', + 'id': 'parang1995', + 'title': 'parang1995 - highlight', + }, + 'playlist_count': 997, + }, { + 'url': 'https://bj.afreecatv.com/ryuryu24/vods', + 'info_dict': { + '_type': 'playlist', + 'id': 'ryuryu24', + 'title': 'ryuryu24 - all', + }, + 'playlist_count': 221, + }, { + 'url': 'https://bj.afreecatv.com/ryuryu24/vods/balloonclip', + 'info_dict': { + '_type': 'playlist', + 'id': 'ryuryu24', + 'title': 'ryuryu24 - balloonclip', + }, + 'playlist_count': 0, + }] + _PER_PAGE = 60 + + def _fetch_page(self, user_id, user_type, page): + page += 1 + info = self._download_json(f'https://bjapi.afreecatv.com/api/{user_id}/vods/{user_type}', user_id, + query={'page': page, 'per_page': self._PER_PAGE, 'orderby': 'reg_date'}, + note=f'Downloading {user_type} video page {page}') + for item in info['data']: + yield self.url_result( + f'https://vod.afreecatv.com/player/{item["title_no"]}/', AfreecaTVIE, item['title_no']) + + def _real_extract(self, url): + user_id, user_type = self._match_valid_url(url).group('id', 'slug_type') + user_type = user_type or 'all' + entries = OnDemandPagedList(functools.partial(self._fetch_page, user_id, user_type), self._PER_PAGE) + return self.playlist_result(entries, user_id, f'{user_id} - {user_type}') diff --git a/hypervideo_dl/extractor/agora.py b/hypervideo_dl/extractor/agora.py new file mode 100644 index 0000000..abb2d3f --- /dev/null +++ b/hypervideo_dl/extractor/agora.py @@ -0,0 +1,251 @@ +import functools +import uuid + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + OnDemandPagedList, + int_or_none, + month_by_name, + parse_duration, + try_call, +) + + +class WyborczaVideoIE(InfoExtractor): + # this id is not an article id, it has to be extracted from the article + _VALID_URL = r'(?:wyborcza:video:|https?://wyborcza\.pl/(?:api-)?video/)(?P<id>\d+)' + IE_NAME = 'wyborcza:video' + _TESTS = [{ + 'url': 'wyborcza:video:26207634', + 'info_dict': { + 'id': '26207634', + 'ext': 'mp4', + 'title': '- Polska w 2020 r. jest innym państwem niż w 2015 r. 
Nie zmieniła się konstytucja, ale jest to już inny ustrój - mówi Adam Bodnar', + 'description': ' ', + 'uploader': 'Dorota Roman', + 'duration': 2474, + 'thumbnail': r're:https://.+\.jpg', + }, + }, { + 'url': 'https://wyborcza.pl/video/26207634', + 'only_matching': True, + }, { + 'url': 'https://wyborcza.pl/api-video/26207634', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + meta = self._download_json(f'https://wyborcza.pl/api-video/{video_id}', video_id) + + formats = [] + base_url = meta['redirector'].replace('http://', 'https://') + meta['basePath'] + for quality in ('standard', 'high'): + if not meta['files'].get(quality): + continue + formats.append({ + 'url': base_url + meta['files'][quality], + 'height': int_or_none( + self._search_regex( + r'p(\d+)[a-z]+\.mp4$', meta['files'][quality], + 'mp4 video height', default=None)), + 'format_id': quality, + }) + if meta['files'].get('dash'): + formats.extend(self._extract_mpd_formats(base_url + meta['files']['dash'], video_id)) + + return { + 'id': video_id, + 'formats': formats, + 'title': meta.get('title'), + 'description': meta.get('lead'), + 'uploader': meta.get('signature'), + 'thumbnail': meta.get('imageUrl'), + 'duration': meta.get('duration'), + } + + +class WyborczaPodcastIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?://(?:www\.)?(?: + wyborcza\.pl/podcast(?:/0,172673\.html)?| + wysokieobcasy\.pl/wysokie-obcasy/0,176631\.html + )(?:\?(?:[^&#]+?&)*podcast=(?P<id>\d+))? + ''' + _TESTS = [{ + 'url': 'https://wyborcza.pl/podcast/0,172673.html?podcast=100720#S.main_topic-K.C-B.6-L.1.podcast', + 'info_dict': { + 'id': '100720', + 'ext': 'mp3', + 'title': 'Cyfrodziewczyny. Kim były pionierki polskiej informatyki ', + 'uploader': 'Michał Nogaś ', + 'upload_date': '20210117', + 'description': 'md5:49f0a06ffc4c1931210d3ab1416a651d', + 'duration': 3684.0, + 'thumbnail': r're:https://.+\.jpg', + }, + }, { + 'url': 'https://www.wysokieobcasy.pl/wysokie-obcasy/0,176631.html?podcast=100673', + 'info_dict': { + 'id': '100673', + 'ext': 'mp3', + 'title': 'Czym jest ubóstwo menstruacyjne i dlaczego dotyczy każdej i każdego z nas?', + 'uploader': 'Agnieszka Urazińska ', + 'upload_date': '20210115', + 'description': 'md5:c161dc035f8dbb60077011fc41274899', + 'duration': 1803.0, + 'thumbnail': r're:https://.+\.jpg', + }, + }, { + 'url': 'https://wyborcza.pl/podcast', + 'info_dict': { + 'id': '334', + 'title': 'Gościnnie: Wyborcza, 8:10', + 'series': 'Gościnnie: Wyborcza, 8:10', + }, + 'playlist_mincount': 370, + }, { + 'url': 'https://www.wysokieobcasy.pl/wysokie-obcasy/0,176631.html', + 'info_dict': { + 'id': '395', + 'title': 'Gościnnie: Wysokie Obcasy', + 'series': 'Gościnnie: Wysokie Obcasy', + }, + 'playlist_mincount': 12, + }] + + def _real_extract(self, url): + podcast_id = self._match_id(url) + + if not podcast_id: # playlist + podcast_id = '395' if 'wysokieobcasy.pl/' in url else '334' + return self.url_result(TokFMAuditionIE._create_url(podcast_id), TokFMAuditionIE, podcast_id) + + meta = self._download_json('https://wyborcza.pl/api/podcast', podcast_id, + query={'guid': podcast_id, 'type': 'wo' if 'wysokieobcasy.pl/' in url else None}) + + day, month, year = self._search_regex(r'^(\d\d?) 
(\w+) (\d{4})$', meta.get('publishedDate'), + 'upload date', group=(1, 2, 3), default=(None, None, None)) + return { + 'id': podcast_id, + 'url': meta['url'], + 'title': meta.get('title'), + 'description': meta.get('description'), + 'thumbnail': meta.get('imageUrl'), + 'duration': parse_duration(meta.get('duration')), + 'uploader': meta.get('author'), + 'upload_date': try_call(lambda: f'{year}{month_by_name(month, lang="pl"):0>2}{day:0>2}'), + } + + +class TokFMPodcastIE(InfoExtractor): + _VALID_URL = r'(?:https?://audycje\.tokfm\.pl/podcast/|tokfm:podcast:)(?P<id>\d+),?' + IE_NAME = 'tokfm:podcast' + _TESTS = [{ + 'url': 'https://audycje.tokfm.pl/podcast/91275,-Systemowy-rasizm-Czy-zamieszki-w-USA-po-morderstwie-w-Minneapolis-doprowadza-do-zmian-w-sluzbach-panstwowych', + 'info_dict': { + 'id': '91275', + 'ext': 'aac', + 'title': 'md5:a9b15488009065556900169fb8061cce', + 'episode': 'md5:a9b15488009065556900169fb8061cce', + 'series': 'Analizy', + }, + }] + + def _real_extract(self, url): + media_id = self._match_id(url) + + # in case it breaks see this but it returns a lot of useless data + # https://api.podcast.radioagora.pl/api4/getPodcasts?podcast_id=100091&with_guests=true&with_leaders_for_mobile=true + metadata = self._download_json( + f'https://audycje.tokfm.pl/getp/3{media_id}', media_id, 'Downloading podcast metadata') + if not metadata: + raise ExtractorError('No such podcast', expected=True) + metadata = metadata[0] + + formats = [] + for ext in ('aac', 'mp3'): + url_data = self._download_json( + f'https://api.podcast.radioagora.pl/api4/getSongUrl?podcast_id={media_id}&device_id={uuid.uuid4()}&ppre=false&audio={ext}', + media_id, 'Downloading podcast %s URL' % ext) + # prevents inserting the mp3 (default) multiple times + if 'link_ssl' in url_data and f'.{ext}' in url_data['link_ssl']: + formats.append({ + 'url': url_data['link_ssl'], + 'ext': ext, + 'vcodec': 'none', + 'acodec': ext, + }) + + return { + 'id': media_id, + 'formats': formats, + 'title': metadata.get('podcast_name'), + 'series': metadata.get('series_name'), + 'episode': metadata.get('podcast_name'), + } + + +class TokFMAuditionIE(InfoExtractor): + _VALID_URL = r'(?:https?://audycje\.tokfm\.pl/audycja/|tokfm:audition:)(?P<id>\d+),?' 
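# Note: _fetch_page below uses self.RetryManager(), the extractors' retry
# helper. Assigning retry.error inside the loop marks the attempt as failed,
# and the manager re-runs the block until it succeeds or retries run out; a
# minimal sketch of the pattern:
#
#   for retry in self.RetryManager():
#       page_data = self._download_json(page_url, audition_id)
#       if not page_data:
#           retry.error = ExtractorError('Agora returned empty page', expected=True)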
+ IE_NAME = 'tokfm:audition' + _TESTS = [{ + 'url': 'https://audycje.tokfm.pl/audycja/218,Analizy', + 'info_dict': { + 'id': '218', + 'title': 'Analizy', + 'series': 'Analizy', + }, + 'playlist_count': 1635, + }] + + _PAGE_SIZE = 30 + _HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Linux; Android 9; Redmi 3S Build/PQ3A.190801.002; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/87.0.4280.101 Mobile Safari/537.36', + } + + @staticmethod + def _create_url(id): + return f'https://audycje.tokfm.pl/audycja/{id}' + + def _real_extract(self, url): + audition_id = self._match_id(url) + + data = self._download_json( + f'https://api.podcast.radioagora.pl/api4/getSeries?series_id={audition_id}', + audition_id, 'Downloading audition metadata', headers=self._HEADERS) + if not data: + raise ExtractorError('No such audition', expected=True) + data = data[0] + + entries = OnDemandPagedList(functools.partial( + self._fetch_page, audition_id, data), self._PAGE_SIZE) + + return { + '_type': 'playlist', + 'id': audition_id, + 'title': data.get('series_name'), + 'series': data.get('series_name'), + 'entries': entries, + } + + def _fetch_page(self, audition_id, data, page): + for retry in self.RetryManager(): + podcast_page = self._download_json( + f'https://api.podcast.radioagora.pl/api4/getPodcasts?series_id={audition_id}&limit=30&offset={page}&with_guests=true&with_leaders_for_mobile=true', + audition_id, f'Downloading podcast list page {page + 1}', headers=self._HEADERS) + if not podcast_page: + retry.error = ExtractorError('Agora returned empty page', expected=True) + + for podcast in podcast_page: + yield { + '_type': 'url_transparent', + 'url': podcast['podcast_sharing_url'], + 'ie_key': TokFMPodcastIE.ie_key(), + 'title': podcast.get('podcast_name'), + 'episode': podcast.get('podcast_name'), + 'description': podcast.get('podcast_description'), + 'timestamp': int_or_none(podcast.get('podcast_timestamp')), + 'series': data.get('series_name'), + } diff --git a/hypervideo_dl/extractor/airmozilla.py b/hypervideo_dl/extractor/airmozilla.py index 9e38136..669556b 100644 --- a/hypervideo_dl/extractor/airmozilla.py +++ b/hypervideo_dl/extractor/airmozilla.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/aliexpress.py b/hypervideo_dl/extractor/aliexpress.py index 9722fe9..2e83f2e 100644 --- a/hypervideo_dl/extractor/aliexpress.py +++ b/hypervideo_dl/extractor/aliexpress.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/aljazeera.py b/hypervideo_dl/extractor/aljazeera.py index 7bcdb7a..124bab0 100644 --- a/hypervideo_dl/extractor/aljazeera.py +++ b/hypervideo_dl/extractor/aljazeera.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/allocine.py b/hypervideo_dl/extractor/allocine.py index 403a277..2d342cf 100644 --- a/hypervideo_dl/extractor/allocine.py +++ b/hypervideo_dl/extractor/allocine.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -115,8 +112,6 @@ class AllocineIE(InfoExtractor): }) duration, view_count, timestamp = [None] * 3 - self._sort_formats(formats) - return { 'id': video_id, 'display_id': display_id, 
diff --git a/hypervideo_dl/extractor/alphaporno.py b/hypervideo_dl/extractor/alphaporno.py index 3a6d99f..8d5b472 100644 --- a/hypervideo_dl/extractor/alphaporno.py +++ b/hypervideo_dl/extractor/alphaporno.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_iso8601, diff --git a/hypervideo_dl/extractor/alsace20tv.py b/hypervideo_dl/extractor/alsace20tv.py index 4aae6fe..ea3332e 100644 --- a/hypervideo_dl/extractor/alsace20tv.py +++ b/hypervideo_dl/extractor/alsace20tv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, @@ -25,7 +22,6 @@ class Alsace20TVBaseIE(InfoExtractor): self._extract_smil_formats(fmt_url, video_id, fatal=False) if '/smil:_' in fmt_url else self._extract_mpd_formats(fmt_url, video_id, mpd_id=res, fatal=False)) - self._sort_formats(formats) webpage = (url and self._download_webpage(url, video_id, fatal=False)) or '' thumbnail = url_or_none(dict_get(info, ('image', 'preview', )) or self._og_search_thumbnail(webpage)) diff --git a/hypervideo_dl/extractor/alura.py b/hypervideo_dl/extractor/alura.py index d2e2df2..bfe066b 100644 --- a/hypervideo_dl/extractor/alura.py +++ b/hypervideo_dl/extractor/alura.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -66,8 +63,6 @@ class AluraIE(InfoExtractor): f['height'] = int('720' if m.group('res') == 'hd' else '480') formats.extend(video_format) - self._sort_formats(formats) - return { 'id': video_id, 'title': video_title, @@ -116,7 +111,7 @@ class AluraIE(InfoExtractor): raise ExtractorError('Unable to log in') -class AluraCourseIE(AluraIE): +class AluraCourseIE(AluraIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)' _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' diff --git a/hypervideo_dl/extractor/amara.py b/hypervideo_dl/extractor/amara.py index 61d4695..5018710 100644 --- a/hypervideo_dl/extractor/amara.py +++ b/hypervideo_dl/extractor/amara.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .youtube import YoutubeIE from .vimeo import VimeoIE diff --git a/hypervideo_dl/extractor/amazon.py b/hypervideo_dl/extractor/amazon.py index 07b1b18..4d31706 100644 --- a/hypervideo_dl/extractor/amazon.py +++ b/hypervideo_dl/extractor/amazon.py @@ -1,6 +1,5 @@ -# coding: utf-8 from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ExtractorError, int_or_none class AmazonStoreIE(InfoExtractor): @@ -10,7 +9,7 @@ class AmazonStoreIE(InfoExtractor): 'url': 'https://www.amazon.co.uk/dp/B098XNCHLD/', 'info_dict': { 'id': 'B098XNCHLD', - 'title': 'md5:5f3194dbf75a8dcfc83079bd63a2abed', + 'title': 'md5:dae240564cbb2642170c02f7f0d7e472', }, 'playlist_mincount': 1, 'playlist': [{ @@ -19,28 +18,44 @@ class AmazonStoreIE(InfoExtractor): 'ext': 'mp4', 'title': 'mcdodo usb c cable 100W 5a', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 34, }, }] }, { 'url': 'https://www.amazon.in/Sony-WH-1000XM4-Cancelling-Headphones-Bluetooth/dp/B0863TXGM3', 'info_dict': { 'id': 'B0863TXGM3', - 'title': 'md5:b0bde4881d3cfd40d63af19f7898b8ff', + 'title': 'md5:d1d3352428f8f015706c84b31e132169', }, 'playlist_mincount': 4, }, { 'url': 'https://www.amazon.com/dp/B0845NXCXF/', 'info_dict': { 'id': 'B0845NXCXF', - 'title': 
'md5:2145cd4e3c7782f1ee73649a3cff1171', + 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', }, 'playlist-mincount': 1, + }, { + 'url': 'https://www.amazon.es/Samsung-Smartphone-s-AMOLED-Quad-c%C3%A1mara-espa%C3%B1ola/dp/B08WX337PQ', + 'info_dict': { + 'id': 'B08WX337PQ', + 'title': 'md5:f3fa12779bf62ddb6a6ec86a360a858e', + }, + 'playlist_mincount': 1, }] def _real_extract(self, url): id = self._match_id(url) - webpage = self._download_webpage(url, id) - data_json = self._parse_json(self._html_search_regex(r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'(.*)\'\)', webpage, 'data'), id) + + for retry in self.RetryManager(): + webpage = self._download_webpage(url, id) + try: + data_json = self._search_json( + r'var\s?obj\s?=\s?jQuery\.parseJSON\(\'', webpage, 'data', id, + transform_source=lambda x: x.replace(R'\\u', R'\u')) + except ExtractorError as e: + retry.error = e + entries = [{ 'id': video['marketPlaceID'], 'url': video['url'], @@ -50,4 +65,4 @@ class AmazonStoreIE(InfoExtractor): 'height': int_or_none(video.get('videoHeight')), 'width': int_or_none(video.get('videoWidth')), } for video in (data_json.get('videos') or []) if video.get('isVideo') and video.get('url')] - return self.playlist_result(entries, playlist_id=id, playlist_title=data_json['title']) + return self.playlist_result(entries, playlist_id=id, playlist_title=data_json.get('title')) diff --git a/hypervideo_dl/extractor/amazonminitv.py b/hypervideo_dl/extractor/amazonminitv.py new file mode 100644 index 0000000..7309968 --- /dev/null +++ b/hypervideo_dl/extractor/amazonminitv.py @@ -0,0 +1,290 @@ +import json + +from .common import InfoExtractor +from ..utils import ExtractorError, int_or_none, traverse_obj, try_get + + +class AmazonMiniTVBaseIE(InfoExtractor): + def _real_initialize(self): + self._download_webpage( + 'https://www.amazon.in/minitv', None, + note='Fetching guest session cookies') + AmazonMiniTVBaseIE.session_id = self._get_cookies('https://www.amazon.in')['session-id'].value + + def _call_api(self, asin, data=None, note=None): + device = {'clientId': 'ATVIN', 'deviceLocale': 'en_GB'} + if data: + data['variables'].update({ + 'contentType': 'VOD', + 'sessionIdToken': self.session_id, + **device, + }) + + resp = self._download_json( + f'https://www.amazon.in/minitv/api/web/{"graphql" if data else "prs"}', + asin, note=note, headers={'Content-Type': 'application/json'}, + data=json.dumps(data).encode() if data else None, + query=None if data else { + 'deviceType': 'A1WMMUXPCUJL4N', + 'contentId': asin, + **device, + }) + + if resp.get('errors'): + raise ExtractorError(f'MiniTV said: {resp["errors"][0]["message"]}') + elif not data: + return resp + return resp['data'][data['operationName']] + + +class AmazonMiniTVIE(AmazonMiniTVBaseIE): + _VALID_URL = r'(?:https?://(?:www\.)?amazon\.in/minitv/tp/|amazonminitv:(?:amzn1\.dv\.gti\.)?)(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'https://www.amazon.in/minitv/tp/75fe3a75-b8fe-4499-8100-5c9424344840?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', + 'info_dict': { + 'id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840', + 'ext': 'mp4', + 'title': 'May I Kiss You?', + 'language': 'Hindi', + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'md5:a549bfc747973e04feb707833474e59d', + 'release_timestamp': 1644710400, + 'release_date': '20220213', + 'duration': 846, + 'chapters': 'count:2', + 'series': 'Couple Goals', + 'series_id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + 'season': 'Season 3', + 'season_number': 3, + 'season_id': 
'amzn1.dv.gti.20331016-d9b9-4968-b991-c89fa4927a36', + 'episode': 'May I Kiss You?', + 'episode_number': 2, + 'episode_id': 'amzn1.dv.gti.75fe3a75-b8fe-4499-8100-5c9424344840', + }, + }, { + 'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab?referrer=https%3A%2F%2Fwww.amazon.in%2Fminitv', + 'info_dict': { + 'id': 'amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab', + 'ext': 'mp4', + 'title': 'Jahaan', + 'language': 'Hindi', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:05eb765a77bf703f322f120ec6867339', + 'release_timestamp': 1647475200, + 'release_date': '20220317', + 'duration': 783, + 'chapters': [], + }, + }, { + 'url': 'https://www.amazon.in/minitv/tp/280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }, { + 'url': 'amazonminitv:amzn1.dv.gti.280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }, { + 'url': 'amazonminitv:280d2564-584f-452f-9c98-7baf906e01ab', + 'only_matching': True, + }] + + _GRAPHQL_QUERY_CONTENT = ''' +query content($sessionIdToken: String!, $deviceLocale: String, $contentId: ID!, $contentType: ContentType!, $clientId: String) { + content( + applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} + contentId: $contentId + contentType: $contentType + ) { + contentId + name + ... on Episode { + contentId + vodType + name + images + description { + synopsis + contentLengthInSeconds + } + publicReleaseDateUTC + audioTracks + seasonId + seriesId + seriesName + seasonNumber + episodeNumber + timecode { + endCreditsTime + } + } + ... on MovieContent { + contentId + vodType + name + description { + synopsis + contentLengthInSeconds + } + images + publicReleaseDateUTC + audioTracks + } + } +}''' + + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + prs = self._call_api(asin, note='Downloading playback info') + + formats, subtitles = [], {} + for type_, asset in prs['playbackAssets'].items(): + if not traverse_obj(asset, 'manifestUrl'): + continue + if type_ == 'hls': + m3u8_fmts, m3u8_subs = self._extract_m3u8_formats_and_subtitles( + asset['manifestUrl'], asin, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=type_, fatal=False) + formats.extend(m3u8_fmts) + subtitles = self._merge_subtitles(subtitles, m3u8_subs) + elif type_ == 'dash': + mpd_fmts, mpd_subs = self._extract_mpd_formats_and_subtitles( + asset['manifestUrl'], asin, mpd_id=type_, fatal=False) + formats.extend(mpd_fmts) + subtitles = self._merge_subtitles(subtitles, mpd_subs) + else: + self.report_warning(f'Unknown asset type: {type_}') + + title_info = self._call_api( + asin, note='Downloading title info', data={ + 'operationName': 'content', + 'variables': {'contentId': asin}, + 'query': self._GRAPHQL_QUERY_CONTENT, + }) + credits_time = try_get(title_info, lambda x: x['timecode']['endCreditsTime'] / 1000) + is_episode = title_info.get('vodType') == 'EPISODE' + + return { + 'id': asin, + 'title': title_info.get('name'), + 'formats': formats, + 'subtitles': subtitles, + 'language': traverse_obj(title_info, ('audioTracks', 0)), + 'thumbnails': [{ + 'id': type_, + 'url': url, + } for type_, url in (title_info.get('images') or {}).items()], + 'description': traverse_obj(title_info, ('description', 'synopsis')), + 'release_timestamp': int_or_none(try_get(title_info, lambda x: x['publicReleaseDateUTC'] / 1000)), + 'duration': traverse_obj(title_info, ('description', 'contentLengthInSeconds')), + 'chapters': [{ + 'start_time': credits_time, + 'title': 'End 
Credits', + }] if credits_time else [], + 'series': title_info.get('seriesName'), + 'series_id': title_info.get('seriesId'), + 'season_number': title_info.get('seasonNumber'), + 'season_id': title_info.get('seasonId'), + 'episode': title_info.get('name') if is_episode else None, + 'episode_number': title_info.get('episodeNumber'), + 'episode_id': asin if is_episode else None, + } + + +class AmazonMiniTVSeasonIE(AmazonMiniTVBaseIE): + IE_NAME = 'amazonminitv:season' + _VALID_URL = r'amazonminitv:season:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)' + IE_DESC = 'Amazon MiniTV Series, "minitv:season:" prefix' + _TESTS = [{ + 'url': 'amazonminitv:season:amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', + 'playlist_mincount': 6, + 'info_dict': { + 'id': 'amzn1.dv.gti.0aa996eb-6a1b-4886-a342-387fbd2f1db0', + }, + }, { + 'url': 'amazonminitv:season:0aa996eb-6a1b-4886-a342-387fbd2f1db0', + 'only_matching': True, + }] + + _GRAPHQL_QUERY = ''' +query getEpisodes($sessionIdToken: String!, $clientId: String, $episodeOrSeasonId: ID!, $deviceLocale: String) { + getEpisodes( + applicationContextInput: {sessionIdToken: $sessionIdToken, deviceLocale: $deviceLocale, clientId: $clientId} + episodeOrSeasonId: $episodeOrSeasonId + ) { + episodes { + ... on Episode { + contentId + name + images + seriesName + seasonId + seriesId + seasonNumber + episodeNumber + description { + synopsis + contentLengthInSeconds + } + publicReleaseDateUTC + } + } + } +} +''' + + def _entries(self, asin): + season_info = self._call_api( + asin, note='Downloading season info', data={ + 'operationName': 'getEpisodes', + 'variables': {'episodeOrSeasonId': asin}, + 'query': self._GRAPHQL_QUERY, + }) + + for episode in season_info['episodes']: + yield self.url_result( + f'amazonminitv:{episode["contentId"]}', AmazonMiniTVIE, episode['contentId']) + + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + return self.playlist_result(self._entries(asin), asin) + + +class AmazonMiniTVSeriesIE(AmazonMiniTVBaseIE): + IE_NAME = 'amazonminitv:series' + _VALID_URL = r'amazonminitv:series:(?:amzn1\.dv\.gti\.)?(?P<id>[a-f0-9-]+)' + _TESTS = [{ + 'url': 'amazonminitv:series:amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + 'playlist_mincount': 3, + 'info_dict': { + 'id': 'amzn1.dv.gti.56521d46-b040-4fd5-872e-3e70476a04b0', + }, + }, { + 'url': 'amazonminitv:series:56521d46-b040-4fd5-872e-3e70476a04b0', + 'only_matching': True, + }] + + _GRAPHQL_QUERY = ''' +query getSeasons($sessionIdToken: String!, $deviceLocale: String, $episodeOrSeasonOrSeriesId: ID!, $clientId: String) { + getSeasons( + applicationContextInput: {deviceLocale: $deviceLocale, sessionIdToken: $sessionIdToken, clientId: $clientId} + episodeOrSeasonOrSeriesId: $episodeOrSeasonOrSeriesId + ) { + seasons { + seasonId + } + } +} +''' + + def _entries(self, asin): + season_info = self._call_api( + asin, note='Downloading series info', data={ + 'operationName': 'getSeasons', + 'variables': {'episodeOrSeasonOrSeriesId': asin}, + 'query': self._GRAPHQL_QUERY, + }) + + for season in season_info['seasons']: + yield self.url_result(f'amazonminitv:season:{season["seasonId"]}', AmazonMiniTVSeasonIE, season['seasonId']) + + def _real_extract(self, url): + asin = f'amzn1.dv.gti.{self._match_id(url)}' + return self.playlist_result(self._entries(asin), asin) diff --git a/hypervideo_dl/extractor/amcnetworks.py b/hypervideo_dl/extractor/amcnetworks.py index e38e215..c58bc7b 100644 --- a/hypervideo_dl/extractor/amcnetworks.py +++ b/hypervideo_dl/extractor/amcnetworks.py @@ 
-1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .theplatform import ThePlatformIE @@ -12,7 +9,7 @@ from ..utils import ( ) -class AMCNetworksIE(ThePlatformIE): +class AMCNetworksIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?(?P<site>amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?P<id>(?:movies|shows(?:/[^/]+)+)/[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bbcamerica.com/shows/the-graham-norton-show/videos/tina-feys-adorable-airline-themed-family-dinner--51631', @@ -109,7 +106,6 @@ class AMCNetworksIE(ThePlatformIE): media_url = update_url_query(media_url, query) formats, subtitles = self._extract_theplatform_smil( media_url, video_id) - self._sort_formats(formats) thumbnails = [] thumbnail_urls = [properties.get('imageDesktop')] diff --git a/hypervideo_dl/extractor/americastestkitchen.py b/hypervideo_dl/extractor/americastestkitchen.py index 6e6099a..abda55d 100644 --- a/hypervideo_dl/extractor/americastestkitchen.py +++ b/hypervideo_dl/extractor/americastestkitchen.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -14,7 +11,7 @@ from ..utils import ( class AmericasTestKitchenIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:americastestkitchen|cooks(?:country|illustrated))\.com/(?P<resource_type>episode|videos)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com/(?:cooks(?:country|illustrated)/)?(?P<resource_type>episode|videos)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.americastestkitchen.com/episode/582-weeknight-japanese-suppers', 'md5': 'b861c3e365ac38ad319cfd509c30577f', @@ -22,15 +19,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5b400b9ee338f922cb06450c', 'title': 'Japanese Suppers', 'ext': 'mp4', + 'display_id': 'weeknight-japanese-suppers', 'description': 'md5:64e606bfee910627efc4b5f050de92b3', - 'thumbnail': r're:^https?://', - 'timestamp': 1523318400, - 'upload_date': '20180410', - 'release_date': '20180410', - 'series': "America's Test Kitchen", - 'season_number': 18, + 'timestamp': 1523304000, + 'upload_date': '20180409', + 'release_date': '20180409', + 'series': 'America\'s Test Kitchen', + 'season': 'Season 18', 'episode': 'Japanese Suppers', + 'season_number': 18, 'episode_number': 15, + 'duration': 1376, + 'thumbnail': r're:^https?://', + 'average_rating': 0, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -43,15 +45,20 @@ class AmericasTestKitchenIE(InfoExtractor): 'id': '5fbe8c61bda2010001c6763b', 'title': 'Simple Chicken Dinner', 'ext': 'mp4', + 'display_id': 'atktv_2103_simple-chicken-dinner_full-episode_web-mp4', 'description': 'md5:eb68737cc2fd4c26ca7db30139d109e7', - 'thumbnail': r're:^https?://', - 'timestamp': 1610755200, - 'upload_date': '20210116', - 'release_date': '20210116', - 'series': "America's Test Kitchen", - 'season_number': 21, + 'timestamp': 1610737200, + 'upload_date': '20210115', + 'release_date': '20210115', + 'series': 'America\'s Test Kitchen', + 'season': 'Season 21', 'episode': 'Simple Chicken Dinner', + 'season_number': 21, 'episode_number': 3, + 'duration': 1397, + 'thumbnail': r're:^https?://', + 'view_count': int, + 'average_rating': 0, }, 'params': { 'skip_download': True, @@ -60,10 +67,10 @@ class AmericasTestKitchenIE(InfoExtractor): 'url': 'https://www.americastestkitchen.com/videos/3420-pan-seared-salmon', 'only_matching': True, }, { - 'url': 'https://www.cookscountry.com/episode/564-when-only-chocolate-will-do', + 
'url': 'https://www.americastestkitchen.com/cookscountry/episode/564-when-only-chocolate-will-do', 'only_matching': True, }, { - 'url': 'https://www.cooksillustrated.com/videos/4478-beef-wellington', + 'url': 'https://www.americastestkitchen.com/cooksillustrated/videos/4478-beef-wellington', 'only_matching': True, }] @@ -93,7 +100,7 @@ class AmericasTestKitchenIE(InfoExtractor): class AmericasTestKitchenSeasonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<show>americastestkitchen|cookscountry)\.com/episodes/browse/season_(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?americastestkitchen\.com(?P<show>/cookscountry)?/episodes/browse/season_(?P<id>\d+)' _TESTS = [{ # ATK Season 'url': 'https://www.americastestkitchen.com/episodes/browse/season_1', @@ -104,7 +111,7 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): 'playlist_count': 13, }, { # Cooks Country Season - 'url': 'https://www.cookscountry.com/episodes/browse/season_12', + 'url': 'https://www.americastestkitchen.com/cookscountry/episodes/browse/season_12', 'info_dict': { 'id': 'season_12', 'title': 'Season 12', @@ -113,17 +120,17 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): }] def _real_extract(self, url): - show_name, season_number = self._match_valid_url(url).groups() + show_path, season_number = self._match_valid_url(url).group('show', 'id') season_number = int(season_number) - slug = 'atk' if show_name == 'americastestkitchen' else 'cco' + slug = 'cco' if show_path == '/cookscountry' else 'atk' season = 'Season %d' % season_number season_search = self._download_json( 'https://y1fnzxui30-dsn.algolia.net/1/indexes/everest_search_%s_season_desc_production' % slug, season, headers={ - 'Origin': 'https://www.%s.com' % show_name, + 'Origin': 'https://www.americastestkitchen.com', 'X-Algolia-API-Key': '8d504d0099ed27c1b73708d22871d805', 'X-Algolia-Application-Id': 'Y1FNZXUI30', }, query={ @@ -139,12 +146,12 @@ class AmericasTestKitchenSeasonIE(InfoExtractor): def entries(): for episode in (season_search.get('hits') or []): - search_url = episode.get('search_url') + search_url = episode.get('search_url') # always formatted like '/episode/123-title-of-episode' if not search_url: continue yield { '_type': 'url', - 'url': 'https://www.%s.com%s' % (show_name, search_url), + 'url': f'https://www.americastestkitchen.com{show_path or ""}{search_url}', 'id': try_get(episode, lambda e: e['objectID'].split('_')[-1]), 'title': episode.get('title'), 'description': episode.get('description'), diff --git a/hypervideo_dl/extractor/amp.py b/hypervideo_dl/extractor/amp.py index 24c684c..b0cbd77 100644 --- a/hypervideo_dl/extractor/amp.py +++ b/hypervideo_dl/extractor/amp.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -13,7 +10,7 @@ from ..utils import ( ) -class AMPIE(InfoExtractor): +class AMPIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor # parse Akamai Adaptive Media Player feed def _extract_feed_info(self, url): feed = self._download_json( @@ -87,8 +84,6 @@ class AMPIE(InfoExtractor): 'ext': ext, }) - self._sort_formats(formats) - timestamp = unified_timestamp(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')) return { diff --git a/hypervideo_dl/extractor/angel.py b/hypervideo_dl/extractor/angel.py new file mode 100644 index 0000000..306b365 --- /dev/null +++ b/hypervideo_dl/extractor/angel.py @@ -0,0 +1,56 @@ +import re + +from .common import InfoExtractor +from 
..utils import url_or_none, merge_dicts + + +class AngelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?angel\.com/watch/(?P<series>[^/?#]+)/episode/(?P<id>[\w-]+)/season-(?P<season_number>\d+)/episode-(?P<episode_number>\d+)/(?P<title>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.angel.com/watch/tuttle-twins/episode/2f3d0382-ea82-4cdc-958e-84fbadadc710/season-1/episode-1/when-laws-give-you-lemons', + 'md5': '4734e5cfdd64a568e837246aa3eaa524', + 'info_dict': { + 'id': '2f3d0382-ea82-4cdc-958e-84fbadadc710', + 'ext': 'mp4', + 'title': 'Tuttle Twins Season 1, Episode 1: When Laws Give You Lemons', + 'description': 'md5:73b704897c20ab59c433a9c0a8202d5e', + 'thumbnail': r're:^https?://images.angelstudios.com/image/upload/angel-app/.*$', + 'duration': 1359.0 + } + }, { + 'url': 'https://www.angel.com/watch/the-chosen/episode/8dfb714d-bca5-4812-8125-24fb9514cd10/season-1/episode-1/i-have-called-you-by-name', + 'md5': 'e4774bad0a5f0ad2e90d175cafdb797d', + 'info_dict': { + 'id': '8dfb714d-bca5-4812-8125-24fb9514cd10', + 'ext': 'mp4', + 'title': 'The Chosen Season 1, Episode 1: I Have Called You By Name', + 'description': 'md5:aadfb4827a94415de5ff6426e6dee3be', + 'thumbnail': r're:^https?://images.angelstudios.com/image/upload/angel-app/.*$', + 'duration': 3276.0 + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + json_ld = self._search_json_ld(webpage, video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + json_ld.pop('url'), video_id, note='Downloading HD m3u8 information') + + info_dict = { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'formats': formats, + 'subtitles': subtitles + } + + # Angel uses cloudinary in the background and supports image transformations. 
+ # We remove these transformations and return the source file + base_thumbnail_url = url_or_none(self._og_search_thumbnail(webpage)) or json_ld.pop('thumbnails') + if base_thumbnail_url: + info_dict['thumbnail'] = re.sub(r'(/upload)/.+(/angel-app/.+)$', r'\1\2', base_thumbnail_url) + + return merge_dicts(info_dict, json_ld) diff --git a/hypervideo_dl/extractor/animelab.py b/hypervideo_dl/extractor/animelab.py deleted file mode 100644 index 1c2cc47..0000000 --- a/hypervideo_dl/extractor/animelab.py +++ /dev/null @@ -1,278 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - -from ..utils import ( - ExtractorError, - urlencode_postdata, - int_or_none, - str_or_none, - determine_ext, -) - -from ..compat import compat_HTTPError - - -class AnimeLabBaseIE(InfoExtractor): - _LOGIN_URL = 'https://www.animelab.com/login' - _NETRC_MACHINE = 'animelab' - _LOGGED_IN = False - - def _is_logged_in(self, login_page=None): - if not self._LOGGED_IN: - if not login_page: - login_page = self._download_webpage(self._LOGIN_URL, None, 'Downloading login page') - AnimeLabBaseIE._LOGGED_IN = 'Sign In' not in login_page - return self._LOGGED_IN - - def _perform_login(self, username, password): - if self._is_logged_in(): - return - - login_form = { - 'email': username, - 'password': password, - } - - try: - response = self._download_webpage( - self._LOGIN_URL, None, 'Logging in', 'Wrong login info', - data=urlencode_postdata(login_form), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - raise ExtractorError('Unable to log in (wrong credentials?)', expected=True) - raise - - if not self._is_logged_in(response): - raise ExtractorError('Unable to login (cannot verify if logged in)') - - def _real_initialize(self): - if not self._is_logged_in(): - self.raise_login_required('Login is required to access any AnimeLab content') - - -class AnimeLabIE(AnimeLabBaseIE): - _VALID_URL = r'https?://(?:www\.)?animelab\.com/player/(?P<id>[^/]+)' - - # the following tests require authentication, but a free account will suffice - # just set 'usenetrc' to true in test/local_parameters.json if you use a .netrc file - # or you can set 'username' and 'password' there - # the tests also select a specific format so that the same video is downloaded - # regardless of whether the user is premium or not (needs testing on a premium account) - _TEST = { - 'url': 'https://www.animelab.com/player/fullmetal-alchemist-brotherhood-episode-42', - 'md5': '05bde4b91a5d1ff46ef5b94df05b0f7f', - 'info_dict': { - 'id': '383', - 'ext': 'mp4', - 'display_id': 'fullmetal-alchemist-brotherhood-episode-42', - 'title': 'Fullmetal Alchemist: Brotherhood - Episode 42 - Signs of a Counteroffensive', - 'description': 'md5:103eb61dd0a56d3dfc5dbf748e5e83f4', - 'series': 'Fullmetal Alchemist: Brotherhood', - 'episode': 'Signs of a Counteroffensive', - 'episode_number': 42, - 'duration': 1469, - 'season': 'Season 1', - 'season_number': 1, - 'season_id': '38', - }, - 'params': { - 'format': '[format_id=21711_yeshardsubbed_ja-JP][height=480]', - }, - 'skip': 'All AnimeLab content requires authentication', - } - - def _real_extract(self, url): - display_id = self._match_id(url) - - # unfortunately we can get different URLs for the same formats - # e.g. 
if we are using a "free" account so no dubs available - # (so _remove_duplicate_formats is not effective) - # so we use a dictionary as a workaround - formats = {} - for language_option_url in ('https://www.animelab.com/player/%s/subtitles', - 'https://www.animelab.com/player/%s/dubbed'): - actual_url = language_option_url % display_id - webpage = self._download_webpage(actual_url, display_id, 'Downloading URL ' + actual_url) - - video_collection = self._parse_json(self._search_regex(r'new\s+?AnimeLabApp\.VideoCollection\s*?\((.*?)\);', webpage, 'AnimeLab VideoCollection'), display_id) - position = int_or_none(self._search_regex(r'playlistPosition\s*?=\s*?(\d+)', webpage, 'Playlist Position')) - - raw_data = video_collection[position]['videoEntry'] - - video_id = str_or_none(raw_data['id']) - - # create a title from many sources (while grabbing other info) - # TODO use more fallback sources to get some of these - series = raw_data.get('showTitle') - video_type = raw_data.get('videoEntryType', {}).get('name') - episode_number = raw_data.get('episodeNumber') - episode_name = raw_data.get('name') - - title_parts = (series, video_type, episode_number, episode_name) - if None not in title_parts: - title = '%s - %s %s - %s' % title_parts - else: - title = episode_name - - description = raw_data.get('synopsis') or self._og_search_description(webpage, default=None) - - duration = int_or_none(raw_data.get('duration')) - - thumbnail_data = raw_data.get('images', []) - thumbnails = [] - for thumbnail in thumbnail_data: - for instance in thumbnail['imageInstances']: - image_data = instance.get('imageInfo', {}) - thumbnails.append({ - 'id': str_or_none(image_data.get('id')), - 'url': image_data.get('fullPath'), - 'width': image_data.get('width'), - 'height': image_data.get('height'), - }) - - season_data = raw_data.get('season', {}) or {} - season = str_or_none(season_data.get('name')) - season_number = int_or_none(season_data.get('seasonNumber')) - season_id = str_or_none(season_data.get('id')) - - for video_data in raw_data['videoList']: - current_video_list = {} - current_video_list['language'] = video_data.get('language', {}).get('languageCode') - - is_hardsubbed = video_data.get('hardSubbed') - - for video_instance in video_data['videoInstances']: - httpurl = video_instance.get('httpUrl') - url = httpurl if httpurl else video_instance.get('rtmpUrl') - if url is None: - # this video format is unavailable to the user (not premium etc.) 
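The dictionary workaround mentioned earlier in this function deduplicates by format_id rather than by URL, which is what makes it effective where _remove_duplicate_formats (URL-based) is not. A self-contained sketch of the pattern, with made-up format dicts:

    # Made-up format dicts, for illustration only.
    candidates = [
        {'format_id': '123_ja-JP', 'url': 'https://cdn-a.example/ep42.m3u8'},
        {'format_id': '123_ja-JP', 'url': 'https://cdn-b.example/ep42.m3u8'},  # same rendition, different URL
        {'format_id': '456_en-US', 'url': 'https://cdn-a.example/ep42-dub.m3u8'},
    ]

    formats = {}
    for f in candidates:
        formats[f['format_id']] = f  # a later duplicate simply overwrites the earlier one

    formats = list(formats.values())
    assert len(formats) == 2  # the URL-level duplicate is gone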
- continue - - current_format = current_video_list.copy() - - format_id_parts = [] - - format_id_parts.append(str_or_none(video_instance.get('id'))) - - if is_hardsubbed is not None: - if is_hardsubbed: - format_id_parts.append('yeshardsubbed') - else: - format_id_parts.append('nothardsubbed') - - format_id_parts.append(current_format['language']) - - format_id = '_'.join([x for x in format_id_parts if x is not None]) - - ext = determine_ext(url) - if ext == 'm3u8': - for format_ in self._extract_m3u8_formats( - url, video_id, m3u8_id=format_id, fatal=False): - formats[format_['format_id']] = format_ - continue - elif ext == 'mpd': - for format_ in self._extract_mpd_formats( - url, video_id, mpd_id=format_id, fatal=False): - formats[format_['format_id']] = format_ - continue - - current_format['url'] = url - quality_data = video_instance.get('videoQuality') - if quality_data: - quality = quality_data.get('name') or quality_data.get('description') - else: - quality = None - - height = None - if quality: - height = int_or_none(self._search_regex(r'(\d+)p?$', quality, 'Video format height', default=None)) - - if height is None: - self.report_warning('Could not get height of video') - else: - current_format['height'] = height - current_format['format_id'] = format_id - - formats[current_format['format_id']] = current_format - - formats = list(formats.values()) - self._sort_formats(formats) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'series': series, - 'episode': episode_name, - 'episode_number': int_or_none(episode_number), - 'thumbnails': thumbnails, - 'duration': duration, - 'formats': formats, - 'season': season, - 'season_number': season_number, - 'season_id': season_id, - } - - -class AnimeLabShowsIE(AnimeLabBaseIE): - _VALID_URL = r'https?://(?:www\.)?animelab\.com/shows/(?P<id>[^/]+)' - - _TEST = { - 'url': 'https://www.animelab.com/shows/attack-on-titan', - 'info_dict': { - 'id': '45', - 'title': 'Attack on Titan', - 'description': 'md5:989d95a2677e9309368d5cf39ba91469', - }, - 'playlist_count': 59, - 'skip': 'All AnimeLab content requires authentication', - } - - def _real_extract(self, url): - _BASE_URL = 'http://www.animelab.com' - _SHOWS_API_URL = '/api/videoentries/show/videos/' - display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id, 'Downloading requested URL') - - show_data_str = self._search_regex(r'({"id":.*}),\svideoEntry', webpage, 'AnimeLab show data') - show_data = self._parse_json(show_data_str, display_id) - - show_id = str_or_none(show_data.get('id')) - title = show_data.get('name') - description = show_data.get('shortSynopsis') or show_data.get('longSynopsis') - - entries = [] - for season in show_data['seasons']: - season_id = season['id'] - get_data = urlencode_postdata({ - 'seasonId': season_id, - 'limit': 1000, - }) - # despite using urlencode_postdata, we are sending a GET request - target_url = _BASE_URL + _SHOWS_API_URL + show_id + "?" 
+ get_data.decode('utf-8') - response = self._download_webpage( - target_url, - None, 'Season id %s' % season_id) - - season_data = self._parse_json(response, display_id) - - for video_data in season_data['list']: - entries.append(self.url_result( - _BASE_URL + '/player/' + video_data['slug'], 'AnimeLab', - str_or_none(video_data.get('id')), video_data.get('name') - )) - - return { - '_type': 'playlist', - 'id': show_id, - 'title': title, - 'description': description, - 'entries': entries, - } - -# TODO implement myqueue diff --git a/hypervideo_dl/extractor/animeondemand.py b/hypervideo_dl/extractor/animeondemand.py deleted file mode 100644 index 2e674d5..0000000 --- a/hypervideo_dl/extractor/animeondemand.py +++ /dev/null @@ -1,284 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - determine_ext, - extract_attributes, - ExtractorError, - join_nonempty, - url_or_none, - urlencode_postdata, - urljoin, -) - - -class AnimeOnDemandIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?anime-on-demand\.de/anime/(?P<id>\d+)' - _LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in' - _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' - _NETRC_MACHINE = 'animeondemand' - # German-speaking countries of Europe - _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU'] - _TESTS = [{ - # jap, OmU - 'url': 'https://www.anime-on-demand.de/anime/161', - 'info_dict': { - 'id': '161', - 'title': 'Grimgar, Ashes and Illusions (OmU)', - 'description': 'md5:6681ce3c07c7189d255ac6ab23812d31', - }, - 'playlist_mincount': 4, - }, { - # Film wording is used instead of Episode, ger/jap, Dub/OmU - 'url': 'https://www.anime-on-demand.de/anime/39', - 'only_matching': True, - }, { - # Episodes without titles, jap, OmU - 'url': 'https://www.anime-on-demand.de/anime/162', - 'only_matching': True, - }, { - # ger/jap, Dub/OmU, account required - 'url': 'https://www.anime-on-demand.de/anime/169', - 'only_matching': True, - }, { - # Full length film, non-series, ger/jap, Dub/OmU, account required - 'url': 'https://www.anime-on-demand.de/anime/185', - 'only_matching': True, - }, { - # Flash videos - 'url': 'https://www.anime-on-demand.de/anime/12', - 'only_matching': True, - }] - - def _perform_login(self, username, password): - login_page = self._download_webpage( - self._LOGIN_URL, None, 'Downloading login page') - - if '>Our licensing terms allow the distribution of animes only to German-speaking countries of Europe' in login_page: - self.raise_geo_restricted( - '%s is only available in German-speaking countries of Europe' % self.IE_NAME) - - login_form = self._form_hidden_inputs('new_user', login_page) - - login_form.update({ - 'user[login]': username, - 'user[password]': password, - }) - - post_url = self._search_regex( - r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page, - 'post url', default=self._LOGIN_URL, group='url') - - if not post_url.startswith('http'): - post_url = urljoin(self._LOGIN_URL, post_url) - - response = self._download_webpage( - post_url, None, 'Logging in', - data=urlencode_postdata(login_form), headers={ - 'Referer': self._LOGIN_URL, - }) - - if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')): - error = self._search_regex( - r'<p[^>]+\bclass=(["\'])(?:(?!\1).)*\balert\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</p>', - response, 'error', default=None, group='error') - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise 
ExtractorError('Unable to log in') - - def _real_extract(self, url): - anime_id = self._match_id(url) - - webpage = self._download_webpage(url, anime_id) - - if 'data-playlist=' not in webpage: - self._download_webpage( - self._APPLY_HTML5_URL, anime_id, - 'Activating HTML5 beta', 'Unable to apply HTML5 beta') - webpage = self._download_webpage(url, anime_id) - - csrf_token = self._html_search_meta( - 'csrf-token', webpage, 'csrf token', fatal=True) - - anime_title = self._html_search_regex( - r'(?s)<h1[^>]+itemprop="name"[^>]*>(.+?)</h1>', - webpage, 'anime name') - anime_description = self._html_search_regex( - r'(?s)<div[^>]+itemprop="description"[^>]*>(.+?)</div>', - webpage, 'anime description', default=None) - - def extract_info(html, video_id, num=None): - title, description = [None] * 2 - formats = [] - - for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html): - attributes = extract_attributes(input_) - title = attributes.get('data-dialog-header') - playlist_urls = [] - for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'): - playlist_url = attributes.get(playlist_key) - if isinstance(playlist_url, compat_str) and re.match( - r'/?[\da-zA-Z]+', playlist_url): - playlist_urls.append(attributes[playlist_key]) - if not playlist_urls: - continue - - lang = attributes.get('data-lang') - lang_note = attributes.get('value') - - for playlist_url in playlist_urls: - kind = self._search_regex( - r'videomaterialurl/\d+/([^/]+)/', - playlist_url, 'media kind', default=None) - format_id = join_nonempty(lang, kind) if lang or kind else str(num) - format_note = join_nonempty(kind, lang_note, delim=', ') - item_id_list = [] - if format_id: - item_id_list.append(format_id) - item_id_list.append('videomaterial') - playlist = self._download_json( - urljoin(url, playlist_url), video_id, - 'Downloading %s JSON' % ' '.join(item_id_list), - headers={ - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRF-Token': csrf_token, - 'Referer': url, - 'Accept': 'application/json, text/javascript, */*; q=0.01', - }, fatal=False) - if not playlist: - continue - stream_url = url_or_none(playlist.get('streamurl')) - if stream_url: - rtmp = re.search( - r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)', - stream_url) - if rtmp: - formats.append({ - 'url': rtmp.group('url'), - 'app': rtmp.group('app'), - 'play_path': rtmp.group('playpath'), - 'page_url': url, - 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf', - 'rtmp_real_time': True, - 'format_id': 'rtmp', - 'ext': 'flv', - }) - continue - start_video = playlist.get('startvideo', 0) - playlist = playlist.get('playlist') - if not playlist or not isinstance(playlist, list): - continue - playlist = playlist[start_video] - title = playlist.get('title') - if not title: - continue - description = playlist.get('description') - for source in playlist.get('sources', []): - file_ = source.get('file') - if not file_: - continue - ext = determine_ext(file_) - format_id = join_nonempty( - lang, kind, - 'hls' if ext == 'm3u8' else None, - 'dash' if source.get('type') == 'video/dash' or ext == 'mpd' else None) - if ext == 'm3u8': - file_formats = self._extract_m3u8_formats( - file_, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) - elif source.get('type') == 'video/dash' or ext == 'mpd': - continue - file_formats = self._extract_mpd_formats( - file_, video_id, mpd_id=format_id, fatal=False) - else: - continue - for f in 
file_formats: - f.update({ - 'language': lang, - 'format_note': format_note, - }) - formats.extend(file_formats) - - return { - 'title': title, - 'description': description, - 'formats': formats, - } - - def extract_entries(html, video_id, common_info, num=None): - info = extract_info(html, video_id, num) - - if info['formats']: - self._sort_formats(info['formats']) - f = common_info.copy() - f.update(info) - yield f - - # Extract teaser/trailer only when full episode is not available - if not info['formats']: - m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<', - html) - if m: - f = common_info.copy() - f.update({ - 'id': '%s-%s' % (f['id'], m.group('kind').lower()), - 'title': m.group('title'), - 'url': urljoin(url, m.group('href')), - }) - yield f - - def extract_episodes(html): - for num, episode_html in enumerate(re.findall( - r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1): - episodebox_title = self._search_regex( - (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', - r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), - episode_html, 'episodebox title', default=None, group='title') - if not episodebox_title: - continue - - episode_number = int(self._search_regex( - r'(?:Episode|Film)\s*(\d+)', - episodebox_title, 'episode number', default=num)) - episode_title = self._search_regex( - r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', - episodebox_title, 'episode title', default=None) - - video_id = 'episode-%d' % episode_number - - common_info = { - 'id': video_id, - 'series': anime_title, - 'episode': episode_title, - 'episode_number': episode_number, - } - - for e in extract_entries(episode_html, video_id, common_info): - yield e - - def extract_film(html, video_id): - common_info = { - 'id': anime_id, - 'title': anime_title, - 'description': anime_description, - } - for e in extract_entries(html, video_id, common_info): - yield e - - def entries(): - has_episodes = False - for e in extract_episodes(webpage): - has_episodes = True - yield e - - if not has_episodes: - for e in extract_film(webpage, anime_id): - yield e - - return self.playlist_result( - entries(), anime_id, anime_title, anime_description) diff --git a/hypervideo_dl/extractor/ant1newsgr.py b/hypervideo_dl/extractor/ant1newsgr.py index 1075b46..7b384b2 100644 --- a/hypervideo_dl/extractor/ant1newsgr.py +++ b/hypervideo_dl/extractor/ant1newsgr.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re import urllib.parse from .common import InfoExtractor @@ -10,7 +6,6 @@ from ..utils import ( ExtractorError, determine_ext, scale_thumbnails_to_max_format_width, - unescapeHTML, ) @@ -24,7 +19,6 @@ class Ant1NewsGrBaseIE(InfoExtractor): raise ExtractorError('no source found for %s' % video_id) formats, subs = (self._extract_m3u8_formats_and_subtitles(source, video_id, 'mp4') if determine_ext(source) == 'm3u8' else ([{'url': source}], {})) - self._sort_formats(formats) thumbnails = scale_thumbnails_to_max_format_width( formats, [{'url': info['thumb']}], r'(?<=/imgHandler/)\d+') return { @@ -94,7 +88,7 @@ class Ant1NewsGrArticleIE(Ant1NewsGrBaseIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) info = self._search_json_ld(webpage, video_id, expected_type='NewsArticle') - embed_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) + embed_urls = list(Ant1NewsGrEmbedIE._extract_embed_urls(url, webpage)) if not embed_urls: raise ExtractorError('no videos found for %s' 
% video_id, expected=True) return self.playlist_from_matches( @@ -107,6 +101,7 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): IE_DESC = 'ant1news.gr embedded videos' _BASE_PLAYER_URL_RE = r'(?:https?:)?//(?:[a-zA-Z0-9\-]+\.)?(?:antenna|ant1news)\.gr/templates/pages/player' _VALID_URL = rf'{_BASE_PLAYER_URL_RE}\?([^#]+&)?cid=(?P<id>[^#&]+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)'] _API_PATH = '/news/templates/data/jsonPlayer' _TESTS = [{ @@ -120,16 +115,6 @@ class Ant1NewsGrEmbedIE(Ant1NewsGrBaseIE): }, }] - @classmethod - def _extract_urls(cls, webpage): - _EMBED_URL_RE = rf'{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' - _EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{_EMBED_URL_RE})(?P=_q1)' - for mobj in re.finditer(_EMBED_RE, webpage): - url = unescapeHTML(mobj.group('url')) - if not cls.suitable(url): - continue - yield url - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/anvato.py b/hypervideo_dl/extractor/anvato.py index 686d453..79bfe41 100644 --- a/hypervideo_dl/extractor/anvato.py +++ b/hypervideo_dl/extractor/anvato.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import hashlib import json @@ -10,38 +7,68 @@ import time from .common import InfoExtractor from ..aes import aes_encrypt -from ..compat import compat_str from ..utils import ( bytes_to_intlist, determine_ext, - intlist_to_bytes, int_or_none, + intlist_to_bytes, join_nonempty, + smuggle_url, strip_jsonp, + traverse_obj, unescapeHTML, unsmuggle_url, ) -# This import causes a ModuleNotFoundError on some systems for unknown reason. -# See issues: -# https://github.com/hypervideo/hypervideo/issues/35 -# https://github.com/ytdl-org/youtube-dl/issues/27449 -# https://github.com/animelover1984/youtube-dl/issues/17 -try: - from .anvato_token_generator import NFLTokenGenerator -except ImportError: - NFLTokenGenerator = None - def md5_text(s): - if not isinstance(s, compat_str): - s = compat_str(s) - return hashlib.md5(s.encode('utf-8')).hexdigest() + return hashlib.md5(str(s).encode()).hexdigest() class AnvatoIE(InfoExtractor): _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' + _API_BASE_URL = 'https://tkx.mp.lura.live/rest/v2' + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' + _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' # from anvplayer.min.js + + _TESTS = [{ + # from https://www.nfl.com/videos/baker-mayfield-s-game-changing-plays-from-3-td-game-week-14 + 'url': 'anvato:GXvEgwyJeWem8KCYXfeoHWknwP48Mboj:899441', + 'md5': '921919dab3cd0b849ff3d624831ae3e2', + 'info_dict': { + 'id': '899441', + 'ext': 'mp4', + 'title': 'Baker Mayfield\'s game-changing plays from 3-TD game Week 14', + 'description': 'md5:85e05a3cc163f8c344340f220521136d', + 'upload_date': '20201215', + 'timestamp': 1608009755, + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'NFL', + 'tags': ['Baltimore Ravens at Cleveland Browns (2020-REG-14)', 'Baker Mayfield', 'Game Highlights', + 'Player Highlights', 'Cleveland Browns', 'league'], + 'duration': 157, + 'categories': ['Entertainment', 'Game', 'Highlights'], + }, + }, { + # from https://ktla.com/news/99-year-old-woman-learns-to-fly-in-torrance-checks-off-bucket-list-dream/ + 'url': 'anvato:X8POa4zpGZMmeiq0wqiO8IP5rMqQM9VN:8032455', + 'md5': '837718bcfb3a7778d022f857f7a9b19e', + 'info_dict': { + 'id': '8032455', + 'ext': 'mp4', + 'title': '99-year-old woman learns to fly plane in Torrance, 
checks off bucket list dream', + 'description': 'md5:0a12bab8159445e78f52a297a35c6609', + 'upload_date': '20220928', + 'timestamp': 1664408881, + 'thumbnail': r're:^https?://.*\.jpg', + 'uploader': 'LIN', + 'tags': ['video', 'news', '5live'], + 'duration': 155, + 'categories': ['News'], + }, + }] + # Copied from anvplayer.min.js _ANVACK_TABLE = { 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', @@ -214,86 +241,74 @@ class AnvatoIE(InfoExtractor): 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' } - _TOKEN_GENERATORS = { - 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': NFLTokenGenerator, + def _generate_nfl_token(self, anvack, mcp_id): + reroute = self._download_json( + 'https://api.nfl.com/v1/reroute', mcp_id, data=b'grant_type=client_credentials', + headers={'X-Domain-Id': 100}, note='Fetching token info') + token_type = reroute.get('token_type') or 'Bearer' + auth_token = f'{token_type} {reroute["access_token"]}' + response = self._download_json( + 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ + 'query': '''{ + viewer { + mediaToken(anvack: "%s", id: %s) { + token } + } +}''' % (anvack, mcp_id), + }).encode(), headers={ + 'Authorization': auth_token, + 'Content-Type': 'application/json', + }, note='Fetching NFL API token') + return traverse_obj(response, ('data', 'viewer', 'mediaToken', 'token')) - _API_KEY = '3hwbSuqqT690uxjNYBktSQpa5ZrpYYR0Iofx7NcJHyA' - - _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' - _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' - - _TESTS = [{ - # from https://www.boston25news.com/news/watch-humpback-whale-breaches-right-next-to-fishing-boat-near-nh/817484874 - 'url': 'anvato:8v9BEynrwx8EFLYpgfOWcG1qJqyXKlRM:4465496', - 'info_dict': { - 'id': '4465496', - 'ext': 'mp4', - 'title': 'VIDEO: Humpback whale breaches right next to NH boat', - 'description': 'VIDEO: Humpback whale breaches right next to NH boat. 
Footage courtesy: Zach Fahey.', - 'duration': 22, - 'timestamp': 1534855680, - 'upload_date': '20180821', - 'uploader': 'ANV', - }, - 'params': { - 'skip_download': True, - }, - }, { - # from https://sanfrancisco.cbslocal.com/2016/06/17/source-oakland-cop-on-leave-for-having-girlfriend-help-with-police-reports/ - 'url': 'anvato:DVzl9QRzox3ZZsP9bNu5Li3X7obQOnqP:3417601', - 'only_matching': True, - }] - - def __init__(self, *args, **kwargs): - super(AnvatoIE, self).__init__(*args, **kwargs) - self.__server_time = None + _TOKEN_GENERATORS = { + 'GXvEgwyJeWem8KCYXfeoHWknwP48Mboj': _generate_nfl_token, + } def _server_time(self, access_key, video_id): - if self.__server_time is not None: - return self.__server_time - - self.__server_time = int(self._download_json( - self._api_prefix(access_key) + 'server_time?anvack=' + access_key, video_id, - note='Fetching server time')['server_time']) + return int_or_none(traverse_obj(self._download_json( + f'{self._API_BASE_URL}/server_time', video_id, query={'anvack': access_key}, + note='Fetching server time', fatal=False), 'server_time')) or int(time.time()) - return self.__server_time - - def _api_prefix(self, access_key): - return 'https://tkx2-%s.anvato.net/rest/v2/' % ('prod' if 'prod' in access_key else 'stage') - - def _get_video_json(self, access_key, video_id): + def _get_video_json(self, access_key, video_id, extracted_token): # See et() in anvplayer.min.js, which is an alias of getVideoJSON() - video_data_url = self._api_prefix(access_key) + 'mcp/video/%s?anvack=%s' % (video_id, access_key) + video_data_url = f'{self._API_BASE_URL}/mcp/video/{video_id}?anvack={access_key}' server_time = self._server_time(access_key, video_id) - input_data = '%d~%s~%s' % (server_time, md5_text(video_data_url), md5_text(server_time)) + input_data = f'{server_time}~{md5_text(video_data_url)}~{md5_text(server_time)}' auth_secret = intlist_to_bytes(aes_encrypt( bytes_to_intlist(input_data[:64]), bytes_to_intlist(self._AUTH_KEY))) - - video_data_url += '&X-Anvato-Adst-Auth=' + base64.b64encode(auth_secret).decode('ascii') + query = { + 'X-Anvato-Adst-Auth': base64.b64encode(auth_secret).decode('ascii'), + 'rtyp': 'fp', + } anvrid = md5_text(time.time() * 1000 * random.random())[:30] api = { 'anvrid': anvrid, 'anvts': server_time, } - if self._TOKEN_GENERATORS.get(access_key) is not None: - api['anvstk2'] = self._TOKEN_GENERATORS[access_key].generate(self, access_key, video_id) + if extracted_token is not None: + api['anvstk2'] = extracted_token + elif self._TOKEN_GENERATORS.get(access_key) is not None: + api['anvstk2'] = self._TOKEN_GENERATORS[access_key](self, access_key, video_id) + elif self._ANVACK_TABLE.get(access_key) is not None: + api['anvstk'] = md5_text(f'{access_key}|{anvrid}|{server_time}|{self._ANVACK_TABLE[access_key]}') else: - api['anvstk'] = md5_text('%s|%s|%d|%s' % ( - access_key, anvrid, server_time, - self._ANVACK_TABLE.get(access_key, self._API_KEY))) + api['anvstk2'] = 'default' return self._download_json( - video_data_url, video_id, transform_source=strip_jsonp, - data=json.dumps({'api': api}).encode('utf-8')) + video_data_url, video_id, transform_source=strip_jsonp, query=query, + data=json.dumps({'api': api}, separators=(',', ':')).encode('utf-8')) - def _get_anvato_videos(self, access_key, video_id): - video_data = self._get_video_json(access_key, video_id) + def _get_anvato_videos(self, access_key, video_id, token): + video_data = self._get_video_json(access_key, video_id, token) formats = [] for published_url in 
video_data['published_urls']: - video_url = published_url['embed_url'] + video_url = published_url.get('embed_url') + if not video_url: + continue media_format = published_url.get('format') ext = determine_ext(video_url) @@ -308,15 +323,27 @@ class AnvatoIE(InfoExtractor): 'tbr': tbr or None, } - if media_format == 'm3u8' and tbr is not None: + vtt_subs, hls_subs = {}, {} + if media_format == 'vtt': + _, vtt_subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, m3u8_id='vtt', fatal=False) + continue + elif media_format == 'm3u8' and tbr is not None: a_format.update({ 'format_id': join_nonempty('hls', tbr), 'ext': 'mp4', }) elif media_format == 'm3u8-variant' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) + # For some videos the initial m3u8 URL returns JSON instead + manifest_json = self._download_json( + video_url, video_id, note='Downloading manifest JSON', errnote=False) + if manifest_json: + video_url = manifest_json.get('master_m3u8') + if not video_url: + continue + hls_fmts, hls_subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, ext='mp4', m3u8_id='hls', fatal=False) + formats.extend(hls_fmts) continue elif ext == 'mp3' or media_format == 'mp3': a_format['vcodec'] = 'none' @@ -327,8 +354,6 @@ class AnvatoIE(InfoExtractor): }) formats.append(a_format) - self._sort_formats(formats) - subtitles = {} for caption in video_data.get('captions', []): a_caption = { @@ -336,6 +361,7 @@ class AnvatoIE(InfoExtractor): 'ext': 'tt' if caption.get('format') == 'SMPTE-TT' else None } subtitles.setdefault(caption['language'], []).append(a_caption) + subtitles = self._merge_subtitles(subtitles, hls_subs, vtt_subs) return { 'id': video_id, @@ -352,30 +378,19 @@ class AnvatoIE(InfoExtractor): 'subtitles': subtitles, } - @staticmethod - def _extract_urls(ie, webpage, video_id): - entries = [] - for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): - anvplayer_data = ie._parse_json( - mobj.group('anvp'), video_id, transform_source=unescapeHTML, - fatal=False) - if not anvplayer_data: - continue - video = anvplayer_data.get('video') - if not isinstance(video, compat_str) or not video.isdigit(): - continue - access_key = anvplayer_data.get('accessKey') - if not access_key: - mcp = anvplayer_data.get('mcp') - if mcp: - access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( - mcp.lower()) + @classmethod + def _extract_from_webpage(cls, url, webpage): + for mobj in re.finditer(cls._ANVP_RE, webpage): + anvplayer_data = unescapeHTML(json.loads(mobj.group('anvp'))) or {} + video_id, access_key = anvplayer_data.get('video'), anvplayer_data.get('accessKey') if not access_key: + access_key = cls._MCP_TO_ACCESS_KEY_TABLE.get((anvplayer_data.get('mcp') or '').lower()) + if not (video_id or '').isdigit() or not access_key: continue - entries.append(ie.url_result( - 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), - video_id=video)) - return entries + url = f'anvato:{access_key}:{video_id}' + if anvplayer_data.get('token'): + url = smuggle_url(url, {'token': anvplayer_data['token']}) + yield cls.url_result(url, AnvatoIE, video_id) def _extract_anvato_videos(self, webpage, video_id): anvplayer_data = self._parse_json( @@ -383,7 +398,7 @@ class AnvatoIE(InfoExtractor): self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), video_id) return self._get_anvato_videos( - anvplayer_data['accessKey'], anvplayer_data['video']) + anvplayer_data['accessKey'], 
anvplayer_data['video'], 'default') # cbslocal token = 'default' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -391,9 +406,7 @@ class AnvatoIE(InfoExtractor): 'countries': smuggled_data.get('geo_countries'), }) - mobj = self._match_valid_url(url) - access_key, video_id = mobj.group('access_key_or_mcp', 'id') + access_key, video_id = self._match_valid_url(url).group('access_key_or_mcp', 'id') if access_key not in self._ANVACK_TABLE: - access_key = self._MCP_TO_ACCESS_KEY_TABLE.get( - access_key) or access_key - return self._get_anvato_videos(access_key, video_id) + access_key = self._MCP_TO_ACCESS_KEY_TABLE.get(access_key) or access_key + return self._get_anvato_videos(access_key, video_id, smuggled_data.get('token')) diff --git a/hypervideo_dl/extractor/anvato_token_generator/__init__.py b/hypervideo_dl/extractor/anvato_token_generator/__init__.py deleted file mode 100644 index 6e223db..0000000 --- a/hypervideo_dl/extractor/anvato_token_generator/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from __future__ import unicode_literals - -from .nfl import NFLTokenGenerator - -__all__ = [ - 'NFLTokenGenerator', -] diff --git a/hypervideo_dl/extractor/anvato_token_generator/common.py b/hypervideo_dl/extractor/anvato_token_generator/common.py deleted file mode 100644 index b959a90..0000000 --- a/hypervideo_dl/extractor/anvato_token_generator/common.py +++ /dev/null @@ -1,6 +0,0 @@ -from __future__ import unicode_literals - - -class TokenGenerator: - def generate(self, anvack, mcp_id): - raise NotImplementedError('This method must be implemented by subclasses') diff --git a/hypervideo_dl/extractor/anvato_token_generator/nfl.py b/hypervideo_dl/extractor/anvato_token_generator/nfl.py deleted file mode 100644 index 97a2b24..0000000 --- a/hypervideo_dl/extractor/anvato_token_generator/nfl.py +++ /dev/null @@ -1,30 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import TokenGenerator - - -class NFLTokenGenerator(TokenGenerator): - _AUTHORIZATION = None - - def generate(ie, anvack, mcp_id): - if not NFLTokenGenerator._AUTHORIZATION: - reroute = ie._download_json( - 'https://api.nfl.com/v1/reroute', mcp_id, - data=b'grant_type=client_credentials', - headers={'X-Domain-Id': 100}) - NFLTokenGenerator._AUTHORIZATION = '%s %s' % (reroute.get('token_type') or 'Bearer', reroute['access_token']) - return ie._download_json( - 'https://api.nfl.com/v3/shield/', mcp_id, data=json.dumps({ - 'query': '''{ - viewer { - mediaToken(anvack: "%s", id: %s) { - token - } - } -}''' % (anvack, mcp_id), - }).encode(), headers={ - 'Authorization': NFLTokenGenerator._AUTHORIZATION, - 'Content-Type': 'application/json', - })['data']['viewer']['mediaToken']['token'] diff --git a/hypervideo_dl/extractor/aol.py b/hypervideo_dl/extractor/aol.py index 4766a2c..6949ca9 100644 --- a/hypervideo_dl/extractor/aol.py +++ b/hypervideo_dl/extractor/aol.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .yahoo import YahooIE @@ -12,7 +9,7 @@ from ..utils import ( ) -class AolIE(YahooIE): +class AolIE(YahooIE): # XXX: Do not subclass from concrete IE IE_NAME = 'aol.com' _VALID_URL = r'(?:aol-video:|https?://(?:www\.)?aol\.(?:com|ca|co\.uk|de|jp)/video/(?:[^/]+/)*)(?P<id>\d{9}|[0-9a-f]{24}|[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' @@ -122,7 +119,6 @@ class AolIE(YahooIE): 'height': int_or_none(qs.get('h', [None])[0]), }) formats.append(f) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/apa.py 
b/hypervideo_dl/extractor/apa.py index 1736cdf..1ea0b1d 100644 --- a/hypervideo_dl/extractor/apa.py +++ b/hypervideo_dl/extractor/apa.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -13,6 +8,7 @@ from ..utils import ( class APAIE(InfoExtractor): _VALID_URL = r'(?P<base_url>https?://[^/]+\.apa\.at)/embed/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1'] _TESTS = [{ 'url': 'http://uvp.apa.at/embed/293f6d17-692a-44e3-9fd5-7b178f3a1029', 'md5': '2b12292faeb0a7d930c778c7a5b4759b', @@ -33,14 +29,6 @@ class APAIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//[^/]+\.apa\.at/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}.*?)\1', - webpage)] - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id, base_url = mobj.group('id', 'base_url') @@ -84,7 +72,6 @@ class APAIE(InfoExtractor): 'format_id': format_id, 'height': height, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/aparat.py b/hypervideo_dl/extractor/aparat.py index 1057233..4a989d8 100644 --- a/hypervideo_dl/extractor/aparat.py +++ b/hypervideo_dl/extractor/aparat.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( get_element_by_id, @@ -13,6 +10,7 @@ from ..utils import ( class AparatIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)' + _EMBED_REGEX = [r'<iframe .*?src="(?P<url>http://www\.aparat\.com/video/[^"]+)"'] _TESTS = [{ 'url': 'http://www.aparat.com/v/wP8On', @@ -75,7 +73,6 @@ class AparatIE(InfoExtractor): r'(\d+)[pP]', label or '', 'height', default=None)), }) - self._sort_formats(formats) info = self._search_json_ld(webpage, video_id, default={}) diff --git a/hypervideo_dl/extractor/appleconnect.py b/hypervideo_dl/extractor/appleconnect.py index 494f833..d00b0f9 100644 --- a/hypervideo_dl/extractor/appleconnect.py +++ b/hypervideo_dl/extractor/appleconnect.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( str_to_int, diff --git a/hypervideo_dl/extractor/applepodcasts.py b/hypervideo_dl/extractor/applepodcasts.py index 9139ff7..49bbeab 100644 --- a/hypervideo_dl/extractor/applepodcasts.py +++ b/hypervideo_dl/extractor/applepodcasts.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/appletrailers.py b/hypervideo_dl/extractor/appletrailers.py index 0abfb43..a5abb55 100644 --- a/hypervideo_dl/extractor/appletrailers.py +++ b/hypervideo_dl/extractor/appletrailers.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re import json @@ -122,7 +120,6 @@ class AppleTrailersIE(InfoExtractor): 'height': int_or_none(size_data.get('height')), 'language': version[:2], }) - self._sort_formats(formats) entries.append({ 'id': movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', clip_title).lower(), @@ -187,8 +184,6 @@ class AppleTrailersIE(InfoExtractor): 'height': 
int_or_none(format['height']), }) - self._sort_formats(formats) - playlist.append({ '_type': 'video', 'id': video_id, diff --git a/hypervideo_dl/extractor/archiveorg.py b/hypervideo_dl/extractor/archiveorg.py index 2ab3c1b..90dda9f 100644 --- a/hypervideo_dl/extractor/archiveorg.py +++ b/hypervideo_dl/extractor/archiveorg.py @@ -1,39 +1,35 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re import json +import re +import urllib.parse + from .common import InfoExtractor -from .youtube import YoutubeIE, YoutubeBaseInfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, - compat_HTTPError -) +from .youtube import YoutubeBaseInfoExtractor, YoutubeIE +from ..compat import compat_HTTPError, compat_urllib_parse_unquote from ..utils import ( + KNOWN_EXTENSIONS, + ExtractorError, + HEADRequest, bug_reports_message, clean_html, dict_get, extract_attributes, - ExtractorError, get_element_by_id, - HEADRequest, int_or_none, join_nonempty, - KNOWN_EXTENSIONS, + js_to_json, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_qs, - str_to_int, str_or_none, + str_to_int, traverse_obj, try_get, unified_strdate, unified_timestamp, + url_or_none, urlhandle_detect_ext, - url_or_none ) @@ -54,6 +50,11 @@ class ArchiveOrgIE(InfoExtractor): 'upload_date': '20100315', 'creator': 'SRI International', 'uploader': 'laura@archive.org', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', + 'release_year': 1968, + 'display_id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect.cdr', + 'track': 'XD300-23 68HighlightsAResearchCntAugHumanIntellect', + }, }, { 'url': 'https://archive.org/details/Cops1922', @@ -62,33 +63,43 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:43a603fd6c5b4b90d12a96b921212b9c', + 'description': 'md5:cd6f9910c35aedd5fc237dbc3957e2ca', 'uploader': 'yorkmba99@hotmail.com', 'timestamp': 1387699629, 'upload_date': '20131222', + 'display_id': 'Cops-v2.mp4', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', + 'duration': 1091.96, }, }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'only_matching': True, }, { 'url': 'https://archive.org/details/Election_Ads', - 'md5': '284180e857160cf866358700bab668a3', + 'md5': 'eec5cddebd4793c6a653b69c3b11f2e6', 'info_dict': { 'id': 'Election_Ads/Commercial-JFK1960ElectionAdCampaignJingle.mpg', 'title': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg', - 'ext': 'mp4', + 'ext': 'mpg', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', + 'duration': 59.77, + 'display_id': 'Commercial-JFK1960ElectionAdCampaignJingle.mpg', }, }, { 'url': 'https://archive.org/details/Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg', - 'md5': '7915213ef02559b5501fe630e1a53f59', + 'md5': 'ea1eed8234e7d4165f38c8c769edef38', 'info_dict': { 'id': 'Election_Ads/Commercial-Nixon1960ElectionAdToughonDefense.mpg', 'title': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg', - 'ext': 'mp4', + 'ext': 'mpg', 'timestamp': 1205588045, 'uploader': 'mikedavisstripmaster@yahoo.com', 'description': '1960 Presidential Campaign Election Commercials John F Kennedy, Richard M Nixon', 'upload_date': '20080315', + 'display_id': 'Commercial-Nixon1960ElectionAdToughonDefense.mpg', + 'duration': 59.51, + 'license': 'http://creativecommons.org/licenses/publicdomain/', + 'thumbnail': r're:https://archive\.org/download/.*\.jpg', }, }, { 'url': 
'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16', @@ -97,6 +108,12 @@ class ArchiveOrgIE(InfoExtractor): 'id': 'gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t01.flac', 'title': 'Turning', 'ext': 'flac', + 'track': 'Turning', + 'creator': 'Grateful Dead', + 'display_id': 'gd1977-05-08d01t01.flac', + 'track_number': 1, + 'album': '1977-05-08 - Barton Hall - Cornell University', + 'duration': 39.8, }, }, { 'url': 'https://archive.org/details/gd1977-05-08.shure57.stevenson.29303.flac16/gd1977-05-08d01t07.flac', @@ -107,11 +124,20 @@ class ArchiveOrgIE(InfoExtractor): 'ext': 'flac', 'timestamp': 1205895624, 'uploader': 'mvernon54@yahoo.com', - 'description': 'md5:6a31f1996db0aa0fc9da6d6e708a1bb0', + 'description': 'md5:6c921464414814720c6593810a5c7e3d', 'upload_date': '20080319', 'location': 'Barton Hall - Cornell University', + 'duration': 438.68, + 'track': 'Deal', + 'creator': 'Grateful Dead', + 'album': '1977-05-08 - Barton Hall - Cornell University', + 'release_date': '19770508', + 'display_id': 'gd1977-05-08d01t07.flac', + 'release_year': 1977, + 'track_number': 7, }, }, { + # FIXME: give a better error message than just IndexError when all available formats are restricted 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik', 'md5': '7cb019baa9b332e82ea7c10403acd180', 'info_dict': { @@ -119,6 +145,7 @@ class ArchiveOrgIE(InfoExtractor): 'title': 'Bells Of Rostov', 'ext': 'mp3', }, + 'skip': 'restricted' }, { 'url': 'https://archive.org/details/lp_the-music-of-russia_various-artists-a-askaryan-alexander-melik/disc1/02.02.+Song+And+Chorus+In+The+Polovetsian+Camp+From+%22Prince+Igor%22+(Act+2%2C+Scene+1).mp3', 'md5': '1d0aabe03edca83ca58d9ed3b493a3c3', @@ -131,6 +158,52 @@ class ArchiveOrgIE(InfoExtractor): 'description': 'md5:012b2d668ae753be36896f343d12a236', 'upload_date': '20190928', }, + 'skip': 'restricted' + }, { + # Original formats are private + 'url': 'https://archive.org/details/irelandthemakingofarepublic', + 'info_dict': { + 'id': 'irelandthemakingofarepublic', + 'title': 'Ireland: The Making of a Republic', + 'upload_date': '20160610', + 'description': 'md5:f70956a156645a658a0dc9513d9e78b7', + 'uploader': 'dimitrios@archive.org', + 'creator': ['British Broadcasting Corporation', 'Time-Life Films'], + 'timestamp': 1465594947, + }, + 'playlist': [ + { + 'md5': '0b211261b26590d49df968f71b90690d', + 'info_dict': { + 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_01.mov', + 'ext': 'mp4', + 'title': 'irelandthemakingofarepublicreel1_01.mov', + 'duration': 130.46, + 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_01_000117.jpg', + 'display_id': 'irelandthemakingofarepublicreel1_01.mov', + }, + }, { + 'md5': '67335ee3b23a0da930841981c1e79b02', + 'info_dict': { + 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel1_02.mov', + 'ext': 'mp4', + 'duration': 1395.13, + 'title': 'irelandthemakingofarepublicreel1_02.mov', + 'display_id': 'irelandthemakingofarepublicreel1_02.mov', + 'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel1_02_001374.jpg', + }, + }, { + 'md5': 'e470e86787893603f4a341a16c281eb5', + 'info_dict': { + 'id': 'irelandthemakingofarepublic/irelandthemakingofarepublicreel2.mov', + 'ext': 'mp4', + 'duration': 1602.67, + 'title': 'irelandthemakingofarepublicreel2.mov', + 
'thumbnail': 'https://archive.org/download/irelandthemakingofarepublic/irelandthemakingofarepublic.thumbs/irelandthemakingofarepublicreel2_001554.jpg', + 'display_id': 'irelandthemakingofarepublicreel2.mov', + }, + } + ] }] @staticmethod @@ -146,7 +219,7 @@ class ArchiveOrgIE(InfoExtractor): return json.loads(extract_attributes(element)['value']) def _real_extract(self, url): - video_id = compat_urllib_parse_unquote_plus(self._match_id(url)) + video_id = urllib.parse.unquote_plus(self._match_id(url)) identifier, entry_id = (video_id.split('/', 1) + [None])[:2] # Archive.org metadata API doesn't clearly demarcate playlist entries @@ -221,17 +294,25 @@ class ArchiveOrgIE(InfoExtractor): 'filesize': int_or_none(f.get('size'))}) extension = (f['name'].rsplit('.', 1) + [None])[1] - if extension in KNOWN_EXTENSIONS: + + # We don't want to skip private formats if the user has access to them, + # however without access to an account with such privileges we can't implement/test this. + # For now to be safe, we will only skip them if there is no user logged in. + is_logged_in = bool(self._get_cookies('https://archive.org').get('logged-in-sig')) + if extension in KNOWN_EXTENSIONS and (not f.get('private') or is_logged_in): entry['formats'].append({ 'url': 'https://archive.org/download/' + identifier + '/' + f['name'], 'format': f.get('format'), 'width': int_or_none(f.get('width')), 'height': int_or_none(f.get('height')), 'filesize': int_or_none(f.get('size')), - 'protocol': 'https'}) + 'protocol': 'https', + 'source_preference': 0 if f.get('source') == 'original' else -1, + 'format_note': f.get('source') + }) for entry in entries.values(): - self._sort_formats(entry['formats']) + entry['_format_sort_fields'] = ('source', ) if len(entries) == 1: # If there's only one item, use it as the main info dict @@ -287,7 +368,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'channel_id': 'UCukCyHaD-bK3in_pKpfH9Eg', 'duration': 32, 'uploader_id': 'Zeurel', - 'uploader_url': 'http://www.youtube.com/user/Zeurel' + 'uploader_url': 'https://www.youtube.com/user/Zeurel', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'channel_url': 'https://www.youtube.com/channel/UCukCyHaD-bK3in_pKpfH9Eg', } }, { # Internal link @@ -302,7 +385,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'channel_id': 'UCHnyfMqiRRG1u-2MsSQLbXA', 'duration': 771, 'uploader_id': '1veritasium', - 'uploader_url': 'http://www.youtube.com/user/1veritasium' + 'uploader_url': 'https://www.youtube.com/user/1veritasium', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'channel_url': 'https://www.youtube.com/channel/UCHnyfMqiRRG1u-2MsSQLbXA', } }, { # Video from 2012, webm format itag 45. Newest capture is deleted video, with an invalid description. @@ -316,7 +401,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 398, 'description': 'md5:ff4de6a7980cb65d951c2f6966a4f2f3', 'uploader_id': 'machinima', - 'uploader_url': 'http://www.youtube.com/user/machinima' + 'uploader_url': 'https://www.youtube.com/user/machinima', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'machinima' } }, { # FLV video. 
Video file URL does not provide itag information @@ -330,7 +417,10 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 19, 'description': 'md5:10436b12e07ac43ff8df65287a56efb4', 'uploader_id': 'jawed', - 'uploader_url': 'http://www.youtube.com/user/jawed' + 'uploader_url': 'https://www.youtube.com/user/jawed', + 'channel_url': 'https://www.youtube.com/channel/UC4QobU6STFB0P71PMvOGN5A', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'jawed', } }, { 'url': 'https://web.archive.org/web/20110712231407/http://www.youtube.com/watch?v=lTx3G6h2xyA', @@ -344,7 +434,9 @@ class YoutubeWebArchiveIE(InfoExtractor): 'duration': 204, 'description': 'md5:f7535343b6eda34a314eff8b85444680', 'uploader_id': 'itsmadeon', - 'uploader_url': 'http://www.youtube.com/user/itsmadeon' + 'uploader_url': 'https://www.youtube.com/user/itsmadeon', + 'channel_url': 'https://www.youtube.com/channel/UCqMDNf3Pn5L7pcNkuSEeO3w', + 'thumbnail': r're:https?://.*\.(jpg|webp)', } }, { # First capture is of dead video, second is the oldest from CDX response. @@ -355,10 +447,13 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'Fake Teen Doctor Strikes AGAIN! - Weekly Weird News', 'upload_date': '20160218', 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', - 'duration': 1236, + 'duration': 1235, 'description': 'md5:21032bae736421e89c2edf36d1936947', 'uploader_id': 'MachinimaETC', - 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + 'uploader_url': 'https://www.youtube.com/user/MachinimaETC', + 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'ETC News', } }, { # First capture of dead video, capture date in link links to dead capture. @@ -369,10 +464,13 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'WTF: Video Games Still Launch BROKEN?! 
- T.U.G.S.', 'upload_date': '20160219', 'channel_id': 'UCdIaNUarhzLSXGoItz7BHVA', - 'duration': 798, + 'duration': 797, 'description': 'md5:a1dbf12d9a3bd7cb4c5e33b27d77ffe7', 'uploader_id': 'MachinimaETC', - 'uploader_url': 'http://www.youtube.com/user/MachinimaETC' + 'uploader_url': 'https://www.youtube.com/user/MachinimaETC', + 'channel_url': 'https://www.youtube.com/channel/UCdIaNUarhzLSXGoItz7BHVA', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader': 'ETC News', }, 'expected_warnings': [ r'unable to download capture webpage \(it may not be archived\)' @@ -392,12 +490,11 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'It\'s Bootleg AirPods Time.', 'upload_date': '20211021', 'channel_id': 'UC7Jwj9fkrf1adN4fMmTkpug', - 'channel_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', + 'channel_url': 'https://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug', 'duration': 810, 'description': 'md5:7b567f898d8237b256f36c1a07d6d7bc', + 'thumbnail': r're:https?://.*\.(jpg|webp)', 'uploader': 'DankPods', - 'uploader_id': 'UC7Jwj9fkrf1adN4fMmTkpug', - 'uploader_url': 'http://www.youtube.com/channel/UC7Jwj9fkrf1adN4fMmTkpug' } }, { # player response contains '};' See: https://github.com/ytdl-org/youtube-dl/issues/27093 @@ -408,12 +505,135 @@ class YoutubeWebArchiveIE(InfoExtractor): 'title': 'bitch lasagna', 'upload_date': '20181005', 'channel_id': 'UC-lHJZR3Gqxm24_Vd_AJ5Yw', - 'channel_url': 'http://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', + 'channel_url': 'https://www.youtube.com/channel/UC-lHJZR3Gqxm24_Vd_AJ5Yw', 'duration': 135, 'description': 'md5:2dbe4051feeff2dab5f41f82bb6d11d0', 'uploader': 'PewDiePie', 'uploader_id': 'PewDiePie', - 'uploader_url': 'http://www.youtube.com/user/PewDiePie' + 'uploader_url': 'https://www.youtube.com/user/PewDiePie', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + } + }, { + # ~June 2010 Capture. swfconfig + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=8XeW5ilk-9Y', + 'info_dict': { + 'id': '8XeW5ilk-9Y', + 'ext': 'flv', + 'title': 'Story of Stuff, The Critique Part 4 of 4', + 'duration': 541, + 'description': 'md5:28157da06f2c5e94c97f7f3072509972', + 'uploader': 'HowTheWorldWorks', + 'uploader_id': 'HowTheWorldWorks', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks', + 'upload_date': '20090520', + } + }, { + # Jan 2011: watch-video-date/eow-date surrounded by whitespace + 'url': 'https://web.archive.org/web/20110126141719/http://www.youtube.com/watch?v=Q_yjX80U7Yc', + 'info_dict': { + 'id': 'Q_yjX80U7Yc', + 'ext': 'flv', + 'title': 'Spray Paint Art by Clay Butler: Purple Fantasy Forest', + 'uploader_id': 'claybutlermusic', + 'description': 'md5:4595264559e3d0a0ceb3f011f6334543', + 'upload_date': '20090803', + 'uploader': 'claybutlermusic', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'duration': 132, + 'uploader_url': 'https://www.youtube.com/user/claybutlermusic', + } + }, { + # ~May 2009 swfArgs. 
ytcfg is spread out over various vars + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=c5uJgG05xUY', + 'info_dict': { + 'id': 'c5uJgG05xUY', + 'ext': 'webm', + 'title': 'Story of Stuff, The Critique Part 1 of 4', + 'uploader_id': 'HowTheWorldWorks', + 'uploader': 'HowTheWorldWorks', + 'uploader_url': 'https://www.youtube.com/user/HowTheWorldWorks', + 'upload_date': '20090513', + 'description': 'md5:4ca77d79538064e41e4cc464e93f44f0', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'duration': 754, + } + }, { + # ~June 2012. Upload date is in another lang so cannot extract. + 'url': 'https://web.archive.org/web/20120607174520/http://www.youtube.com/watch?v=xWTLLl-dQaA', + 'info_dict': { + 'id': 'xWTLLl-dQaA', + 'ext': 'mp4', + 'title': 'Black Nerd eHarmony Video Bio Parody (SPOOF)', + 'uploader_url': 'https://www.youtube.com/user/BlackNerdComedy', + 'description': 'md5:e25f0133aaf9e6793fb81c18021d193e', + 'uploader_id': 'BlackNerdComedy', + 'uploader': 'BlackNerdComedy', + 'duration': 182, + 'thumbnail': r're:https?://.*\.(jpg|webp)', + } + }, { + # ~July 2013 + 'url': 'https://web.archive.org/web/*/https://www.youtube.com/watch?v=9eO1aasHyTM', + 'info_dict': { + 'id': '9eO1aasHyTM', + 'ext': 'mp4', + 'title': 'Polar-oid', + 'description': 'Cameras and bears are dangerous!', + 'uploader_url': 'https://www.youtube.com/user/punkybird', + 'uploader_id': 'punkybird', + 'duration': 202, + 'channel_id': 'UC62R2cBezNBOqxSerfb1nMQ', + 'channel_url': 'https://www.youtube.com/channel/UC62R2cBezNBOqxSerfb1nMQ', + 'upload_date': '20060428', + 'uploader': 'punkybird', + } + }, { + # April 2020: Player response in player config + 'url': 'https://web.archive.org/web/20200416034815/https://www.youtube.com/watch?v=Cf7vS8jc7dY&gl=US&hl=en', + 'info_dict': { + 'id': 'Cf7vS8jc7dY', + 'ext': 'mp4', + 'title': 'A Dramatic Pool Story (by Jamie Spicer-Lewis) - Game Grumps Animated', + 'duration': 64, + 'upload_date': '20200408', + 'uploader_id': 'GameGrumps', + 'uploader': 'GameGrumps', + 'channel_url': 'https://www.youtube.com/channel/UC9CuvdOVfMPvKCiwdGKL3cQ', + 'channel_id': 'UC9CuvdOVfMPvKCiwdGKL3cQ', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'description': 'md5:c625bb3c02c4f5fb4205971e468fa341', + 'uploader_url': 'https://www.youtube.com/user/GameGrumps', + } + }, { + # watch7-user-header with yt-user-info + 'url': 'ytarchive:kbh4T_b4Ixw:20160307085057', + 'info_dict': { + 'id': 'kbh4T_b4Ixw', + 'ext': 'mp4', + 'title': 'Shovel Knight OST - Strike the Earth! 
Plains of Passage 16 bit SNES style remake / remix', + 'channel_url': 'https://www.youtube.com/channel/UCnTaGvsHmMy792DWeT6HbGA', + 'uploader': 'Nelward music', + 'duration': 213, + 'description': 'md5:804b4a9ce37b050a5fefdbb23aeba54d', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'upload_date': '20150503', + 'channel_id': 'UCnTaGvsHmMy792DWeT6HbGA', + } + }, { + # April 2012 + 'url': 'https://web.archive.org/web/0/https://www.youtube.com/watch?v=SOm7mPoPskU', + 'info_dict': { + 'id': 'SOm7mPoPskU', + 'ext': 'mp4', + 'title': 'Boyfriend - Justin Bieber Parody', + 'uploader_url': 'https://www.youtube.com/user/thecomputernerd01', + 'uploader': 'thecomputernerd01', + 'thumbnail': r're:https?://.*\.(jpg|webp)', + 'description': 'md5:dd7fa635519c2a5b4d566beaecad7491', + 'duration': 200, + 'upload_date': '20120407', + 'uploader_id': 'thecomputernerd01', } }, { 'url': 'https://web.archive.org/web/http://www.youtube.com/watch?v=kH-G_aIBlFw', @@ -445,9 +665,11 @@ class YoutubeWebArchiveIE(InfoExtractor): 'only_matching': True }, ] - _YT_INITIAL_DATA_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE - _YT_INITIAL_PLAYER_RESPONSE_RE = r'(?:(?:(?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*({.+?})[)\s]*;)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE - _YT_INITIAL_BOUNDARY_RE = r'(?:(?:var\s+meta|</script|\n)|%s)' % YoutubeBaseInfoExtractor._YT_INITIAL_BOUNDARY_RE + _YT_INITIAL_DATA_RE = YoutubeBaseInfoExtractor._YT_INITIAL_DATA_RE + _YT_INITIAL_PLAYER_RESPONSE_RE = fr'''(?x: + (?:window\s*\[\s*["\']ytInitialPlayerResponse["\']\s*\]|ytInitialPlayerResponse)\s*=[(\s]*| + {YoutubeBaseInfoExtractor._YT_INITIAL_PLAYER_RESPONSE_RE} + )''' _YT_DEFAULT_THUMB_SERVERS = ['i.ytimg.com'] # thumbnails most likely archived on these servers _YT_ALL_THUMB_SERVERS = orderedSet( @@ -477,11 +699,6 @@ class YoutubeWebArchiveIE(InfoExtractor): elif not isinstance(res, list) or len(res) != 0: self.report_warning('Error while parsing CDX API response' + bug_reports_message()) - def _extract_yt_initial_variable(self, webpage, regex, video_id, name): - return self._parse_json(self._search_regex( - (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), - regex), webpage, name, default='{}'), video_id, fatal=False) - def _extract_webpage_title(self, webpage): page_title = self._html_extract_title(webpage, default='') # YouTube video pages appear to always have either 'YouTube -' as prefix or '- YouTube' as suffix. 
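Given that observation, both historical title layouts can be normalized with one anchored alternation. A sketch of such a cleanup (this regex is an illustration, not necessarily the exact one the extractor uses):

    import re

    def clean_webpage_title(page_title):
        # Strip a leading "YouTube - " or a trailing " - YouTube";
        # titles matching neither layout pass through unchanged.
        m = re.match(r'(?:YouTube\s*-\s*(?P<a>.+)|(?P<b>.+?)\s*-\s*YouTube)$', page_title)
        return (m.group('a') or m.group('b')) if m else page_title

    assert clean_webpage_title('YouTube - Me at the zoo') == 'Me at the zoo'
    assert clean_webpage_title('Me at the zoo - YouTube') == 'Me at the zoo'
    assert clean_webpage_title('Unrelated page title') == 'Unrelated page title'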
@@ -491,10 +708,32 @@ class YoutubeWebArchiveIE(InfoExtractor): def _extract_metadata(self, video_id, webpage): search_meta = ((lambda x: self._html_search_meta(x, webpage, default=None)) if webpage else (lambda x: None)) - player_response = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, video_id, 'initial player response') or {} - initial_data = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_DATA_RE, video_id, 'initial player response') or {} + player_response = self._search_json( + self._YT_INITIAL_PLAYER_RESPONSE_RE, webpage, 'initial player response', + video_id, default={}) + initial_data = self._search_json( + self._YT_INITIAL_DATA_RE, webpage, 'initial data', video_id, default={}) + + ytcfg = {} + for j in re.findall(r'yt\.setConfig\(\s*(?P<json>{\s*(?s:.+?)\s*})\s*\);', webpage): # ~June 2010 + ytcfg.update(self._parse_json(j, video_id, fatal=False, ignore_extra=True, transform_source=js_to_json, errnote='') or {}) + + # XXX: this also may contain a 'ptchn' key + player_config = ( + self._search_json( + r'(?:yt\.playerConfig|ytplayer\.config|swfConfig)\s*=', + webpage, 'player config', video_id, default=None) + or ytcfg.get('PLAYER_CONFIG') or {}) + + # XXX: this may also contain a 'creator' key. + swf_args = self._search_json(r'swfArgs\s*=', webpage, 'swf config', video_id, default={}) + if swf_args and not traverse_obj(player_config, ('args',)): + player_config['args'] = swf_args + + if not player_response: + # April 2020 + player_response = self._parse_json( + traverse_obj(player_config, ('args', 'player_response')) or '{}', video_id, fatal=False) initial_data_video = traverse_obj( initial_data, ('contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'videoPrimaryInfoRenderer'), @@ -510,21 +749,64 @@ class YoutubeWebArchiveIE(InfoExtractor): video_details.get('title') or YoutubeBaseInfoExtractor._get_text(microformats, 'title') or YoutubeBaseInfoExtractor._get_text(initial_data_video, 'title') + or traverse_obj(player_config, ('args', 'title')) or self._extract_webpage_title(webpage) or search_meta(['og:title', 'twitter:title', 'title'])) + def id_from_url(url, type_): + return self._search_regex( + rf'(?:{type_})/([^/#&?]+)', url or '', f'{type_} id', default=None) + + # XXX: would the get_elements_by_... functions be better suited here? + _CHANNEL_URL_HREF_RE = r'href="[^"]*(?P<url>https?://www\.youtube\.com/(?:user|channel)/[^"]+)"' + uploader_or_channel_url = self._search_regex( + [fr'<(?:link\s*itemprop=\"url\"|a\s*id=\"watch-username\").*?\b{_CHANNEL_URL_HREF_RE}>', # @fd05024 + fr'<div\s*id=\"(?:watch-channel-stats|watch-headline-user-info)\"[^>]*>\s*<a[^>]*\b{_CHANNEL_URL_HREF_RE}'], # ~ May 2009, ~June 2012 + webpage, 'uploader or channel url', default=None) + + owner_profile_url = url_or_none(microformats.get('ownerProfileUrl')) # @a6211d2 + + # Uploader refers to the /user/ id ONLY + uploader_id = ( + id_from_url(owner_profile_url, 'user') + or id_from_url(uploader_or_channel_url, 'user') + or ytcfg.get('VIDEO_USERNAME')) + uploader_url = f'https://www.youtube.com/user/{uploader_id}' if uploader_id else None + + # XXX: do we want to differentiate uploader and channel? 
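The uploader lookup that follows is a fallback chain: each alternative targets one page era, and the first non-empty result wins. The same chaining drives the user/channel id extraction via id_from_url; a standalone sketch of that idea, with hypothetical inputs (the extractor plumbing is omitted):

    import re

    def id_from_url(url, type_):
        # Mirror of the id_from_url helper above, without self._search_regex:
        # grab the path segment after user/ or channel/, else None.
        m = re.search(rf'(?:{type_})/([^/#&?]+)', url or '')
        return m.group(1) if m else None

    # Hypothetical inputs in fallback order: microformat profile URL,
    # then a link scraped from the page, then a config value.
    owner_profile_url = None
    scraped_link = 'https://www.youtube.com/user/Zeurel'
    ytcfg = {'VIDEO_USERNAME': 'config-fallback'}

    uploader_id = (
        id_from_url(owner_profile_url, 'user')
        or id_from_url(scraped_link, 'user')
        or ytcfg.get('VIDEO_USERNAME'))
    assert uploader_id == 'Zeurel'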
+ uploader = ( + self._search_regex( + [r'<a\s*id="watch-username"[^>]*>\s*<strong>([^<]+)</strong>', # June 2010 + r'var\s*watchUsername\s*=\s*\'(.+?)\';', # ~May 2009 + r'<div\s*\bid=\"watch-channel-stats"[^>]*>\s*<a[^>]*>\s*(.+?)\s*</a', # ~May 2009 + r'<a\s*id="watch-userbanner"[^>]*title="\s*(.+?)\s*"'], # ~June 2012 + webpage, 'uploader', default=None) + or self._html_search_regex( + [r'(?s)<div\s*class="yt-user-info".*?<a[^>]*[^>]*>\s*(.*?)\s*</a', # March 2016 + r'(?s)<a[^>]*yt-user-name[^>]*>\s*(.*?)\s*</a'], # july 2013 + get_element_by_id('watch7-user-header', webpage), 'uploader', default=None) + or self._html_search_regex( + r'<button\s*href="/user/[^>]*>\s*<span[^>]*>\s*(.+?)\s*<', # April 2012 + get_element_by_id('watch-headline-user-info', webpage), 'uploader', default=None) + or traverse_obj(player_config, ('args', 'creator')) + or video_details.get('author')) + channel_id = str_or_none( video_details.get('channelId') or microformats.get('externalChannelId') or search_meta('channelId') or self._search_regex( r'data-channel-external-id=(["\'])(?P<id>(?:(?!\1).)+)\1', # @b45a9e6 - webpage, 'channel id', default=None, group='id')) - channel_url = f'http://www.youtube.com/channel/{channel_id}' if channel_id else None + webpage, 'channel id', default=None, group='id') + or id_from_url(owner_profile_url, 'channel') + or id_from_url(uploader_or_channel_url, 'channel') + or traverse_obj(player_config, ('args', 'ucid'))) + channel_url = f'https://www.youtube.com/channel/{channel_id}' if channel_id else None duration = int_or_none( video_details.get('lengthSeconds') or microformats.get('lengthSeconds') + or traverse_obj(player_config, ('args', ('length_seconds', 'l')), get_all=False) or parse_duration(search_meta('duration'))) description = ( video_details.get('shortDescription') @@ -532,26 +814,13 @@ class YoutubeWebArchiveIE(InfoExtractor): or clean_html(get_element_by_id('eow-description', webpage)) # @9e6dd23 or search_meta(['description', 'og:description', 'twitter:description'])) - uploader = video_details.get('author') - - # Uploader ID and URL - uploader_mobj = re.search( - r'<link itemprop="url" href="(?P<uploader_url>https?://www\.youtube\.com/(?:user|channel)/(?P<uploader_id>[^"]+))">', # @fd05024 - webpage) - if uploader_mobj is not None: - uploader_id, uploader_url = uploader_mobj.group('uploader_id'), uploader_mobj.group('uploader_url') - else: - # @a6211d2 - uploader_url = url_or_none(microformats.get('ownerProfileUrl')) - uploader_id = self._search_regex( - r'(?:user|channel)/([^/]+)', uploader_url or '', 'uploader id', default=None) - upload_date = unified_strdate( dict_get(microformats, ('uploadDate', 'publishDate')) or search_meta(['uploadDate', 'datePublished']) or self._search_regex( - [r'(?s)id="eow-date.*?>(.*?)</span>', - r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'], # @7998520 + [r'(?s)id="eow-date.*?>\s*(.*?)\s*</span>', + r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']', # @7998520 + r'class\s*=\s*"(?:watch-video-date|watch-video-added post-date)"[^>]*>\s*([^<]+?)\s*<'], # ~June 2010, ~Jan 2009 (respectively) webpage, 'upload date', default=None)) return { @@ -597,7 +866,7 @@ class YoutubeWebArchiveIE(InfoExtractor): response = self._call_cdx_api( video_id, f'https://www.youtube.com/watch?v={video_id}', filters=['mimetype:text/html'], collapse=['timestamp:6', 'digest'], query={'matchType': 
'prefix'}) or [] - all_captures = sorted([int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None]) + all_captures = sorted(int_or_none(r['timestamp']) for r in response if int_or_none(r['timestamp']) is not None) # Prefer the new polymer UI captures as we support extracting more metadata from them # WBM captures seem to all switch to this layout ~July 2020 @@ -620,18 +889,22 @@ class YoutubeWebArchiveIE(InfoExtractor): url_date = url_date or url_date_2 urlh = None - try: - urlh = self._request_webpage( - HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), - video_id, note='Fetching archived video file url', expected_status=True) - except ExtractorError as e: - # HTTP Error 404 is expected if the video is not saved. - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: - self.raise_no_formats( - 'The requested video is not archived, indexed, or there is an issue with web.archive.org', - expected=True) - else: - raise + retry_manager = self.RetryManager(fatal=False) + for retry in retry_manager: + try: + urlh = self._request_webpage( + HEADRequest('https://web.archive.org/web/2oe_/http://wayback-fakeurl.archive.org/yt/%s' % video_id), + video_id, note='Fetching archived video file url', expected_status=True) + except ExtractorError as e: + # HTTP Error 404 is expected if the video is not saved. + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: + self.raise_no_formats( + 'The requested video is not archived, indexed, or there is an issue with web.archive.org (try again later)', expected=True) + else: + retry.error = e + + if retry_manager.error: + self.raise_no_formats(retry_manager.error, expected=True, video_id=video_id) capture_dates = self._get_capture_dates(video_id, int_or_none(url_date)) self.write_debug('Captures to try: ' + join_nonempty(*capture_dates, delim=', ')) diff --git a/hypervideo_dl/extractor/arcpublishing.py b/hypervideo_dl/extractor/arcpublishing.py index 8880e5c..febd3d2 100644 --- a/hypervideo_dl/extractor/arcpublishing.py +++ b/hypervideo_dl/extractor/arcpublishing.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -73,8 +70,8 @@ class ArcPublishingIE(InfoExtractor): ], 'video-api-cdn.%s.arcpublishing.com/api'), ] - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): entries = [] # https://arcpublishing.atlassian.net/wiki/spaces/POWA/overview for powa_el in re.findall(r'(<div[^>]+class="[^"]*\bpowa\b[^"]*"[^>]+data-uuid="%s"[^>]*>)' % ArcPublishingIE._UUID_REGEX, webpage): @@ -147,7 +144,6 @@ class ArcPublishingIE(InfoExtractor): 'url': s_url, 'quality': -10, }) - self._sort_formats(formats) subtitles = {} for subtitle in (try_get(video, lambda x: x['subtitles']['urls'], list) or []): diff --git a/hypervideo_dl/extractor/ard.py b/hypervideo_dl/extractor/ard.py index 7ea339b..0a8a874 100644 --- a/hypervideo_dl/extractor/ard.py +++ b/hypervideo_dl/extractor/ard.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -43,8 +40,6 @@ class ARDMediathekBaseIE(InfoExtractor): 'This video is not available due to geoblocking', countries=self._GEO_COUNTRIES, metadata_available=True) - self._sort_formats(formats) - subtitles = {} subtitle_url = media_info.get('_subtitleUrl') if subtitle_url: @@ -265,7 +260,6 @@ class ARDMediathekIE(ARDMediathekBaseIE): 'format_id': fid, 'url': furl, }) - 
self._sort_formats(formats) info = { 'formats': formats, } @@ -374,7 +368,6 @@ class ARDIE(InfoExtractor): continue f['url'] = format_url formats.append(f) - self._sort_formats(formats) _SUB_FORMATS = ( ('./dataTimedText', 'ttml'), diff --git a/hypervideo_dl/extractor/arkena.py b/hypervideo_dl/extractor/arkena.py index 4f4f457..de36ec8 100644 --- a/hypervideo_dl/extractor/arkena.py +++ b/hypervideo_dl/extractor/arkena.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -22,6 +17,8 @@ class ArkenaIE(InfoExtractor): play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+) ) ''' + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1'] _TESTS = [{ 'url': 'https://video.qbrick.com/play2/embed/player?accountId=1034090&mediaId=d8ab4607-00090107-aab86310', 'md5': '97f117754e5f3c020f5f26da4a44ebaf', @@ -53,15 +50,6 @@ class ArkenaIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -148,7 +136,6 @@ class ArkenaIE(InfoExtractor): elif mime_type == 'application/vnd.ms-sstr+xml': formats.extend(self._extract_ism_formats( href, video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/arnes.py b/hypervideo_dl/extractor/arnes.py index 050c252..a493714 100644 --- a/hypervideo_dl/extractor/arnes.py +++ b/hypervideo_dl/extractor/arnes.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_parse_qs, @@ -76,7 +73,6 @@ class ArnesIE(InfoExtractor): 'width': int_or_none(media.get('width')), 'height': int_or_none(media.get('height')), }) - self._sort_formats(formats) channel = video.get('channel') or {} channel_id = channel.get('url') @@ -93,7 +89,7 @@ class ArnesIE(InfoExtractor): 'timestamp': parse_iso8601(video.get('creationTime')), 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': format_field(channel_id, template=f'{self._BASE_URL}/?channel=%s'), + 'channel_url': format_field(channel_id, None, f'{self._BASE_URL}/?channel=%s'), 'duration': float_or_none(video.get('duration'), 1000), 'view_count': int_or_none(video.get('views')), 'tags': video.get('hashtags'), diff --git a/hypervideo_dl/extractor/arte.py b/hypervideo_dl/extractor/arte.py index c2f2c1b..54e4d2d 100644 --- a/hypervideo_dl/extractor/arte.py +++ b/hypervideo_dl/extractor/arte.py @@ -1,193 +1,216 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..compat import ( - compat_str, -) from ..utils import ( ExtractorError, + GeoRestrictedError, int_or_none, + parse_iso8601, parse_qs, - qualities, strip_or_none, - try_get, - unified_strdate, + traverse_obj, url_or_none, ) class ArteTVBaseIE(InfoExtractor): _ARTE_LANGUAGES = 'fr|de|en|es|it|pl' - _API_BASE = 'https://api.arte.tv/api/player/v1' + _API_BASE = 'https://api.arte.tv/api/player/v2' class ArteTVIE(ArteTVBaseIE): _VALID_URL = 
r'''(?x) - https?:// + (?:https?:// (?: (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos| api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s) ) - /(?P<id>\d{6}-\d{3}-[AF]) + |arte://program) + /(?P<id>\d{6}-\d{3}-[AF]|LIVE) ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES} _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/', + 'only_matching': True, + }, { + 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', 'info_dict': { - 'id': '088501-000-A', + 'id': '100103-000-A', + 'title': 'USA: Dyskryminacja na porodówce', + 'description': 'md5:242017b7cce59ffae340a54baefcafb1', + 'alt_title': 'ARTE Reportage', + 'upload_date': '20201103', + 'duration': 554, + 'thumbnail': r're:https://api-cdn\.arte\.tv/.+940x530', + 'timestamp': 1604417980, 'ext': 'mp4', - 'title': 'Mexico: Stealing Petrol to Survive', - 'upload_date': '20190628', }, + 'params': {'skip_download': 'm3u8'} }, { - 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/', - 'only_matching': True, + 'note': 'No alt_title', + 'url': 'https://www.arte.tv/fr/videos/110371-000-A/la-chaleur-supplice-des-arbres-de-rue/', + 'info_dict': { + 'id': '110371-000-A', + 'ext': 'mp4', + 'upload_date': '20220718', + 'duration': 154, + 'timestamp': 1658162460, + 'description': 'md5:5890f36fe7dccfadb8b7c0891de54786', + 'title': 'La chaleur, supplice des arbres de rue', + 'thumbnail': 'https://api-cdn.arte.tv/img/v2/image/CPE2sQDtD8GLQgt8DuYHLf/940x530', + }, + 'params': {'skip_download': 'm3u8'} }, { 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, + }, { + 'url': 'https://api.arte.tv/api/player/v2/config/de/LIVE', + 'only_matching': True, }] + _GEO_BYPASS = True + + _LANG_MAP = { # ISO639 -> French abbreviations + 'fr': 'F', + 'de': 'A', + 'en': 'E[ANG]', + 'es': 'E[ESP]', + 'it': 'E[ITA]', + 'pl': 'E[POL]', + # XXX: probably means mixed; <https://www.arte.tv/en/videos/107710-029-A/dispatches-from-ukraine-local-journalists-report/> + # uses this code for audio that happens to be in Ukrainian, but the manifest uses the ISO code 'mul' (mixed) + 'mul': 'EU', + } + + _VERSION_CODE_RE = re.compile(r'''(?x) + V + (?P<original_voice>O?) + (?P<vlang>[FA]|E\[[A-Z]+\]|EU)? + (?P<audio_desc>AUD|) + (?: + (?P<has_sub>-ST) + (?P<sdh_sub>M?) + (?P<sub_lang>[FA]|E\[[A-Z]+\]|EU) + )? 
+ ''') + + # all obtained by exhaustive testing + _COUNTRIES_MAP = { + 'DE_FR': ( + 'BL', 'DE', 'FR', 'GF', 'GP', 'MF', 'MQ', 'NC', + 'PF', 'PM', 'RE', 'WF', 'YT', + ), + # with both of the below 'BE' sometimes works, sometimes doesn't + 'EUR_DE_FR': ( + 'AT', 'BL', 'CH', 'DE', 'FR', 'GF', 'GP', 'LI', + 'MC', 'MF', 'MQ', 'NC', 'PF', 'PM', 'RE', 'WF', + 'YT', + ), + 'SAT': ( + 'AD', 'AT', 'AX', 'BG', 'BL', 'CH', 'CY', 'CZ', + 'DE', 'DK', 'EE', 'ES', 'FI', 'FR', 'GB', 'GF', + 'GR', 'HR', 'HU', 'IE', 'IS', 'IT', 'KN', 'LI', + 'LT', 'LU', 'LV', 'MC', 'MF', 'MQ', 'MT', 'NC', + 'NL', 'NO', 'PF', 'PL', 'PM', 'PT', 'RE', 'RO', + 'SE', 'SI', 'SK', 'SM', 'VA', 'WF', 'YT', + ), + } + def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') lang = mobj.group('lang') or mobj.group('lang_2') - - info = self._download_json( - '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id) - player_info = info['videoJsonPlayer'] - - vsr = try_get(player_info, lambda x: x['VSR'], dict) - if not vsr: - error = None - if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error': - error = try_get( - player_info, lambda x: x['custom_msg']['msg'], compat_str) - if not error: - error = 'Video %s is not available' % player_info.get('VID') or video_id - raise ExtractorError(error, expected=True) - - upload_date_str = player_info.get('shootingDate') - if not upload_date_str: - upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] - - title = (player_info.get('VTI') or player_info['VID']).strip() - subtitle = player_info.get('VSU', '').strip() - if subtitle: - title += ' - %s' % subtitle - - qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ']) - - LANGS = { - 'fr': 'F', - 'de': 'A', - 'en': 'E[ANG]', - 'es': 'E[ESP]', - 'it': 'E[ITA]', - 'pl': 'E[POL]', - } - - langcode = LANGS.get(lang, lang) - - formats = [] - for format_id, format_dict in vsr.items(): - f = dict(format_dict) - format_url = url_or_none(f.get('url')) - streamer = f.get('streamer') - if not format_url and not streamer: - continue - versionCode = f.get('versionCode') - l = re.escape(langcode) - - # Language preference from most to least priority - # Reference: section 6.8 of - # https://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-07-1.pdf - PREFERENCES = ( - # original version in requested language, without subtitles - r'VO{0}$'.format(l), - # original version in requested language, with partial subtitles in requested language - r'VO{0}-ST{0}$'.format(l), - # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language - r'VO{0}-STM{0}$'.format(l), - # non-original (dubbed) version in requested language, without subtitles - r'V{0}$'.format(l), - # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language - r'V{0}-ST{0}$'.format(l), - # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language - r'V{0}-STM{0}$'.format(l), - # original version in requested language, with partial subtitles in different language - r'VO{0}-ST(?!{0}).+?$'.format(l), - # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language - r'VO{0}-STM(?!{0}).+?$'.format(l), - # original version in different language, with partial subtitles in requested language - r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l), - # original version in different language, with subtitles for the 
deaf and hard-of-hearing in requested language - r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l), - # original version in different language, without subtitles - r'VO(?:(?!{0}))?$'.format(l), - # original version in different language, with partial subtitles in different language - r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l), - # original version in different language, with subtitles for the deaf and hard-of-hearing in different language - r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l), - ) - - for pref, p in enumerate(PREFERENCES): - if re.match(p, versionCode): - lang_pref = len(PREFERENCES) - pref - break - else: - lang_pref = -1 - format_note = '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')) - - media_type = f.get('mediaType') - if media_type == 'hls': - m3u8_formats = self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False) - for m3u8_format in m3u8_formats: - m3u8_format.update({ + langauge_code = self._LANG_MAP.get(lang) + + config = self._download_json(f'{self._API_BASE}/config/{lang}/{video_id}', video_id) + + geoblocking = traverse_obj(config, ('data', 'attributes', 'restriction', 'geoblocking')) or {} + if geoblocking.get('restrictedArea'): + raise GeoRestrictedError(f'Video restricted to {geoblocking["code"]!r}', + countries=self._COUNTRIES_MAP.get(geoblocking['code'], ('DE', 'FR'))) + + if not traverse_obj(config, ('data', 'attributes', 'rights')): + # Eg: https://www.arte.tv/de/videos/097407-215-A/28-minuten + # Eg: https://www.arte.tv/es/videos/104351-002-A/serviteur-du-peuple-1-23 + raise ExtractorError( + 'Video is not available in this language edition of Arte or broadcast rights expired', expected=True) + + formats, subtitles = [], {} + secondary_formats = [] + for stream in config['data']['attributes']['streams']: + # official player contains code like `e.get("versions")[0].eStat.ml5` + stream_version = stream['versions'][0] + stream_version_code = stream_version['eStat']['ml5'] + + lang_pref = -1 + m = self._VERSION_CODE_RE.match(stream_version_code) + if m: + lang_pref = int(''.join('01'[x] for x in ( + m.group('vlang') == langauge_code, # we prefer voice in the requested language + not m.group('audio_desc'), # and not the audio description version + bool(m.group('original_voice')), # but if voice is not in the requested language, at least choose the original voice + m.group('sub_lang') == langauge_code, # if subtitles are present, we prefer them in the requested language + not m.group('has_sub'), # but we prefer no subtitles otherwise + not m.group('sdh_sub'), # and we prefer not the hard-of-hearing subtitles if there are subtitles + ))) + + short_label = traverse_obj(stream_version, 'shortLabel', expected_type=str, default='?') + if stream['protocol'].startswith('HLS'): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + stream['url'], video_id=video_id, ext='mp4', m3u8_id=stream_version_code, fatal=False) + for fmt in fmts: + fmt.update({ + 'format_note': f'{stream_version.get("label", "unknown")} [{short_label}]', 'language_preference': lang_pref, - 'format_note': format_note, }) - formats.extend(m3u8_formats) - continue + if any(map(short_label.startswith, ('cc', 'OGsub'))): + secondary_formats.extend(fmts) + else: + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + elif stream['protocol'] in ('HTTPS', 'RTMP'): + formats.append({ + 'format_id': f'{stream["protocol"]}-{stream_version_code}', + 'url': stream['url'], + 'format_note': f'{stream_version.get("label", 
"unknown")} [{short_label}]', + 'language_preference': lang_pref, + # 'ext': 'mp4', # XXX: may or may not be necessary, at least for HTTPS + }) - format = { - 'format_id': format_id, - 'language_preference': lang_pref, - 'format_note': format_note, - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'tbr': int_or_none(f.get('bitrate')), - 'quality': qfunc(f.get('quality')), - } - - if media_type == 'rtmp': - format['url'] = f['streamer'] - format['play_path'] = 'mp4:' + f['url'] - format['ext'] = 'flv' else: - format['url'] = f['url'] + self.report_warning(f'Skipping stream with unknown protocol {stream["protocol"]}') - formats.append(format) + # TODO: chapters from stream['segments']? + # The JS also looks for chapters in config['data']['attributes']['chapters'], + # but I am yet to find a video having those - # For this extractor, quality only represents the relative quality - # with respect to other formats with the same resolution - self._sort_formats(formats, ('res', 'quality')) + formats.extend(secondary_formats) + self._remove_duplicate_formats(formats) + + metadata = config['data']['attributes']['metadata'] return { - 'id': player_info.get('VID') or video_id, - 'title': title, - 'description': player_info.get('VDE') or player_info.get('V7T'), - 'upload_date': unified_strdate(upload_date_str), - 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'), + 'id': metadata['providerId'], + 'webpage_url': traverse_obj(metadata, ('link', 'url')), + 'title': traverse_obj(metadata, 'subtitle', 'title'), + 'alt_title': metadata.get('subtitle') and metadata.get('title'), + 'description': metadata.get('description'), + 'duration': traverse_obj(metadata, ('duration', 'seconds')), + 'language': metadata.get('language'), + 'timestamp': traverse_obj(config, ('data', 'attributes', 'rights', 'begin'), expected_type=parse_iso8601), + 'is_live': config['data']['attributes'].get('live', False), 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': [ + {'url': image['url'], 'id': image.get('caption')} + for image in metadata.get('images') or [] if url_or_none(image.get('url')) + ], } class ArteTVEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+' + _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1'] _TESTS = [{ 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A', 'info_dict': { @@ -197,17 +220,12 @@ class ArteTVEmbedIE(InfoExtractor): 'description': 'md5:be40b667f45189632b78c1425c7c2ce1', 'upload_date': '20201116', }, + 'skip': 'No video available' }, { 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1', - webpage)] - def _real_extract(self, url): qs = parse_qs(url) json_url = qs['json_url'][0] @@ -220,44 +238,36 @@ class ArteTVPlaylistIE(ArteTVBaseIE): _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES _TESTS = [{ 'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/', - 'info_dict': { - 'id': 'RC-016954', - 
'title': 'Earn a Living', - 'description': 'md5:d322c55011514b3a7241f7fb80d494c2', - }, - 'playlist_mincount': 6, + 'only_matching': True, }, { 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/', - 'only_matching': True, + 'playlist_mincount': 100, + 'info_dict': { + 'description': 'md5:84e7bf1feda248bc325ebfac818c476e', + 'id': 'RC-014123', + 'title': 'ARTE Reportage - najlepsze reportaże', + }, }] def _real_extract(self, url): - lang, playlist_id = self._match_valid_url(url).groups() - collection = self._download_json( - '%s/collectionData/%s/%s?source=videos' - % (self._API_BASE, lang, playlist_id), playlist_id) - entries = [] - for video in collection['videos']: - if not isinstance(video, dict): - continue - video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl')) - if not video_url: - continue - video_id = video.get('programId') - entries.append({ - '_type': 'url_transparent', - 'url': video_url, - 'id': video_id, - 'title': video.get('title'), - 'alt_title': video.get('subtitle'), - 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)), - 'duration': int_or_none(video.get('durationSeconds')), - 'view_count': int_or_none(video.get('views')), - 'ie_key': ArteTVIE.ie_key(), - }) - title = collection.get('title') - description = collection.get('shortDescription') or collection.get('teaserText') - return self.playlist_result(entries, playlist_id, title, description) + lang, playlist_id = self._match_valid_url(url).group('lang', 'id') + playlist = self._download_json( + f'{self._API_BASE}/playlist/{lang}/{playlist_id}', playlist_id)['data']['attributes'] + + entries = [{ + '_type': 'url_transparent', + 'url': video['config']['url'], + 'ie_key': ArteTVIE.ie_key(), + 'id': video.get('providerId'), + 'title': video.get('title'), + 'alt_title': video.get('subtitle'), + 'thumbnail': url_or_none(traverse_obj(video, ('mainImage', 'url'))), + 'duration': int_or_none(traverse_obj(video, ('duration', 'seconds'))), + } for video in traverse_obj(playlist, ('items', lambda _, v: v['config']['url']))] + + return self.playlist_result(entries, playlist_id, + traverse_obj(playlist, ('metadata', 'title')), + traverse_obj(playlist, ('metadata', 'description'))) class ArteTVCategoryIE(ArteTVBaseIE): @@ -270,14 +280,13 @@ class ArteTVCategoryIE(ArteTVBaseIE): 'description': 'Investigative documentary series, geopolitical analysis, and international commentary', }, 'playlist_mincount': 13, - }, - ] + }] @classmethod def suitable(cls, url): return ( not any(ie.suitable(url) for ie in (ArteTVIE, ArteTVPlaylistIE, )) - and super(ArteTVCategoryIE, cls).suitable(url)) + and super().suitable(url)) def _real_extract(self, url): lang, playlist_id = self._match_valid_url(url).groups() @@ -293,9 +302,7 @@ class ArteTVCategoryIE(ArteTVBaseIE): if any(ie.suitable(video) for ie in (ArteTVIE, ArteTVPlaylistIE, )): items.append(video) - title = (self._og_search_title(webpage, default=None) - or self._html_search_regex(r'<title\b[^>]*>([^<]+)</title>', default=None)) - title = strip_or_none(title.rsplit('|', 1)[0]) or self._generic_title(url) + title = strip_or_none(self._generic_title('', webpage, default='').rsplit('|', 1)[0]) or None return self.playlist_from_matches(items, playlist_id=playlist_id, playlist_title=title, description=self._og_search_description(webpage, default=None)) diff --git a/hypervideo_dl/extractor/asiancrush.py b/hypervideo_dl/extractor/asiancrush.py index 7f1940f..23f310e 100644 --- a/hypervideo_dl/extractor/asiancrush.py +++ 
b/hypervideo_dl/extractor/asiancrush.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import re diff --git a/hypervideo_dl/extractor/atresplayer.py b/hypervideo_dl/extractor/atresplayer.py index 465af4e..a20e7f9 100644 --- a/hypervideo_dl/extractor/atresplayer.py +++ b/hypervideo_dl/extractor/atresplayer.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -88,7 +84,6 @@ class AtresPlayerIE(InfoExtractor): elif src_type == 'application/dash+xml': formats, subtitles = self._extract_mpd_formats( src, video_id, mpd_id='dash', fatal=False) - self._sort_formats(formats) heartbeat = episode.get('heartbeat') or {} omniture = episode.get('omniture') or {} diff --git a/hypervideo_dl/extractor/atscaleconf.py b/hypervideo_dl/extractor/atscaleconf.py new file mode 100644 index 0000000..3f7b1e9 --- /dev/null +++ b/hypervideo_dl/extractor/atscaleconf.py @@ -0,0 +1,34 @@ +import re + +from .common import InfoExtractor + + +class AtScaleConfEventIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?atscaleconference\.com/events/(?P<id>[^/&$?]+)' + + _TESTS = [{ + 'url': 'https://atscaleconference.com/events/data-scale-spring-2022/', + 'playlist_mincount': 13, + 'info_dict': { + 'id': 'data-scale-spring-2022', + 'title': 'Data @Scale Spring 2022', + 'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55' + }, + }, { + 'url': 'https://atscaleconference.com/events/video-scale-2021/', + 'playlist_mincount': 14, + 'info_dict': { + 'id': 'video-scale-2021', + 'title': 'Video @Scale 2021', + 'description': 'md5:7d7ca1c42ac9c6d8a785092a1aea4b55' + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + + return self.playlist_from_matches( + re.findall(r'data-url\s*=\s*"(https?://(?:www\.)?atscaleconference\.com/videos/[^"]+)"', webpage), + ie='Generic', playlist_id=id, + title=self._og_search_title(webpage), description=self._og_search_description(webpage)) diff --git a/hypervideo_dl/extractor/atttechchannel.py b/hypervideo_dl/extractor/atttechchannel.py index 8f93fb3..6ff4ec0 100644 --- a/hypervideo_dl/extractor/atttechchannel.py +++ b/hypervideo_dl/extractor/atttechchannel.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_strdate diff --git a/hypervideo_dl/extractor/atvat.py b/hypervideo_dl/extractor/atvat.py index 481a097..d6ed9e4 100644 --- a/hypervideo_dl/extractor/atvat.py +++ b/hypervideo_dl/extractor/atvat.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import datetime from .common import InfoExtractor @@ -52,7 +49,6 @@ class ATVAtIE(InfoExtractor): 'url': source_url, 'format_id': protocol, }) - self._sort_formats(formats) return { 'id': clip_id, diff --git a/hypervideo_dl/extractor/audimedia.py b/hypervideo_dl/extractor/audimedia.py index 6bd48ef..35114e5 100644 --- a/hypervideo_dl/extractor/audimedia.py +++ b/hypervideo_dl/extractor/audimedia.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -79,7 +76,6 @@ class AudiMediaIE(InfoExtractor): 'format_id': 'http-%s' % bitrate, }) formats.append(f) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/audioboom.py b/hypervideo_dl/extractor/audioboom.py index c51837b..a23fcd2 100644 --- 
a/hypervideo_dl/extractor/audioboom.py +++ b/hypervideo_dl/extractor/audioboom.py @@ -1,27 +1,33 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor -from ..utils import ( - clean_html, - float_or_none, -) +from ..utils import clean_html, float_or_none, traverse_obj, unescapeHTML class AudioBoomIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?audioboom\.com/(?:boos|posts)/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://audioboom.com/posts/7398103-asim-chaudhry', - 'md5': '7b00192e593ff227e6a315486979a42d', + 'md5': '4d68be11c9f9daf3dab0778ad1e010c3', 'info_dict': { 'id': '7398103', 'ext': 'mp3', 'title': 'Asim Chaudhry', - 'description': 'md5:2f3fef17dacc2595b5362e1d7d3602fc', + 'description': 'md5:0ed714ae0e81e5d9119cac2f618ad679', 'duration': 4000.99, 'uploader': 'Sue Perkins: An hour or so with...', 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/perkins', } + }, { # Direct mp3-file link + 'url': 'https://audioboom.com/posts/8128496.mp3', + 'md5': 'e329edf304d450def95c7f86a9165ee1', + 'info_dict': { + 'id': '8128496', + 'ext': 'mp3', + 'title': 'TCRNo8 / DAILY 03 - In Control', + 'description': 'md5:44665f142db74858dfa21c5b34787948', + 'duration': 1689.7, + 'uploader': 'Lost Dot Podcast: The Trans Pyrenees and Transcontinental Race', + 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channels/5003904', + } }, { 'url': 'https://audioboom.com/posts/4279833-3-09-2016-czaban-hour-3?t=0', 'only_matching': True, @@ -29,45 +35,23 @@ class AudioBoomIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(f'https://audioboom.com/posts/{video_id}', video_id) - webpage = self._download_webpage(url, video_id) - - clip = None - - clip_store = self._parse_json( - self._html_search_regex( - r'data-new-clip-store=(["\'])(?P<json>{.+?})\1', - webpage, 'clip store', default='{}', group='json'), - video_id, fatal=False) - if clip_store: - clips = clip_store.get('clips') - if clips and isinstance(clips, list) and isinstance(clips[0], dict): - clip = clips[0] - - def from_clip(field): - if clip: - return clip.get(field) - - audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property( - 'audio', webpage, 'audio url') - title = from_clip('title') or self._html_search_meta( - ['og:title', 'og:audio:title', 'audio_title'], webpage) - description = from_clip('description') or clean_html(from_clip('formattedDescription')) or self._og_search_description(webpage) - - duration = float_or_none(from_clip('duration') or self._html_search_meta( - 'weibo:audio:duration', webpage)) - - uploader = from_clip('author') or self._html_search_meta( - ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader') - uploader_url = from_clip('author_url') or self._html_search_meta( - 'audioboo:channel', webpage, 'uploader url') + clip_store = self._search_json( + r'data-react-class="V5DetailPagePlayer"\s*data-react-props=["\']', + webpage, 'clip store', video_id, fatal=False, transform_source=unescapeHTML) + clip = traverse_obj(clip_store, ('clips', 0), expected_type=dict) or {} return { 'id': video_id, - 'url': audio_url, - 'title': title, - 'description': description, - 'duration': duration, - 'uploader': uploader, - 'uploader_url': uploader_url, + 'url': clip.get('clipURLPriorToLoading') or self._og_search_property('audio', webpage, 'audio url'), + 'title': clip.get('title') or self._html_search_meta(['og:title', 'og:audio:title', 'audio_title'], webpage), + 
'description': (clip.get('description') or clean_html(clip.get('formattedDescription')) + or self._og_search_description(webpage)), + 'duration': float_or_none(clip.get('duration') or self._html_search_meta('weibo:audio:duration', webpage)), + 'uploader': clip.get('author') or self._html_search_meta( + ['og:audio:artist', 'twitter:audio:artist_name', 'audio_artist'], webpage, 'uploader'), + 'uploader_url': clip.get('author_url') or self._html_search_regex( + r'<div class="avatar flex-shrink-0">\s*<a href="(?P<uploader_url>http[^"]+)"', + webpage, 'uploader url', fatal=False), } diff --git a/hypervideo_dl/extractor/audiodraft.py b/hypervideo_dl/extractor/audiodraft.py new file mode 100644 index 0000000..71e5afd --- /dev/null +++ b/hypervideo_dl/extractor/audiodraft.py @@ -0,0 +1,93 @@ +from .common import InfoExtractor +from ..utils import int_or_none + + +class AudiodraftBaseIE(InfoExtractor): + def _audiodraft_extract_from_id(self, player_entry_id): + data_json = self._download_json( + 'https://www.audiodraft.com/scripts/general/player/getPlayerInfoNew.php', player_entry_id, + headers={ + 'Content-type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'X-Requested-With': 'XMLHttpRequest', + }, data=f'id={player_entry_id}'.encode('utf-8')) + + return { + 'id': str(data_json['entry_id']), + 'title': data_json.get('entry_title'), + 'url': data_json['path'], + 'vcodec': 'none', + 'ext': 'mp3', + 'uploader': data_json.get('designer_name'), + 'uploader_id': data_json.get('designer_id'), + 'webpage_url': data_json.get('entry_url'), + 'like_count': int_or_none(data_json.get('entry_likes')), + 'average_rating': int_or_none(data_json.get('entry_rating')), + } + + +class AudiodraftCustomIE(AudiodraftBaseIE): + IE_NAME = 'Audiodraft:custom' + _VALID_URL = r'https?://(?:[-\w]+)\.audiodraft\.com/entry/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://nokiatune.audiodraft.com/entry/5874', + 'info_dict': { + 'id': '9485', + 'ext': 'mp3', + 'title': 'Hula Hula Calls', + 'uploader': 'unclemaki', + 'uploader_id': '13512', + 'average_rating': 5, + 'like_count': int, + }, + }, { + 'url': 'http://vikinggrace.audiodraft.com/entry/501', + 'info_dict': { + 'id': '22241', + 'ext': 'mp3', + 'title': 'MVG Happy', + 'uploader': 'frog', + 'uploader_id': '19142', + 'average_rating': 5, + 'like_count': int, + }, + }, { + 'url': 'http://timferriss.audiodraft.com/entry/765', + 'info_dict': { + 'id': '19710', + 'ext': 'mp3', + 'title': 'ferris03', + 'uploader': 'malex', + 'uploader_id': '17335', + 'average_rating': 5, + 'like_count': int, + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + webpage = self._download_webpage(url, id) + player_entry_id = self._search_regex(r'playAudio\(\'(player_entry_\d+)\'\);', webpage, id, 'play entry id') + return self._audiodraft_extract_from_id(player_entry_id) + + +class AudiodraftGenericIE(AudiodraftBaseIE): + IE_NAME = 'Audiodraft:generic' + _VALID_URL = r'https?://www\.audiodraft\.com/contests/[^/#]+#entries&eid=(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://www.audiodraft.com/contests/570-Score-A-Video-Surprise-Us#entries&eid=30138', + 'info_dict': { + 'id': '30138', + 'ext': 'mp3', + 'title': 'DROP in sound_V2', + 'uploader': 'TiagoSilva', + 'uploader_id': '19452', + 'average_rating': 4, + 'like_count': int, + }, + }] + + def _real_extract(self, url): + id = self._match_id(url) + return self._audiodraft_extract_from_id(f'player_entry_{id}') diff --git a/hypervideo_dl/extractor/audiomack.py b/hypervideo_dl/extractor/audiomack.py index 19775cf..5c4160f 100644 
--- a/hypervideo_dl/extractor/audiomack.py +++ b/hypervideo_dl/extractor/audiomack.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import time diff --git a/hypervideo_dl/extractor/audius.py b/hypervideo_dl/extractor/audius.py index fa64995..6448b44 100644 --- a/hypervideo_dl/extractor/audius.py +++ b/hypervideo_dl/extractor/audius.py @@ -1,11 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random from .common import InfoExtractor -from ..utils import ExtractorError, try_get, compat_str, str_or_none -from ..compat import compat_urllib_parse_unquote +from ..compat import compat_str, compat_urllib_parse_unquote +from ..utils import ExtractorError, str_or_none, try_get class AudiusBaseIE(InfoExtractor): @@ -171,7 +168,7 @@ class AudiusIE(AudiusBaseIE): } -class AudiusTrackIE(AudiusIE): +class AudiusTrackIE(AudiusIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'''(?x)(?:audius:)(?:https?://(?:www\.)?.+/v1/tracks/)?(?P<track_id>\w+)''' IE_NAME = 'audius:track' IE_DESC = 'Audius track ID or API link. Prepend with "audius:"' @@ -246,7 +243,7 @@ class AudiusPlaylistIE(AudiusBaseIE): playlist_data.get('description')) -class AudiusProfileIE(AudiusPlaylistIE): +class AudiusProfileIE(AudiusPlaylistIE): # XXX: Do not subclass from concrete IE IE_NAME = 'audius:artist' IE_DESC = 'Audius.co profile/artist pages' _VALID_URL = r'https?://(?:www)?audius\.co/(?P<id>[^\/]+)/?(?:[?#]|$)' diff --git a/hypervideo_dl/extractor/awaan.py b/hypervideo_dl/extractor/awaan.py index f5e559c..6fc938d 100644 --- a/hypervideo_dl/extractor/awaan.py +++ b/hypervideo_dl/extractor/awaan.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 from .common import InfoExtractor @@ -44,7 +41,7 @@ class AWAANBaseIE(InfoExtractor): 'id': video_id, 'title': title, 'description': video_data.get('description_en') or video_data.get('description_ar'), - 'thumbnail': format_field(img, template='http://admin.mangomolo.com/analytics/%s'), + 'thumbnail': format_field(img, None, 'http://admin.mangomolo.com/analytics/%s'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': parse_iso8601(video_data.get('create_time'), ' '), 'is_live': is_live, diff --git a/hypervideo_dl/extractor/aws.py b/hypervideo_dl/extractor/aws.py index dccfeaf..eb831a1 100644 --- a/hypervideo_dl/extractor/aws.py +++ b/hypervideo_dl/extractor/aws.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import datetime import hashlib import hmac @@ -9,7 +6,7 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_urlencode -class AWSIE(InfoExtractor): +class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _AWS_ALGORITHM = 'AWS4-HMAC-SHA256' _AWS_REGION = 'us-east-1' diff --git a/hypervideo_dl/extractor/azmedien.py b/hypervideo_dl/extractor/azmedien.py index 0168340..d1686ee 100644 --- a/hypervideo_dl/extractor/azmedien.py +++ b/hypervideo_dl/extractor/azmedien.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/baidu.py b/hypervideo_dl/extractor/baidu.py index 364fd94..8786d67 100644 --- a/hypervideo_dl/extractor/baidu.py +++ b/hypervideo_dl/extractor/baidu.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import unescapeHTML diff --git 
a/hypervideo_dl/extractor/banbye.py b/hypervideo_dl/extractor/banbye.py index 3d4d36e..c873425 100644 --- a/hypervideo_dl/extractor/banbye.py +++ b/hypervideo_dl/extractor/banbye.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import math from .common import InfoExtractor @@ -83,8 +80,6 @@ class BanByeIE(BanByeBaseIE): 'url': f'{self._CDN_BASE}/video/{video_id}/{quality}.mp4', } for quality in data['quality']] - self._sort_formats(formats) - return { 'id': video_id, 'title': data.get('title'), diff --git a/hypervideo_dl/extractor/bandaichannel.py b/hypervideo_dl/extractor/bandaichannel.py index f1bcdef..d7fcf44 100644 --- a/hypervideo_dl/extractor/bandaichannel.py +++ b/hypervideo_dl/extractor/bandaichannel.py @@ -1,11 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .brightcove import BrightcoveNewIE +from .brightcove import BrightcoveNewBaseIE from ..utils import extract_attributes -class BandaiChannelIE(BrightcoveNewIE): +class BandaiChannelIE(BrightcoveNewBaseIE): IE_NAME = 'bandaichannel' _VALID_URL = r'https?://(?:www\.)?b-ch\.com/titles/(?P<id>\d+/\d+)' _TESTS = [{ diff --git a/hypervideo_dl/extractor/bandcamp.py b/hypervideo_dl/extractor/bandcamp.py index 745055e..de81e0d 100644 --- a/hypervideo_dl/extractor/bandcamp.py +++ b/hypervideo_dl/extractor/bandcamp.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random import re import time @@ -8,23 +5,24 @@ import time from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + KNOWN_EXTENSIONS, ExtractorError, float_or_none, int_or_none, - KNOWN_EXTENSIONS, parse_filesize, str_or_none, try_get, - update_url_query, unified_strdate, unified_timestamp, + update_url_query, url_or_none, urljoin, ) class BandcampIE(InfoExtractor): - _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?P<uploader>[^/]+)\.bandcamp\.com/track/(?P<id>[^/?#&]+)' + _EMBED_REGEX = [r'<meta property="og:url"[^>]*?content="(?P<url>.*?bandcamp\.com.*?)"'] _TESTS = [{ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', @@ -87,7 +85,7 @@ class BandcampIE(InfoExtractor): attr + ' data', group=2), video_id, fatal=fatal) def _real_extract(self, url): - title = self._match_id(url) + title, uploader = self._match_valid_url(url).group('id', 'uploader') webpage = self._download_webpage(url, title) tralbum = self._extract_data_attr(webpage, title) thumbnail = self._og_search_thumbnail(webpage) @@ -186,8 +184,6 @@ class BandcampIE(InfoExtractor): 'acodec': format_id.split('-')[0], }) - self._sort_formats(formats) - title = '%s - %s' % (artist, track) if artist else track if not duration: @@ -199,6 +195,8 @@ class BandcampIE(InfoExtractor): 'title': title, 'thumbnail': thumbnail, 'uploader': artist, + 'uploader_id': uploader, + 'uploader_url': f'https://{uploader}.bandcamp.com', 'timestamp': timestamp, 'release_timestamp': unified_timestamp(tralbum.get('album_release_date')), 'duration': duration, @@ -211,7 +209,7 @@ class BandcampIE(InfoExtractor): } -class BandcampAlbumIE(BandcampIE): +class BandcampAlbumIE(BandcampIE): # XXX: Do not subclass from concrete IE IE_NAME = 'Bandcamp:album' _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com/album/(?P<id>[^/?#&]+)' @@ -314,7 +312,7 @@ class BandcampAlbumIE(BandcampIE): } -class BandcampWeeklyIE(BandcampIE): +class BandcampWeeklyIE(BandcampIE): # XXX: Do not subclass from concrete IE IE_NAME = 
'Bandcamp:weekly' _VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)' _TESTS = [{ @@ -363,7 +361,6 @@ class BandcampWeeklyIE(BandcampIE): 'ext': ext, 'vcodec': 'none', }) - self._sort_formats(formats) title = show.get('audio_title') or 'Bandcamp Weekly' subtitle = show.get('subtitle') @@ -439,7 +436,7 @@ class BandcampUserIE(InfoExtractor): uploader = self._match_id(url) webpage = self._download_webpage(url, uploader) - discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\']([^"\']+)', webpage) + discography_data = (re.findall(r'<li data-item-id=["\'][^>]+>\s*<a href=["\'](?![^"\'/]*?/merch)([^"\']+)', webpage) or re.findall(r'<div[^>]+trackTitle["\'][^"\']+["\']([^"\']+)', webpage)) return self.playlist_from_matches( diff --git a/hypervideo_dl/extractor/bannedvideo.py b/hypervideo_dl/extractor/bannedvideo.py index 3db1151..51e7220 100644 --- a/hypervideo_dl/extractor/bannedvideo.py +++ b/hypervideo_dl/extractor/bannedvideo.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -137,7 +135,6 @@ query GetCommentReplies($id: String!) { formats.extend(self._extract_m3u8_formats( video_info.get('streamUrl'), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', live=True)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/bbc.py b/hypervideo_dl/extractor/bbc.py index 29ad7de..9d28e70 100644 --- a/hypervideo_dl/extractor/bbc.py +++ b/hypervideo_dl/extractor/bbc.py @@ -1,19 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import itertools import json import re +import urllib.error +import xml.etree.ElementTree from .common import InfoExtractor -from ..compat import ( - compat_etree_Element, - compat_HTTPError, - compat_str, - compat_urllib_error, - compat_urlparse, -) +from ..compat import compat_HTTPError, compat_str, compat_urlparse from ..utils import ( ExtractorError, OnDemandPagedList, @@ -53,6 +46,7 @@ class BBCCoUkIE(InfoExtractor): ) (?P<id>%s)(?!/(?:episodes|broadcasts|clips)) ''' % _ID_REGEX + _EMBED_REGEX = [r'setPlaylist\("(?P<url>https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)'] _LOGIN_URL = 'https://account.bbc.com/signin' _NETRC_MACHINE = 'bbc' @@ -318,7 +312,7 @@ class BBCCoUkIE(InfoExtractor): continue captions = self._download_xml( cc_url, programme_id, 'Downloading captions', fatal=False) - if not isinstance(captions, compat_etree_Element): + if not isinstance(captions, xml.etree.ElementTree.Element): continue subtitles['en'] = [ { @@ -394,7 +388,7 @@ class BBCCoUkIE(InfoExtractor): href, programme_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) except ExtractorError as e: - if not (isinstance(e.exc_info[1], compat_urllib_error.HTTPError) + if not (isinstance(e.exc_info[1], urllib.error.HTTPError) and e.exc_info[1].code in (403, 404)): raise fmts = [] @@ -581,8 +575,6 @@ class BBCCoUkIE(InfoExtractor): else: programme_id, title, description, duration, formats, subtitles = self._download_playlist(group_id) - self._sort_formats(formats) - return { 'id': programme_id, 'title': title, @@ -594,10 +586,15 @@ class BBCCoUkIE(InfoExtractor): } -class BBCIE(BBCCoUkIE): +class BBCIE(BBCCoUkIE): # XXX: Do not subclass from concrete IE IE_NAME = 'bbc' IE_DESC = 'BBC' - _VALID_URL = r'https?://(?:www\.)?bbc\.(?:com|co\.uk)/(?:[^/]+/)+(?P<id>[^/#?]+)' + _VALID_URL = r'''(?x) + https?://(?:www\.)?(?: + bbc\.(?:com|co\.uk)| + 
bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd\.onion| + bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad\.onion + )/(?:[^/]+/)+(?P<id>[^/#?]+)''' _MEDIA_SETS = [ 'pc', @@ -847,6 +844,12 @@ class BBCIE(BBCCoUkIE): 'upload_date': '20190604', 'categories': ['Psychology'], }, + }, { # onion routes + 'url': 'https://www.bbcnewsd73hkzno2ini43t4gblxvycyac5aw4gnv7t2rccijh7745uqd.onion/news/av/world-europe-63208576', + 'only_matching': True, + }, { + 'url': 'https://www.bbcweb3hytmzhn5d532owbu6oqadra5z3ar726vq5kgwwn6aucdccrad.onion/sport/av/football/63195681', + 'only_matching': True, }] @classmethod @@ -885,7 +888,6 @@ class BBCIE(BBCCoUkIE): def _extract_from_playlist_sxml(self, url, playlist_id, timestamp): programme_id, title, description, duration, formats, subtitles = \ self._process_legacy_playlist_url(url, playlist_id) - self._sort_formats(formats) return { 'id': programme_id, 'title': title, @@ -904,12 +906,8 @@ class BBCIE(BBCCoUkIE): json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) timestamp = json_ld_info.get('timestamp') - playlist_title = json_ld_info.get('title') - if not playlist_title: - playlist_title = (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'playlist title', default=None)) - if playlist_title: - playlist_title = re.sub(r'(.+)\s*-\s*BBC.*?$', r'\1', playlist_title).strip() + playlist_title = json_ld_info.get('title') or re.sub( + r'(.+)\s*-\s*BBC.*?$', r'\1', self._generic_title('', webpage, default='')).strip() or None playlist_description = json_ld_info.get( 'description') or self._og_search_description(webpage, default=None) @@ -953,7 +951,6 @@ class BBCIE(BBCCoUkIE): duration = int_or_none(items[0].get('duration')) programme_id = items[0].get('vpid') formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) entries.append({ 'id': programme_id, 'title': title, @@ -990,7 +987,6 @@ class BBCIE(BBCCoUkIE): continue raise if entry: - self._sort_formats(entry['formats']) entries.append(entry) if entries: @@ -1014,7 +1010,6 @@ class BBCIE(BBCCoUkIE): if programme_id: formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) # digitalData may be missing (e.g. 
http://www.bbc.com/autos/story/20130513-hyundais-rock-star) digital_data = self._parse_json( self._search_regex( @@ -1046,7 +1041,6 @@ class BBCIE(BBCCoUkIE): if version_id: title = smp_data['title'] formats, subtitles = self._download_media_selector(version_id) - self._sort_formats(formats) image_url = smp_data.get('holdingImageURL') display_date = init_data.get('displayDate') topic_title = init_data.get('topicTitle') @@ -1088,7 +1082,6 @@ class BBCIE(BBCCoUkIE): continue title = lead_media.get('title') or self._og_search_title(webpage) formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) description = lead_media.get('summary') uploader = lead_media.get('masterBrand') uploader_id = lead_media.get('mid') @@ -1117,7 +1110,6 @@ class BBCIE(BBCCoUkIE): if current_programme and programme_id and current_programme.get('type') == 'playable_item': title = current_programme.get('titles', {}).get('tertiary') or playlist_title formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) synopses = current_programme.get('synopses') or {} network = current_programme.get('network') or {} duration = int_or_none( @@ -1150,7 +1142,6 @@ class BBCIE(BBCCoUkIE): clip_title = clip.get('title') if clip_vpid and clip_title: formats, subtitles = self._download_media_selector(clip_vpid) - self._sort_formats(formats) return { 'id': clip_vpid, 'title': clip_title, @@ -1172,7 +1163,6 @@ class BBCIE(BBCCoUkIE): if not programme_id: continue formats, subtitles = self._download_media_selector(programme_id) - self._sort_formats(formats) entries.append({ 'id': programme_id, 'title': playlist_title, @@ -1204,7 +1194,6 @@ class BBCIE(BBCCoUkIE): if not (item_id and item_title): continue formats, subtitles = self._download_media_selector(item_id) - self._sort_formats(formats) item_desc = None blocks = try_get(media, lambda x: x['summary']['blocks'], list) if blocks: @@ -1238,7 +1227,7 @@ class BBCIE(BBCCoUkIE): (lambda x: x['data']['blocks'], lambda x: x['data']['content']['model']['blocks'],), list) or []): - if block.get('type') != 'media': + if block.get('type') not in ['media', 'video']: continue parse_media(block.get('model')) return self.playlist_result( @@ -1305,7 +1294,6 @@ class BBCIE(BBCCoUkIE): formats, subtitles = self._extract_from_media_meta(media_meta, playlist_id) if not formats and not self.get_param('ignore_no_formats'): continue - self._sort_formats(formats) video_id = media_meta.get('externalId') if not video_id: diff --git a/hypervideo_dl/extractor/beatport.py b/hypervideo_dl/extractor/beatport.py index e1cf8b4..0aecbd0 100644 --- a/hypervideo_dl/extractor/beatport.py +++ b/hypervideo_dl/extractor/beatport.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -77,7 +74,6 @@ class BeatportIE(InfoExtractor): fmt['abr'] = 96 fmt['asr'] = 44100 formats.append(fmt) - self._sort_formats(formats) images = [] for name, info in track['images'].items(): diff --git a/hypervideo_dl/extractor/beeg.py b/hypervideo_dl/extractor/beeg.py index 717fff3..52ee68e 100644 --- a/hypervideo_dl/extractor/beeg.py +++ b/hypervideo_dl/extractor/beeg.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( @@ -78,8 +76,6 @@ class BeegIE(InfoExtractor): f['height'] = height formats.extend(current_formats) - self._sort_formats(formats) - return { 'id': video_id, 'display_id': first_fact.get('id'), diff --git 
a/hypervideo_dl/extractor/behindkink.py b/hypervideo_dl/extractor/behindkink.py index 2c97f98..ca44981 100644 --- a/hypervideo_dl/extractor/behindkink.py +++ b/hypervideo_dl/extractor/behindkink.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import url_basename diff --git a/hypervideo_dl/extractor/bellmedia.py b/hypervideo_dl/extractor/bellmedia.py index 904c17e..5ae4b91 100644 --- a/hypervideo_dl/extractor/bellmedia.py +++ b/hypervideo_dl/extractor/bellmedia.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor @@ -28,7 +24,7 @@ class BellMediaIE(InfoExtractor): )/.*?(?:\b(?:vid(?:eoid)?|clipId)=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})''' _TESTS = [{ 'url': 'https://www.bnnbloomberg.ca/video/david-cockfield-s-top-picks~1403070', - 'md5': '36d3ef559cfe8af8efe15922cd3ce950', + 'md5': '3e5b8e38370741d5089da79161646635', 'info_dict': { 'id': '1403070', 'ext': 'flv', @@ -36,6 +32,14 @@ class BellMediaIE(InfoExtractor): 'description': 'md5:810f7f8c6a83ad5b48677c3f8e5bb2c3', 'upload_date': '20180525', 'timestamp': 1527288600, + 'season_id': 73997, + 'season': '2018', + 'thumbnail': 'http://images2.9c9media.com/image_asset/2018_5_25_baf30cbd-b28d-4a18-9903-4bb8713b00f5_PNG_956x536.jpg', + 'tags': [], + 'categories': ['ETFs'], + 'season_number': 8, + 'duration': 272.038, + 'series': 'Market Call Tonight', }, }, { 'url': 'http://www.thecomedynetwork.ca/video/player?vid=923582', diff --git a/hypervideo_dl/extractor/berufetv.py b/hypervideo_dl/extractor/berufetv.py new file mode 100644 index 0000000..8160cbd --- /dev/null +++ b/hypervideo_dl/extractor/berufetv.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import float_or_none, mimetype2ext, traverse_obj + + +class BerufeTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?web\.arbeitsagentur\.de/berufetv/[^?#]+/film;filmId=(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://web.arbeitsagentur.de/berufetv/studienberufe/wirtschaftswissenschaften/wirtschaftswissenschaften-volkswirtschaft/film;filmId=DvKC3DUpMKvUZ_6fEnfg3u', + 'md5': '041b6432ec8e6838f84a5c30f31cc795', + 'info_dict': { + 'id': 'DvKC3DUpMKvUZ_6fEnfg3u', + 'ext': 'mp4', + 'title': 'Volkswirtschaftslehre', + 'description': 'md5:6bd87d0c63163480a6489a37526ee1c1', + 'categories': ['Studien­beruf'], + 'tags': ['Studienfilm'], + 'duration': 602.440, + 'thumbnail': r're:^https://asset-out-cdn\.video-cdn\.net/private/videos/DvKC3DUpMKvUZ_6fEnfg3u/thumbnails/793063\?quality=thumbnail&__token__=[^\s]+$', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + movie_metadata = self._download_json( + 'https://rest.arbeitsagentur.de/infosysbub/berufetv/pc/v1/film-metadata', + video_id, 'Downloading JSON metadata', + headers={'X-API-Key': '79089773-4892-4386-86e6-e8503669f426'}, fatal=False) + + meta = traverse_obj( + movie_metadata, ('metadaten', lambda _, i: video_id == i['miId']), + get_all=False, default={}) + + video = self._download_json( + f'https://d.video-cdn.net/play/player/8YRzUk6pTzmBdrsLe9Y88W/video/{video_id}', + video_id, 'Downloading video JSON') + + formats, subtitles = [], {} + for key, source in video['videoSources']['html'].items(): + if key == 'auto': + fmts, subs = self._extract_m3u8_formats_and_subtitles(source[0]['source'], video_id) + formats += fmts + subtitles = subs + else: + formats.append({ + 'url': source[0]['source'], + 'ext': mimetype2ext(source[0]['mimeType']), + 'format_id': key, 
+ }) + + for track in video.get('videoTracks') or []: + if track.get('type') != 'SUBTITLES': + continue + subtitles.setdefault(track['language'], []).append({ + 'url': track['source'], + 'name': track.get('label'), + 'ext': 'vtt' + }) + + return { + 'id': video_id, + 'title': meta.get('titel') or traverse_obj(video, ('videoMetaData', 'title')), + 'description': meta.get('beschreibung'), + 'thumbnail': meta.get('thumbnail') or f'https://asset-out-cdn.video-cdn.net/private/videos/{video_id}/thumbnails/active', + 'duration': float_or_none(video.get('duration'), scale=1000), + 'categories': [meta['kategorie']] if meta.get('kategorie') else None, + 'tags': meta.get('themengebiete'), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/hypervideo_dl/extractor/bet.py b/hypervideo_dl/extractor/bet.py index 2c71442..6b867d1 100644 --- a/hypervideo_dl/extractor/bet.py +++ b/hypervideo_dl/extractor/bet.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .mtv import MTVServicesInfoExtractor from ..utils import unified_strdate diff --git a/hypervideo_dl/extractor/bfi.py b/hypervideo_dl/extractor/bfi.py index 60c8944..76f0516 100644 --- a/hypervideo_dl/extractor/bfi.py +++ b/hypervideo_dl/extractor/bfi.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/bfmtv.py b/hypervideo_dl/extractor/bfmtv.py index 501f69d..d86d283 100644 --- a/hypervideo_dl/extractor/bfmtv.py +++ b/hypervideo_dl/extractor/bfmtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -45,7 +42,7 @@ class BFMTVIE(BFMTVBaseIE): return self._brightcove_url_result(video_block['videoid'], video_block) -class BFMTVLiveIE(BFMTVIE): +class BFMTVLiveIE(BFMTVIE): # XXX: Do not subclass from concrete IE IE_NAME = 'bfmtv:live' _VALID_URL = BFMTVBaseIE._VALID_URL_BASE + '(?P<id>(?:[^/]+/)?en-direct)' _TESTS = [{ diff --git a/hypervideo_dl/extractor/bibeltv.py b/hypervideo_dl/extractor/bibeltv.py index 56c2bfe..fd20aad 100644 --- a/hypervideo_dl/extractor/bibeltv.py +++ b/hypervideo_dl/extractor/bibeltv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/bigflix.py b/hypervideo_dl/extractor/bigflix.py index 28e3e59..02d1ba0 100644 --- a/hypervideo_dl/extractor/bigflix.py +++ b/hypervideo_dl/extractor/bigflix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -66,8 +63,6 @@ class BigflixIE(InfoExtractor): 'url': decode_url(file_url), }) - self._sort_formats(formats) - description = self._html_search_meta('description', webpage) return { diff --git a/hypervideo_dl/extractor/bigo.py b/hypervideo_dl/extractor/bigo.py index ddf76ac..1cb6e58 100644 --- a/hypervideo_dl/extractor/bigo.py +++ b/hypervideo_dl/extractor/bigo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ExtractorError, urlencode_postdata @@ -31,7 +28,7 @@ class BigoIE(InfoExtractor): user_id = self._match_id(url) info_raw = self._download_json( - 'https://bigo.tv/studio/getInternalStudioInfo', + 'https://ta.bigo.tv/official_website/studio/getInternalStudioInfo', user_id, data=urlencode_postdata({'siteId': user_id})) if not isinstance(info_raw, dict): @@ -44,14 +41,14 @@ class BigoIE(InfoExtractor): if not info.get('alive'): raise 
ExtractorError('This user is offline.', expected=True) + formats, subs = self._extract_m3u8_formats_and_subtitles( + info.get('hls_src'), user_id, 'mp4', 'm3u8') + return { 'id': info.get('roomId') or user_id, 'title': info.get('roomTopic') or info.get('nick_name') or user_id, - 'formats': [{ - 'url': info.get('hls_src'), - 'ext': 'mp4', - 'protocol': 'm3u8', - }], + 'formats': formats, + 'subtitles': subs, 'thumbnail': info.get('snapshot'), 'uploader': info.get('nick_name'), 'uploader_id': user_id, diff --git a/hypervideo_dl/extractor/bild.py b/hypervideo_dl/extractor/bild.py index b8dfbd4..f3dea33 100644 --- a/hypervideo_dl/extractor/bild.py +++ b/hypervideo_dl/extractor/bild.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py index 909f7f8..bc04241 100644 --- a/hypervideo_dl/extractor/bilibili.py +++ b/hypervideo_dl/extractor/bilibili.py @@ -1,509 +1,561 @@ -# coding: utf-8 - import base64 -import hashlib -import itertools import functools -import re +import itertools import math +import urllib.error +import urllib.parse from .common import InfoExtractor, SearchInfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urlparse, - compat_urllib_parse_urlparse -) from ..utils import ( ExtractorError, + GeoRestrictedError, + InAdvancePagedList, + OnDemandPagedList, filter_dict, - int_or_none, float_or_none, + format_field, + int_or_none, + make_archive_id, mimetype2ext, - parse_iso8601, - traverse_obj, parse_count, - smuggle_url, + parse_qs, + qualities, srt_subtitles_timecode, str_or_none, - strip_jsonp, - unified_timestamp, - unsmuggle_url, - urlencode_postdata, + traverse_obj, url_or_none, - OnDemandPagedList + urlencode_postdata, ) -class BiliBiliIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?:// - (?:(?:www|bangumi)\.)? - bilibili\.(?:tv|com)/ - (?: - (?: - video/[aA][vV]| - anime/(?P<anime_id>\d+)/play\# - )(?P<id>\d+)| - (s/)?video/[bB][vV](?P<id_bv>[^/?#&]+) - ) - (?:/?\?p=(?P<page>\d+))? 
- ''' +class BilibiliBaseIE(InfoExtractor): + def extract_formats(self, play_info): + format_names = { + r['quality']: traverse_obj(r, 'new_description', 'display_desc') + for r in traverse_obj(play_info, ('support_formats', lambda _, v: v['quality'])) + } + + audios = traverse_obj(play_info, ('dash', 'audio', ...)) + flac_audio = traverse_obj(play_info, ('dash', 'flac', 'audio')) + if flac_audio: + audios.append(flac_audio) + formats = [{ + 'url': traverse_obj(audio, 'baseUrl', 'base_url', 'url'), + 'ext': mimetype2ext(traverse_obj(audio, 'mimeType', 'mime_type')), + 'acodec': audio.get('codecs'), + 'vcodec': 'none', + 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), + 'filesize': int_or_none(audio.get('size')) + } for audio in audios] + + formats.extend({ + 'url': traverse_obj(video, 'baseUrl', 'base_url', 'url'), + 'ext': mimetype2ext(traverse_obj(video, 'mimeType', 'mime_type')), + 'fps': float_or_none(traverse_obj(video, 'frameRate', 'frame_rate')), + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'vcodec': video.get('codecs'), + 'acodec': 'none' if audios else None, + 'tbr': float_or_none(video.get('bandwidth'), scale=1000), + 'filesize': int_or_none(video.get('size')), + 'quality': int_or_none(video.get('id')), + 'format': format_names.get(video.get('id')), + } for video in traverse_obj(play_info, ('dash', 'video', ...))) + + missing_formats = format_names.keys() - set(traverse_obj(formats, (..., 'quality'))) + if missing_formats: + self.to_screen(f'Format(s) {", ".join(format_names[i] for i in missing_formats)} are missing; ' + f'you have to login or become premium member to download them. {self._login_hint()}') + + return formats + + def json2srt(self, json_data): + srt_data = '' + for idx, line in enumerate(json_data.get('body') or []): + srt_data += (f'{idx + 1}\n' + f'{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n' + f'{line["content"]}\n\n') + return srt_data + + def _get_subtitles(self, video_id, initial_state, cid): + subtitles = { + 'danmaku': [{ + 'ext': 'xml', + 'url': f'https://comment.bilibili.com/{cid}.xml', + }] + } + + for s in traverse_obj(initial_state, ('videoData', 'subtitle', 'list')) or []: + subtitles.setdefault(s['lan'], []).append({ + 'ext': 'srt', + 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)) + }) + return subtitles + + def _get_chapters(self, aid, cid): + chapters = aid and cid and self._download_json( + 'https://api.bilibili.com/x/player/v2', aid, query={'aid': aid, 'cid': cid}, + note='Extracting chapters', fatal=False) + return traverse_obj(chapters, ('data', 'view_points', ..., { + 'title': 'content', + 'start_time': 'from', + 'end_time': 'to', + })) or None + + def _get_comments(self, aid): + for idx in itertools.count(1): + replies = traverse_obj( + self._download_json( + f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={aid}&type=1&jsonp=jsonp&sort=2&_=1567227301685', + aid, note=f'Extracting comments from page {idx}', fatal=False), + ('data', 'replies')) + if not replies: + return + for children in map(self._get_all_children, replies): + yield from children + + def _get_all_children(self, reply): + yield { + 'author': traverse_obj(reply, ('member', 'uname')), + 'author_id': traverse_obj(reply, ('member', 'mid')), + 'id': reply.get('rpid'), + 'text': traverse_obj(reply, ('content', 'message')), + 'timestamp': reply.get('ctime'), + 'parent': reply.get('parent') or 'root', + } + for children in map(self._get_all_children, 
traverse_obj(reply, ('replies', ...))): + yield from children + + +class BiliBiliIE(BilibiliBaseIE): + _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P<id>[^/?#&]+)' _TESTS = [{ + 'url': 'https://www.bilibili.com/video/BV13x41117TL', + 'info_dict': { + 'id': 'BV13x41117TL', + 'title': '阿滴英文|英文歌分享#6 "Closer', + 'ext': 'mp4', + 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', + 'uploader_id': '65880958', + 'uploader': '阿滴英文', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'duration': 554.117, + 'tags': list, + 'comment_count': int, + 'upload_date': '20170301', + 'timestamp': 1488353834, + 'like_count': int, + 'view_count': int, + }, + }, { + # old av URL version 'url': 'http://www.bilibili.com/video/av1074402/', - 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', 'info_dict': { - 'id': '1074402_part1', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg)$', 'ext': 'mp4', - 'title': '【金坷垃】金泡沫', - 'uploader_id': '156160', 'uploader': '菊子桑', + 'uploader_id': '156160', + 'id': 'BV11x411K7CN', + 'title': '【金坷垃】金泡沫', + 'duration': 308.36, 'upload_date': '20140420', + 'timestamp': 1397983878, 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'timestamp': 1398012678, + 'like_count': int, + 'comment_count': int, + 'view_count': int, + 'tags': list, }, + 'params': {'skip_download': True}, }, { - # Tested in BiliBiliBangumiIE - 'url': 'http://bangumi.bilibili.com/anime/1869/play#40062', - 'only_matching': True, + 'note': 'Anthology', + 'url': 'https://www.bilibili.com/video/BV1bK411W797', + 'info_dict': { + 'id': 'BV1bK411W797', + 'title': '物语中的人物是如何吐槽自己的OP的' + }, + 'playlist_count': 18, + 'playlist': [{ + 'info_dict': { + 'id': 'BV1bK411W797_p1', + 'ext': 'mp4', + 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', + 'tags': 'count:11', + 'timestamp': 1589601697, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'uploader': '打牌还是打桩', + 'uploader_id': '150259984', + 'like_count': int, + 'comment_count': int, + 'upload_date': '20200516', + 'view_count': int, + 'description': 'md5:e3c401cf7bc363118d1783dd74068a68', + 'duration': 90.314, + } + }] }, { - # bilibili.tv - 'url': 'http://www.bilibili.tv/video/av1074402/', - 'only_matching': True, + 'note': 'Specific page of Anthology', + 'url': 'https://www.bilibili.com/video/BV1bK411W797?p=1', + 'info_dict': { + 'id': 'BV1bK411W797_p1', + 'ext': 'mp4', + 'title': '物语中的人物是如何吐槽自己的OP的 p01 Staple Stable/战场原+羽川', + 'tags': 'count:11', + 'timestamp': 1589601697, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'uploader': '打牌还是打桩', + 'uploader_id': '150259984', + 'like_count': int, + 'comment_count': int, + 'upload_date': '20200516', + 'view_count': int, + 'description': 'md5:e3c401cf7bc363118d1783dd74068a68', + 'duration': 90.314, + } }, { - 'url': 'http://bangumi.bilibili.com/anime/5802/play#100643', - 'md5': '3f721ad1e75030cc06faf73587cfec57', + 'note': 'video has subtitles', + 'url': 'https://www.bilibili.com/video/BV12N4y1M7rh', 'info_dict': { - 'id': '100643_part1', + 'id': 'BV12N4y1M7rh', 'ext': 'mp4', - 'title': 'CHAOS;CHILD', - 'description': '如果你是神明,并且能够让妄想成为现实。那你会进行怎么样的妄想?是淫靡的世界?独裁社会?毁灭性的制裁?还是……2015年,涩谷。从6年前发生的大灾害“涩谷地震”之后复兴了的这个街区里新设立的私立高中...', + 'title': 'md5:96e8bb42c2b432c0d4ce3434a61479c1', + 'tags': list, + 'description': 'md5:afde2b7ba9025c01d9e3dde10de221e4', + 'duration': 313.557, + 'upload_date': '20220709', + 'uploader': '小夫Tech', + 'timestamp': 1657347907, + 'uploader_id': '1326814124', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'thumbnail': 
r're:^https?://.*\.(jpg|jpeg|png)$', + 'subtitles': 'count:2' }, - 'skip': 'Geo-restricted to China', + 'params': {'listsubtitles': True}, }, { - 'url': 'http://www.bilibili.com/video/av8903802/', + 'url': 'https://www.bilibili.com/video/av8903802/', 'info_dict': { - 'id': '8903802_part1', + 'id': 'BV13x41117TL', 'ext': 'mp4', 'title': '阿滴英文|英文歌分享#6 "Closer', 'upload_date': '20170301', - 'description': '滴妹今天唱Closer給你聽! 有史以来,被推最多次也是最久的歌曲,其实歌词跟我原本想像差蛮多的,不过还是好听! 微博@阿滴英文', - 'timestamp': 1488382634, + 'description': 'md5:3b1b9e25b78da4ef87e9b548b88ee76a', + 'timestamp': 1488353834, 'uploader_id': '65880958', 'uploader': '阿滴英文', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'duration': 554.117, + 'tags': list, + 'comment_count': int, + 'view_count': int, + 'like_count': int, }, 'params': { 'skip_download': True, }, }, { - # new BV video id format - 'url': 'https://www.bilibili.com/video/BV1JE411F741', - 'only_matching': True, - }, { - # Anthology - 'url': 'https://www.bilibili.com/video/BV1bK411W797', + 'note': 'video has chapter', + 'url': 'https://www.bilibili.com/video/BV1vL411G7N7/', 'info_dict': { - 'id': 'BV1bK411W797', - 'title': '物语中的人物是如何吐槽自己的OP的' + 'id': 'BV1vL411G7N7', + 'ext': 'mp4', + 'title': '如何为你的B站视频添加进度条分段', + 'timestamp': 1634554558, + 'upload_date': '20211018', + 'description': 'md5:a9a3d6702b3a94518d419b2e9c320a6d', + 'tags': list, + 'uploader': '爱喝咖啡的当麻', + 'duration': 669.482, + 'uploader_id': '1680903', + 'chapters': 'count:6', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, - 'playlist_count': 17, + 'params': {'skip_download': True}, }] - _APP_KEY = 'iVGUTjsxvpLeuDCf' - _BILIBILI_KEY = 'aHRmhWMLkdeMuILqORnYZocwMBpMEOdt' + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) + play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] - def _report_error(self, result): - if 'message' in result: - raise ExtractorError('%s said: %s' % (self.IE_NAME, result['message']), expected=True) - elif 'code' in result: - raise ExtractorError('%s returns error %d' % (self.IE_NAME, result['code']), expected=True) - else: - raise ExtractorError('Can\'t extract Bangumi episode ID') + video_data = initial_state['videoData'] + video_id, title = video_data['bvid'], video_data.get('title') - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) + # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. 
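+        # the pagelist API returns one entry per part; more than one entry means
+        # this BV ID is an anthology, which is expanded into a playlist below
+        # unless a specific part was requested via the `p` query parameter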
+ page_list_json = traverse_obj( + self._download_json( + 'https://api.bilibili.com/x/player/pagelist', video_id, + fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, + note='Extracting videos in anthology'), + 'data', expected_type=list) or [] + is_anthology = len(page_list_json) > 1 + + part_id = int_or_none(parse_qs(url).get('p', [None])[-1]) + if is_anthology and not part_id and self._yes_playlist(video_id, video_id): + return self.playlist_from_matches( + page_list_json, video_id, title, ie=BiliBiliIE, + getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}') - mobj = self._match_valid_url(url) - video_id = mobj.group('id_bv') or mobj.group('id') + if is_anthology: + title += f' p{part_id:02d} {traverse_obj(page_list_json, ((part_id or 1) - 1, "part")) or ""}' - av_id, bv_id = self._get_video_id_set(video_id, mobj.group('id_bv') is not None) - video_id = av_id + aid = video_data.get('aid') + old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') - info = {} - anime_id = mobj.group('anime_id') - page_id = mobj.group('page') - webpage = self._download_webpage(url, video_id) + cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') - # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. - # If the video has no page argument, check to see if it's an anthology - if page_id is None: - if not self.get_param('noplaylist'): - r = self._extract_anthology_entries(bv_id, video_id, webpage) - if r is not None: - self.to_screen('Downloading anthology %s - add --no-playlist to just download video' % video_id) - return r - else: - self.to_screen('Downloading just video %s because of --no-playlist' % video_id) - - if 'anime/' not in url: - cid = self._search_regex( - r'\bcid(?:["\']:|=)(\d+),["\']page(?:["\']:|=)' + str(page_id), webpage, 'cid', - default=None - ) or self._search_regex( - r'\bcid(?:["\']:|=)(\d+)', webpage, 'cid', - default=None - ) or compat_parse_qs(self._search_regex( - [r'EmbedPlayer\([^)]+,\s*"([^"]+)"\)', - r'EmbedPlayer\([^)]+,\s*\\"([^"]+)\\"\)', - r'<iframe[^>]+src="https://secure\.bilibili\.com/secure,([^"]+)"'], - webpage, 'player parameters'))['cid'][0] - else: - if 'no_bangumi_tip' not in smuggled_data: - self.to_screen('Downloading episode %s. 
To download all videos in anime %s, re-run hypervideo with %s' % ( - video_id, anime_id, compat_urlparse.urljoin(url, '//bangumi.bilibili.com/anime/%s' % anime_id))) - headers = { - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'Referer': url - } - headers.update(self.geo_verification_headers()) - - js = self._download_json( - 'http://bangumi.bilibili.com/web_api/get_source', video_id, - data=urlencode_postdata({'episode_id': video_id}), - headers=headers) - if 'result' not in js: - self._report_error(js) - cid = js['result']['cid'] - - headers = { - 'Accept': 'application/json', - 'Referer': url + return { + 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', + 'formats': self.extract_formats(play_info), + '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, + 'title': title, + 'description': traverse_obj(initial_state, ('videoData', 'desc')), + 'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')), + 'uploader': traverse_obj(initial_state, ('upData', 'name')), + 'uploader_id': traverse_obj(initial_state, ('upData', 'mid')), + 'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')), + 'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')), + 'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')), + 'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')), + 'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')), + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'chapters': self._get_chapters(aid, cid), + 'subtitles': self.extract_subtitles(video_id, initial_state, cid), + '__post_extractor': self.extract_comments(aid), + 'http_headers': {'Referer': url}, } - headers.update(self.geo_verification_headers()) - video_info = self._parse_json( - self._search_regex(r'window.__playinfo__\s*=\s*({.+?})</script>', webpage, 'video info', default=None) or '{}', - video_id, fatal=False) - video_info = video_info.get('data') or {} - durl = traverse_obj(video_info, ('dash', 'video')) - audios = traverse_obj(video_info, ('dash', 'audio')) or [] - entries = [] +class BiliBiliBangumiIE(BilibiliBaseIE): + _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P<id>(?:ss|ep)\d+)' - RENDITIONS = ('qn=80&quality=80&type=', 'quality=2&type=mp4') - for num, rendition in enumerate(RENDITIONS, start=1): - payload = 'appkey=%s&cid=%s&otype=json&%s' % (self._APP_KEY, cid, rendition) - sign = hashlib.md5((payload + self._BILIBILI_KEY).encode('utf-8')).hexdigest() - if not video_info: - video_info = self._download_json( - 'http://interface.bilibili.com/v2/playurl?%s&sign=%s' % (payload, sign), - video_id, note='Downloading video info page', - headers=headers, fatal=num == len(RENDITIONS)) - if not video_info: - continue - - if not durl and 'durl' not in video_info: - if num < len(RENDITIONS): - continue - self._report_error(video_info) - - formats = [] - for idx, durl in enumerate(durl or video_info['durl']): - formats.append({ - 'url': durl.get('baseUrl') or durl.get('base_url') or durl.get('url'), - 'ext': mimetype2ext(durl.get('mimeType') or durl.get('mime_type')), - 'fps': int_or_none(durl.get('frameRate') or durl.get('frame_rate')), - 'width': int_or_none(durl.get('width')), - 'height': int_or_none(durl.get('height')), - 'vcodec': durl.get('codecs'), - 'acodec': 'none' if audios else None, - 'tbr': float_or_none(durl.get('bandwidth'), scale=1000), - 'filesize': int_or_none(durl.get('size')), - }) - for backup_url in traverse_obj(durl, 
'backup_url', expected_type=list) or []: - formats.append({ - 'url': backup_url, - 'quality': -2 if 'hd.mp4' in backup_url else -3, - }) - - for audio in audios: - formats.append({ - 'url': audio.get('baseUrl') or audio.get('base_url') or audio.get('url'), - 'ext': mimetype2ext(audio.get('mimeType') or audio.get('mime_type')), - 'fps': int_or_none(audio.get('frameRate') or audio.get('frame_rate')), - 'width': int_or_none(audio.get('width')), - 'height': int_or_none(audio.get('height')), - 'acodec': audio.get('codecs'), - 'vcodec': 'none', - 'tbr': float_or_none(audio.get('bandwidth'), scale=1000), - 'filesize': int_or_none(audio.get('size')) - }) - for backup_url in traverse_obj(audio, 'backup_url', expected_type=list) or []: - formats.append({ - 'url': backup_url, - # backup URLs have lower priorities - 'quality': -3, - }) - - info.update({ - 'id': video_id, - 'duration': float_or_none(durl.get('length'), 1000), - 'formats': formats, - 'http_headers': { - 'Referer': url, - }, - }) - break - - self._sort_formats(formats) - - title = self._html_search_regex(( - r'<h1[^>]+title=(["\'])(?P<content>[^"\']+)', - r'(?s)<h1[^>]*>(?P<content>.+?)</h1>', - self._meta_regex('title') - ), webpage, 'title', group='content', fatal=False) - - # Get part title for anthologies - if page_id is not None: - # TODO: The json is already downloaded by _extract_anthology_entries. Don't redownload for each video. - part_info = traverse_obj(self._download_json( - f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', - video_id, note='Extracting videos in anthology'), 'data', expected_type=list) - title = title if len(part_info) == 1 else traverse_obj(part_info, (int(page_id) - 1, 'part')) or title - - description = self._html_search_meta('description', webpage) - timestamp = unified_timestamp(self._html_search_regex( - r'<time[^>]+datetime="([^"]+)"', webpage, 'upload time', - default=None) or self._html_search_meta( - 'uploadDate', webpage, 'timestamp', default=None)) - thumbnail = self._html_search_meta(['og:image', 'thumbnailUrl'], webpage) - - # TODO 'view_count' requires deobfuscating Javascript - info.update({ - 'id': f'{video_id}_part{page_id or 1}', - 'cid': cid, - 'title': title, - 'description': description, - 'timestamp': timestamp, - 'thumbnail': thumbnail, - 'duration': float_or_none(video_info.get('timelength'), scale=1000), - }) - - uploader_mobj = re.search( - r'<a[^>]+href="(?:https?:)?//space\.bilibili\.com/(?P<id>\d+)"[^>]*>\s*(?P<name>[^<]+?)\s*<', - webpage) - if uploader_mobj: - info.update({ - 'uploader': uploader_mobj.group('name').strip(), - 'uploader_id': uploader_mobj.group('id'), - }) + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ss897', + 'info_dict': { + 'id': 'ss897', + 'ext': 'mp4', + 'series': '神的记事本', + 'season': '神的记事本', + 'season_id': 897, + 'season_number': 1, + 'episode': '你与旅行包', + 'episode_number': 2, + 'title': '神的记事本:第2话 你与旅行包', + 'duration': 1428.487, + 'timestamp': 1310809380, + 'upload_date': '20110716', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + }, { + 'url': 'https://www.bilibili.com/bangumi/play/ep508406', + 'only_matching': True, + }] - if not info.get('uploader'): - info['uploader'] = self._html_search_meta( - 'author', webpage, 'uploader', default=None) + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - top_level_info = { - 'tags': traverse_obj(self._download_json( - f'https://api.bilibili.com/x/tag/archive/tags?aid={video_id}', - video_id, 
fatal=False, note='Downloading tags'), ('data', ..., 'tag_name')), - } + if '您所在的地区无法观看本片' in webpage: + raise GeoRestrictedError('This video is restricted') + elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage + or '正在观看预览,大会员免费看全片' in webpage): + self.raise_login_required('This video is for premium members only') - info['subtitles'] = { - 'danmaku': [{ - 'ext': 'xml', - 'url': f'https://comment.bilibili.com/{cid}.xml', - }] - } + play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + formats = self.extract_formats(play_info) + if (not formats and '成为大会员抢先看' in webpage + and play_info.get('durl') and not play_info.get('dash')): + self.raise_login_required('This video is for premium members only') - r''' - # Requires https://github.com/m13253/danmaku2ass which is licenced under GPL3 - # See https://github.com/animelover1984/youtube-dl + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) - raw_danmaku = self._download_webpage( - f'https://comment.bilibili.com/{cid}.xml', video_id, fatal=False, note='Downloading danmaku comments') - danmaku = NiconicoIE.CreateDanmaku(raw_danmaku, commentType='Bilibili', x=1024, y=576) - entries[0]['subtitles'] = { - 'danmaku': [{ - 'ext': 'ass', - 'data': danmaku - }] + season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id')) + season_number = season_id and next(( + idx + 1 for idx, e in enumerate( + traverse_obj(initial_state, ('mediaInfo', 'seasons', ...))) + if e.get('season_id') == season_id + ), None) + + return { + 'id': video_id, + 'formats': formats, + 'title': traverse_obj(initial_state, 'h1Title'), + 'episode': traverse_obj(initial_state, ('epInfo', 'long_title')), + 'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))), + 'series': traverse_obj(initial_state, ('mediaInfo', 'series')), + 'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')), + 'season_id': season_id, + 'season_number': season_number, + 'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')), + 'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')), + 'duration': float_or_none(play_info.get('timelength'), scale=1000), + 'subtitles': self.extract_subtitles( + video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))), + '__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))), + 'http_headers': {'Referer': url, **self.geo_verification_headers()}, } - ''' - top_level_info['__post_extractor'] = self.extract_comments(video_id) - for entry in entries: - entry.update(info) +class BiliBiliBangumiMediaIE(InfoExtractor): + _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/media/md24097891', + 'info_dict': { + 'id': '24097891', + }, + 'playlist_mincount': 25, + }] - if len(entries) == 1: - entries[0].update(top_level_info) - return entries[0] + def _real_extract(self, url): + media_id = self._match_id(url) + webpage = self._download_webpage(url, media_id) - for idx, entry in enumerate(entries): - entry['id'] = '%s_part%d' % (video_id, (idx + 1)) + initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) + episode_list = self._download_json( + 'https://api.bilibili.com/pgc/web/season/section', media_id, + query={'season_id': initial_state['mediaInfo']['season_id']}, + note='Downloading season info')['result']['main_section']['episodes'] - return { - 
'id': str(video_id), - 'bv_id': bv_id, - 'title': title, - 'description': description, - **info, **top_level_info - } + return self.playlist_result(( + self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid']) + for entry in episode_list), media_id) - def _extract_anthology_entries(self, bv_id, video_id, webpage): - title = self._html_search_regex( - (r'<h1[^>]+\btitle=(["\'])(?P<title>(?:(?!\1).)+)\1', - r'(?s)<h1[^>]*>(?P<title>.+?)</h1>', - r'<title>(?P<title>.+?)</title>'), webpage, 'title', - group='title') - json_data = self._download_json( - f'https://api.bilibili.com/x/player/pagelist?bvid={bv_id}&jsonp=jsonp', - video_id, note='Extracting videos in anthology') - - if json_data['data']: - return self.playlist_from_matches( - json_data['data'], bv_id, title, ie=BiliBiliIE.ie_key(), - getter=lambda entry: 'https://www.bilibili.com/video/%s?p=%d' % (bv_id, entry['page'])) - - def _get_video_id_set(self, id, is_bv): - query = {'bvid': id} if is_bv else {'aid': id} - response = self._download_json( - "http://api.bilibili.cn/x/web-interface/view", - id, query=query, - note='Grabbing original ID via API') - - if response['code'] == -400: - raise ExtractorError('Video ID does not exist', expected=True, video_id=id) - elif response['code'] != 0: - raise ExtractorError(f'Unknown error occurred during API check (code {response["code"]})', - expected=True, video_id=id) - return response['data']['aid'], response['data']['bvid'] - - def _get_comments(self, video_id, commentPageNumber=0): - for idx in itertools.count(1): - replies = traverse_obj( - self._download_json( - f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685', - video_id, note=f'Extracting comments from page {idx}', fatal=False), - ('data', 'replies')) - if not replies: - return - for children in map(self._get_all_children, replies): - yield from children - def _get_all_children(self, reply): - yield { - 'author': traverse_obj(reply, ('member', 'uname')), - 'author_id': traverse_obj(reply, ('member', 'mid')), - 'id': reply.get('rpid'), - 'text': traverse_obj(reply, ('content', 'message')), - 'timestamp': reply.get('ctime'), - 'parent': reply.get('parent') or 'root', - } - for children in map(self._get_all_children, reply.get('replies') or []): - yield from children +class BilibiliSpaceBaseIE(InfoExtractor): + def _extract_playlist(self, fetch_page, get_metadata, get_entries): + first_page = fetch_page(0) + metadata = get_metadata(first_page) + paged_list = InAdvancePagedList( + lambda idx: get_entries(fetch_page(idx) if idx else first_page), + metadata['page_count'], metadata['page_size']) -class BiliBiliBangumiIE(InfoExtractor): - _VALID_URL = r'https?://bangumi\.bilibili\.com/anime/(?P<id>\d+)' + return metadata, paged_list - IE_NAME = 'bangumi.bilibili.com' - IE_DESC = 'BiliBili番剧' +class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)(?P<video>/video)?/?(?:[?#]|$)' _TESTS = [{ - 'url': 'http://bangumi.bilibili.com/anime/1869', - 'info_dict': { - 'id': '1869', - 'title': '混沌武士', - 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', - }, - 'playlist_count': 26, - }, { - 'url': 'http://bangumi.bilibili.com/anime/1869', + 'url': 'https://space.bilibili.com/3985676/video', 'info_dict': { - 'id': '1869', - 'title': '混沌武士', - 'description': 'md5:6a9622b911565794c11f25f81d6a97d2', - }, - 'playlist': [{ - 'md5': '91da8621454dd58316851c27c68b0c13', - 'info_dict': { - 'id': '40062', - 'ext': 'mp4', - 'title': '混沌武士', - 
'description': '故事发生在日本的江户时代。风是一个小酒馆的打工女。一日,酒馆里来了一群恶霸,虽然他们的举动令风十分不满,但是毕竟风只是一届女流,无法对他们采取什么行动,只能在心里嘟哝。这时,酒家里又进来了个“不良份子...', - 'timestamp': 1414538739, - 'upload_date': '20141028', - 'episode': '疾风怒涛 Tempestuous Temperaments', - 'episode_number': 1, - }, - }], - 'params': { - 'playlist_items': '1', + 'id': '3985676', }, + 'playlist_mincount': 178, }] - @classmethod - def suitable(cls, url): - return False if BiliBiliIE.suitable(url) else super(BiliBiliBangumiIE, cls).suitable(url) - def _real_extract(self, url): - bangumi_id = self._match_id(url) - - # Sometimes this API returns a JSONP response - season_info = self._download_json( - 'http://bangumi.bilibili.com/jsonp/seasoninfo/%s.ver' % bangumi_id, - bangumi_id, transform_source=strip_jsonp)['result'] + playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video') + if not is_video_url: + self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. ' + 'To download audios, add a "/audio" to the URL') - entries = [{ - '_type': 'url_transparent', - 'url': smuggle_url(episode['webplay_url'], {'no_bangumi_tip': 1}), - 'ie_key': BiliBiliIE.ie_key(), - 'timestamp': parse_iso8601(episode.get('update_time'), delimiter=' '), - 'episode': episode.get('index_title'), - 'episode_number': int_or_none(episode.get('index')), - } for episode in season_info['episodes']] + def fetch_page(page_idx): + try: + response = self._download_json('https://api.bilibili.com/x/space/arc/search', + playlist_id, note=f'Downloading page {page_idx}', + query={'mid': playlist_id, 'pn': page_idx + 1, 'jsonp': 'jsonp'}) + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412: + raise ExtractorError( + 'Request is blocked by server (412), please add cookies, wait and try later.', expected=True) + raise + if response['code'] == -401: + raise ExtractorError( + 'Request is blocked by server (401), please add cookies, wait and try later.', expected=True) + return response['data'] + + def get_metadata(page_data): + page_size = page_data['page']['ps'] + entry_count = page_data['page']['count'] + return { + 'page_count': math.ceil(entry_count / page_size), + 'page_size': page_size, + } - entries = sorted(entries, key=lambda entry: entry.get('episode_number')) + def get_entries(page_data): + for entry in traverse_obj(page_data, ('list', 'vlist')) or []: + yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}', BiliBiliIE, entry['bvid']) - return self.playlist_result( - entries, bangumi_id, - season_info.get('bangumi_title'), season_info.get('evaluate')) + metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) + return self.playlist_result(paged_list, playlist_id) -class BilibiliChannelIE(InfoExtractor): - _VALID_URL = r'https?://space.bilibili\.com/(?P<id>\d+)' - _API_URL = "https://api.bilibili.com/x/space/arc/search?mid=%s&pn=%d&jsonp=jsonp" +class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE): + _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio' _TESTS = [{ - 'url': 'https://space.bilibili.com/3985676/video', - 'info_dict': {}, - 'playlist_mincount': 112, + 'url': 'https://space.bilibili.com/3985676/audio', + 'info_dict': { + 'id': '3985676', + }, + 'playlist_mincount': 1, }] - def _entries(self, list_id): - count, max_count = 0, None + def _real_extract(self, url): + playlist_id = self._match_id(url) + + def fetch_page(page_idx): + return self._download_json( + 'https://api.bilibili.com/audio/music-service/web/song/upper', playlist_id, + 
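+                # `pn` is the 1-based page number; `ps` fixes the page size at 30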
note=f'Downloading page {page_idx}', + query={'uid': playlist_id, 'pn': page_idx + 1, 'ps': 30, 'order': 1, 'jsonp': 'jsonp'})['data'] + + def get_metadata(page_data): + return { + 'page_count': page_data['pageCount'], + 'page_size': page_data['pageSize'], + } - for page_num in itertools.count(1): - data = self._download_json( - self._API_URL % (list_id, page_num), list_id, note=f'Downloading page {page_num}')['data'] + def get_entries(page_data): + for entry in page_data.get('data', []): + yield self.url_result(f'https://www.bilibili.com/audio/au{entry["id"]}', BilibiliAudioIE, entry['id']) - max_count = max_count or traverse_obj(data, ('page', 'count')) + metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) + return self.playlist_result(paged_list, playlist_id) - entries = traverse_obj(data, ('list', 'vlist')) - if not entries: - return - for entry in entries: - yield self.url_result( - 'https://www.bilibili.com/video/%s' % entry['bvid'], - BiliBiliIE.ie_key(), entry['bvid']) - count += len(entries) - if max_count and count >= max_count: - return +class BilibiliSpacePlaylistIE(BilibiliSpaceBaseIE): + _VALID_URL = r'https?://space.bilibili\.com/(?P<mid>\d+)/channel/collectiondetail\?sid=(?P<sid>\d+)' + _TESTS = [{ + 'url': 'https://space.bilibili.com/2142762/channel/collectiondetail?sid=57445', + 'info_dict': { + 'id': '2142762_57445', + 'title': '《底特律 变人》' + }, + 'playlist_mincount': 31, + }] def _real_extract(self, url): - list_id = self._match_id(url) - return self.playlist_result(self._entries(list_id), list_id) + mid, sid = self._match_valid_url(url).group('mid', 'sid') + playlist_id = f'{mid}_{sid}' + + def fetch_page(page_idx): + return self._download_json( + 'https://api.bilibili.com/x/polymer/space/seasons_archives_list', + playlist_id, note=f'Downloading page {page_idx}', + query={'mid': mid, 'season_id': sid, 'page_num': page_idx + 1, 'page_size': 30})['data'] + + def get_metadata(page_data): + page_size = page_data['page']['page_size'] + entry_count = page_data['page']['total'] + return { + 'page_count': math.ceil(entry_count / page_size), + 'page_size': page_size, + 'title': traverse_obj(page_data, ('meta', 'name')) + } + + def get_entries(page_data): + for entry in page_data.get('archives', []): + yield self.url_result(f'https://www.bilibili.com/video/{entry["bvid"]}', + BiliBiliIE, entry['bvid']) + + metadata, paged_list = self._extract_playlist(fetch_page, get_metadata, get_entries) + return self.playlist_result(paged_list, playlist_id, metadata['title']) class BilibiliCategoryIE(InfoExtractor): @@ -568,8 +620,7 @@ class BilibiliCategoryIE(InfoExtractor): self._fetch_page, api_url, num_pages, query), size) def _real_extract(self, url): - u = compat_urllib_parse_urlparse(url) - category, subcategory = u.path.split('/')[2:4] + category, subcategory = urllib.parse.urlparse(url).path.split('/')[2:4] query = '%s: %s' % (category, subcategory) return self.playlist_result(self._entries(category, subcategory, query), query, query) @@ -589,14 +640,15 @@ class BiliBiliSearchIE(SearchInfoExtractor): 'keyword': query, 'page': page_num, 'context': '', - 'order': 'pubdate', 'duration': 0, 'tids_2': '', '__refresh__': 'true', 'search_type': 'video', 'tids': 0, 'highlight': 1, - })['data'].get('result') or [] + })['data'].get('result') + if not videos: + break for video in videos: yield self.url_result(video['arcurl'], 'BiliBili', str(video['aid'])) @@ -646,6 +698,11 @@ class BilibiliAudioIE(BilibiliAudioBaseIE): 'vcodec': 'none' }] + for a_format in 
formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + song = self._call_api('song/info', au_id) title = song['title'] statistic = song.get('statistic') or {} @@ -753,17 +810,20 @@ class BiliIntlBaseIE(InfoExtractor): def json2srt(self, json): data = '\n\n'.join( f'{i + 1}\n{srt_subtitles_timecode(line["from"])} --> {srt_subtitles_timecode(line["to"])}\n{line["content"]}' - for i, line in enumerate(json['body']) if line.get('content')) + for i, line in enumerate(traverse_obj(json, ( + 'body', lambda _, l: l['content'] and l['from'] and l['to'])))) return data def _get_subtitles(self, *, ep_id=None, aid=None): sub_json = self._call_api( - '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list', - errnote='Unable to download subtitles list', query=filter_dict({ + '/web/v2/subtitle', ep_id or aid, fatal=False, + note='Downloading subtitles list', errnote='Unable to download subtitles list', + query=filter_dict({ 'platform': 'web', + 's_locale': 'en_US', 'episode_id': ep_id, 'aid': aid, - })) + })) or {} subtitles = {} for sub in sub_json.get('subtitles') or []: sub_url = sub.get('url') @@ -818,7 +878,6 @@ class BiliIntlBaseIE(InfoExtractor): 'filesize': aud.get('size'), }) - self._sort_formats(formats) return formats def _extract_video_info(self, video_data, *, ep_id=None, aid=None): @@ -866,7 +925,7 @@ class BiliIntlBaseIE(InfoExtractor): class BiliIntlIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))' _TESTS = [{ # Bstation page 'url': 'https://www.bilibili.tv/en/play/34613/341736', @@ -909,6 +968,10 @@ class BiliIntlIE(BiliIntlBaseIE): # No language in URL 'url': 'https://www.bilibili.tv/video/2019955076', 'only_matching': True, + }, { + # Uppercase language in URL + 'url': 'https://www.bilibili.tv/EN/video/2019955076', + 'only_matching': True, }] def _real_extract(self, url): @@ -916,12 +979,11 @@ class BiliIntlIE(BiliIntlBaseIE): video_id = ep_id or aid webpage = self._download_webpage(url, video_id) # Bstation layout - initial_data = self._parse_json(self._search_regex( - r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage, - 'preload state', default='{}'), video_id, fatal=False) or {} - video_data = ( - traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) - or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {}) + initial_data = ( + self._search_json(r'window\.__INITIAL_(?:DATA|STATE)__\s*=', webpage, 'preload state', video_id, default={}) + or self._search_nuxt_data(webpage, video_id, '__initialState', fatal=False, traverse=None)) + video_data = traverse_obj( + initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict) if season_id and not video_data: # Non-Bstation layout, read through episode list @@ -929,11 +991,11 @@ class BiliIntlIE(BiliIntlBaseIE): video_data = traverse_obj(season_json, ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), expected_type=dict, get_all=False) - return self._extract_video_info(video_data, ep_id=ep_id, aid=aid) + return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid) class BiliIntlSeriesIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<id>\d+)$' + _VALID_URL = 
r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?play/(?P<id>\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613', 'playlist_mincount': 15, @@ -951,6 +1013,9 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): }, { 'url': 'https://www.biliintl.com/en/play/34613', 'only_matching': True, + }, { + 'url': 'https://www.biliintl.com/EN/play/34613', + 'only_matching': True, }] def _entries(self, series_id): @@ -966,3 +1031,87 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): self._entries(series_id), series_id, series_info.get('title'), series_info.get('description'), categories=traverse_obj(series_info, ('styles', ..., 'title'), expected_type=str_or_none), thumbnail=url_or_none(series_info.get('horizontal_cover')), view_count=parse_count(series_info.get('view'))) + + +class BiliLiveIE(InfoExtractor): + _VALID_URL = r'https?://live.bilibili.com/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://live.bilibili.com/196', + 'info_dict': { + 'id': '33989', + 'description': "周六杂谈回,其他时候随机游戏。 | \n录播:@下播型泛式录播组。 | \n直播通知群(全员禁言):666906670,902092584,59971⑧481 (功能一样,别多加)", + 'ext': 'flv', + 'title': "太空狼人杀联动,不被爆杀就算赢", + 'thumbnail': "https://i0.hdslb.com/bfs/live/new_room_cover/e607bc1529057ef4b332e1026e62cf46984c314d.jpg", + 'timestamp': 1650802769, + }, + 'skip': 'not live' + }, { + 'url': 'https://live.bilibili.com/196?broadcast_type=0&is_room_feed=1?spm_id_from=333.999.space_home.strengthen_live_card.click', + 'only_matching': True + }] + + _FORMATS = { + 80: {'format_id': 'low', 'format_note': '流畅'}, + 150: {'format_id': 'high_res', 'format_note': '高清'}, + 250: {'format_id': 'ultra_high_res', 'format_note': '超清'}, + 400: {'format_id': 'blue_ray', 'format_note': '蓝光'}, + 10000: {'format_id': 'source', 'format_note': '原画'}, + 20000: {'format_id': '4K', 'format_note': '4K'}, + 30000: {'format_id': 'dolby', 'format_note': '杜比'}, + } + + _quality = staticmethod(qualities(list(_FORMATS))) + + def _call_api(self, path, room_id, query): + api_result = self._download_json(f'https://api.live.bilibili.com/{path}', room_id, query=query) + if api_result.get('code') != 0: + raise ExtractorError(api_result.get('message') or 'Unable to download JSON metadata') + return api_result.get('data') or {} + + def _parse_formats(self, qn, fmt): + for codec in fmt.get('codec') or []: + if codec.get('current_qn') != qn: + continue + for url_info in codec['url_info']: + yield { + 'url': f'{url_info["host"]}{codec["base_url"]}{url_info["extra"]}', + 'ext': fmt.get('format_name'), + 'vcodec': codec.get('codec_name'), + 'quality': self._quality(qn), + **self._FORMATS[qn], + } + + def _real_extract(self, url): + room_id = self._match_id(url) + room_data = self._call_api('room/v1/Room/get_info', room_id, {'id': room_id}) + if room_data.get('live_status') == 0: + raise ExtractorError('Streamer is not live', expected=True) + + formats = [] + for qn in self._FORMATS.keys(): + stream_data = self._call_api('xlive/web-room/v2/index/getRoomPlayInfo', room_id, { + 'room_id': room_id, + 'qn': qn, + 'codec': '0,1', + 'format': '0,2', + 'mask': '0', + 'no_playurl': '0', + 'platform': 'web', + 'protocol': '0,1', + }) + for fmt in traverse_obj(stream_data, ('playurl_info', 'playurl', 'stream', ..., 'format', ...)) or []: + formats.extend(self._parse_formats(qn, fmt)) + + return { + 'id': room_id, + 'title': room_data.get('title'), + 'description': room_data.get('description'), + 'thumbnail': room_data.get('user_cover'), + 'timestamp': stream_data.get('live_time'), + 'formats': formats, + 'http_headers': { + 'Referer': url, + }, + 
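+            # the room page URL is sent as Referer; the stream hosts appear to
+            # require it, so it is attached via the info dict's http_headers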
} diff --git a/hypervideo_dl/extractor/biobiochiletv.py b/hypervideo_dl/extractor/biobiochiletv.py index dc86c57..180c965 100644 --- a/hypervideo_dl/extractor/biobiochiletv.py +++ b/hypervideo_dl/extractor/biobiochiletv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/biqle.py b/hypervideo_dl/extractor/biqle.py index 2b57bad..0277535 100644 --- a/hypervideo_dl/extractor/biqle.py +++ b/hypervideo_dl/extractor/biqle.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .vk import VKIE from ..compat import compat_b64decode @@ -89,7 +86,6 @@ class BIQLEIE(InfoExtractor): 'height': int_or_none(height), 'ext': ext, }) - self._sort_formats(formats) thumbnails = [] for k, v in item.items(): diff --git a/hypervideo_dl/extractor/bitchute.py b/hypervideo_dl/extractor/bitchute.py index dcae6f4..10e7b0b 100644 --- a/hypervideo_dl/extractor/bitchute.py +++ b/hypervideo_dl/extractor/bitchute.py @@ -1,14 +1,20 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import itertools +import functools import re from .common import InfoExtractor from ..utils import ( ExtractorError, - GeoRestrictedError, + HEADRequest, + OnDemandPagedList, + clean_html, + get_element_by_class, + get_element_by_id, + get_elements_html_by_class, + int_or_none, orderedSet, + parse_count, + parse_duration, + traverse_obj, unified_strdate, urlencode_postdata, ) @@ -16,11 +22,12 @@ from ..utils import ( class BitChuteIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?:video|embed|torrent/[^/]+)/(?P<id>[^/?#&]+)' + _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'https://www.bitchute.com/video/UGlrF9o9b-Q/', 'md5': '7e427d7ed7af5a75b5855705ec750e2b', 'info_dict': { - 'id': 'szoMrox2JEI', + 'id': 'UGlrF9o9b-Q', 'ext': 'mp4', 'title': 'This is the first video on #BitChute !', 'description': 'md5:a0337e7b1fe39e32336974af8173a034', @@ -29,130 +36,198 @@ class BitChuteIE(InfoExtractor): 'upload_date': '20170103', }, }, { + # video not downloadable in browser, but we can recover it + 'url': 'https://www.bitchute.com/video/2s6B3nZjAk7R/', + 'md5': '05c12397d5354bf24494885b08d24ed1', + 'info_dict': { + 'id': '2s6B3nZjAk7R', + 'ext': 'mp4', + 'filesize': 71537926, + 'title': 'STYXHEXENHAMMER666 - Election Fraud, Clinton 2020, EU Armies, and Gun Control', + 'description': 'md5:228ee93bd840a24938f536aeac9cf749', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'BitChute', + 'upload_date': '20181113', + }, + 'params': {'check_formats': None}, + }, { + # restricted video + 'url': 'https://www.bitchute.com/video/WEnQU7XGcTdl/', + 'info_dict': { + 'id': 'WEnQU7XGcTdl', + 'ext': 'mp4', + 'title': 'Impartial Truth - Ein Letzter Appell an die Vernunft', + }, + 'params': {'skip_download': True}, + 'skip': 'Georestricted in DE', + }, { 'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/', 'only_matching': True, }, { 'url': 'https://www.bitchute.com/torrent/Zee5BE49045h/szoMrox2JEI.webtorrent', 'only_matching': True, }] + _GEO_BYPASS = False - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL, - webpage)] + _HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 
Safari/537.36', + 'Referer': 'https://www.bitchute.com/', + } + + def _check_format(self, video_url, video_id): + urls = orderedSet( + re.sub(r'(^https?://)(seed\d+)(?=\.bitchute\.com)', fr'\g<1>{host}', video_url) + for host in (r'\g<2>', 'seed150', 'seed151', 'seed152', 'seed153')) + for url in urls: + try: + response = self._request_webpage( + HEADRequest(url), video_id=video_id, note=f'Checking {url}', headers=self._HEADERS) + except ExtractorError as e: + self.to_screen(f'{video_id}: URL is invalid, skipping: {e.cause}') + continue + return { + 'url': url, + 'filesize': int_or_none(response.headers.get('Content-Length')) + } + + def _raise_if_restricted(self, webpage): + page_title = clean_html(get_element_by_class('page-title', webpage)) or '' + if re.fullmatch(r'(?:Channel|Video) Restricted', page_title): + reason = clean_html(get_element_by_id('page-detail', webpage)) or page_title + self.raise_geo_restricted(reason) def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://www.bitchute.com/video/%s' % video_id, video_id, headers={ - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.57 Safari/537.36', - }) + f'https://www.bitchute.com/video/{video_id}', video_id, headers=self._HEADERS) - title = self._html_search_regex( - (r'<[^>]+\bid=["\']video-title[^>]+>([^<]+)', r'<title>([^<]+)'), - webpage, 'title', default=None) or self._html_search_meta( - 'description', webpage, 'title', - default=None) or self._og_search_description(webpage) + self._raise_if_restricted(webpage) + publish_date = clean_html(get_element_by_class('video-publish-date', webpage)) + entries = self._parse_html5_media_entries(url, webpage, video_id) - format_urls = [] - for mobj in re.finditer( - r'addWebSeed\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage): - format_urls.append(mobj.group('url')) - format_urls.extend(re.findall(r'as=(https?://[^&"\']+)', webpage)) - - formats = [ - {'url': format_url} - for format_url in orderedSet(format_urls)] + formats = [] + for format_ in traverse_obj(entries, (0, 'formats', ...)): + if self.get_param('check_formats') is not False: + format_.update(self._check_format(format_.pop('url'), video_id) or {}) + if 'url' not in format_: + continue + formats.append(format_) if not formats: - entries = self._parse_html5_media_entries( - url, webpage, video_id) - if not entries: - error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video') - if error == 'Video Unavailable': - raise GeoRestrictedError(error) - raise ExtractorError(error) - formats = entries[0]['formats'] - - self._check_formats(formats, video_id) - self._sort_formats(formats) - - description = self._html_search_regex( - r'(?s)<div\b[^>]+\bclass=["\']full hidden[^>]+>(.+?)</div>', - webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail( - webpage, default=None) or self._html_search_meta( - 'twitter:image:src', webpage, 'thumbnail') - uploader = self._html_search_regex( - (r'(?s)<div class=["\']channel-banner.*?<p\b[^>]+\bclass=["\']name[^>]+>(.+?)</p>', - r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'), - webpage, 'uploader', fatal=False) - - upload_date = unified_strdate(self._search_regex( - r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.', - webpage, 'upload date', fatal=False)) + self.raise_no_formats( + 'Video is unavailable. 
Please make sure this video is playable in the browser ' + 'before reporting this issue.', expected=True, video_id=video_id) return { 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'upload_date': upload_date, + 'title': self._html_extract_title(webpage) or self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': clean_html(get_element_by_class('owner', webpage)), + 'upload_date': unified_strdate(self._search_regex( + r'at \d+:\d+ UTC on (.+?)\.', publish_date, 'upload date', fatal=False)), 'formats': formats, } class BitChuteChannelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?bitchute\.com/channel/(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'https://www.bitchute.com/channel/victoriaxrave/', - 'playlist_mincount': 185, + _VALID_URL = r'https?://(?:www\.)?bitchute\.com/(?P<type>channel|playlist)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.bitchute.com/channel/bitchute/', 'info_dict': { - 'id': 'victoriaxrave', + 'id': 'bitchute', + 'title': 'BitChute', + 'description': 'md5:5329fb3866125afa9446835594a9b138', }, - } + 'playlist': [ + { + 'md5': '7e427d7ed7af5a75b5855705ec750e2b', + 'info_dict': { + 'id': 'UGlrF9o9b-Q', + 'ext': 'mp4', + 'filesize': None, + 'title': 'This is the first video on #BitChute !', + 'description': 'md5:a0337e7b1fe39e32336974af8173a034', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'BitChute', + 'upload_date': '20170103', + 'duration': 16, + 'view_count': int, + }, + } + ], + 'params': { + 'skip_download': True, + 'playlist_items': '-1', + }, + }, { + 'url': 'https://www.bitchute.com/playlist/wV9Imujxasw9/', + 'playlist_mincount': 20, + 'info_dict': { + 'id': 'wV9Imujxasw9', + 'title': 'Bruce MacDonald and "The Light of Darkness"', + 'description': 'md5:04913227d2714af1d36d804aa2ab6b1e', + } + }] _TOKEN = 'zyG6tQcGPE5swyAEFLqKUwMuMMuF6IO2DZ6ZDQjGfsL0e4dcTLwqkTTul05Jdve7' + PAGE_SIZE = 25 + HTML_CLASS_NAMES = { + 'channel': { + 'container': 'channel-videos-container', + 'title': 'channel-videos-title', + 'description': 'channel-videos-text', + }, + 'playlist': { + 'container': 'playlist-video', + 'title': 'title', + 'description': 'description', + } - def _entries(self, channel_id): - channel_url = 'https://www.bitchute.com/channel/%s/' % channel_id - offset = 0 - for page_num in itertools.count(1): - data = self._download_json( - '%sextend/' % channel_url, channel_id, - 'Downloading channel page %d' % page_num, - data=urlencode_postdata({ - 'csrfmiddlewaretoken': self._TOKEN, - 'name': '', - 'offset': offset, - }), headers={ - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - 'Referer': channel_url, - 'X-Requested-With': 'XMLHttpRequest', - 'Cookie': 'csrftoken=%s' % self._TOKEN, - }) - if data.get('success') is False: - break - html = data.get('html') - if not html: - break - video_ids = re.findall( - r'class=["\']channel-videos-image-container[^>]+>\s*<a\b[^>]+\bhref=["\']/video/([^"\'/]+)', - html) - if not video_ids: - break - offset += len(video_ids) - for video_id in video_ids: - yield self.url_result( - 'https://www.bitchute.com/video/%s' % video_id, - ie=BitChuteIE.ie_key(), video_id=video_id) + } + + @staticmethod + def _make_url(playlist_id, playlist_type): + return f'https://www.bitchute.com/{playlist_type}/{playlist_id}/' + + def _fetch_page(self, playlist_id, playlist_type, page_num): + playlist_url = self._make_url(playlist_id, 
playlist_type) + data = self._download_json( + f'{playlist_url}extend/', playlist_id, f'Downloading page {page_num}', + data=urlencode_postdata({ + 'csrfmiddlewaretoken': self._TOKEN, + 'name': '', + 'offset': page_num * self.PAGE_SIZE, + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', + 'Referer': playlist_url, + 'X-Requested-With': 'XMLHttpRequest', + 'Cookie': f'csrftoken={self._TOKEN}', + }) + if not data.get('success'): + return + classes = self.HTML_CLASS_NAMES[playlist_type] + for video_html in get_elements_html_by_class(classes['container'], data.get('html')): + video_id = self._search_regex( + r'<a\s[^>]*\bhref=["\']/video/([^"\'/]+)', video_html, 'video id', default=None) + if not video_id: + continue + yield self.url_result( + f'https://www.bitchute.com/video/{video_id}', BitChuteIE, video_id, url_transparent=True, + title=clean_html(get_element_by_class(classes['title'], video_html)), + description=clean_html(get_element_by_class(classes['description'], video_html)), + duration=parse_duration(get_element_by_class('video-duration', video_html)), + view_count=parse_count(clean_html(get_element_by_class('video-views', video_html)))) def _real_extract(self, url): - channel_id = self._match_id(url) + playlist_type, playlist_id = self._match_valid_url(url).group('type', 'id') + webpage = self._download_webpage(self._make_url(playlist_id, playlist_type), playlist_id) + + page_func = functools.partial(self._fetch_page, playlist_id, playlist_type) return self.playlist_result( - self._entries(channel_id), playlist_id=channel_id) + OnDemandPagedList(page_func, self.PAGE_SIZE), playlist_id, + title=self._html_extract_title(webpage, default=None), + description=self._html_search_meta( + ('description', 'og:description', 'twitter:description'), webpage, default=None), + playlist_count=int_or_none(self._html_search_regex( + r'<span>(\d+)\s+videos?</span>', webpage, 'playlist count', default=None))) diff --git a/hypervideo_dl/extractor/bitwave.py b/hypervideo_dl/extractor/bitwave.py index e6e093f..a82cd26 100644 --- a/hypervideo_dl/extractor/bitwave.py +++ b/hypervideo_dl/extractor/bitwave.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor @@ -47,7 +45,6 @@ class BitwaveStreamIE(InfoExtractor): formats = self._extract_m3u8_formats( channel['data']['url'], username, 'mp4') - self._sort_formats(formats) return { 'id': username, diff --git a/hypervideo_dl/extractor/blackboardcollaborate.py b/hypervideo_dl/extractor/blackboardcollaborate.py index 8ae2941..8f41c89 100644 --- a/hypervideo_dl/extractor/blackboardcollaborate.py +++ b/hypervideo_dl/extractor/blackboardcollaborate.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import parse_iso8601 diff --git a/hypervideo_dl/extractor/bleacherreport.py b/hypervideo_dl/extractor/bleacherreport.py index d1bf8e8..8d8fabe 100644 --- a/hypervideo_dl/extractor/bleacherreport.py +++ b/hypervideo_dl/extractor/bleacherreport.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .amp import AMPIE from ..utils import ( diff --git a/hypervideo_dl/extractor/blinkx.py b/hypervideo_dl/extractor/blinkx.py deleted file mode 100644 index d70a3b3..0000000 --- a/hypervideo_dl/extractor/blinkx.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - 
remove_start, - int_or_none, -) - - -class BlinkxIE(InfoExtractor): - _VALID_URL = r'(?:https?://(?:www\.)blinkx\.com/#?ce/|blinkx:)(?P<id>[^?]+)' - IE_NAME = 'blinkx' - - _TEST = { - 'url': 'http://www.blinkx.com/ce/Da0Gw3xc5ucpNduzLuDDlv4WC9PuI4fDi1-t6Y3LyfdY2SZS5Urbvn-UPJvrvbo8LTKTc67Wu2rPKSQDJyZeeORCR8bYkhs8lI7eqddznH2ofh5WEEdjYXnoRtj7ByQwt7atMErmXIeYKPsSDuMAAqJDlQZ-3Ff4HJVeH_s3Gh8oQ', - 'md5': '337cf7a344663ec79bf93a526a2e06c7', - 'info_dict': { - 'id': 'Da0Gw3xc', - 'ext': 'mp4', - 'title': 'No Daily Show for John Oliver; HBO Show Renewed - IGN News', - 'uploader': 'IGN News', - 'upload_date': '20150217', - 'timestamp': 1424215740, - 'description': 'HBO has renewed Last Week Tonight With John Oliver for two more seasons.', - 'duration': 47.743333, - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - display_id = video_id[:8] - - api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' - + 'video=%s' % video_id) - data_json = self._download_webpage(api_url, display_id) - data = json.loads(data_json)['api']['results'][0] - duration = None - thumbnails = [] - formats = [] - for m in data['media']: - if m['type'] == 'jpg': - thumbnails.append({ - 'url': m['link'], - 'width': int(m['w']), - 'height': int(m['h']), - }) - elif m['type'] == 'original': - duration = float(m['d']) - elif m['type'] == 'youtube': - yt_id = m['link'] - self.to_screen('Youtube video detected: %s' % yt_id) - return self.url_result(yt_id, 'Youtube', video_id=yt_id) - elif m['type'] in ('flv', 'mp4'): - vcodec = remove_start(m['vcodec'], 'ff') - acodec = remove_start(m['acodec'], 'ff') - vbr = int_or_none(m.get('vbr') or m.get('vbitrate'), 1000) - abr = int_or_none(m.get('abr') or m.get('abitrate'), 1000) - tbr = vbr + abr if vbr and abr else None - format_id = '%s-%sk-%s' % (vcodec, tbr, m['w']) - formats.append({ - 'format_id': format_id, - 'url': m['link'], - 'vcodec': vcodec, - 'acodec': acodec, - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(m.get('w')), - 'height': int_or_none(m.get('h')), - }) - - self._sort_formats(formats) - - return { - 'id': display_id, - 'fullid': video_id, - 'title': data['title'], - 'formats': formats, - 'uploader': data.get('channel_name'), - 'timestamp': data.get('pubdate_epoch'), - 'description': data.get('description'), - 'thumbnails': thumbnails, - 'duration': duration, - } diff --git a/hypervideo_dl/extractor/blogger.py b/hypervideo_dl/extractor/blogger.py index dba131c..3d6e033 100644 --- a/hypervideo_dl/extractor/blogger.py +++ b/hypervideo_dl/extractor/blogger.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from ..utils import ( mimetype2ext, parse_duration, @@ -16,7 +11,7 @@ from .common import InfoExtractor class BloggerIE(InfoExtractor): IE_NAME = 'blogger.com' _VALID_URL = r'https?://(?:www\.)?blogger\.com/video\.g\?token=(?P<id>.+)' - _VALID_EMBED = r'''<iframe[^>]+src=["']((?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']''' + _EMBED_REGEX = [r'''<iframe[^>]+src=["'](?P<url>(?:https?:)?//(?:www\.)?blogger\.com/video\.g\?token=[^"']+)["']'''] _TESTS = [{ 'url': 'https://www.blogger.com/video.g?token=AD6v5dzEe9hfcARr5Hlq1WTkYy6t-fXH3BBahVhGvVHe5szdEUBEloSEDSTA8-b111089KbfWuBvTN7fnbxMtymsHhXAXwVvyzHH4Qch2cfLQdGxKQrrEuFpC1amSl_9GuLWODjPgw', 'md5': 'f1bc19b6ea1b0fd1d81e84ca9ec467ac', @@ -29,10 +24,6 @@ class BloggerIE(InfoExtractor): } }] - @staticmethod - def _extract_urls(webpage): - return re.findall(BloggerIE._VALID_EMBED, webpage) - def _real_extract(self, url): 
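
BloggerIE here is one of many extractors migrated from a static `_extract_urls()` helper to the declarative `_EMBED_REGEX` hook. A sketch of the new convention with hypothetical names (the default embed-scanning machinery collects the named `url` group from each pattern, so the per-extractor staticmethod can be deleted):

```python
from .common import InfoExtractor


class ExamplePlayerIE(InfoExtractor):  # hypothetical embeddable-player extractor
    _VALID_URL = r'https?://player\.example\.com/embed/(?P<id>\w+)'
    # Each pattern must expose the embed URL as a named (?P<url>...) group;
    # the framework iterates these regexes when scanning foreign webpages
    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://player\.example\.com/embed/\w+)["\']']
```
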
token_id = self._match_id(url) webpage = self._download_webpage(url, token_id) diff --git a/hypervideo_dl/extractor/bloomberg.py b/hypervideo_dl/extractor/bloomberg.py index 2fbfad1..792155e 100644 --- a/hypervideo_dl/extractor/bloomberg.py +++ b/hypervideo_dl/extractor/bloomberg.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -10,13 +7,11 @@ class BloombergIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bloomberg\.com/(?:[^/]+/)*(?P<id>[^/?#]+)' _TESTS = [{ - 'url': 'http://www.bloomberg.com/news/videos/b/aaeae121-5949-481e-a1ce-4562db6f5df2', - # The md5 checksum changes + 'url': 'https://www.bloomberg.com/news/videos/2021-09-14/apple-unveils-the-new-iphone-13-stock-doesn-t-move-much-video', 'info_dict': { - 'id': 'qurhIVlJSB6hzkVi229d8g', + 'id': 'V8cFcYMxTHaMcEiiYVr39A', 'ext': 'flv', - 'title': 'Shah\'s Presentation on Foreign-Exchange Strategies', - 'description': 'md5:a8ba0302912d03d246979735c17d2761', + 'title': 'Apple Unveils the New IPhone 13, Stock Doesn\'t Move Much', }, 'params': { 'format': 'best[format_id^=hds]', @@ -60,7 +55,7 @@ class BloombergIE(InfoExtractor): title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( - 'http://www.bloomberg.com/api/embed?id=%s' % video_id, video_id) + 'http://www.bloomberg.com/multimedia/api/embed?id=%s' % video_id, video_id) formats = [] for stream in embed_info['streams']: stream_url = stream.get('url') @@ -72,7 +67,6 @@ class BloombergIE(InfoExtractor): else: formats.extend(self._extract_f4m_formats( stream_url, video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/bokecc.py b/hypervideo_dl/extractor/bokecc.py index 6a89d36..ca326f2 100644 --- a/hypervideo_dl/extractor/bokecc.py +++ b/hypervideo_dl/extractor/bokecc.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_parse_qs from ..utils import ExtractorError @@ -25,8 +21,6 @@ class BokeCCBaseIE(InfoExtractor): 'quality': int(quality.attrib['value']), } for quality in info_xml.findall('./video/quality')] - self._sort_formats(formats) - return formats diff --git a/hypervideo_dl/extractor/bongacams.py b/hypervideo_dl/extractor/bongacams.py index 4e346e7..bf95566 100644 --- a/hypervideo_dl/extractor/bongacams.py +++ b/hypervideo_dl/extractor/bongacams.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -11,13 +8,28 @@ from ..utils import ( class BongaCamsIE(InfoExtractor): - _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.com)/(?P<id>[^/?&#]+)' + _VALID_URL = r'https?://(?P<host>(?:[^/]+\.)?bongacams\d*\.(?:com|net))/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://de.bongacams.com/azumi-8', 'only_matching': True, }, { 'url': 'https://cn.bongacams.com/azumi-8', 'only_matching': True, + }, { + 'url': 'https://de.bongacams.net/claireashton', + 'info_dict': { + 'id': 'claireashton', + 'ext': 'mp4', + 'title': r're:ClaireAshton \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'age_limit': 18, + 'uploader_id': 'ClaireAshton', + 'uploader': 'ClaireAshton', + 'like_count': int, + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -45,7 +57,6 @@ class BongaCamsIE(InfoExtractor): formats = self._extract_m3u8_formats( '%s/hls/stream_%s/playlist.m3u8' % (server_url, 
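
A pattern that recurs throughout this diff: every `self._sort_formats(formats)` call is deleted. Sorting is now applied centrally after extraction, so an extractor simply returns the raw format list. A schematic before/after, with a hypothetical URL:

```python
from .common import InfoExtractor


class ExampleIE(InfoExtractor):  # hypothetical
    _VALID_URL = r'https?://example\.com/video/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = self._match_id(url)
        formats = self._extract_m3u8_formats(
            f'https://example.com/hls/{video_id}/master.m3u8', video_id, 'mp4')
        # previously:  self._sort_formats(formats)
        # now redundant -- the core sorts formats once, after extraction
        return {
            'id': video_id,
            'title': video_id,
            'formats': formats,
        }
```
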
uploader_id), channel_id, 'mp4', m3u8_id='hls', live=True) - self._sort_formats(formats) return { 'id': channel_id, diff --git a/hypervideo_dl/extractor/booyah.py b/hypervideo_dl/extractor/booyah.py new file mode 100644 index 0000000..5c55f2c --- /dev/null +++ b/hypervideo_dl/extractor/booyah.py @@ -0,0 +1,86 @@ +from .common import InfoExtractor +from ..utils import int_or_none, str_or_none, traverse_obj + + +class BooyahBaseIE(InfoExtractor): + _BOOYAH_SESSION_KEY = None + + def _real_initialize(self): + BooyahBaseIE._BOOYAH_SESSION_KEY = self._request_webpage( + 'https://booyah.live/api/v3/auths/sessions', None, data=b'').getheader('booyah-session-key') + + def _get_comments(self, video_id): + comment_json = self._download_json( + f'https://booyah.live/api/v3/playbacks/{video_id}/comments/tops', video_id, + headers={'Booyah-Session-Key': self._BOOYAH_SESSION_KEY}, fatal=False) or {} + + return [{ + 'id': comment.get('comment_id'), + 'author': comment.get('from_nickname'), + 'author_id': comment.get('from_uid'), + 'author_thumbnail': comment.get('from_thumbnail'), + 'text': comment.get('content'), + 'timestamp': comment.get('create_time'), + 'like_count': comment.get('like_cnt'), + } for comment in comment_json.get('comment_list') or ()] + + +class BooyahClipsIE(BooyahBaseIE): + _VALID_URL = r'https?://booyah.live/clips/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://booyah.live/clips/13887261322952306617', + 'info_dict': { + 'id': '13887261322952306617', + 'ext': 'mp4', + 'view_count': int, + 'duration': 30, + 'channel_id': 90565760, + 'like_count': int, + 'title': 'Cayendo con estilo 😎', + 'uploader': '♡LɪꜱGΛMER', + 'comment_count': int, + 'uploader_id': '90565760', + 'thumbnail': 'https://resmambet-a.akamaihd.net/mambet-storage/Clip/90565760/90565760-27204374-fba0-409d-9d7b-63a48b5c0e75.jpg', + 'upload_date': '20220617', + 'timestamp': 1655490556, + 'modified_timestamp': 1655490556, + 'modified_date': '20220617', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json( + f'https://booyah.live/api/v3/playbacks/{video_id}', video_id, + headers={'Booyah-Session-key': self._BOOYAH_SESSION_KEY}) + + formats = [] + for video_data in json_data['playback']['endpoint_list']: + formats.extend(({ + 'url': video_data.get('stream_url'), + 'ext': 'mp4', + 'height': video_data.get('resolution'), + }, { + 'url': video_data.get('download_url'), + 'ext': 'mp4', + 'format_note': 'Watermarked', + 'height': video_data.get('resolution'), + 'preference': -10, + })) + + return { + 'id': video_id, + 'title': traverse_obj(json_data, ('playback', 'name')), + 'thumbnail': traverse_obj(json_data, ('playback', 'thumbnail_url')), + 'formats': formats, + 'view_count': traverse_obj(json_data, ('playback', 'views')), + 'like_count': traverse_obj(json_data, ('playback', 'likes')), + 'duration': traverse_obj(json_data, ('playback', 'duration')), + 'comment_count': traverse_obj(json_data, ('playback', 'comment_cnt')), + 'channel_id': traverse_obj(json_data, ('playback', 'channel_id')), + 'uploader': traverse_obj(json_data, ('user', 'nickname')), + 'uploader_id': str_or_none(traverse_obj(json_data, ('user', 'uid'))), + 'modified_timestamp': int_or_none(traverse_obj(json_data, ('playback', 'update_time_ms')), 1000), + 'timestamp': int_or_none(traverse_obj(json_data, ('playback', 'create_time_ms')), 1000), + '__post_extractor': self.extract_comments(video_id, self._get_comments(video_id)), + } diff --git a/hypervideo_dl/extractor/bostonglobe.py 
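
The new Booyah extractor above leans on `traverse_obj` instead of long chains of `.get()` calls. A self-contained illustration of its behaviour (shown against the installed `yt_dlp` package name for a standalone run; in this tree the same helper lives in `hypervideo_dl/utils.py` and is imported as `from ..utils import traverse_obj`):

```python
from yt_dlp.utils import traverse_obj

data = {'playback': {'name': 'clip', 'views': 42}, 'user': {'uid': 90565760}}

# walks nested containers, returning None on any missing key instead of raising
assert traverse_obj(data, ('playback', 'name')) == 'clip'
assert traverse_obj(data, ('playback', 'missing', 'deeper')) is None

# `...` branches over every element of a sequence, collecting all matches,
# as in the endpoint_list loop of BooyahClipsIE above
servers = {'endpoint_list': [{'stream_url': 'a'}, {'stream_url': 'b'}]}
assert traverse_obj(servers, ('endpoint_list', ..., 'stream_url')) == ['a', 'b']
```
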
b/hypervideo_dl/extractor/bostonglobe.py index 57882fb..92f8ea2 100644 --- a/hypervideo_dl/extractor/bostonglobe.py +++ b/hypervideo_dl/extractor/bostonglobe.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/box.py b/hypervideo_dl/extractor/box.py index 8214086..8ab1496 100644 --- a/hypervideo_dl/extractor/box.py +++ b/hypervideo_dl/extractor/box.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -82,8 +79,6 @@ class BoxIE(InfoExtractor): 'url': update_url_query(authenticated_download_url, query), }) - self._sort_formats(formats) - creator = f.get('created_by') or {} return { diff --git a/hypervideo_dl/extractor/bpb.py b/hypervideo_dl/extractor/bpb.py index 8f6ef3c..f28e581 100644 --- a/hypervideo_dl/extractor/bpb.py +++ b/hypervideo_dl/extractor/bpb.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -16,7 +13,6 @@ class BpbIE(InfoExtractor): _TEST = { 'url': 'http://www.bpb.de/mediathek/297/joachim-gauck-zu-1989-und-die-erinnerung-an-die-ddr', - # md5 fails in Python 2.6 due to buggy server response and wrong handling of urllib2 'md5': 'c4f84c8a8044ca9ff68bb8441d300b3f', 'info_dict': { 'id': '297', @@ -52,8 +48,6 @@ class BpbIE(InfoExtractor): 'format_id': '%s-%s' % (quality, determine_ext(video_url)), }) - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, diff --git a/hypervideo_dl/extractor/br.py b/hypervideo_dl/extractor/br.py index 0155827..309452d 100644 --- a/hypervideo_dl/extractor/br.py +++ b/hypervideo_dl/extractor/br.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -160,7 +157,6 @@ class BRIE(InfoExtractor): 'format_id': 'rtmp-%s' % asset_type, }) formats.append(rtmp_format_info) - self._sort_formats(formats) return formats def _extract_thumbnails(self, variants, base_url): @@ -275,7 +271,6 @@ class BRMediathekIE(InfoExtractor): 'tbr': tbr, 'filesize': int_or_none(node.get('fileSize')), }) - self._sort_formats(formats) subtitles = {} for edge in clip.get('captionFiles', {}).get('edges', []): diff --git a/hypervideo_dl/extractor/bravotv.py b/hypervideo_dl/extractor/bravotv.py index 139d51c..d489584 100644 --- a/hypervideo_dl/extractor/bravotv.py +++ b/hypervideo_dl/extractor/bravotv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .adobepass import AdobePassIE diff --git a/hypervideo_dl/extractor/breakcom.py b/hypervideo_dl/extractor/breakcom.py index f38789f..00cf308 100644 --- a/hypervideo_dl/extractor/breakcom.py +++ b/hypervideo_dl/extractor/breakcom.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( @@ -66,7 +63,6 @@ class BreakIE(InfoExtractor): 'format_id': 'http-%d' % bitrate if bitrate else 'http', 'tbr': bitrate, }) - self._sort_formats(formats) title = self._search_regex( (r'title["\']\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', diff --git a/hypervideo_dl/extractor/breitbart.py b/hypervideo_dl/extractor/breitbart.py index e029aa6..ea0a59c 100644 --- a/hypervideo_dl/extractor/breitbart.py +++ b/hypervideo_dl/extractor/breitbart.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor @@ -26,11 +24,9 @@ class 
BreitBartIE(InfoExtractor): webpage = self._download_webpage(url, video_id) formats = self._extract_m3u8_formats(f'https://cdn.jwplayer.com/manifests/{video_id}.m3u8', video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, - 'title': (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'video title')), + 'title': self._generic_title('', webpage), 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), 'age_limit': self._rta_search(webpage), diff --git a/hypervideo_dl/extractor/brightcove.py b/hypervideo_dl/extractor/brightcove.py index dcd332b..2b7ddca 100644 --- a/hypervideo_dl/extractor/brightcove.py +++ b/hypervideo_dl/extractor/brightcove.py @@ -1,9 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import re import struct +import xml.etree.ElementTree from .adobepass import AdobePassIE from .common import InfoExtractor @@ -12,7 +10,6 @@ from ..compat import ( compat_HTTPError, compat_parse_qs, compat_urlparse, - compat_xml_parse_error, ) from ..utils import ( clean_html, @@ -148,6 +145,159 @@ class BrightcoveLegacyIE(InfoExtractor): } ] + _WEBPAGE_TESTS = [{ + # embedded brightcove video + # it also tests brightcove videos that need to set the 'Referer' + # in the http requests + 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', + 'info_dict': { + 'id': '2765128793001', + 'ext': 'mp4', + 'title': 'Le cours de bourse : l’analyse technique', + 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', + 'uploader': 'BFM BUSINESS', + }, + 'params': { + 'skip_download': True, + }, + 'skip': '404 Not Found', + }, { + # embedded with itemprop embedURL and video id spelled as `idVideo` + 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', + 'info_dict': { + 'id': '5255628253001', + 'ext': 'mp4', + 'title': 'md5:37c519b1128915607601e75a87995fc0', + 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', + 'uploader': 'BFM BUSINESS', + 'uploader_id': '876450612001', + 'timestamp': 1482255315, + 'upload_date': '20161220', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Redirects, page gone', + }, { + # https://github.com/ytdl-org/youtube-dl/issues/2253 + 'url': 'http://bcove.me/i6nfkrc3', + 'md5': '0ba9446db037002366bab3b3eb30c88c', + 'info_dict': { + 'id': '3101154703001', + 'ext': 'mp4', + 'title': 'Still no power', + 'uploader': 'thestar.com', + 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. 
To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', + }, + 'skip': 'video gone', + }, { + # https://github.com/ytdl-org/youtube-dl/issues/3541 + 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', + 'info_dict': { + 'id': '3866516442001', + 'ext': 'mp4', + 'title': 'Leer mij vrouwen kennen: Aflevering 1', + 'description': 'Leer mij vrouwen kennen: Aflevering 1', + 'uploader': 'SBS Broadcasting', + }, + 'skip': 'Restricted to Netherlands, 404 Not Found', + 'params': { + 'skip_download': True, # m3u8 download + }, + }, { + # Brightcove video in <iframe> + 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', + 'md5': '36d74ef5e37c8b4a2ce92880d208b968', + 'info_dict': { + 'id': '5360463607001', + 'ext': 'mp4', + 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活', + 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', + 'uploader': 'United Nations', + 'uploader_id': '1362235914001', + 'timestamp': 1489593889, + 'upload_date': '20170315', + }, + 'skip': '404 Not Found', + }, { + # Brightcove with UUID in videoPlayer + 'url': 'http://www8.hp.com/cn/zh/home.html', + 'info_dict': { + 'id': '5255815316001', + 'ext': 'mp4', + 'title': 'Sprocket Video - China', + 'description': 'Sprocket Video - China', + 'uploader': 'HP-Video Gallery', + 'timestamp': 1482263210, + 'upload_date': '20161220', + 'uploader_id': '1107601872001', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, + 'skip': 'video rotates...weekly?', + }, { + # Multiple brightcove videos + # https://github.com/ytdl-org/youtube-dl/issues/2283 + 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', + 'info_dict': { + 'id': 'always-never', + 'title': 'Always / Never - The New Yorker', + }, + 'playlist_count': 3, + 'params': { + 'extract_flat': False, + 'skip_download': True, + }, + 'skip': 'Redirects, page gone', + }, { + # BrightcoveInPageEmbed embed + 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', + 'info_dict': { + 'id': '4238694884001', + 'ext': 'flv', + 'title': 'Tabletop: Dread, Last Thoughts', + 'description': 'Tabletop: Dread, Last Thoughts', + 'duration': 51690, + }, + 'skip': 'Redirects, page gone', + }, { + # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' + # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm + 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', + 'info_dict': { + 'id': '4785848093001', + 'ext': 'mp4', + 'title': 'The Cardinal Pell Interview', + 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. 
', + 'uploader': 'GlobeCast Australia - GlobeStream', + 'uploader_id': '2733773828001', + 'upload_date': '20160304', + 'timestamp': 1457083087, + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + 'skip': '404 Not Found', + }, { + # Brightcove embed with whitespace around attribute names + 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', + 'info_dict': { + 'id': '3167554373001', + 'ext': 'mp4', + 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", + 'description': 'md5:57bacb0e0f29349de4972bfda3191713', + 'uploader_id': '1079349493', + 'upload_date': '20140207', + 'timestamp': 1391810548, + }, + 'params': { + 'skip_download': True, + }, + 'skip': '410 Gone', + }] + @classmethod def _build_brightcove_url(cls, object_str): """ @@ -166,7 +316,7 @@ class BrightcoveLegacyIE(InfoExtractor): try: object_doc = compat_etree_fromstring(object_str.encode('utf-8')) - except compat_xml_parse_error: + except xml.etree.ElementTree.ParseError: return fv_el = find_xpath_attr(object_doc, './param', 'name', 'flashVars') @@ -284,6 +434,11 @@ class BrightcoveLegacyIE(InfoExtractor): return [src for _, src in re.findall( r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] + def _extract_from_webpage(self, url, webpage): + bc_urls = self._extract_brightcove_urls(webpage) + for bc_url in bc_urls: + yield self.url_result(smuggle_url(bc_url, {'Referer': url}), BrightcoveLegacyIE) + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -339,7 +494,131 @@ class BrightcoveLegacyIE(InfoExtractor): raise UnsupportedError(url) -class BrightcoveNewIE(AdobePassIE): +class BrightcoveNewBaseIE(AdobePassIE): + def _parse_brightcove_metadata(self, json_data, video_id, headers={}): + title = json_data['name'].strip() + + formats, subtitles = [], {} + sources = json_data.get('sources') or [] + for source in sources: + container = source.get('container') + ext = mimetype2ext(source.get('type')) + src = source.get('src') + if ext == 'm3u8' or container == 'M2TS': + if not src: + continue + fmts, subs = self._extract_m3u8_formats_and_subtitles( + src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + subtitles = self._merge_subtitles(subtitles, subs) + elif ext == 'mpd': + if not src: + continue + fmts, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False) + subtitles = self._merge_subtitles(subtitles, subs) + else: + streaming_src = source.get('streaming_src') + stream_name, app_name = source.get('stream_name'), source.get('app_name') + if not src and not streaming_src and (not stream_name or not app_name): + continue + tbr = float_or_none(source.get('avg_bitrate'), 1000) + height = int_or_none(source.get('height')) + width = int_or_none(source.get('width')) + f = { + 'tbr': tbr, + 'filesize': int_or_none(source.get('size')), + 'container': container, + 'ext': ext or container.lower(), + } + if width == 0 and height == 0: + f.update({ + 'vcodec': 'none', + }) + else: + f.update({ + 'width': width, + 'height': height, + 'vcodec': source.get('codec'), + }) + + def build_format_id(kind): + format_id = kind + if tbr: + format_id += '-%dk' % int(tbr) + if height: + format_id += '-%dp' % height + return format_id + + if src or streaming_src: + f.update({ + 'url': src or streaming_src, + 'format_id': build_format_id('http' if src else 'http-streaming'), + 'source_preference': 0 if src else -1, + 
}) + else: + f.update({ + 'url': app_name, + 'play_path': stream_name, + 'format_id': build_format_id('rtmp'), + }) + fmts = [f] + + # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object + if container == 'WVM' or source.get('key_systems') or ext == 'ism': + for f in fmts: + f['has_drm'] = True + formats.extend(fmts) + + if not formats: + errors = json_data.get('errors') + if errors: + error = errors[0] + self.raise_no_formats( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + + for f in formats: + f.setdefault('http_headers', {}).update(headers) + + for text_track in json_data.get('text_tracks', []): + if text_track.get('kind') != 'captions': + continue + text_track_url = url_or_none(text_track.get('src')) + if not text_track_url: + continue + lang = (str_or_none(text_track.get('srclang')) + or str_or_none(text_track.get('label')) or 'en').lower() + subtitles.setdefault(lang, []).append({ + 'url': text_track_url, + }) + + is_live = False + duration = float_or_none(json_data.get('duration'), 1000) + if duration is not None and duration <= 0: + is_live = True + + common_res = [(160, 90), (320, 180), (480, 720), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)] + thumb_base_url = dict_get(json_data, ('poster', 'thumbnail')) + thumbnails = [{ + 'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url), + 'width': w, + 'height': h, + } for w, h in common_res] if thumb_base_url else None + + return { + 'id': video_id, + 'title': title, + 'description': clean_html(json_data.get('description')), + 'thumbnails': thumbnails, + 'duration': duration, + 'timestamp': parse_iso8601(json_data.get('published_at')), + 'uploader_id': json_data.get('account_id'), + 'formats': formats, + 'subtitles': subtitles, + 'tags': json_data.get('tags', []), + 'is_live': is_live, + } + + +class BrightcoveNewIE(BrightcoveNewBaseIE): IE_NAME = 'brightcove:new' _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*(?P<content_type>video|playlist)Id=(?P<video_id>\d+|ref:[^&]+)' _TESTS = [{ @@ -356,6 +635,7 @@ class BrightcoveNewIE(AdobePassIE): 'uploader_id': '929656772001', 'formats': 'mincount:20', }, + 'skip': '404 Not Found', }, { # with rtmp streams 'url': 'http://players.brightcove.net/4036320279001/5d112ed9-283f-485f-a7f9-33f42e8bc042_default/index.html?videoId=4279049078001', @@ -403,13 +683,114 @@ class BrightcoveNewIE(AdobePassIE): 'only_matching': True, }] + _WEBPAGE_TESTS = [{ + # brightcove player url embed + 'url': 'https://nbc-2.com/weather/forecast/2022/11/16/forecast-warmest-day-of-the-week/', + 'md5': '2934d5372b354d27083ccf8575dbfee2', + 'info_dict': { + 'id': '6315650313112', + 'title': 'First Alert Forecast: November 15, 2022', + 'ext': 'mp4', + 'tags': ['nbc2', 'forecast'], + 'uploader_id': '6146886170001', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1668574571, + 'duration': 233.375, + 'upload_date': '20221116', + }, + }, { + # embedded with video tag only + 'url': 'https://www.gooddishtv.com/tiktok-rapping-chef-mr-pyrex', + 'info_dict': { + 'id': 'tiktok-rapping-chef-mr-pyrex', + 'title': 'TikTok\'s Rapping Chef Makes Jambalaya for the Hosts', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 0, + 'description': 'Just in time for Mardi Gras', + }, + 'playlist': [{ + 'info_dict': { + 'id': '6299189544001', + 'ext': 'mp4', + 'title': 'TGD_01-032_5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'tags': [], + 'timestamp': 
1646078943, + 'uploader_id': '1569565978001', + 'upload_date': '20220228', + 'duration': 217.195, + }, + }, { + 'info_dict': { + 'id': '6305565995112', + 'ext': 'mp4', + 'title': 'TGD 01-087 (Airs 05.25.22)_Segment 5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'tags': [], + 'timestamp': 1651604591, + 'uploader_id': '1569565978001', + 'upload_date': '20220503', + 'duration': 310.421, + }, + }], + }, { + # Brightcove:new type [2]. + 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis', + 'md5': '2b35148fcf48da41c9fb4591650784f3', + 'info_dict': { + 'id': '5348741021001', + 'ext': 'mp4', + 'upload_date': '20170306', + 'uploader_id': '4191638492001', + 'timestamp': 1488769918, + 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis', + }, + 'skip': '404 Not Found', + }, { + # Alternative brightcove <video> attributes + 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/', + 'info_dict': { + 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche', + 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs", + }, + 'playlist': [{ + 'md5': '732d22ba3d33f2f3fc253c39f8f36523', + 'info_dict': { + 'id': '5311302538001', + 'ext': 'mp4', + 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche", + 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)", + 'timestamp': 1486321708, + 'upload_date': '20170205', + 'uploader_id': '800000640001', + }, + 'only_matching': True, + }], + 'skip': '404 Not Found', + }, { + # Brightcove URL in single quotes + 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', + 'md5': '4ae374f1f8b91c889c4b9203c8c752af', + 'info_dict': { + 'id': '4255764656001', + 'ext': 'mp4', + 'title': 'SN Presents: Russell Martin, World Citizen', + 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. Written and narrated by Stephen Brunt.', + 'uploader': 'Rogers Sportsnet', + 'uploader_id': '1704050871', + 'upload_date': '20150525', + 'timestamp': 1432570283, + }, + 'skip': 'Page no longer has URL, now has javascript', + }] + @staticmethod def _extract_url(ie, webpage): - urls = BrightcoveNewIE._extract_urls(ie, webpage) + urls = BrightcoveNewIE._extract_brightcove_urls(ie, webpage) return urls[0] if urls else None @staticmethod - def _extract_urls(ie, webpage): + def _extract_brightcove_urls(ie, webpage): # Reference: # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe # 2. 
http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag @@ -469,129 +850,10 @@ class BrightcoveNewIE(AdobePassIE): return entries - def _parse_brightcove_metadata(self, json_data, video_id, headers={}): - title = json_data['name'].strip() - - formats, subtitles = [], {} - sources = json_data.get('sources') or [] - for source in sources: - container = source.get('container') - ext = mimetype2ext(source.get('type')) - src = source.get('src') - if ext == 'm3u8' or container == 'M2TS': - if not src: - continue - fmts, subs = self._extract_m3u8_formats_and_subtitles( - src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - subtitles = self._merge_subtitles(subtitles, subs) - elif ext == 'mpd': - if not src: - continue - fmts, subs = self._extract_mpd_formats_and_subtitles(src, video_id, 'dash', fatal=False) - subtitles = self._merge_subtitles(subtitles, subs) - else: - streaming_src = source.get('streaming_src') - stream_name, app_name = source.get('stream_name'), source.get('app_name') - if not src and not streaming_src and (not stream_name or not app_name): - continue - tbr = float_or_none(source.get('avg_bitrate'), 1000) - height = int_or_none(source.get('height')) - width = int_or_none(source.get('width')) - f = { - 'tbr': tbr, - 'filesize': int_or_none(source.get('size')), - 'container': container, - 'ext': ext or container.lower(), - } - if width == 0 and height == 0: - f.update({ - 'vcodec': 'none', - }) - else: - f.update({ - 'width': width, - 'height': height, - 'vcodec': source.get('codec'), - }) - - def build_format_id(kind): - format_id = kind - if tbr: - format_id += '-%dk' % int(tbr) - if height: - format_id += '-%dp' % height - return format_id - - if src or streaming_src: - f.update({ - 'url': src or streaming_src, - 'format_id': build_format_id('http' if src else 'http-streaming'), - 'source_preference': 0 if src else -1, - }) - else: - f.update({ - 'url': app_name, - 'play_path': stream_name, - 'format_id': build_format_id('rtmp'), - }) - fmts = [f] - - # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if container == 'WVM' or source.get('key_systems') or ext == 'ism': - for f in fmts: - f['has_drm'] = True - formats.extend(fmts) - - if not formats: - errors = json_data.get('errors') - if errors: - error = errors[0] - self.raise_no_formats( - error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) - - self._sort_formats(formats) - - for f in formats: - f.setdefault('http_headers', {}).update(headers) - - for text_track in json_data.get('text_tracks', []): - if text_track.get('kind') != 'captions': - continue - text_track_url = url_or_none(text_track.get('src')) - if not text_track_url: - continue - lang = (str_or_none(text_track.get('srclang')) - or str_or_none(text_track.get('label')) or 'en').lower() - subtitles.setdefault(lang, []).append({ - 'url': text_track_url, - }) - - is_live = False - duration = float_or_none(json_data.get('duration'), 1000) - if duration is not None and duration <= 0: - is_live = True - - common_res = [(160, 90), (320, 180), (480, 720), (640, 360), (768, 432), (1024, 576), (1280, 720), (1366, 768), (1920, 1080)] - thumb_base_url = dict_get(json_data, ('poster', 'thumbnail')) - thumbnails = [{ - 'url': re.sub(r'\d+x\d+', f'{w}x{h}', thumb_base_url), - 'width': w, - 'height': h, - } for w, h in common_res] if thumb_base_url else None - - return { - 'id': video_id, - 'title': title, - 'description': 
clean_html(json_data.get('description')), - 'thumbnails': thumbnails, - 'duration': duration, - 'timestamp': parse_iso8601(json_data.get('published_at')), - 'uploader_id': json_data.get('account_id'), - 'formats': formats, - 'subtitles': subtitles, - 'tags': json_data.get('tags', []), - 'is_live': is_live, - } + def _extract_from_webpage(self, url, webpage): + bc_urls = self._extract_brightcove_urls(self, webpage) + for bc_url in bc_urls: + yield self.url_result(smuggle_url(bc_url, {'referrer': url}), BrightcoveNewIE) def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -603,9 +865,9 @@ class BrightcoveNewIE(AdobePassIE): account_id, player_id, embed, content_type, video_id = self._match_valid_url(url).groups() policy_key_id = '%s_%s' % (account_id, player_id) - policy_key = self._downloader.cache.load('brightcove', policy_key_id) + policy_key = self.cache.load('brightcove', policy_key_id) policy_key_extracted = False - store_pk = lambda x: self._downloader.cache.store('brightcove', policy_key_id, x) + store_pk = lambda x: self.cache.store('brightcove', policy_key_id, x) def extract_policy_key(): base_url = 'http://players.brightcove.net/%s/%s_%s/' % (account_id, player_id, embed) @@ -635,7 +897,7 @@ class BrightcoveNewIE(AdobePassIE): api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/%ss/%s' % (account_id, content_type, video_id) headers = {} - referrer = smuggled_data.get('referrer') + referrer = smuggled_data.get('referrer') # XXX: notice the spelling/case of the key if referrer: headers.update({ 'Referer': referrer, diff --git a/hypervideo_dl/extractor/bundesliga.py b/hypervideo_dl/extractor/bundesliga.py new file mode 100644 index 0000000..e76dd58 --- /dev/null +++ b/hypervideo_dl/extractor/bundesliga.py @@ -0,0 +1,34 @@ +from .common import InfoExtractor +from .jwplatform import JWPlatformIE + + +class BundesligaIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?bundesliga\.com/[a-z]{2}/bundesliga/videos(?:/[^?]+)?\?vid=(?P<id>[a-zA-Z0-9]{8})' + _TESTS = [ + { + 'url': 'https://www.bundesliga.com/en/bundesliga/videos?vid=bhhHkKyN', + 'md5': '8fc3b25cd12440e3a8cdc51f1493849c', + 'info_dict': { + 'id': 'bhhHkKyN', + 'ext': 'mp4', + 'title': 'Watch: Alphonso Davies and Jeremie Frimpong head-to-head', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/bhhHkKyN/poster.jpg?width=720', + 'upload_date': '20220928', + 'duration': 146, + 'timestamp': 1664366511, + 'description': 'md5:803d4411bd134140c774021dd4b7598b' + } + }, + { + 'url': 'https://www.bundesliga.com/en/bundesliga/videos/latest-features/T8IKc8TX?vid=ROHjs06G', + 'only_matching': True + }, + { + 'url': 'https://www.bundesliga.com/en/bundesliga/videos/goals?vid=mOG56vWA', + 'only_matching': True + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result(f'jwplatform:{video_id}', JWPlatformIE, video_id) diff --git a/hypervideo_dl/extractor/businessinsider.py b/hypervideo_dl/extractor/businessinsider.py index 73a57b1..4b3f5e6 100644 --- a/hypervideo_dl/extractor/businessinsider.py +++ b/hypervideo_dl/extractor/businessinsider.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .jwplatform import JWPlatformIE diff --git a/hypervideo_dl/extractor/buzzfeed.py b/hypervideo_dl/extractor/buzzfeed.py index ec41109..b30a3b7 100644 --- a/hypervideo_dl/extractor/buzzfeed.py +++ b/hypervideo_dl/extractor/buzzfeed.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - 
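
The Brightcove webpage hook above tunnels the embedding page's URL through `smuggle_url`, and `_real_extract` recovers it with `unsmuggle_url`; the in-code XXX warns that the smuggled key is spelled `referrer` while the HTTP header is `Referer`. A runnable round-trip, assuming the installed `yt_dlp` package name:

```python
from yt_dlp.utils import smuggle_url, unsmuggle_url

embed = 'https://players.brightcove.net/123/default_default/index.html?videoId=456'
smuggled = smuggle_url(embed, {'referrer': 'https://news.example/article'})

# url_result() cannot carry arbitrary state between extractors, so the extra
# data rides along inside a URL fragment until the target extractor unpacks it
url, data = unsmuggle_url(smuggled, {})
assert url == embed
assert data == {'referrer': 'https://news.example/article'}
```
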
import json import re @@ -84,7 +81,7 @@ class BuzzFeedIE(InfoExtractor): continue entries.append(self.url_result(video['url'])) - facebook_urls = FacebookIE._extract_urls(webpage) + facebook_urls = FacebookIE._extract_embed_urls(url, webpage) entries.extend([ self.url_result(facebook_url) for facebook_url in facebook_urls]) diff --git a/hypervideo_dl/extractor/byutv.py b/hypervideo_dl/extractor/byutv.py index f4d5086..9ed6efe 100644 --- a/hypervideo_dl/extractor/byutv.py +++ b/hypervideo_dl/extractor/byutv.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -111,7 +108,6 @@ class BYUtvIE(InfoExtractor): 'thumbnail': ep.get('imageThumbnail'), 'duration': parse_duration(ep.get('length')), }) - self._sort_formats(formats) return merge_dicts(info, { 'id': video_id, diff --git a/hypervideo_dl/extractor/c56.py b/hypervideo_dl/extractor/c56.py index a853c53..e4b1c9a 100644 --- a/hypervideo_dl/extractor/c56.py +++ b/hypervideo_dl/extractor/c56.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import js_to_json @@ -53,7 +49,6 @@ class C56IE(InfoExtractor): 'url': f['url'] } for f in info['rfiles'] ] - self._sort_formats(formats) return { 'id': info['vid'], diff --git a/hypervideo_dl/extractor/cableav.py b/hypervideo_dl/extractor/cableav.py index 77efdf4..2e374e5 100644 --- a/hypervideo_dl/extractor/cableav.py +++ b/hypervideo_dl/extractor/cableav.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor @@ -23,7 +22,6 @@ class CableAVIE(InfoExtractor): video_url = self._og_search_video_url(webpage, secure=False) formats = self._extract_m3u8_formats(video_url, video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/callin.py b/hypervideo_dl/extractor/callin.py index 1f3b7cf..e966876 100644 --- a/hypervideo_dl/extractor/callin.py +++ b/hypervideo_dl/extractor/callin.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor from ..utils import ( traverse_obj, @@ -52,12 +51,9 @@ class CallinIE(InfoExtractor): episode = next_data['props']['pageProps']['episode'] id = episode['id'] - title = (episode.get('title') - or self._og_search_title(webpage, fatal=False) - or self._html_extract_title(webpage)) + title = episode.get('title') or self._generic_title('', webpage) url = episode['m3u8'] formats = self._extract_m3u8_formats(url, display_id, ext='ts') - self._sort_formats(formats) show = traverse_obj(episode, ('show', 'title')) show_id = traverse_obj(episode, ('show', 'id')) diff --git a/hypervideo_dl/extractor/caltrans.py b/hypervideo_dl/extractor/caltrans.py index 9ac740f..f4a4a83 100644 --- a/hypervideo_dl/extractor/caltrans.py +++ b/hypervideo_dl/extractor/caltrans.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor @@ -30,7 +27,6 @@ class CaltransIE(InfoExtractor): video_stream = self._search_regex(r'videoStreamURL\s*=\s*"([^"]+)"', global_vars, 'Video Stream URL', fatal=False) formats = self._extract_m3u8_formats(video_stream, video_id, 'ts', live=True) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/cam4.py b/hypervideo_dl/extractor/cam4.py index 2a3931f..2650cc1 100644 --- a/hypervideo_dl/extractor/cam4.py +++ b/hypervideo_dl/extractor/cam4.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor @@ 
-23,7 +20,6 @@ class CAM4IE(InfoExtractor): m3u8_playlist = self._download_json('https://www.cam4.com/rest/v1.0/profile/{}/streamInfo'.format(channel_id), channel_id).get('cdnURL') formats = self._extract_m3u8_formats(m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True) - self._sort_formats(formats) return { 'id': channel_id, diff --git a/hypervideo_dl/extractor/camdemy.py b/hypervideo_dl/extractor/camdemy.py index 8f0c6c5..c7079e4 100644 --- a/hypervideo_dl/extractor/camdemy.py +++ b/hypervideo_dl/extractor/camdemy.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/cammodels.py b/hypervideo_dl/extractor/cammodels.py index 3dc1937..0509057 100644 --- a/hypervideo_dl/extractor/cammodels.py +++ b/hypervideo_dl/extractor/cammodels.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -87,7 +84,6 @@ class CamModelsIE(InfoExtractor): else: continue formats.append(f) - self._sort_formats(formats) return { 'id': user_id, diff --git a/hypervideo_dl/extractor/camsoda.py b/hypervideo_dl/extractor/camsoda.py new file mode 100644 index 0000000..021cd91 --- /dev/null +++ b/hypervideo_dl/extractor/camsoda.py @@ -0,0 +1,57 @@ +import random + +from .common import InfoExtractor +from ..utils import ExtractorError, traverse_obj + + +class CamsodaIE(InfoExtractor): + _VALID_URL = r'https?://www\.camsoda\.com/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.camsoda.com/lizzhopf', + 'info_dict': { + 'id': 'lizzhopf', + 'ext': 'mp4', + 'title': 'lizzhopf (lizzhopf) Nude on Cam. Free Live Sex Chat Room - CamSoda', + 'description': str, + 'is_live': True, + 'age_limit': 18, + }, + 'skip': 'Room is offline', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, headers=self.geo_verification_headers()) + + data = self._download_json( + f'https://camsoda.com/api/v1/video/vtoken/{video_id}', video_id, + query={'username': f'guest_{random.randrange(10000, 99999)}'}, + headers=self.geo_verification_headers()) + if not data: + raise ExtractorError('Unable to find configuration for stream.') + elif data.get('private_servers'): + raise ExtractorError('Model is in private show.', expected=True) + elif not data.get('stream_name'): + raise ExtractorError('Model is offline.', expected=True) + + stream_name = traverse_obj(data, 'stream_name', expected_type=str) + token = traverse_obj(data, 'token', expected_type=str) + + formats = [] + for server in traverse_obj(data, ('edge_servers', ...)): + formats = self._extract_m3u8_formats( + f'https://{server}/{stream_name}_v1/index.m3u8?token={token}', + video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True) + if formats: + break + if not formats: + self.raise_no_formats('No active streams found', expected=True) + + return { + 'id': video_id, + 'title': self._html_extract_title(webpage), + 'description': self._html_search_meta('description', webpage, default=None), + 'is_live': True, + 'formats': formats, + 'age_limit': 18, + } diff --git a/hypervideo_dl/extractor/camtasia.py b/hypervideo_dl/extractor/camtasia.py new file mode 100644 index 0000000..70ab6c6 --- /dev/null +++ b/hypervideo_dl/extractor/camtasia.py @@ -0,0 +1,71 @@ +import os +import urllib.parse + +from .common import InfoExtractor +from ..utils import float_or_none + + +class CamtasiaEmbedIE(InfoExtractor): + _VALID_URL = False + 
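
The CamsodaIE just added, and the CamtasiaEmbedIE whose definition begins below, show two sides of the new extractor surface. Camtasia is a pure webpage-embed extractor: `_VALID_URL = False` keeps it out of direct URL matching, and it participates only through `_extract_from_webpage` (the same hook the Brightcove extractors gained earlier in this diff). One possible minimal shape of such an override, with hypothetical names and regex:

```python
import urllib.parse

from .common import InfoExtractor


class ExampleConfigEmbedIE(InfoExtractor):  # hypothetical
    _VALID_URL = False  # never matched against input URLs directly

    def _extract_from_webpage(self, url, webpage):
        # invoked while scanning arbitrary webpages; yielding nothing
        # simply lets the other embed extractors have their turn
        cfg = self._search_regex(
            r'dataConfig=["\']([^"\']+)', webpage, 'config url', default=None)
        if cfg:
            yield self.url_result(urllib.parse.urljoin(url, cfg))
```
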
_WEBPAGE_TESTS = [ + { + 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', + 'playlist': [{ + 'md5': '0c5e352edabf715d762b0ad4e6d9ee67', + 'info_dict': { + 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', + 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', + 'ext': 'flv', + 'duration': 2235.90, + } + }, { + 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', + 'info_dict': { + 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', + 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - pip', + 'ext': 'flv', + 'duration': 2235.93, + } + }], + 'info_dict': { + 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', + }, + 'skip': 'webpage dead' + }, + + ] + + def _extract_from_webpage(self, url, webpage): + camtasia_cfg = self._search_regex( + r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);', + webpage, 'camtasia configuration file', default=None) + if camtasia_cfg is None: + return None + + title = self._html_search_meta('DC.title', webpage, fatal=True) + + camtasia_url = urllib.parse.urljoin(url, camtasia_cfg) + camtasia_cfg = self._download_xml( + camtasia_url, self._generic_id(url), + note='Downloading camtasia configuration', + errnote='Failed to download camtasia configuration') + fileset_node = camtasia_cfg.find('./playlist/array/fileset') + + entries = [] + for n in fileset_node.getchildren(): + url_n = n.find('./uri') + if url_n is None: + continue + + entries.append({ + 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], + 'title': f'{title} - {n.tag}', + 'url': urllib.parse.urljoin(url, url_n.text), + 'duration': float_or_none(n.find('./duration').text), + }) + + return { + '_type': 'playlist', + 'entries': entries, + 'title': title, + } diff --git a/hypervideo_dl/extractor/camtube.py b/hypervideo_dl/extractor/camtube.py deleted file mode 100644 index b3be3bd..0000000 --- a/hypervideo_dl/extractor/camtube.py +++ /dev/null @@ -1,71 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_timestamp, -) - - -class CamTubeIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|api)\.)?camtube\.co/recordings?/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'https://camtube.co/recording/minafay-030618-1136-chaturbate-female', - 'info_dict': { - 'id': '42ad3956-dd5b-445a-8313-803ea6079fac', - 'display_id': 'minafay-030618-1136-chaturbate-female', - 'ext': 'mp4', - 'title': 'minafay-030618-1136-chaturbate-female', - 'duration': 1274, - 'timestamp': 1528018608, - 'upload_date': '20180603', - 'age_limit': 18 - }, - 'params': { - 'skip_download': True, - }, - }] - - _API_BASE = 'https://api.camtube.co' - - def _real_extract(self, url): - display_id = self._match_id(url) - - token = self._download_json( - '%s/rpc/session/new' % self._API_BASE, display_id, - 'Downloading session token')['token'] - - self._set_cookie('api.camtube.co', 'session', token) - - video = self._download_json( - '%s/recordings/%s' % (self._API_BASE, display_id), display_id, - headers={'Referer': url}) - - video_id = video['uuid'] - timestamp = unified_timestamp(video.get('createdAt')) - duration = int_or_none(video.get('duration')) - view_count = int_or_none(video.get('viewCount')) - like_count = int_or_none(video.get('likeCount')) - creator = video.get('stageName') - - formats = [{ - 'url': '%s/recordings/%s/manifest.m3u8' - % (self._API_BASE, video_id), - 'format_id': 'hls', - 'ext': 'mp4', - 'protocol': 'm3u8_native', - }] - - return { - 'id': video_id, - 'display_id': display_id, 
- 'title': display_id, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'creator': creator, - 'formats': formats, - 'age_limit': 18 - } diff --git a/hypervideo_dl/extractor/camwithher.py b/hypervideo_dl/extractor/camwithher.py index bbc5205..a0b3749 100644 --- a/hypervideo_dl/extractor/camwithher.py +++ b/hypervideo_dl/extractor/camwithher.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/canalalpha.py b/hypervideo_dl/extractor/canalalpha.py index 0365cb2..df5ca58 100644 --- a/hypervideo_dl/extractor/canalalpha.py +++ b/hypervideo_dl/extractor/canalalpha.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, @@ -85,7 +82,6 @@ class CanalAlphaIE(InfoExtractor): dash_frmts, dash_subs = self._parse_mpd_formats_and_subtitles(manifests['dash']) formats.extend(dash_frmts) subtitles = self._merge_subtitles(subtitles, dash_subs) - self._sort_formats(formats) return { 'id': id, 'title': data_json.get('title').strip(), diff --git a/hypervideo_dl/extractor/canalc2.py b/hypervideo_dl/extractor/canalc2.py index 407cc80..597cb2a 100644 --- a/hypervideo_dl/extractor/canalc2.py +++ b/hypervideo_dl/extractor/canalc2.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -61,8 +58,6 @@ class Canalc2IE(InfoExtractor): else: info = self._parse_html5_media_entries(url, webpage, url)[0] - self._sort_formats(info['formats']) - info.update({ 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/canalplus.py b/hypervideo_dl/extractor/canalplus.py index 211ea26..b7e2f9d 100644 --- a/hypervideo_dl/extractor/canalplus.py +++ b/hypervideo_dl/extractor/canalplus.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( # ExtractorError, @@ -90,7 +86,6 @@ class CanalplusIE(InfoExtractor): 'format_id': format_id, 'quality': preference(format_id), }) - self._sort_formats(formats) thumbnails = [{ 'id': image_id, diff --git a/hypervideo_dl/extractor/canvas.py b/hypervideo_dl/extractor/canvas.py index 8b99037..ae6e03a 100644 --- a/hypervideo_dl/extractor/canvas.py +++ b/hypervideo_dl/extractor/canvas.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import json @@ -119,7 +118,6 @@ class CanvasIE(InfoExtractor): 'format_id': format_type, 'url': format_url, }) - self._sort_formats(formats) subtitle_urls = data.get('subtitleUrls') if isinstance(subtitle_urls, list): diff --git a/hypervideo_dl/extractor/carambatv.py b/hypervideo_dl/extractor/carambatv.py index 7e5cc90..d6044a3 100644 --- a/hypervideo_dl/extractor/carambatv.py +++ b/hypervideo_dl/extractor/carambatv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -46,7 +43,6 @@ class CarambaTVIE(InfoExtractor): 'height': int_or_none(f.get('height')), 'format_id': format_field(f, 'height', '%sp'), } for f in video['qualities'] if f.get('fn')] - self._sort_formats(formats) thumbnail = video.get('splash') duration = float_or_none(try_get( diff --git a/hypervideo_dl/extractor/cartoonnetwork.py b/hypervideo_dl/extractor/cartoonnetwork.py index 48b3361..4dd7ac4 100644 --- a/hypervideo_dl/extractor/cartoonnetwork.py +++ 
b/hypervideo_dl/extractor/cartoonnetwork.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .turner import TurnerBaseIE from ..utils import int_or_none diff --git a/hypervideo_dl/extractor/cbc.py b/hypervideo_dl/extractor/cbc.py index 4892419..a9f6cd2 100644 --- a/hypervideo_dl/extractor/cbc.py +++ b/hypervideo_dl/extractor/cbc.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import json import base64 @@ -307,13 +304,13 @@ class CBCGemIE(InfoExtractor): def _get_claims_token(self, email, password): if not self.claims_token_valid(): self._claims_token = self._new_claims_token(email, password) - self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) + self.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token) return self._claims_token def _real_initialize(self): if self.claims_token_valid(): return - self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token') + self._claims_token = self.cache.load(self._NETRC_MACHINE, 'claims_token') def _find_secret_formats(self, formats, video_id): """ Find a valid video url and convert it to the secret variant """ @@ -383,8 +380,6 @@ class CBCGemIE(InfoExtractor): if 'descriptive' in format['format_id'].lower(): format['preference'] = -2 - self._sort_formats(formats) - return { 'id': video_id, 'title': video_info['title'], diff --git a/hypervideo_dl/extractor/cbs.py b/hypervideo_dl/extractor/cbs.py index 2af36ea..9aacd50 100644 --- a/hypervideo_dl/extractor/cbs.py +++ b/hypervideo_dl/extractor/cbs.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .theplatform import ThePlatformFeedIE from ..utils import ( ExtractorError, @@ -12,7 +10,7 @@ from ..utils import ( ) -class CBSBaseIE(ThePlatformFeedIE): +class CBSBaseIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): subtitles = {} for k, ext in [('sMPTE-TTCCURL', 'tt'), ('ClosedCaptionURL', 'ttml'), ('webVTTCaptionURL', 'vtt')]: @@ -54,7 +52,6 @@ class CBSBaseIE(ThePlatformFeedIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) if last_e and not formats: self.raise_no_formats(last_e, True, content_id) - self._sort_formats(formats) extra_info.update({ 'id': content_id, diff --git a/hypervideo_dl/extractor/cbsinteractive.py b/hypervideo_dl/extractor/cbsinteractive.py index 9d4f754..b09e982 100644 --- a/hypervideo_dl/extractor/cbsinteractive.py +++ b/hypervideo_dl/extractor/cbsinteractive.py @@ -1,12 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .cbs import CBSIE from ..utils import int_or_none -class CBSInteractiveIE(CBSIE): +class CBSInteractiveIE(CBSIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?(?P<site>cnet|zdnet)\.com/(?:videos|video(?:/share)?)/(?P<id>[^/?]+)' _TESTS = [{ 'url': 'http://www.cnet.com/videos/hands-on-with-microsofts-windows-8-1-update/', diff --git a/hypervideo_dl/extractor/cbslocal.py b/hypervideo_dl/extractor/cbslocal.py index 3b7e1a8..3d50b04 100644 --- a/hypervideo_dl/extractor/cbslocal.py +++ b/hypervideo_dl/extractor/cbslocal.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .anvato import AnvatoIE from .sendtonews import SendtoNewsIE from ..compat import compat_urlparse @@ -10,7 +7,7 @@ from ..utils import ( ) -class CBSLocalIE(AnvatoIE): +class CBSLocalIE(AnvatoIE): # XXX: Do not subclass from concrete IE _VALID_URL_BASE = 
r'https?://[a-z]+\.cbslocal\.com/' _VALID_URL = _VALID_URL_BASE + r'video/(?P<id>\d+)' @@ -50,7 +47,7 @@ class CBSLocalIE(AnvatoIE): 'anvato:anvato_cbslocal_app_web_prod_547f3e49241ef0e5d30c79b2efbca5d92c698f67:' + mcp_id, 'Anvato', mcp_id) -class CBSLocalArticleIE(AnvatoIE): +class CBSLocalArticleIE(AnvatoIE): # XXX: Do not subclass from concrete IE _VALID_URL = CBSLocalIE._VALID_URL_BASE + r'\d+/\d+/\d+/(?P<id>[0-9a-z-]+)' _TESTS = [{ diff --git a/hypervideo_dl/extractor/cbsnews.py b/hypervideo_dl/extractor/cbsnews.py index 1285ed6..16edf3a 100644 --- a/hypervideo_dl/extractor/cbsnews.py +++ b/hypervideo_dl/extractor/cbsnews.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import zlib @@ -15,7 +12,7 @@ from ..utils import ( ) -class CBSNewsEmbedIE(CBSIE): +class CBSNewsEmbedIE(CBSIE): # XXX: Do not subclass from concrete IE IE_NAME = 'cbsnews:embed' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/embed/video[^#]*#(?P<id>.+)' _TESTS = [{ @@ -30,7 +27,7 @@ class CBSNewsEmbedIE(CBSIE): return self._extract_video_info(item['mpxRefId'], 'cbsnews') -class CBSNewsIE(CBSIE): +class CBSNewsIE(CBSIE): # XXX: Do not subclass from concrete IE IE_NAME = 'cbsnews' IE_DESC = 'CBS News' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|video)/(?P<id>[\da-z_-]+)' @@ -135,7 +132,6 @@ class CBSNewsLiveVideoIE(InfoExtractor): }) formats = self._extract_akamai_formats(video_info['url'], display_id) - self._sort_formats(formats) return { 'id': display_id, diff --git a/hypervideo_dl/extractor/cbssports.py b/hypervideo_dl/extractor/cbssports.py index b8a6e59..b5d85af 100644 --- a/hypervideo_dl/extractor/cbssports.py +++ b/hypervideo_dl/extractor/cbssports.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - # from .cbs import CBSBaseIE from .common import InfoExtractor from ..utils import ( @@ -43,7 +40,6 @@ class CBSSportsEmbedIE(InfoExtractor): formats = self._extract_m3u8_formats( metadata['files'][0]['url'], video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) - self._sort_formats(formats) image = video.get('image') thumbnails = None diff --git a/hypervideo_dl/extractor/ccc.py b/hypervideo_dl/extractor/ccc.py index 36e6dff..22e3a22 100644 --- a/hypervideo_dl/extractor/ccc.py +++ b/hypervideo_dl/extractor/ccc.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -67,7 +64,6 @@ class CCCIE(InfoExtractor): 'language': language, 'vcodec': vcodec, }) - self._sort_formats(formats) return { 'id': event_id, @@ -78,6 +74,7 @@ class CCCIE(InfoExtractor): 'thumbnail': event_data.get('thumb_url'), 'timestamp': parse_iso8601(event_data.get('date')), 'duration': int_or_none(event_data.get('length')), + 'view_count': int_or_none(event_data.get('view_count')), 'tags': event_data.get('tags'), 'formats': formats, } diff --git a/hypervideo_dl/extractor/ccma.py b/hypervideo_dl/extractor/ccma.py index 9dbaabf..88ff82f 100644 --- a/hypervideo_dl/extractor/ccma.py +++ b/hypervideo_dl/extractor/ccma.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, @@ -84,7 +81,6 @@ class CCMAIE(InfoExtractor): 'url': media_url, 'vcodec': 'none' if media_type == 'audio' else None, }) - self._sort_formats(formats) informacio = media['informacio'] title = informacio['titol'] diff --git a/hypervideo_dl/extractor/cctv.py b/hypervideo_dl/extractor/cctv.py index 0ed5f32..466bdfb 100644 --- 
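
Several extractors in this commit (CBSBaseIE, CBSInteractiveIE, CBSLocalIE, CBSLocalArticleIE, CBSNewsEmbedIE, CBSNewsIE) are annotated `# XXX: Do not subclass from concrete IE`. The markers flag technical debt rather than change behaviour: subclassing a concrete extractor inherits its `_VALID_URL` matching and suitability logic. The preferred layout keeps shared logic in an abstract base that defines no `_VALID_URL`, sketched here with hypothetical names:

```python
from .common import InfoExtractor


class ExampleBaseIE(InfoExtractor):
    # no _VALID_URL here, so this class is never matched on its own
    def _extract_video(self, video_id):
        ...  # extraction logic shared by all concrete subclasses


class ExampleVideoIE(ExampleBaseIE):
    _VALID_URL = r'https?://example\.com/video/(?P<id>\d+)'

    def _real_extract(self, url):
        return self._extract_video(self._match_id(url))


class ExampleArticleIE(ExampleBaseIE):
    _VALID_URL = r'https?://example\.com/article/(?P<id>[\w-]+)'

    def _real_extract(self, url):
        webpage = self._download_webpage(url, self._match_id(url))
        return self._extract_video(self._search_regex(
            r'data-video-id=["\'](\d+)', webpage, 'video id'))
```
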
a/hypervideo_dl/extractor/cctv.py +++ b/hypervideo_dl/extractor/cctv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -173,8 +170,6 @@ class CCTVIE(InfoExtractor): hls_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) - uploader = data.get('editer_name') description = self._html_search_meta( 'description', webpage, default=None) diff --git a/hypervideo_dl/extractor/cda.py b/hypervideo_dl/extractor/cda.py index 72c4705..d1212e6 100644 --- a/hypervideo_dl/extractor/cda.py +++ b/hypervideo_dl/extractor/cda.py @@ -1,16 +1,13 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import base64 import codecs -import re +import datetime +import hashlib +import hmac import json +import re from .common import InfoExtractor -from ..compat import ( - compat_chr, - compat_ord, - compat_urllib_parse_unquote, -) +from ..compat import compat_ord, compat_urllib_parse_unquote from ..utils import ( ExtractorError, float_or_none, @@ -19,14 +16,27 @@ from ..utils import ( multipart_encode, parse_duration, random_birthday, - urljoin, + traverse_obj, + try_call, try_get, + urljoin, ) class CDAIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)' + _NETRC_MACHINE = 'cdapl' + _BASE_URL = 'http://www.cda.pl/' + _BASE_API_URL = 'https://api.cda.pl' + _API_HEADERS = { + 'Accept': 'application/vnd.cda.public+json', + 'User-Agent': 'pl.cda 1.0 (version 1.2.88 build 15306; Android 9; Xiaomi Redmi 3S)', + } + # hardcoded in the app + _LOGIN_REQUEST_AUTH = 'Basic YzU3YzBlZDUtYTIzOC00MWQwLWI2NjQtNmZmMWMxY2Y2YzVlOklBTm95QlhRRVR6U09MV1hnV3MwMW0xT2VyNWJNZzV4clRNTXhpNGZJUGVGZ0lWUlo5UGVYTDhtUGZaR1U1U3Q' + _BEARER_CACHE = 'cda-bearer' + _TESTS = [{ 'url': 'http://www.cda.pl/video/5749950c', 'md5': '6f844bf51b15f31fae165365707ae970', @@ -90,8 +100,71 @@ class CDAIE(InfoExtractor): 'Content-Type': content_type, }, **kwargs) + def _perform_login(self, username, password): + cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {} + if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5: + self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}' + return + + password_hash = base64.urlsafe_b64encode(hmac.new( + b's01m1Oer5IANoyBXQETzSOLWXgWs01m1Oer5bMg5xrTMMxRZ9Pi4fIPeFgIVRZ9PeXL8mPfXQETZGUAN5StRZ9P', + ''.join(f'{bytes((bt & 255, )).hex():0>2}' + for bt in hashlib.md5(password.encode()).digest()).encode(), + hashlib.sha256).digest()).decode().replace('=', '') + + token_res = self._download_json( + f'{self._BASE_API_URL}/oauth/token', None, 'Logging in', data=b'', + headers={**self._API_HEADERS, 'Authorization': self._LOGIN_REQUEST_AUTH}, + query={ + 'grant_type': 'password', + 'login': username, + 'password': password_hash, + }) + self.cache.store(self._BEARER_CACHE, username, { + 'token': token_res['access_token'], + 'valid_until': token_res['expires_in'] + datetime.datetime.now().timestamp(), + }) + self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}' + def _real_extract(self, url): video_id = self._match_id(url) + + if 'Authorization' in self._API_HEADERS: + return self._api_extract(video_id) + else: + return self._web_extract(video_id, url) + + def _api_extract(self, video_id): + meta = self._download_json( + f'{self._BASE_API_URL}/video/{video_id}', video_id, headers=self._API_HEADERS)['video'] + + if meta.get('premium') and not 
meta.get('premium_free'): + self.report_drm(video_id) + + uploader = traverse_obj(meta, 'author', 'login') + + formats = [{ + 'url': quality['file'], + 'format': quality.get('title'), + 'resolution': quality.get('name'), + 'height': try_call(lambda: int(quality['name'][:-1])), + 'filesize': quality.get('length'), + } for quality in meta['qualities'] if quality.get('file')] + + return { + 'id': video_id, + 'title': meta.get('title'), + 'description': meta.get('description'), + 'uploader': None if uploader == 'anonim' else uploader, + 'average_rating': float_or_none(meta.get('rating')), + 'thumbnail': meta.get('thumb'), + 'formats': formats, + 'duration': meta.get('duration'), + 'age_limit': 18 if meta.get('for_adults') else 0, + 'view_count': meta.get('views'), + } + + def _web_extract(self, video_id, url): self._set_cookie('cda.pl', 'cda.player', 'html5') webpage = self._download_webpage( self._BASE_URL + '/video/' + video_id, video_id) @@ -147,7 +220,7 @@ class CDAIE(InfoExtractor): b = [] for c in a: f = compat_ord(c) - b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f <= 126 else compat_chr(f)) + b.append(chr(33 + (f + 14) % 94) if 33 <= f <= 126 else chr(f)) a = ''.join(b) a = a.replace('.cda.mp4', '') for p in ('.2cda.pl', '.3cda.pl'): @@ -229,6 +302,4 @@ class CDAIE(InfoExtractor): extract_format(webpage, resolution) - self._sort_formats(formats) - return merge_dicts(info_dict, info) diff --git a/hypervideo_dl/extractor/cellebrite.py b/hypervideo_dl/extractor/cellebrite.py new file mode 100644 index 0000000..9896a31 --- /dev/null +++ b/hypervideo_dl/extractor/cellebrite.py @@ -0,0 +1,63 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class CellebriteIE(InfoExtractor): + _VALID_URL = r'https?://cellebrite\.com/(?:\w+)?/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://cellebrite.com/en/collect-data-from-android-devices-with-cellebrite-ufed/', + 'info_dict': { + 'id': '16025876', + 'ext': 'mp4', + 'description': 'md5:174571cb97083fd1d457d75c684f4e2b', + 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2021/05/Chat-Capture-1024x559.png', + 'title': 'Ask the Expert: Chat Capture - Collect Data from Android Devices in Cellebrite UFED', + 'duration': 455, + 'tags': [], + } + }, { + 'url': 'https://cellebrite.com/en/how-to-lawfully-collect-the-maximum-amount-of-data-from-android-devices/', + 'info_dict': { + 'id': '29018255', + 'ext': 'mp4', + 'duration': 134, + 'tags': [], + 'description': 'md5:e9a3d124c7287b0b07bad2547061cacf', + 'thumbnail': 'https://cellebrite.com/wp-content/uploads/2022/07/How-to-Lawfully-Collect-the-Maximum-Amount-of-Data-From-Android-Devices.png', + 'title': 'Android Extractions Explained', + } + }] + + def _get_formats_and_subtitles(self, json_data, display_id): + formats = [{'url': url} for url in traverse_obj(json_data, ('mp4', ..., 'url')) or []] + subtitles = {} + + for url in traverse_obj(json_data, ('hls', ..., 'url')) or []: + fmt, sub = self._extract_m3u8_formats_and_subtitles( + url, display_id, ext='mp4', headers={'Referer': 'https://play.vidyard.com/'}) + formats.extend(fmt) + self._merge_subtitles(sub, target=subtitles) + + return formats, subtitles + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + player_uuid = self._search_regex( + r'<img\s[^>]*\bdata-uuid\s*=\s*"([^"\?]+)', webpage, 'player UUID') + json_data = self._download_json( + f'https://play.vidyard.com/player/{player_uuid}.json', display_id)['payload']['chapters'][0] + + formats, 
subtitles = self._get_formats_and_subtitles(json_data['sources'], display_id) + return { + 'id': str(json_data['videoId']), + 'title': json_data.get('name') or self._og_search_title(webpage), + 'formats': formats, + 'subtitles': subtitles, + 'description': json_data.get('description') or self._og_search_description(webpage), + 'duration': json_data.get('seconds'), + 'tags': json_data.get('tags'), + 'thumbnail': self._og_search_thumbnail(webpage), + 'http_headers': {'Referer': 'https://play.vidyard.com/'}, + } diff --git a/hypervideo_dl/extractor/ceskatelevize.py b/hypervideo_dl/extractor/ceskatelevize.py index ddf66b2..be2b0bb 100644 --- a/hypervideo_dl/extractor/ceskatelevize.py +++ b/hypervideo_dl/extractor/ceskatelevize.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -12,6 +9,7 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, + str_or_none, traverse_obj, urlencode_postdata, USER_AGENTS, @@ -19,13 +17,13 @@ from ..utils import ( class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(?:ivysilani|porady|zive)/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' _TESTS = [{ 'url': 'http://www.ceskatelevize.cz/ivysilani/10441294653-hyde-park-civilizace/215411058090502/bonus/20641-bonus-01-en', 'info_dict': { 'id': '61924494877028507', 'ext': 'mp4', - 'title': 'Hyde Park Civilizace: Bonus 01 - En', + 'title': 'Bonus 01 - En - Hyde Park Civilizace', 'description': 'English Subtittles', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 81.3, @@ -36,18 +34,29 @@ class CeskaTelevizeIE(InfoExtractor): }, }, { # live stream - 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'url': 'http://www.ceskatelevize.cz/zive/ct1/', 'info_dict': { - 'id': 402, + 'id': '102', 'ext': 'mp4', - 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'title': r'ČT1 - živé vysílání online', + 'description': 'Sledujte živé vysílání kanálu ČT1 online. Vybírat si můžete i z dalších kanálů České televize na kterémkoli z vašich zařízení.', 'is_live': True, }, 'params': { # m3u8 download 'skip_download': True, }, - 'skip': 'Georestricted to Czech Republic', + }, { + # another + 'url': 'http://www.ceskatelevize.cz/ivysilani/zive/ct4/', + 'only_matching': True, + 'info_dict': { + 'id': 402, + 'ext': 'mp4', + 'title': r're:^ČT Sport \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'is_live': True, + }, + # 'skip': 'Georestricted to Czech Republic', }, { 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', 'only_matching': True, @@ -56,21 +65,21 @@ class CeskaTelevizeIE(InfoExtractor): 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', 'info_dict': { 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti. Připravil Peter Serge Butko', + 'title': 'Bogotart - Queer', + 'description': 'Hlavní město Kolumbie v doprovodu queer umělců. 
Vroucí svět plný vášně, sebevědomí, ale i násilí a bolesti', }, 'playlist': [{ 'info_dict': { 'id': '61924494877311053', 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', + 'title': 'Bogotart - Queer (Varování 18+)', 'duration': 11.9, }, }, { 'info_dict': { 'id': '61924494877068022', 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', + 'title': 'Bogotart - Queer (Queer)', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 1558.3, }, @@ -87,28 +96,42 @@ class CeskaTelevizeIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - parsed_url = compat_urllib_parse_urlparse(url) - webpage = self._download_webpage(url, playlist_id) - site_name = self._og_search_property('site_name', webpage, fatal=False, default=None) + webpage, urlh = self._download_webpage_handle(url, playlist_id) + parsed_url = compat_urllib_parse_urlparse(urlh.geturl()) + site_name = self._og_search_property('site_name', webpage, fatal=False, default='Česká televize') playlist_title = self._og_search_title(webpage, default=None) if site_name and playlist_title: - playlist_title = playlist_title.replace(f' — {site_name}', '', 1) + playlist_title = re.split(r'\s*[—|]\s*%s' % (site_name, ), playlist_title, 1)[0] playlist_description = self._og_search_description(webpage, default=None) if playlist_description: playlist_description = playlist_description.replace('\xa0', ' ') - if parsed_url.path.startswith('/porady/'): + type_ = 'IDEC' + if re.search(r'(^/porady|/zive)/', parsed_url.path): next_data = self._search_nextjs_data(webpage, playlist_id) - idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) + if '/zive/' in parsed_url.path: + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'liveBroadcast', 'current', 'idec'), get_all=False) + else: + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', ('show', 'mediaMeta'), 'idec'), get_all=False) + if not idec: + idec = traverse_obj(next_data, ('props', 'pageProps', 'data', 'videobonusDetail', 'bonusId'), get_all=False) + if idec: + type_ = 'bonus' if not idec: raise ExtractorError('Failed to find IDEC id') - iframe_hash = self._download_webpage('https://www.ceskatelevize.cz/v-api/iframe-hash/', playlist_id) - webpage = self._download_webpage('https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', playlist_id, - query={'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', 'IDEC': idec}) + iframe_hash = self._download_webpage( + 'https://www.ceskatelevize.cz/v-api/iframe-hash/', + playlist_id, note='Getting IFRAME hash') + query = {'hash': iframe_hash, 'origin': 'iVysilani', 'autoStart': 'true', type_: idec, } + webpage = self._download_webpage( + 'https://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php', + playlist_id, note='Downloading player', query=query) NOT_AVAILABLE_STRING = 'This content is not available at your territory due to limited copyright.' 
if '%s</p>' % NOT_AVAILABLE_STRING in webpage: - raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) + self.raise_geo_restricted(NOT_AVAILABLE_STRING) + if any(not_found in webpage for not_found in ('Neplatný parametr pro videopřehrávač', 'IDEC nebyl nalezen', )): + raise ExtractorError('no video with IDEC available', video_id=idec, expected=True) type_ = None episode_id = None @@ -177,7 +200,6 @@ class CeskaTelevizeIE(InfoExtractor): is_live = item.get('type') == 'LIVE' formats = [] for format_id, stream_url in item.get('streamUrls', {}).items(): - stream_url = stream_url.replace('https://', 'http://') if 'playerType=flash' in stream_url: stream_formats = self._extract_m3u8_formats( stream_url, playlist_id, 'mp4', 'm3u8_native', @@ -199,7 +221,7 @@ class CeskaTelevizeIE(InfoExtractor): entries[num]['formats'].extend(formats) continue - item_id = item.get('id') or item['assetId'] + item_id = str_or_none(item.get('id') or item['assetId']) title = item['title'] duration = float_or_none(item.get('duration')) @@ -227,9 +249,8 @@ class CeskaTelevizeIE(InfoExtractor): 'is_live': is_live, }) - for e in entries: - self._sort_formats(e['formats']) - + if len(entries) == 1: + return entries[0] return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) def _get_subtitles(self, episode_id, subs): diff --git a/hypervideo_dl/extractor/cgtn.py b/hypervideo_dl/extractor/cgtn.py index 89f1738..aaafa02 100644 --- a/hypervideo_dl/extractor/cgtn.py +++ b/hypervideo_dl/extractor/cgtn.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( try_get, diff --git a/hypervideo_dl/extractor/channel9.py b/hypervideo_dl/extractor/channel9.py index 90024db..a884740 100644 --- a/hypervideo_dl/extractor/channel9.py +++ b/hypervideo_dl/extractor/channel9.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -16,6 +14,7 @@ class Channel9IE(InfoExtractor): IE_DESC = 'Channel 9' IE_NAME = 'channel9' _VALID_URL = r'https?://(?:www\.)?(?:channel9\.msdn\.com|s\.ch9\.ms)/(?P<contentpath>.+?)(?P<rss>/RSS)?/?(?:[?#&]|$)' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b'] _TESTS = [{ 'url': 'http://channel9.msdn.com/Events/TechEd/Australia/2013/KOS002', @@ -80,12 +79,6 @@ class Channel9IE(InfoExtractor): _RSS_URL = 'http://channel9.msdn.com/%s/RSS' - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\'](https?://channel9\.msdn\.com/(?:[^/]+/)+)player\b', - webpage) - def _extract_list(self, video_id, rss_url=None): if not rss_url: rss_url = self._RSS_URL % video_id @@ -192,7 +185,6 @@ class Channel9IE(InfoExtractor): if not formats and not slides and not zip_file: self.raise_no_formats( 'None of recording, slides or zip are available for %s' % content_path) - self._sort_formats(formats) subtitles = {} for caption in content_data.get('Captions', []): diff --git a/hypervideo_dl/extractor/charlierose.py b/hypervideo_dl/extractor/charlierose.py index 42c9af2..8fe6797 100644 --- a/hypervideo_dl/extractor/charlierose.py +++ b/hypervideo_dl/extractor/charlierose.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import remove_end @@ -40,8 +38,6 @@ class CharlieRoseIE(InfoExtractor): info_dict = self._parse_html5_media_entries( self._PLAYER_BASE % video_id, webpage, video_id, m3u8_entry_protocol='m3u8_native')[0] - - 
self._sort_formats(info_dict['formats']) self._remove_duplicate_formats(info_dict['formats']) info_dict.update({ diff --git a/hypervideo_dl/extractor/chaturbate.py b/hypervideo_dl/extractor/chaturbate.py index 8da51f9..99dfcfd 100644 --- a/hypervideo_dl/extractor/chaturbate.py +++ b/hypervideo_dl/extractor/chaturbate.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -97,7 +95,6 @@ class ChaturbateIE(InfoExtractor): # ffmpeg skips segments for fast m3u8 preference=-10 if m3u8_id == 'fast' else None, m3u8_id=m3u8_id, fatal=False, live=True)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/chilloutzone.py b/hypervideo_dl/extractor/chilloutzone.py index fd5202b..1a2f77c 100644 --- a/hypervideo_dl/extractor/chilloutzone.py +++ b/hypervideo_dl/extractor/chilloutzone.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/chingari.py b/hypervideo_dl/extractor/chingari.py index e6841fb..48091dd 100644 --- a/hypervideo_dl/extractor/chingari.py +++ b/hypervideo_dl/extractor/chingari.py @@ -1,14 +1,11 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import json +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus from ..utils import ( - clean_html, ExtractorError, + clean_html, int_or_none, str_to_int, url_or_none, @@ -35,7 +32,6 @@ class ChingariBaseIE(InfoExtractor): 'url': base_url + '/apipublic' + media_data['path'], 'quality': 10, }) - self._sort_formats(formats) timestamp = str_to_int(post_data.get('created_at')) if timestamp: timestamp = int_or_none(timestamp, 1000) @@ -48,8 +44,10 @@ class ChingariBaseIE(InfoExtractor): return { 'id': id, - 'title': compat_urllib_parse_unquote_plus(clean_html(post_data.get('caption'))), - 'description': compat_urllib_parse_unquote_plus(clean_html(post_data.get('caption'))), + 'extractor_key': ChingariIE.ie_key(), + 'extractor': 'Chingari', + 'title': urllib.parse.unquote_plus(clean_html(post_data.get('caption'))), + 'description': urllib.parse.unquote_plus(clean_html(post_data.get('caption'))), 'duration': media_data.get('duration'), 'thumbnail': url_or_none(thumbnail), 'like_count': post_data.get('likeCount'), @@ -105,11 +103,11 @@ class ChingariUserIE(ChingariBaseIE): _VALID_URL = r'https?://(?:www\.)?chingari\.io/(?!share/post)(?P<id>[^/?]+)' _TESTS = [{ 'url': 'https://chingari.io/dada1023', - 'playlist_mincount': 3, 'info_dict': { 'id': 'dada1023', }, - 'entries': [{ + 'params': {'playlistend': 3}, + 'playlist': [{ 'url': 'https://chingari.io/share/post?id=614781f3ade60b3a0bfff42a', 'info_dict': { 'id': '614781f3ade60b3a0bfff42a', diff --git a/hypervideo_dl/extractor/chirbit.py b/hypervideo_dl/extractor/chirbit.py index 8d75cdf..452711d 100644 --- a/hypervideo_dl/extractor/chirbit.py +++ b/hypervideo_dl/extractor/chirbit.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/cinchcast.py b/hypervideo_dl/extractor/cinchcast.py index b861d54..7a7ea8b 100644 --- a/hypervideo_dl/extractor/cinchcast.py +++ b/hypervideo_dl/extractor/cinchcast.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( unified_strdate, @@ -10,6 +7,8 @@ from ..utils import ( class CinchcastIE(InfoExtractor): _VALID_URL = 
r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1'] + _TESTS = [{ 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single', 'info_dict': { @@ -48,7 +47,6 @@ class CinchcastIE(InfoExtractor): 'format_id': 'backup', 'url': backup_url, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/cinemax.py b/hypervideo_dl/extractor/cinemax.py index 2c3ff8d..54cab22 100644 --- a/hypervideo_dl/extractor/cinemax.py +++ b/hypervideo_dl/extractor/cinemax.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .hbo import HBOBaseIE diff --git a/hypervideo_dl/extractor/cinetecamilano.py b/hypervideo_dl/extractor/cinetecamilano.py new file mode 100644 index 0000000..5e770eb --- /dev/null +++ b/hypervideo_dl/extractor/cinetecamilano.py @@ -0,0 +1,61 @@ +import json +import urllib.error +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + parse_iso8601, + strip_or_none, + traverse_obj, + try_get, + urljoin, +) + + +class CinetecaMilanoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?cinetecamilano\.it/film/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.cinetecamilano.it/film/1942', + 'info_dict': { + 'id': '1942', + 'ext': 'mp4', + 'title': 'Il draghetto Gris\u00f9 (4 episodi)', + 'release_date': '20220129', + 'thumbnail': r're:.+\.png', + 'description': 'md5:5328cbe080b93224712b6f17fcaf2c01', + 'modified_date': '20200520', + 'duration': 3139, + 'release_timestamp': 1643446208, + 'modified_timestamp': int + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + try: + film_json = self._download_json( + f'https://www.cinetecamilano.it/api/catalogo/{video_id}/?', + video_id, headers={ + 'Referer': url, + 'Authorization': try_get(self._get_cookies('https://www.cinetecamilano.it'), lambda x: f'Bearer {x["cnt-token"].value}') or '' + }) + except ExtractorError as e: + if ((isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 500) + or isinstance(e.cause, json.JSONDecodeError)): + self.raise_login_required(method='cookies') + raise + if not film_json.get('success') or not film_json.get('archive'): + raise ExtractorError('Video information not found') + archive = film_json['archive'] + + return { + 'id': video_id, + 'title': archive.get('title'), + 'description': strip_or_none(archive.get('description')), + 'duration': float_or_none(archive.get('duration'), invscale=60), + 'release_timestamp': parse_iso8601(archive.get('updated_at'), delimiter=' '), + 'modified_timestamp': parse_iso8601(archive.get('created_at'), delimiter=' '), + 'thumbnail': urljoin(url, try_get(archive, lambda x: x['thumb']['src'].replace('/public/', '/storage/'))), + 'formats': self._extract_m3u8_formats( + urljoin(url, traverse_obj(archive, ('drm', 'hls'))), video_id, 'mp4') + } diff --git a/hypervideo_dl/extractor/ciscolive.py b/hypervideo_dl/extractor/ciscolive.py index 349c5eb..0668578 100644 --- a/hypervideo_dl/extractor/ciscolive.py +++ b/hypervideo_dl/extractor/ciscolive.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/ciscowebex.py b/hypervideo_dl/extractor/ciscowebex.py index 882dae9..44595d8 100644 --- a/hypervideo_dl/extractor/ciscowebex.py +++ b/hypervideo_dl/extractor/ciscowebex.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from 
__future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -75,7 +72,6 @@ class CiscoWebexIE(InfoExtractor): 'vcodec': 'none', 'acodec': 'mp3', }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/cjsw.py b/hypervideo_dl/extractor/cjsw.py index 1dea0d7..c37a3b8 100644 --- a/hypervideo_dl/extractor/cjsw.py +++ b/hypervideo_dl/extractor/cjsw.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( determine_ext, diff --git a/hypervideo_dl/extractor/cliphunter.py b/hypervideo_dl/extractor/cliphunter.py index f2ca7a3..2b907dc 100644 --- a/hypervideo_dl/extractor/cliphunter.py +++ b/hypervideo_dl/extractor/cliphunter.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -64,7 +62,6 @@ class CliphunterIE(InfoExtractor): 'height': int_or_none(height), 'tbr': int_or_none(f.get('br')), }) - self._sort_formats(formats) thumbnail = self._search_regex( r"var\s+mov_thumb\s*=\s*'([^']+)';", diff --git a/hypervideo_dl/extractor/clippit.py b/hypervideo_dl/extractor/clippit.py index a1a7a77..006a713 100644 --- a/hypervideo_dl/extractor/clippit.py +++ b/hypervideo_dl/extractor/clippit.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_iso8601, diff --git a/hypervideo_dl/extractor/cliprs.py b/hypervideo_dl/extractor/cliprs.py index d55b26d..567f77b 100644 --- a/hypervideo_dl/extractor/cliprs.py +++ b/hypervideo_dl/extractor/cliprs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .onet import OnetBaseIE diff --git a/hypervideo_dl/extractor/clipsyndicate.py b/hypervideo_dl/extractor/clipsyndicate.py index 6cdb42f..6064443 100644 --- a/hypervideo_dl/extractor/clipsyndicate.py +++ b/hypervideo_dl/extractor/clipsyndicate.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( find_xpath_attr, diff --git a/hypervideo_dl/extractor/closertotruth.py b/hypervideo_dl/extractor/closertotruth.py index 517e121..e78e26a 100644 --- a/hypervideo_dl/extractor/closertotruth.py +++ b/hypervideo_dl/extractor/closertotruth.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/cloudflarestream.py b/hypervideo_dl/extractor/cloudflarestream.py index 2fdcfbb..748e8e9 100644 --- a/hypervideo_dl/extractor/cloudflarestream.py +++ b/hypervideo_dl/extractor/cloudflarestream.py @@ -1,8 +1,4 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 -import re from .common import InfoExtractor @@ -19,6 +15,7 @@ class CloudflareStreamIE(InfoExtractor): ) (?P<id>%s) ''' % (_DOMAIN_RE, _EMBED_RE, _ID_RE) + _EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//{_EMBED_RE}(?:{_ID_RE}).*?)\1'] _TESTS = [{ 'url': 'https://embed.cloudflarestream.com/embed/we4g.fla9.latest.js?video=31c9291ab41fac05471db4e73aa11717', 'info_dict': { @@ -40,21 +37,13 @@ class CloudflareStreamIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//%s(?:%s).*?)\1' % (CloudflareStreamIE._EMBED_RE, CloudflareStreamIE._ID_RE), - webpage)] - def _real_extract(self, 
url): video_id = self._match_id(url) domain = 'bytehighway.net' if 'bytehighway.net/' in url else 'videodelivery.net' base_url = 'https://%s/%s/' % (domain, video_id) if '.' in video_id: video_id = self._parse_json(base64.urlsafe_b64decode( - video_id.split('.')[1]), video_id)['sub'] + video_id.split('.')[1] + '==='), video_id)['sub'] manifest_base_url = base_url + 'manifest/video.' formats = self._extract_m3u8_formats( @@ -62,7 +51,6 @@ class CloudflareStreamIE(InfoExtractor): 'm3u8_native', m3u8_id='hls', fatal=False) formats.extend(self._extract_mpd_formats( manifest_base_url + 'mpd', video_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/cloudy.py b/hypervideo_dl/extractor/cloudy.py index 85ca20e..848643e 100644 --- a/hypervideo_dl/extractor/cloudy.py +++ b/hypervideo_dl/extractor/cloudy.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( str_to_int, diff --git a/hypervideo_dl/extractor/clubic.py b/hypervideo_dl/extractor/clubic.py index 98f9cb5..403e44a 100644 --- a/hypervideo_dl/extractor/clubic.py +++ b/hypervideo_dl/extractor/clubic.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, @@ -45,7 +42,6 @@ class ClubicIE(InfoExtractor): 'url': src['src'], 'quality': quality_order(src['streamQuality']), } for src in sources] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/clyp.py b/hypervideo_dl/extractor/clyp.py index e6b2ac4..0aaf73d 100644 --- a/hypervideo_dl/extractor/clyp.py +++ b/hypervideo_dl/extractor/clyp.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -62,7 +60,6 @@ class ClypIE(InfoExtractor): 'format_id': format_id, 'vcodec': 'none', }) - self._sort_formats(formats) title = metadata['Title'] description = metadata.get('Description') diff --git a/hypervideo_dl/extractor/cmt.py b/hypervideo_dl/extractor/cmt.py index a4ddb91..8aed770 100644 --- a/hypervideo_dl/extractor/cmt.py +++ b/hypervideo_dl/extractor/cmt.py @@ -1,11 +1,9 @@ -from __future__ import unicode_literals - from .mtv import MTVIE # TODO Remove - Reason: Outdated Site -class CMTIE(MTVIE): +class CMTIE(MTVIE): # XXX: Do not subclass from concrete IE IE_NAME = 'cmt.com' _VALID_URL = r'https?://(?:www\.)?cmt\.com/(?:videos|shows|(?:full-)?episodes|video-clips)/(?P<id>[^/]+)' diff --git a/hypervideo_dl/extractor/cnbc.py b/hypervideo_dl/extractor/cnbc.py index da3730c..68fd025 100644 --- a/hypervideo_dl/extractor/cnbc.py +++ b/hypervideo_dl/extractor/cnbc.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import smuggle_url diff --git a/hypervideo_dl/extractor/cnn.py b/hypervideo_dl/extractor/cnn.py index af11d95..61b62fa 100644 --- a/hypervideo_dl/extractor/cnn.py +++ b/hypervideo_dl/extractor/cnn.py @@ -1,9 +1,6 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from .turner import TurnerBaseIE -from ..utils import url_basename +from ..utils import merge_dicts, try_call, url_basename class CNNIE(TurnerBaseIE): @@ -144,3 +141,58 @@ class CNNArticleIE(InfoExtractor): webpage = self._download_webpage(url, url_basename(url)) cnn_url = self._html_search_regex(r"video:\s*'([^']+)'", webpage, 'cnn url') return 
self.url_result('http://cnn.com/video/?/video/' + cnn_url, CNNIE.ie_key()) + + +class CNNIndonesiaIE(InfoExtractor): + _VALID_URL = r'https?://www\.cnnindonesia\.com/[\w-]+/(?P<upload_date>\d{8})\d+-\d+-(?P<id>\d+)/(?P<display_id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.cnnindonesia.com/ekonomi/20220909212635-89-845885/alasan-harga-bbm-di-indonesia-masih-disubsidi', + 'info_dict': { + 'id': '845885', + 'ext': 'mp4', + 'description': 'md5:e7954bfa6f1749bc9ef0c079a719c347', + 'upload_date': '20220909', + 'title': 'Alasan Harga BBM di Indonesia Masih Disubsidi', + 'timestamp': 1662859088, + 'duration': 120.0, + 'thumbnail': r're:https://akcdn\.detik\.net\.id/visual/2022/09/09/thumbnail-ekopedia-alasan-harga-bbm-disubsidi_169\.jpeg', + 'tags': ['ekopedia', 'subsidi bbm', 'subsidi', 'bbm', 'bbm subsidi', 'harga pertalite naik'], + 'age_limit': 0, + 'release_timestamp': 1662859088, + 'release_date': '20220911', + 'uploader': 'Asfahan Yahsyi', + } + }, { + 'url': 'https://www.cnnindonesia.com/internasional/20220911104341-139-846189/video-momen-charles-disambut-meriah-usai-dilantik-jadi-raja-inggris', + 'info_dict': { + 'id': '846189', + 'ext': 'mp4', + 'upload_date': '20220911', + 'duration': 76.0, + 'timestamp': 1662869995, + 'description': 'md5:ece7b003b3ee7d81c6a5cfede7d5397d', + 'thumbnail': r're:https://akcdn\.detik\.net\.id/visual/2022/09/11/thumbnail-video-1_169\.jpeg', + 'title': 'VIDEO: Momen Charles Disambut Meriah usai Dilantik jadi Raja Inggris', + 'tags': ['raja charles', 'raja charles iii', 'ratu elizabeth', 'ratu elizabeth meninggal dunia', 'raja inggris', 'inggris'], + 'age_limit': 0, + 'release_date': '20220911', + 'uploader': 'REUTERS', + 'release_timestamp': 1662869995, + } + }] + + def _real_extract(self, url): + upload_date, video_id, display_id = self._match_valid_url(url).group('upload_date', 'id', 'display_id') + webpage = self._download_webpage(url, display_id) + + json_ld_list = list(self._yield_json_ld(webpage, display_id)) + json_ld_data = self._json_ld(json_ld_list, display_id) + embed_url = next( + json_ld.get('embedUrl') for json_ld in json_ld_list if json_ld.get('@type') == 'VideoObject') + + return merge_dicts(json_ld_data, { + '_type': 'url_transparent', + 'url': embed_url, + 'upload_date': upload_date, + 'tags': try_call(lambda: self._html_search_meta('keywords', webpage).split(', ')) + }) diff --git a/hypervideo_dl/extractor/comedycentral.py b/hypervideo_dl/extractor/comedycentral.py index 5a12ab5..05fc9f2 100644 --- a/hypervideo_dl/extractor/comedycentral.py +++ b/hypervideo_dl/extractor/comedycentral.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .mtv import MTVServicesInfoExtractor diff --git a/hypervideo_dl/extractor/common.py b/hypervideo_dl/extractor/common.py index 0035191..4b56307 100644 --- a/hypervideo_dl/extractor/common.py +++ b/hypervideo_dl/extractor/common.py @@ -1,67 +1,61 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import collections +import getpass import hashlib +import http.client +import http.cookiejar +import http.cookies +import inspect import itertools import json +import math import netrc import os import random import re import sys import time -import math - -from ..compat import ( - compat_cookiejar_Cookie, - compat_cookies_SimpleCookie, - compat_etree_Element, - compat_etree_fromstring, - compat_expanduser, - compat_getpass, - compat_http_client, - compat_os_name, - compat_str, - compat_urllib_error, - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, - 
compat_urllib_request, - compat_urlparse, - compat_xml_parse_error, -) -from ..downloader import FileDownloader -from ..downloader.f4m import ( - get_base_url, - remove_encrypted_media, -) +import types +import urllib.parse +import urllib.request +import xml.etree.ElementTree + +from ..compat import functools # isort: split +from ..compat import compat_etree_fromstring, compat_expanduser, compat_os_name +from ..cookies import LenientSimpleCookie +from ..downloader.f4m import get_base_url, remove_encrypted_media from ..utils import ( + IDENTITY, + JSON_LD_RE, + NO_DEFAULT, + ExtractorError, + FormatSorter, + GeoRestrictedError, + GeoUtils, + LenientJSONDecoder, + RegexNotFoundError, + RetryManager, + UnsupportedError, age_restricted, base_url, bug_reports_message, + classproperty, clean_html, - compiled_regex_type, + deprecation_warning, determine_ext, - determine_protocol, dict_get, encode_data_uri, error_to_compat_str, extract_attributes, - ExtractorError, filter_dict, fix_xml_ampersands, float_or_none, format_field, - GeoRestrictedError, - GeoUtils, int_or_none, join_nonempty, js_to_json, - JSON_LD_RE, mimetype2ext, network_exceptions, - NO_DEFAULT, orderedSet, parse_bitrate, parse_codecs, @@ -69,16 +63,17 @@ from ..utils import ( parse_iso8601, parse_m3u8_attributes, parse_resolution, - RegexNotFoundError, sanitize_filename, + sanitize_url, sanitized_Request, + smuggle_url, str_or_none, str_to_int, strip_or_none, traverse_obj, + try_call, try_get, unescapeHTML, - UnsupportedError, unified_strdate, unified_timestamp, update_Request, @@ -93,7 +88,7 @@ from ..utils import ( ) -class InfoExtractor(object): +class InfoExtractor: """Information Extractor class. Information extractors are the classes that, given a URL, extract @@ -111,7 +106,9 @@ class InfoExtractor(object): For a video, the dictionaries must include the following fields: id: Video identifier. - title: Video title, unescaped. + title: Video title, unescaped. Set to an empty string if video has + no title as opposed to "None" which signifies that the + extractor failed to obtain a title Additionally, it must contain either a formats entry or a url one: @@ -153,13 +150,17 @@ class InfoExtractor(object): ("3D" or "DASH video") * width Width of the video, if known * height Height of the video, if known + * aspect_ratio Aspect ratio of the video, if known + Automatically calculated from width and height * resolution Textual description of width and height + Automatically calculated from width and height * dynamic_range The dynamic range of the video. One of: "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV" * tbr Average bitrate of audio and video in KBit/s * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use * asr Audio sampling rate in Hertz + * audio_channels Number of audio channels * vbr Average video bitrate in KBit/s * fps Frame rate * vcodec Name of the video codec in use @@ -216,8 +217,10 @@ class InfoExtractor(object): * no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. * has_drm The format has DRM and cannot be downloaded. 
Boolean - * downloader_options A dictionary of downloader options as - described in FileDownloader (For internal use only) + * downloader_options A dictionary of downloader options + (For internal use only) + * http_chunk_size Chunk size for HTTP downloads + * ffmpeg_args Extra arguments for ffmpeg downloader RTMP formats can also have the additional fields: page_url, app, play_path, tc_url, flash_version, rtmp_live, rtmp_conn, rtmp_protocol, rtmp_real_time @@ -285,6 +288,7 @@ class InfoExtractor(object): captions instead of normal subtitles duration: Length of the video in seconds, as an integer or float. view_count: How many users have watched the video on the platform. + concurrent_view_count: How many users are currently watching the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video repost_count: Number of reposts of the video @@ -320,7 +324,8 @@ class InfoExtractor(object): live stream that goes on instead of a fixed-length video. was_live: True, False, or None (=unknown). Whether this video was originally a live stream. - live_status: 'is_live', 'is_upcoming', 'was_live', 'not_live' or None (=unknown) + live_status: None (=unknown), 'is_live', 'is_upcoming', 'was_live', 'not_live', + or 'post_live' (was live, but VOD is not yet processed) If absent, automatically set from is_live, was_live start_time: Time in seconds where the reproduction should start, as specified in the URL. @@ -333,11 +338,13 @@ class InfoExtractor(object): playable_in_embed: Whether this video is allowed to play in embedded players on other sites. Can be True (=always allowed), False (=never allowed), None (=unknown), or a string - specifying the criteria for embedability (Eg: 'whitelist') + specifying the criteria for embedability; e.g. 'whitelist' availability: Under what condition the video is available. One of 'private', 'premium_only', 'subscriber_only', 'needs_auth', 'unlisted' or 'public'. Use 'InfoExtractor._availability' to set it + _old_archive_ids: A list of old archive ids needed for backward compatibility + _format_sort_fields: A list of fields to use for sorting formats __post_extractor: A function to be called just before the metadata is written to either disk, logger or console. The function must return a dict which will be added to the info_dict. @@ -387,6 +394,15 @@ class InfoExtractor(object): release_year: Year (YYYY) when the album was released. composer: Composer of the piece + The following fields should only be set for clips that should be cut from the original video: + + section_start: Start time of the section in seconds + section_end: End time of the section in seconds + + The following fields should only be set for storyboards: + rows: Number of rows in each storyboard fragment, as an integer + columns: Number of columns in each storyboard fragment, as an integer + Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. @@ -396,7 +412,7 @@ class InfoExtractor(object): There must be a key "entries", which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. - Additionally, playlists can have "id", "title", and any other relevent + Additionally, playlists can have "id", "title", and any other relevant attributes with the same semantics as videos (see above). 
It can also have the following optional fields: @@ -429,14 +445,26 @@ class InfoExtractor(object): title, description etc. - Subclasses of this should define a _VALID_URL regexp and, re-define the - _real_extract() and (optionally) _real_initialize() methods. - Probably, they should also be added to the list of extractors. + Subclasses of this should also be added to the list of extractors and + should define a _VALID_URL regexp and, re-define the _real_extract() and + (optionally) _real_initialize() methods. Subclasses may also override suitable() if necessary, but ensure the function signature is preserved and that this function imports everything it needs (except other extractors), so that lazy_extractors works correctly. + Subclasses can define a list of _EMBED_REGEX, which will be searched for in + the HTML of Generic webpages. It may also override _extract_embed_urls + or _extract_from_webpage as necessary. While these are normally classmethods, + _extract_from_webpage is allowed to be an instance method. + + _extract_from_webpage may raise self.StopExtraction() to stop further + processing of the webpage and obtain exclusive rights to it. This is useful + when the extractor cannot reliably be matched using just the URL, + e.g. invidious/peertube instances + + Embed-only extractors can be defined by setting _VALID_URL = False. + To support username + password (or netrc) login, the extractor must define a _NETRC_MACHINE and re-define _perform_login(username, password) and (optionally) _initialize_pre_login() methods. The _perform_login method will @@ -460,6 +488,9 @@ class InfoExtractor(object): will be used by geo restriction bypass mechanism similarly to _GEO_COUNTRIES. + The _ENABLED attribute should be set to False for IEs that + are disabled by default and must be explicitly enabled. + The _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ @@ -471,16 +502,23 @@ class InfoExtractor(object): _GEO_COUNTRIES = None _GEO_IP_BLOCKS = None _WORKING = True + _ENABLED = True _NETRC_MACHINE = None IE_DESC = None + SEARCH_KEY = None + _VALID_URL = None + _EMBED_REGEX = [] - _LOGIN_HINTS = { - 'any': 'Use --cookies, --cookies-from-browser, --username and --password, or --netrc to provide account credentials', - 'cookies': ( - 'Use --cookies-from-browser or --cookies for the authentication. ' - 'See https://github.com/ytdl-org/youtube-dl#how-do-i-pass-cookies-to-youtube-dl for how to manually pass cookies'), - 'password': 'Use --username and --password, or --netrc to provide account credentials', - } + def _login_hint(self, method=NO_DEFAULT, netrc=None): + password_hint = f'--username and --password, or --netrc ({netrc or self._NETRC_MACHINE}) to provide account credentials' + return { + None: '', + 'any': f'Use --cookies, --cookies-from-browser, {password_hint}', + 'password': f'Use {password_hint}', + 'cookies': ( + 'Use --cookies-from-browser or --cookies for the authentication. ' + 'See https://github.com/hypervideo/hypervideo/wiki/FAQ#how-do-i-pass-cookies-to-hypervideo for how to manually pass cookies'), + }[method if method is not NO_DEFAULT else 'any' if self.supports_login() else 'cookies'] def __init__(self, downloader=None): """Constructor. Receives an optional downloader (a YoutubeDL instance). 
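[Editor's note] Several hunks above (Channel9, Cinchcast, CloudflareStream) replace a per-extractor _extract_urls() staticmethod with the declarative _EMBED_REGEX list that this new docstring describes. Below is a minimal sketch of that convention, not part of the diff: ExampleIE, its player domain, and the HTML snippet are hypothetical, and the real InfoExtractor._extract_embed_urls additionally resolves relative URLs and unescapes HTML entities, which is omitted here.

import re

class ExampleIE:
    # Each pattern must capture the embed URL in a named group called 'url';
    # the Generic extractor scans page HTML with these patterns.
    _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://player\.example\.com/[^"\']+)']

    @classmethod
    def _extract_embed_urls(cls, url, webpage):
        # Simplified stand-in for the base-class implementation: yield the
        # named 'url' group of every match of every declared pattern.
        for pattern in cls._EMBED_REGEX:
            for mobj in re.finditer(pattern, webpage):
                yield mobj.group('url')

webpage = '<iframe src="https://player.example.com/v/123"></iframe>'
print(list(ExampleIE._extract_embed_urls('https://example.com/page', webpage)))
# -> ['https://player.example.com/v/123']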
@@ -493,12 +531,12 @@ class InfoExtractor(object): @classmethod def _match_valid_url(cls, url): + if cls._VALID_URL is False: + return None # This does not use has/getattr intentionally - we want to know whether # we have cached the regexp for *this* class, whereas getattr would also # match the superclass if '_VALID_URL_RE' not in cls.__dict__: - if '_VALID_URL' not in cls.__dict__: - cls._VALID_URL = cls._make_valid_url() cls._VALID_URL_RE = re.compile(cls._VALID_URL) return cls._VALID_URL_RE.match(url) @@ -543,7 +581,7 @@ class InfoExtractor(object): if username: self._perform_login(username, password) elif self.get_param('username') and False not in (self.IE_DESC, self._NETRC_MACHINE): - self.report_warning(f'Login with password is not supported for this website. {self._LOGIN_HINTS["cookies"]}') + self.report_warning(f'Login with password is not supported for this website. {self._login_hint("cookies")}') self._real_initialize() self._ready = True @@ -609,8 +647,7 @@ class InfoExtractor(object): if ip_block: self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block) - self._downloader.write_debug( - '[debug] Using fake IP %s as X-Forwarded-For' % self._x_forwarded_for_ip) + self.write_debug(f'Using fake IP {self._x_forwarded_for_ip} as X-Forwarded-For') return # Path 2: bypassing based on country code @@ -629,7 +666,7 @@ class InfoExtractor(object): if country: self._x_forwarded_for_ip = GeoUtils.random_ipv4(country) self._downloader.write_debug( - 'Using fake IP %s (%s) as X-Forwarded-For' % (self._x_forwarded_for_ip, country.upper())) + f'Using fake IP {self._x_forwarded_for_ip} ({country.upper()}) as X-Forwarded-For') def extract(self, url): """Extracts URL information and returns it in list of dicts.""" @@ -643,10 +680,10 @@ class InfoExtractor(object): return None if self._x_forwarded_for_ip: ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip - subtitles = ie_result.get('subtitles') - if (subtitles and 'live_chat' in subtitles - and 'no-live-chat' in self.get_param('compat_opts', [])): - del subtitles['live_chat'] + subtitles = ie_result.get('subtitles') or {} + if 'no-live-chat' in self.get_param('compat_opts'): + for lang in ('live_chat', 'comments', 'danmaku'): + subtitles.pop(lang, None) return ie_result except GeoRestrictedError as e: if self.__maybe_fake_ip_and_retry(e.countries): @@ -655,17 +692,11 @@ class InfoExtractor(object): except UnsupportedError: raise except ExtractorError as e: - kwargs = { - 'video_id': e.video_id or self.get_temp_id(url), - 'ie': self.IE_NAME, - 'tb': e.traceback or sys.exc_info()[2], - 'expected': e.expected, - 'cause': e.cause - } - if hasattr(e, 'countries'): - kwargs['countries'] = e.countries - raise type(e)(e.orig_msg, **kwargs) - except compat_http_client.IncompleteRead as e: + e.video_id = e.video_id or self.get_temp_id(url), + e.ie = e.ie or self.IE_NAME, + e.traceback = e.traceback or sys.exc_info()[2] + raise + except http.client.IncompleteRead as e: raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url)) except (KeyError, StopIteration) as e: raise ExtractorError('An extractor error has occurred.', cause=e, video_id=self.get_temp_id(url)) @@ -689,8 +720,16 @@ class InfoExtractor(object): """Sets a YoutubeDL instance as the downloader for this IE.""" self._downloader = downloader + @property + def cache(self): + return self._downloader.cache + + @property + def cookiejar(self): + return self._downloader.cookiejar + def _initialize_pre_login(self): - """ Intialization before 
login. Redefine in subclasses.""" + """ Initialization before login. Redefine in subclasses.""" pass def _perform_login(self, username, password): @@ -710,13 +749,13 @@ class InfoExtractor(object): """A string for getting the InfoExtractor with get_info_extractor""" return cls.__name__[:-2] - @property - def IE_NAME(self): - return compat_str(type(self).__name__[:-2]) + @classproperty + def IE_NAME(cls): + return cls.__name__[:-2] @staticmethod def __can_accept_status_code(err, expected_status): - assert isinstance(err, compat_urllib_error.HTTPError) + assert isinstance(err, urllib.error.HTTPError) if expected_status is None: return False elif callable(expected_status): @@ -724,7 +763,14 @@ class InfoExtractor(object): else: return err.code in variadic(expected_status) - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}, expected_status=None): + def _create_request(self, url_or_request, data=None, headers=None, query=None): + if isinstance(url_or_request, urllib.request.Request): + return update_Request(url_or_request, data=data, headers=headers, query=query) + if query: + url_or_request = update_url_query(url_or_request, query) + return sanitized_Request(url_or_request, data, headers or {}) + + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None): """ Return the response handle. @@ -742,9 +788,9 @@ class InfoExtractor(object): self.report_download_webpage(video_id) elif note is not False: if video_id is None: - self.to_screen('%s' % (note,)) + self.to_screen(str(note)) else: - self.to_screen('%s: %s' % (video_id, note)) + self.to_screen(f'{video_id}: {note}') # Some sites check X-Forwarded-For HTTP header in order to figure out # the origin of the client behind proxy. This allows bypassing geo @@ -752,21 +798,13 @@ class InfoExtractor(object): # geo unrestricted country. We will do so once we encounter any # geo restriction error. if self._x_forwarded_for_ip: - if 'X-Forwarded-For' not in headers: - headers['X-Forwarded-For'] = self._x_forwarded_for_ip + headers = (headers or {}).copy() + headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip) - if isinstance(url_or_request, compat_urllib_request.Request): - url_or_request = update_Request( - url_or_request, data=data, headers=headers, query=query) - else: - if query: - url_or_request = update_url_query(url_or_request, query) - if data is not None or headers: - url_or_request = sanitized_Request(url_or_request, data, headers) try: - return self._downloader.urlopen(url_or_request) + return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) except network_exceptions as err: - if isinstance(err, compat_urllib_error.HTTPError): + if isinstance(err, urllib.error.HTTPError): if self.__can_accept_status_code(err, expected_status): # Retain reference to error to prevent file object from # being closed before it can be read. 
Works around the @@ -780,21 +818,49 @@ class InfoExtractor(object): if errnote is None: errnote = 'Unable to download webpage' - errmsg = '%s: %s' % (errnote, error_to_compat_str(err)) + errmsg = f'{errnote}: {error_to_compat_str(err)}' if fatal: raise ExtractorError(errmsg, cause=err) else: self.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, + encoding=None, data=None, headers={}, query={}, expected_status=None): """ Return a tuple (page content as string, URL handle). - See _download_webpage docstring for arguments specification. + Arguments: + url_or_request -- plain text URL as a string or + a urllib.request.Request object + video_id -- Video/playlist/item identifier (string) + + Keyword arguments: + note -- note printed before downloading (string) + errnote -- note printed in case of an error (string) + fatal -- flag denoting whether error should be considered fatal, + i.e. whether it should cause ExtractionError to be raised, + otherwise a warning will be reported and extraction continued + encoding -- encoding for a page content decoding, guessed automatically + when not explicitly specified + data -- POST data (bytes) + headers -- HTTP headers (dict) + query -- URL query (dict) + expected_status -- allows to accept failed HTTP requests (non 2xx + status code) by explicitly specifying a set of accepted status + codes. Can be any of the following entities: + - an integer type specifying an exact failed status code to + accept + - a list or a tuple of integer types specifying a list of + failed status codes to accept + - a callable accepting an actual failed status code and + returning True if it should be accepted + Note that this argument does not affect success status codes (2xx) + which are always accepted. 
""" + # Strip hashes from the URL (#1038) - if isinstance(url_or_request, (compat_str, str)): + if isinstance(url_or_request, str): url_or_request = url_or_request.partition('#')[0] urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) @@ -849,247 +915,178 @@ class InfoExtractor(object): 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', expected=True) + def _request_dump_filename(self, url, video_id): + basen = f'{video_id}_{url}' + trim_length = self.get_param('trim_file_name') or 240 + if len(basen) > trim_length: + h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + basen = basen[:trim_length - len(h)] + h + filename = sanitize_filename(f'{basen}.dump', restricted=True) + # Working around MAX_PATH limitation on Windows (see + # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) + if compat_os_name == 'nt': + absfilepath = os.path.abspath(filename) + if len(absfilepath) > 259: + filename = fR'\\?\{absfilepath}' + return filename + + def __decode_webpage(self, webpage_bytes, encoding, headers): + if not encoding: + encoding = self._guess_encoding_from_content(headers.get('Content-Type', ''), webpage_bytes) + try: + return webpage_bytes.decode(encoding, 'replace') + except LookupError: + return webpage_bytes.decode('utf-8', 'replace') + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): - content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() if prefix is not None: webpage_bytes = prefix + webpage_bytes - if not encoding: - encoding = self._guess_encoding_from_content(content_type, webpage_bytes) if self.get_param('dump_intermediate_pages', False): self.to_screen('Dumping request to ' + urlh.geturl()) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) - if self.get_param('write_pages', False): - basen = '%s_%s' % (video_id, urlh.geturl()) - trim_length = self.get_param('trim_file_name') or 240 - if len(basen) > trim_length: - h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() - basen = basen[:trim_length - len(h)] + h - raw_filename = basen + '.dump' - filename = sanitize_filename(raw_filename, restricted=True) - self.to_screen('Saving request to ' + filename) - # Working around MAX_PATH limitation on Windows (see - # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) - if compat_os_name == 'nt': - absfilepath = os.path.abspath(filename) - if len(absfilepath) > 259: - filename = '\\\\?\\' + absfilepath + if self.get_param('write_pages'): + filename = self._request_dump_filename(urlh.geturl(), video_id) + self.to_screen(f'Saving request to {filename}') with open(filename, 'wb') as outf: outf.write(webpage_bytes) - try: - content = webpage_bytes.decode(encoding, 'replace') - except LookupError: - content = webpage_bytes.decode('utf-8', 'replace') - + content = self.__decode_webpage(webpage_bytes, encoding, urlh.headers) self.__check_blocked(content) return content + def __print_error(self, errnote, fatal, video_id, err): + if fatal: + raise ExtractorError(f'{video_id}: {errnote}', cause=err) + elif errnote: + self.report_warning(f'{video_id}: {errnote}: {err}') + + def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True, errnote=None): + if transform_source: + xml_string = transform_source(xml_string) + try: + return 
compat_etree_fromstring(xml_string.encode('utf-8')) + except xml.etree.ElementTree.ParseError as ve: + self.__print_error('Failed to parse XML' if errnote is None else errnote, fatal, video_id, ve) + + def _parse_json(self, json_string, video_id, transform_source=None, fatal=True, errnote=None, **parser_kwargs): + try: + return json.loads( + json_string, cls=LenientJSONDecoder, strict=False, transform_source=transform_source, **parser_kwargs) + except ValueError as ve: + self.__print_error('Failed to parse JSON' if errnote is None else errnote, fatal, video_id, ve) + + def _parse_socket_response_as_json(self, data, *args, **kwargs): + return self._parse_json(data[data.find('{'):data.rfind('}') + 1], *args, **kwargs) + + def __create_download_methods(name, parser, note, errnote, return_value): + + def parse(ie, content, *args, errnote=errnote, **kwargs): + if parser is None: + return content + if errnote is False: + kwargs['errnote'] = errnote + # parser is fetched by name so subclasses can override it + return getattr(ie, parser)(content, *args, **kwargs) + + def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + res = self._download_webpage_handle( + url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding, + data=data, headers=headers, query=query, expected_status=expected_status) + if res is False: + return res + content, urlh = res + return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh + + def download_content(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, + fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + if self.get_param('load_pages'): + url_or_request = self._create_request(url_or_request, data, headers, query) + filename = self._request_dump_filename(url_or_request.full_url, video_id) + self.to_screen(f'Loading request from {filename}') + try: + with open(filename, 'rb') as dumpf: + webpage_bytes = dumpf.read() + except OSError as e: + self.report_warning(f'Unable to load request from disk: {e}') + else: + content = self.__decode_webpage(webpage_bytes, encoding, url_or_request.headers) + return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote) + kwargs = { + 'note': note, + 'errnote': errnote, + 'transform_source': transform_source, + 'fatal': fatal, + 'encoding': encoding, + 'data': data, + 'headers': headers, + 'query': query, + 'expected_status': expected_status, + } + if parser is None: + kwargs.pop('transform_source') + # The method is fetched by name so subclasses can override _download_..._handle + res = getattr(self, download_handle.__name__)(url_or_request, video_id, **kwargs) + return res if res is False else res[0] + + def impersonate(func, name, return_value): + func.__name__, func.__qualname__ = name, f'InfoExtractor.{name}' + func.__doc__ = f''' + @param transform_source Apply this transformation before parsing + @returns {return_value} + + See _download_webpage_handle docstring for other arguments specification + ''' + + impersonate(download_handle, f'_download_{name}_handle', f'({return_value}, URL handle)') + impersonate(download_content, f'_download_{name}', f'{return_value}') + return download_handle, download_content + + _download_xml_handle, _download_xml = __create_download_methods( + 'xml', '_parse_xml', 'Downloading XML', 
'Unable to download XML', 'xml as an xml.etree.ElementTree.Element') + _download_json_handle, _download_json = __create_download_methods( + 'json', '_parse_json', 'Downloading JSON metadata', 'Unable to download JSON metadata', 'JSON object as a dict') + _download_socket_json_handle, _download_socket_json = __create_download_methods( + 'socket_json', '_parse_socket_response_as_json', 'Polling socket', 'Unable to poll socket', 'JSON object as a dict') + __download_webpage = __create_download_methods('webpage', None, None, None, 'data of the page as a string')[1] + def _download_webpage( self, url_or_request, video_id, note=None, errnote=None, - fatal=True, tries=1, timeout=5, encoding=None, data=None, - headers={}, query={}, expected_status=None): + fatal=True, tries=1, timeout=NO_DEFAULT, *args, **kwargs): """ Return the data of the page as a string. - Arguments: - url_or_request -- plain text URL as a string or - a compat_urllib_request.Requestobject - video_id -- Video/playlist/item identifier (string) - Keyword arguments: - note -- note printed before downloading (string) - errnote -- note printed in case of an error (string) - fatal -- flag denoting whether error should be considered fatal, - i.e. whether it should cause ExtractionError to be raised, - otherwise a warning will be reported and extraction continued tries -- number of tries timeout -- sleep interval between tries - encoding -- encoding for a page content decoding, guessed automatically - when not explicitly specified - data -- POST data (bytes) - headers -- HTTP headers (dict) - query -- URL query (dict) - expected_status -- allows to accept failed HTTP requests (non 2xx - status code) by explicitly specifying a set of accepted status - codes. Can be any of the following entities: - - an integer type specifying an exact failed status code to - accept - - a list or a tuple of integer types specifying a list of - failed status codes to accept - - a callable accepting an actual failed status code and - returning True if it should be accepted - Note that this argument does not affect success status codes (2xx) - which are always accepted. + + See _download_webpage_handle docstring for other arguments specification. """ - success = False + R''' # NB: These are unused; should they be deprecated? + if tries != 1: + self._downloader.deprecation_warning('tries argument is deprecated in InfoExtractor._download_webpage') + if timeout is NO_DEFAULT: + timeout = 5 + else: + self._downloader.deprecation_warning('timeout argument is deprecated in InfoExtractor._download_webpage') + ''' + try_count = 0 - while success is False: + while True: try: - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - success = True - except compat_http_client.IncompleteRead as e: + return self.__download_webpage(url_or_request, video_id, note, errnote, None, fatal, *args, **kwargs) + except http.client.IncompleteRead as e: try_count += 1 if try_count >= tries: raise e self._sleep(timeout, video_id) - if res is False: - return res - else: - content, _ = res - return content - - def _download_xml_handle( - self, url_or_request, video_id, note='Downloading XML', - errnote='Unable to download XML', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return a tuple (xml as an compat_etree_Element, URL handle). 
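[Editor's note] The rewritten _download_webpage above now delegates to the factory-generated __download_webpage and keeps only the http.client.IncompleteRead retry loop. A hypothetical call site (inside an extractor method; the URL is made up):

# `self` and `video_id` are assumed to be in scope inside an extractor method.
webpage = self._download_webpage('https://example.com/watch/abc', video_id)
# The retry knobs still work, though the commented-out block above hints
# they may be deprecated later:
webpage = self._download_webpage(
    'https://example.com/watch/abc', video_id, tries=3, timeout=5)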
- - See _download_webpage docstring for arguments specification. - """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - xml_string, urlh = res - return self._parse_xml( - xml_string, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_xml( - self, url_or_request, video_id, - note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True, encoding=None, - data=None, headers={}, query={}, expected_status=None): - """ - Return the xml as an compat_etree_Element. - - See _download_webpage docstring for arguments specification. - """ - res = self._download_xml_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] - - def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): - if transform_source: - xml_string = transform_source(xml_string) - try: - return compat_etree_fromstring(xml_string.encode('utf-8')) - except compat_xml_parse_error as ve: - errmsg = '%s: Failed to parse XML ' % video_id - if fatal: - raise ExtractorError(errmsg, cause=ve) - else: - self.report_warning(errmsg + str(ve)) - - def _download_json_handle( - self, url_or_request, video_id, note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return a tuple (JSON object, URL handle). - - See _download_webpage docstring for arguments specification. - """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - json_string, urlh = res - return self._parse_json( - json_string, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_json( - self, url_or_request, video_id, note='Downloading JSON metadata', - errnote='Unable to download JSON metadata', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return the JSON object as a dict. - - See _download_webpage docstring for arguments specification. 
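[Editor's note] For contrast with the hand-written _download_json being removed here, the factory-generated replacement keeps the same call signature, so existing extractors need no changes. A minimal hypothetical extractor (site name, URL pattern and API shape are all made up):

class ExampleIE(InfoExtractor):
    _VALID_URL = r'https?://example\.com/v/(?P<id>\w+)'  # made-up site

    def _real_extract(self, url):
        video_id = self._match_id(url)
        data = self._download_json(
            f'https://example.com/api/v/{video_id}', video_id,
            note='Downloading video metadata', fatal=False) or {}
        return {
            'id': video_id,
            'title': data.get('title'),  # hypothetical API field
            'url': data.get('mp4Url'),   # hypothetical API field
        }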
- """ - res = self._download_json_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] - - def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): - if transform_source: - json_string = transform_source(json_string) - try: - return json.loads(json_string, strict=False) - except ValueError as ve: - errmsg = '%s: Failed to parse JSON ' % video_id - if fatal: - raise ExtractorError(errmsg, cause=ve) - else: - self.report_warning(errmsg + str(ve)) - - def _parse_socket_response_as_json(self, data, video_id, transform_source=None, fatal=True): - return self._parse_json( - data[data.find('{'):data.rfind('}') + 1], - video_id, transform_source, fatal) - - def _download_socket_json_handle( - self, url_or_request, video_id, note='Polling socket', - errnote='Unable to poll socket', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return a tuple (JSON object, URL handle). - - See _download_webpage docstring for arguments specification. - """ - res = self._download_webpage_handle( - url_or_request, video_id, note, errnote, fatal=fatal, - encoding=encoding, data=data, headers=headers, query=query, - expected_status=expected_status) - if res is False: - return res - webpage, urlh = res - return self._parse_socket_response_as_json( - webpage, video_id, transform_source=transform_source, - fatal=fatal), urlh - - def _download_socket_json( - self, url_or_request, video_id, note='Polling socket', - errnote='Unable to poll socket', transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, - expected_status=None): - """ - Return the JSON object as a dict. - - See _download_webpage docstring for arguments specification. 
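[Editor's note] As an aside, _parse_socket_response_as_json (kept above in its new one-line form) simply slices from the first '{' to the last '}', discarding any socket framing around the JSON payload. A toy illustration with a made-up socket.io-style frame:

frame = '42["message",{"id":"abc","viewers":523}]'  # hypothetical frame
payload = frame[frame.find('{'):frame.rfind('}') + 1]
assert payload == '{"id":"abc","viewers":523}'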
- """ - res = self._download_socket_json_handle( - url_or_request, video_id, note=note, errnote=errnote, - transform_source=transform_source, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, - expected_status=expected_status) - return res if res is False else res[0] def report_warning(self, msg, video_id=None, *args, only_once=False, **kwargs): - idstr = format_field(video_id, template='%s: ') + idstr = format_field(video_id, None, '%s: ') msg = f'[{self.IE_NAME}] {idstr}{msg}' if only_once: if f'WARNING: {msg}' in self._printed_messages: @@ -1099,17 +1096,19 @@ class InfoExtractor(object): def to_screen(self, msg, *args, **kwargs): """Print msg to screen, prefixing it with '[ie_name]'""" - self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs) + self._downloader.to_screen(f'[{self.IE_NAME}] {msg}', *args, **kwargs) def write_debug(self, msg, *args, **kwargs): - self._downloader.write_debug('[%s] %s' % (self.IE_NAME, msg), *args, **kwargs) + self._downloader.write_debug(f'[{self.IE_NAME}] {msg}', *args, **kwargs) def get_param(self, name, default=None, *args, **kwargs): if self._downloader: return self._downloader.params.get(name, default, *args, **kwargs) return default - def report_drm(self, video_id, partial=False): + def report_drm(self, video_id, partial=NO_DEFAULT): + if partial is not NO_DEFAULT: + self._downloader.deprecation_warning('InfoExtractor.report_drm no longer accepts the argument partial') self.raise_no_formats('This video is DRM protected', expected=True, video_id=video_id) def report_extraction(self, id_or_name): @@ -1135,11 +1134,7 @@ class InfoExtractor(object): self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')): self.report_warning(msg) return - if method is NO_DEFAULT: - method = 'any' if self.supports_login() else 'cookies' - if method is not None: - assert method in self._LOGIN_HINTS, 'Invalid login method' - msg = '%s. %s' % (msg, self._LOGIN_HINTS[method]) + msg += format_field(self._login_hint(method), None, '. %s') raise ExtractorError(msg, expected=True) def raise_geo_restricted( @@ -1176,10 +1171,12 @@ class InfoExtractor(object): 'url': url, } - def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None, video_kwargs=None, **kwargs): - urls = (self.url_result(self._proto_relative_url(m), ie, **(video_kwargs or {})) - for m in orderedSet(map(getter, matches) if getter else matches)) - return self.playlist_result(urls, playlist_id, playlist_title, **kwargs) + @classmethod + def playlist_from_matches(cls, matches, playlist_id=None, playlist_title=None, + getter=IDENTITY, ie=None, video_kwargs=None, **kwargs): + return cls.playlist_result( + (cls.url_result(m, ie, **(video_kwargs or {})) for m in orderedSet(map(getter, matches), lazy=True)), + playlist_id, playlist_title, **kwargs) @staticmethod def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None, *, multi_video=False, **kwargs): @@ -1203,7 +1200,9 @@ class InfoExtractor(object): In case of failure return a default value or raise a WARNING or a RegexNotFoundError, depending on fatal, specifying the field name. 
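[Editor's note] The new classmethod form of playlist_from_matches no longer runs matches through _proto_relative_url; URL normalisation, if needed, now belongs in `getter`. A hypothetical sketch (the regex and page variables are made up):

import re
import urllib.parse

# ... inside a hypothetical extractor's _real_extract(), with `url`,
# `webpage`, `playlist_id` and `playlist_title` already in scope:
matches = re.findall(r'data-video-url="([^"]+)"', webpage)  # made-up markup
return self.playlist_from_matches(
    matches, playlist_id, playlist_title,
    getter=lambda u: urllib.parse.urljoin(url, u))  # normalise URLs ourselves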
""" - if isinstance(pattern, (str, compat_str, compiled_regex_type)): + if string is None: + mobj = None + elif isinstance(pattern, (str, re.Pattern)): mobj = re.search(pattern, string, flags) else: for p in pattern: @@ -1229,6 +1228,33 @@ class InfoExtractor(object): self.report_warning('unable to extract %s' % _name + bug_reports_message()) return None + def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', + contains_pattern=r'{(?s:.+)}', fatal=True, default=NO_DEFAULT, **kwargs): + """Searches string for the JSON object specified by start_pattern""" + # NB: end_pattern is only used to reduce the size of the initial match + if default is NO_DEFAULT: + default, has_default = {}, False + else: + fatal, has_default = False, True + + json_string = self._search_regex( + rf'(?:{start_pattern})\s*(?P<json>{contains_pattern})\s*(?:{end_pattern})', + string, name, group='json', fatal=fatal, default=None if has_default else NO_DEFAULT) + if not json_string: + return default + + _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS) + try: + return self._parse_json(json_string, video_id, ignore_extra=True, **kwargs) + except ExtractorError as e: + if fatal: + raise ExtractorError( + f'Unable to extract {_name} - Failed to parse JSON', cause=e.cause, video_id=video_id) + elif not has_default: + self.report_warning( + f'Unable to extract {_name} - Failed to parse JSON: {e}', video_id=video_id) + return default + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. @@ -1256,7 +1282,7 @@ class InfoExtractor(object): else: raise netrc.NetrcParseError( 'No authenticators for %s' % netrc_machine) - except (IOError, netrc.NetrcParseError) as err: + except (OSError, netrc.NetrcParseError) as err: self.report_warning( 'parsing .netrc: %s' % error_to_compat_str(err)) @@ -1293,7 +1319,7 @@ class InfoExtractor(object): if tfa is not None: return tfa - return compat_getpass('Type %s and press [Return]: ' % note) + return getpass.getpass('Type %s and press [Return]: ' % note) # Helper functions for extracting OpenGraph info @staticmethod @@ -1344,7 +1370,7 @@ class InfoExtractor(object): return self._og_search_property('url', html, **kargs) def _html_extract_title(self, html, name='title', *, fatal=False, **kwargs): - return self._html_search_regex(r'(?s)<title>([^<]+)</title>', html, name, fatal=fatal, **kwargs) + return self._html_search_regex(r'(?s)<title\b[^>]*>([^<]+)</title>', html, name, fatal=fatal, **kwargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): name = variadic(name) @@ -1357,12 +1383,20 @@ class InfoExtractor(object): def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader') - def _rta_search(self, html): + @staticmethod + def _rta_search(html): # See http://www.rtalabel.org/index.php?content=howtofaq#single if re.search(r'(?ix)<meta\s+name="rating"\s+' r' content="RTA-5042-1996-1400-1577-RTA"', html): return 18 + + # And then there are the jokers who advertise that they use RTA, but actually don't. 
+ AGE_LIMIT_MARKERS = [ + r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', + ] + if any(re.search(marker, html) for marker in AGE_LIMIT_MARKERS): + return 18 return 0 def _media_rating_search(self, html): @@ -1401,27 +1435,25 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): - json_ld_list = list(re.finditer(JSON_LD_RE, html)) - default = kwargs.get('default', NO_DEFAULT) - # JSON-LD may be malformed and thus `fatal` should be respected. - # At the same time `default` may be passed that assumes `fatal=False` - # for _search_regex. Let's simulate the same behavior here as well. - fatal = kwargs.get('fatal', True) if default is NO_DEFAULT else False - json_ld = [] - for mobj in json_ld_list: - json_ld_item = self._parse_json( - mobj.group('json_ld'), video_id, fatal=fatal) - if not json_ld_item: - continue - if isinstance(json_ld_item, dict): - json_ld.append(json_ld_item) - elif isinstance(json_ld_item, (list, tuple)): - json_ld.extend(json_ld_item) - if json_ld: - json_ld = self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) - if json_ld: - return json_ld + def _yield_json_ld(self, html, video_id, *, fatal=True, default=NO_DEFAULT): + """Yield all json ld objects in the html""" + if default is not NO_DEFAULT: + fatal = False + for mobj in re.finditer(JSON_LD_RE, html): + json_ld_item = self._parse_json(mobj.group('json_ld'), video_id, fatal=fatal) + for json_ld in variadic(json_ld_item): + if isinstance(json_ld, dict): + yield json_ld + + def _search_json_ld(self, html, video_id, expected_type=None, *, fatal=True, default=NO_DEFAULT): + """Search for a video in any json ld in the html""" + if default is not NO_DEFAULT: + fatal = False + info = self._json_ld( + list(self._yield_json_ld(html, video_id, fatal=fatal, default=default)), + video_id, fatal=fatal, expected_type=expected_type) + if info: + return info if default is not NO_DEFAULT: return default elif fatal: @@ -1431,15 +1463,11 @@ class InfoExtractor(object): return {} def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): - if isinstance(json_ld, compat_str): + if isinstance(json_ld, str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: return {} info = {} - if not isinstance(json_ld, (list, tuple, dict)): - return info - if isinstance(json_ld, dict): - json_ld = [json_ld] INTERACTION_TYPE_MAP = { 'CommentAction': 'comment', @@ -1452,6 +1480,10 @@ class InfoExtractor(object): 'ViewAction': 'view', } + def is_type(e, *expected_types): + type = variadic(traverse_obj(e, '@type')) + return any(x in type for x in expected_types) + def extract_interaction_type(e): interaction_type = e.get('interactionType') if isinstance(interaction_type, dict): @@ -1465,9 +1497,7 @@ class InfoExtractor(object): if not isinstance(interaction_statistic, list): return for is_e in interaction_statistic: - if not isinstance(is_e, dict): - continue - if is_e.get('@type') != 'InteractionCounter': + if not is_type(is_e, 'InteractionCounter'): continue interaction_type = extract_interaction_type(is_e) if not interaction_type: @@ -1504,44 +1534,53 @@ class InfoExtractor(object): info['chapters'] = chapters def extract_video_object(e): - assert e['@type'] == 'VideoObject' author = e.get('author') info.update({ 'url': url_or_none(e.get('contentUrl')), + 'ext': mimetype2ext(e.get('encodingFormat')), 'title': 
unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnails': [{'url': url_or_none(url)} - for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL'))], + 'thumbnails': [{'url': unescapeHTML(url)} + for url in variadic(traverse_obj(e, 'thumbnailUrl', 'thumbnailURL')) + if url_or_none(url)], 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), # author can be an instance of 'Organization' or 'Person' types. # both types can have 'name' property(inherited from 'Thing' type). [1] # however some websites are using 'Text' type instead. # 1. https://schema.org/VideoObject - 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, compat_str) else None, - 'filesize': float_or_none(e.get('contentSize')), + 'uploader': author.get('name') if isinstance(author, dict) else author if isinstance(author, str) else None, + 'artist': traverse_obj(e, ('byArtist', 'name'), expected_type=str), + 'filesize': int_or_none(float_or_none(e.get('contentSize'))), 'tbr': int_or_none(e.get('bitrate')), 'width': int_or_none(e.get('width')), 'height': int_or_none(e.get('height')), 'view_count': int_or_none(e.get('interactionCount')), + 'tags': try_call(lambda: e.get('keywords').split(',')), }) + if is_type(e, 'AudioObject'): + info.update({ + 'vcodec': 'none', + 'abr': int_or_none(e.get('bitrate')), + }) extract_interaction_statistic(e) extract_chapter_information(e) def traverse_json_ld(json_ld, at_top_level=True): - for e in json_ld: + for e in variadic(json_ld): + if not isinstance(e, dict): + continue if at_top_level and '@context' not in e: continue if at_top_level and set(e.keys()) == {'@context', '@graph'}: - traverse_json_ld(variadic(e['@graph'], allowed_types=(dict,)), at_top_level=False) - break - item_type = e.get('@type') - if expected_type is not None and expected_type != item_type: + traverse_json_ld(e['@graph'], at_top_level=False) + continue + if expected_type is not None and not is_type(e, expected_type): continue rating = traverse_obj(e, ('aggregateRating', 'ratingValue'), expected_type=float_or_none) if rating is not None: info['average_rating'] = rating - if item_type in ('TVEpisode', 'Episode'): + if is_type(e, 'TVEpisode', 'Episode'): episode_name = unescapeHTML(e.get('name')) info.update({ 'episode': episode_name, @@ -1551,44 +1590,46 @@ class InfoExtractor(object): if not info.get('title') and episode_name: info['title'] = episode_name part_of_season = e.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): + if is_type(part_of_season, 'TVSeason', 'Season', 'CreativeWorkSeason'): info.update({ 'season': unescapeHTML(part_of_season.get('name')), 'season_number': int_or_none(part_of_season.get('seasonNumber')), }) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): + if is_type(part_of_series, 'TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Movie': + elif is_type(e, 'Movie'): info.update({ 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('dateCreated')), }) - elif item_type in ('Article', 'NewsArticle'): + elif is_type(e, 'Article', 'NewsArticle'): info.update({ 
'timestamp': parse_iso8601(e.get('datePublished')), 'title': unescapeHTML(e.get('headline')), 'description': unescapeHTML(e.get('articleBody') or e.get('description')), }) - if traverse_obj(e, ('video', 0, '@type')) == 'VideoObject': + if is_type(traverse_obj(e, ('video', 0)), 'VideoObject'): extract_video_object(e['video'][0]) - elif item_type == 'VideoObject': + elif is_type(traverse_obj(e, ('subjectOf', 0)), 'VideoObject'): + extract_video_object(e['subjectOf'][0]) + elif is_type(e, 'VideoObject', 'AudioObject'): extract_video_object(e) if expected_type is None: continue else: break video = e.get('video') - if isinstance(video, dict) and video.get('@type') == 'VideoObject': + if is_type(video, 'VideoObject'): extract_video_object(video) if expected_type is None: continue else: break - traverse_json_ld(json_ld) + traverse_json_ld(json_ld) return filter_dict(info) def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): @@ -1598,15 +1639,16 @@ class InfoExtractor(object): webpage, 'next.js data', fatal=fatal, **kw), video_id, transform_source=transform_source, fatal=fatal) - def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__'): - ''' Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function. ''' - # not all website do this, but it can be changed - # https://stackoverflow.com/questions/67463109/how-to-change-or-hide-nuxt-and-nuxt-keyword-in-page-source + def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): + """Parses Nuxt.js metadata. This works as long as the function __NUXT__ invokes is a pure function""" rectx = re.escape(context_name) + FUNCTION_RE = r'\(function\((?P<arg_keys>.*?)\){return\s+(?P<js>{.*?})\s*;?\s*}\((?P<arg_vals>.*?)\)' js, arg_keys, arg_vals = self._search_regex( - (r'<script>window\.%s=\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.+?)\)\);?</script>' % rectx, - r'%s\(.*?\(function\((?P<arg_keys>.*?)\)\{return\s(?P<js>\{.*?\})\}\((?P<arg_vals>.*?)\)' % rectx), - webpage, context_name, group=['js', 'arg_keys', 'arg_vals']) + (rf'<script>\s*window\.{rectx}={FUNCTION_RE}\s*\)\s*;?\s*</script>', rf'{rectx}\(.*?{FUNCTION_RE}'), + webpage, context_name, group=('js', 'arg_keys', 'arg_vals'), + default=NO_DEFAULT if fatal else (None, None, None)) + if js is None: + return {} args = dict(zip(arg_keys.split(','), arg_vals.split(','))) @@ -1614,7 +1656,8 @@ class InfoExtractor(object): if val in ('undefined', 'void 0'): args[key] = 'null' - return self._parse_json(js_to_json(js, args), video_id)['data'][0] + ret = self._parse_json(js, video_id, transform_source=functools.partial(js_to_json, vars=args), fatal=fatal) + return traverse_obj(ret, traverse) or {} @staticmethod def _hidden_inputs(html): @@ -1638,296 +1681,27 @@ class InfoExtractor(object): html, '%s form' % form_id, group='form') return self._hidden_inputs(form) - class FormatSort: - regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? 
*$' - - default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', - 'res', 'fps', 'hdr:12', 'codec:vp9.2', 'size', 'br', 'asr', - 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases - ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', - 'height', 'width', 'proto', 'vext', 'abr', 'aext', - 'fps', 'fs_approx', 'source', 'id') - - settings = { - 'vcodec': {'type': 'ordered', 'regex': True, - 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, - 'acodec': {'type': 'ordered', 'regex': True, - 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, - 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', - 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, - 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', - 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']}, - 'vext': {'type': 'ordered', 'field': 'video_ext', - 'order': ('mp4', 'webm', 'flv', '', 'none'), - 'order_free': ('webm', 'mp4', 'flv', '', 'none')}, - 'aext': {'type': 'ordered', 'field': 'audio_ext', - 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'), - 'order_free': ('opus', 'ogg', 'webm', 'm4a', 'mp3', 'aac', '', 'none')}, - 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}, - 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', - 'field': ('vcodec', 'acodec'), - 'function': lambda it: int(any(v != 'none' for v in it))}, - 'ie_pref': {'priority': True, 'type': 'extractor'}, - 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, - 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1}, - 'quality': {'convert': 'float', 'default': -1}, - 'filesize': {'convert': 'bytes'}, - 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, - 'id': {'convert': 'string', 'field': 'format_id'}, - 'height': {'convert': 'float_none'}, - 'width': {'convert': 'float_none'}, - 'fps': {'convert': 'float_none'}, - 'tbr': {'convert': 'float_none'}, - 'vbr': {'convert': 'float_none'}, - 'abr': {'convert': 'float_none'}, - 'asr': {'convert': 'float_none'}, - 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, - - 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, - 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, - 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')}, - 'ext': {'type': 'combined', 'field': ('vext', 'aext')}, - 'res': {'type': 'multiple', 'field': ('height', 'width'), - 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, - - # For compatibility with youtube-dl - 'format_id': {'type': 'alias', 'field': 'id'}, - 'preference': {'type': 'alias', 'field': 'ie_pref'}, - 'language_preference': {'type': 'alias', 'field': 'lang'}, - 'source_preference': {'type': 'alias', 'field': 'source'}, - 'protocol': {'type': 'alias', 'field': 'proto'}, - 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, - - # Deprecated - 'dimension': {'type': 'alias', 'field': 'res', 'deprecated': True}, - 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}, - 'extension': {'type': 'alias', 'field': 
'ext', 'deprecated': True}, - 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}, - 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}, - 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True}, - 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True}, - 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True}, - 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True}, - 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True}, - 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True}, - 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True}, - 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True}, - 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True}, - 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, - 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, - 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, - 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, - 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, - 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, - } + @classproperty(cache=True) + def FormatSort(cls): + class FormatSort(FormatSorter): + def __init__(ie, *args, **kwargs): + super().__init__(ie._downloader, *args, **kwargs) - def __init__(self, ie, field_preference): - self._order = [] - self.ydl = ie._downloader - self.evaluate_params(self.ydl.params, field_preference) - if ie.get_param('verbose'): - self.print_verbose_info(self.ydl.write_debug) - - def _get_field_setting(self, field, key): - if field not in self.settings: - if key in ('forced', 'priority'): - return False - self.ydl.deprecation_warning( - f'Using arbitrary fields ({field}) for format sorting is deprecated ' - 'and may be removed in a future version') - self.settings[field] = {} - propObj = self.settings[field] - if key not in propObj: - type = propObj.get('type') - if key == 'field': - default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field - elif key == 'convert': - default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore' - else: - default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None) - propObj[key] = default - return propObj[key] - - def _resolve_field_value(self, field, value, convertNone=False): - if value is None: - if not convertNone: - return None - else: - value = value.lower() - conversion = self._get_field_setting(field, 'convert') - if conversion == 'ignore': - return None - if conversion == 'string': - return value - elif conversion == 'float_none': - return float_or_none(value) - elif conversion == 'bytes': - return FileDownloader.parse_bytes(value) - elif conversion == 'order': - order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order') - use_regex = self._get_field_setting(field, 'regex') - list_length = len(order_list) - empty_pos = order_list.index('') if '' in order_list else list_length + 1 - if use_regex and value is not None: - for i, regex in enumerate(order_list): - if regex and re.match(regex, value): - return list_length - i - return list_length - empty_pos # not in list - else: # not regex or value = None - return list_length - (order_list.index(value) if value in order_list else empty_pos) - else: - if 
value.isnumeric(): - return float(value) - else: - self.settings[field]['convert'] = 'string' - return value - - def evaluate_params(self, params, sort_extractor): - self._use_free_order = params.get('prefer_free_formats', False) - self._sort_user = params.get('format_sort', []) - self._sort_extractor = sort_extractor - - def add_item(field, reverse, closest, limit_text): - field = field.lower() - if field in self._order: - return - self._order.append(field) - limit = self._resolve_field_value(field, limit_text) - data = { - 'reverse': reverse, - 'closest': False if limit is None else closest, - 'limit_text': limit_text, - 'limit': limit} - if field in self.settings: - self.settings[field].update(data) - else: - self.settings[field] = data - - sort_list = ( - tuple(field for field in self.default if self._get_field_setting(field, 'forced')) - + (tuple() if params.get('format_sort_force', False) - else tuple(field for field in self.default if self._get_field_setting(field, 'priority'))) - + tuple(self._sort_user) + tuple(sort_extractor) + self.default) - - for item in sort_list: - match = re.match(self.regex, item) - if match is None: - raise ExtractorError('Invalid format sort string "%s" given by extractor' % item) - field = match.group('field') - if field is None: - continue - if self._get_field_setting(field, 'type') == 'alias': - alias, field = field, self._get_field_setting(field, 'field') - if self._get_field_setting(alias, 'deprecated'): - self.ydl.deprecation_warning( - f'Format sorting alias {alias} is deprecated ' - f'and may be removed in a future version. Please use {field} instead') - reverse = match.group('reverse') is not None - closest = match.group('separator') == '~' - limit_text = match.group('limit') - - has_limit = limit_text is not None - has_multiple_fields = self._get_field_setting(field, 'type') == 'combined' - has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit') - - fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,) - limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple() - limit_count = len(limits) - for (i, f) in enumerate(fields): - add_item(f, reverse, closest, - limits[i] if i < limit_count - else limits[0] if has_limit and not has_multiple_limits - else None) - - def print_verbose_info(self, write_debug): - if self._sort_user: - write_debug('Sort order given by user: %s' % ', '.join(self._sort_user)) - if self._sort_extractor: - write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor)) - write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % ( - '+' if self._get_field_setting(field, 'reverse') else '', field, - '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':', - self._get_field_setting(field, 'limit_text'), - self._get_field_setting(field, 'limit')) - if self._get_field_setting(field, 'limit_text') is not None else '') - for field in self._order if self._get_field_setting(field, 'visible')])) - - def _calculate_field_preference_from_value(self, format, field, type, value): - reverse = self._get_field_setting(field, 'reverse') - closest = self._get_field_setting(field, 'closest') - limit = self._get_field_setting(field, 'limit') - - if type == 'extractor': - maximum = self._get_field_setting(field, 'max') - if value is None or (maximum is not None and value >= maximum): - value = -1 - elif type == 'boolean': - in_list = self._get_field_setting(field, 'in_list') - not_in_list 
= self._get_field_setting(field, 'not_in_list') - value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1 - elif type == 'ordered': - value = self._resolve_field_value(field, value, True) - - # try to convert to number - val_num = float_or_none(value, default=self._get_field_setting(field, 'default')) - is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None - if is_num: - value = val_num - - return ((-10, 0) if value is None - else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher - else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest - else (0, value, 0) if not reverse and (limit is None or value <= limit) - else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit - else (-1, value, 0)) - - def _calculate_field_preference(self, format, field): - type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple - get_value = lambda f: format.get(self._get_field_setting(f, 'field')) - if type == 'multiple': - type = 'field' # Only 'field' is allowed in multiple for now - actual_fields = self._get_field_setting(field, 'field') - - value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields) - else: - value = get_value(field) - return self._calculate_field_preference_from_value(format, field, type, value) - - def calculate_preference(self, format): - # Determine missing protocol - if not format.get('protocol'): - format['protocol'] = determine_protocol(format) - - # Determine missing ext - if not format.get('ext') and 'url' in format: - format['ext'] = determine_ext(format['url']) - if format.get('vcodec') == 'none': - format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none' - format['video_ext'] = 'none' - else: - format['video_ext'] = format['ext'] - format['audio_ext'] = 'none' - # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported? - # format['preference'] = -1000 - - # Determine missing bitrates - if format.get('tbr') is None: - if format.get('vbr') is not None and format.get('abr') is not None: - format['tbr'] = format.get('vbr', 0) + format.get('abr', 0) - else: - if format.get('vcodec') != 'none' and format.get('vbr') is None: - format['vbr'] = format.get('tbr') - format.get('abr', 0) - if format.get('acodec') != 'none' and format.get('abr') is None: - format['abr'] = format.get('tbr') - format.get('vbr', 0) - - return tuple(self._calculate_field_preference(format, field) for field in self._order) + deprecation_warning( + 'hypervideo_dl.InfoExtractor.FormatSort is deprecated and may be removed in the future. ' + 'Use hypervideo_dl.utils.FormatSorter instead') + return FormatSort def _sort_formats(self, formats, field_preference=[]): - if not formats: + if not field_preference: + self._downloader.deprecation_warning( + 'hypervideo_dl.InfoExtractor._sort_formats is deprecated and is no longer required') return - format_sort = self.FormatSort(self, field_preference) - formats.sort(key=lambda f: format_sort.calculate_preference(f)) + self._downloader.deprecation_warning( + 'hypervideo_dl.InfoExtractor._sort_formats is deprecated and no longer works as expected. 
' + 'Return _format_sort_fields in the info_dict instead') + if formats: + formats[0]['__sort_fields'] = field_preference def _check_formats(self, formats, video_id): if formats: @@ -1969,14 +1743,9 @@ class InfoExtractor(object): else 'https:') def _proto_relative_url(self, url, scheme=None): - if url is None: - return url - if url.startswith('//'): - if scheme is None: - scheme = self.http_scheme() - return scheme + url - else: - return url + scheme = scheme or self.http_scheme() + assert scheme.endswith(':') + return sanitize_url(url, scheme=scheme[:-1]) def _sleep(self, timeout, video_id, msg_template=None): if msg_template is None: @@ -1988,17 +1757,19 @@ class InfoExtractor(object): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, quality=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), fatal=True, m3u8_id=None, data=None, headers={}, query={}): - manifest = self._download_xml( + res = self._download_xml_handle( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244) transform_source=transform_source, fatal=fatal, data=data, headers=headers, query=query) - - if manifest is False: + if res is False: return [] + manifest, urlh = res + manifest_url = urlh.geturl() + return self._parse_f4m_formats( manifest, manifest_url, video_id, preference=preference, quality=quality, f4m_id=f4m_id, transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) @@ -2006,7 +1777,7 @@ class InfoExtractor(object): def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, quality=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), fatal=True, m3u8_id=None): - if not isinstance(manifest, compat_etree_Element) and not fatal: + if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal: return [] # currently hypervideo cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy @@ -2166,7 +1937,7 @@ class InfoExtractor(object): ]), m3u8_doc) def format_url(url): - return url if re.match(r'^https?://', url) else compat_urlparse.urljoin(m3u8_url, url) + return url if re.match(r'^https?://', url) else urllib.parse.urljoin(m3u8_url, url) if self.get_param('hls_split_discontinuity', False): def _extract_m3u8_playlist_indices(manifest_url=None, m3u8_doc=None): @@ -2342,7 +2113,7 @@ class InfoExtractor(object): audio_group_id = last_stream_inf.get('AUDIO') # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which # references a rendition group MUST have a CODECS attribute. - # However, this is not always respected, for example, [2] + # However, this is not always respected. E.g. 
[2] # contains EXT-X-STREAM-INF tag which references AUDIO # rendition group but does not have CODECS and despite # referencing an audio group it represents a complete @@ -2406,12 +2177,14 @@ class InfoExtractor(object): return '/'.join(out) def _extract_smil_formats_and_subtitles(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) - - if smil is False: + res = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) + if res is False: assert not fatal return [], {} + smil, urlh = res + smil_url = urlh.geturl() + namespace = self._parse_smil_namespace(smil) fmts = self._parse_smil_formats( @@ -2428,13 +2201,17 @@ class InfoExtractor(object): return fmts def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): - smil = self._download_smil(smil_url, video_id, fatal=fatal) - if smil is False: + res = self._download_smil(smil_url, video_id, fatal=fatal) + if res is False: return {} + + smil, urlh = res + smil_url = urlh.geturl() + return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): - return self._download_xml( + return self._download_xml_handle( smil_url, video_id, 'Downloading SMIL file', 'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) @@ -2533,7 +2310,7 @@ class InfoExtractor(object): }) continue - src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + src_url = src if src.startswith('http') else urllib.parse.urljoin(base, src) src_url = src_url.strip() if proto == 'm3u8' or src_ext == 'm3u8': @@ -2556,7 +2333,7 @@ class InfoExtractor(object): 'plugin': 'flowplayer-3.2.0.1', } f4m_url += '&' if '?' in f4m_url else '?' - f4m_url += compat_urllib_parse_urlencode(f4m_params) + f4m_url += urllib.parse.urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) elif src_ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -2613,11 +2390,15 @@ class InfoExtractor(object): return subtitles def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True): - xspf = self._download_xml( + res = self._download_xml_handle( xspf_url, playlist_id, 'Downloading xpsf playlist', 'Unable to download xspf manifest', fatal=fatal) - if xspf is False: + if res is False: return [] + + xspf, urlh = res + xspf_url = urlh.geturl() + return self._parse_xspf( xspf, playlist_id, xspf_url=xspf_url, xspf_base_url=base_url(xspf_url)) @@ -2651,7 +2432,6 @@ class InfoExtractor(object): 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), }) - self._sort_formats(formats) entries.append({ 'id': playlist_id, @@ -2682,7 +2462,10 @@ class InfoExtractor(object): mpd_doc, urlh = res if mpd_doc is None: return [], {} - mpd_base_url = base_url(urlh.geturl()) + + # We could have been redirected to a new url when we retrieved our mpd file. 
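[Editor's note] A minimal illustration of the comment above (all URLs made up): relative BaseURL/segment references must be resolved against the final, post-redirect manifest URL, not the one originally requested.

import urllib.parse

requested = 'https://cdn.example.com/live/stream.mpd'   # what we asked for
final = 'https://edge7.example.com/live/stream.mpd'     # what urlh.geturl() reports
print(urllib.parse.urljoin(final, 'seg-1.m4s'))
# -> https://edge7.example.com/live/seg-1.m4s; resolving against `requested`
# would aim segment requests at a host that may no longer serve them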
+ mpd_url = urlh.geturl() + mpd_base_url = base_url(mpd_url) return self._parse_mpd_formats_and_subtitles( mpd_doc, mpd_id, mpd_base_url, mpd_url) @@ -2790,15 +2573,20 @@ class InfoExtractor(object): mime_type = representation_attrib['mimeType'] content_type = representation_attrib.get('contentType', mime_type.split('/')[0]) - codecs = parse_codecs(representation_attrib.get('codecs', '')) + codec_str = representation_attrib.get('codecs', '') + # Some kind of binary subtitle found in some youtube livestreams + if mime_type == 'application/x-rawcc': + codecs = {'scodec': codec_str} + else: + codecs = parse_codecs(codec_str) if content_type not in ('video', 'audio', 'text'): if mime_type == 'image/jpeg': content_type = mime_type - elif codecs['vcodec'] != 'none': + elif codecs.get('vcodec', 'none') != 'none': content_type = 'video' - elif codecs['acodec'] != 'none': + elif codecs.get('acodec', 'none') != 'none': content_type = 'audio' - elif codecs.get('tcodec', 'none') != 'none': + elif codecs.get('scodec', 'none') != 'none': content_type = 'text' elif mimetype2ext(mime_type) in ('tt', 'dfxp', 'ttml', 'xml', 'json'): content_type = 'text' @@ -2809,12 +2597,12 @@ class InfoExtractor(object): base_url = '' for element in (representation, adaptation_set, period, mpd_doc): base_url_e = element.find(_add_ns('BaseURL')) - if base_url_e is not None: + if try_call(lambda: base_url_e.text) is not None: base_url = base_url_e.text + base_url if re.match(r'^https?://', base_url): break if mpd_base_url and base_url.startswith('/'): - base_url = compat_urlparse.urljoin(mpd_base_url, base_url) + base_url = urllib.parse.urljoin(mpd_base_url, base_url) elif mpd_base_url and not re.match(r'^https?://', base_url): if not mpd_base_url.endswith('/'): mpd_base_url += '/' @@ -2869,6 +2657,8 @@ class InfoExtractor(object): def prepare_template(template_name, identifiers): tmpl = representation_ms_info[template_name] + if representation_id is not None: + tmpl = tmpl.replace('$RepresentationID$', representation_id) # First of, % characters outside $...$ templates # must be escaped by doubling for proper processing # by % operator string formatting used further (see @@ -2883,8 +2673,6 @@ class InfoExtractor(object): t += c # Next, $...$ templates are translated to their # %(...) counterparts to be used with % operator - if representation_id is not None: - t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) t.replace('$$', '$') @@ -2960,8 +2748,8 @@ class InfoExtractor(object): segment_number += 1 segment_time += segment_d elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info: - # No media template - # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI + # No media template, + # e.g. https://www.youtube.com/watch?v=iXZV5uAYMJI # or any YouTube dashsegments video fragments = [] segment_index = 0 @@ -2978,7 +2766,7 @@ class InfoExtractor(object): representation_ms_info['fragments'] = fragments elif 'segment_urls' in representation_ms_info: # Segment URLs with no SegmentTimeline - # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 + # E.g. 
https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091 # https://github.com/ytdl-org/youtube-dl/pull/14844 fragments = [] segment_duration = float_or_none( @@ -3070,9 +2858,10 @@ class InfoExtractor(object): stream_name = stream.get('Name') stream_language = stream.get('Language', 'und') for track in stream.findall('QualityLevel'): - fourcc = track.get('FourCC') or ('AACL' if track.get('AudioTag') == '255' else None) + KNOWN_TAGS = {'255': 'AACL', '65534': 'EC-3'} + fourcc = track.get('FourCC') or KNOWN_TAGS.get(track.get('AudioTag')) # TODO: add support for WVC1 and WMAP - if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML'): + if fourcc not in ('H264', 'AVC1', 'AACL', 'TTML', 'EC-3'): self.report_warning('%s is not a supported codec' % fourcc) continue tbr = int(track.attrib['Bitrate']) // 1000 @@ -3084,7 +2873,7 @@ class InfoExtractor(object): sampling_rate = int_or_none(track.get('SamplingRate')) track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) - track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern) + track_url_pattern = urllib.parse.urljoin(ism_url, track_url_pattern) fragments = [] fragment_ctx = { @@ -3103,7 +2892,7 @@ class InfoExtractor(object): fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat for _ in range(fragment_repeat): fragments.append({ - 'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern), + 'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern), 'duration': fragment_ctx['duration'] / stream_timescale, }) fragment_ctx['time'] += fragment_ctx['duration'] @@ -3171,7 +2960,8 @@ class InfoExtractor(object): return f return {} - def _media_formats(src, cur_media_type, type_info={}): + def _media_formats(src, cur_media_type, type_info=None): + type_info = type_info or {} full_url = absolute_url(src) ext = type_info.get('ext') or determine_ext(full_url) if ext == 'm3u8': @@ -3189,12 +2979,13 @@ class InfoExtractor(object): formats = [{ 'url': full_url, 'vcodec': 'none' if cur_media_type == 'audio' else None, + 'ext': ext, }] return is_plain_url, formats entries = [] # amp-video and amp-audio are very similar to their HTML5 counterparts - # so we wll include them right here (see + # so we will include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) # For dl8-* tags see https://delight-vr.com/documentation/dl8-video/ _MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)' @@ -3204,8 +2995,8 @@ class InfoExtractor(object): media_tags.extend(re.findall( # We only allow video|audio followed by a whitespace or '>'. # Allowing more characters may end up in significant slow down (see - # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL: - # http://www.porntrex.com/maps/videositemap.xml). + # https://github.com/ytdl-org/youtube-dl/issues/11979, + # e.g. http://www.porntrex.com/maps/videositemap.xml). 
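[Editor's note] To make the comment above concrete, here is the media-tag regex on a toy snippet (HTML made up). The (?:\s+[^>]*)?> part is what restricts matches to real tags, so plain text like 'videositemap.xml' cannot trigger it:

import re

_MEDIA_TAG_NAME_RE = r'(?:(?:amp|dl8(?:-live)?)-)?(video|audio)'
html = '<amp-video width="640"><source src="clip.mp4" type="video/mp4"></amp-video>'
m = re.search(
    r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, html)
print(m.group('tag'))  # -> 'amp-video'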
r'(?s)(<(?P<tag>%s)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>' % _MEDIA_TAG_NAME_RE, webpage)) for media_tag, _, media_type, media_content in media_tags: media_info = { @@ -3213,9 +3004,10 @@ class InfoExtractor(object): 'subtitles': {}, } media_attributes = extract_attributes(media_tag) - src = strip_or_none(media_attributes.get('src')) + src = strip_or_none(dict_get(media_attributes, ('src', 'data-video-src', 'data-src', 'data-source'))) if src: - _, formats = _media_formats(src, media_type) + f = parse_content_type(media_attributes.get('type')) + _, formats = _media_formats(src, media_type, f) media_info['formats'].extend(formats) media_info['thumbnail'] = absolute_url(media_attributes.get('poster')) if media_content: @@ -3223,7 +3015,7 @@ class InfoExtractor(object): s_attr = extract_attributes(source_tag) # data-video-src and data-src are non standard but seen # several times in the wild - src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src'))) + src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src', 'data-source'))) if not src: continue f = parse_content_type(s_attr.get('type')) @@ -3332,7 +3124,7 @@ class InfoExtractor(object): http_f = f.copy() del http_f['manifest_url'] http_url = re.sub( - REPL_REGEX, protocol + r'://%s/\g<1>%s\3' % (http_host, qualities[i]), f['url']) + REPL_REGEX, protocol + fr'://{http_host}/\g<1>{qualities[i]}\3', f['url']) http_f.update({ 'format_id': http_f['format_id'].replace('hls-', protocol + '-'), 'url': http_url, @@ -3344,7 +3136,7 @@ class InfoExtractor(object): return formats, subtitles def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): - query = compat_urlparse.urlparse(url).query + query = urllib.parse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) mobj = re.search( r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url) @@ -3353,7 +3145,7 @@ class InfoExtractor(object): formats = [] def manifest_url(manifest): - m_url = '%s/%s' % (http_base_url, manifest) + m_url = f'{http_base_url}/{manifest}' if query: m_url += '?%s' % query return m_url @@ -3390,7 +3182,7 @@ class InfoExtractor(object): for protocol in ('rtmp', 'rtsp'): if protocol not in skip_protocols: formats.append({ - 'url': '%s:%s' % (protocol, url_base), + 'url': f'{protocol}:{url_base}', 'format_id': protocol, 'protocol': protocol, }) @@ -3450,7 +3242,7 @@ class InfoExtractor(object): if not isinstance(track, dict): continue track_kind = track.get('kind') - if not track_kind or not isinstance(track_kind, compat_str): + if not track_kind or not isinstance(track_kind, str): continue if track_kind.lower() not in ('captions', 'subtitles'): continue @@ -3477,7 +3269,6 @@ class InfoExtractor(object): 'url': formats[0]['url'], }) else: - self._sort_formats(formats) entry['formats'] = formats entries.append(entry) if len(entries) == 1: @@ -3523,13 +3314,14 @@ class InfoExtractor(object): # Often no height is provided but there is a label in # format like "1080p", "720p SD", or 1080. 
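[Editor's note] A quick demonstration of the height-from-label parsing described in the comment above, with made-up labels:

import re

for label in ('1080p', '720p SD', '1080', 'HD only'):
    m = re.search(r'^(\d{3,4})[pP]?(?:\b|$)', label)
    print(label, '->', m.group(1) if m else None)
# 1080p -> 1080, 720p SD -> 720, 1080 -> 1080, HD only -> None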
height = int_or_none(self._search_regex( - r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), + r'^(\d{3,4})[pP]?(?:\b|$)', str(source.get('label') or ''), 'height', default=None)) a_format = { 'url': source_url, 'width': int_or_none(source.get('width')), 'height': height, - 'tbr': int_or_none(source.get('bitrate')), + 'tbr': int_or_none(source.get('bitrate'), scale=1000), + 'filesize': int_or_none(source.get('filesize')), 'ext': ext, } if source_url.startswith('rtmp'): @@ -3556,7 +3348,7 @@ class InfoExtractor(object): def _int(self, v, name, fatal=False, **kwargs): res = int_or_none(v, **kwargs) if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + msg = f'Failed to extract {name}: Could not parse value {v!r}' if fatal: raise ExtractorError(msg) else: @@ -3566,7 +3358,7 @@ class InfoExtractor(object): def _float(self, v, name, fatal=False, **kwargs): res = float_or_none(v, **kwargs) if res is None: - msg = 'Failed to extract %s: Could not parse value %r' % (name, v) + msg = f'Failed to extract {name}: Could not parse value {v!r}' if fatal: raise ExtractorError(msg) else: @@ -3575,17 +3367,15 @@ class InfoExtractor(object): def _set_cookie(self, domain, name, value, expire_time=None, port=None, path='/', secure=False, discard=False, rest={}, **kwargs): - cookie = compat_cookiejar_Cookie( + cookie = http.cookiejar.Cookie( 0, name, value, port, port is not None, domain, True, domain.startswith('.'), path, True, secure, expire_time, discard, None, None, rest) - self._downloader.cookiejar.set_cookie(cookie) + self.cookiejar.set_cookie(cookie) def _get_cookies(self, url): - """ Return a compat_cookies_SimpleCookie with the cookies for the url """ - req = sanitized_Request(url) - self._downloader.cookiejar.add_cookie_header(req) - return compat_cookies_SimpleCookie(req.get_header('Cookie')) + """ Return a http.cookies.SimpleCookie with the cookies for the url """ + return LenientSimpleCookie(self._downloader._calc_cookies(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """ @@ -3604,9 +3394,7 @@ class InfoExtractor(object): for header, cookies in url_handle.headers.items(): if header.lower() != 'set-cookie': continue - if sys.version_info[0] >= 3: - cookies = cookies.encode('iso-8859-1') - cookies = cookies.decode('utf-8') + cookies = cookies.encode('iso-8859-1').decode('utf-8') cookie_value = re.search( r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies) if cookie_value: @@ -3614,34 +3402,82 @@ class InfoExtractor(object): self._set_cookie(domain, cookie, value) break - def get_testcases(self, include_onlymatching=False): - t = getattr(self, '_TEST', None) + @classmethod + def get_testcases(cls, include_onlymatching=False): + # Do not look in super classes + t = vars(cls).get('_TEST') if t: - assert not hasattr(self, '_TESTS'), \ - '%s has _TEST and _TESTS' % type(self).__name__ + assert not hasattr(cls, '_TESTS'), f'{cls.ie_key()}IE has _TEST and _TESTS' tests = [t] else: - tests = getattr(self, '_TESTS', []) + tests = vars(cls).get('_TESTS', []) for t in tests: if not include_onlymatching and t.get('only_matching', False): continue - t['name'] = type(self).__name__[:-len('IE')] + t['name'] = cls.ie_key() yield t - def is_suitable(self, age_limit): - """ Test whether the extractor is generally suitable for the given - age limit (i.e. 
pornographic sites are not, all others usually are) """ - - any_restricted = False - for tc in self.get_testcases(include_onlymatching=False): - if tc.get('playlist', []): - tc = tc['playlist'][0] - is_restricted = age_restricted( - tc.get('info_dict', {}).get('age_limit'), age_limit) - if not is_restricted: - return True - any_restricted = any_restricted or is_restricted - return not any_restricted + @classmethod + def get_webpage_testcases(cls): + tests = vars(cls).get('_WEBPAGE_TESTS', []) + for t in tests: + t['name'] = cls.ie_key() + return tests + + @classproperty(cache=True) + def age_limit(cls): + """Get age limit from the testcases""" + return max(traverse_obj( + (*cls.get_testcases(include_onlymatching=False), *cls.get_webpage_testcases()), + (..., (('playlist', 0), None), 'info_dict', 'age_limit')) or [0]) + + @classproperty(cache=True) + def _RETURN_TYPE(cls): + """What the extractor returns: "video", "playlist", "any", or None (Unknown)""" + tests = tuple(cls.get_testcases(include_onlymatching=False)) + if not tests: + return None + elif not any(k.startswith('playlist') for test in tests for k in test): + return 'video' + elif all(any(k.startswith('playlist') for k in test) for test in tests): + return 'playlist' + return 'any' + + @classmethod + def is_single_video(cls, url): + """Returns whether the URL is of a single video, None if unknown""" + assert cls.suitable(url), 'The URL must be suitable for the extractor' + return {'video': True, 'playlist': False}.get(cls._RETURN_TYPE) + + @classmethod + def is_suitable(cls, age_limit): + """Test whether the extractor is generally suitable for the given age limit""" + return not age_restricted(cls.age_limit, age_limit) + + @classmethod + def description(cls, *, markdown=True, search_examples=None): + """Description of the extractor""" + desc = '' + if cls._NETRC_MACHINE: + if markdown: + desc += f' [<abbr title="netrc machine"><em>{cls._NETRC_MACHINE}</em></abbr>]' + else: + desc += f' [{cls._NETRC_MACHINE}]' + if cls.IE_DESC is False: + desc += ' [HIDDEN]' + elif cls.IE_DESC: + desc += f' {cls.IE_DESC}' + if cls.SEARCH_KEY: + desc += f'; "{cls.SEARCH_KEY}:" prefix' + if search_examples: + _COUNTS = ('', '5', '10', 'all') + desc += f' (e.g. "{cls.SEARCH_KEY}{random.choice(_COUNTS)}:{random.choice(search_examples)}")' + if not cls.working(): + desc += ' (**Currently broken**)' if markdown else ' (Currently broken)' + + # Escape emojis. Ref: https://github.com/github/markup/issues/1153 + name = (' - **%s**' % re.sub(r':(\w+:)', ':\u200B\\g<1>', cls.IE_NAME)) if markdown else cls.IE_NAME + return f'{name}:{desc}' if desc else name def extract_subtitles(self, *args, **kwargs): if (self.get_param('writesubtitles', False) @@ -3652,6 +3488,9 @@ class InfoExtractor(object): def _get_subtitles(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + class CommentsDisabled(Exception): + """Raise in _get_comments if comments are disabled for the video""" + def extract_comments(self, *args, **kwargs): if not self.get_param('getcomments'): return None @@ -3667,6 +3506,8 @@ class InfoExtractor(object): interrupted = False except KeyboardInterrupt: self.to_screen('Interrupted by user') + except self.CommentsDisabled: + return {'comments': None, 'comment_count': None} except Exception as e: if self.get_param('ignoreerrors') is not True: raise @@ -3686,7 +3527,7 @@ class InfoExtractor(object): def _merge_subtitle_items(subtitle_list1, subtitle_list2): """ Merge subtitle items for one language. 
Items with duplicated URLs/data will be dropped. """ - list1_data = set((item.get('url'), item.get('data')) for item in subtitle_list1) + list1_data = {(item.get('url'), item.get('data')) for item in subtitle_list1} ret = list(subtitle_list1) ret.extend(item for item in subtitle_list2 if (item.get('url'), item.get('data')) not in list1_data) return ret @@ -3710,11 +3551,15 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + @functools.cached_property + def _cookies_passed(self): + """Whether cookies have been passed to YoutubeDL""" + return self.get_param('cookiefile') is not None or self.get_param('cookiesfrombrowser') is not None + def mark_watched(self, *args, **kwargs): if not self.get_param('mark_watched', False): return - if (self.supports_login() and self._get_login_info()[0] is not None - or self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')): + if self.supports_login() and self._get_login_info()[0] is not None or self._cookies_passed: self._mark_watched(*args, **kwargs) def _mark_watched(self, *args, **kwargs): @@ -3727,11 +3572,15 @@ class InfoExtractor(object): headers['Ytdl-request-proxy'] = geo_verification_proxy return headers - def _generic_id(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) + @staticmethod + def _generic_id(url): + return urllib.parse.unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) - def _generic_title(self, url): - return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) + def _generic_title(self, url='', webpage='', *, default=None): + return (self._og_search_title(webpage, default=None) + or self._html_extract_title(webpage, default=None) + or urllib.parse.unquote(os.path.splitext(url_basename(url))[0]) + or default) @staticmethod def _availability(is_private=None, needs_premium=None, needs_subscription=None, needs_auth=None, is_unlisted=None): @@ -3754,8 +3603,8 @@ class InfoExtractor(object): @param default The default value to return when the key is not present (default: []) @param casesense When false, the values are converted to lower case ''' - val = traverse_obj( - self._downloader.params, ('extractor_args', (ie_key or self.ie_key()).lower(), key)) + ie_key = ie_key if isinstance(ie_key, str) else (ie_key or self).ie_key() + val = traverse_obj(self._downloader.params, ('extractor_args', ie_key.lower(), key)) if val is None: return [] if default is NO_DEFAULT else default return list(val) if casesense else [x.lower() for x in val] @@ -3776,6 +3625,72 @@ class InfoExtractor(object): self.to_screen(f'Downloading {playlist_label}{playlist_id} - add --no-playlist to download just the {video_label}{video_id}') return True + def _error_or_warning(self, err, _count=None, _retries=0, *, fatal=True): + RetryManager.report_retry( + err, _count or int(fatal), _retries, + info=self.to_screen, warn=self.report_warning, error=None if fatal else self.report_warning, + sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor')) + + def RetryManager(self, **kwargs): + return RetryManager(self.get_param('extractor_retries', 3), self._error_or_warning, **kwargs) + + def _extract_generic_embeds(self, url, *args, info_dict={}, note='Extracting generic embeds', **kwargs): + display_id = traverse_obj(info_dict, 'display_id', 'id') + self.to_screen(f'{format_field(display_id, None, "%s: ")}{note}') + return 
self._downloader.get_info_extractor('Generic')._extract_embeds( + smuggle_url(url, {'block_ies': [self.ie_key()]}), *args, **kwargs) + + @classmethod + def extract_from_webpage(cls, ydl, url, webpage): + ie = (cls if isinstance(cls._extract_from_webpage, types.MethodType) + else ydl.get_info_extractor(cls.ie_key())) + for info in ie._extract_from_webpage(url, webpage) or []: + # url = None since we do not want to set (webpage/original)_url + ydl.add_default_extra_info(info, ie, None) + yield info + + @classmethod + def _extract_from_webpage(cls, url, webpage): + for embed_url in orderedSet( + cls._extract_embed_urls(url, webpage) or [], lazy=True): + yield cls.url_result(embed_url, None if cls._VALID_URL is False else cls) + + @classmethod + def _extract_embed_urls(cls, url, webpage): + """@returns all the embed urls on the webpage""" + if '_EMBED_URL_RE' not in cls.__dict__: + assert isinstance(cls._EMBED_REGEX, (list, tuple)) + for idx, regex in enumerate(cls._EMBED_REGEX): + assert regex.count('(?P<url>') == 1, \ + f'{cls.__name__}._EMBED_REGEX[{idx}] must have exactly 1 url group\n\t{regex}' + cls._EMBED_URL_RE = tuple(map(re.compile, cls._EMBED_REGEX)) + + for regex in cls._EMBED_URL_RE: + for mobj in regex.finditer(webpage): + embed_url = urllib.parse.urljoin(url, unescapeHTML(mobj.group('url'))) + if cls._VALID_URL is False or cls.suitable(embed_url): + yield embed_url + + class StopExtraction(Exception): + pass + + @classmethod + def _extract_url(cls, webpage): # TODO: Remove + """Only for compatibility with some older extractors""" + return next(iter(cls._extract_embed_urls(None, webpage) or []), None) + + @classmethod + def __init_subclass__(cls, *, plugin_name=None, **kwargs): + if plugin_name: + mro = inspect.getmro(cls) + super_class = cls.__wrapped__ = mro[mro.index(cls) + 1] + cls.IE_NAME, cls.ie_key = f'{super_class.IE_NAME}+{plugin_name}', super_class.ie_key + while getattr(super_class, '__wrapped__', None): + super_class = super_class.__wrapped__ + setattr(sys.modules[super_class.__module__], super_class.__name__, cls) + + return super().__init_subclass__(**kwargs) + class SearchInfoExtractor(InfoExtractor): """ @@ -3785,9 +3700,10 @@ class SearchInfoExtractor(InfoExtractor): """ _MAX_RESULTS = float('inf') + _RETURN_TYPE = 'playlist' - @classmethod - def _make_valid_url(cls): + @classproperty + def _VALID_URL(cls): return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY def _real_extract(self, query): @@ -3799,7 +3715,7 @@ class SearchInfoExtractor(InfoExtractor): else: n = int(prefix) if n <= 0: - raise ExtractorError('invalid download number %s for query "%s"' % (n, query)) + raise ExtractorError(f'invalid download number {n} for query "{query}"') elif n > self._MAX_RESULTS: self.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) n = self._MAX_RESULTS @@ -3816,6 +3732,15 @@ class SearchInfoExtractor(InfoExtractor): """Returns an iterator of search results""" raise NotImplementedError('This method must be implemented by subclasses') - @property - def SEARCH_KEY(self): - return self._SEARCH_KEY + @classproperty + def SEARCH_KEY(cls): + return cls._SEARCH_KEY + + +class UnsupportedURLIE(InfoExtractor): + _VALID_URL = '.*' + _ENABLED = False + IE_DESC = False + + def _real_extract(self, url): + raise UnsupportedError(url) diff --git a/hypervideo_dl/extractor/commonmistakes.py b/hypervideo_dl/extractor/commonmistakes.py index eb76fe5..a4a38cf 100644 --- a/hypervideo_dl/extractor/commonmistakes.py 
+++ b/hypervideo_dl/extractor/commonmistakes.py @@ -1,16 +1,10 @@ -from __future__ import unicode_literals - -import sys - from .common import InfoExtractor from ..utils import ExtractorError class CommonMistakesIE(InfoExtractor): IE_DESC = False # Do not list - _VALID_URL = r'''(?x) - (?:url|URL)$ - ''' + _VALID_URL = r'(?:url|URL|hypervideo)$' _TESTS = [{ 'url': 'url', @@ -35,9 +29,7 @@ class UnicodeBOMIE(InfoExtractor): IE_DESC = False _VALID_URL = r'(?P<bom>\ufeff)(?P<id>.*)$' - # Disable test for python 3.2 since BOM is broken in re in this version - # (see https://github.com/ytdl-org/youtube-dl/issues/9751) - _TESTS = [] if (3, 0) < sys.version_info <= (3, 3) else [{ + _TESTS = [{ 'url': '\ufeffhttp://www.youtube.com/watch?v=BaW_jenozKc', 'only_matching': True, }] diff --git a/hypervideo_dl/extractor/commonprotocols.py b/hypervideo_dl/extractor/commonprotocols.py index 3708c6a..2f93e8e 100644 --- a/hypervideo_dl/extractor/commonprotocols.py +++ b/hypervideo_dl/extractor/commonprotocols.py @@ -1,10 +1,6 @@ -from __future__ import unicode_literals - +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) class RtmpIE(InfoExtractor): @@ -28,7 +24,7 @@ class RtmpIE(InfoExtractor): 'formats': [{ 'url': url, 'ext': 'flv', - 'format_id': compat_urlparse.urlparse(url).scheme, + 'format_id': urllib.parse.urlparse(url).scheme, }], } diff --git a/hypervideo_dl/extractor/condenast.py b/hypervideo_dl/extractor/condenast.py index 54e7af8..3170c29 100644 --- a/hypervideo_dl/extractor/condenast.py +++ b/hypervideo_dl/extractor/condenast.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -61,7 +58,10 @@ class CondeNastIE(InfoExtractor): )''' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys()) + _EMBED_REGEX = [r'''(?x) + <(?:iframe|script)[^>]+?src=(["\'])(?P<url> + (?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+? 
+ )\1''' % '|'.join(_SITES.keys())] _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', @@ -197,7 +197,6 @@ class CondeNastIE(InfoExtractor): 'ext': ext, 'quality': 1 if quality == 'high' else 0, }) - self._sort_formats(formats) subtitles = {} for t, caption in video_info.get('captions', {}).items(): diff --git a/hypervideo_dl/extractor/contv.py b/hypervideo_dl/extractor/contv.py index 84b462d..d69e816 100644 --- a/hypervideo_dl/extractor/contv.py +++ b/hypervideo_dl/extractor/contv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -72,8 +69,6 @@ class CONtvIE(InfoExtractor): 'url': media_mp4_url, }) - self._sort_formats(formats) - subtitles = {} captions = m_details.get('captions') or {} for caption_url in captions.values(): diff --git a/hypervideo_dl/extractor/corus.py b/hypervideo_dl/extractor/corus.py index 1194613..c03d653 100644 --- a/hypervideo_dl/extractor/corus.py +++ b/hypervideo_dl/extractor/corus.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .theplatform import ThePlatformFeedIE from ..utils import ( dict_get, @@ -11,7 +7,7 @@ from ..utils import ( ) -class CorusIE(ThePlatformFeedIE): +class CorusIE(ThePlatformFeedIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'''(?x) https?:// (?:www\.)? @@ -130,7 +126,6 @@ class CorusIE(ThePlatformFeedIE): smil, smil_url, video_id, namespace)) if not formats and video.get('drm'): self.report_drm(video_id) - self._sort_formats(formats) subtitles = {} for track in video.get('tracks', []): diff --git a/hypervideo_dl/extractor/coub.py b/hypervideo_dl/extractor/coub.py index e90aa19..9bab698 100644 --- a/hypervideo_dl/extractor/coub.py +++ b/hypervideo_dl/extractor/coub.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -107,8 +104,6 @@ class CoubIE(InfoExtractor): 'source_preference': preference_key(MOBILE), }) - self._sort_formats(formats) - thumbnail = coub.get('picture') duration = float_or_none(coub.get('duration')) timestamp = parse_iso8601(coub.get('published_at') or coub.get('created_at')) diff --git a/hypervideo_dl/extractor/cozytv.py b/hypervideo_dl/extractor/cozytv.py index d49f1ca..5ef5afc 100644 --- a/hypervideo_dl/extractor/cozytv.py +++ b/hypervideo_dl/extractor/cozytv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_strdate diff --git a/hypervideo_dl/extractor/cpac.py b/hypervideo_dl/extractor/cpac.py index 2274115..0f23f2b 100644 --- a/hypervideo_dl/extractor/cpac.py +++ b/hypervideo_dl/extractor/cpac.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -12,13 +9,6 @@ from ..utils import ( urljoin, ) -# compat_range -try: - if callable(xrange): - range = xrange -except (NameError, TypeError): - pass - class CPACIE(InfoExtractor): IE_NAME = 'cpac' @@ -64,8 +54,6 @@ class CPACIE(InfoExtractor): else: fmt['language_preference'] = -10 - self._sort_formats(formats) - category = str_or_none(content['details']['category_%s_t' % (url_lang, )]) def is_live(v_type): diff --git a/hypervideo_dl/extractor/cracked.py b/hypervideo_dl/extractor/cracked.py index f77a68e..c6aabcc 100644 --- a/hypervideo_dl/extractor/cracked.py +++ 
b/hypervideo_dl/extractor/cracked.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/crackle.py b/hypervideo_dl/extractor/crackle.py index db4962c..4610015 100644 --- a/hypervideo_dl/extractor/crackle.py +++ b/hypervideo_dl/extractor/crackle.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals, division - import hashlib import hmac import re @@ -180,7 +177,6 @@ class CrackleIE(InfoExtractor): }) if not formats and has_drm: self.report_drm(video_id) - self._sort_formats(formats) description = media.get('Description') duration = int_or_none(media.get( diff --git a/hypervideo_dl/extractor/craftsy.py b/hypervideo_dl/extractor/craftsy.py index ed2f442..307bfb9 100644 --- a/hypervideo_dl/extractor/craftsy.py +++ b/hypervideo_dl/extractor/craftsy.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .brightcove import BrightcoveNewIE from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/crooksandliars.py b/hypervideo_dl/extractor/crooksandliars.py index 7fb782d..4de7e3d 100644 --- a/hypervideo_dl/extractor/crooksandliars.py +++ b/hypervideo_dl/extractor/crooksandliars.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -9,6 +7,8 @@ from ..utils import ( class CrooksAndLiarsIE(InfoExtractor): _VALID_URL = r'https?://embed\.crooksandliars\.com/(?:embed|v)/(?P<id>[A-Za-z0-9]+)' + _EMBED_REGEX = [r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1'] + _TESTS = [{ 'url': 'https://embed.crooksandliars.com/embed/8RUoRhRi', 'info_dict': { @@ -45,7 +45,6 @@ class CrooksAndLiarsIE(InfoExtractor): 'format_id': item['type'], 'quality': quality(item['type']), } for item in manifest['flavors'] if item['mime'].startswith('video/')] - self._sort_formats(formats) return { 'url': url, diff --git a/hypervideo_dl/extractor/crowdbunker.py b/hypervideo_dl/extractor/crowdbunker.py index 72906af..d83c015 100644 --- a/hypervideo_dl/extractor/crowdbunker.py +++ b/hypervideo_dl/extractor/crowdbunker.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -63,7 +60,6 @@ class CrowdBunkerIE(InfoExtractor): 'width': int_or_none(image.get('width')), } for image in video_json.get('thumbnails') or [] if image.get('url')] - self._sort_formats(formats) return { 'id': id, 'title': video_json.get('title'), diff --git a/hypervideo_dl/extractor/crunchyroll.py b/hypervideo_dl/extractor/crunchyroll.py index 7edb645..d226050 100644 --- a/hypervideo_dl/extractor/crunchyroll.py +++ b/hypervideo_dl/extractor/crunchyroll.py @@ -1,44 +1,16 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 -import re -import json -import zlib +import urllib.parse -from hashlib import sha1 -from math import pow, sqrt, floor from .common import InfoExtractor -from .vrv import VRVBaseIE -from ..compat import ( - compat_b64decode, - compat_etree_Element, - compat_etree_fromstring, - compat_str, - compat_urllib_parse_urlencode, - compat_urllib_request, - compat_urlparse, -) from ..utils import ( ExtractorError, - bytes_to_intlist, - extract_attributes, float_or_none, format_field, - intlist_to_bytes, - int_or_none, join_nonempty, - lowercase_escape, - merge_dicts, + parse_iso8601, qualities, - remove_end, - sanitized_Request, traverse_obj, try_get, - xpath_text, 
-) -from ..aes import ( - aes_cbc_decrypt, ) @@ -46,16 +18,7 @@ class CrunchyrollBaseIE(InfoExtractor): _LOGIN_URL = 'https://www.crunchyroll.com/welcome/login' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' - - def _call_rpc_api(self, method, video_id, note=None, data=None): - data = data or {} - data['req'] = 'RpcApi' + method - data = compat_urllib_parse_urlencode(data).encode('utf-8') - return self._download_xml( - 'https://www.crunchyroll.com/xml/', - video_id, note, fatal=False, data=data, headers={ - 'Content-Type': 'application/x-www-form-urlencoded', - }) + params = None def _perform_login(self, username, password): if self._get_cookies(self._LOGIN_URL).get('etp_rt'): @@ -76,7 +39,7 @@ class CrunchyrollBaseIE(InfoExtractor): login_response = self._download_json( f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=compat_urllib_parse_urlencode({ + data=urllib.parse.urlencode({ 'account': username, 'password': password, 'session_id': session_id @@ -86,800 +49,173 @@ class CrunchyrollBaseIE(InfoExtractor): if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): raise ExtractorError('Login succeeded but did not set etp_rt cookie') - # Beta-specific, but needed for redirects - def _get_beta_embedded_json(self, webpage, display_id): + def _get_embedded_json(self, webpage, display_id): initial_state = self._parse_json(self._search_regex( r'__INITIAL_STATE__\s*=\s*({.+?})\s*;', webpage, 'initial state'), display_id) app_config = self._parse_json(self._search_regex( r'__APP_CONFIG__\s*=\s*({.+?})\s*;', webpage, 'app config'), display_id) return initial_state, app_config - def _redirect_to_beta(self, webpage, iekey, video_id): - if not self._get_cookies(self._LOGIN_URL).get('etp_rt'): - raise ExtractorError('Received a beta page from non-beta url when not logged in.') - initial_state, app_config = self._get_beta_embedded_json(webpage, video_id) - url = app_config['baseSiteUrl'] + initial_state['router']['locations']['current']['pathname'] - self.to_screen(f'{video_id}: Redirected to beta site - {url}') - return self.url_result(f'{url}', iekey, video_id) - - @staticmethod - def _add_skip_wall(url): - parsed_url = compat_urlparse.urlparse(url) - qs = compat_urlparse.parse_qs(parsed_url.query) - # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message: - # > This content may be inappropriate for some people. - # > Are you sure you want to continue? - # since it's not disabled by default in crunchyroll account's settings. - # See https://github.com/ytdl-org/youtube-dl/issues/7202. 
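The removed _add_skip_wall helper (its remaining body continues just below) forced skip_wall=1 onto every URL to bypass that confirmation page. A standalone sketch of the same idea on the stdlib urllib.parse API, in line with the commit's move away from the compat aliases (helper name hypothetical):

import urllib.parse

def add_skip_wall(url):
    # Force skip_wall=1 so the maturity wall is skipped for the request.
    parsed = urllib.parse.urlparse(url)
    qs = urllib.parse.parse_qs(parsed.query)
    qs['skip_wall'] = ['1']
    return urllib.parse.urlunparse(
        parsed._replace(query=urllib.parse.urlencode(qs, doseq=True)))

# add_skip_wall('http://www.crunchyroll.com/ladies-versus-butlers')
# -> 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1'
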
- qs['skip_wall'] = ['1'] - return compat_urlparse.urlunparse( - parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) - - -class CrunchyrollIE(CrunchyrollBaseIE, VRVBaseIE): - IE_NAME = 'crunchyroll' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|(?:[^/]*/){1,2}[^/?&]*?)(?P<id>[0-9]+))(?:[/?&]|$)' - _TESTS = [{ - 'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513', - 'info_dict': { - 'id': '645513', - 'ext': 'mp4', - 'title': 'Wanna be the Strongest in the World Episode 1 – An Idol-Wrestler is Born!', - 'description': 'md5:2d17137920c64f2f49981a7797d275ef', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Yomiuri Telecasting Corporation (YTV)', - 'upload_date': '20131013', - 'url': 're:(?!.*&)', - }, - 'params': { - # rtmp - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - 'url': 'http://www.crunchyroll.com/media-589804/culture-japan-1', - 'info_dict': { - 'id': '589804', - 'ext': 'flv', - 'title': 'Culture Japan Episode 1 – Rebuilding Japan after the 3.11', - 'description': 'md5:2fbc01f90b87e8e9137296f37b461c12', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Danny Choo Network', - 'upload_date': '20120213', - }, - 'params': { - # rtmp - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - 'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409', - 'info_dict': { - 'id': '702409', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Re:Zero Partners', - 'timestamp': 1462098900, - 'upload_date': '20160501', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.com/konosuba-gods-blessing-on-this-wonderful-world/episode-1-give-me-deliverance-from-this-judicial-injustice-727589', - 'info_dict': { - 'id': '727589', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Kadokawa Pictures Inc.', - 'timestamp': 1484130900, - 'upload_date': '20170111', - 'series': compat_str, - 'season': "KONOSUBA -God's blessing on this wonderful world! 
2", - 'season_number': 2, - 'episode': 'Give Me Deliverance From This Judicial Injustice!', - 'episode_number': 1, - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', - 'only_matching': True, - }, { - # geo-restricted (US), 18+ maturity wall, non-premium available - 'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617', - 'only_matching': True, - }, { - # A description with double quotes - 'url': 'http://www.crunchyroll.com/11eyes/episode-1-piros-jszaka-red-night-535080', - 'info_dict': { - 'id': '535080', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'uploader': 'Marvelous AQL Inc.', - 'timestamp': 1255512600, - 'upload_date': '20091014', - }, - 'params': { - # Just test metadata extraction - 'skip_download': True, - }, - }, { - # make sure we can extract an uploader name that's not a link - 'url': 'http://www.crunchyroll.com/hakuoki-reimeiroku/episode-1-dawn-of-the-divine-warriors-606899', - 'info_dict': { - 'id': '606899', - 'ext': 'mp4', - 'title': 'Hakuoki Reimeiroku Episode 1 – Dawn of the Divine Warriors', - 'description': 'Ryunosuke was left to die, but Serizawa-san asked him a simple question "Do you want to live?"', - 'uploader': 'Geneon Entertainment', - 'upload_date': '20120717', - }, - 'params': { - # just test metadata extraction - 'skip_download': True, - }, - 'skip': 'Video gone', - }, { - # A video with a vastly different season name compared to the series name - 'url': 'http://www.crunchyroll.com/nyarko-san-another-crawling-chaos/episode-1-test-590532', - 'info_dict': { - 'id': '590532', - 'ext': 'mp4', - 'title': compat_str, - 'description': compat_str, - 'uploader': 'TV TOKYO', - 'timestamp': 1330956000, - 'upload_date': '20120305', - 'series': 'Nyarko-san: Another Crawling Chaos', - 'season': 'Haiyoru! Nyaruani (ONA)', - }, - 'params': { - # Just test metadata extraction - 'skip_download': True, - }, - }, { - 'url': 'http://www.crunchyroll.com/media-723735', - 'only_matching': True, - }, { - 'url': 'https://www.crunchyroll.com/en-gb/mob-psycho-100/episode-2-urban-legends-encountering-rumors-780921', - 'only_matching': True, - }] - - _FORMAT_IDS = { - '360': ('60', '106'), - '480': ('61', '106'), - '720': ('62', '106'), - '1080': ('80', '108'), - } - - def _download_webpage(self, url_or_request, *args, **kwargs): - request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request) - else sanitized_Request(url_or_request)) - # Accept-Language must be set explicitly to accept any language to avoid issues - # similar to https://github.com/ytdl-org/youtube-dl/issues/6797. - # Along with IP address Crunchyroll uses Accept-Language to guess whether georestriction - # should be imposed or not (from what I can see it just takes the first language - # ignoring the priority and requires it to correspond the IP). By the way this causes - # Crunchyroll to not work in georestriction cases in some browsers that don't place - # the locale lang first in header. However allowing any language seems to workaround the issue. 
- request.add_header('Accept-Language', '*') - return super(CrunchyrollBaseIE, self)._download_webpage(request, *args, **kwargs) - - def _decrypt_subtitles(self, data, iv, id): - data = bytes_to_intlist(compat_b64decode(data)) - iv = bytes_to_intlist(compat_b64decode(iv)) - id = int(id) - - def obfuscate_key_aux(count, modulo, start): - output = list(start) - for _ in range(count): - output.append(output[-1] + output[-2]) - # cut off start values - output = output[2:] - output = list(map(lambda x: x % modulo + 33, output)) - return output - - def obfuscate_key(key): - num1 = int(floor(pow(2, 25) * sqrt(6.9))) - num2 = (num1 ^ key) << 5 - num3 = key ^ num1 - num4 = num3 ^ (num3 >> 3) ^ num2 - prefix = intlist_to_bytes(obfuscate_key_aux(20, 97, (1, 2))) - shaHash = bytes_to_intlist(sha1(prefix + str(num4).encode('ascii')).digest()) - # Extend 160 Bit hash to 256 Bit - return shaHash + [0] * 12 - - key = obfuscate_key(id) - - decrypted_data = intlist_to_bytes(aes_cbc_decrypt(data, key, iv)) - return zlib.decompress(decrypted_data) - - def _convert_subtitles_to_srt(self, sub_root): - output = '' - - for i, event in enumerate(sub_root.findall('./events/event'), 1): - start = event.attrib['start'].replace('.', ',') - end = event.attrib['end'].replace('.', ',') - text = event.attrib['text'].replace('\\N', '\n') - output += '%d\n%s --> %s\n%s\n\n' % (i, start, end, text) - return output - - def _convert_subtitles_to_ass(self, sub_root): - output = '' - - def ass_bool(strvalue): - assvalue = '0' - if strvalue == '1': - assvalue = '-1' - return assvalue - - output = '[Script Info]\n' - output += 'Title: %s\n' % sub_root.attrib['title'] - output += 'ScriptType: v4.00+\n' - output += 'WrapStyle: %s\n' % sub_root.attrib['wrap_style'] - output += 'PlayResX: %s\n' % sub_root.attrib['play_res_x'] - output += 'PlayResY: %s\n' % sub_root.attrib['play_res_y'] - output += """ -[V4+ Styles] -Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding -""" - for style in sub_root.findall('./styles/style'): - output += 'Style: ' + style.attrib['name'] - output += ',' + style.attrib['font_name'] - output += ',' + style.attrib['font_size'] - output += ',' + style.attrib['primary_colour'] - output += ',' + style.attrib['secondary_colour'] - output += ',' + style.attrib['outline_colour'] - output += ',' + style.attrib['back_colour'] - output += ',' + ass_bool(style.attrib['bold']) - output += ',' + ass_bool(style.attrib['italic']) - output += ',' + ass_bool(style.attrib['underline']) - output += ',' + ass_bool(style.attrib['strikeout']) - output += ',' + style.attrib['scale_x'] - output += ',' + style.attrib['scale_y'] - output += ',' + style.attrib['spacing'] - output += ',' + style.attrib['angle'] - output += ',' + style.attrib['border_style'] - output += ',' + style.attrib['outline'] - output += ',' + style.attrib['shadow'] - output += ',' + style.attrib['alignment'] - output += ',' + style.attrib['margin_l'] - output += ',' + style.attrib['margin_r'] - output += ',' + style.attrib['margin_v'] - output += ',' + style.attrib['encoding'] - output += '\n' - - output += """ -[Events] -Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text -""" - for event in sub_root.findall('./events/event'): - output += 'Dialogue: 0' - output += ',' + event.attrib['start'] - output += ',' + event.attrib['end'] - output += ',' + 
event.attrib['style'] - output += ',' + event.attrib['name'] - output += ',' + event.attrib['margin_l'] - output += ',' + event.attrib['margin_r'] - output += ',' + event.attrib['margin_v'] - output += ',' + event.attrib['effect'] - output += ',' + event.attrib['text'] - output += '\n' - - return output - - def _extract_subtitles(self, subtitle): - sub_root = compat_etree_fromstring(subtitle) - return [{ - 'ext': 'srt', - 'data': self._convert_subtitles_to_srt(sub_root), - }, { - 'ext': 'ass', - 'data': self._convert_subtitles_to_ass(sub_root), - }] - - def _get_subtitles(self, video_id, webpage): - subtitles = {} - for sub_id, sub_name in re.findall(r'\bssid=([0-9]+)"[^>]+?\btitle="([^"]+)', webpage): - sub_doc = self._call_rpc_api( - 'Subtitle_GetXml', video_id, - 'Downloading subtitles for ' + sub_name, data={ - 'subtitle_script_id': sub_id, - }) - if not isinstance(sub_doc, compat_etree_Element): - continue - sid = sub_doc.get('id') - iv = xpath_text(sub_doc, 'iv', 'subtitle iv') - data = xpath_text(sub_doc, 'data', 'subtitle data') - if not sid or not iv or not data: - continue - subtitle = self._decrypt_subtitles(data, iv, sid).decode('utf-8') - lang_code = self._search_regex(r'lang_code=["\']([^"\']+)', subtitle, 'subtitle_lang_code', fatal=False) - if not lang_code: - continue - subtitles[lang_code] = self._extract_subtitles(subtitle) - return subtitles - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - if mobj.group('prefix') == 'm': - mobile_webpage = self._download_webpage(url, video_id, 'Downloading mobile webpage') - webpage_url = self._search_regex(r'<link rel="canonical" href="([^"]+)" />', mobile_webpage, 'webpage_url') - else: - webpage_url = 'http://www.' + mobj.group('url') - - webpage = self._download_webpage( - self._add_skip_wall(webpage_url), video_id, - headers=self.geo_verification_headers()) - if re.search(r'<div id="preload-data">', webpage): - return self._redirect_to_beta(webpage, CrunchyrollBetaIE.ie_key(), video_id) - note_m = self._html_search_regex( - r'<div class="showmedia-trailer-notice">(.+?)</div>', - webpage, 'trailer-notice', default='') - if note_m: - raise ExtractorError(note_m, expected=True) - - mobj = re.search(r'Page\.messaging_box_controller\.addItems\(\[(?P<msg>{.+?})\]\)', webpage) - if mobj: - msg = json.loads(mobj.group('msg')) - if msg.get('type') == 'error': - raise ExtractorError('crunchyroll returned error: %s' % msg['message_body'], expected=True) - - if 'To view this, please log in to verify you are 18 or older.' in webpage: - self.raise_login_required() - - media = self._parse_json(self._search_regex( - r'vilos\.config\.media\s*=\s*({.+?});', - webpage, 'vilos media', default='{}'), video_id) - media_metadata = media.get('metadata') or {} - - language = self._search_regex( - r'(?:vilos\.config\.player\.language|LOCALE)\s*=\s*(["\'])(?P<lang>(?:(?!\1).)+)\1', - webpage, 'language', default=None, group='lang') - - video_title = self._html_search_regex( - (r'(?s)<h1[^>]*>((?:(?!<h1).)*?<(?:span[^>]+itemprop=["\']title["\']|meta[^>]+itemprop=["\']position["\'])[^>]*>(?:(?!<h1).)+?)</h1>', - r'<title>(.+?),\s+-\s+.+? 
Crunchyroll'), - webpage, 'video_title', default=None) - if not video_title: - video_title = re.sub(r'^Watch\s+', '', self._og_search_description(webpage)) - video_title = re.sub(r' {2,}', ' ', video_title) - video_description = (self._parse_json(self._html_search_regex( - r'<script[^>]*>\s*.+?\[media_id=%s\].+?({.+?"description"\s*:.+?})\);' % video_id, - webpage, 'description', default='{}'), video_id) or media_metadata).get('description') - - thumbnails = [] - thumbnail_url = (self._parse_json(self._html_search_regex( - r'<script type="application\/ld\+json">\n\s*(.+?)<\/script>', - webpage, 'thumbnail_url', default='{}'), video_id)).get('image') - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'width': 1920, - 'height': 1080 - }) - - if video_description: - video_description = lowercase_escape(video_description.replace(r'\r\n', '\n')) - video_uploader = self._html_search_regex( - # try looking for both an uploader that's a link and one that's not - [r'<a[^>]+href="/publisher/[^"]+"[^>]*>([^<]+)</a>', r'<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>'], - webpage, 'video_uploader', default=False) - - requested_languages = self._configuration_arg('language') - requested_hardsubs = [('' if val == 'none' else val) for val in self._configuration_arg('hardsub')] - language_preference = qualities((requested_languages or [language or ''])[::-1]) - hardsub_preference = qualities((requested_hardsubs or ['', language or ''])[::-1]) - - formats = [] - for stream in media.get('streams', []): - audio_lang = stream.get('audio_lang') or '' - hardsub_lang = stream.get('hardsub_lang') or '' - if (requested_languages and audio_lang.lower() not in requested_languages - or requested_hardsubs and hardsub_lang.lower() not in requested_hardsubs): - continue - vrv_formats = self._extract_vrv_formats( - stream.get('url'), video_id, stream.get('format'), - audio_lang, hardsub_lang) - for f in vrv_formats: - f['language_preference'] = language_preference(audio_lang) - f['quality'] = hardsub_preference(hardsub_lang) - formats.extend(vrv_formats) - if not formats: - available_fmts = [] - for a, fmt in re.findall(r'(<a[^>]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): - attrs = extract_attributes(a) - href = attrs.get('href') - if href and '/freetrial' in href: - continue - available_fmts.append(fmt) - if not available_fmts: - for p in (r'token=["\']showmedia\.([0-9]{3,4})p"', r'showmedia\.([0-9]{3,4})p'): - available_fmts = re.findall(p, webpage) - if available_fmts: - break - if not available_fmts: - available_fmts = self._FORMAT_IDS.keys() - video_encode_ids = [] - - for fmt in available_fmts: - stream_quality, stream_format = self._FORMAT_IDS[fmt] - video_format = fmt + 'p' - stream_infos = [] - streamdata = self._call_rpc_api( - 'VideoPlayer_GetStandardConfig', video_id, - 'Downloading media info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_quality': stream_quality, - 'current_page': url, - }) - if isinstance(streamdata, compat_etree_Element): - stream_info = streamdata.find('./{default}preload/stream_info') - if stream_info is not None: - stream_infos.append(stream_info) - stream_info = self._call_rpc_api( - 'VideoEncode_GetStreamInfo', video_id, - 'Downloading stream info for %s' % video_format, data={ - 'media_id': video_id, - 'video_format': stream_format, - 'video_encode_quality': stream_quality, - }) - if isinstance(stream_info, compat_etree_Element): - stream_infos.append(stream_info) - for stream_info in 
stream_infos: - video_encode_id = xpath_text(stream_info, './video_encode_id') - if video_encode_id in video_encode_ids: - continue - video_encode_ids.append(video_encode_id) - - video_file = xpath_text(stream_info, './file') - if not video_file: - continue - if video_file.startswith('http'): - formats.extend(self._extract_m3u8_formats( - video_file, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - continue - - video_url = xpath_text(stream_info, './host') - if not video_url: - continue - metadata = stream_info.find('./metadata') - format_info = { - 'format': video_format, - 'height': int_or_none(xpath_text(metadata, './height')), - 'width': int_or_none(xpath_text(metadata, './width')), - } - - if '.fplive.net/' in video_url: - video_url = re.sub(r'^rtmpe?://', 'http://', video_url.strip()) - parsed_video_url = compat_urlparse.urlparse(video_url) - direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( - netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) - if self._is_valid_url(direct_video_url, video_id, video_format): - format_info.update({ - 'format_id': 'http-' + video_format, - 'url': direct_video_url, - }) - formats.append(format_info) - continue - - format_info.update({ - 'format_id': 'rtmp-' + video_format, - 'url': video_url, - 'play_path': video_file, - 'ext': 'flv', - }) - formats.append(format_info) - self._sort_formats(formats) - - metadata = self._call_rpc_api( - 'VideoPlayer_GetMediaMetadata', video_id, - note='Downloading media info', data={ - 'media_id': video_id, - }) - - subtitles = {} - for subtitle in media.get('subtitles', []): - subtitle_url = subtitle.get('url') - if not subtitle_url: - continue - subtitles.setdefault(subtitle.get('language', 'enUS'), []).append({ - 'url': subtitle_url, - 'ext': subtitle.get('format', 'ass'), - }) - if not subtitles: - subtitles = self.extract_subtitles(video_id, webpage) - - # webpage provide more accurate data than series_title from XML - series = self._html_search_regex( - r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d', - webpage, 'series', fatal=False) - - season = episode = episode_number = duration = None - - if isinstance(metadata, compat_etree_Element): - season = xpath_text(metadata, 'series_title') - episode = xpath_text(metadata, 'episode_title') - episode_number = int_or_none(xpath_text(metadata, 'episode_number')) - duration = float_or_none(media_metadata.get('duration'), 1000) - - if not episode: - episode = media_metadata.get('title') - if not episode_number: - episode_number = int_or_none(media_metadata.get('episode_number')) - thumbnail_url = try_get(media, lambda x: x['thumbnail']['url']) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - 'width': 640, - 'height': 360 - }) - - season_number = int_or_none(self._search_regex( - r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)', - webpage, 'season number', default=None)) - - info = self._search_json_ld(webpage, video_id, default={}) - - return merge_dicts({ - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'duration': duration, - 'thumbnails': thumbnails, - 'uploader': video_uploader, - 'series': series, - 'season': season, - 'season_number': season_number, - 'episode': episode, - 'episode_number': episode_number, - 'subtitles': subtitles, - 'formats': formats, - }, info) - - -class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): - IE_NAME = 
'crunchyroll:playlist' - _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?:\w{1,2}/)?(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login|media-\d+))(?P<id>[\w\-]+))/?(?:\?|$)' - - _TESTS = [{ - 'url': 'https://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', - 'info_dict': { - 'id': 'a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi', - 'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi' - }, - 'playlist_count': 13, - }, { - # geo-restricted (US), 18+ maturity wall, non-premium available - 'url': 'http://www.crunchyroll.com/cosplay-complex-ova', - 'info_dict': { - 'id': 'cosplay-complex-ova', - 'title': 'Cosplay Complex OVA' - }, - 'playlist_count': 3, - 'skip': 'Georestricted', - }, { - # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14 - 'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1', - 'only_matching': True, - }, { - 'url': 'http://www.crunchyroll.com/fr/ladies-versus-butlers', - 'only_matching': True, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - - webpage = self._download_webpage( - # https:// gives a 403, but http:// does not - self._add_skip_wall(url).replace('https://', 'http://'), show_id, - headers=self.geo_verification_headers()) - if re.search(r'<div id="preload-data">', webpage): - return self._redirect_to_beta(webpage, CrunchyrollBetaShowIE.ie_key(), show_id) - title = self._html_search_meta('name', webpage, default=None) - - episode_re = r'<li id="showview_videos_media_(\d+)"[^>]+>.*?<a href="([^"]+)"' - season_re = r'<a [^>]+season-dropdown[^>]+>([^<]+)' - paths = re.findall(f'(?s){episode_re}|{season_re}', webpage) - - entries, current_season = [], None - for ep_id, ep, season in paths: - if season: - current_season = season - continue - entries.append(self.url_result( - f'http://www.crunchyroll.com{ep}', CrunchyrollIE.ie_key(), ep_id, season=current_season)) - - return { - '_type': 'playlist', - 'id': show_id, - 'title': title, - 'entries': reversed(entries), - } + def _get_params(self, lang): + if not CrunchyrollBaseIE.params: + if self._get_cookies(f'https://www.crunchyroll.com/{lang}').get('etp_rt'): + grant_type, key = 'etp_rt_cookie', 'accountAuthClientId' + else: + grant_type, key = 'client_id', 'anonClientId' + initial_state, app_config = self._get_embedded_json(self._download_webpage( + f'https://www.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) + api_domain = app_config['cxApiParams']['apiDomain'].replace('beta.crunchyroll.com', 'www.crunchyroll.com') -class CrunchyrollBetaBaseIE(CrunchyrollBaseIE): - params = None - - def _get_params(self, lang): - if not CrunchyrollBetaBaseIE.params: - initial_state, app_config = self._get_beta_embedded_json(self._download_webpage( - f'https://beta.crunchyroll.com/{lang}', None, note='Retrieving main page'), None) - api_domain = app_config['cxApiParams']['apiDomain'] - basic_token = str(base64.b64encode(('%s:' % app_config['cxApiParams']['accountAuthClientId']).encode('ascii')), 'ascii') auth_response = self._download_json( - f'{api_domain}/auth/v1/token', None, note='Authenticating with cookie', + f'{api_domain}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', headers={ - 'Authorization': 'Basic ' + basic_token - }, data='grant_type=etp_rt_cookie'.encode('ascii')) + 'Authorization': 'Basic ' + str(base64.b64encode(('%s:' % app_config['cxApiParams'][key]).encode('ascii')), 'ascii') + }, 
data=f'grant_type={grant_type}'.encode('ascii')) policy_response = self._download_json( f'{api_domain}/index/v2', None, note='Retrieving signed policy', headers={ 'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token'] }) - bucket = policy_response['cms']['bucket'] + cms = policy_response.get('cms_web') + bucket = cms['bucket'] params = { - 'Policy': policy_response['cms']['policy'], - 'Signature': policy_response['cms']['signature'], - 'Key-Pair-Id': policy_response['cms']['key_pair_id'] + 'Policy': cms['policy'], + 'Signature': cms['signature'], + 'Key-Pair-Id': cms['key_pair_id'] } locale = traverse_obj(initial_state, ('localization', 'locale')) if locale: params['locale'] = locale - CrunchyrollBetaBaseIE.params = (api_domain, bucket, params) - return CrunchyrollBetaBaseIE.params - - def _redirect_from_beta(self, url, lang, internal_id, display_id, is_episode, iekey): - initial_state, app_config = self._get_beta_embedded_json(self._download_webpage(url, display_id), display_id) - content_data = initial_state['content']['byId'][internal_id] - if is_episode: - video_id = content_data['external_id'].split('.')[1] - series_id = content_data['episode_metadata']['series_slug_title'] - else: - series_id = content_data['slug_title'] - series_id = re.sub(r'-{2,}', '-', series_id) - url = f'https://www.crunchyroll.com/{lang}{series_id}' - if is_episode: - url = url + f'/{display_id}-{video_id}' - self.to_screen(f'{display_id}: Not logged in. Redirecting to non-beta site - {url}') - return self.url_result(url, iekey, display_id) + CrunchyrollBaseIE.params = (api_domain, bucket, params) + return CrunchyrollBaseIE.params -class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): - IE_NAME = 'crunchyroll:beta' - _VALID_URL = r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)watch/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' +class CrunchyrollBetaIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll' + _VALID_URL = r'''(?x) + https?://(?:beta|www)\.crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) 
+ watch/(?P<id>\w+) + (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' _TESTS = [{ - 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', + 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y/to-the-future', 'info_dict': { - 'id': '696363', + 'id': 'GY2P1Q98Y', 'ext': 'mp4', - 'timestamp': 1459610100, + 'duration': 1380.241, + 'timestamp': 1459632600, 'description': 'md5:a022fbec4fbb023d43631032c91ed64b', - 'uploader': 'Toei Animation', 'title': 'World Trigger Episode 73 – To the Future', 'upload_date': '20160402', - 'episode_number': 73, 'series': 'World Trigger', - 'average_rating': 4.9, - 'episode': 'To the Future', + 'series_id': 'GR757DMKY', 'season': 'World Trigger', - 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/c870dedca1a83137c2d3d144984155ed1459527119_main.jpg', + 'season_id': 'GR9P39NJ6', 'season_number': 1, + 'episode': 'To the Future', + 'episode_number': 73, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', }, - 'params': {'skip_download': 'm3u8'}, - 'expected_warnings': ['Unable to download XML'] + 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, }, { - 'url': 'https://beta.crunchyroll.com/watch/GYK53DMPR/wicked-lord-shingan-reborn', + 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', 'info_dict': { - 'id': '648781', + 'id': 'GYE5WKQGR', 'ext': 'mp4', - 'episode_number': 1, - 'timestamp': 1389173400, - 'series': 'Love, Chunibyo & Other Delusions - Heart Throb -', - 'description': 'md5:5579d1a0355cc618558ba23d27067a62', - 'uploader': 'TBS', - 'episode': 'Wicked Lord Shingan... Reborn', - 'average_rating': 4.9, - 'season': 'Love, Chunibyo & Other Delusions - Heart Throb -', - 'thumbnail': 'https://img1.ak.crunchyroll.com/i/spire3-tmb/2ba0384e225a5370d5f0ee9496d91ea51389046521_main.jpg', - 'title': 'Love, Chunibyo & Other Delusions - Heart Throb - Episode 1 – Wicked Lord Shingan... 
Reborn', - 'season_number': 2, - 'upload_date': '20140108', + 'duration': 366.459, + 'timestamp': 1476788400, + 'description': 'md5:74b67283ffddd75f6e224ca7dc031e76', + 'title': 'SHELTER Episode – Porter Robinson presents Shelter the Animation', + 'upload_date': '20161018', + 'series': 'SHELTER', + 'series_id': 'GYGG09WWY', + 'season': 'SHELTER', + 'season_id': 'GR09MGK4R', + 'season_number': 1, + 'episode': 'Porter Robinson presents Shelter the Animation', + 'episode_number': 0, + 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg$', }, - 'params': {'skip_download': 'm3u8'}, - 'expected_warnings': ['Unable to download XML'] + 'params': {'skip_download': True}, + 'skip': 'Video is Premium only', + }, { + 'url': 'https://www.crunchyroll.com/watch/GY2P1Q98Y', + 'only_matching': True, }, { - 'url': 'https://beta.crunchyroll.com/watch/GY2P1Q98Y/', + 'url': 'https://beta.crunchyroll.com/pt-br/watch/G8WUN8VKP/the-ruler-of-conspiracy', 'only_matching': True, }] def _real_extract(self, url): lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - - if not self._get_cookies(url).get('etp_rt'): - return self._redirect_from_beta(url, lang, internal_id, display_id, True, CrunchyrollIE.ie_key()) - api_domain, bucket, params = self._get_params(lang) episode_response = self._download_json( f'{api_domain}/cms/v2{bucket}/episodes/{internal_id}', display_id, - note='Retrieving episode metadata', - query=params) + note='Retrieving episode metadata', query=params) if episode_response.get('is_premium_only') and not episode_response.get('playback'): raise ExtractorError('This video is for premium members only.', expected=True) - stream_response = self._download_json( - episode_response['playback'], display_id, - note='Retrieving stream info') - thumbnails = [] - for thumbnails_data in traverse_obj(episode_response, ('images', 'thumbnail')): - for thumbnail_data in thumbnails_data: - thumbnails.append({ - 'url': thumbnail_data.get('source'), - 'width': thumbnail_data.get('width'), - 'height': thumbnail_data.get('height'), - }) - subtitles = {} - for lang, subtitle_data in stream_response.get('subtitles').items(): - subtitles[lang] = [{ - 'url': subtitle_data.get('url'), - 'ext': subtitle_data.get('format') - }] + stream_response = self._download_json( + f'{api_domain}{episode_response["__links__"]["streams"]["href"]}', display_id, + note='Retrieving stream info', query=params) + get_streams = lambda name: (traverse_obj(stream_response, name) or {}).items() requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] hardsub_preference = qualities(requested_hardsubs[::-1]) requested_formats = self._configuration_arg('format') or ['adaptive_hls'] - formats = [] - for stream_type, streams in stream_response.get('streams', {}).items(): + available_formats = {} + for stream_type, streams in get_streams('streams'): if stream_type not in requested_formats: continue for stream in streams.values(): - hardsub_lang = stream.get('hardsub_locale') or '' - if hardsub_lang.lower() not in requested_hardsubs: - continue - format_id = join_nonempty( - stream_type, - format_field(stream, 'hardsub_locale', 'hardsub-%s')) if not stream.get('url'): continue - if stream_type.split('_')[-1] == 'hls': + hardsub_lang = stream.get('hardsub_locale') or '' + format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) + available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) 
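For the hardsub_preference scoring above, qualities() (from the project's utils) maps each entry of a list to its index, so with the reversed request list the first-requested hardsub language gets the highest score and unknown languages score -1. A minimal sketch of that scoring, reimplemented here for illustration on the assumption that utils.qualities keeps its index-based behaviour:

def qualities(quality_ids):
    # Later entries in quality_ids score higher; absent values score lowest.
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q

requested_hardsubs = ['en-us', '']  # e.g. --extractor-args "crunchyrollbeta:hardsub=en-US,none"
hardsub_preference = qualities(requested_hardsubs[::-1])
assert hardsub_preference('en-us') > hardsub_preference('') > hardsub_preference('de-de')
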
+ + if '' in available_formats and 'all' not in requested_hardsubs: + full_format_langs = set(requested_hardsubs) + self.to_screen( + 'To get all formats of a hardsub language, use ' + '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". ' + 'See https://github.com/hypervideo/hypervideo#crunchyrollbeta for more info', + only_once=True) + else: + full_format_langs = set(map(str.lower, available_formats)) + + formats = [] + for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): + if stream_type.endswith('hls'): + if hardsub_lang.lower() in full_format_langs: adaptive_formats = self._extract_m3u8_formats( - stream['url'], display_id, 'mp4', m3u8_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - elif stream_type.split('_')[-1] == 'dash': - adaptive_formats = self._extract_mpd_formats( - stream['url'], display_id, mpd_id=format_id, - note='Downloading %s information' % format_id, - fatal=False) - for f in adaptive_formats: - if f.get('acodec') != 'none': - f['language'] = stream_response.get('audio_locale') - f['quality'] = hardsub_preference(hardsub_lang.lower()) - formats.extend(adaptive_formats) - self._sort_formats(formats) + stream_url, display_id, 'mp4', m3u8_id=format_id, + fatal=False, note=f'Downloading {format_id} HLS manifest') + else: + adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) + elif stream_type.endswith('dash'): + adaptive_formats = self._extract_mpd_formats( + stream_url, display_id, mpd_id=format_id, + fatal=False, note=f'Downloading {format_id} MPD manifest') + else: + self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) + continue + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = stream_response.get('audio_locale') + f['quality'] = hardsub_preference(hardsub_lang.lower()) + formats.extend(adaptive_formats) return { 'id': internal_id, - 'title': '%s Episode %s – %s' % (episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), - 'description': episode_response.get('description').replace(r'\r\n', '\n'), + 'title': '%s Episode %s – %s' % ( + episode_response.get('season_title'), episode_response.get('episode'), episode_response.get('title')), + 'description': try_get(episode_response, lambda x: x['description'].replace(r'\r\n', '\n')), 'duration': float_or_none(episode_response.get('duration_ms'), 1000), - 'thumbnails': thumbnails, + 'timestamp': parse_iso8601(episode_response.get('upload_date')), 'series': episode_response.get('series_title'), 'series_id': episode_response.get('series_id'), 'season': episode_response.get('season_title'), @@ -887,39 +223,42 @@ class CrunchyrollBetaIE(CrunchyrollBetaBaseIE): 'season_number': episode_response.get('season_number'), 'episode': episode_response.get('title'), 'episode_number': episode_response.get('sequence_number'), - 'subtitles': subtitles, - 'formats': formats + 'formats': formats, + 'thumbnails': [{ + 'url': thumb.get('source'), + 'width': thumb.get('width'), + 'height': thumb.get('height'), + } for thumb in traverse_obj(episode_response, ('images', 'thumbnail', ..., ...)) or []], + 'subtitles': { + lang: [{ + 'url': subtitle_data.get('url'), + 'ext': subtitle_data.get('format') + }] for lang, subtitle_data in get_streams('subtitles') + }, } -class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): - IE_NAME = 'crunchyroll:playlist:beta' - _VALID_URL = 
r'https?://beta\.crunchyroll\.com/(?P<lang>(?:\w{1,2}/)?)series/(?P<id>\w+)/(?P<display_id>[\w\-]*)/?(?:\?|$)' +class CrunchyrollBetaShowIE(CrunchyrollBaseIE): + IE_NAME = 'crunchyroll:playlist' + _VALID_URL = r'''(?x) + https?://(?:beta|www)\.crunchyroll\.com/ + (?P<lang>(?:\w{2}(?:-\w{2})?/)?) + series/(?P<id>\w+) + (?:/(?P<display_id>[\w-]+))?/?(?:[?#]|$)''' _TESTS = [{ - 'url': 'https://beta.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', + 'url': 'https://www.crunchyroll.com/series/GY19NQ2QR/Girl-Friend-BETA', 'info_dict': { - 'id': 'girl-friend-beta', + 'id': 'GY19NQ2QR', 'title': 'Girl Friend BETA', }, 'playlist_mincount': 10, }, { - 'url': 'https://beta.crunchyroll.com/series/GYJQV73V6/love-chunibyo--other-delusions---heart-throb--', - 'info_dict': { - 'id': 'love-chunibyo-other-delusions-heart-throb-', - 'title': 'Love, Chunibyo & Other Delusions - Heart Throb -', - }, - 'playlist_mincount': 10, - }, { - 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR/Girl-Friend-BETA', + 'url': 'https://beta.crunchyroll.com/it/series/GY19NQ2QR', 'only_matching': True, }] def _real_extract(self, url): lang, internal_id, display_id = self._match_valid_url(url).group('lang', 'id', 'display_id') - - if not self._get_cookies(url).get('etp_rt'): - return self._redirect_from_beta(url, lang, internal_id, display_id, False, CrunchyrollShowPlaylistIE.ie_key()) - api_domain, bucket, params = self._get_params(lang) series_response = self._download_json( @@ -940,7 +279,7 @@ class CrunchyrollBetaShowIE(CrunchyrollBetaBaseIE): episode_display_id = episode['slug_title'] yield { '_type': 'url', - 'url': f'https://beta.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', + 'url': f'https://www.crunchyroll.com/{lang}watch/{episode_id}/{episode_display_id}', 'ie_key': CrunchyrollBetaIE.ie_key(), 'id': episode_id, 'title': '%s Episode %s – %s' % (episode.get('season_title'), episode.get('episode'), episode.get('title')), diff --git a/hypervideo_dl/extractor/cspan.py b/hypervideo_dl/extractor/cspan.py index f51159b..0075680 100644 --- a/hypervideo_dl/extractor/cspan.py +++ b/hypervideo_dl/extractor/cspan.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -165,7 +163,7 @@ class CSpanIE(InfoExtractor): video_id = m.group('id') video_type = 'program' if m.group('type') == 'prog' else 'clip' else: - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) + senate_isvp_url = SenateISVPIE._extract_url(webpage) if senate_isvp_url: title = self._og_search_title(webpage) surl = smuggle_url(senate_isvp_url, {'force_title': title}) @@ -220,7 +218,6 @@ class CSpanIE(InfoExtractor): path, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') if determine_ext(path) == 'm3u8' else [{'url': path, }] add_referer(formats) - self._sort_formats(formats) entries.append({ 'id': '%s_%d' % (video_id, partnum + 1), 'title': ( @@ -277,8 +274,7 @@ class CSpanCongressIE(InfoExtractor): self._search_regex(r'jwsetup\s*=\s*({(?:.|\n)[^;]+});', webpage, 'player config'), video_id, transform_source=js_to_json) - title = (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'video title')) + title = self._generic_title('', webpage) description = (self._og_search_description(webpage, default=None) or self._html_search_meta('description', webpage, 'description', default=None)) diff --git a/hypervideo_dl/extractor/ctsnews.py b/hypervideo_dl/extractor/ctsnews.py index 679f1d9..cec178f 100644 --- a/hypervideo_dl/extractor/ctsnews.py 
+++ b/hypervideo_dl/extractor/ctsnews.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_timestamp from .youtube import YoutubeIE diff --git a/hypervideo_dl/extractor/ctv.py b/hypervideo_dl/extractor/ctv.py index 756bcc2..f125c1c 100644 --- a/hypervideo_dl/extractor/ctv.py +++ b/hypervideo_dl/extractor/ctv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/ctvnews.py b/hypervideo_dl/extractor/ctvnews.py index 952f4c7..ad3f0d8 100644 --- a/hypervideo_dl/extractor/ctvnews.py +++ b/hypervideo_dl/extractor/ctvnews.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/cultureunplugged.py b/hypervideo_dl/extractor/cultureunplugged.py index 9002e4c..2fb2280 100644 --- a/hypervideo_dl/extractor/cultureunplugged.py +++ b/hypervideo_dl/extractor/cultureunplugged.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import time from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/curiositystream.py b/hypervideo_dl/extractor/curiositystream.py index b8abcf7..26cf24f 100644 --- a/hypervideo_dl/extractor/curiositystream.py +++ b/hypervideo_dl/extractor/curiositystream.py @@ -1,15 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..utils import ( - int_or_none, - urlencode_postdata, - compat_str, - ExtractorError, -) +from ..compat import compat_str +from ..utils import ExtractorError, int_or_none, urlencode_postdata class CuriosityStreamBaseIE(InfoExtractor): @@ -26,6 +19,11 @@ class CuriosityStreamBaseIE(InfoExtractor): def _call_api(self, path, video_id, query=None): headers = {} + if not self._auth_token: + auth_cookie = self._get_cookies('https://curiositystream.com').get('auth_token') + if auth_cookie: + self.write_debug('Obtained auth_token cookie') + self._auth_token = auth_cookie.value if self._auth_token: headers['X-Auth-Token'] = self._auth_token result = self._download_json( @@ -48,7 +46,7 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): IE_NAME = 'curiositystream' _VALID_URL = r'https?://(?:app\.)?curiositystream\.com/video/(?P<id>\d+)' _TESTS = [{ - 'url': 'https://app.curiositystream.com/video/2', + 'url': 'http://app.curiositystream.com/video/2', 'info_dict': { 'id': '2', 'ext': 'mp4', @@ -119,7 +117,6 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): 'format_id': 'http', }) formats.append(fmt) - self._sort_formats(formats) title = media['title'] diff --git a/hypervideo_dl/extractor/cwtv.py b/hypervideo_dl/extractor/cwtv.py index 7338243..9b83264 100644 --- a/hypervideo_dl/extractor/cwtv.py +++ b/hypervideo_dl/extractor/cwtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -94,4 +91,5 @@ class CWTVIE(InfoExtractor): 'timestamp': parse_iso8601(video_data.get('start_time')), 'age_limit': parse_age_limit(video_data.get('rating')), 'ie_key': 'ThePlatform', + 'thumbnail': video_data.get('large_thumbnail') } diff --git a/hypervideo_dl/extractor/cybrary.py b/hypervideo_dl/extractor/cybrary.py index c278f0f..73f2439 100644 --- a/hypervideo_dl/extractor/cybrary.py +++ b/hypervideo_dl/extractor/cybrary.py @@ -1,12 +1,10 @@ -# coding: utf-8 from .common import InfoExtractor - from ..utils import ( 
ExtractorError, smuggle_url, str_or_none, traverse_obj, - urlencode_postdata + urlencode_postdata, ) diff --git a/hypervideo_dl/extractor/daftsex.py b/hypervideo_dl/extractor/daftsex.py index 6037fd9..551d5e3 100644 --- a/hypervideo_dl/extractor/daftsex.py +++ b/hypervideo_dl/extractor/daftsex.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( @@ -84,7 +81,6 @@ class DaftsexIE(InfoExtractor): 'height': int_or_none(height), 'ext': ext, }) - self._sort_formats(formats) return { 'id': video_id, @@ -120,7 +116,6 @@ class DaftsexIE(InfoExtractor): 'height': int_or_none(height), 'ext': ext, }) - self._sort_formats(formats) thumbnails = [] for k, v in item.items(): diff --git a/hypervideo_dl/extractor/dailymail.py b/hypervideo_dl/extractor/dailymail.py index 67b88fd..43401e1 100644 --- a/hypervideo_dl/extractor/dailymail.py +++ b/hypervideo_dl/extractor/dailymail.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -15,6 +10,7 @@ from ..utils import ( class DailyMailIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)'] _TESTS = [{ 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', 'md5': 'f6129624562251f628296c3a9ffde124', @@ -29,12 +25,6 @@ class DailyMailIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -73,7 +63,6 @@ class DailyMailIE(InfoExtractor): 'protocol': protocol, 'ext': 'mp4' if is_hls else None, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dailymotion.py b/hypervideo_dl/extractor/dailymotion.py index 9cb5618..2a44718 100644 --- a/hypervideo_dl/extractor/dailymotion.py +++ b/hypervideo_dl/extractor/dailymotion.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import json import re @@ -8,13 +5,15 @@ import re from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, + OnDemandPagedList, age_restricted, clean_html, - ExtractorError, int_or_none, - OnDemandPagedList, + traverse_obj, try_get, unescapeHTML, + unsmuggle_url, urlencode_postdata, ) @@ -100,6 +99,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor): [/=](?P<id>[^/?_&]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))? 
''' IE_NAME = 'dailymotion' + _EMBED_REGEX = [r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1'] _TESTS = [{ 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', 'md5': '074b95bdee76b9e3654137aee9c79dfe', @@ -209,20 +209,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor): } xid''' - @staticmethod - def _extract_urls(webpage): - urls = [] - # Look for embedded Dailymotion player + @classmethod + def _extract_embed_urls(cls, url, webpage): # https://developer.dailymotion.com/player#player-parameters - for mobj in re.finditer( - r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage): - urls.append(unescapeHTML(mobj.group('url'))) + yield from super()._extract_embed_urls(url, webpage) for mobj in re.finditer( r'(?s)DM\.player\([^,]+,\s*{.*?video[\'"]?\s*:\s*["\']?(?P<id>[0-9a-zA-Z]+).+?}\s*\);', webpage): - urls.append('https://www.dailymotion.com/embed/video/' + mobj.group('id')) - return urls + yield 'https://www.dailymotion.com/embed/video/' + mobj.group('id') def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url) video_id, playlist_id = self._match_valid_url(url).groups() if playlist_id: @@ -255,7 +251,7 @@ metadata = self._download_json( 'https://www.dailymotion.com/player/metadata/video/' + xid, xid, 'Downloading metadata JSON', - query={'app': 'com.dailymotion.neon'}) + query=traverse_obj(smuggled_data, 'query') or {'app': 'com.dailymotion.neon'}) error = metadata.get('error') if error: @@ -297,7 +293,6 @@ f['url'] = f['url'].split('#')[0] if not f.get('fps') and f['format_id'].endswith('@60'): f['fps'] = 60 - self._sort_formats(formats) subtitles = {} subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {} @@ -378,6 +373,15 @@ }] _OBJECT_TYPE = 'collection' + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Look for embedded Dailymotion playlist player (#3822) + for mobj in re.finditer( + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', + webpage): + for p in re.findall(r'list\[\]=/playlist/([^/]+)/', unescapeHTML(mobj.group('url'))): + yield '//dailymotion.com/playlist/%s' % p + class DailymotionUserIE(DailymotionPlaylistBaseIE): IE_NAME = 'dailymotion:user' diff --git a/hypervideo_dl/extractor/dailywire.py b/hypervideo_dl/extractor/dailywire.py new file mode 100644 index 0000000..f177c9d --- /dev/null +++ b/hypervideo_dl/extractor/dailywire.py @@ -0,0 +1,113 @@ +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + join_nonempty, + traverse_obj, + url_or_none, +) + + +class DailyWireBaseIE(InfoExtractor): + _JSON_PATH = { + 'episode': ('props', 'pageProps', 'episodeData', 'episode'), + 'videos': ('props', 'pageProps', 'videoData', 'video'), + 'podcasts': ('props', 'pageProps', 'episode'), + } + + def _get_json(self, url): + sites_type, slug = self._match_valid_url(url).group('sites_type', 'id') + json_data = self._search_nextjs_data(self._download_webpage(url, slug), slug) + return slug, traverse_obj(json_data, self._JSON_PATH[sites_type]) + +
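(Aside on the two helpers `_get_json` leans on: `_search_nextjs_data` pulls the `__NEXT_DATA__` JSON blob that Next.js pages embed in the HTML, and `traverse_obj` walks the parsed dict along one of the `_JSON_PATH` key tuples. A minimal standalone sketch of that flow, using simplified stand-ins rather than the real `hypervideo_dl.utils` implementations:

import json
import re

def search_nextjs_data(webpage):
    # Next.js pages carry their page state in a <script id="__NEXT_DATA__"> tag
    mobj = re.search(
        r'<script[^>]+id="__NEXT_DATA__"[^>]*>\s*({.+?})\s*</script>',
        webpage, re.DOTALL)
    return json.loads(mobj.group(1)) if mobj else {}

def traverse(obj, path):
    # Follow a tuple of keys, returning None on a miss instead of raising
    for key in path:
        if not isinstance(obj, dict):
            return None
        obj = obj.get(key)
    return obj

page = '<script id="__NEXT_DATA__">{"props": {"pageProps": {"episode": {"id": "x"}}}}</script>'
print(traverse(search_nextjs_data(page), ('props', 'pageProps', 'episode', 'id')))  # prints: x

The real `traverse_obj` additionally supports branching, slices and `...` wildcards, as used in `DailyWireIE._real_extract` below; this sketch covers only the plain key-path case.)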
+class DailyWireIE(DailyWireBaseIE): + _VALID_URL = r'https?://(?:www\.)?dailywire\.com/(?P<sites_type>episode|videos)/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.dailywire.com/episode/1-fauci', + 'info_dict': { + 'id': 'ckzsl50xnqpy30850in3v4bu7', + 'ext': 'mp4', + 'display_id': '1-fauci', + 'title': '1. Fauci', + 'description': 'md5:9df630347ef85081b7e97dd30bc22853', + 'thumbnail': 'https://daily-wire-production.imgix.net/episodes/ckzsl50xnqpy30850in3v4bu7/ckzsl50xnqpy30850in3v4bu7-1648237399554.jpg', + 'creator': 'Caroline Roberts', + 'series_id': 'ckzplm0a097fn0826r2vc3j7h', + 'series': 'China: The Enemy Within', + } + }, { + 'url': 'https://www.dailywire.com/episode/ep-124-bill-maher', + 'info_dict': { + 'id': 'cl0ngbaalplc80894sfdo9edf', + 'ext': 'mp3', + 'display_id': 'ep-124-bill-maher', + 'title': 'Ep. 124 - Bill Maher', + 'thumbnail': 'https://daily-wire-production.imgix.net/episodes/cl0ngbaalplc80894sfdo9edf/cl0ngbaalplc80894sfdo9edf-1647065568518.jpg', + 'creator': 'Caroline Roberts', + 'description': 'md5:adb0de584bcfa9c41374999d9e324e98', + 'series_id': 'cjzvep7270hp00786l9hwccob', + 'series': 'The Sunday Special', + } + }, { + 'url': 'https://www.dailywire.com/videos/the-hyperions', + 'only_matching': True, + }] + + def _real_extract(self, url): + slug, episode_info = self._get_json(url) + urls = traverse_obj( + episode_info, (('segments', 'videoUrl'), ..., ('video', 'audio')), expected_type=url_or_none) + + formats, subtitles = [], {} + for media_url in urls: + if determine_ext(media_url) != 'm3u8': + formats.append({'url': media_url}) + continue + format_, subs_ = self._extract_m3u8_formats_and_subtitles(media_url, slug) + formats.extend(format_) + self._merge_subtitles(subs_, target=subtitles) + return { + 'id': episode_info['id'], + 'display_id': slug, + 'title': traverse_obj(episode_info, 'title', 'name'), + 'description': episode_info.get('description'), + 'creator': join_nonempty(('createdBy', 'firstName'), ('createdBy', 'lastName'), from_dict=episode_info, delim=' '), + 'duration': float_or_none(episode_info.get('duration')), + 'is_live': episode_info.get('isLive'), + 'thumbnail': traverse_obj(episode_info, 'thumbnail', 'image', expected_type=url_or_none), + 'formats': formats, + 'subtitles': subtitles, + 'series_id': traverse_obj(episode_info, ('show', 'id')), + 'series': traverse_obj(episode_info, ('show', 'name')), + } + + +class DailyWirePodcastIE(DailyWireBaseIE): + _VALID_URL = r'https?://(?:www\.)?dailywire\.com/(?P<sites_type>podcasts)/(?P<podcaster>[\w-]+/(?P<id>[\w-]+))' + _TESTS = [{ + 'url': 'https://www.dailywire.com/podcasts/morning-wire/get-ready-for-recession-6-15-22', + 'info_dict': { + 'id': 'cl4f01d0w8pbe0a98ydd0cfn1', + 'ext': 'm4a', + 'display_id': 'get-ready-for-recession-6-15-22', + 'title': 'Get Ready for Recession | 6.15.22', + 'description': 'md5:c4afbadda4e1c38a4496f6d62be55634', + 'thumbnail': 'https://daily-wire-production.imgix.net/podcasts/ckx4otgd71jm508699tzb6hf4-1639506575562.jpg', + 'duration': 900.117667, + } + }] + + def _real_extract(self, url): + slug, episode_info = self._get_json(url) + audio_id = traverse_obj(episode_info, 'audioMuxPlaybackId', 'VUsAipTrBVSgzw73SpC2DAJD401TYYwEp') + + return { + 'id': episode_info['id'], + 'url': f'https://stream.media.dailywire.com/{audio_id}/audio.m4a', + 'display_id': slug, + 'title': episode_info.get('title'), + 'duration': float_or_none(episode_info.get('duration')), + 'thumbnail': episode_info.get('thumbnail'), + 'description': episode_info.get('description'), + } diff --git 
a/hypervideo_dl/extractor/damtomo.py b/hypervideo_dl/extractor/damtomo.py index 456cd35..0e08e4f 100644 --- a/hypervideo_dl/extractor/damtomo.py +++ b/hypervideo_dl/extractor/damtomo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -39,7 +36,6 @@ class DamtomoBaseIE(InfoExtractor): if not m3u8_url: raise ExtractorError('Failed to obtain m3u8 URL') formats = self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/daum.py b/hypervideo_dl/extractor/daum.py index 4362e92..3ef5140 100644 --- a/hypervideo_dl/extractor/daum.py +++ b/hypervideo_dl/extractor/daum.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -129,7 +125,7 @@ class DaumClipIE(DaumBaseIE): self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id) -class DaumListIE(InfoExtractor): +class DaumListIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor def _get_entries(self, list_id, list_id_type): name = None entries = [] diff --git a/hypervideo_dl/extractor/daystar.py b/hypervideo_dl/extractor/daystar.py index 4f59d90..ef3520a 100644 --- a/hypervideo_dl/extractor/daystar.py +++ b/hypervideo_dl/extractor/daystar.py @@ -36,7 +36,6 @@ class DaystarClipIE(InfoExtractor): video_id, 'mp4', fatal=False, headers={'Referer': src_iframe}) formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dbtv.py b/hypervideo_dl/extractor/dbtv.py index 8e73176..18be46f 100644 --- a/hypervideo_dl/extractor/dbtv.py +++ b/hypervideo_dl/extractor/dbtv.py @@ -1,13 +1,9 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor class DBTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dagbladet\.no/video/(?:(?:embed|(?P<display_id>[^/]+))/)?(?P<id>[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8})' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1'] _TESTS = [{ 'url': 'https://www.dagbladet.no/video/PynxJnNWChE/', 'md5': 'b8f850ba1860adbda668d367f9b77699', @@ -31,12 +27,6 @@ class DBTVIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+src=(["\'])((?:https?:)?//(?:www\.)?dagbladet\.no/video/embed/(?:[0-9A-Za-z_-]{11}|[a-zA-Z0-9]{8}).*?)\1', - webpage)] - def _real_extract(self, url): display_id, video_id = self._match_valid_url(url).groups() info = { diff --git a/hypervideo_dl/extractor/dctp.py b/hypervideo_dl/extractor/dctp.py index e700f8d..24bb6ac 100644 --- a/hypervideo_dl/extractor/dctp.py +++ b/hypervideo_dl/extractor/dctp.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/deezer.py b/hypervideo_dl/extractor/deezer.py index 7ba02e5..f61f12a 100644 --- a/hypervideo_dl/extractor/deezer.py +++ b/hypervideo_dl/extractor/deezer.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -64,7 +62,6 @@ class DeezerPlaylistIE(DeezerBaseInfoExtractor): 'preference': -100, # Only the first 30 seconds 'ext': 'mp3', }] - self._sort_formats(formats) 
artists = ', '.join( orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS'))) entries.append({ @@ -117,7 +114,6 @@ class DeezerAlbumIE(DeezerBaseInfoExtractor): 'preference': -100, # Only the first 30 seconds 'ext': 'mp3', }] - self._sort_formats(formats) artists = ', '.join( orderedSet(a.get('ART_NAME') for a in s.get('ARTISTS'))) entries.append({ diff --git a/hypervideo_dl/extractor/defense.py b/hypervideo_dl/extractor/defense.py index 9fe144e..7d73ea8 100644 --- a/hypervideo_dl/extractor/defense.py +++ b/hypervideo_dl/extractor/defense.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/democracynow.py b/hypervideo_dl/extractor/democracynow.py index 5c9c0ec..1624d08 100644 --- a/hypervideo_dl/extractor/democracynow.py +++ b/hypervideo_dl/extractor/democracynow.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import os.path @@ -62,8 +59,6 @@ class DemocracynowIE(InfoExtractor): 'vcodec': 'none' if key == 'audio' else None, }) - self._sort_formats(formats) - default_lang = 'en' subtitles = {} diff --git a/hypervideo_dl/extractor/detik.py b/hypervideo_dl/extractor/detik.py new file mode 100644 index 0000000..f148054 --- /dev/null +++ b/hypervideo_dl/extractor/detik.py @@ -0,0 +1,159 @@ +from .common import InfoExtractor +from ..utils import int_or_none, merge_dicts, try_call, url_basename + + +class DetikEmbedIE(InfoExtractor): + _VALID_URL = False + _WEBPAGE_TESTS = [{ + # cnn embed + 'url': 'https://www.cnnindonesia.com/embed/video/846189', + 'info_dict': { + 'id': '846189', + 'ext': 'mp4', + 'description': 'md5:ece7b003b3ee7d81c6a5cfede7d5397d', + 'thumbnail': r're:https?://akcdn\.detik\.net\.id/visual/2022/09/11/thumbnail-video-1_169.jpeg', + 'title': 'Video CNN Indonesia - VIDEO: Momen Charles Disambut Meriah usai Dilantik jadi Raja Inggris', + 'age_limit': 0, + 'tags': ['raja charles', ' raja charles iii', ' ratu elizabeth', ' ratu elizabeth meninggal dunia', ' raja inggris', ' inggris'], + 'release_timestamp': 1662869995, + 'release_date': '20220911', + 'uploader': 'REUTERS' + } + }, { + # 20.detik + 'url': 'https://20.detik.com/otobuzz/20220704-220704093/mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport', + 'info_dict': { + 'display_id': 'mulai-rp-10-jutaan-ini-skema-kredit-mitsubishi-pajero-sport', + 'id': '220704093', + 'ext': 'mp4', + 'description': 'md5:9b2257341b6f375cdcf90106146d5ffb', + 'thumbnail': r're:https?://cdnv\.detik\.com/videoservice/AdminTV/2022/07/04/5d6187e402ec4a91877755a5886ff5b6-20220704161859-0s.jpg', + 'title': 'Mulai Rp 10 Jutaan! 
Ini Skema Kredit Mitsubishi Pajero Sport', + 'timestamp': 1656951521, + 'upload_date': '20220704', + 'duration': 83.0, + 'tags': ['cicilan mobil', 'mitsubishi pajero sport', 'mitsubishi', 'pajero sport'], + 'release_timestamp': 1656926321, + 'release_date': '20220704', + 'age_limit': 0, + 'uploader': 'Ridwan Arifin ' # TODO: strip trailing whitespace at uploader + } + }, { + # pasangmata.detik + 'url': 'https://pasangmata.detik.com/contribution/366649', + 'info_dict': { + 'id': '366649', + 'ext': 'mp4', + 'title': 'Saling Dorong Aparat dan Pendemo di Aksi Tolak Kenaikan BBM', + 'description': 'md5:7a6580876c8381c454679e028620bea7', + 'age_limit': 0, + 'tags': 'count:17', + 'thumbnail': 'https://akcdn.detik.net.id/community/data/media/thumbs-pasangmata/2022/09/08/366649-16626229351533009620.mp4-03.jpg', + } + }, { + # insertlive embed + 'url': 'https://www.insertlive.com/embed/video/290482', + 'info_dict': { + 'id': '290482', + 'ext': 'mp4', + 'release_timestamp': 1663063704, + 'thumbnail': 'https://akcdn.detik.net.id/visual/2022/09/13/leonardo-dicaprio_169.png?w=600&q=90', + 'age_limit': 0, + 'description': 'Aktor Leonardo DiCaprio memang baru saja putus dari kekasihnya yang bernama Camilla Morrone.', + 'release_date': '20220913', + 'title': 'Diincar Leonardo DiCaprio, Gigi Hadid Ngaku Tertarik Tapi Belum Cinta', + 'tags': ['leonardo dicaprio', ' gigi hadid', ' hollywood'], + 'uploader': '!nsertlive', + } + }, { + # beautynesia embed + 'url': 'https://www.beautynesia.id/embed/video/261636', + 'info_dict': { + 'id': '261636', + 'ext': 'mp4', + 'age_limit': 0, + 'release_timestamp': 1662375600, + 'description': 'Menurut ramalan astrologi, tiga zodiak ini bakal hoki sepanjang September 2022.', + 'title': '3 Zodiak Paling Beruntung Selama September 2022', + 'release_date': '20220905', + 'tags': ['zodiac update', ' zodiak', ' ramalan bintang', ' zodiak beruntung 2022', ' zodiak hoki september 2022', ' zodiak beruntung september 2022'], + 'thumbnail': 'https://akcdn.detik.net.id/visual/2022/09/05/3-zodiak-paling-beruntung-selama-september-2022_169.jpeg?w=600&q=90', + 'uploader': 'amh', + } + }, { + # cnbcindonesia embed + 'url': 'https://www.cnbcindonesia.com/embed/video/371839', + 'info_dict': { + 'id': '371839', + 'ext': 'mp4', + 'title': 'Puluhan Pejabat Rusia Tuntut Putin Mundur', + 'tags': ['putin'], + 'age_limit': 0, + 'thumbnail': 'https://awsimages.detik.net.id/visual/2022/09/13/cnbc-indonesia-tv-3_169.png?w=600&q=80', + 'description': 'md5:8b9111e37555fcd95fe549a9b4ae6fdc', + } + }, { + # detik shortlink (we can get it from https://dtk.id/?<url>) + 'url': 'https://dtk.id/NkISKr', + 'info_dict': { + 'id': '220914049', + 'ext': 'mp4', + 'release_timestamp': 1663114488, + 'uploader': 'Tim 20Detik', + 'title': 'Pakar Bicara soal Tim Khusus Jokowi dan Mereka yang Pro ke Bjorka', + 'age_limit': 0, + 'thumbnail': 'https://cdnv.detik.com/videoservice/AdminTV/2022/09/14/f15cae71d7b640c58e75b254ecbb1ce1-20220914071613-0s.jpg?w=400&q=80', + 'display_id': 'pakar-bicara-soal-tim-khusus-jokowi-dan-mereka-yang-pro-ke-bjorka', + 'upload_date': '20220914', + 'release_date': '20220914', + 'description': 'md5:5eb03225f7ee40207dd3a1e18a73f1ff', + 'timestamp': 1663139688, + 'duration': 213.0, + 'tags': ['hacker bjorka', 'bjorka', 'hacker bjorka bocorkan data rahasia presiden jokowi', 'jokowi'], + } + }] + + def _extract_from_webpage(self, url, webpage): + player_type, video_data = self._search_regex(
r'<script\s*[^>]+src="https?://(aws)?cdn\.detik\.net\.id/(?P<type>flowplayer|detikVideo)[^>]+>\s*(?P<video_data>{[^}]+})', + webpage, 'playerjs', group=('type', 'video_data'), default=(None, '')) + if not player_type: + return + + display_id, extra_info_dict = url_basename(url), {} + + if player_type == 'flowplayer': + video_json_data = self._parse_json(video_data.replace('\'', '"'), display_id) + video_url = video_json_data['videoUrl'] + + extra_info_dict = { + 'id': self._search_regex(r'identifier\s*:\s*\'([^\']+)', webpage, 'identifier'), + 'thumbnail': video_json_data.get('imageUrl'), + } + + elif player_type == 'detikVideo': + video_url = self._search_regex( + r'videoUrl\s*:\s*[\'"]?([^"\']+)', video_data, 'videoUrl') + extra_info_dict = { + 'id': self._html_search_meta(['video_id', 'dtk:video_id'], webpage), + 'thumbnail': self._search_regex(r'imageUrl\s*:\s*[\'"]?([^"\']+)', video_data, 'thumbnail url'), + 'duration': int_or_none(self._html_search_meta('duration', webpage, fatal=False, default=None)), + 'release_timestamp': int_or_none(self._html_search_meta('dtk:publishdateunix', webpage, fatal=False, default=None), 1000), + 'timestamp': int_or_none(self._html_search_meta('dtk:createdateunix', webpage, fatal=False, default=None), 1000), + 'uploader': self._search_regex( + r'([^-]+)', self._html_search_meta('dtk:author', webpage, default='').strip(), 'uploader', + default=None) + } + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, display_id) + + json_ld_data = self._search_json_ld(webpage, display_id, default={}) + yield merge_dicts(json_ld_data, extra_info_dict, { + 'display_id': display_id, + 'title': self._html_search_meta(['og:title', 'originalTitle'], webpage) or self._html_extract_title(webpage), + 'description': self._html_search_meta(['og:description', 'twitter:description', 'description'], webpage), + 'formats': formats, + 'subtitles': subtitles, + 'tags': try_call(lambda: self._html_search_meta( + ['keywords', 'keyword', 'dtk:keywords'], webpage).split(',')), + }) diff --git a/hypervideo_dl/extractor/deuxm.py b/hypervideo_dl/extractor/deuxm.py new file mode 100644 index 0000000..74a6da6 --- /dev/null +++ b/hypervideo_dl/extractor/deuxm.py @@ -0,0 +1,76 @@ +from .common import InfoExtractor +from ..utils import url_or_none + + +class DeuxMIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?2m\.ma/[^/]+/replay/single/(?P<id>([\w.]{1,24})+)' + + _TESTS = [{ + 'url': 'https://2m.ma/fr/replay/single/6351d439b15e1a613b3debe8', + 'md5': '5f761f04c9d686e553b685134dca5d32', + 'info_dict': { + 'id': '6351d439b15e1a613b3debe8', + 'ext': 'mp4', + 'title': 'Grand Angle : Jeudi 20 Octobre 2022', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }, { + 'url': 'https://2m.ma/fr/replay/single/635c0aeab4eec832622356da', + 'md5': 'ad6af2f5e4d5b2ad2194a84b6e890b4c', + 'info_dict': { + 'id': '635c0aeab4eec832622356da', + 'ext': 'mp4', + 'title': 'Journal Amazigh : Vendredi 28 Octobre 2022', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + video = self._download_json( + f'https://2m.ma/api/watchDetail/{video_id}', video_id)['response']['News'] + return { + 'id': video_id, + 'title': video.get('titre'), + 'url': video['url'], + 'description': video.get('description'), + 'thumbnail': url_or_none(video.get('image')), + } + + +class DeuxMNewsIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?2m\.ma/(?P<lang>\w+)/news/(?P<id>[^/#?]+)' + + _TESTS = [{ + 'url': 'https://2m.ma/fr/news/Kan-Ya-Mkan-d%C3%A9poussi%C3%A8re-l-histoire-du-phare-du-Cap-Beddouza-20221028', + 'md5': '43d5e693a53fa0b71e8a5204c7d4542a', + 'info_dict': { + 'id': '635c5d1233b83834e35b282e', + 'ext': 'mp4', + 'title': 'Kan Ya Mkan d\u00e9poussi\u00e8re l\u2019histoire du phare du Cap Beddouza', + 'description': 'md5:99dcf29b82f1d7f2a4acafed1d487527', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }, { + 'url': 'https://2m.ma/fr/news/Interview-Casablanca-hors-des-sentiers-battus-avec-Abderrahim-KASSOU-Replay--20221017', + 'md5': '7aca29f02230945ef635eb8290283c0c', + 'info_dict': { + 'id': '634d9e108b70d40bc51a844b', + 'ext': 'mp4', + 'title': 'Interview: Casablanca hors des sentiers battus avec Abderrahim KASSOU (Replay) ', + 'description': 'md5:3b8e78111de9fcc6ef7f7dd6cff2430c', + 'thumbnail': r're:^https?://2msoread-ww.amagi.tv/mediasfiles/videos/images/.*\.png$' + } + }] + + def _real_extract(self, url): + article_name, lang = self._match_valid_url(url).group('id', 'lang') + video = self._download_json( + f'https://2m.ma/api/articlesByUrl?lang={lang}&url=/news/{article_name}', article_name)['response']['article'][0] + return { + 'id': video['id'], + 'title': video.get('title'), + 'url': video['image'][0], + 'description': video.get('content'), + 'thumbnail': url_or_none(video.get('cover')), + } diff --git a/hypervideo_dl/extractor/dfb.py b/hypervideo_dl/extractor/dfb.py index 97f70fc..c4fb5c2 100644 --- a/hypervideo_dl/extractor/dfb.py +++ b/hypervideo_dl/extractor/dfb.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import unified_strdate @@ -44,7 +41,6 @@ class DFBIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( manifest_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dhm.py b/hypervideo_dl/extractor/dhm.py index aee72a6..3d42fc2 100644 --- a/hypervideo_dl/extractor/dhm.py +++ b/hypervideo_dl/extractor/dhm.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import parse_duration diff --git a/hypervideo_dl/extractor/digg.py b/hypervideo_dl/extractor/digg.py index 913c175..86e8a6f 100644 --- a/hypervideo_dl/extractor/digg.py +++ b/hypervideo_dl/extractor/digg.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import js_to_json diff --git a/hypervideo_dl/extractor/digitalconcerthall.py b/hypervideo_dl/extractor/digitalconcerthall.py index 8398ae3..3461e36 100644 --- a/hypervideo_dl/extractor/digitalconcerthall.py +++ b/hypervideo_dl/extractor/digitalconcerthall.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( @@ -89,9 +86,8 @@ class DigitalConcertHallIE(InfoExtractor): }) m3u8_url = traverse_obj( - stream_info, ('channel', lambda x: x.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) + stream_info, ('channel', lambda k, _: k.startswith('vod_mixed'), 'stream', 0, 'url'), get_all=False) formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', 'm3u8_native', fatal=False) - self._sort_formats(formats) yield { 'id': video_id, diff --git a/hypervideo_dl/extractor/digiteka.py b/hypervideo_dl/extractor/digiteka.py index d632047..912e33b 100644 --- 
a/hypervideo_dl/extractor/digiteka.py +++ b/hypervideo_dl/extractor/digiteka.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import int_or_none @@ -28,6 +23,7 @@ class DigitekaIE(InfoExtractor): ) /id )/(?P<id>[\d+a-z]+)''' + _EMBED_REGEX = [r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)'] _TESTS = [{ # news 'url': 'https://www.ultimedia.com/default/index/videogeneric/id/s8uk0r', @@ -61,14 +57,6 @@ class DigitekaIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<(?:iframe|script)[^>]+src=["\'](?P<url>(?:https?:)?//(?:www\.)?ultimedia\.com/deliver/(?:generic|musique)(?:/[^/]+)*/(?:src|article)/[\d+a-z]+)', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -93,8 +81,6 @@ class DigitekaIE(InfoExtractor): 'format_id': source.get('label'), }) - self._sort_formats(formats) - title = deliver_info['title'] thumbnail = jwconf.get('image') duration = int_or_none(deliver_info.get('duration')) diff --git a/hypervideo_dl/extractor/discovery.py b/hypervideo_dl/extractor/discovery.py index fd3ad75..fd3fc8f 100644 --- a/hypervideo_dl/extractor/discovery.py +++ b/hypervideo_dl/extractor/discovery.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import random import string diff --git a/hypervideo_dl/extractor/discoverygo.py b/hypervideo_dl/extractor/discoverygo.py index 9e7b14a..1f3d8e3 100644 --- a/hypervideo_dl/extractor/discoverygo.py +++ b/hypervideo_dl/extractor/discoverygo.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -52,7 +50,6 @@ class DiscoveryGoBaseIE(InfoExtractor): elif stream_kind == 'hds': formats.extend(self._extract_f4m_formats( stream_url, display_id, f4m_id=stream_kind, fatal=False)) - self._sort_formats(formats) video_id = video.get('id') or display_id description = video.get('description', {}).get('detailed') diff --git a/hypervideo_dl/extractor/discoverynetworks.py b/hypervideo_dl/extractor/discoverynetworks.py deleted file mode 100644 index f43c871..0000000 --- a/hypervideo_dl/extractor/discoverynetworks.py +++ /dev/null @@ -1,42 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -from .dplay import DPlayIE - - -class DiscoveryNetworksDeIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)' - - _TESTS = [{ - 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100', - 'info_dict': { - 'id': '78867', - 'ext': 'mp4', - 'title': 'Die Welt da draußen', - 'description': 'md5:61033c12b73286e409d99a41742ef608', - 'timestamp': 1554069600, - 'upload_date': '20190331', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, { - 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316', - 'only_matching': True, - }, { - 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B', - 'only_matching': True, - }, { - 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/', - 'only_matching': True, - }] - - def _real_extract(self, url): - domain, programme, alternate_id = 
self._match_valid_url(url).groups() - country = 'GB' if domain == 'dplay.co.uk' else 'DE' - realm = 'questuk' if country == 'GB' else domain.replace('.', '') - return self._get_disco_api_info( - url, '%s/%s' % (programme, alternate_id), - 'sonic-eu1-prod.disco-api.com', realm, country) diff --git a/hypervideo_dl/extractor/discoveryplusindia.py b/hypervideo_dl/extractor/discoveryplusindia.py deleted file mode 100644 index 5180140..0000000 --- a/hypervideo_dl/extractor/discoveryplusindia.py +++ /dev/null @@ -1,98 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json - -from ..compat import compat_str -from ..utils import try_get -from .common import InfoExtractor -from .dplay import DPlayIE - - -class DiscoveryPlusIndiaIE(DPlayIE): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/videos?' + DPlayIE._PATH_REGEX - _TESTS = [{ - 'url': 'https://www.discoveryplus.in/videos/how-do-they-do-it/fugu-and-more?seasonId=8&type=EPISODE', - 'info_dict': { - 'id': '27104', - 'ext': 'mp4', - 'display_id': 'how-do-they-do-it/fugu-and-more', - 'title': 'Fugu and More', - 'description': 'The Japanese catch, prepare and eat the deadliest fish on the planet.', - 'duration': 1319, - 'timestamp': 1582309800, - 'upload_date': '20200221', - 'series': 'How Do They Do It?', - 'season_number': 8, - 'episode_number': 2, - 'creator': 'Discovery Channel', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - 'skip': 'Cookies (not necessarily logged in) are needed' - }] - - def _update_disco_api_headers(self, headers, disco_base, display_id, realm): - headers['x-disco-params'] = 'realm=%s' % realm - headers['x-disco-client'] = 'WEB:UNKNOWN:dplus-india:17.0.0' - - def _download_video_playback_info(self, disco_base, video_id, headers): - return self._download_json( - disco_base + 'playback/v3/videoPlaybackInfo', - video_id, headers=headers, data=json.dumps({ - 'deviceInfo': { - 'adBlocker': False, - }, - 'videoId': video_id, - }).encode('utf-8'))['data']['attributes']['streaming'] - - def _real_extract(self, url): - display_id = self._match_id(url) - return self._get_disco_api_info( - url, display_id, 'ap2-prod-direct.discoveryplus.in', 'dplusindia', 'in') - - -class DiscoveryPlusIndiaShowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?discoveryplus\.in/show/(?P<show_name>[^/]+)/?(?:[?#]|$)' - _TESTS = [{ - 'url': 'https://www.discoveryplus.in/show/how-do-they-do-it', - 'playlist_mincount': 140, - 'info_dict': { - 'id': 'how-do-they-do-it', - }, - }] - - def _entries(self, show_name): - headers = { - 'x-disco-client': 'WEB:UNKNOWN:dplus-india:prod', - 'x-disco-params': 'realm=dplusindia', - 'referer': 'https://www.discoveryplus.in/', - } - show_url = 'https://ap2-prod-direct.discoveryplus.in/cms/routes/show/{}?include=default'.format(show_name) - show_json = self._download_json(show_url, - video_id=show_name, - headers=headers)['included'][4]['attributes']['component'] - show_id = show_json['mandatoryParams'].split('=')[-1] - season_url = 'https://ap2-prod-direct.discoveryplus.in/content/videos?sort=episodeNumber&filter[seasonNumber]={}&filter[show.id]={}&page[size]=100&page[number]={}' - for season in show_json['filters'][0]['options']: - season_id = season['id'] - total_pages, page_num = 1, 0 - while page_num < total_pages: - season_json = self._download_json(season_url.format(season_id, show_id, compat_str(page_num + 1)), - video_id=show_id, headers=headers, - note='Downloading JSON metadata%s' % (' page %d' % page_num if page_num else '')) - if page_num == 0: 
- total_pages = try_get(season_json, lambda x: x['meta']['totalPages'], int) or 1 - episodes_json = season_json['data'] - for episode in episodes_json: - video_id = episode['attributes']['path'] - yield self.url_result( - 'https://discoveryplus.in/videos/%s' % video_id, - ie=DiscoveryPlusIndiaIE.ie_key(), video_id=video_id) - page_num += 1 - - def _real_extract(self, url): - show_name = self._match_valid_url(url).group('show_name') - return self.playlist_result(self._entries(show_name), playlist_id=show_name) diff --git a/hypervideo_dl/extractor/discoveryvr.py b/hypervideo_dl/extractor/discoveryvr.py deleted file mode 100644 index cb63c26..0000000 --- a/hypervideo_dl/extractor/discoveryvr.py +++ /dev/null @@ -1,59 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import parse_duration - - -class DiscoveryVRIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?discoveryvr\.com/watch/(?P<id>[^/?#]+)' - _TEST = { - 'url': 'http://www.discoveryvr.com/watch/discovery-vr-an-introduction', - 'md5': '32b1929798c464a54356378b7912eca4', - 'info_dict': { - 'id': 'discovery-vr-an-introduction', - 'ext': 'mp4', - 'title': 'Discovery VR - An Introduction', - 'description': 'md5:80d418a10efb8899d9403e61d8790f06', - } - } - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - bootstrap_data = self._search_regex( - r'root\.DVR\.bootstrapData\s+=\s+"({.+?})";', - webpage, 'bootstrap data') - bootstrap_data = self._parse_json( - bootstrap_data.encode('utf-8').decode('unicode_escape'), - display_id) - videos = self._parse_json(bootstrap_data['videos'], display_id)['allVideos'] - video_data = next(video for video in videos if video.get('slug') == display_id) - - series = video_data.get('showTitle') - title = episode = video_data.get('title') or series - if series and series != title: - title = '%s - %s' % (series, title) - - formats = [] - for f, format_id in (('cdnUriM3U8', 'mobi'), ('webVideoUrlSd', 'sd'), ('webVideoUrlHd', 'hd')): - f_url = video_data.get(f) - if not f_url: - continue - formats.append({ - 'format_id': format_id, - 'url': f_url, - }) - - return { - 'id': display_id, - 'display_id': display_id, - 'title': title, - 'description': video_data.get('description'), - 'thumbnail': video_data.get('thumbnail'), - 'duration': parse_duration(video_data.get('runTime')), - 'formats': formats, - 'episode': episode, - 'series': series, - } diff --git a/hypervideo_dl/extractor/disney.py b/hypervideo_dl/extractor/disney.py index 0ad7b1f..430de32 100644 --- a/hypervideo_dl/extractor/disney.py +++ b/hypervideo_dl/extractor/disney.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -137,7 +134,6 @@ class DisneyIE(InfoExtractor): self.raise_no_formats( '%s said: %s' % (self.IE_NAME, page_data['translations']['video_expired']), expected=True) - self._sort_formats(formats) subtitles = {} for caption in video_data.get('captions', []): diff --git a/hypervideo_dl/extractor/dispeak.py b/hypervideo_dl/extractor/dispeak.py index 3d651f3..37f89b9 100644 --- a/hypervideo_dl/extractor/dispeak.py +++ b/hypervideo_dl/extractor/dispeak.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -119,7 +117,6 @@ class DigitallySpeakingIE(InfoExtractor): video_formats = self._parse_mp4(metadata) if video_formats is None: video_formats = self._parse_flv(metadata) - 
self._sort_formats(video_formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dlive.py b/hypervideo_dl/extractor/dlive.py index 7410eb6..30fcf9f 100644 --- a/hypervideo_dl/extractor/dlive.py +++ b/hypervideo_dl/extractor/dlive.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -42,7 +40,6 @@ class DLiveVODIE(InfoExtractor): title = broadcast['title'] formats = self._extract_m3u8_formats( broadcast['playbackUrl'], vod_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) return { 'id': vod_id, 'title': title, @@ -81,7 +78,6 @@ class DLiveStreamIE(InfoExtractor): formats = self._extract_m3u8_formats( 'https://live.prd.dlive.tv/hls/live/%s.m3u8' % username, display_name, 'mp4') - self._sort_formats(formats) return { 'id': display_name, 'title': title, diff --git a/hypervideo_dl/extractor/doodstream.py b/hypervideo_dl/extractor/doodstream.py deleted file mode 100644 index f692127..0000000 --- a/hypervideo_dl/extractor/doodstream.py +++ /dev/null @@ -1,76 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import string -import random -import time - -from .common import InfoExtractor - - -class DoodStreamIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch)/[ed]/(?P<id>[a-z0-9]+)' - _TESTS = [{ - 'url': 'http://dood.to/e/5s1wmbdacezb', - 'md5': '4568b83b31e13242b3f1ff96c55f0595', - 'info_dict': { - 'id': '5s1wmbdacezb', - 'ext': 'mp4', - 'title': 'Kat Wonders - Monthly May 2020', - 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', - 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', - } - }, { - 'url': 'http://dood.watch/d/5s1wmbdacezb', - 'md5': '4568b83b31e13242b3f1ff96c55f0595', - 'info_dict': { - 'id': '5s1wmbdacezb', - 'ext': 'mp4', - 'title': 'Kat Wonders - Monthly May 2020', - 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', - 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', - } - }, { - 'url': 'https://dood.to/d/jzrxn12t2s7n', - 'md5': '3207e199426eca7c2aa23c2872e6728a', - 'info_dict': { - 'id': 'jzrxn12t2s7n', - 'ext': 'mp4', - 'title': 'Stacy Cruz Cute ALLWAYSWELL', - 'description': 'Stacy Cruz Cute ALLWAYSWELL | DoodStream.com', - 'thumbnail': 'https://img.doodcdn.com/snaps/8edqd5nppkac3x8u.jpg', - } - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - url = f'https://dood.to/e/{video_id}' - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta(['og:title', 'twitter:title'], webpage, default=None) - thumb = self._html_search_meta(['og:image', 'twitter:image'], webpage, default=None) - token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token') - description = self._html_search_meta( - ['og:description', 'description', 'twitter:description'], webpage, default=None) - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0', - 'referer': url - } - - pass_md5 = self._html_search_regex(r'(/pass_md5.*?)\'', webpage, 'pass_md5') - final_url = ''.join(( - self._download_webpage(f'https://dood.to{pass_md5}', video_id, headers=headers), - *(random.choice(string.ascii_letters + string.digits) for _ in range(10)), - f'?token={token}&expiry={int(time.time() * 1000)}', - )) - - return { - 'id': video_id, - 'title': title, - 'url': final_url, - 'http_headers': headers, - 'ext': 'mp4', - 'description': description, - 'thumbnail': thumb, - } diff --git a/hypervideo_dl/extractor/dotsub.py 
b/hypervideo_dl/extractor/dotsub.py index 148605c..079f837 100644 --- a/hypervideo_dl/extractor/dotsub.py +++ b/hypervideo_dl/extractor/dotsub.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, diff --git a/hypervideo_dl/extractor/douyutv.py b/hypervideo_dl/extractor/douyutv.py index 26a8d64..477f468 100644 --- a/hypervideo_dl/extractor/douyutv.py +++ b/hypervideo_dl/extractor/douyutv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import time import hashlib import re diff --git a/hypervideo_dl/extractor/dplay.py b/hypervideo_dl/extractor/dplay.py index a25f27c..8eb4d8f 100644 --- a/hypervideo_dl/extractor/dplay.py +++ b/hypervideo_dl/extractor/dplay.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import uuid @@ -11,6 +8,7 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + remove_start, strip_or_none, try_get, unified_timestamp, @@ -128,7 +126,6 @@ class DPlayBaseIE(InfoExtractor): 'url': format_url, 'format_id': format_id, }) - self._sort_formats(formats) creator = series = None tags = [] @@ -314,7 +311,7 @@ class DPlayIE(DPlayBaseIE): def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = mobj.group('id') - domain = mobj.group('domain').lstrip('www.') + domain = remove_start(mobj.group('domain'), 'www.') country = mobj.group('country') or mobj.group('subdomain_country') or mobj.group('plus_country') host = 'disco-api.' + domain if domain[0] == 'd' else 'eu2-prod.disco-api.com' return self._get_disco_api_info( @@ -720,6 +717,72 @@ class TLCIE(DiscoveryPlusBaseIE): } +class MotorTrendIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:watch\.)?motortrend\.com/video' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://watch.motortrend.com/video/car-issues-motortrend-atve-us/double-dakotas', + 'info_dict': { + 'id': '"4859182"', + 'display_id': 'double-dakotas', + 'ext': 'mp4', + 'title': 'Double Dakotas', + 'description': 'Tylers buy-one-get-one Dakota deal has the Wizard pulling double duty.', + 'season_number': 2, + 'episode_number': 3, + }, + 'skip': 'Available for Premium users', + }, { + 'url': 'https://watch.motortrend.com/video/car-issues-motortrend-atve-us/double-dakotas', + 'only_matching': True, + }] + + _PRODUCT = 'vel' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.watch.motortrend.com', + 'realm': 'go', + 'country': 'us', + } + + +class MotorTrendOnDemandIE(DiscoveryPlusBaseIE): + _VALID_URL = r'https?://(?:www\.)?motortrendondemand\.com/detail' + DPlayBaseIE._PATH_REGEX + _TESTS = [{ + 'url': 'https://www.motortrendondemand.com/detail/wheelstanding-dump-truck-stubby-bobs-comeback/37699/784', + 'info_dict': { + 'id': '37699', + 'display_id': 'wheelstanding-dump-truck-stubby-bobs-comeback/37699', + 'ext': 'mp4', + 'title': 'Wheelstanding Dump Truck! 
Stubby Bob’s Comeback', + 'description': 'md5:996915abe52a1c3dfc83aecea3cce8e7', + 'season_number': 5, + 'episode_number': 52, + 'episode': 'Episode 52', + 'season': 'Season 5', + 'thumbnail': r're:^https?://.+\.jpe?g$', + 'timestamp': 1388534401, + 'duration': 1887.345, + 'creator': 'Originals', + 'series': 'Roadkill', + 'upload_date': '20140101', + 'tags': [], + }, + }] + + _PRODUCT = 'MTOD' + _DISCO_API_PARAMS = { + 'disco_host': 'us1-prod-direct.motortrendondemand.com', + 'realm': 'motortrend', + 'country': 'us', + } + + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': f'realm={realm}', + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:4.39.1-gi1', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + + class DiscoveryPlusIE(DiscoveryPlusBaseIE): _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?!it/)(?:\w{2}/)?video' + DPlayBaseIE._PATH_REGEX _TESTS = [{ @@ -882,6 +945,9 @@ class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE): _TESTS = [{ 'url': 'https://www.discoveryplus.com/it/video/i-signori-della-neve/stagione-2-episodio-1-i-preparativi', 'only_matching': True, + }, { + 'url': 'https://www.discoveryplus.com/it/video/super-benny/trailer', + 'only_matching': True, }] _PRODUCT = 'dplus_us' @@ -891,6 +957,13 @@ class DiscoveryPlusItalyIE(DiscoveryPlusBaseIE): 'country': 'it', } + def _update_disco_api_headers(self, headers, disco_base, display_id, realm): + headers.update({ + 'x-disco-params': 'realm=%s' % realm, + 'x-disco-client': f'WEB:UNKNOWN:{self._PRODUCT}:25.2.6', + 'Authorization': self._get_auth(disco_base, display_id, realm), + }) + class DiscoveryPlusItalyShowIE(DiscoveryPlusShowBaseIE): _VALID_URL = r'https?://(?:www\.)?discoveryplus\.it/programmi/(?P<show_name>[^/]+)/?(?:[?#]|$)' diff --git a/hypervideo_dl/extractor/drbonanza.py b/hypervideo_dl/extractor/drbonanza.py index ea0f06d..824d70d 100644 --- a/hypervideo_dl/extractor/drbonanza.py +++ b/hypervideo_dl/extractor/drbonanza.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( js_to_json, @@ -33,7 +30,6 @@ class DRBonanzaIE(InfoExtractor): info = self._parse_html5_media_entries( url, webpage, display_id, m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0] - self._sort_formats(info['formats']) asset = self._parse_json( self._search_regex( diff --git a/hypervideo_dl/extractor/dreisat.py b/hypervideo_dl/extractor/dreisat.py index 5a07c18..8a59c23 100644 --- a/hypervideo_dl/extractor/dreisat.py +++ b/hypervideo_dl/extractor/dreisat.py @@ -1,9 +1,7 @@ -from __future__ import unicode_literals - from .zdf import ZDFIE -class DreiSatIE(ZDFIE): +class DreiSatIE(ZDFIE): # XXX: Do not subclass from concrete IE IE_NAME = '3sat' _VALID_URL = r'https?://(?:www\.)?3sat\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)\.html' _TESTS = [{ diff --git a/hypervideo_dl/extractor/drooble.py b/hypervideo_dl/extractor/drooble.py index 0584250..106e5c4 100644 --- a/hypervideo_dl/extractor/drooble.py +++ b/hypervideo_dl/extractor/drooble.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/dropbox.py b/hypervideo_dl/extractor/dropbox.py index 2559657..214b309 100644 --- a/hypervideo_dl/extractor/dropbox.py +++ b/hypervideo_dl/extractor/dropbox.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import os.path import re @@ -56,8 +53,8 @@ class DropboxIE(InfoExtractor): else: 
raise ExtractorError('Password protected video, use --video-password <password>', expected=True) - json_string = self._html_search_regex(r'InitReact\.mountComponent\(.*?,\s*(\{.+\})\s*?\)', webpage, 'Info JSON') - info_json = self._parse_json(json_string, video_id).get('props') + info_json = self._search_json(r'InitReact\.mountComponent\(.*?,', webpage, 'mountComponent', video_id, + contains_pattern=r'{.+?"preview".+?}', end_pattern=r'\)')['props'] transcode_url = traverse_obj(info_json, ((None, 'preview'), 'file', 'preview', 'content', 'transcode_url'), get_all=False) formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id) @@ -66,7 +63,6 @@ class DropboxIE(InfoExtractor): video_url = re.sub(r'[?&]dl=0', '', url) video_url += ('?' if '?' not in video_url else '&') + 'dl=1' formats.append({'url': video_url, 'format_id': 'original', 'format_note': 'Original', 'quality': 1}) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/dropout.py b/hypervideo_dl/extractor/dropout.py index 2fa6195..e280b1c 100644 --- a/hypervideo_dl/extractor/dropout.py +++ b/hypervideo_dl/extractor/dropout.py @@ -1,9 +1,8 @@ -# coding: utf-8 from .common import InfoExtractor from .vimeo import VHXEmbedIE from ..utils import ( - clean_html, ExtractorError, + clean_html, get_element_by_class, get_element_by_id, get_elements_by_class, @@ -97,11 +96,12 @@ class DropoutIE(InfoExtractor): def _login(self, display_id): username, password = self._get_login_info() - if not (username and password): - self.raise_login_required(method='password') + if not username: + return True response = self._download_webpage( - self._LOGIN_URL, display_id, note='Logging in', data=urlencode_postdata({ + self._LOGIN_URL, display_id, note='Logging in', fatal=False, + data=urlencode_postdata({ 'email': username, 'password': password, 'authenticity_token': self._get_authenticity_token(display_id), @@ -111,19 +111,25 @@ class DropoutIE(InfoExtractor): user_has_subscription = self._search_regex( r'user_has_subscription:\s*["\'](.+?)["\']', response, 'subscription status', default='none') if user_has_subscription.lower() == 'true': - return response + return elif user_has_subscription.lower() == 'false': - raise ExtractorError('Account is not subscribed') + return 'Account is not subscribed' else: - raise ExtractorError('Incorrect username/password') + return 'Incorrect username/password' def _real_extract(self, url): display_id = self._match_id(url) - try: - self._login(display_id) - webpage = self._download_webpage(url, display_id, note='Downloading video webpage') - finally: - self._download_webpage('https://www.dropout.tv/logout', display_id, note='Logging out', fatal=False) + + webpage = None + if self._get_cookies('https://www.dropout.tv').get('_session'): + webpage = self._download_webpage(url, display_id) + if not webpage or '<div id="watch-unauthorized"' in webpage: + login_err = self._login(display_id) + webpage = self._download_webpage(url, display_id) + if login_err and '<div id="watch-unauthorized"' in webpage: + if login_err is True: + self.raise_login_required(method='any') + raise ExtractorError(login_err, expected=True) embed_url = self._search_regex(r'embed_url:\s*["\'](.+?)["\']', webpage, 'embed url') thumbnail = self._og_search_thumbnail(webpage) @@ -138,7 +144,7 @@ class DropoutIE(InfoExtractor): return { '_type': 'url_transparent', 'ie_key': VHXEmbedIE.ie_key(), - 'url': embed_url, + 'url': VHXEmbedIE._smuggle_referrer(embed_url, 'https://www.dropout.tv'), 
'id': self._search_regex(r'embed\.vhx\.tv/videos/(.+?)\?', embed_url, 'id'), 'display_id': display_id, 'title': title, diff --git a/hypervideo_dl/extractor/drtuber.py b/hypervideo_dl/extractor/drtuber.py index 540b86a..e5dab6a 100644 --- a/hypervideo_dl/extractor/drtuber.py +++ b/hypervideo_dl/extractor/drtuber.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -13,6 +11,7 @@ from ..utils import ( class DrTuberIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|m)\.)?drtuber\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[\w-]+))?' + _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)'] _TESTS = [{ 'url': 'http://www.drtuber.com/video/1740434/hot-perky-blonde-naked-golf', 'md5': '93e680cf2536ad0dfb7e74d94a89facd', @@ -35,12 +34,6 @@ class DrTuberIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?drtuber\.com/embed/\d+)', - webpage) - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -65,7 +58,6 @@ class DrTuberIE(InfoExtractor): 'quality': 2 if format_id == 'hq' else 1, 'url': video_url }) - self._sort_formats(formats) duration = int_or_none(video_data.get('duration')) or parse_duration( video_data.get('duration_format')) diff --git a/hypervideo_dl/extractor/drtv.py b/hypervideo_dl/extractor/drtv.py index 37e4d5b..128f439 100644 --- a/hypervideo_dl/extractor/drtv.py +++ b/hypervideo_dl/extractor/drtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import binascii import hashlib import re @@ -26,7 +23,7 @@ class DRTVIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*| + (?:www\.)?dr\.dk/(?:tv/se|nyheder|(?:radio|lyd)(?:/ondemand)?)/(?:[^/]+/)*| (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode|program)/ ) (?P<id>[\da-z_-]+) @@ -54,6 +51,7 @@ class DRTVIE(InfoExtractor): 'release_year': 2016, }, 'expected_warnings': ['Unable to download f4m manifest'], + 'skip': 'this video has been removed', }, { # embed 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', @@ -74,31 +72,41 @@ class DRTVIE(InfoExtractor): # with SignLanguage formats 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', 'info_dict': { - 'id': 'historien-om-danmark-stenalder', + 'id': '00831690010', 'ext': 'mp4', 'title': 'Historien om Danmark: Stenalder', 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', 'timestamp': 1546628400, 'upload_date': '20190104', - 'duration': 3502.56, + 'duration': 3504.618, 'formats': 'mincount:20', + 'release_year': 2017, + 'season_id': 'urn:dr:mu:bundle:5afc03ad6187a4065ca5fd35', + 'season_number': 1, + 'season': 'Historien om Danmark', + 'series': 'Historien om Danmark', }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://www.dr.dk/radio/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', + 'url': 'https://www.dr.dk/lyd/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9', 'only_matching': True, }, { 'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769', 'info_dict': { 'id': '00951930010', 'ext': 'mp4', - 'title': 'Bonderøven (1:8)', - 'description': 'md5:3cf18fc0d3b205745d4505f896af8121', - 'timestamp': 1546542000, - 'upload_date': '20190103', + 'title': 'Bonderøven 2019 (1:8)', + 'description': 
'md5:b6dcfe9b6f0bea6703e9a0092739a5bd', + 'timestamp': 1603188600, + 'upload_date': '20201020', 'duration': 2576.6, + 'season': 'Bonderøven 2019', + 'season_id': 'urn:dr:mu:bundle:5c201667a11fa01ca4528ce5', + 'release_year': 2019, + 'season_number': 2019, + 'series': 'Frank & Kastaniegaarden' }, 'params': { 'skip_download': True, @@ -112,6 +120,24 @@ class DRTVIE(InfoExtractor): }, { 'url': 'https://www.dr.dk/drtv/program/jagten_220924', 'only_matching': True, + }, { + 'url': 'https://www.dr.dk/lyd/p4aarhus/regionale-nyheder-ar4/regionale-nyheder-2022-05-05-12-30-3', + 'info_dict': { + 'id': 'urn:dr:mu:programcard:6265cb2571401424d0360113', + 'title': "Regionale nyheder", + 'ext': 'mp4', + 'duration': 120.043, + 'series': 'P4 Østjylland regionale nyheder', + 'timestamp': 1651746600, + 'season': 'Regionale nyheder', + 'release_year': 0, + 'season_id': 'urn:dr:mu:bundle:61c26889539f0201586b73c5', + 'description': '', + 'upload_date': '20220505', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -274,8 +300,6 @@ class DRTVIE(InfoExtractor): 'Unfortunately, DR is not allowed to show this program outside Denmark.', countries=self._GEO_COUNTRIES) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -340,7 +364,6 @@ class DRTVLiveIE(InfoExtractor): formats.extend(self._extract_f4m_formats(update_url_query( '%s/%s' % (server, stream_path), {'hdcore': '3.7.0'}), channel_id, f4m_id=link_type, fatal=False)) - self._sort_formats(formats) return { 'id': channel_id, diff --git a/hypervideo_dl/extractor/dtube.py b/hypervideo_dl/extractor/dtube.py index ad247b7..25a98f6 100644 --- a/hypervideo_dl/extractor/dtube.py +++ b/hypervideo_dl/extractor/dtube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from socket import timeout diff --git a/hypervideo_dl/extractor/duboku.py b/hypervideo_dl/extractor/duboku.py index a875978..fb0546c 100644 --- a/hypervideo_dl/extractor/duboku.py +++ b/hypervideo_dl/extractor/duboku.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -54,31 +51,39 @@ def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, e class DubokuIE(InfoExtractor): IE_NAME = 'duboku' - IE_DESC = 'www.duboku.co' + IE_DESC = 'www.duboku.io' - _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*' + _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*' _TESTS = [{ - 'url': 'https://www.duboku.co/vodplay/1575-1-1.html', + 'url': 'https://w.duboku.io/vodplay/1575-1-1.html', 'info_dict': { 'id': '1575-1-1', - 'ext': 'ts', + 'ext': 'mp4', 'series': '白色月光', 'title': 'contains:白色月光', 'season_number': 1, 'episode_number': 1, + 'season': 'Season 1', + 'episode_id': '1', + 'season_id': '1', + 'episode': 'Episode 1', }, 'params': { 'skip_download': 'm3u8 download', }, }, { - 'url': 'https://www.duboku.co/vodplay/1588-1-1.html', + 'url': 'https://w.duboku.io/vodplay/1588-1-1.html', 'info_dict': { 'id': '1588-1-1', - 'ext': 'ts', + 'ext': 'mp4', 'series': '亲爱的自己', - 'title': 'contains:预告片', + 'title': 'contains:第1集', 'season_number': 1, 'episode_number': 1, + 'episode': 'Episode 1', + 'season': 'Season 1', + 'episode_id': '1', + 'season_id': '1', }, 'params': { 'skip_download': 'm3u8 download', @@ -94,7 +99,7 @@ class DubokuIE(InfoExtractor): season_id = temp[1] episode_id = temp[2] - webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id + webpage_url = 
'https://w.duboku.io/vodplay/%s.html' % video_id webpage_html = self._download_webpage(webpage_url, video_id) # extract video url @@ -127,12 +132,13 @@ class DubokuIE(InfoExtractor): data_from = player_data.get('from') # if it is an embedded iframe, maybe it's an external source + headers = {'Referer': webpage_url} if data_from == 'iframe': # use _type url_transparent to retain the meaningful details # of the video. return { '_type': 'url_transparent', - 'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}), + 'url': smuggle_url(data_url, {'http_headers': headers}), 'id': video_id, 'title': title, 'series': series_title, @@ -142,7 +148,7 @@ class DubokuIE(InfoExtractor): 'episode_id': episode_id, } - formats = self._extract_m3u8_formats(data_url, video_id, 'mp4') + formats = self._extract_m3u8_formats(data_url, video_id, 'mp4', headers=headers) return { 'id': video_id, @@ -153,36 +159,29 @@ class DubokuIE(InfoExtractor): 'episode_number': int_or_none(episode_id), 'episode_id': episode_id, 'formats': formats, - 'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'} + 'http_headers': headers } class DubokuPlaylistIE(InfoExtractor): IE_NAME = 'duboku:list' - IE_DESC = 'www.duboku.co entire series' + IE_DESC = 'www.duboku.io entire series' - _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*' + _VALID_URL = r'(?:https?://[^/]+\.duboku\.io/voddetail/)(?P<id>[0-9]+)\.html.*' _TESTS = [{ - 'url': 'https://www.duboku.co/voddetail/1575.html', + 'url': 'https://w.duboku.io/voddetail/1575.html', 'info_dict': { 'id': 'startswith:1575', 'title': '白色月光', }, 'playlist_count': 12, }, { - 'url': 'https://www.duboku.co/voddetail/1554.html', + 'url': 'https://w.duboku.io/voddetail/1554.html', 'info_dict': { 'id': 'startswith:1554', 'title': '以家人之名', }, 'playlist_mincount': 30, - }, { - 'url': 'https://www.duboku.co/voddetail/1554.html#playlist2', - 'info_dict': { - 'id': '1554#playlist2', - 'title': '以家人之名', - }, - 'playlist_mincount': 27, }] def _real_extract(self, url): @@ -192,7 +191,7 @@ class DubokuPlaylistIE(InfoExtractor): series_id = mobj.group('id') fragment = compat_urlparse.urlparse(url).fragment - webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id + webpage_url = 'https://w.duboku.io/voddetail/%s.html' % series_id webpage_html = self._download_webpage(webpage_url, series_id) # extract title @@ -237,6 +236,6 @@ class DubokuPlaylistIE(InfoExtractor): # return url results return self.playlist_result([ self.url_result( - compat_urlparse.urljoin('https://www.duboku.co', x['href']), + compat_urlparse.urljoin('https://w.duboku.io', x['href']), ie=DubokuIE.ie_key(), video_title=x.get('title')) for x in playlist], series_id + '#' + playlist_id, title) diff --git a/hypervideo_dl/extractor/dumpert.py b/hypervideo_dl/extractor/dumpert.py index d9d9afd..010c2d0 100644 --- a/hypervideo_dl/extractor/dumpert.py +++ b/hypervideo_dl/extractor/dumpert.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -51,7 +48,6 @@ class DumpertIE(InfoExtractor): 'format_id': version, 'quality': quality(version), }) - self._sort_formats(formats) thumbnails = [] stills = item.get('stills') or {} diff --git a/hypervideo_dl/extractor/dvtv.py b/hypervideo_dl/extractor/dvtv.py index 08663cf..e671433 100644 --- a/hypervideo_dl/extractor/dvtv.py +++ b/hypervideo_dl/extractor/dvtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - import re from .common import InfoExtractor @@ -145,7 +142,6 @@ class DVTVIE(InfoExtractor): 'format_id': join_nonempty('http', ext, label), 'height': int_or_none(height), }) - self._sort_formats(formats) return { 'id': data.get('mediaid') or video_id, diff --git a/hypervideo_dl/extractor/dw.py b/hypervideo_dl/extractor/dw.py index 6eaee07..9c4a08e 100644 --- a/hypervideo_dl/extractor/dw.py +++ b/hypervideo_dl/extractor/dw.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -65,7 +62,6 @@ class DWIE(InfoExtractor): transform_source=lambda s: s.replace( 'rtmp://tv-od.dw.de/flash/', 'http://tv-download.dw.de/dwtv_video/flv/')) - self._sort_formats(formats) upload_date = hidden_inputs.get('display_date') if not upload_date: diff --git a/hypervideo_dl/extractor/eagleplatform.py b/hypervideo_dl/extractor/eagleplatform.py index f86731a..9ebd24d 100644 --- a/hypervideo_dl/extractor/eagleplatform.py +++ b/hypervideo_dl/extractor/eagleplatform.py @@ -1,6 +1,4 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import functools import re from .common import InfoExtractor @@ -8,6 +6,7 @@ from ..compat import compat_HTTPError from ..utils import ( ExtractorError, int_or_none, + smuggle_url, unsmuggle_url, url_or_none, ) @@ -21,6 +20,7 @@ class EaglePlatformIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1'] _TESTS = [{ # http://lenta.ru/news/2015/03/06/navalny/ 'url': 'http://lentaru.media.eagleplatform.com/index/player?player=new&record_id=227304&player_template_id=5201', @@ -55,14 +55,14 @@ class EaglePlatformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - # Regular iframe embedding - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', - webpage) - if mobj is not None: - return mobj.group('url') + @classmethod + def _extract_embed_urls(cls, url, webpage): + add_referer = functools.partial(smuggle_url, data={'referrer': url}) + + res = tuple(super()._extract_embed_urls(url, webpage)) + if res: + return map(add_referer, res) + PLAYER_JS_RE = r''' <script[^>]+ src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs) @@ -77,7 +77,7 @@ class EaglePlatformIE(InfoExtractor): data-id=["\'](?P<id>\d+) ''' % PLAYER_JS_RE, webpage) if mobj is not None: - return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())] # Generalization of "Javascript code usage", "Combined usage" and # "Usage without attaching to DOM" embeddings (see # http://dultonmedia.github.io/eplayer/) @@ -98,7 +98,7 @@ class EaglePlatformIE(InfoExtractor): </script> ''' % PLAYER_JS_RE, webpage) if mobj is not None: - return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict() + return [add_referer('eagleplatform:%(host)s:%(id)s' % mobj.groupdict())] @staticmethod def _handle_error(response): @@ -192,8 +192,6 @@ class EaglePlatformIE(InfoExtractor): f['url'] = format_url formats.append(f) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -204,3 +202,14 @@ class EaglePlatformIE(InfoExtractor): 'age_limit': age_limit, 'formats': formats, } + + +class ClipYouEmbedIE(InfoExtractor): + _VALID_URL = False + + @classmethod + def _extract_embed_urls(cls, url, 
webpage): + mobj = re.search( + r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) + if mobj is not None: + yield smuggle_url('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), {'referrer': url}) diff --git a/hypervideo_dl/extractor/ebaumsworld.py b/hypervideo_dl/extractor/ebaumsworld.py index c97682c..0854d03 100644 --- a/hypervideo_dl/extractor/ebaumsworld.py +++ b/hypervideo_dl/extractor/ebaumsworld.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/echomsk.py b/hypervideo_dl/extractor/echomsk.py index 6b7cc65..850eabb 100644 --- a/hypervideo_dl/extractor/echomsk.py +++ b/hypervideo_dl/extractor/echomsk.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/egghead.py b/hypervideo_dl/extractor/egghead.py index b6b8676..a4b2a12 100644 --- a/hypervideo_dl/extractor/egghead.py +++ b/hypervideo_dl/extractor/egghead.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -120,7 +117,6 @@ class EggheadLessonIE(EggheadBaseIE): formats.append({ 'url': format_url, }) - self._sort_formats(formats) return { 'id': lesson_id, diff --git a/hypervideo_dl/extractor/ehow.py b/hypervideo_dl/extractor/ehow.py index b1cd4f5..74469ce 100644 --- a/hypervideo_dl/extractor/ehow.py +++ b/hypervideo_dl/extractor/ehow.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote diff --git a/hypervideo_dl/extractor/eighttracks.py b/hypervideo_dl/extractor/eighttracks.py index 9a44f89..3dd9ab1 100644 --- a/hypervideo_dl/extractor/eighttracks.py +++ b/hypervideo_dl/extractor/eighttracks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import random diff --git a/hypervideo_dl/extractor/einthusan.py b/hypervideo_dl/extractor/einthusan.py index 7af279a..53bc253 100644 --- a/hypervideo_dl/extractor/einthusan.py +++ b/hypervideo_dl/extractor/einthusan.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -92,8 +89,6 @@ class EinthusanIE(InfoExtractor): 'url': mp4_url, }) - self._sort_formats(formats) - description = get_elements_by_class('synopsis', webpage)[0] thumbnail = self._html_search_regex( r'''<img[^>]+src=(["'])(?P<url>(?!\1).+?/moviecovers/(?!\1).+?)\1''', diff --git a/hypervideo_dl/extractor/eitb.py b/hypervideo_dl/extractor/eitb.py index ee5ead1..bd027da 100644 --- a/hypervideo_dl/extractor/eitb.py +++ b/hypervideo_dl/extractor/eitb.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -74,8 +71,6 @@ class EitbIE(InfoExtractor): '%s?hdcore=3.7.0' % hds_url.replace('euskalsvod', 'euskalvod'), video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) - return { 'id': video_id, 'title': media.get('NAME_ES') or media.get('name') or media['NAME_EU'], diff --git a/hypervideo_dl/extractor/ellentube.py b/hypervideo_dl/extractor/ellentube.py index d451bc0..6eb00f9 100644 --- a/hypervideo_dl/extractor/ellentube.py +++ b/hypervideo_dl/extractor/ellentube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import 
InfoExtractor from ..utils import ( clean_html, @@ -31,7 +28,6 @@ class EllenTubeBaseIE(InfoExtractor): entry_protocol='m3u8_native', m3u8_id='hls') duration = int_or_none(entry.get('duration')) break - self._sort_formats(formats) def get_insight(kind): return int_or_none(try_get( diff --git a/hypervideo_dl/extractor/elonet.py b/hypervideo_dl/extractor/elonet.py index 9c6aea2..c5558ff 100644 --- a/hypervideo_dl/extractor/elonet.py +++ b/hypervideo_dl/extractor/elonet.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import determine_ext @@ -56,7 +53,6 @@ class ElonetIE(InfoExtractor): else: formats, subtitles = [], {} self.raise_no_formats(f'Unknown streaming format {ext}') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/elpais.py b/hypervideo_dl/extractor/elpais.py index b89f6db..7c6c880 100644 --- a/hypervideo_dl/extractor/elpais.py +++ b/hypervideo_dl/extractor/elpais.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import strip_jsonp, unified_strdate diff --git a/hypervideo_dl/extractor/embedly.py b/hypervideo_dl/extractor/embedly.py index a5820b2..483d018 100644 --- a/hypervideo_dl/extractor/embedly.py +++ b/hypervideo_dl/extractor/embedly.py @@ -1,6 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import re +import urllib.parse from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote @@ -12,5 +11,14 @@ class EmbedlyIE(InfoExtractor): 'only_matching': True, }] + @classmethod + def _extract_embed_urls(cls, url, webpage): + # Bypass suitable check + for mobj in re.finditer(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage): + yield mobj.group('url') + + for mobj in re.finditer(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage): + yield urllib.parse.unquote(mobj.group('url')) + def _real_extract(self, url): return self.url_result(compat_urllib_parse_unquote(self._match_id(url))) diff --git a/hypervideo_dl/extractor/engadget.py b/hypervideo_dl/extractor/engadget.py index 733bf32..e7c5d7b 100644 --- a/hypervideo_dl/extractor/engadget.py +++ b/hypervideo_dl/extractor/engadget.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/epicon.py b/hypervideo_dl/extractor/epicon.py index cd19325..3bfcc54 100644 --- a/hypervideo_dl/extractor/epicon.py +++ b/hypervideo_dl/extractor/epicon.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -62,7 +59,6 @@ class EpiconIE(InfoExtractor): description = self._og_search_description(webpage) or None thumbnail = self._og_search_thumbnail(webpage) or None formats = self._extract_m3u8_formats(data_json['url']['video_url'], id) - self._sort_formats(formats) subtitles = {} for subtitle in data_json.get('subtitles', []): diff --git a/hypervideo_dl/extractor/epoch.py b/hypervideo_dl/extractor/epoch.py new file mode 100644 index 0000000..110e78c --- /dev/null +++ b/hypervideo_dl/extractor/epoch.py @@ -0,0 +1,55 @@ +from .common import InfoExtractor +from ..utils import extract_attributes, get_element_html_by_id + + +class EpochIE(InfoExtractor): + _VALID_URL = r'https?://www.theepochtimes\.com/[\w-]+_(?P<id>\d+).html' + _TESTS = [ + { + 'url': 
'https://www.theepochtimes.com/they-can-do-audio-video-physical-surveillance-on-you-24h-365d-a-year-rex-lee-on-intrusive-apps_4661688.html', + 'info_dict': { + 'id': 'a3dd732c-4750-4bc8-8156-69180668bda1', + 'ext': 'mp4', + 'title': '‘They Can Do Audio, Video, Physical Surveillance on You 24H/365D a Year’: Rex Lee on Intrusive Apps', + } + }, + { + 'url': 'https://www.theepochtimes.com/the-communist-partys-cyberattacks-on-america-explained-rex-lee-talks-tech-hybrid-warfare_4342413.html', + 'info_dict': { + 'id': '276c7f46-3bbf-475d-9934-b9bbe827cf0a', + 'ext': 'mp4', + 'title': 'The Communist Party’s Cyberattacks on America Explained; Rex Lee Talks Tech Hybrid Warfare', + } + }, + { + 'url': 'https://www.theepochtimes.com/kash-patel-a-6-year-saga-of-government-corruption-from-russiagate-to-mar-a-lago_4690250.html', + 'info_dict': { + 'id': 'aa9ceecd-a127-453d-a2de-7153d6fd69b6', + 'ext': 'mp4', + 'title': 'Kash Patel: A ‘6-Year-Saga’ of Government Corruption, From Russiagate to Mar-a-Lago', + } + }, + { + 'url': 'https://www.theepochtimes.com/dick-morris-discusses-his-book-the-return-trumps-big-2024-comeback_4819205.html', + 'info_dict': { + 'id': '9489f994-2a20-4812-b233-ac0e5c345632', + 'ext': 'mp4', + 'title': 'Dick Morris Discusses His Book ‘The Return: Trump’s Big 2024 Comeback’', + } + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + youmaker_video_id = extract_attributes(get_element_html_by_id('videobox', webpage))['data-id'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'http://vs1.youmaker.com/assets/{youmaker_video_id}/playlist.m3u8', video_id, 'mp4', m3u8_id='hls') + + return { + 'id': youmaker_video_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': self._html_extract_title(webpage) + } diff --git a/hypervideo_dl/extractor/eporner.py b/hypervideo_dl/extractor/eporner.py index 25a0d97..a233797 100644 --- a/hypervideo_dl/extractor/eporner.py +++ b/hypervideo_dl/extractor/eporner.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( encode_base_n, @@ -110,7 +106,6 @@ class EpornerIE(InfoExtractor): 'height': height, 'fps': fps, }) - self._sort_formats(formats) json_ld = self._search_json_ld(webpage, display_id, default={}) diff --git a/hypervideo_dl/extractor/eroprofile.py b/hypervideo_dl/extractor/eroprofile.py index 5d5e7f2..2b61f3b 100644 --- a/hypervideo_dl/extractor/eroprofile.py +++ b/hypervideo_dl/extractor/eroprofile.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/ertgr.py b/hypervideo_dl/extractor/ertgr.py index 19ce23f..9ecdf5d 100644 --- a/hypervideo_dl/extractor/ertgr.py +++ b/hypervideo_dl/extractor/ertgr.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -18,7 +15,6 @@ from ..utils import ( parse_iso8601, str_or_none, try_get, - unescapeHTML, url_or_none, variadic, ) @@ -77,7 +73,7 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): }, ] - def _extract_formats_and_subs(self, video_id, allow_none=True): + def _extract_formats_and_subs(self, video_id): media_info = self._call_api(video_id, codename=video_id) formats, subs = [], {} for media_file in try_get(media_info, lambda x: x['MediaFiles'], list) or []: @@ -101,8 +97,6 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): formats.extend(formats_) self._merge_subtitles(subs_, target=subs) - if 
formats or not allow_none: - self._sort_formats(formats) return formats, subs def _real_extract(self, url): @@ -122,7 +116,7 @@ class ERTFlixCodenameIE(ERTFlixBaseIE): class ERTFlixIE(ERTFlixBaseIE): IE_NAME = 'ertflix' IE_DESC = 'ERTFLIX videos' - _VALID_URL = r'https?://www\.ertflix\.gr/(?:series|vod)/(?P<id>[a-z]{3}\.\d+)' + _VALID_URL = r'https?://www\.ertflix\.gr/(?:[^/]+/)?(?:series|vod)/(?P<id>[a-z]{3}\.\d+)' _TESTS = [{ 'url': 'https://www.ertflix.gr/vod/vod.173258-aoratoi-ergates', 'md5': '6479d5e60fd7e520b07ba5411dcdd6e7', @@ -174,6 +168,9 @@ class ERTFlixIE(ERTFlixBaseIE): 'title': 'Το δίκτυο', }, 'playlist_mincount': 9, + }, { + 'url': 'https://www.ertflix.gr/en/vod/vod.127652-ta-kalytera-mas-chronia-ep1-mia-volta-sto-feggari', + 'only_matching': True, }] def _extract_episode(self, episode): @@ -275,6 +272,7 @@ class ERTWebtvEmbedIE(InfoExtractor): IE_DESC = 'ert.gr webtv embedded videos' _BASE_PLAYER_URL_RE = re.escape('//www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php') _VALID_URL = rf'https?:{_BASE_PLAYER_URL_RE}\?([^#]+&)?f=(?P<id>[^#&]+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>(?:https?:)?{_BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+)(?P=_q1)'] _TESTS = [{ 'url': 'https://www.ert.gr/webtv/live-uni/vod/dt-uni-vod.php?f=trailers/E2251_TO_DIKTYO_E09_16-01_1900.mp4&bgimg=/photos/2022/1/to_diktio_ep09_i_istoria_tou_diadiktiou_stin_Ellada_1021x576.jpg', @@ -287,23 +285,11 @@ class ERTWebtvEmbedIE(InfoExtractor): }, }] - @classmethod - def _extract_urls(cls, webpage): - EMBED_URL_RE = rf'(?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=_q1)).)+' - EMBED_RE = rf'<iframe[^>]+?src=(?P<_q1>["\'])(?P<url>{EMBED_URL_RE})(?P=_q1)' - - for mobj in re.finditer(EMBED_RE, webpage): - url = unescapeHTML(mobj.group('url')) - if not cls.suitable(url): - continue - yield url - def _real_extract(self, url): video_id = self._match_id(url) formats, subs = self._extract_m3u8_formats_and_subtitles( f'https://mediastream.ert.gr/vodedge/_definst_/mp4:dvrorigin/{video_id}/playlist.m3u8', video_id, 'mp4') - self._sort_formats(formats) thumbnail_id = parse_qs(url).get('bgimg', [None])[0] if thumbnail_id and not thumbnail_id.startswith('http'): thumbnail_id = f'https://program.ert.gr{thumbnail_id}' diff --git a/hypervideo_dl/extractor/escapist.py b/hypervideo_dl/extractor/escapist.py index 4cd815e..85a1cbf 100644 --- a/hypervideo_dl/extractor/escapist.py +++ b/hypervideo_dl/extractor/escapist.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -97,7 +95,6 @@ class EscapistIE(InfoExtractor): 'format_id': '%s-%sp' % (determine_ext(video['src']), video['res']), 'height': int_or_none(video.get('res')), } for video in data['files']['videos']] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/espn.py b/hypervideo_dl/extractor/espn.py index dc50f3b..f4b0134 100644 --- a/hypervideo_dl/extractor/espn.py +++ b/hypervideo_dl/extractor/espn.py @@ -1,14 +1,16 @@ -from __future__ import unicode_literals - +import base64 +import json import re +import urllib.parse +from .adobepass import AdobePassIE from .common import InfoExtractor from .once import OnceIE -from ..compat import compat_str from ..utils import ( determine_ext, dict_get, int_or_none, + traverse_obj, unified_strdate, unified_timestamp, ) @@ -26,7 +28,6 @@ class ESPNIE(OnceIE): (?: (?: video/(?:clip|iframe/twitter)| - watch/player ) (?: .*?\?.*?\bid=| @@ -49,6 +50,8 @@ class ESPNIE(OnceIE): 'description': 
'md5:39370c2e016cb4ecf498ffe75bef7f0f', 'timestamp': 1390936111, 'upload_date': '20140128', + 'duration': 1302, + 'thumbnail': r're:https://.+\.jpg', }, 'params': { 'skip_download': True, @@ -74,15 +77,6 @@ class ESPNIE(OnceIE): 'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774', 'only_matching': True, }, { - 'url': 'http://www.espn.com/watch/player?id=19141491', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', - 'only_matching': True, - }, { - 'url': 'http://www.espn.com/watch/player/_/id/19141491', - 'only_matching': True, - }, { 'url': 'http://www.espn.com/video/clip?id=10365079', 'only_matching': True, }, { @@ -100,7 +94,13 @@ class ESPNIE(OnceIE): }, { 'url': 'http://www.espn.com/espnw/video/26066627/arkansas-gibson-completes-hr-cycle-four-innings', 'only_matching': True, - }] + }, { + 'url': 'http://www.espn.com/watch/player?id=19141491', + 'only_matching': True, + }, { + 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875', + 'only_matching': True, + }, ] def _real_extract(self, url): video_id = self._match_id(url) @@ -118,7 +118,7 @@ class ESPNIE(OnceIE): for source_id, source in source.items(): if source_id == 'alert': continue - elif isinstance(source, compat_str): + elif isinstance(source, str): extract_source(source, base_source_id) elif isinstance(source, dict): traverse_source( @@ -162,7 +162,6 @@ class ESPNIE(OnceIE): links = clip.get('links', {}) traverse_source(links.get('source', {})) traverse_source(links.get('mobile', {})) - self._sort_formats(formats) description = clip.get('caption') or clip.get('description') thumbnail = clip.get('thumbnail') @@ -198,7 +197,7 @@ class ESPNArticleIE(InfoExtractor): @classmethod def suitable(cls, url): - return False if ESPNIE.suitable(url) else super(ESPNArticleIE, cls).suitable(url) + return False if (ESPNIE.suitable(url) or WatchESPNIE.suitable(url)) else super().suitable(url) def _real_extract(self, url): video_id = self._match_id(url) @@ -269,7 +268,6 @@ class ESPNCricInfoIE(InfoExtractor): 'url': item['url'], 'vcodec': 'none', }) - self._sort_formats(formats) return { 'id': id, 'title': data_json.get('title'), @@ -279,3 +277,134 @@ class ESPNCricInfoIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class WatchESPNIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?espn\.com/(?:watch|espnplus)/player/_/id/(?P<id>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})' + _TESTS = [{ + 'url': 'https://www.espn.com/watch/player/_/id/dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'info_dict': { + 'id': 'dbbc6b1d-c084-4b47-9878-5f13c56ce309', + 'ext': 'mp4', + 'title': 'Huddersfield vs. Burnley', + 'duration': 7500, + 'thumbnail': 'https://artwork.api.espn.com/artwork/collections/media/dbbc6b1d-c084-4b47-9878-5f13c56ce309/default?width=640&apikey=1ngjw23osgcis1i1vbj96lmfqs', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.espn.com/watch/player/_/id/a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'info_dict': { + 'id': 'a049a56e-a7ce-477e-aef3-c7e48ef8221c', + 'ext': 'mp4', + 'title': 'Dynamo Dresden vs. 
VfB Stuttgart (Round #1) (German Cup)', + 'duration': 8335, + 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/bd1f3d12-0654-47d9-852e-71b85ea695c7/16x9.jpg?timestamp=202201112217&showBadge=true&cb=12&package=ESPN_PLUS', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.espn.com/espnplus/player/_/id/317f5fd1-c78a-4ebe-824a-129e0d348421', + 'info_dict': { + 'id': '317f5fd1-c78a-4ebe-824a-129e0d348421', + 'ext': 'mp4', + 'title': 'The Wheel - Episode 10', + 'duration': 3352, + 'thumbnail': 'https://s.secure.espncdn.com/stitcher/artwork/collections/media/317f5fd1-c78a-4ebe-824a-129e0d348421/16x9.jpg?timestamp=202205031523&showBadge=true&cb=12&package=ESPN_PLUS', + }, + 'params': { + 'skip_download': True, + }, + }] + + _API_KEY = 'ZXNwbiZicm93c2VyJjEuMC4w.ptUt7QxsteaRruuPmGZFaJByOoqKvDP2a5YkInHrc7c' + + def _call_bamgrid_api(self, path, video_id, payload=None, headers={}): + if 'Authorization' not in headers: + headers['Authorization'] = f'Bearer {self._API_KEY}' + parse = urllib.parse.urlencode if path == 'token' else json.dumps + return self._download_json( + f'https://espn.api.edge.bamgrid.com/{path}', video_id, headers=headers, data=parse(payload).encode()) + + def _real_extract(self, url): + video_id = self._match_id(url) + cdn_data = self._download_json( + f'https://watch-cdn.product.api.espn.com/api/product/v3/watchespn/web/playback/event?id={video_id}', + video_id) + video_data = cdn_data['playbackState'] + + # ESPN+ subscription required, through cookies + if 'DTC' in video_data.get('sourceId'): + cookie = self._get_cookies(url).get('ESPN-ONESITE.WEB-PROD.token') + if not cookie: + self.raise_login_required(method='cookies') + + assertion = self._call_bamgrid_api( + 'devices', video_id, + headers={'Content-Type': 'application/json; charset=UTF-8'}, + payload={ + 'deviceFamily': 'android', + 'applicationRuntime': 'android', + 'deviceProfile': 'tv', + 'attributes': {}, + })['assertion'] + token = self._call_bamgrid_api( + 'token', video_id, payload={ + 'subject_token': assertion, + 'subject_token_type': 'urn:bamtech:params:oauth:token-type:device', + 'platform': 'android', + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange' + })['access_token'] + + assertion = self._call_bamgrid_api( + 'accounts/grant', video_id, payload={'id_token': cookie.value.split('|')[1]}, + headers={ + 'Authorization': token, + 'Content-Type': 'application/json; charset=UTF-8' + })['assertion'] + token = self._call_bamgrid_api( + 'token', video_id, payload={ + 'subject_token': assertion, + 'subject_token_type': 'urn:bamtech:params:oauth:token-type:account', + 'platform': 'android', + 'grant_type': 'urn:ietf:params:oauth:grant-type:token-exchange' + })['access_token'] + + playback = self._download_json( + video_data['videoHref'].format(scenario='browser~ssai'), video_id, + headers={ + 'Accept': 'application/vnd.media-service+json; version=5', + 'Authorization': token + }) + m3u8_url, headers = playback['stream']['complete'][0]['url'], {'authorization': token} + + # No login required + elif video_data.get('sourceId') == 'ESPN_FREE': + asset = self._download_json( + f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb', + video_id) + m3u8_url, headers = asset['stream'], {} + + # TV Provider required + else: + resource = self._get_mvpd_resource('ESPN', video_data['name'], video_id, None) + auth = self._extract_mvpd_auth(url, video_id, 'ESPN', resource).encode() + + asset = self._download_json( + 
f'https://watch.auth.api.espn.com/video/auth/media/{video_id}/asset?apikey=uiqlbgzdwuru14v627vdusswb', + video_id, data=f'adobeToken={urllib.parse.quote_plus(base64.b64encode(auth))}&drmSupport=HLS'.encode()) + m3u8_url, headers = asset['stream'], {} + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'duration': traverse_obj(cdn_data, ('tracking', 'duration')), + 'title': video_data.get('name'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': video_data.get('posterHref'), + 'http_headers': headers, + } diff --git a/hypervideo_dl/extractor/esri.py b/hypervideo_dl/extractor/esri.py index e9dcaeb..02e7efa 100644 --- a/hypervideo_dl/extractor/esri.py +++ b/hypervideo_dl/extractor/esri.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -46,7 +43,6 @@ class EsriVideoIE(InfoExtractor): 'height': int(height), 'filesize_approx': parse_filesize(filesize), }) - self._sort_formats(formats) title = self._html_search_meta('title', webpage, 'title') description = self._html_search_meta( diff --git a/hypervideo_dl/extractor/europa.py b/hypervideo_dl/extractor/europa.py index 60ab2ce..c2b4937 100644 --- a/hypervideo_dl/extractor/europa.py +++ b/hypervideo_dl/extractor/europa.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -79,7 +76,6 @@ class EuropaIE(InfoExtractor): 'format_note': xpath_text(file_, './lglabel'), 'language_preference': language_preference(lang) }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/europeantour.py b/hypervideo_dl/extractor/europeantour.py index e28f067..1995a74 100644 --- a/hypervideo_dl/extractor/europeantour.py +++ b/hypervideo_dl/extractor/europeantour.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/eurosport.py b/hypervideo_dl/extractor/eurosport.py new file mode 100644 index 0000000..654e112 --- /dev/null +++ b/hypervideo_dl/extractor/eurosport.py @@ -0,0 +1,97 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class EurosportIE(InfoExtractor): + _VALID_URL = r'https?://www\.eurosport\.com/\w+/[\w-]+/\d+/[\w-]+_(?P<id>vid\d+)' + _TESTS = [{ + 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/highlights-rafael-nadal-brushes-aside-caper-ruud-to-win-record-extending-14th-french-open-title_vid1694147/video.shtml', + 'info_dict': { + 'id': '2480939', + 'ext': 'mp4', + 'title': 'Highlights: Rafael Nadal brushes aside Caper Ruud to win record-extending 14th French Open title', + 'description': 'md5:b564db73ecfe4b14ebbd8e62a3692c76', + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/06/05/3388285-69245968-2560-1440.png', + 'duration': 195.0, + 'display_id': 'vid1694147', + 'timestamp': 1654446698, + 'upload_date': '20220605', + } + }, { + 'url': 'https://www.eurosport.com/tennis/roland-garros/2022/watch-the-top-five-shots-from-men-s-final-as-rafael-nadal-beats-casper-ruud-to-seal-14th-french-open_vid1694283/video.shtml', + 'info_dict': { + 'id': '2481254', + 'ext': 'mp4', + 'title': 'md5:149dcc5dfb38ab7352acc008cc9fb071', + 'duration': 130.0, + 'thumbnail': 
'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/06/05/3388422-69248708-2560-1440.png', + 'description': 'md5:a0c8a7f6b285e48ae8ddbe7aa85cfee6', + 'display_id': 'vid1694283', + 'timestamp': 1654456090, + 'upload_date': '20220605', + } + }, { + # geo-fenced, but can be bypassed by xff + 'url': 'https://www.eurosport.com/cycling/tour-de-france-femmes/2022/incredible-ride-marlen-reusser-storms-to-stage-4-win-at-tour-de-france-femmes_vid1722221/video.shtml', + 'info_dict': { + 'id': '2582552', + 'ext': 'mp4', + 'title': '‘Incredible ride!’ - Marlen Reusser storms to Stage 4 win at Tour de France Femmes', + 'duration': 188.0, + 'display_id': 'vid1722221', + 'timestamp': 1658936167, + 'thumbnail': 'https://imgresizer.eurosport.com/unsafe/1280x960/smart/filters:format(jpeg)/origin-imgresizer.eurosport.com/2022/07/27/3423347-69852108-2560-1440.jpg', + 'description': 'md5:32bbe3a773ac132c57fb1e8cca4b7c71', + 'upload_date': '20220727', + } + }] + + _TOKEN = None + + # actually defined in https://netsport.eurosport.io/?variables={"databaseId":<databaseId>,"playoutType":"VDP"}&extensions={"persistedQuery":{"version":1 .. + # but this method requires getting a sha256 hash + _GEO_COUNTRIES = ['DE', 'NL', 'EU', 'IT', 'FR'] # Not a complete list, but it should work + + def _real_initialize(self): + if EurosportIE._TOKEN is None: + EurosportIE._TOKEN = self._download_json( + 'https://eu3-prod-direct.eurosport.com/token?realm=eurosport', None, + 'Trying to get token')['data']['attributes']['token'] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_data = self._download_json( + f'https://eu3-prod-direct.eurosport.com/playback/v2/videoPlaybackInfo/sourceSystemId/eurosport-{display_id}', + display_id, query={'usePreAuth': True}, headers={'Authorization': f'Bearer {EurosportIE._TOKEN}'})['data'] + + json_ld_data = self._search_json_ld(webpage, display_id) + + formats, subtitles = [], {} + for stream_type in json_data['attributes']['streaming']: + if stream_type == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id, ext='mp4') + elif stream_type == 'dash': + fmts, subs = self._extract_mpd_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id) + elif stream_type == 'mss': + fmts, subs = self._extract_ism_formats_and_subtitles( + traverse_obj(json_data, ('attributes', 'streaming', stream_type, 'url')), display_id) + + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': json_data['id'], + 'title': json_ld_data.get('title') or self._og_search_title(webpage), + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': json_ld_data.get('thumbnails'), + 'description': (json_ld_data.get('description') + or self._html_search_meta(['og:description', 'description'], webpage)), + 'duration': json_ld_data.get('duration'), + 'timestamp': json_ld_data.get('timestamp'), + } diff --git a/hypervideo_dl/extractor/euscreen.py b/hypervideo_dl/extractor/euscreen.py index 2759e74..65a1dc7 100644 --- a/hypervideo_dl/extractor/euscreen.py +++ b/hypervideo_dl/extractor/euscreen.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( @@ -48,7 +45,6 @@ class EUScreenIE(InfoExtractor): formats = [{ 'url':
source['src'], } for source in video_json.get('sources', [])] - self._sort_formats(formats) return { 'id': id, diff --git a/hypervideo_dl/extractor/everyonesmixtape.py b/hypervideo_dl/extractor/everyonesmixtape.py deleted file mode 100644 index 80cb032..0000000 --- a/hypervideo_dl/extractor/everyonesmixtape.py +++ /dev/null @@ -1,76 +0,0 @@ -from __future__ import unicode_literals - - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - sanitized_Request, -) - - -class EveryonesMixtapeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?everyonesmixtape\.com/#/mix/(?P<id>[0-9a-zA-Z]+)(?:/(?P<songnr>[0-9]))?$' - - _TESTS = [{ - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi/5', - 'info_dict': { - 'id': '5bfseWNmlds', - 'ext': 'mp4', - 'title': "Passion Pit - \"Sleepyhead\" (Official Music Video)", - 'uploader': 'FKR.TV', - 'uploader_id': 'frenchkissrecords', - 'description': "Music video for \"Sleepyhead\" from Passion Pit's debut EP Chunk Of Change.\nBuy on iTunes: https://itunes.apple.com/us/album/chunk-of-change-ep/id300087641\n\nDirected by The Wilderness.\n\nhttp://www.passionpitmusic.com\nhttp://www.frenchkissrecords.com", - 'upload_date': '20081015' - }, - 'params': { - 'skip_download': True, # This is simply YouTube - } - }, { - 'url': 'http://everyonesmixtape.com/#/mix/m7m0jJAbMQi', - 'info_dict': { - 'id': 'm7m0jJAbMQi', - 'title': 'Driving', - }, - 'playlist_count': 24 - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - playlist_id = mobj.group('id') - - pllist_url = 'http://everyonesmixtape.com/mixtape.php?a=getMixes&u=-1&linked=%s&explore=' % playlist_id - pllist_req = sanitized_Request(pllist_url) - pllist_req.add_header('X-Requested-With', 'XMLHttpRequest') - - playlist_list = self._download_json( - pllist_req, playlist_id, note='Downloading playlist metadata') - try: - playlist_no = next(playlist['id'] - for playlist in playlist_list - if playlist['code'] == playlist_id) - except StopIteration: - raise ExtractorError('Playlist id not found') - - pl_url = 'http://everyonesmixtape.com/mixtape.php?a=getMix&id=%s&userId=null&code=' % playlist_no - pl_req = sanitized_Request(pl_url) - pl_req.add_header('X-Requested-With', 'XMLHttpRequest') - playlist = self._download_json( - pl_req, playlist_id, note='Downloading playlist info') - - entries = [{ - '_type': 'url', - 'url': t['url'], - 'title': t['title'], - } for t in playlist['tracks']] - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - return entries[songnr] - - playlist_title = playlist['mixData']['name'] - return { - '_type': 'playlist', - 'id': playlist_id, - 'title': playlist_title, - 'entries': entries, - } diff --git a/hypervideo_dl/extractor/expotv.py b/hypervideo_dl/extractor/expotv.py index 95a8977..bda6e3c 100644 --- a/hypervideo_dl/extractor/expotv.py +++ b/hypervideo_dl/extractor/expotv.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -51,7 +49,6 @@ class ExpoTVIE(InfoExtractor): r'filename=.*\.([a-z0-9_A-Z]+)&', media_url, 'file extension', default=None) or fcfg.get('type'), }) - self._sort_formats(formats) title = self._og_search_title(webpage) description = self._og_search_description(webpage) diff --git a/hypervideo_dl/extractor/expressen.py b/hypervideo_dl/extractor/expressen.py index dc8b855..86967b6 100644 --- a/hypervideo_dl/extractor/expressen.py +++ b/hypervideo_dl/extractor/expressen.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -20,11 +15,13 @@ class ExpressenIE(InfoExtractor): tv/(?:[^/]+/)* (?P<id>[^/?#&]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1'] _TESTS = [{ 'url': 'https://www.expressen.se/tv/ledare/ledarsnack/ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden/', - 'md5': '2fbbe3ca14392a6b1b36941858d33a45', + 'md5': 'deb2ca62e7b1dcd19fa18ba37523f66e', 'info_dict': { - 'id': '8690962', + 'id': 'ba90f5a9-78d1-4511-aa02-c177b9c99136', + 'display_id': 'ledarsnack-om-arbetslosheten-bland-kvinnor-i-speciellt-utsatta-omraden', 'ext': 'mp4', 'title': 'Ledarsnack: Om arbetslösheten bland kvinnor i speciellt utsatta områden', 'description': 'md5:f38c81ff69f3de4d269bbda012fcbbba', @@ -47,13 +44,6 @@ class ExpressenIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?(?:expressen|di)\.se/(?:tvspelare/video|videoplayer/embed)/tv/.+?)\1', - webpage)] - def _real_extract(self, url): display_id = self._match_id(url) @@ -67,7 +57,7 @@ class ExpressenIE(InfoExtractor): display_id, transform_source=unescapeHTML) info = extract_data('video-tracking-info') - video_id = info['videoId'] + video_id = info['contentId'] data = extract_data('article-data') stream = data['stream'] @@ -80,7 +70,6 @@ class ExpressenIE(InfoExtractor): formats = [{ 'url': stream, }] - self._sort_formats(formats) title = info.get('titleRaw') or data['title'] description = info.get('descriptionRaw') diff --git a/hypervideo_dl/extractor/extractors.py b/hypervideo_dl/extractor/extractors.py index 457f4c2..610e02f 100644 --- a/hypervideo_dl/extractor/extractors.py +++ b/hypervideo_dl/extractor/extractors.py @@ -1,2144 +1,26 @@ -# flake8: noqa -from __future__ import unicode_literals +import contextlib +import os -from .abc import ( - ABCIE, - ABCIViewIE, - ABCIViewShowSeriesIE, -) -from .abcnews import ( - AbcNewsIE, - AbcNewsVideoIE, -) -from .abcotvs import ( - ABCOTVSIE, - ABCOTVSClipsIE, -) -from .abematv import ( - AbemaTVIE, - AbemaTVTitleIE, -) -from .academicearth import AcademicEarthCourseIE -from .acast import ( - ACastIE, - ACastChannelIE, -) -from .adn import ADNIE -from .adobeconnect import AdobeConnectIE -from .adobetv import ( - AdobeTVEmbedIE, - AdobeTVIE, - AdobeTVShowIE, - AdobeTVChannelIE, - AdobeTVVideoIE, -) -from .adultswim import AdultSwimIE -from .aenetworks import ( - AENetworksIE, - AENetworksCollectionIE, - AENetworksShowIE, - HistoryTopicIE, - HistoryPlayerIE, - BiographyIE, -) -from .afreecatv import ( - AfreecaTVIE, - AfreecaTVLiveIE, -) -from .airmozilla import AirMozillaIE -from .aljazeera import AlJazeeraIE -from .alphaporno import AlphaPornoIE -from .amara import AmaraIE -from .alura import ( - AluraIE, - AluraCourseIE -) -from .amcnetworks import AMCNetworksIE -from .animelab import ( - AnimeLabIE, - AnimeLabShowsIE, -) -from .amazon import AmazonStoreIE -from .americastestkitchen import ( - AmericasTestKitchenIE, - AmericasTestKitchenSeasonIE, -) -from .animeondemand import AnimeOnDemandIE -from .anvato import AnvatoIE -from .aol import AolIE -from .allocine import AllocineIE -from .aliexpress import AliExpressLiveIE -from .alsace20tv import ( - Alsace20TVIE, - Alsace20TVEmbedIE, -) -from .apa import APAIE -from .aparat import AparatIE -from 
.appleconnect import AppleConnectIE -from .appletrailers import ( - AppleTrailersIE, - AppleTrailersSectionIE, -) -from .applepodcasts import ApplePodcastsIE -from .archiveorg import ( - ArchiveOrgIE, - YoutubeWebArchiveIE, -) -from .arcpublishing import ArcPublishingIE -from .arkena import ArkenaIE -from .ard import ( - ARDBetaMediathekIE, - ARDIE, - ARDMediathekIE, -) -from .arte import ( - ArteTVIE, - ArteTVEmbedIE, - ArteTVPlaylistIE, - ArteTVCategoryIE, -) -from .arnes import ArnesIE -from .asiancrush import ( - AsianCrushIE, - AsianCrushPlaylistIE, -) -from .atresplayer import AtresPlayerIE -from .atttechchannel import ATTTechChannelIE -from .atvat import ATVAtIE -from .audimedia import AudiMediaIE -from .audioboom import AudioBoomIE -from .audiomack import AudiomackIE, AudiomackAlbumIE -from .audius import ( - AudiusIE, - AudiusTrackIE, - AudiusPlaylistIE, - AudiusProfileIE, -) -from .awaan import ( - AWAANIE, - AWAANVideoIE, - AWAANLiveIE, - AWAANSeasonIE, -) -from .azmedien import AZMedienIE -from .baidu import BaiduVideoIE -from .banbye import ( - BanByeIE, - BanByeChannelIE, -) -from .bandaichannel import BandaiChannelIE -from .bandcamp import ( - BandcampIE, - BandcampAlbumIE, - BandcampWeeklyIE, - BandcampUserIE, -) -from .bannedvideo import BannedVideoIE -from .bbc import ( - BBCCoUkIE, - BBCCoUkArticleIE, - BBCCoUkIPlayerEpisodesIE, - BBCCoUkIPlayerGroupIE, - BBCCoUkPlaylistIE, - BBCIE, -) -from .beeg import BeegIE -from .behindkink import BehindKinkIE -from .bellmedia import BellMediaIE -from .beatport import BeatportIE -from .bet import BetIE -from .bfi import BFIPlayerIE -from .bfmtv import ( - BFMTVIE, - BFMTVLiveIE, - BFMTVArticleIE, -) -from .bibeltv import BibelTVIE -from .bigflix import BigflixIE -from .bigo import BigoIE -from .bild import BildIE -from .bilibili import ( - BiliBiliIE, - BiliBiliSearchIE, - BilibiliCategoryIE, - BiliBiliBangumiIE, - BilibiliAudioIE, - BilibiliAudioAlbumIE, - BiliBiliPlayerIE, - BilibiliChannelIE, - BiliIntlIE, - BiliIntlSeriesIE, -) -from .biobiochiletv import BioBioChileTVIE -from .bitchute import ( - BitChuteIE, - BitChuteChannelIE, -) -from .bitwave import ( - BitwaveReplayIE, - BitwaveStreamIE, -) -from .biqle import BIQLEIE -from .blackboardcollaborate import BlackboardCollaborateIE -from .bleacherreport import ( - BleacherReportIE, - BleacherReportCMSIE, -) -from .blogger import BloggerIE -from .bloomberg import BloombergIE -from .bokecc import BokeCCIE -from .bongacams import BongaCamsIE -from .bostonglobe import BostonGlobeIE -from .box import BoxIE -from .bpb import BpbIE -from .br import ( - BRIE, - BRMediathekIE, -) -from .bravotv import BravoTVIE -from .breakcom import BreakIE -from .breitbart import BreitBartIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .businessinsider import BusinessInsiderIE -from .buzzfeed import BuzzFeedIE -from .byutv import BYUtvIE -from .c56 import C56IE -from .cableav import CableAVIE -from .callin import CallinIE -from .caltrans import CaltransIE -from .cam4 import CAM4IE -from .camdemy import ( - CamdemyIE, - CamdemyFolderIE -) -from .cammodels import CamModelsIE -from .camwithher import CamWithHerIE -from .canalalpha import CanalAlphaIE -from .canalplus import CanalplusIE -from .canalc2 import Canalc2IE -from .canvas import ( - CanvasIE, - CanvasEenIE, - VrtNUIE, - DagelijkseKostIE, -) -from .carambatv import ( - CarambaTVIE, - CarambaTVPageIE, -) -from .cartoonnetwork import CartoonNetworkIE -from .cbc import ( - CBCIE, - CBCPlayerIE, - CBCGemIE, - 
CBCGemPlaylistIE, - CBCGemLiveIE, -) -from .cbs import CBSIE -from .cbslocal import ( - CBSLocalIE, - CBSLocalArticleIE, -) -from .cbsinteractive import CBSInteractiveIE -from .cbsnews import ( - CBSNewsEmbedIE, - CBSNewsIE, - CBSNewsLiveVideoIE, -) -from .cbssports import ( - CBSSportsEmbedIE, - CBSSportsIE, - TwentyFourSevenSportsIE, -) -from .ccc import ( - CCCIE, - CCCPlaylistIE, -) -from .ccma import CCMAIE -from .cctv import CCTVIE -from .cda import CDAIE -from .ceskatelevize import CeskaTelevizeIE -from .cgtn import CGTNIE -from .channel9 import Channel9IE -from .charlierose import CharlieRoseIE -from .chaturbate import ChaturbateIE -from .chilloutzone import ChilloutzoneIE -from .chingari import ( - ChingariIE, - ChingariUserIE, -) -from .chirbit import ( - ChirbitIE, - ChirbitProfileIE, -) -from .cinchcast import CinchcastIE -from .cinemax import CinemaxIE -from .ciscolive import ( - CiscoLiveSessionIE, - CiscoLiveSearchIE, -) -from .ciscowebex import CiscoWebexIE -from .cjsw import CJSWIE -from .cliphunter import CliphunterIE -from .clippit import ClippitIE -from .cliprs import ClipRsIE -from .clipsyndicate import ClipsyndicateIE -from .closertotruth import CloserToTruthIE -from .cloudflarestream import CloudflareStreamIE -from .cloudy import CloudyIE -from .clubic import ClubicIE -from .clyp import ClypIE -from .cmt import CMTIE -from .cnbc import ( - CNBCIE, - CNBCVideoIE, -) -from .cnn import ( - CNNIE, - CNNBlogsIE, - CNNArticleIE, -) -from .coub import CoubIE -from .comedycentral import ( - ComedyCentralIE, - ComedyCentralTVIE, -) -from .commonmistakes import CommonMistakesIE, UnicodeBOMIE -from .commonprotocols import ( - MmsIE, - RtmpIE, - ViewSourceIE, -) -from .condenast import CondeNastIE -from .contv import CONtvIE -from .corus import CorusIE -from .cpac import ( - CPACIE, - CPACPlaylistIE, -) -from .cozytv import CozyTVIE -from .cracked import CrackedIE -from .crackle import CrackleIE -from .craftsy import CraftsyIE -from .crooksandliars import CrooksAndLiarsIE -from .crowdbunker import ( - CrowdBunkerIE, - CrowdBunkerChannelIE, -) -from .crunchyroll import ( - CrunchyrollIE, - CrunchyrollShowPlaylistIE, - CrunchyrollBetaIE, - CrunchyrollBetaShowIE, -) -from .cspan import CSpanIE, CSpanCongressIE -from .ctsnews import CtsNewsIE -from .ctv import CTVIE -from .ctvnews import CTVNewsIE -from .cultureunplugged import CultureUnpluggedIE -from .curiositystream import ( - CuriosityStreamIE, - CuriosityStreamCollectionsIE, - CuriosityStreamSeriesIE, -) -from .cwtv import CWTVIE -from .cybrary import ( - CybraryIE, - CybraryCourseIE -) -from .daftsex import DaftsexIE -from .dailymail import DailyMailIE -from .dailymotion import ( - DailymotionIE, - DailymotionPlaylistIE, - DailymotionUserIE, -) -from .damtomo import ( - DamtomoRecordIE, - DamtomoVideoIE, -) -from .daum import ( - DaumIE, - DaumClipIE, - DaumPlaylistIE, - DaumUserIE, -) -from .daystar import DaystarClipIE -from .dbtv import DBTVIE -from .dctp import DctpTvIE -from .deezer import ( - DeezerPlaylistIE, - DeezerAlbumIE, -) -from .democracynow import DemocracynowIE -from .dfb import DFBIE -from .dhm import DHMIE -from .digg import DiggIE -from .dotsub import DotsubIE -from .douyutv import ( - DouyuShowIE, - DouyuTVIE, -) -from .dplay import ( - DPlayIE, - DiscoveryPlusIE, - HGTVDeIE, - GoDiscoveryIE, - TravelChannelIE, - CookingChannelIE, - HGTVUsaIE, - FoodNetworkIE, - InvestigationDiscoveryIE, - DestinationAmericaIE, - AmHistoryChannelIE, - ScienceChannelIE, - DIYNetworkIE, - DiscoveryLifeIE, - AnimalPlanetIE, - 
TLCIE, - DiscoveryPlusIndiaIE, - DiscoveryNetworksDeIE, - DiscoveryPlusItalyIE, - DiscoveryPlusItalyShowIE, - DiscoveryPlusIndiaShowIE, -) -from .dreisat import DreiSatIE -from .drbonanza import DRBonanzaIE -from .drtuber import DrTuberIE -from .drtv import ( - DRTVIE, - DRTVLiveIE, -) -from .dtube import DTubeIE -from .dvtv import DVTVIE -from .duboku import ( - DubokuIE, - DubokuPlaylistIE -) -from .dumpert import DumpertIE -from .defense import DefenseGouvFrIE -from .digitalconcerthall import DigitalConcertHallIE -from .discovery import DiscoveryIE -from .disney import DisneyIE -from .dispeak import DigitallySpeakingIE -from .doodstream import DoodStreamIE -from .dropbox import DropboxIE -from .dropout import ( - DropoutSeasonIE, - DropoutIE -) -from .dw import ( - DWIE, - DWArticleIE, -) -from .eagleplatform import EaglePlatformIE -from .ebaumsworld import EbaumsWorldIE -from .echomsk import EchoMskIE -from .egghead import ( - EggheadCourseIE, - EggheadLessonIE, -) -from .ehow import EHowIE -from .eighttracks import EightTracksIE -from .einthusan import EinthusanIE -from .eitb import EitbIE -from .ellentube import ( - EllenTubeIE, - EllenTubeVideoIE, - EllenTubePlaylistIE, -) -from .elonet import ElonetIE -from .elpais import ElPaisIE -from .embedly import EmbedlyIE -from .engadget import EngadgetIE -from .epicon import ( - EpiconIE, - EpiconSeriesIE, -) -from .eporner import EpornerIE -from .eroprofile import ( - EroProfileIE, - EroProfileAlbumIE, -) -from .ertgr import ( - ERTFlixCodenameIE, - ERTFlixIE, - ERTWebtvEmbedIE, -) -from .escapist import EscapistIE -from .espn import ( - ESPNIE, - ESPNArticleIE, - FiveThirtyEightIE, - ESPNCricInfoIE, -) -from .esri import EsriVideoIE -from .europa import EuropaIE -from .europeantour import EuropeanTourIE -from .euscreen import EUScreenIE -from .expotv import ExpoTVIE -from .expressen import ExpressenIE -from .extremetube import ExtremeTubeIE -from .eyedotv import EyedoTVIE -from .facebook import ( - FacebookIE, - FacebookPluginsVideoIE, - FacebookRedirectURLIE, -) -from .fancode import ( - FancodeVodIE, - FancodeLiveIE -) +from ..utils import load_plugins -from .faz import FazIE -from .fc2 import ( - FC2IE, - FC2EmbedIE, - FC2LiveIE, -) -from .fczenit import FczenitIE -from .filmmodu import FilmmoduIE -from .filmon import ( - FilmOnIE, - FilmOnChannelIE, -) -from .filmweb import FilmwebIE -from .firsttv import FirstTVIE -from .fivetv import FiveTVIE -from .flickr import FlickrIE -from .folketinget import FolketingetIE -from .footyroom import FootyRoomIE -from .formula1 import Formula1IE -from .fourtube import ( - FourTubeIE, - PornTubeIE, - PornerBrosIE, - FuxIE, -) -from .fox import FOXIE -from .fox9 import ( - FOX9IE, - FOX9NewsIE, -) -from .foxgay import FoxgayIE -from .foxnews import ( - FoxNewsIE, - FoxNewsArticleIE, -) -from .foxsports import FoxSportsIE -from .fptplay import FptplayIE -from .franceculture import FranceCultureIE -from .franceinter import FranceInterIE -from .francetv import ( - FranceTVIE, - FranceTVSiteIE, - FranceTVInfoIE, -) -from .freesound import FreesoundIE -from .freespeech import FreespeechIE -from .frontendmasters import ( - FrontendMastersIE, - FrontendMastersLessonIE, - FrontendMastersCourseIE -) -from .fujitv import FujiTVFODPlus7IE -from .funimation import ( - FunimationIE, - FunimationPageIE, - FunimationShowIE, -) -from .funk import FunkIE -from .fusion import FusionIE -from .gab import ( - GabTVIE, - GabIE, -) -from .gaia import GaiaIE -from .gameinformer import GameInformerIE -from .gamejolt import ( 
- GameJoltIE, - GameJoltUserIE, - GameJoltGameIE, - GameJoltGameSoundtrackIE, - GameJoltCommunityIE, - GameJoltSearchIE, -) -from .gamespot import GameSpotIE -from .gamestar import GameStarIE -from .gaskrank import GaskrankIE -from .gazeta import GazetaIE -from .gdcvault import GDCVaultIE -from .gedidigital import GediDigitalIE -from .generic import GenericIE -from .gettr import ( - GettrIE, - GettrStreamingIE, -) -from .gfycat import GfycatIE -from .giantbomb import GiantBombIE -from .giga import GigaIE -from .glide import GlideIE -from .globo import ( - GloboIE, - GloboArticleIE, -) -from .go import GoIE -from .godtube import GodTubeIE -from .gofile import GofileIE -from .golem import GolemIE -from .googledrive import GoogleDriveIE -from .googlepodcasts import ( - GooglePodcastsIE, - GooglePodcastsFeedIE, -) -from .googlesearch import GoogleSearchIE -from .gopro import GoProIE -from .goshgay import GoshgayIE -from .gotostage import GoToStageIE -from .gputechconf import GPUTechConfIE -from .gronkh import GronkhIE -from .groupon import GrouponIE -from .hbo import HBOIE -from .hearthisat import HearThisAtIE -from .heise import HeiseIE -from .hellporno import HellPornoIE -from .helsinki import HelsinkiIE -from .hentaistigma import HentaiStigmaIE -from .hgtv import HGTVComShowIE -from .hketv import HKETVIE -from .hidive import HiDiveIE -from .historicfilms import HistoricFilmsIE -from .hitbox import HitboxIE, HitboxLiveIE -from .hitrecord import HitRecordIE -from .hotnewhiphop import HotNewHipHopIE -from .hotstar import ( - HotStarIE, - HotStarPlaylistIE, - HotStarSeriesIE, -) -from .howcast import HowcastIE -from .howstuffworks import HowStuffWorksIE -from .hrfensehen import HRFernsehenIE -from .hrti import ( - HRTiIE, - HRTiPlaylistIE, -) -from .hse import ( - HSEShowIE, - HSEProductIE, -) -from .huajiao import HuajiaoIE -from .huya import HuyaLiveIE -from .huffpost import HuffPostIE -from .hungama import ( - HungamaIE, - HungamaSongIE, - HungamaAlbumPlaylistIE, -) -from .hypem import HypemIE -from .ichinanalive import ( - IchinanaLiveIE, - IchinanaLiveClipIE, -) -from .ign import ( - IGNIE, - IGNVideoIE, - IGNArticleIE, -) -from .iheart import ( - IHeartRadioIE, - IHeartRadioPodcastIE, -) -from .imdb import ( - ImdbIE, - ImdbListIE -) -from .imgur import ( - ImgurIE, - ImgurAlbumIE, - ImgurGalleryIE, -) -from .ina import InaIE -from .inc import IncIE -from .indavideo import IndavideoEmbedIE -from .infoq import InfoQIE -from .instagram import ( - InstagramIE, - InstagramIOSIE, - InstagramUserIE, - InstagramTagIE, - InstagramStoryIE, -) -from .internazionale import InternazionaleIE -from .internetvideoarchive import InternetVideoArchiveIE -from .iprima import ( - IPrimaIE, - IPrimaCNNIE -) -from .iqiyi import ( - IqiyiIE, - IqIE, - IqAlbumIE -) +# NB: Must be before other imports so that plugins can be correctly injected +_PLUGIN_CLASSES = load_plugins('extractor', 'IE', {}) -from .itprotv import ( - ITProTVIE, - ITProTVCourseIE -) +_LAZY_LOADER = False +if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'): + with contextlib.suppress(ImportError): + from .lazy_extractors import * # noqa: F403 + from .lazy_extractors import _ALL_CLASSES + _LAZY_LOADER = True -from .itv import ( - ITVIE, - ITVBTCCIE, -) -from .ivi import ( - IviIE, - IviCompilationIE -) -from .ivideon import IvideonIE -from .iwara import IwaraIE -from .izlesene import IzleseneIE -from .jamendo import ( - JamendoIE, - JamendoAlbumIE, -) -from .jeuxvideo import JeuxVideoIE -from .jove import JoveIE -from .joj import JojIE -from 
.jwplatform import JWPlatformIE -from .kakao import KakaoIE -from .kaltura import KalturaIE -from .karaoketv import KaraoketvIE -from .karrierevideos import KarriereVideosIE -from .keezmovies import KeezMoviesIE -from .kelbyone import KelbyOneIE -from .ketnet import KetnetIE -from .khanacademy import ( - KhanAcademyIE, - KhanAcademyUnitIE, -) -from .kickstarter import KickStarterIE -from .kinja import KinjaEmbedIE -from .kinopoisk import KinoPoiskIE -from .konserthusetplay import KonserthusetPlayIE -from .koo import KooIE -from .krasview import KrasViewIE -from .ku6 import Ku6IE -from .kusi import KUSIIE -from .kuwo import ( - KuwoIE, - KuwoAlbumIE, - KuwoChartIE, - KuwoSingerIE, - KuwoCategoryIE, - KuwoMvIE, -) -from .la7 import ( - LA7IE, - LA7PodcastEpisodeIE, - LA7PodcastIE, -) -from .laola1tv import ( - Laola1TvEmbedIE, - Laola1TvIE, - EHFTVIE, - ITTFIE, -) -from .lastfm import ( - LastFMIE, - LastFMPlaylistIE, - LastFMUserIE, -) -from .lbry import ( - LBRYIE, - LBRYChannelIE, -) -from .lci import LCIIE -from .lcp import ( - LcpPlayIE, - LcpIE, -) -from .lecture2go import Lecture2GoIE -from .lecturio import ( - LecturioIE, - LecturioCourseIE, - LecturioDeCourseIE, -) -from .leeco import ( - LeIE, - LePlaylistIE, - LetvCloudIE, -) -from .lego import LEGOIE -from .lemonde import LemondeIE -from .lenta import LentaIE -from .libraryofcongress import LibraryOfCongressIE -from .libsyn import LibsynIE -from .lifenews import ( - LifeNewsIE, - LifeEmbedIE, -) -from .limelight import ( - LimelightMediaIE, - LimelightChannelIE, - LimelightChannelListIE, -) -from .line import ( - LineLiveIE, - LineLiveChannelIE, -) -from .linkedin import ( - LinkedInIE, - LinkedInLearningIE, - LinkedInLearningCourseIE, -) -from .linuxacademy import LinuxAcademyIE -from .litv import LiTVIE -from .livejournal import LiveJournalIE -from .livestream import ( - LivestreamIE, - LivestreamOriginalIE, - LivestreamShortenerIE, -) -from .lnkgo import ( - LnkGoIE, - LnkIE, -) -from .localnews8 import LocalNews8IE -from .lovehomeporn import LoveHomePornIE -from .lrt import LRTIE -from .lynda import ( - LyndaIE, - LyndaCourseIE -) -from .m6 import M6IE -from .magentamusik360 import MagentaMusik360IE -from .mailru import ( - MailRuIE, - MailRuMusicIE, - MailRuMusicSearchIE, -) -from .mainstreaming import MainStreamingIE -from .malltv import MallTVIE -from .mangomolo import ( - MangomoloVideoIE, - MangomoloLiveIE, -) -from .manoto import ( - ManotoTVIE, - ManotoTVShowIE, - ManotoTVLiveIE, -) -from .manyvids import ManyVidsIE -from .maoritv import MaoriTVIE -from .markiza import ( - MarkizaIE, - MarkizaPageIE, -) -from .massengeschmacktv import MassengeschmackTVIE -from .matchtv import MatchTVIE -from .mdr import MDRIE -from .medaltv import MedalTVIE -from .mediaite import MediaiteIE -from .mediaklikk import MediaKlikkIE -from .mediaset import ( - MediasetIE, - MediasetShowIE, -) -from .mediasite import ( - MediasiteIE, - MediasiteCatalogIE, - MediasiteNamedCatalogIE, -) -from .medici import MediciIE -from .megaphone import MegaphoneIE -from .meipai import MeipaiIE -from .melonvod import MelonVODIE -from .meta import METAIE -from .metacafe import MetacafeIE -from .metacritic import MetacriticIE -from .mgoon import MgoonIE -from .mgtv import MGTVIE -from .miaopai import MiaoPaiIE -from .microsoftstream import MicrosoftStreamIE -from .microsoftvirtualacademy import ( - MicrosoftVirtualAcademyIE, - MicrosoftVirtualAcademyCourseIE, -) -from .mildom import ( - MildomIE, - MildomVodIE, - MildomClipIE, - MildomUserVodIE, -) -from 
.minds import ( - MindsIE, - MindsChannelIE, - MindsGroupIE, -) -from .ministrygrid import MinistryGridIE -from .minoto import MinotoIE -from .miomio import MioMioIE -from .mirrativ import ( - MirrativIE, - MirrativUserIE, -) -from .mit import TechTVMITIE, OCWMITIE -from .mitele import MiTeleIE -from .mixch import ( - MixchIE, - MixchArchiveIE, -) -from .mixcloud import ( - MixcloudIE, - MixcloudUserIE, - MixcloudPlaylistIE, -) -from .mlb import ( - MLBIE, - MLBVideoIE, -) -from .mlssoccer import MLSSoccerIE -from .mnet import MnetIE -from .moevideo import MoeVideoIE -from .mofosex import ( - MofosexIE, - MofosexEmbedIE, -) -from .mojvideo import MojvideoIE -from .morningstar import MorningstarIE -from .motherless import ( - MotherlessIE, - MotherlessGroupIE -) -from .motorsport import MotorsportIE -from .movieclips import MovieClipsIE -from .moviezine import MoviezineIE -from .movingimage import MovingImageIE -from .msn import MSNIE -from .mtv import ( - MTVIE, - MTVVideoIE, - MTVServicesEmbeddedIE, - MTVDEIE, - MTVJapanIE, - MTVItaliaIE, - MTVItaliaProgrammaIE, -) -from .muenchentv import MuenchenTVIE -from .murrtube import MurrtubeIE, MurrtubeUserIE -from .musescore import MuseScoreIE -from .musicdex import ( - MusicdexSongIE, - MusicdexAlbumIE, - MusicdexArtistIE, - MusicdexPlaylistIE, -) -from .mwave import MwaveIE, MwaveMeetGreetIE -from .mxplayer import ( - MxplayerIE, - MxplayerShowIE, -) -from .mychannels import MyChannelsIE -from .myspace import MySpaceIE, MySpaceAlbumIE -from .myspass import MySpassIE -from .myvi import ( - MyviIE, - MyviEmbedIE, -) -from .myvideoge import MyVideoGeIE -from .myvidster import MyVidsterIE -from .n1 import ( - N1InfoAssetIE, - N1InfoIIE, -) -from .nate import ( - NateIE, - NateProgramIE, -) -from .nationalgeographic import ( - NationalGeographicVideoIE, - NationalGeographicTVIE, -) -from .naver import ( - NaverIE, - NaverLiveIE, -) -from .nba import ( - NBAWatchEmbedIE, - NBAWatchIE, - NBAWatchCollectionIE, - NBAEmbedIE, - NBAIE, - NBAChannelIE, -) -from .nbc import ( - NBCIE, - NBCNewsIE, - NBCOlympicsIE, - NBCOlympicsStreamIE, - NBCSportsIE, - NBCSportsStreamIE, - NBCSportsVPlayerIE, -) -from .ndr import ( - NDRIE, - NJoyIE, - NDREmbedBaseIE, - NDREmbedIE, - NJoyEmbedIE, -) -from .ndtv import NDTVIE -from .nebula import ( - NebulaIE, - NebulaCollectionIE, -) -from .nerdcubed import NerdCubedFeedIE -from .netzkino import NetzkinoIE -from .neteasemusic import ( - NetEaseMusicIE, - NetEaseMusicAlbumIE, - NetEaseMusicSingerIE, - NetEaseMusicListIE, - NetEaseMusicMvIE, - NetEaseMusicProgramIE, - NetEaseMusicDjRadioIE, -) -from .newgrounds import ( - NewgroundsIE, - NewgroundsPlaylistIE, - NewgroundsUserIE, -) -from .newstube import NewstubeIE -from .newsy import NewsyIE -from .nextmedia import ( - NextMediaIE, - NextMediaActionNewsIE, - AppleDailyIE, - NextTVIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nfb import NFBIE -from .nfhsnetwork import NFHSNetworkIE -from .nfl import ( - NFLIE, - NFLArticleIE, -) -from .nhk import ( - NhkVodIE, - NhkVodProgramIE, - NhkForSchoolBangumiIE, - NhkForSchoolSubjectIE, - NhkForSchoolProgramListIE, -) -from .nhl import NHLIE -from .nick import ( - NickIE, - NickBrIE, - NickDeIE, - NickNightIE, - NickRuIE, -) -from .niconico import ( - NiconicoIE, - NiconicoPlaylistIE, - NiconicoUserIE, - NiconicoSeriesIE, - NiconicoHistoryIE, - NicovideoSearchDateIE, - NicovideoSearchIE, - NicovideoSearchURLIE, - NicovideoTagURLIE, -) -from .ninecninemedia import ( - NineCNineMediaIE, - CPTwentyFourIE, -) -from 
.ninegag import NineGagIE -from .ninenow import NineNowIE -from .nintendo import NintendoIE -from .nitter import NitterIE -from .njpwworld import NJPWWorldIE -from .nobelprize import NobelPrizeIE -from .nonktube import NonkTubeIE -from .noodlemagazine import NoodleMagazineIE -from .noovo import NoovoIE -from .normalboots import NormalbootsIE -from .nosvideo import NosVideoIE -from .nova import ( - NovaEmbedIE, - NovaIE, -) -from .novaplay import NovaPlayIE -from .nowness import ( - NownessIE, - NownessPlaylistIE, - NownessSeriesIE, -) -from .noz import NozIE -from .npo import ( - AndereTijdenIE, - NPOIE, - NPOLiveIE, - NPORadioIE, - NPORadioFragmentIE, - SchoolTVIE, - HetKlokhuisIE, - VPROIE, - WNLIE, -) -from .npr import NprIE -from .nrk import ( - NRKIE, - NRKPlaylistIE, - NRKSkoleIE, - NRKTVIE, - NRKTVDirekteIE, - NRKRadioPodkastIE, - NRKTVEpisodeIE, - NRKTVEpisodesIE, - NRKTVSeasonIE, - NRKTVSeriesIE, -) -from .nrl import NRLTVIE -from .ntvcojp import NTVCoJpCUIE -from .ntvde import NTVDeIE -from .ntvru import NTVRuIE -from .nytimes import ( - NYTimesIE, - NYTimesArticleIE, - NYTimesCookingIE, -) -from .nuvid import NuvidIE -from .nzherald import NZHeraldIE -from .nzz import NZZIE -from .odatv import OdaTVIE -from .odnoklassniki import OdnoklassnikiIE -from .oktoberfesttv import OktoberfestTVIE -from .olympics import OlympicsReplayIE -from .on24 import On24IE -from .ondemandkorea import OnDemandKoreaIE -from .onefootball import OneFootballIE -from .onet import ( - OnetIE, - OnetChannelIE, - OnetMVPIE, - OnetPlIE, -) -from .onionstudios import OnionStudiosIE -from .ooyala import ( - OoyalaIE, - OoyalaExternalIE, -) -from .opencast import ( - OpencastIE, - OpencastPlaylistIE, -) -from .openrec import ( - OpenRecIE, - OpenRecCaptureIE, - OpenRecMovieIE, -) -from .ora import OraTVIE -from .orf import ( - ORFTVthekIE, - ORFFM4IE, - ORFFM4StoryIE, - ORFOE1IE, - ORFOE3IE, - ORFNOEIE, - ORFWIEIE, - ORFBGLIE, - ORFOOEIE, - ORFSTMIE, - ORFKTNIE, - ORFSBGIE, - ORFTIRIE, - ORFVBGIE, - ORFIPTVIE, -) -from .outsidetv import OutsideTVIE -from .packtpub import ( - PacktPubIE, - PacktPubCourseIE, -) -from .palcomp3 import ( - PalcoMP3IE, - PalcoMP3ArtistIE, - PalcoMP3VideoIE, -) -from .pandoratv import PandoraTVIE -from .panopto import ( - PanoptoIE, - PanoptoListIE, - PanoptoPlaylistIE -) -from .paramountplus import ( - ParamountPlusIE, - ParamountPlusSeriesIE, -) -from .parliamentliveuk import ParliamentLiveUKIE -from .parlview import ParlviewIE -from .patreon import ( - PatreonIE, - PatreonUserIE -) -from .pbs import PBSIE -from .pearvideo import PearVideoIE -from .peekvids import PeekVidsIE, PlayVidsIE -from .peertube import ( - PeerTubeIE, - PeerTubePlaylistIE, -) -from .peertv import PeerTVIE -from .peloton import ( - PelotonIE, - PelotonLiveIE -) -from .people import PeopleIE -from .performgroup import PerformGroupIE -from .periscope import ( - PeriscopeIE, - PeriscopeUserIE, -) -from .philharmoniedeparis import PhilharmonieDeParisIE -from .phoenix import PhoenixIE -from .photobucket import PhotobucketIE -from .piapro import PiaproIE -from .picarto import ( - PicartoIE, - PicartoVodIE, -) -from .piksel import PikselIE -from .pinkbike import PinkbikeIE -from .pinterest import ( - PinterestIE, - PinterestCollectionIE, -) -from .pixivsketch import ( - PixivSketchIE, - PixivSketchUserIE, -) -from .pladform import PladformIE -from .planetmarathi import PlanetMarathiIE -from .platzi import ( - PlatziIE, - PlatziCourseIE, -) -from .playfm import PlayFMIE -from .playplustv import PlayPlusTVIE -from 
.plays import PlaysTVIE -from .playstuff import PlayStuffIE -from .playtvak import PlaytvakIE -from .playvid import PlayvidIE -from .playwire import PlaywireIE -from .plutotv import PlutoTVIE -from .pluralsight import ( - PluralsightIE, - PluralsightCourseIE, -) -from .podomatic import PodomaticIE -from .pokemon import ( - PokemonIE, - PokemonWatchIE, - PokemonSoundLibraryIE, -) -from .pokergo import ( - PokerGoIE, - PokerGoCollectionIE, -) -from .polsatgo import PolsatGoIE -from .polskieradio import ( - PolskieRadioIE, - PolskieRadioCategoryIE, - PolskieRadioPlayerIE, - PolskieRadioPodcastIE, - PolskieRadioPodcastListIE, - PolskieRadioRadioKierowcowIE, -) -from .popcorntimes import PopcorntimesIE -from .popcorntv import PopcornTVIE -from .porn91 import Porn91IE -from .porncom import PornComIE -from .pornflip import PornFlipIE -from .pornhd import PornHdIE -from .pornhub import ( - PornHubIE, - PornHubUserIE, - PornHubPlaylistIE, - PornHubPagedVideoListIE, - PornHubUserVideosUploadIE, -) -from .pornotube import PornotubeIE -from .pornovoisines import PornoVoisinesIE -from .pornoxo import PornoXOIE -from .pornez import PornezIE -from .puhutv import ( - PuhuTVIE, - PuhuTVSerieIE, -) -from .presstv import PressTVIE -from .projectveritas import ProjectVeritasIE -from .prosiebensat1 import ProSiebenSat1IE -from .prx import ( - PRXStoryIE, - PRXSeriesIE, - PRXAccountIE, - PRXStoriesSearchIE, - PRXSeriesSearchIE -) -from .puls4 import Puls4IE -from .pyvideo import PyvideoIE -from .qqmusic import ( - QQMusicIE, - QQMusicSingerIE, - QQMusicAlbumIE, - QQMusicToplistIE, - QQMusicPlaylistIE, -) -from .r7 import ( - R7IE, - R7ArticleIE, -) -from .radiko import RadikoIE, RadikoRadioIE -from .radiocanada import ( - RadioCanadaIE, - RadioCanadaAudioVideoIE, -) -from .radiode import RadioDeIE -from .radiojavan import RadioJavanIE -from .radiobremen import RadioBremenIE -from .radiofrance import RadioFranceIE -from .radiozet import RadioZetPodcastIE -from .radiokapital import ( - RadioKapitalIE, - RadioKapitalShowIE, -) -from .radlive import ( - RadLiveIE, - RadLiveChannelIE, - RadLiveSeasonIE, -) -from .rai import ( - RaiPlayIE, - RaiPlayLiveIE, - RaiPlayPlaylistIE, - RaiPlaySoundIE, - RaiPlaySoundLiveIE, - RaiPlaySoundPlaylistIE, - RaiIE, -) -from .raywenderlich import ( - RayWenderlichIE, - RayWenderlichCourseIE, -) -from .rbmaradio import RBMARadioIE -from .rcs import ( - RCSIE, - RCSEmbedsIE, - RCSVariousIE, -) -from .rcti import ( - RCTIPlusIE, - RCTIPlusSeriesIE, - RCTIPlusTVIE, -) -from .rds import RDSIE -from .redbulltv import ( - RedBullTVIE, - RedBullEmbedIE, - RedBullTVRrnContentIE, - RedBullIE, -) -from .reddit import RedditIE -from .redgifs import ( - RedGifsIE, - RedGifsSearchIE, - RedGifsUserIE, -) -from .redtube import RedTubeIE -from .regiotv import RegioTVIE -from .rentv import ( - RENTVIE, - RENTVArticleIE, -) -from .restudy import RestudyIE -from .reuters import ReutersIE -from .reverbnation import ReverbNationIE -from .rice import RICEIE -from .rmcdecouverte import RMCDecouverteIE -from .rockstargames import RockstarGamesIE -from .rokfin import ( - RokfinIE, - RokfinStackIE, - RokfinChannelIE, -) -from .roosterteeth import RoosterTeethIE, RoosterTeethSeriesIE -from .rottentomatoes import RottenTomatoesIE -from .rozhlas import RozhlasIE -from .rtbf import RTBFIE -from .rte import RteIE, RteRadioIE -from .rtlnl import RtlNlIE -from .rtl2 import ( - RTL2IE, - RTL2YouIE, - RTL2YouSeriesIE, -) -from .rtnews import ( - RTNewsIE, - RTDocumentryIE, - RTDocumentryPlaylistIE, - RuptlyIE, -) 
-from .rtp import RTPIE -from .rtrfm import RTRFMIE -from .rts import RTSIE -from .rtve import ( - RTVEALaCartaIE, - RTVEAudioIE, - RTVELiveIE, - RTVEInfantilIE, - RTVETelevisionIE, -) -from .rtvnh import RTVNHIE -from .rtvs import RTVSIE -from .ruhd import RUHDIE -from .rule34video import Rule34VideoIE -from .rumble import ( - RumbleEmbedIE, - RumbleChannelIE, -) -from .rutube import ( - RutubeIE, - RutubeChannelIE, - RutubeEmbedIE, - RutubeMovieIE, - RutubePersonIE, - RutubePlaylistIE, - RutubeTagsIE, -) -from .glomex import ( - GlomexIE, - GlomexEmbedIE, -) -from .megatvcom import ( - MegaTVComIE, - MegaTVComEmbedIE, -) -from .ant1newsgr import ( - Ant1NewsGrWatchIE, - Ant1NewsGrArticleIE, - Ant1NewsGrEmbedIE, -) -from .rutv import RUTVIE -from .ruutu import RuutuIE -from .ruv import ( - RuvIE, - RuvSpilaIE -) -from .safari import ( - SafariIE, - SafariApiIE, - SafariCourseIE, -) -from .saitosan import SaitosanIE -from .samplefocus import SampleFocusIE -from .sapo import SapoIE -from .savefrom import SaveFromIE -from .sbs import SBSIE -from .screencast import ScreencastIE -from .screencastomatic import ScreencastOMaticIE -from .scrippsnetworks import ( - ScrippsNetworksWatchIE, - ScrippsNetworksIE, -) -from .scte import ( - SCTEIE, - SCTECourseIE, -) -from .seeker import SeekerIE -from .senategov import SenateISVPIE, SenateGovIE -from .sendtonews import SendtoNewsIE -from .servus import ServusIE -from .sevenplus import SevenPlusIE -from .sexu import SexuIE -from .seznamzpravy import ( - SeznamZpravyIE, - SeznamZpravyArticleIE, -) -from .shahid import ( - ShahidIE, - ShahidShowIE, -) -from .shared import ( - SharedIE, - VivoIE, -) -from .shemaroome import ShemarooMeIE -from .showroomlive import ShowRoomLiveIE -from .simplecast import ( - SimplecastIE, - SimplecastEpisodeIE, - SimplecastPodcastIE, -) -from .sina import SinaIE -from .sixplay import SixPlayIE -from .skeb import SkebIE -from .skyit import ( - SkyItPlayerIE, - SkyItVideoIE, - SkyItVideoLiveIE, - SkyItIE, - SkyItAcademyIE, - SkyItArteIE, - CieloTVItIE, - TV8ItIE, -) -from .skylinewebcams import SkylineWebcamsIE -from .skynewsarabia import ( - SkyNewsArabiaIE, - SkyNewsArabiaArticleIE, -) -from .skynewsau import SkyNewsAUIE -from .sky import ( - SkyNewsIE, - SkyNewsStoryIE, - SkySportsIE, - SkySportsNewsIE, -) -from .slideshare import SlideshareIE -from .slideslive import SlidesLiveIE -from .slutload import SlutloadIE -from .snotr import SnotrIE -from .sohu import SohuIE -from .sonyliv import ( - SonyLIVIE, - SonyLIVSeriesIE, -) -from .soundcloud import ( - SoundcloudEmbedIE, - SoundcloudIE, - SoundcloudSetIE, - SoundcloudRelatedIE, - SoundcloudUserIE, - SoundcloudTrackStationIE, - SoundcloudPlaylistIE, - SoundcloudSearchIE, -) -from .soundgasm import ( - SoundgasmIE, - SoundgasmProfileIE -) -from .southpark import ( - SouthParkIE, - SouthParkDeIE, - SouthParkDkIE, - SouthParkEsIE, - SouthParkNlIE -) -from .sovietscloset import ( - SovietsClosetIE, - SovietsClosetPlaylistIE -) -from .spankbang import ( - SpankBangIE, - SpankBangPlaylistIE, -) -from .spankwire import SpankwireIE -from .spiegel import SpiegelIE -from .spike import ( - BellatorIE, - ParamountNetworkIE, -) -from .stitcher import ( - StitcherIE, - StitcherShowIE, -) -from .sport5 import Sport5IE -from .sportbox import SportBoxIE -from .sportdeutschland import SportDeutschlandIE -from .spotify import ( - SpotifyIE, - SpotifyShowIE, -) -from .spreaker import ( - SpreakerIE, - SpreakerPageIE, - SpreakerShowIE, - SpreakerShowPageIE, -) -from .springboardplatform import 
SpringboardPlatformIE -from .sprout import SproutIE -from .srgssr import ( - SRGSSRIE, - SRGSSRPlayIE, -) -from .srmediathek import SRMediathekIE -from .stanfordoc import StanfordOpenClassroomIE -from .startv import StarTVIE -from .steam import SteamIE -from .storyfire import ( - StoryFireIE, - StoryFireUserIE, - StoryFireSeriesIE, -) -from .streamable import StreamableIE -from .streamanity import StreamanityIE -from .streamcloud import StreamcloudIE -from .streamcz import StreamCZIE -from .streamff import StreamFFIE -from .streetvoice import StreetVoiceIE -from .stretchinternet import StretchInternetIE -from .stripchat import StripchatIE -from .stv import STVPlayerIE -from .sunporno import SunPornoIE -from .sverigesradio import ( - SverigesRadioEpisodeIE, - SverigesRadioPublicationIE, -) -from .svt import ( - SVTIE, - SVTPageIE, - SVTPlayIE, - SVTSeriesIE, -) -from .swrmediathek import SWRMediathekIE -from .syfy import SyfyIE -from .sztvhu import SztvHuIE -from .tagesschau import TagesschauIE -from .tass import TassIE -from .tbs import TBSIE -from .tdslifeway import TDSLifewayIE -from .teachable import ( - TeachableIE, - TeachableCourseIE, -) -from .teachertube import ( - TeacherTubeIE, - TeacherTubeUserIE, -) -from .teachingchannel import TeachingChannelIE -from .teamcoco import TeamcocoIE -from .teamtreehouse import TeamTreeHouseIE -from .techtalks import TechTalksIE -from .ted import ( - TedEmbedIE, - TedPlaylistIE, - TedSeriesIE, - TedTalkIE, -) -from .tele5 import Tele5IE -from .tele13 import Tele13IE -from .telebruxelles import TeleBruxellesIE -from .telecinco import TelecincoIE -from .telegraaf import TelegraafIE -from .telegram import TelegramEmbedIE -from .telemb import TeleMBIE -from .telemundo import TelemundoIE -from .telequebec import ( - TeleQuebecIE, - TeleQuebecSquatIE, - TeleQuebecEmissionIE, - TeleQuebecLiveIE, - TeleQuebecVideoIE, -) -from .teletask import TeleTaskIE -from .telewebion import TelewebionIE -from .tennistv import TennisTVIE -from .tenplay import TenPlayIE -from .testurl import TestURLIE -from .tf1 import TF1IE -from .tfo import TFOIE -from .theintercept import TheInterceptIE -from .theplatform import ( - ThePlatformIE, - ThePlatformFeedIE, -) -from .thestar import TheStarIE -from .thesun import TheSunIE -from .theta import ( - ThetaVideoIE, - ThetaStreamIE, -) -from .theweatherchannel import TheWeatherChannelIE -from .thisamericanlife import ThisAmericanLifeIE -from .thisav import ThisAVIE -from .thisoldhouse import ThisOldHouseIE -from .threespeak import ( - ThreeSpeakIE, - ThreeSpeakUserIE, -) -from .threeqsdn import ThreeQSDNIE -from .tiktok import ( - TikTokIE, - TikTokUserIE, - TikTokSoundIE, - TikTokEffectIE, - TikTokTagIE, - TikTokVMIE, - DouyinIE, -) -from .tinypic import TinyPicIE -from .tmz import TMZIE -from .tnaflix import ( - TNAFlixNetworkEmbedIE, - TNAFlixIE, - EMPFlixIE, - MovieFapIE, -) -from .toggle import ( - ToggleIE, - MeWatchIE, -) -from .toggo import ( - ToggoIE, -) -from .tokentube import ( - TokentubeIE, - TokentubeChannelIE -) -from .tonline import TOnlineIE -from .toongoggles import ToonGogglesIE -from .toutv import TouTvIE -from .toypics import ToypicsUserIE, ToypicsIE -from .traileraddict import TrailerAddictIE -from .trilulilu import TriluliluIE -from .trovo import ( - TrovoIE, - TrovoVodIE, - TrovoChannelVodIE, - TrovoChannelClipIE, -) -from .trueid import TrueIDIE -from .trunews import TruNewsIE -from .trutv import TruTVIE -from .tube8 import Tube8IE -from .tubitv import ( - TubiTvIE, - TubiTvShowIE, -) -from .tumblr import 
TumblrIE -from .tunein import ( - TuneInClipIE, - TuneInStationIE, - TuneInProgramIE, - TuneInTopicIE, - TuneInShortenerIE, -) -from .tunepk import TunePkIE -from .turbo import TurboIE -from .tv2 import ( - TV2IE, - TV2ArticleIE, - KatsomoIE, - MTVUutisetArticleIE, -) -from .tv2dk import ( - TV2DKIE, - TV2DKBornholmPlayIE, -) -from .tv2hu import ( - TV2HuIE, - TV2HuSeriesIE, -) -from .tv4 import TV4IE -from .tv5mondeplus import TV5MondePlusIE -from .tv5unis import ( - TV5UnisVideoIE, - TV5UnisIE, -) -from .tva import ( - TVAIE, - QubIE, -) -from .tvanouvelles import ( - TVANouvellesIE, - TVANouvellesArticleIE, -) -from .tvc import ( - TVCIE, - TVCArticleIE, -) -from .tver import TVerIE -from .tvigle import TvigleIE -from .tvland import TVLandIE -from .tvn24 import TVN24IE -from .tvnet import TVNetIE -from .tvnoe import TVNoeIE -from .tvnow import ( - TVNowIE, - TVNowFilmIE, - TVNowNewIE, - TVNowSeasonIE, - TVNowAnnualIE, - TVNowShowIE, -) -from .tvopengr import ( - TVOpenGrWatchIE, - TVOpenGrEmbedIE, -) -from .tvp import ( - TVPEmbedIE, - TVPIE, - TVPStreamIE, - TVPWebsiteIE, -) -from .tvplay import ( - TVPlayIE, - ViafreeIE, - TVPlayHomeIE, -) -from .tvplayer import TVPlayerIE -from .tweakers import TweakersIE -from .twentyfourvideo import TwentyFourVideoIE -from .twentymin import TwentyMinutenIE -from .twentythreevideo import TwentyThreeVideoIE -from .twitcasting import ( - TwitCastingIE, - TwitCastingLiveIE, - TwitCastingUserIE, -) -from .twitch import ( - TwitchVodIE, - TwitchCollectionIE, - TwitchVideosIE, - TwitchVideosClipsIE, - TwitchVideosCollectionsIE, - TwitchStreamIE, - TwitchClipsIE, -) -from .twitter import ( - TwitterCardIE, - TwitterIE, - TwitterAmplifyIE, - TwitterBroadcastIE, - TwitterShortenerIE, -) -from .udemy import ( - UdemyIE, - UdemyCourseIE -) -from .udn import UDNEmbedIE -from .ufctv import ( - UFCTVIE, - UFCArabiaIE, -) -from .ukcolumn import UkColumnIE -from .uktvplay import UKTVPlayIE -from .digiteka import DigitekaIE -from .dlive import ( - DLiveVODIE, - DLiveStreamIE, -) -from .drooble import DroobleIE -from .umg import UMGDeIE -from .unistra import UnistraIE -from .unity import UnityIE -from .uol import UOLIE -from .uplynk import ( - UplynkIE, - UplynkPreplayIE, -) -from .urort import UrortIE -from .urplay import URPlayIE -from .usanetwork import USANetworkIE -from .usatoday import USATodayIE -from .ustream import UstreamIE, UstreamChannelIE -from .ustudio import ( - UstudioIE, - UstudioEmbedIE, -) -from .utreon import UtreonIE -from .varzesh3 import Varzesh3IE -from .vbox7 import Vbox7IE -from .veehd import VeeHDIE -from .veo import VeoIE -from .veoh import VeohIE -from .vesti import VestiIE -from .vevo import ( - VevoIE, - VevoPlaylistIE, -) -from .vgtv import ( - BTArticleIE, - BTVestlendingenIE, - VGTVIE, -) -from .vh1 import VH1IE -from .vice import ( - ViceIE, - ViceArticleIE, - ViceShowIE, -) -from .vidbit import VidbitIE -from .viddler import ViddlerIE -from .videa import VideaIE -from .videocampus_sachsen import ( - VideocampusSachsenIE, - VideocampusSachsenEmbedIE, -) -from .videodetective import VideoDetectiveIE -from .videofyme import VideofyMeIE -from .videomore import ( - VideomoreIE, - VideomoreVideoIE, - VideomoreSeasonIE, -) -from .videopress import VideoPressIE -from .vidio import ( - VidioIE, - VidioPremierIE, - VidioLiveIE -) -from .vidlii import VidLiiIE -from .vier import VierIE, VierVideosIE -from .viewlift import ( - ViewLiftIE, - ViewLiftEmbedIE, -) -from .viidea import ViideaIE -from .vimeo import ( - VimeoIE, - VimeoAlbumIE, - 
VimeoChannelIE, - VimeoGroupsIE, - VimeoLikesIE, - VimeoOndemandIE, - VimeoReviewIE, - VimeoUserIE, - VimeoWatchLaterIE, - VHXEmbedIE, -) -from .vimm import ( - VimmIE, - VimmRecordingIE, -) -from .vimple import VimpleIE -from .vine import ( - VineIE, - VineUserIE, -) -from .viki import ( - VikiIE, - VikiChannelIE, -) -from .viqeo import ViqeoIE -from .viu import ( - ViuIE, - ViuPlaylistIE, - ViuOTTIE, -) -from .vk import ( - VKIE, - VKUserVideosIE, - VKWallPostIE, -) -from .vlive import ( - VLiveIE, - VLivePostIE, - VLiveChannelIE, -) -from .vodlocker import VodlockerIE -from .vodpl import VODPlIE -from .vodplatform import VODPlatformIE -from .voicerepublic import VoiceRepublicIE -from .voicy import ( - VoicyIE, - VoicyChannelIE, -) -from .voot import ( - VootIE, - VootSeriesIE, -) -from .voxmedia import ( - VoxMediaVolumeIE, - VoxMediaIE, -) -from .vrt import VRTIE -from .vrak import VrakIE -from .vrv import ( - VRVIE, - VRVSeriesIE, -) -from .vshare import VShareIE -from .vtm import VTMIE -from .medialaan import MedialaanIE -from .vuclip import VuClipIE -from .vupload import VuploadIE -from .vvvvid import ( - VVVVIDIE, - VVVVIDShowIE, -) -from .vyborymos import VyboryMosIE -from .vzaar import VzaarIE -from .wakanim import WakanimIE -from .walla import WallaIE -from .washingtonpost import ( - WashingtonPostIE, - WashingtonPostArticleIE, -) -from .wasdtv import ( - WASDTVStreamIE, - WASDTVRecordIE, - WASDTVClipIE, -) -from .wat import WatIE -from .watchbox import WatchBoxIE -from .watchindianporn import WatchIndianPornIE -from .wdr import ( - WDRIE, - WDRPageIE, - WDRElefantIE, - WDRMobileIE, -) -from .webcaster import ( - WebcasterIE, - WebcasterFeedIE, -) -from .webofstories import ( - WebOfStoriesIE, - WebOfStoriesPlaylistIE, -) -from .weibo import ( - WeiboIE, - WeiboMobileIE -) -from .weiqitv import WeiqiTVIE -from .willow import WillowIE -from .wimtv import WimTVIE -from .whowatch import WhoWatchIE -from .wistia import ( - WistiaIE, - WistiaPlaylistIE, -) -from .worldstarhiphop import WorldStarHipHopIE -from .wppilot import ( - WPPilotIE, - WPPilotChannelsIE, -) -from .wsj import ( - WSJIE, - WSJArticleIE, -) -from .wwe import WWEIE -from .xbef import XBefIE -from .xboxclips import XboxClipsIE -from .xfileshare import XFileShareIE -from .xhamster import ( - XHamsterIE, - XHamsterEmbedIE, - XHamsterUserIE, -) -from .xiami import ( - XiamiSongIE, - XiamiAlbumIE, - XiamiArtistIE, - XiamiCollectionIE -) -from .ximalaya import ( - XimalayaIE, - XimalayaAlbumIE -) -from .xinpianchang import XinpianchangIE -from .xminus import XMinusIE -from .xnxx import XNXXIE -from .xstream import XstreamIE -from .xtube import XTubeUserIE, XTubeIE -from .xuite import XuiteIE -from .xvideos import XVideosIE -from .xxxymovies import XXXYMoviesIE -from .yahoo import ( - YahooIE, - YahooSearchIE, - YahooGyaOPlayerIE, - YahooGyaOIE, - YahooJapanNewsIE, -) -from .yandexdisk import YandexDiskIE -from .yandexmusic import ( - YandexMusicTrackIE, - YandexMusicAlbumIE, - YandexMusicPlaylistIE, - YandexMusicArtistTracksIE, - YandexMusicArtistAlbumsIE, -) -from .yandexvideo import ( - YandexVideoIE, - YandexVideoPreviewIE, - ZenYandexIE, - ZenYandexChannelIE, -) -from .yapfiles import YapFilesIE -from .yesjapan import YesJapanIE -from .yinyuetai import YinYueTaiIE -from .ynet import YnetIE -from .youjizz import YouJizzIE -from .youku import ( - YoukuIE, - YoukuShowIE, -) -from .younow import ( - YouNowLiveIE, - YouNowChannelIE, - YouNowMomentIE, -) -from .youporn import YouPornIE -from .yourporn import YourPornIE 
-from .yourupload import YourUploadIE -from .youtube import ( - YoutubeIE, - YoutubeClipIE, - YoutubeFavouritesIE, - YoutubeHistoryIE, - YoutubeTabIE, - YoutubeLivestreamEmbedIE, - YoutubePlaylistIE, - YoutubeRecommendedIE, - YoutubeSearchDateIE, - YoutubeSearchIE, - YoutubeSearchURLIE, - YoutubeMusicSearchURLIE, - YoutubeSubscriptionsIE, - YoutubeTruncatedIDIE, - YoutubeTruncatedURLIE, - YoutubeYtBeIE, - YoutubeYtUserIE, - YoutubeWatchLaterIE, -) -from .zapiks import ZapiksIE -from .zattoo import ( - BBVTVIE, - EinsUndEinsTVIE, - EWETVIE, - GlattvisionTVIE, - MNetTVIE, - MyVisionTVIE, - NetPlusIE, - OsnatelTVIE, - QuantumTVIE, - QuicklineIE, - QuicklineLiveIE, - SaltTVIE, - SAKTVIE, - VTXTVIE, - WalyTVIE, - ZattooIE, - ZattooLiveIE, -) -from .zdf import ZDFIE, ZDFChannelIE -from .zee5 import ( - Zee5IE, - Zee5SeriesIE, -) -from .zhihu import ZhihuIE -from .zingmp3 import ( - ZingMp3IE, - ZingMp3AlbumIE, -) -from .zoom import ZoomIE -from .zype import ZypeIE +if not _LAZY_LOADER: + from ._extractors import * # noqa: F403 + _ALL_CLASSES = [ # noqa: F811 + klass + for name, klass in globals().items() + if name.endswith('IE') and name != 'GenericIE' + ] + _ALL_CLASSES.append(GenericIE) # noqa: F405 + +globals().update(_PLUGIN_CLASSES) +_ALL_CLASSES[:0] = _PLUGIN_CLASSES.values() diff --git a/hypervideo_dl/extractor/extremetube.py b/hypervideo_dl/extractor/extremetube.py index acd4090..2c19698 100644 --- a/hypervideo_dl/extractor/extremetube.py +++ b/hypervideo_dl/extractor/extremetube.py @@ -1,10 +1,8 @@ -from __future__ import unicode_literals - from ..utils import str_to_int from .keezmovies import KeezMoviesIE -class ExtremeTubeIE(KeezMoviesIE): +class ExtremeTubeIE(KeezMoviesIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', diff --git a/hypervideo_dl/extractor/eyedotv.py b/hypervideo_dl/extractor/eyedotv.py index f62ddeb..d8b068e 100644 --- a/hypervideo_dl/extractor/eyedotv.py +++ b/hypervideo_dl/extractor/eyedotv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( xpath_text, diff --git a/hypervideo_dl/extractor/facebook.py b/hypervideo_dl/extractor/facebook.py index 022ea85..a58d9c8 100644 --- a/hypervideo_dl/extractor/facebook.py +++ b/hypervideo_dl/extractor/facebook.py @@ -1,21 +1,18 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re +import urllib.parse from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, compat_str, compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, ) from ..utils import ( + ExtractorError, clean_html, determine_ext, error_to_compat_str, - ExtractorError, float_or_none, get_element_by_id, get_first, @@ -60,6 +57,13 @@ class FacebookIE(InfoExtractor): ) (?P<id>[0-9]+) ''' + _EMBED_REGEX = [ + r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', + # Facebook API embed https://developers.facebook.com/docs/plugins/embedded-video-player + r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ + data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', + ] _LOGIN_URL = 'https://www.facebook.com/login.php?next=http%3A%2F%2Ffacebook.com%2Fhome.php&login_attempt=1' _CHECKPOINT_URL = 
'https://www.facebook.com/checkpoint/?next=http%3A%2F%2Ffacebook.com%2Fhome.php&_fb_noscript=1' _NETRC_MACHINE = 'facebook' @@ -314,21 +318,6 @@ class FacebookIE(InfoExtractor): 'graphURI': '/api/graphql/' } - @staticmethod - def _extract_urls(webpage): - urls = [] - for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://www\.facebook\.com/(?:video/embed|plugins/video\.php).+?)\1', - webpage): - urls.append(mobj.group('url')) - # Facebook API embed - # see https://developers.facebook.com/docs/plugins/embedded-video-player - for mobj in re.finditer(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ - data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage): - urls.append(mobj.group('url')) - return urls - def _perform_login(self, username, password): login_page_req = sanitized_Request(self._LOGIN_URL) self._set_cookie('facebook.com', 'locale', 'en_US') @@ -397,10 +386,8 @@ class FacebookIE(InfoExtractor): r'handleWithCustomApplyEach\(\s*ScheduledApplyEach\s*,\s*(\{.+?\})\s*\);', webpage)] post = traverse_obj(post_data, ( ..., 'require', ..., ..., ..., '__bbox', 'result', 'data'), expected_type=dict) or [] - media = traverse_obj( - post, - (..., 'attachments', ..., 'media', lambda _, m: str(m['id']) == video_id and m['__typename'] == 'Video'), - expected_type=dict) + media = traverse_obj(post, (..., 'attachments', ..., lambda k, v: ( + k == 'media' and str(v['id']) == video_id and v['__typename'] == 'Video')), expected_type=dict) title = get_first(media, ('title', 'text')) description = get_first(media, ('creation_story', 'comet_sections', 'message', 'story', 'message', 'text')) uploader_data = get_first(media, 'owner') or get_first(post, ('node', 'actors', ...)) or {} @@ -472,15 +459,14 @@ class FacebookIE(InfoExtractor): dash_manifest = video.get('dash_manifest') if dash_manifest: formats.extend(self._parse_mpd_formats( - compat_etree_fromstring(compat_urllib_parse_unquote_plus(dash_manifest)))) + compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)))) - def process_formats(formats): + def process_formats(info): # Downloads with browser's User-Agent are rate limited. Working around # with non-browser User-Agent. 
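# [Illustrative aside, not part of the upstream patch: a consolidated sketch
# of what the hunk below produces, for readability. The helper now receives
# the whole info dict, so it can both pin the non-browser User-Agent onto
# each format and declare sort fields declaratively instead of calling the
# removed self._sort_formats() — the same removal repeated across the other
# extractor files in this commit.]
#
#     def process_formats(info):
#         for f in info['formats']:
#             f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1'
#         info['_format_sort_fields'] = ('res', 'quality')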
- for f in formats: + for f in info['formats']: f.setdefault('http_headers', {})['User-Agent'] = 'facebookexternalhit/1.1' - - self._sort_formats(formats, ('res', 'quality')) + info['_format_sort_fields'] = ('res', 'quality') def extract_relay_data(_filter): return self._parse_json(self._search_regex( @@ -523,16 +509,17 @@ class FacebookIE(InfoExtractor): 'url': playable_url, }) extract_dash_manifest(video, formats) - process_formats(formats) v_id = video.get('videoId') or video.get('id') or video_id info = { 'id': v_id, 'formats': formats, - 'thumbnail': try_get(video, lambda x: x['thumbnailImage']['uri']), + 'thumbnail': traverse_obj( + video, ('thumbnailImage', 'uri'), ('preferred_thumbnail', 'image', 'uri')), 'uploader_id': try_get(video, lambda x: x['owner']['id']), 'timestamp': int_or_none(video.get('publish_time')), 'duration': float_or_none(video.get('playable_duration_in_ms'), 1000), } + process_formats(info) description = try_get(video, lambda x: x['savable_description']['text']) title = video.get('name') if title: @@ -699,13 +686,12 @@ class FacebookIE(InfoExtractor): if subtitles_src: subtitles.setdefault('en', []).append({'url': subtitles_src}) - process_formats(formats) - info_dict = { 'id': video_id, 'formats': formats, 'subtitles': subtitles, } + process_formats(info_dict) info_dict.update(extract_metadata(webpage)) return info_dict @@ -784,3 +770,30 @@ class FacebookRedirectURLIE(InfoExtractor): if not redirect_url: raise ExtractorError('Invalid facebook redirect URL', expected=True) return self.url_result(redirect_url) + + +class FacebookReelIE(InfoExtractor): + _VALID_URL = r'https?://(?:[\w-]+\.)?facebook\.com/reel/(?P<id>\d+)' + IE_NAME = 'facebook:reel' + + _TESTS = [{ + 'url': 'https://www.facebook.com/reel/1195289147628387', + 'md5': 'c4ff9a7182ff9ff7d6f7a83603bae831', + 'info_dict': { + 'id': '1195289147628387', + 'ext': 'mp4', + 'title': 'md5:9f5b142921b2dc57004fa13f76005f87', + 'description': 'md5:24ea7ef062215d295bdde64e778f5474', + 'uploader': 'Beast Camp Training', + 'uploader_id': '1738535909799870', + 'duration': 9.536, + 'thumbnail': r're:^https?://.*', + 'upload_date': '20211121', + 'timestamp': 1637502604, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + f'https://m.facebook.com/watch/?v={video_id}&_rdr', FacebookIE, video_id) diff --git a/hypervideo_dl/extractor/fancode.py b/hypervideo_dl/extractor/fancode.py index 7ea16c6..1b5db81 100644 --- a/hypervideo_dl/extractor/fancode.py +++ b/hypervideo_dl/extractor/fancode.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str @@ -128,7 +125,7 @@ class FancodeVodIE(InfoExtractor): } -class FancodeLiveIE(FancodeVodIE): +class FancodeLiveIE(FancodeVodIE): # XXX: Do not subclass from concrete IE IE_NAME = 'fancode:live' _VALID_URL = r'https?://(www\.)?fancode\.com/match/(?P<id>[0-9]+).+' diff --git a/hypervideo_dl/extractor/faz.py b/hypervideo_dl/extractor/faz.py index 312ee2a..bca62ad 100644 --- a/hypervideo_dl/extractor/faz.py +++ b/hypervideo_dl/extractor/faz.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -81,7 +78,6 @@ class FazIE(InfoExtractor): 'tbr': tbr or int(mobj.group(3)), }) formats.append(f) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/fc2.py b/hypervideo_dl/extractor/fc2.py index 54a83aa..dd5e088 100644 --- 
a/hypervideo_dl/extractor/fc2.py +++ b/hypervideo_dl/extractor/fc2.py @@ -1,19 +1,13 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, -) +from ..compat import compat_parse_qs +from ..dependencies import websockets from ..utils import ( ExtractorError, WebSocketsWrapper, - has_websockets, js_to_json, sanitized_Request, - std_headers, traverse_obj, update_url_query, urlencode_postdata, @@ -84,7 +78,7 @@ class FC2IE(InfoExtractor): webpage = None if not url.startswith('fc2:'): webpage = self._download_webpage(url, video_id) - self._downloader.cookiejar.clear_session_cookies() # must clear + self.cookiejar.clear_session_cookies() # must clear self._login() title, thumbnail, description = None, None, None @@ -173,7 +167,7 @@ class FC2LiveIE(InfoExtractor): }] def _real_extract(self, url): - if not has_websockets: + if not websockets: raise ExtractorError('websockets library is not available. Please install it.', expected=True) video_id = self._match_id(url) webpage = self._download_webpage('https://live.fc2.com/%s/' % video_id, video_id) @@ -210,10 +204,10 @@ class FC2LiveIE(InfoExtractor): 'Cookie': str(self._get_cookies('https://live.fc2.com/'))[12:], 'Origin': 'https://live.fc2.com', 'Accept': '*/*', - 'User-Agent': std_headers['User-Agent'], + 'User-Agent': self.get_param('http_headers')['User-Agent'], }) - self.write_debug('[debug] Sending HLS server request') + self.write_debug('Sending HLS server request') while True: recv = ws.recv() @@ -235,13 +229,10 @@ class FC2LiveIE(InfoExtractor): if not data or not isinstance(data, dict): continue if data.get('name') == '_response_' and data.get('id') == 1: - self.write_debug('[debug] Goodbye.') + self.write_debug('Goodbye') playlist_data = data break - elif self._downloader.params.get('verbose', False): - if len(recv) > 100: - recv = recv[:100] + '...' - self.to_screen('[debug] Server said: %s' % recv) + self.write_debug('Server said: %s%s' % (recv[:100], '...' 
if len(recv) > 100 else '')) if not playlist_data: raise ExtractorError('Unable to fetch HLS playlist info via WebSocket') @@ -259,7 +250,6 @@ class FC2LiveIE(InfoExtractor): 'Referer': url, })) - self._sort_formats(formats) for fmt in formats: fmt.update({ 'protocol': 'fc2_live', diff --git a/hypervideo_dl/extractor/fczenit.py b/hypervideo_dl/extractor/fczenit.py index 8db7c59..8175b6b 100644 --- a/hypervideo_dl/extractor/fczenit.py +++ b/hypervideo_dl/extractor/fczenit.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -41,8 +38,6 @@ class FczenitIE(InfoExtractor): 'height': int_or_none(q.get('label')), } for q in msi_data['qualities'] if q.get('url')] - self._sort_formats(formats) - tags = [tag['label'] for tag in msi_data.get('tags', []) if tag.get('label')] return { diff --git a/hypervideo_dl/extractor/fifa.py b/hypervideo_dl/extractor/fifa.py new file mode 100644 index 0000000..dc00edc --- /dev/null +++ b/hypervideo_dl/extractor/fifa.py @@ -0,0 +1,94 @@ +from .common import InfoExtractor + +from ..utils import ( + int_or_none, + traverse_obj, + unified_timestamp, +) + + +class FifaIE(InfoExtractor): + _VALID_URL = r'https?://www.fifa.com/fifaplus/(?P<locale>\w{2})/watch/([^#?]+/)?(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.fifa.com/fifaplus/en/watch/7on10qPcnyLajDDU3ntg6y', + 'info_dict': { + 'id': '7on10qPcnyLajDDU3ntg6y', + 'title': 'Italy v France | Final | 2006 FIFA World Cup Germany™ | Full Match Replay', + 'description': 'md5:f4520d0ee80529c8ba4134a7d692ff8b', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments'], + 'thumbnail': 'https://digitalhub.fifa.com/transform/fa6f0b3e-a2e9-4cf7-9f32-53c57bcb7360/2006_Final_ITA_FRA', + 'duration': 8165, + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.fifa.com/fifaplus/pt/watch/1cg5r5Qt6Qt12ilkDgb1sV', + 'info_dict': { + 'id': '1cg5r5Qt6Qt12ilkDgb1sV', + 'title': 'Brazil v Germany | Semi-finals | 2014 FIFA World Cup Brazil™ | Extended Highlights', + 'description': 'md5:d908c74ee66322b804ae2e521b02a855', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments', 'Highlights'], + 'thumbnail': 'https://digitalhub.fifa.com/transform/d8fe6f61-276d-4a73-a7fe-6878a35fd082/FIFAPLS_100EXTHL_2014BRAvGER_TMB', + 'duration': 902, + 'release_timestamp': 1404777600, + 'release_date': '20140708', + }, + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://www.fifa.com/fifaplus/fr/watch/3C6gQH9C2DLwzNx7BMRQdp', + 'info_dict': { + 'id': '3C6gQH9C2DLwzNx7BMRQdp', + 'title': 'Josimar goal against Northern Ireland | Classic Goals', + 'description': 'md5:cbe7e7bb52f603c9f1fe9a4780fe983b', + 'ext': 'mp4', + 'categories': ['FIFA Tournaments', 'Goal'], + 'duration': 28, + 'thumbnail': 'https://digitalhub.fifa.com/transform/f9301391-f8d9-48b5-823e-c093ac5e3e11/CG_MEN_1986_JOSIMAR', + }, + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id, locale = self._match_valid_url(url).group('id', 'locale') + webpage = self._download_webpage(url, video_id) + + preconnect_link = self._search_regex( + r'<link[^>]+rel\s*=\s*"preconnect"[^>]+href\s*=\s*"([^"]+)"', webpage, 'Preconnect Link') + + video_details = self._download_json( + f'{preconnect_link}/sections/videoDetails/{video_id}', video_id, 'Downloading Video Details', fatal=False) + + preplay_parameters = self._download_json( + f'{preconnect_link}/videoPlayerData/{video_id}', video_id, 'Downloading Preplay Parameters')['preplayParameters'] + + cid = 
preplay_parameters['contentId'] + content_data = self._download_json( + f'https://content.uplynk.com/preplay/{cid}/multiple.json', video_id, 'Downloading Content Data', query={ + 'v': preplay_parameters['preplayAPIVersion'], + 'tc': preplay_parameters['tokenCheckAlgorithmVersion'], + 'rn': preplay_parameters['randomNumber'], + 'exp': preplay_parameters['tokenExpirationDate'], + 'ct': preplay_parameters['contentType'], + 'cid': cid, + 'mbtracks': preplay_parameters['tracksAssetNumber'], + 'ad': preplay_parameters['adConfiguration'], + 'ad.preroll': int(preplay_parameters['adPreroll']), + 'ad.cmsid': preplay_parameters['adCMSSourceId'], + 'ad.vid': preplay_parameters['adSourceVideoID'], + 'sig': preplay_parameters['signature'], + }) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(content_data['playURL'], video_id) + + return { + 'id': video_id, + 'title': video_details.get('title'), + 'description': video_details.get('description'), + 'duration': int_or_none(video_details.get('duration')), + 'release_timestamp': unified_timestamp(video_details.get('dateOfRelease')), + 'categories': traverse_obj(video_details, (('videoCategory', 'videoSubcategory'),)), + 'thumbnail': traverse_obj(video_details, ('backgroundImage', 'src')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/filmmodu.py b/hypervideo_dl/extractor/filmmodu.py index 2746876..9eb550e 100644 --- a/hypervideo_dl/extractor/filmmodu.py +++ b/hypervideo_dl/extractor/filmmodu.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import int_or_none @@ -54,8 +51,6 @@ class FilmmoduIE(InfoExtractor): 'protocol': 'm3u8_native', } for source in data['sources']] - self._sort_formats(formats) - subtitles = {} if data.get('subtitle'): diff --git a/hypervideo_dl/extractor/filmon.py b/hypervideo_dl/extractor/filmon.py index 7b43ecc..9a93cb9 100644 --- a/hypervideo_dl/extractor/filmon.py +++ b/hypervideo_dl/extractor/filmon.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -68,7 +65,6 @@ class FilmOnIE(InfoExtractor): 'quality': QUALITY(stream.get('quality')), 'protocol': 'm3u8_native', }) - self._sort_formats(formats) thumbnails = [] poster = response.get('poster', {}) @@ -156,7 +152,6 @@ class FilmOnChannelIE(InfoExtractor): 'ext': 'mp4', 'quality': QUALITY(quality), }) - self._sort_formats(formats) thumbnails = [] for name, width, height in self._THUMBNAIL_RES: diff --git a/hypervideo_dl/extractor/filmweb.py b/hypervideo_dl/extractor/filmweb.py index 5e323b4..cfea1f2 100644 --- a/hypervideo_dl/extractor/filmweb.py +++ b/hypervideo_dl/extractor/filmweb.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/firsttv.py b/hypervideo_dl/extractor/firsttv.py index ccad173..f74bd13 100644 --- a/hypervideo_dl/extractor/firsttv.py +++ b/hypervideo_dl/extractor/firsttv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -126,7 +123,6 @@ class FirstTVIE(InfoExtractor): % (path, m3u8_path), display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) thumbnail = item.get('poster') or self._og_search_thumbnail(webpage) duration = int_or_none(item.get('duration') or self._html_search_meta( diff --git 
a/hypervideo_dl/extractor/fivemin.py b/hypervideo_dl/extractor/fivemin.py deleted file mode 100644 index f3f876e..0000000 --- a/hypervideo_dl/extractor/fivemin.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class FiveMinIE(InfoExtractor): - IE_NAME = '5min' - _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com/|delivery\.vidible\.tv/aol)(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P<id>\d+)' - - _TESTS = [ - { - # From http://www.engadget.com/2013/11/15/ipad-mini-retina-display-review/ - 'url': 'http://pshared.5min.com/Scripts/PlayerSeed.js?sid=281&width=560&height=345&playList=518013791', - 'md5': '4f7b0b79bf1a470e5004f7112385941d', - 'info_dict': { - 'id': '518013791', - 'ext': 'mp4', - 'title': 'iPad Mini with Retina Display Review', - 'description': 'iPad mini with Retina Display review', - 'duration': 177, - 'uploader': 'engadget', - 'upload_date': '20131115', - 'timestamp': 1384515288, - }, - 'params': { - # m3u8 download - 'skip_download': True, - } - }, - { - # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247 - 'url': '5min:518086247', - 'md5': 'e539a9dd682c288ef5a498898009f69e', - 'info_dict': { - 'id': '518086247', - 'ext': 'mp4', - 'title': 'How to Make a Next-Level Fruit Salad', - 'duration': 184, - }, - 'skip': 'no longer available', - }, - { - 'url': 'http://embed.5min.com/518726732/', - 'only_matching': True, - }, - { - 'url': 'http://delivery.vidible.tv/aol?playList=518013791', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - return self.url_result('aol-video:%s' % video_id) diff --git a/hypervideo_dl/extractor/fivetv.py b/hypervideo_dl/extractor/fivetv.py index d6bebd1..1f48cfd 100644 --- a/hypervideo_dl/extractor/fivetv.py +++ b/hypervideo_dl/extractor/fivetv.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import int_or_none @@ -75,7 +71,7 @@ class FiveTVIE(InfoExtractor): r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'], webpage, 'video url') - title = self._og_search_title(webpage, default=None) or self._html_extract_title(webpage) + title = self._generic_title('', webpage) duration = int_or_none(self._og_search_property( 'video:duration', webpage, 'duration', default=None)) diff --git a/hypervideo_dl/extractor/flickr.py b/hypervideo_dl/extractor/flickr.py index 2ed6c2b..89a40d7 100644 --- a/hypervideo_dl/extractor/flickr.py +++ b/hypervideo_dl/extractor/flickr.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -91,12 +89,11 @@ class FlickrIE(InfoExtractor): 'url': stream['_content'], 'quality': preference(stream_type), }) - self._sort_formats(formats) owner = video_info.get('owner', {}) uploader_id = owner.get('nsid') uploader_path = owner.get('path_alias') or uploader_id - uploader_url = format_field(uploader_path, template='https://www.flickr.com/photos/%s/') + uploader_url = format_field(uploader_path, None, 'https://www.flickr.com/photos/%s/') return { 'id': video_id, diff --git a/hypervideo_dl/extractor/folketinget.py b/hypervideo_dl/extractor/folketinget.py index b3df93f..55a11e5 100644 --- a/hypervideo_dl/extractor/folketinget.py +++ b/hypervideo_dl/extractor/folketinget.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_parse_qs from ..utils 
import ( @@ -62,7 +59,6 @@ class FolketingetIE(InfoExtractor): 'url': xpath_text(n, './url', fatal=True), 'tbr': int_or_none(n.attrib['bitrate']), } for n in doc.findall('.//streams/stream')] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/footyroom.py b/hypervideo_dl/extractor/footyroom.py index 118325b..4a1316b 100644 --- a/hypervideo_dl/extractor/footyroom.py +++ b/hypervideo_dl/extractor/footyroom.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .streamable import StreamableIE diff --git a/hypervideo_dl/extractor/formula1.py b/hypervideo_dl/extractor/formula1.py index 67662e6..0a8ef85 100644 --- a/hypervideo_dl/extractor/formula1.py +++ b/hypervideo_dl/extractor/formula1.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/fourtube.py b/hypervideo_dl/extractor/fourtube.py index d4d955b..b6368b8 100644 --- a/hypervideo_dl/extractor/fourtube.py +++ b/hypervideo_dl/extractor/fourtube.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -37,7 +35,6 @@ class FourTubeBaseIE(InfoExtractor): 'resolution': format + 'p', 'quality': int(format), } for format in sources] - self._sort_formats(formats) return formats def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/fourzerostudio.py b/hypervideo_dl/extractor/fourzerostudio.py new file mode 100644 index 0000000..c388a3a --- /dev/null +++ b/hypervideo_dl/extractor/fourzerostudio.py @@ -0,0 +1,106 @@ +from .common import InfoExtractor +from ..utils import traverse_obj, unified_timestamp + + +class FourZeroStudioArchiveIE(InfoExtractor): + _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/broadcasts/(?P<id>[^/]+)/archive' + IE_NAME = '0000studio:archive' + _TESTS = [{ + 'url': 'https://0000.studio/mumeijiten/broadcasts/1290f433-fce0-4909-a24a-5f7df09665dc/archive', + 'info_dict': { + 'id': '1290f433-fce0-4909-a24a-5f7df09665dc', + 'title': 'noteで『canape』様へのファンレターを執筆します。(数秘術その2)', + 'timestamp': 1653802534, + 'release_timestamp': 1653796604, + 'thumbnails': 'count:1', + 'comments': 'count:7', + 'uploader': '『中崎雄心』の執務室。', + 'uploader_id': 'mumeijiten', + } + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + webpage = self._download_webpage(url, video_id) + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) + + pcb = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorBroadcast'), get_all=False) + uploader_internal_id = traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'id'), get_all=False) + + formats, subs = self._extract_m3u8_formats_and_subtitles(pcb['archiveUrl'], video_id, ext='mp4') + + return { + 'id': video_id, + 'title': pcb.get('title'), + 'age_limit': 18 if pcb.get('isAdult') else None, + 'timestamp': unified_timestamp(pcb.get('finishTime')), + 'release_timestamp': unified_timestamp(pcb.get('createdAt')), + 'thumbnails': [{ + 'url': pcb['thumbnailUrl'], + 'ext': 'png', + }] if pcb.get('thumbnailUrl') else None, + 'formats': formats, + 'subtitles': subs, + 'comments': [{ + 'author': c.get('username'), + 'author_id': c.get('postedUserId'), + 'author_thumbnail': c.get('userThumbnailUrl'), + 'id': c.get('id'), + 'text': c.get('body'), + 'timestamp': unified_timestamp(c.get('createdAt')), + 'like_count': 
c.get('likeCount'), + 'is_favorited': c.get('isLikedByOwner'), + 'author_is_uploader': c.get('postedUserId') == uploader_internal_id, + } for c in traverse_obj(nuxt_data, ( + 'ssrRefs', ..., lambda _, v: v['__typename'] == 'PublicCreatorBroadcastComment')) or []], + 'uploader_id': uploader_id, + 'uploader': traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), + } + + +class FourZeroStudioClipIE(InfoExtractor): + _VALID_URL = r'https?://0000\.studio/(?P<uploader_id>[^/]+)/archive-clip/(?P<id>[^/]+)' + IE_NAME = '0000studio:clip' + _TESTS = [{ + 'url': 'https://0000.studio/soeji/archive-clip/e46b0278-24cd-40a8-92e1-b8fc2b21f34f', + 'info_dict': { + 'id': 'e46b0278-24cd-40a8-92e1-b8fc2b21f34f', + 'title': 'わたベーさんからイラスト差し入れいただきました。ありがとうございました!', + 'timestamp': 1652109105, + 'like_count': 1, + 'uploader': 'ソエジマケイタ', + 'uploader_id': 'soeji', + } + }] + + def _real_extract(self, url): + video_id, uploader_id = self._match_valid_url(url).group('id', 'uploader_id') + webpage = self._download_webpage(url, video_id) + nuxt_data = self._search_nuxt_data(webpage, video_id, traverse=None) + + clip_info = traverse_obj(nuxt_data, ('ssrRefs', lambda _, v: v['__typename'] == 'PublicCreatorArchivedClip'), get_all=False) + + info = next(( + m for m in self._parse_html5_media_entries(url, webpage, video_id) + if 'mp4' in traverse_obj(m, ('formats', ..., 'ext')) + ), None) + if not info: + self.report_warning('Failed to find a desired media element. Falling back to using NUXT data.') + info = { + 'formats': [{ + 'ext': 'mp4', + 'url': url, + } for url in clip_info.get('mediaFiles') or [] if url], + } + return { + **info, + 'id': video_id, + 'title': clip_info.get('clipComment'), + 'timestamp': unified_timestamp(clip_info.get('createdAt')), + 'like_count': clip_info.get('likeCount'), + 'uploader_id': uploader_id, + 'uploader': traverse_obj(nuxt_data, ( + 'ssrRefs', lambda _, v: v['__typename'] == 'PublicUser', 'username'), get_all=False), + } diff --git a/hypervideo_dl/extractor/fox.py b/hypervideo_dl/extractor/fox.py index 4c52b9a..15c0c48 100644 --- a/hypervideo_dl/extractor/fox.py +++ b/hypervideo_dl/extractor/fox.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import uuid @@ -15,8 +12,10 @@ from ..utils import ( int_or_none, parse_age_limit, parse_duration, + traverse_obj, try_get, unified_timestamp, + url_or_none, ) @@ -37,7 +36,8 @@ class FOXIE(InfoExtractor): 'creator': 'FOX', 'series': 'Gotham', 'age_limit': 14, - 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight' + 'episode': 'Aftermath: Bruce Wayne Develops Into The Dark Knight', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, @@ -132,7 +132,6 @@ class FOXIE(InfoExtractor): formats = self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) data = try_get( video, lambda x: x['trackingData']['properties'], dict) or {} @@ -168,6 +167,7 @@ class FOXIE(InfoExtractor): 'season_number': int_or_none(video.get('seasonNumber')), 'episode': video.get('name'), 'episode_number': int_or_none(video.get('episodeNumber')), + 'thumbnail': traverse_obj(video, ('images', 'still', 'raw'), expected_type=url_or_none), 'release_year': int_or_none(video.get('releaseYear')), 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/fox9.py b/hypervideo_dl/extractor/fox9.py index 91f8f7b..dfbafa7 100644 --- a/hypervideo_dl/extractor/fox9.py +++ 
b/hypervideo_dl/extractor/fox9.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/foxgay.py b/hypervideo_dl/extractor/foxgay.py index 1c53e06..f4f29c6 100644 --- a/hypervideo_dl/extractor/foxgay.py +++ b/hypervideo_dl/extractor/foxgay.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -33,7 +31,7 @@ class FoxgayIE(InfoExtractor): description = get_element_by_id('inf_tit', webpage) # The default user-agent with foxgay cookies leads to pages without videos - self._downloader.cookiejar.clear('.foxgay.com') + self.cookiejar.clear('.foxgay.com') # Find the URL for the iFrame which contains the actual video. iframe_url = self._html_search_regex( r'<iframe[^>]+src=([\'"])(?P<url>[^\'"]+)\1', webpage, @@ -50,8 +48,6 @@ class FoxgayIE(InfoExtractor): } for source, resolution in zip( video_data['sources'], video_data.get('resolutions', itertools.repeat(None)))] - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/foxnews.py b/hypervideo_dl/extractor/foxnews.py index 18fa0a5..52172aa 100644 --- a/hypervideo_dl/extractor/foxnews.py +++ b/hypervideo_dl/extractor/foxnews.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .amp import AMPIE @@ -58,13 +56,15 @@ class FoxNewsIE(AMPIE): }, ] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<(?:amp-)?iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.foxnews\.com/v/video-embed\.html?.*?\bvideo_id=\d+.*?)\1', - webpage)] + @classmethod + def _extract_embed_urls(cls, url, webpage): + for mobj in re.finditer( + r'''(?x) + <(?:script|(?:amp-)?iframe)[^>]+\bsrc=["\'] + (?:https?:)?//video\.foxnews\.com/v/(?:video-embed\.html|embed\.js)\? + (?:[^>"\']+&)?(?:video_)?id=(?P<video_id>\d+) + ''', webpage): + yield f'https://video.foxnews.com/v/video-embed.html?video_id={mobj.group("video_id")}' def _real_extract(self, url): host, video_id = self._match_valid_url(url).groups() @@ -75,6 +75,29 @@ class FoxNewsIE(AMPIE): return info +class FoxNewsVideoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?foxnews\.com/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.foxnews.com/video/6313058664112', + 'info_dict': { + 'id': '6313058664112', + 'ext': 'mp4', + 'thumbnail': r're:https://.+/1280x720/match/image\.jpg', + 'upload_date': '20220930', + 'description': 'New York City, Kids Therapy, Biden', + 'duration': 2415, + 'title': 'Gutfeld! 
- Thursday, September 29', + 'timestamp': 1664527538, + }, + 'expected_warnings': ['Ignoring subtitle tracks'], + 'params': {'skip_download': 'm3u8'}, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result(f'https://video.foxnews.com/v/{video_id}', FoxNewsIE, video_id) + + class FoxNewsArticleIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:insider\.)?foxnews\.com/(?!v)([^/]+/)+(?P<id>[a-z-]+)' IE_NAME = 'foxnews:article' @@ -124,4 +147,4 @@ class FoxNewsArticleIE(InfoExtractor): 'http://video.foxnews.com/v/' + video_id, FoxNewsIE.ie_key()) return self.url_result( - FoxNewsIE._extract_urls(webpage)[0], FoxNewsIE.ie_key()) + next(FoxNewsIE._extract_embed_urls(url, webpage)), FoxNewsIE.ie_key()) diff --git a/hypervideo_dl/extractor/foxsports.py b/hypervideo_dl/extractor/foxsports.py index 2b2cb6c..f9d7fe5 100644 --- a/hypervideo_dl/extractor/foxsports.py +++ b/hypervideo_dl/extractor/foxsports.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/fptplay.py b/hypervideo_dl/extractor/fptplay.py index a34e90b..85613ba 100644 --- a/hypervideo_dl/extractor/fptplay.py +++ b/hypervideo_dl/extractor/fptplay.py @@ -1,18 +1,17 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import time import urllib.parse from .common import InfoExtractor from ..utils import ( + clean_html, join_nonempty, + strip_or_none, ) class FptplayIE(InfoExtractor): - _VALID_URL = r'https?://fptplay\.vn/(?P<type>xem-video)/[^/]+\-(?P<id>\w+)(?:/tap-(?P<episode>[^/]+)?/?(?:[?#]|$)|)' + _VALID_URL = r'https?://fptplay\.vn/xem-video/[^/]+\-(?P<id>\w+)(?:/tap-(?P<episode>\d+)?/?(?:[?#]|$)|)' _GEO_COUNTRIES = ['VN'] IE_NAME = 'fptplay' IE_DESC = 'fptplay.vn' @@ -22,7 +21,7 @@ class FptplayIE(InfoExtractor): 'info_dict': { 'id': '621a123016f369ebbde55945', 'ext': 'mp4', - 'title': 'Nhân Duyên Đại Nhân Xin Dừng Bước - Ms. 
Cupid In Love', + 'title': 'Nhân Duyên Đại Nhân Xin Dừng Bước - Tập 1A', 'description': 'md5:23cf7d1ce0ade8e21e76ae482e6a8c6c', }, }, { @@ -31,25 +30,41 @@ class FptplayIE(InfoExtractor): 'info_dict': { 'id': '61f3aa8a6b3b1d2e73c60eb5', 'ext': 'mp4', - 'title': 'Má Tôi Là Đại Gia - 3', + 'title': 'Má Tôi Là Đại Gia - Tập 3', 'description': 'md5:ff8ba62fb6e98ef8875c42edff641d1c', }, }, { + 'url': 'https://fptplay.vn/xem-video/lap-toi-do-giam-under-the-skin-6222d9684ec7230fa6e627a2/tap-4', + 'md5': 'bcb06c55ec14786d7d4eda07fa1ccbb9', + 'info_dict': { + 'id': '6222d9684ec7230fa6e627a2', + 'ext': 'mp4', + 'title': 'Lạp Tội Đồ Giám - Tập 2B', + 'description': 'md5:e5a47e9d35fbf7e9479ca8a77204908b', + }, + }, { 'url': 'https://fptplay.vn/xem-video/nha-co-chuyen-hi-alls-well-ends-well-1997-6218995f6af792ee370459f0', 'only_matching': True, }] def _real_extract(self, url): - type_url, video_id, episode = self._match_valid_url(url).group('type', 'id', 'episode') - webpage = self._download_webpage(url, video_id=video_id, fatal=False) - info = self._download_json(self.get_api_with_st_token(video_id, episode or 0), video_id) + video_id, slug_episode = self._match_valid_url(url).group('id', 'episode') + webpage = self._download_webpage(url, video_id=video_id, fatal=False) or '' + title = self._search_regex( + r'(?s)<h4\s+class="mb-1 text-2xl text-white"[^>]*>(.+)</h4>', webpage, 'title', fatal=False) + real_episode = slug_episode if not title else self._search_regex( + r'<p.+title="(?P<episode>[^">]+)"\s+class="epi-title active"', webpage, 'episode', fatal=False) + title = strip_or_none(title) or self._html_search_meta(('og:title', 'twitter:title'), webpage) + + info = self._download_json( + self.get_api_with_st_token(video_id, int(slug_episode) - 1 if slug_episode else 0), video_id) formats, subtitles = self._extract_m3u8_formats_and_subtitles(info['data']['url'], video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, - 'title': join_nonempty( - self._html_search_meta(('og:title', 'twitter:title'), webpage), episode, delim=' - '), - 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage), + 'title': join_nonempty(title, real_episode, delim=' - '), + 'description': ( + clean_html(self._search_regex(r'<p\s+class="overflow-hidden"[^>]*>(.+)</p>', webpage, 'description')) + or self._html_search_meta(('og:description', 'twitter:description'), webpage)), 'formats': formats, 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/franceculture.py b/hypervideo_dl/extractor/franceculture.py deleted file mode 100644 index 9dc28d8..0000000 --- a/hypervideo_dl/extractor/franceculture.py +++ /dev/null @@ -1,128 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..utils import ( - determine_ext, - extract_attributes, - int_or_none, - traverse_obj, - unified_strdate, -) - - -class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [{ - # playlist - 'url': 'https://www.franceculture.fr/emissions/serie/hasta-dente', - 'playlist_count': 12, - 'info_dict': { - 'id': 'hasta-dente', - 'title': 'Hasta Dente', - 'description': 'md5:57479af50648d14e9bb649e6b1f8f911', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20201024', - }, - 'playlist': [{ - 'info_dict': { - 'id': '3c1c2e55-41a0-11e5-9fe0-005056a87c89', - 'ext': 'mp3', - 'title': 'Jeudi, vous avez dit bizarre ?', - 'description': 
'md5:47cf1e00cc21c86b0210279996a812c6', - 'duration': 604, - 'upload_date': '20201024', - 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1603576680 - }, - }, - ], - }, { - 'url': 'https://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', - 'info_dict': { - 'id': 'rendez-vous-au-pays-des-geeks', - 'display_id': 'rendez-vous-au-pays-des-geeks', - 'ext': 'mp3', - 'title': 'Rendez-vous au pays des geeks', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20140301', - 'vcodec': 'none', - 'duration': 3569, - }, - }, { - # no thumbnail - 'url': 'https://www.franceculture.fr/emissions/la-recherche-montre-en-main/la-recherche-montre-en-main-du-mercredi-10-octobre-2018', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - info = { - 'id': display_id, - 'title': self._html_search_regex( - r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', - webpage, 'title', default=self._og_search_title(webpage)), - 'description': self._html_search_regex( - r'(?s)<div[^>]+class="excerpt"[^>]*>(.*?)</div>', webpage, 'description', default=None), - 'thumbnail': self._og_search_thumbnail(webpage), - 'uploader': self._html_search_regex( - r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None), - 'upload_date': unified_strdate(self._html_search_regex( - r'(?s)class="teaser-text-date".*?(\d{2}/\d{2}/\d{4})', webpage, 'date', default=None)), - } - - playlist_data = self._search_regex( - r'''(?sx) - <section[^>]+data-xiti-place="[^"]*?liste_episodes[^"?]*?"[^>]*> - (.*?) - </section> - ''', - webpage, 'playlist data', fatal=False, default=None) - - if playlist_data: - entries = [] - for item, item_description in re.findall( - r'(?s)(<button[^<]*class="[^"]*replay-button[^>]*>).*?<p[^>]*class="[^"]*teaser-text-chapo[^>]*>(.*?)</p>', - playlist_data): - - item_attributes = extract_attributes(item) - entries.append({ - 'id': item_attributes.get('data-emission-uuid'), - 'url': item_attributes.get('data-url'), - 'title': item_attributes.get('data-diffusion-title'), - 'duration': int_or_none(traverse_obj(item_attributes, 'data-duration-seconds', 'data-duration-seconds')), - 'description': item_description, - 'timestamp': int_or_none(item_attributes.get('data-start-time')), - 'thumbnail': info['thumbnail'], - 'uploader': info['uploader'], - }) - - return { - '_type': 'playlist', - 'entries': entries, - **info - } - - video_data = extract_attributes(self._search_regex( - r'''(?sx) - (?: - </h1>| - <div[^>]+class="[^"]*?(?:title-zone-diffusion|heading-zone-(?:wrapper|player-button))[^"]*?"[^>]*> - ).*? 
- (<button[^>]+data-(?:url|asset-source)="[^"]+"[^>]+>) - ''', - webpage, 'video data')) - video_url = traverse_obj(video_data, 'data-url', 'data-asset-source') - ext = determine_ext(video_url.lower()) - - return { - 'display_id': display_id, - 'url': video_url, - 'ext': ext, - 'vcodec': 'none' if ext == 'mp3' else None, - 'duration': int_or_none(video_data.get('data-duration')), - **info - } diff --git a/hypervideo_dl/extractor/franceinter.py b/hypervideo_dl/extractor/franceinter.py index ae822a5..779249b 100644 --- a/hypervideo_dl/extractor/franceinter.py +++ b/hypervideo_dl/extractor/franceinter.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import month_by_name diff --git a/hypervideo_dl/extractor/francetv.py b/hypervideo_dl/extractor/francetv.py index 347a766..0523172 100644 --- a/hypervideo_dl/extractor/francetv.py +++ b/hypervideo_dl/extractor/francetv.py @@ -1,8 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -37,6 +32,7 @@ class FranceTVIE(InfoExtractor): (?P<id>[^@]+)(?:@(?P<catalog>.+))? ) ''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1'] _TESTS = [{ # without catalog @@ -195,8 +191,6 @@ class FranceTVIE(InfoExtractor): } for sheet in spritesheets] }) - self._sort_formats(formats) - if subtitle: title += ' - %s' % subtitle title = title.strip() @@ -375,7 +369,7 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): webpage = self._download_webpage(url, display_id) - dailymotion_urls = DailymotionIE._extract_urls(webpage) + dailymotion_urls = tuple(DailymotionIE._extract_embed_urls(url, webpage)) if dailymotion_urls: return self.playlist_result([ self.url_result(dailymotion_url, DailymotionIE.ie_key()) diff --git a/hypervideo_dl/extractor/freesound.py b/hypervideo_dl/extractor/freesound.py index 138b6bc..8b5f227 100644 --- a/hypervideo_dl/extractor/freesound.py +++ b/hypervideo_dl/extractor/freesound.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -65,7 +63,6 @@ class FreesoundIE(InfoExtractor): 'format_note': channels, 'quality': quality, } for quality, format_url in enumerate(audio_urls)] - self._sort_formats(formats) return { 'id': audio_id, diff --git a/hypervideo_dl/extractor/freespeech.py b/hypervideo_dl/extractor/freespeech.py index ea9c3e3..aea5513 100644 --- a/hypervideo_dl/extractor/freespeech.py +++ b/hypervideo_dl/extractor/freespeech.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from .youtube import YoutubeIE diff --git a/hypervideo_dl/extractor/freetv.py b/hypervideo_dl/extractor/freetv.py new file mode 100644 index 0000000..757a10d --- /dev/null +++ b/hypervideo_dl/extractor/freetv.py @@ -0,0 +1,139 @@ +import itertools +import re + +from .common import InfoExtractor +from ..utils import int_or_none, traverse_obj, urlencode_postdata + + +class FreeTvBaseIE(InfoExtractor): + def _get_api_response(self, content_id, resource_type, postdata): + return self._download_json( + 'https://www.freetv.com/wordpress/wp-admin/admin-ajax.php', + content_id, data=urlencode_postdata(postdata), + note=f'Downloading {content_id} {resource_type} JSON')['data'] + + +class FreeTvMoviesIE(FreeTvBaseIE): + _VALID_URL = r'https?://(?:www\.)?freetv\.com/peliculas/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.freetv.com/peliculas/atrapame-si-puedes/', + 
'md5': 'dc62d5abf0514726640077cd1591aa92', + 'info_dict': { + 'id': '428021', + 'title': 'Atrápame Si Puedes', + 'description': 'md5:ca63bc00898aeb2f64ec87c6d3a5b982', + 'ext': 'mp4', + } + }, { + 'url': 'https://www.freetv.com/peliculas/monstruoso/', + 'md5': '509c15c68de41cb708d1f92d071f20aa', + 'info_dict': { + 'id': '377652', + 'title': 'Monstruoso', + 'description': 'md5:333fc19ee327b457b980e54a911ea4a3', + 'ext': 'mp4', + } + }] + + def _extract_video(self, content_id, action='olyott_video_play'): + api_response = self._get_api_response(content_id, 'video', { + 'action': action, + 'contentID': content_id, + }) + + video_id, video_url = api_response['displayMeta']['contentID'], api_response['displayMeta']['streamURLVideo'] + formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4') + + return { + 'id': video_id, + 'title': traverse_obj(api_response, ('displayMeta', 'title')), + 'description': traverse_obj(api_response, ('displayMeta', 'desc')), + 'formats': formats, + 'subtitles': subtitles, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + return self._extract_video( + self._search_regex(( + r'class=["\'][^>]+postid-(?P<video_id>\d+)', + r'<link[^>]+freetv.com/\?p=(?P<video_id>\d+)', + r'<div[^>]+data-params=["\'][^>]+post_id=(?P<video_id>\d+)', + ), webpage, 'video id', group='video_id')) + + +class FreeTvIE(FreeTvBaseIE): + IE_NAME = 'freetv:series' + _VALID_URL = r'https?://(?:www\.)?freetv\.com/series/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://www.freetv.com/series/el-detective-l/', + 'info_dict': { + 'id': 'el-detective-l', + 'title': 'El Detective L', + 'description': 'md5:f9f1143bc33e9856ecbfcbfb97a759be' + }, + 'playlist_count': 24, + }, { + 'url': 'https://www.freetv.com/series/esmeraldas/', + 'info_dict': { + 'id': 'esmeraldas', + 'title': 'Esmeraldas', + 'description': 'md5:43d7ec45bd931d8268a4f5afaf4c77bf' + }, + 'playlist_count': 62, + }, { + 'url': 'https://www.freetv.com/series/las-aventuras-de-leonardo/', + 'info_dict': { + 'id': 'las-aventuras-de-leonardo', + 'title': 'Las Aventuras de Leonardo', + 'description': 'md5:0c47130846c141120a382aca059288f6' + }, + 'playlist_count': 13, + }, + ] + + def _extract_series_season(self, season_id, series_title): + episodes = self._get_api_response(season_id, 'series', { + 'contentID': season_id, + 'action': 'olyott_get_dynamic_series_content', + 'type': 'list', + 'perPage': '1000', + })['1'] + + for episode in episodes: + video_id = str(episode['contentID']) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(episode['streamURL'], video_id, 'mp4') + + yield { + 'id': video_id, + 'title': episode.get('fullTitle'), + 'description': episode.get('description'), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': episode.get('thumbnail'), + 'series': series_title, + 'series_id': traverse_obj(episode, ('contentMeta', 'displayMeta', 'seriesID')), + 'season_id': traverse_obj(episode, ('contentMeta', 'displayMeta', 'seasonID')), + 'season_number': traverse_obj( + episode, ('contentMeta', 'displayMeta', 'seasonNum'), expected_type=int_or_none), + 'episode_number': traverse_obj( + episode, ('contentMeta', 'displayMeta', 'episodeNum'), expected_type=int_or_none), + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + title = self._html_search_regex( + r'<h1[^>]+class=["\']synopis[^>]>(?P<title>[^<]+)', webpage, 'title', 
group='title', fatal=False) + description = self._html_search_regex( + r'<div[^>]+class=["\']+synopis content[^>]><p>(?P<description>[^<]+)', + webpage, 'description', group='description', fatal=False) + + return self.playlist_result( + itertools.chain.from_iterable( + self._extract_series_season(season_id, title) + for season_id in re.findall(r'<option[^>]+value=["\'](\d+)["\']', webpage)), + display_id, title, description) diff --git a/hypervideo_dl/extractor/freshlive.py b/hypervideo_dl/extractor/freshlive.py deleted file mode 100644 index 72a8459..0000000 --- a/hypervideo_dl/extractor/freshlive.py +++ /dev/null @@ -1,83 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - ExtractorError, - int_or_none, - try_get, - unified_timestamp, -) - - -class FreshLiveIE(InfoExtractor): - _VALID_URL = r'https?://freshlive\.tv/[^/]+/(?P<id>\d+)' - _TEST = { - 'url': 'https://freshlive.tv/satotv/74712', - 'md5': '9f0cf5516979c4454ce982df3d97f352', - 'info_dict': { - 'id': '74712', - 'ext': 'mp4', - 'title': 'テスト', - 'description': 'テスト', - 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 1511, - 'timestamp': 1483619655, - 'upload_date': '20170105', - 'uploader': 'サトTV', - 'uploader_id': 'satotv', - 'view_count': int, - 'comment_count': int, - 'is_live': False, - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - options = self._parse_json( - self._search_regex( - r'window\.__CONTEXT__\s*=\s*({.+?});\s*</script>', - webpage, 'initial context'), - video_id) - - info = options['context']['dispatcher']['stores']['ProgramStore']['programs'][video_id] - - title = info['title'] - - if info.get('status') == 'upcoming': - raise ExtractorError('Stream %s is upcoming' % video_id, expected=True) - - stream_url = info.get('liveStreamUrl') or info['archiveStreamUrl'] - - is_live = info.get('liveStreamUrl') is not None - - formats = self._extract_m3u8_formats( - stream_url, video_id, 'mp4', - 'm3u8_native', m3u8_id='hls') - - if is_live: - title = self._live_title(title) - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': info.get('description'), - 'thumbnail': info.get('thumbnailUrl'), - 'duration': int_or_none(info.get('airTime')), - 'timestamp': unified_timestamp(info.get('createdAt')), - 'uploader': try_get( - info, lambda x: x['channel']['title'], compat_str), - 'uploader_id': try_get( - info, lambda x: x['channel']['code'], compat_str), - 'uploader_url': try_get( - info, lambda x: x['channel']['permalink'], compat_str), - 'view_count': int_or_none(info.get('viewCount')), - 'comment_count': int_or_none(info.get('commentCount')), - 'tags': info.get('tags', []), - 'is_live': is_live, - } diff --git a/hypervideo_dl/extractor/frontendmasters.py b/hypervideo_dl/extractor/frontendmasters.py index fc67a84..3bae8ad 100644 --- a/hypervideo_dl/extractor/frontendmasters.py +++ b/hypervideo_dl/extractor/frontendmasters.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -163,7 +160,6 @@ class FrontendMastersIE(FrontendMastersBaseIE): 'format_id': format_id, }) formats.append(f) - self._sort_formats(formats) subtitles = { 'en': [{ diff --git a/hypervideo_dl/extractor/fujitv.py b/hypervideo_dl/extractor/fujitv.py index 4fdfe12..668bb27 100644 --- a/hypervideo_dl/extractor/fujitv.py +++ b/hypervideo_dl/extractor/fujitv.py @@ -1,5 
+1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals from ..utils import HEADRequest from .common import InfoExtractor @@ -19,7 +17,7 @@ class FujiTVFODPlus7IE(InfoExtractor): 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40110076', 'info_dict': { 'id': '5d40110076', - 'ext': 'mp4', + 'ext': 'ts', 'title': '#1318 『まる子、まぼろしの洋館を見る』の巻', 'series': 'ちびまる子ちゃん', 'series_id': '5d40', @@ -30,7 +28,7 @@ class FujiTVFODPlus7IE(InfoExtractor): 'url': 'https://fod.fujitv.co.jp/title/5d40/5d40810083', 'info_dict': { 'id': '5d40810083', - 'ext': 'mp4', + 'ext': 'ts', 'title': '#1324 『まる子とオニの子』の巻/『結成!2月をムダにしない会』の巻', 'description': 'md5:3972d900b896adc8ab1849e310507efa', 'series': 'ちびまる子ちゃん', @@ -47,19 +45,18 @@ class FujiTVFODPlus7IE(InfoExtractor): if token: json_info = self._download_json('https://fod-sp.fujitv.co.jp/apps/api/episode/detail/?ep_id=%s&is_premium=false' % video_id, video_id, headers={'x-authorization': f'Bearer {token.value}'}, fatal=False) else: - self.report_warning(f'The token cookie is needed to extract video metadata. {self._LOGIN_HINTS["cookies"]}') + self.report_warning(f'The token cookie is needed to extract video metadata. {self._login_hint("cookies")}') formats, subtitles = [], {} src_json = self._download_json(f'{self._BASE_URL}abrjson_v2/tv_android/{video_id}', video_id) for src in src_json['video_selector']: if not src.get('url'): continue - fmt, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, 'mp4') + fmt, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, 'ts') for f in fmt: f.update(dict(zip(('height', 'width'), self._BITRATE_MAP.get(f.get('tbr'), ())))) formats.extend(fmt) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats, ['tbr']) return { 'id': video_id, @@ -70,4 +67,5 @@ class FujiTVFODPlus7IE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'thumbnail': f'{self._BASE_URL}img/program/{series_id}/episode/{video_id}_a.jpg', + '_format_sort_fields': ('tbr', ) } diff --git a/hypervideo_dl/extractor/funimation.py b/hypervideo_dl/extractor/funimation.py index 6aa9bc9..18363c1 100644 --- a/hypervideo_dl/extractor/funimation.py +++ b/hypervideo_dl/extractor/funimation.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random import re import string @@ -8,17 +5,18 @@ import string from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( + ExtractorError, determine_ext, int_or_none, join_nonempty, js_to_json, + make_archive_id, orderedSet, qualities, str_or_none, traverse_obj, try_get, urlencode_postdata, - ExtractorError, ) @@ -245,11 +243,14 @@ class FunimationIE(FunimationBaseIE): 'language_preference': language_preference(lang.lower()), }) formats.extend(current_formats) + if not formats and (requested_languages or requested_versions): + self.raise_no_formats( + 'There are no video formats matching the requested languages/versions', expected=True, video_id=display_id) self._remove_duplicate_formats(formats) - self._sort_formats(formats, ('lang', 'source')) return { - 'id': initial_experience_id if only_initial_experience else episode_id, + 'id': episode_id, + '_old_archive_ids': [make_archive_id(self, initial_experience_id)], 'display_id': display_id, 'duration': duration, 'title': episode['episodeTitle'], @@ -264,6 +265,7 @@ class FunimationIE(FunimationBaseIE): 'formats': formats, 'thumbnails': thumbnails, 'subtitles': subtitles, + '_format_sort_fields': ('lang', 'source'), } def _get_subtitles(self, 
subtitles, experience_id, episode, display_id, format_name): diff --git a/hypervideo_dl/extractor/funk.py b/hypervideo_dl/extractor/funk.py index 2c5cfe8..539d719 100644 --- a/hypervideo_dl/extractor/funk.py +++ b/hypervideo_dl/extractor/funk.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from .nexx import NexxIE from ..utils import ( diff --git a/hypervideo_dl/extractor/fusion.py b/hypervideo_dl/extractor/fusion.py index a3f44b8..689422f 100644 --- a/hypervideo_dl/extractor/fusion.py +++ b/hypervideo_dl/extractor/fusion.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -72,7 +70,6 @@ class FusionIE(InfoExtractor): 'protocol': 'm3u8_native' if ext == 'm3u8' else 'https', }) if formats: - self._sort_formats(formats) info['formats'] = formats else: info.update({ diff --git a/hypervideo_dl/extractor/fuyintv.py b/hypervideo_dl/extractor/fuyintv.py new file mode 100644 index 0000000..197901d --- /dev/null +++ b/hypervideo_dl/extractor/fuyintv.py @@ -0,0 +1,30 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class FuyinTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fuyin\.tv/html/(?:\d+)/(?P<id>\d+)\.html' + _TESTS = [{ + 'url': 'https://www.fuyin.tv/html/2733/44129.html', + 'info_dict': { + 'id': '44129', + 'ext': 'mp4', + 'title': '第1集', + 'description': 'md5:21a3d238dc8d49608e1308e85044b9c3', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + json_data = self._download_json( + 'https://www.fuyin.tv/api/api/tv.movie/url', + video_id, query={'urlid': f'{video_id}'}) + webpage = self._download_webpage(url, video_id, fatal=False) + + return { + 'id': video_id, + 'title': traverse_obj(json_data, ('data', 'title')), + 'url': json_data['data']['url'], + 'ext': 'mp4', + 'description': self._html_search_meta('description', webpage), + } diff --git a/hypervideo_dl/extractor/fxnetworks.py b/hypervideo_dl/extractor/fxnetworks.py deleted file mode 100644 index 00e6742..0000000 --- a/hypervideo_dl/extractor/fxnetworks.py +++ /dev/null @@ -1,77 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .adobepass import AdobePassIE -from ..utils import ( - extract_attributes, - int_or_none, - parse_age_limit, - smuggle_url, - update_url_query, -) - - -class FXNetworksIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/1032565827847', - 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', - 'info_dict': { - 'id': 'dRzwHC_MMqIv', - 'ext': 'mp4', - 'title': 'First Look: Better Things - Season 2', - 'description': 'Because real life is like a fart. Watch this FIRST LOOK to see what inspired the new season of Better Things.', - 'age_limit': 14, - 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20170825', - 'timestamp': 1503686274, - 'episode_number': 0, - 'season_number': 2, - 'series': 'Better Things', - }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.simpsonsworld.com/video/716094019682', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - if 'The content you are trying to access is not available in your region.' 
in webpage: - self.raise_geo_restricted() - video_data = extract_attributes(self._search_regex( - r'(<a.+?rel="https?://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) - player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) - release_url = video_data['rel'] - title = video_data['data-title'] - rating = video_data.get('data-rating') - query = { - 'mbr': 'true', - } - if player_type == 'movies': - query.update({ - 'manifest': 'm3u', - }) - else: - query.update({ - 'switch': 'http', - }) - if video_data.get('data-req-auth') == '1': - resource = self._get_mvpd_resource( - video_data['data-channel'], title, - video_data.get('data-guid'), rating) - query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) - - return { - '_type': 'url_transparent', - 'id': video_id, - 'title': title, - 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), - 'series': video_data.get('data-show-title'), - 'episode_number': int_or_none(video_data.get('data-episode')), - 'season_number': int_or_none(video_data.get('data-season')), - 'thumbnail': video_data.get('data-large-thumb'), - 'age_limit': parse_age_limit(rating), - 'ie_key': 'ThePlatform', - } diff --git a/hypervideo_dl/extractor/gab.py b/hypervideo_dl/extractor/gab.py index 9ba0b1c..5016e2f 100644 --- a/hypervideo_dl/extractor/gab.py +++ b/hypervideo_dl/extractor/gab.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -57,7 +54,6 @@ class GabTVIE(InfoExtractor): else: frmt['height'] = str_to_int(resolution.replace('p', '')) formats.append(frmt) - self._sort_formats(formats) return { 'id': id, @@ -123,8 +119,6 @@ class GabIE(InfoExtractor): } for url, f in ((media.get('url'), metadata.get('original') or {}), (media.get('source_mp4'), metadata.get('playable') or {})) if url] - self._sort_formats(formats) - author = json_data.get('account') or {} entries.append({ 'id': f'{post_id}-{idx}', diff --git a/hypervideo_dl/extractor/gaia.py b/hypervideo_dl/extractor/gaia.py index 5b0195c..c84386f 100644 --- a/hypervideo_dl/extractor/gaia.py +++ b/hypervideo_dl/extractor/gaia.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import ( compat_str, @@ -92,7 +88,6 @@ class GaiaIE(InfoExtractor): media_id, headers=headers) formats = self._extract_m3u8_formats( media['mediaUrls']['bcHLS'], media_id, 'mp4') - self._sort_formats(formats) subtitles = {} text_tracks = media.get('textTracks', {}) diff --git a/hypervideo_dl/extractor/gameinformer.py b/hypervideo_dl/extractor/gameinformer.py index f1b96c1..2664edb 100644 --- a/hypervideo_dl/extractor/gameinformer.py +++ b/hypervideo_dl/extractor/gameinformer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .brightcove import BrightcoveNewIE from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/gamejolt.py b/hypervideo_dl/extractor/gamejolt.py index a13e528..440b832 100644 --- a/hypervideo_dl/extractor/gamejolt.py +++ b/hypervideo_dl/extractor/gamejolt.py @@ -1,4 +1,3 @@ -# coding: utf-8 import itertools import json import math diff --git a/hypervideo_dl/extractor/gamespot.py b/hypervideo_dl/extractor/gamespot.py index 7a1beae..8dec252 100644 --- a/hypervideo_dl/extractor/gamespot.py +++ b/hypervideo_dl/extractor/gamespot.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .once import 
OnceIE from ..compat import compat_urllib_parse_unquote @@ -67,8 +65,6 @@ class GameSpotIE(OnceIE): formats.extend(self._extract_mpd_formats( mpd_url, page_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) - return { 'id': data_video.get('guid') or page_id, 'display_id': page_id, diff --git a/hypervideo_dl/extractor/gamestar.py b/hypervideo_dl/extractor/gamestar.py index e882fa6..e9966f5 100644 --- a/hypervideo_dl/extractor/gamestar.py +++ b/hypervideo_dl/extractor/gamestar.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/gaskrank.py b/hypervideo_dl/extractor/gaskrank.py index 03acd2a..e0bbdae 100644 --- a/hypervideo_dl/extractor/gaskrank.py +++ b/hypervideo_dl/extractor/gaskrank.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor from ..utils import ( @@ -96,6 +93,5 @@ class GaskrankIE(InfoExtractor): 'view_count': view_count, 'average_rating': average_rating, }) - self._sort_formats(entry['formats']) return entry diff --git a/hypervideo_dl/extractor/gazeta.py b/hypervideo_dl/extractor/gazeta.py index 3671870..c6868a6 100644 --- a/hypervideo_dl/extractor/gazeta.py +++ b/hypervideo_dl/extractor/gazeta.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/gdcvault.py b/hypervideo_dl/extractor/gdcvault.py index c3ad6b4..2878bbd 100644 --- a/hypervideo_dl/extractor/gdcvault.py +++ b/hypervideo_dl/extractor/gdcvault.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/gedidigital.py b/hypervideo_dl/extractor/gedidigital.py index ec386c2..1878d63 100644 --- a/hypervideo_dl/extractor/gedidigital.py +++ b/hypervideo_dl/extractor/gedidigital.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -14,7 +11,7 @@ from ..utils import ( class GediDigitalIE(InfoExtractor): - _VALID_URL = r'''(?x)(?P<url>(?:https?:)//video\. + _VALID_URL = r'''(?x:(?P<base_url>(?:https?:)//video\. 
(?: (?: (?:espresso\.)?repubblica @@ -36,7 +33,13 @@ class GediDigitalIE(InfoExtractor): |corrierealpi |lasentinella )\.gelocal - )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*)''' + )\.it(?:/[^/]+){2,4}/(?P<id>\d+))(?:$|[?&].*))''' + _EMBED_REGEX = [rf'''(?x) + (?: + data-frame-src=| + <iframe[^\n]+src= + ) + (["'])(?P<url>{_VALID_URL})\1'''] _TESTS = [{ 'url': 'https://video.lastampa.it/politica/il-paradosso-delle-regionali-la-lega-vince-ma-sembra-aver-perso/121559/121683', 'md5': '84658d7fb9e55a6e57ecc77b73137494', @@ -112,22 +115,9 @@ class GediDigitalIE(InfoExtractor): urls[i] = urljoin(base_url(e), url_basename(e)) return urls - @staticmethod - def _extract_urls(webpage): - entries = [ - mobj.group('eurl') - for mobj in re.finditer(r'''(?x) - (?: - data-frame-src=| - <iframe[^\n]+src= - ) - (["'])(?P<eurl>%s)\1''' % GediDigitalIE._VALID_URL, webpage)] - return GediDigitalIE._sanitize_urls(entries) - - @staticmethod - def _extract_url(webpage): - urls = GediDigitalIE._extract_urls(webpage) - return urls[0] if urls else None + @classmethod + def _extract_embed_urls(cls, url, webpage): + return cls._sanitize_urls(tuple(super()._extract_embed_urls(url, webpage))) @staticmethod def _clean_formats(formats): @@ -142,8 +132,7 @@ class GediDigitalIE(InfoExtractor): formats[:] = clean_formats def _real_extract(self, url): - video_id = self._match_id(url) - url = self._match_valid_url(url).group('url') + video_id, url = self._match_valid_url(url).group('id', 'base_url') webpage = self._download_webpage(url, video_id) title = self._html_search_meta( ['twitter:title', 'og:title'], webpage, fatal=True) @@ -197,7 +186,6 @@ class GediDigitalIE(InfoExtractor): duration = int_or_none(v) self._clean_formats(formats) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/generic.py b/hypervideo_dl/extractor/generic.py index 03e6eb2..f28a77e 100644 --- a/hypervideo_dl/extractor/generic.py +++ b/hypervideo_dl/extractor/generic.py @@ -1,162 +1,49 @@ -# coding: utf-8 - -from __future__ import unicode_literals - import os import re -import sys +import types +import urllib.parse +import xml.etree.ElementTree -from .common import InfoExtractor +from .common import InfoExtractor # isort: split +from .commonprotocols import RtmpIE from .youtube import YoutubeIE -from ..compat import ( - compat_etree_fromstring, - compat_str, - compat_urllib_parse_unquote, - compat_urlparse, - compat_xml_parse_error, -) +from ..compat import compat_etree_fromstring from ..utils import ( + KNOWN_EXTENSIONS, + MEDIA_EXTENSIONS, + ExtractorError, + UnsupportedError, determine_ext, dict_get, - ExtractorError, - float_or_none, - HEADRequest, + format_field, int_or_none, is_html, js_to_json, - KNOWN_EXTENSIONS, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_resolution, - sanitized_Request, smuggle_url, str_or_none, + traverse_obj, + try_call, unescapeHTML, unified_timestamp, unsmuggle_url, - UnsupportedError, url_or_none, + variadic, xpath_attr, xpath_text, xpath_with_ns, ) -from .commonprotocols import RtmpIE -from .brightcove import ( - BrightcoveLegacyIE, - BrightcoveNewIE, -) -from .nexx import ( - NexxIE, - NexxEmbedIE, -) -from .nbc import NBCSportsVPlayerIE -from .ooyala import OoyalaIE -from .rutv import RUTVIE -from .tvc import TVCIE -from .sportbox import SportBoxIE -from .myvi import MyviIE -from .condenast import CondeNastIE -from .udn import UDNEmbedIE -from .senategov import SenateISVPIE -from .svt import SVTIE -from .pornhub import PornHubIE -from .xhamster import 
XHamsterEmbedIE -from .tnaflix import TNAFlixNetworkEmbedIE -from .drtuber import DrTuberIE -from .redtube import RedTubeIE -from .tube8 import Tube8IE -from .mofosex import MofosexEmbedIE -from .spankwire import SpankwireIE -from .youporn import YouPornIE -from .vimeo import ( - VimeoIE, - VHXEmbedIE, -) -from .dailymotion import DailymotionIE -from .dailymail import DailyMailIE -from .onionstudios import OnionStudiosIE -from .viewlift import ViewLiftEmbedIE -from .mtv import MTVServicesEmbeddedIE -from .pladform import PladformIE -from .videomore import VideomoreIE -from .webcaster import WebcasterFeedIE -from .googledrive import GoogleDriveIE -from .jwplatform import JWPlatformIE -from .digiteka import DigitekaIE -from .arkena import ArkenaIE -from .instagram import InstagramIE -from .threeqsdn import ThreeQSDNIE -from .theplatform import ThePlatformIE -from .kaltura import KalturaIE -from .eagleplatform import EaglePlatformIE -from .facebook import FacebookIE -from .soundcloud import SoundcloudEmbedIE -from .tunein import TuneInBaseIE -from .vbox7 import Vbox7IE -from .dbtv import DBTVIE -from .piksel import PikselIE -from .videa import VideaIE -from .twentymin import TwentyMinutenIE -from .ustream import UstreamIE -from .arte import ArteTVEmbedIE -from .videopress import VideoPressIE -from .rutube import RutubeIE -from .glomex import GlomexEmbedIE -from .megatvcom import MegaTVComEmbedIE -from .ant1newsgr import Ant1NewsGrEmbedIE -from .limelight import LimelightBaseIE -from .anvato import AnvatoIE -from .washingtonpost import WashingtonPostIE -from .wistia import WistiaIE -from .mediaset import MediasetIE -from .joj import JojIE -from .megaphone import MegaphoneIE -from .vzaar import VzaarIE -from .channel9 import Channel9IE -from .vshare import VShareIE -from .mediasite import MediasiteIE -from .springboardplatform import SpringboardPlatformIE -from .ted import TedEmbedIE -from .yapfiles import YapFilesIE -from .vice import ViceIE -from .xfileshare import XFileShareIE -from .cloudflarestream import CloudflareStreamIE -from .peertube import PeerTubeIE -from .teachable import TeachableIE -from .indavideo import IndavideoEmbedIE -from .apa import APAIE -from .foxnews import FoxNewsIE -from .viqeo import ViqeoIE -from .expressen import ExpressenIE -from .zype import ZypeIE -from .odnoklassniki import OdnoklassnikiIE -from .vk import VKIE -from .kinja import KinjaEmbedIE -from .gedidigital import GediDigitalIE -from .rcs import RCSEmbedsIE -from .bitchute import BitChuteIE -from .rumble import RumbleEmbedIE -from .arcpublishing import ArcPublishingIE -from .medialaan import MedialaanIE -from .simplecast import SimplecastIE -from .wimtv import WimTVIE -from .tvopengr import TVOpenGrEmbedIE -from .ertgr import ERTWebtvEmbedIE -from .tvp import TVPEmbedIE -from .blogger import BloggerIE -from .mainstreaming import MainStreamingIE -from .gfycat import GfycatIE -from .panopto import PanoptoBaseIE -from .ruutu import RuutuIE class GenericIE(InfoExtractor): IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = 'generic' - _NETRC_MACHINE = False # Supress username warning + _NETRC_MACHINE = False # Suppress username warning _TESTS = [ # Direct link to a video { @@ -474,188 +361,6 @@ class GenericIE(InfoExtractor): }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, - { - # embedded brightcove video - # it also tests brightcove videos that need to set the 'Referer' - # in the http requests - 'add_ie': ['BrightcoveLegacy'], - 'url': 
'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', - 'info_dict': { - 'id': '2765128793001', - 'ext': 'mp4', - 'title': 'Le cours de bourse : l’analyse technique', - 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', - 'uploader': 'BFM BUSINESS', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # embedded with itemprop embedURL and video id spelled as `idVideo` - 'add_id': ['BrightcoveLegacy'], - 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', - 'info_dict': { - 'id': '5255628253001', - 'ext': 'mp4', - 'title': 'md5:37c519b1128915607601e75a87995fc0', - 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', - 'uploader': 'BFM BUSINESS', - 'uploader_id': '876450612001', - 'timestamp': 1482255315, - 'upload_date': '20161220', - }, - 'params': { - 'skip_download': True, - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/2253 - 'url': 'http://bcove.me/i6nfkrc3', - 'md5': '0ba9446db037002366bab3b3eb30c88c', - 'info_dict': { - 'id': '3101154703001', - 'ext': 'mp4', - 'title': 'Still no power', - 'uploader': 'thestar.com', - 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', - }, - 'add_ie': ['BrightcoveLegacy'], - 'skip': 'video gone', - }, - { - 'url': 'http://www.championat.com/video/football/v/87/87499.html', - 'md5': 'fb973ecf6e4a78a67453647444222983', - 'info_dict': { - 'id': '3414141473001', - 'ext': 'mp4', - 'title': 'Видео. Удаление Дзагоева (ЦСКА)', - 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', - 'uploader': 'Championat', - }, - }, - { - # https://github.com/ytdl-org/youtube-dl/issues/3541 - 'add_ie': ['BrightcoveLegacy'], - 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', - 'info_dict': { - 'id': '3866516442001', - 'ext': 'mp4', - 'title': 'Leer mij vrouwen kennen: Aflevering 1', - 'description': 'Leer mij vrouwen kennen: Aflevering 1', - 'uploader': 'SBS Broadcasting', - }, - 'skip': 'Restricted to Netherlands', - 'params': { - 'skip_download': True, # m3u8 download - }, - }, - { - # Brightcove video in <iframe> - 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', - 'md5': '36d74ef5e37c8b4a2ce92880d208b968', - 'info_dict': { - 'id': '5360463607001', - 'ext': 'mp4', - 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活', - 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', - 'uploader': 'United Nations', - 'uploader_id': '1362235914001', - 'timestamp': 1489593889, - 'upload_date': '20170315', - }, - 'add_ie': ['BrightcoveLegacy'], - }, - { - # Brightcove with alternative playerID key - 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', - 'info_dict': { - 'id': 'nmeth.2062_SV1', - 'title': 'Simultaneous multiview imaging of the Drosophila syncytial blastoderm : Quantitative high-speed imaging of entire developing embryos with simultaneous multiview light-sheet microscopy : Nature Methods : Nature Research', - }, - 'playlist': [{ - 'info_dict': { - 'id': '2228375078001', - 'ext': 'mp4', - 'title': 'nmeth.2062-sv1', - 'description': 'nmeth.2062-sv1', - 'timestamp': 1363357591, - 'upload_date': '20130315', - 'uploader': 'Nature Publishing Group', - 'uploader_id': '1964492299001', - }, - }], - }, - { - # Brightcove with UUID in videoPlayer 
- 'url': 'http://www8.hp.com/cn/zh/home.html', - 'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - 'skip': 'video rotates...weekly?', - }, - { - # Brightcove:new type [2]. - 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis', - 'md5': '2b35148fcf48da41c9fb4591650784f3', - 'info_dict': { - 'id': '5348741021001', - 'ext': 'mp4', - 'upload_date': '20170306', - 'uploader_id': '4191638492001', - 'timestamp': 1488769918, - 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis', - - }, - }, - { - # Alternative brightcove <video> attributes - 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/', - 'info_dict': { - 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche', - 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs", - }, - 'playlist': [{ - 'md5': '732d22ba3d33f2f3fc253c39f8f36523', - 'info_dict': { - 'id': '5311302538001', - 'ext': 'mp4', - 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche", - 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)", - 'timestamp': 1486321708, - 'upload_date': '20170205', - 'uploader_id': '800000640001', - }, - 'only_matching': True, - }], - }, - { - # Brightcove with UUID in videoPlayer - 'url': 'http://www8.hp.com/cn/zh/home.html', - 'info_dict': { - 'id': '5255815316001', - 'ext': 'mp4', - 'title': 'Sprocket Video - China', - 'description': 'Sprocket Video - China', - 'uploader': 'HP-Video Gallery', - 'timestamp': 1482263210, - 'upload_date': '20161220', - 'uploader_id': '1107601872001', - }, - 'params': { - 'skip_download': True, # m3u8 download - }, - }, # ooyala video { 'url': 'http://www.rollingstone.com/music/videos/norwegian-dj-cashmere-cat-goes-spartan-on-with-me-premiere-20131219', @@ -947,45 +652,6 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, - # YouTube <object> embed - { - 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/', - 'md5': '516718101ec834f74318df76259fb3cc', - 'info_dict': { - 'id': 'msN87y-iEx0', - 'ext': 'webm', - 'title': 'Feynman: Mirrors FUN TO IMAGINE 6', - 'upload_date': '20080526', - 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d', - 'uploader': 'Christopher Sykes', - 'uploader_id': 'ChristopherJSykes', - }, - 'add_ie': ['Youtube'], - }, - # Camtasia studio - { - 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', - 'playlist': [{ - 'md5': '0c5e352edabf715d762b0ad4e6d9ee67', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - video1', - 'ext': 'flv', - 'duration': 2235.90, - } - }, { - 'md5': '10e4bb3aaca9fd630e273ff92d9f3c63', - 'info_dict': { - 'id': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final_PIP', - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final - 
pip', - 'ext': 'flv', - 'duration': 2235.93, - } - }], - 'info_dict': { - 'title': 'Fenn-AA_PA_Radar_Course_Lecture_1c_Final', - } - }, # Flowplayer { 'url': 'http://www.handjobhub.com/video/busty-blonde-siri-tit-fuck-while-wank-6313.html', @@ -998,20 +664,6 @@ class GenericIE(InfoExtractor): 'title': 'Busty Blonde Siri Tit Fuck While Wank at HandjobHub.com', } }, - # Multiple brightcove videos - # https://github.com/ytdl-org/youtube-dl/issues/2283 - { - 'url': 'http://www.newyorker.com/online/blogs/newsdesk/2014/01/always-never-nuclear-command-and-control.html', - 'info_dict': { - 'id': 'always-never', - 'title': 'Always / Never - The New Yorker', - }, - 'playlist_count': 3, - 'params': { - 'extract_flat': False, - 'skip_download': True, - } - }, # MLB embed { 'url': 'http://umpire-empire.com/index.php/topic/58125-laz-decides-no-thats-low/', @@ -1027,36 +679,6 @@ class GenericIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', }, }, - # Wistia embed - { - 'url': 'http://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', - 'md5': '1953f3a698ab51cfc948ed3992a0b7ff', - 'info_dict': { - 'id': '6e2wtrbdaf', - 'ext': 'mov', - 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', - 'description': 'a Paywall Videos video from Remilon', - 'duration': 644.072, - 'uploader': 'study.com', - 'timestamp': 1459678540, - 'upload_date': '20160403', - 'filesize': 24687186, - }, - }, - { - 'url': 'http://thoughtworks.wistia.com/medias/uxjb0lwrcz', - 'md5': 'baf49c2baa8a7de5f3fc145a8506dcd4', - 'info_dict': { - 'id': 'uxjb0lwrcz', - 'ext': 'mp4', - 'title': 'Conversation about Hexagonal Rails Part 1', - 'description': 'a Martin Fowler video from ThoughtWorks', - 'duration': 1715.0, - 'uploader': 'thoughtworks.wistia.com', - 'timestamp': 1401832161, - 'upload_date': '20140603', - }, - }, # Wistia standard embed (async) { 'url': 'https://www.getdrip.com/university/brennan-dunn-drip-workshop/', @@ -1071,7 +693,8 @@ class GenericIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'webpage 404 not found', }, # Soundcloud embed { @@ -1255,18 +878,6 @@ class GenericIE(InfoExtractor): } }, { - # JWPlatform iframe - 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', - 'info_dict': { - 'id': 'AG26UQXM', - 'ext': 'mp4', - 'upload_date': '20160719', - 'timestamp': 468923808, - 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', - }, - 'add_ie': [JWPlatformIE.ie_key()], - }, - { # Video.js embed, multiple formats 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', 'info_dict': { @@ -1545,21 +1156,6 @@ class GenericIE(InfoExtractor): }, 'expected_warnings': ['Failed to parse JSON Expecting value'], }, - # Brightcove URL in single quotes - { - 'url': 'http://www.sportsnet.ca/baseball/mlb/sn-presents-russell-martin-world-citizen/', - 'md5': '4ae374f1f8b91c889c4b9203c8c752af', - 'info_dict': { - 'id': '4255764656001', - 'ext': 'mp4', - 'title': 'SN Presents: Russell Martin, World Citizen', - 'description': 'To understand why he was the Toronto Blue Jays’ top off-season priority is to appreciate his background and upbringing in Montreal, where he first developed his baseball skills. 
Written and narrated by Stephen Brunt.', - 'uploader': 'Rogers Sportsnet', - 'uploader_id': '1704050871', - 'upload_date': '20150525', - 'timestamp': 1432570283, - }, - }, # Kinja embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', @@ -1595,52 +1191,6 @@ class GenericIE(InfoExtractor): 'duration': 248.667, }, }, - # BrightcoveInPageEmbed embed - { - 'url': 'http://www.geekandsundry.com/tabletop-bonus-wils-final-thoughts-on-dread/', - 'info_dict': { - 'id': '4238694884001', - 'ext': 'flv', - 'title': 'Tabletop: Dread, Last Thoughts', - 'description': 'Tabletop: Dread, Last Thoughts', - 'duration': 51690, - }, - }, - # Brightcove embed, with no valid 'renditions' but valid 'IOSRenditions' - # This video can't be played in browsers if Flash disabled and UA set to iPhone, which is actually a false alarm - { - 'url': 'https://dl.dropboxusercontent.com/u/29092637/interview.html', - 'info_dict': { - 'id': '4785848093001', - 'ext': 'mp4', - 'title': 'The Cardinal Pell Interview', - 'description': 'Sky News Contributor Andrew Bolt interviews George Pell in Rome, following the Cardinal\'s evidence before the Royal Commission into Child Abuse. ', - 'uploader': 'GlobeCast Australia - GlobeStream', - 'uploader_id': '2733773828001', - 'upload_date': '20160304', - 'timestamp': 1457083087, - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, - }, - { - # Brightcove embed with whitespace around attribute names - 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', - 'info_dict': { - 'id': '3167554373001', - 'ext': 'mp4', - 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", - 'description': 'md5:57bacb0e0f29349de4972bfda3191713', - 'uploader_id': '1079349493', - 'upload_date': '20140207', - 'timestamp': 1391810548, - }, - 'params': { - 'skip_download': True, - }, - }, # Another form of arte.tv embed { 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', @@ -1691,7 +1241,7 @@ class GenericIE(InfoExtractor): 'timestamp': 1464107587, 'uploader': 'TheAtlantic', }, - 'add_ie': ['BrightcoveLegacy'], + 'skip': 'Private Youtube video', }, # Facebook <iframe> embed { @@ -1800,7 +1350,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [ArkenaIE.ie_key()], + 'add_ie': ['Arkena'], }, { 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', @@ -1812,7 +1362,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [Vbox7IE.ie_key()], + 'add_ie': ['Vbox7'], }, { # DBTV embeds @@ -1844,7 +1394,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [TwentyMinutenIE.ie_key()], + 'add_ie': ['TwentyMinuten'], }, { # VideoPress embed @@ -1859,7 +1409,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [VideoPressIE.ie_key()], + 'add_ie': ['VideoPress'], }, { # Rutube embed @@ -1876,7 +1426,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [RutubeIE.ie_key()], + 'add_ie': ['Rutube'], }, { # 
glomex:embed @@ -1948,7 +1498,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Integrated Senate Video Player', }, - 'add_ie': [SenateISVPIE.ie_key()], + 'add_ie': ['SenateISVP'], }, { # Limelight embeds (1 channel embed + 4 media embeds) @@ -1995,7 +1545,7 @@ class GenericIE(InfoExtractor): 'uploader': 'The Washington Post', 'upload_date': '20160211', }, - 'add_ie': [WashingtonPostIE.ie_key()], + 'add_ie': ['WashingtonPost'], }, { # Mediaset embed @@ -2008,7 +1558,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [MediasetIE.ie_key()], + 'add_ie': ['Mediaset'], }, { # JOJ.sk embeds @@ -2018,7 +1568,7 @@ class GenericIE(InfoExtractor): 'title': 'Slovenskom sa prehnala vlna silných búrok', }, 'playlist_mincount': 5, - 'add_ie': [JojIE.ie_key()], + 'add_ie': ['Joj'], }, { # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video) @@ -2084,7 +1634,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, - 'add_ie': [SpringboardPlatformIE.ie_key()], + 'add_ie': ['SpringboardPlatform'], }, { 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', @@ -2093,7 +1643,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Котята', }, - 'add_ie': [YapFilesIE.ie_key()], + 'add_ie': ['YapFiles'], 'params': { 'skip_download': True, }, @@ -2106,7 +1656,7 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': '31c9291ab41fac05471db4e73aa11717', }, - 'add_ie': [CloudflareStreamIE.ie_key()], + 'add_ie': ['CloudflareStream'], 'params': { 'skip_download': True, }, @@ -2133,7 +1683,7 @@ class GenericIE(InfoExtractor): 'uploader': 'StreetKitchen', 'uploader_id': '546363', }, - 'add_ie': [IndavideoEmbedIE.ie_key()], + 'add_ie': ['IndavideoEmbed'], 'params': { 'skip_download': True, }, @@ -2174,22 +1724,6 @@ class GenericIE(InfoExtractor): }, 'playlist_count': 6, }, - { - # Squarespace video embed, 2019-08-28 - 'url': 'http://ootboxford.com', - 'info_dict': { - 'id': 'Tc7b_JGdZfw', - 'title': 'Out of the Blue, at Childish Things 10', - 'ext': 'mp4', - 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', - 'uploader_id': 'helendouglashouse', - 'uploader': 'Helen & Douglas House', - 'upload_date': '20140328', - }, - 'params': { - 'skip_download': True, - }, - }, # { # # Zype embed # 'url': 'https://www.cookscountry.com/episode/554-smoky-barbecue-favorites', @@ -2508,10 +2042,10 @@ class GenericIE(InfoExtractor): # Panopto embeds 'url': 'https://www.monash.edu/learning-teaching/teachhq/learning-technologies/panopto/how-to/insert-a-quiz-into-a-panopto-video', 'info_dict': { - 'title': 'Insert a quiz into a Panopto video', - 'id': 'insert-a-quiz-into-a-panopto-video' + 'ext': 'mp4', + 'id': '0bd3f16c-824a-436a-8486-ac5900693aef', + 'title': 'Quizzes in Panopto', }, - 'playlist_count': 1 }, { # Ruutu embed @@ -2530,114 +2064,178 @@ class GenericIE(InfoExtractor): 'upload_date': '20220308', }, }, + { + # Multiple Ruutu embeds + 'url': 'https://www.hs.fi/kotimaa/art-2000008762560.html', + 'info_dict': { + 'title': 'Koronavirus | Epidemiahuippu voi olla Suomessa ohi, mutta koronaviruksen poistamista yleisvaarallisten tautien joukosta harkitaan vasta syksyllä', + 'id': 'art-2000008762560' + }, + 'playlist_count': 3 + }, + { + # Ruutu embed in hs.fi with a single video + 'url': 'https://www.hs.fi/kotimaa/art-2000008793421.html', + 'md5': 'f8964e65d8fada6e8a562389bf366bb4', + 'info_dict': { + 'id': '4081841', + 'ext': 'mp4', + 'title': 'Puolustusvoimat siirsi panssariajoneuvoja harjoituksiin 
Niinisaloon 2.5.2022', + 'thumbnail': r're:^https?://.+\.jpg$', + 'duration': 138, + 'age_limit': 0, + 'upload_date': '20220504', + }, + }, + { + # Webpage contains double BOM + 'url': 'https://www.filmarkivet.se/movies/paris-d-moll/', + 'md5': 'df02cadc719dcc63d43288366f037754', + 'info_dict': { + 'id': 'paris-d-moll', + 'ext': 'mp4', + 'upload_date': '20220518', + 'title': 'Paris d-moll', + 'description': 'md5:319e37ea5542293db37e1e13072fe330', + 'thumbnail': 'https://www.filmarkivet.se/wp-content/uploads/parisdmoll2.jpg', + 'timestamp': 1652833414, + 'age_limit': 0, + } + }, + { + 'url': 'https://www.mollymovieclub.com/p/interstellar?s=r#details', + 'md5': '198bde8bed23d0b23c70725c83c9b6d9', + 'info_dict': { + 'id': '53602801', + 'ext': 'mpga', + 'title': 'Interstellar', + 'description': 'Listen now | Episode One', + 'thumbnail': 'md5:c30d9c83f738e16d8551d7219d321538', + 'uploader': 'Molly Movie Club', + 'uploader_id': '839621', + }, + }, + { + 'url': 'https://www.blockedandreported.org/p/episode-117-lets-talk-about-depp?s=r', + 'md5': 'c0cc44ee7415daeed13c26e5b56d6aa0', + 'info_dict': { + 'id': '57962052', + 'ext': 'mpga', + 'title': 'md5:855b2756f0ee10f6723fa00b16266f8d', + 'description': 'md5:fe512a5e94136ad260c80bde00ea4eef', + 'thumbnail': 'md5:2218f27dfe517bb5ac16c47d0aebac59', + 'uploader': 'Blocked and Reported', + 'uploader_id': '500230', + }, + }, + { + 'url': 'https://www.skimag.com/video/ski-people-1980/', + 'md5': '022a7e31c70620ebec18deeab376ee03', + 'info_dict': { + 'id': 'YTmgRiNU', + 'ext': 'mp4', + 'title': '1980 Ski People', + 'timestamp': 1610407738, + 'description': 'md5:cf9c3d101452c91e141f292b19fe4843', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/YTmgRiNU/poster.jpg?width=720', + 'duration': 5688.0, + 'upload_date': '20210111', + } + }, + { + 'note': 'JSON LD with multiple @type', + 'url': 'https://www.nu.nl/280161/video/hoe-een-bladvlo-dit-verwoestende-japanse-onkruid-moet-vernietigen.html', + 'md5': 'c7949f34f57273013fb7ccb1156393db', + 'info_dict': { + 'id': 'ipy2AcGL', + 'ext': 'mp4', + 'description': 'md5:6a9d644bab0dc2dc06849c2505d8383d', + 'thumbnail': r're:https://media\.nu\.nl/m/.+\.jpg', + 'title': 'Hoe een bladvlo dit verwoestende Japanse onkruid moet vernietigen', + 'timestamp': 1586577474, + 'upload_date': '20200411', + 'age_limit': 0, + 'duration': 111.0, + } + }, + { + 'note': 'JSON LD with unexpected data type', + 'url': 'https://www.autoweek.nl/autotests/artikel/porsche-911-gt3-rs-rij-impressie-2/', + 'info_dict': { + 'id': 'porsche-911-gt3-rs-rij-impressie-2', + 'ext': 'mp4', + 'title': 'Test: Porsche 911 GT3 RS', + 'description': 'Je ziet het niet, maar het is er wel. Downforce, hebben we het dan over. 
En in de nieuwe Porsche 911 GT3 RS is er zelfs heel veel downforce.', + 'timestamp': 1664920902, + 'upload_date': '20221004', + 'thumbnail': r're:^https://media.autoweek.nl/m/.+\.jpg$', + 'age_limit': 0, + 'direct': True, + } + } ] def report_following_redirect(self, new_url): """Report information extraction.""" self._downloader.to_screen('[redirect] Following redirect to %s' % new_url) - def report_detected(self, name): - self._downloader.write_debug(f'Identified a {name}') + def report_detected(self, name, num=1, note=None): + if num > 1: + name += 's' + elif not num: + return + else: + num = 'a' - def _extract_rss(self, url, video_id, doc): - playlist_title = doc.find('./channel/title').text - playlist_desc_el = doc.find('./channel/description') - playlist_desc = None if playlist_desc_el is None else playlist_desc_el.text + self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') + def _fragment_query(self, url): + if self._configuration_arg('fragment_query'): + query_string = urllib.parse.urlparse(url).query + if query_string: + return {'extra_param_to_segment_url': query_string} + return {} + + def _extract_rss(self, url, video_id, doc): NS_MAP = { 'itunes': 'http://www.itunes.com/dtds/podcast-1.0.dtd', } entries = [] for it in doc.findall('./channel/item'): - next_url = None - enclosure_nodes = it.findall('./enclosure') - for e in enclosure_nodes: - next_url = e.attrib.get('url') - if next_url: - break - - if not next_url: - next_url = xpath_text(it, 'link', fatal=False) - + next_url = next( + (e.attrib.get('url') for e in it.findall('./enclosure')), + xpath_text(it, 'link', fatal=False)) if not next_url: continue - if it.find('guid').text is not None: - next_url = smuggle_url(next_url, {'force_videoid': it.find('guid').text}) + guid = try_call(lambda: it.find('guid').text) + if guid: + next_url = smuggle_url(next_url, {'force_videoid': guid}) def itunes(key): - return xpath_text( - it, xpath_with_ns('./itunes:%s' % key, NS_MAP), - default=None) - - duration = itunes('duration') - explicit = (itunes('explicit') or '').lower() - if explicit in ('true', 'yes'): - age_limit = 18 - elif explicit in ('false', 'no'): - age_limit = 0 - else: - age_limit = None + return xpath_text(it, xpath_with_ns(f'./itunes:{key}', NS_MAP), default=None) entries.append({ '_type': 'url_transparent', 'url': next_url, - 'title': it.find('title').text, + 'title': try_call(lambda: it.find('title').text), 'description': xpath_text(it, 'description', default=None), - 'timestamp': unified_timestamp( - xpath_text(it, 'pubDate', default=None)), - 'duration': int_or_none(duration) or parse_duration(duration), + 'timestamp': unified_timestamp(xpath_text(it, 'pubDate', default=None)), + 'duration': parse_duration(itunes('duration')), 'thumbnail': url_or_none(xpath_attr(it, xpath_with_ns('./itunes:image', NS_MAP), 'href')), 'episode': itunes('title'), 'episode_number': int_or_none(itunes('episode')), 'season_number': int_or_none(itunes('season')), - 'age_limit': age_limit, + 'age_limit': {'true': 18, 'yes': 18, 'false': 0, 'no': 0}.get((itunes('explicit') or '').lower()), }) return { '_type': 'playlist', 'id': url, - 'title': playlist_title, - 'description': playlist_desc, - 'entries': entries, - } - - def _extract_camtasia(self, url, video_id, webpage): - """ Returns None if no camtasia video can be found. 
""" - - camtasia_cfg = self._search_regex( - r'fo\.addVariable\(\s*"csConfigFile",\s*"([^"]+)"\s*\);', - webpage, 'camtasia configuration file', default=None) - if camtasia_cfg is None: - return None - - title = self._html_search_meta('DC.title', webpage, fatal=True) - - camtasia_url = compat_urlparse.urljoin(url, camtasia_cfg) - camtasia_cfg = self._download_xml( - camtasia_url, video_id, - note='Downloading camtasia configuration', - errnote='Failed to download camtasia configuration') - fileset_node = camtasia_cfg.find('./playlist/array/fileset') - - entries = [] - for n in fileset_node.getchildren(): - url_n = n.find('./uri') - if url_n is None: - continue - - entries.append({ - 'id': os.path.splitext(url_n.text.rpartition('/')[2])[0], - 'title': '%s - %s' % (title, n.tag), - 'url': compat_urlparse.urljoin(url, url_n.text), - 'duration': float_or_none(n.find('./duration').text), - }) - - return { - '_type': 'playlist', + 'title': try_call(lambda: doc.find('./channel/title').text), + 'description': try_call(lambda: doc.find('./channel/description').text), 'entries': entries, - 'title': title, } def _kvs_getrealurl(self, video_url, license_code): @@ -2651,7 +2249,7 @@ class GenericIE(InfoExtractor): for o in range(len(newmagic) - 1, -1, -1): new = '' - l = (o + sum([int(n) for n in license[o:]])) % 32 + l = (o + sum(int(n) for n in license[o:])) % 32 for i in range(0, len(newmagic)): if i == o: @@ -2682,7 +2280,7 @@ class GenericIE(InfoExtractor): if url.startswith('//'): return self.url_result(self.http_scheme() + url) - parsed_url = compat_urlparse.urlparse(url) + parsed_url = urllib.parse.urlparse(url) if not parsed_url.scheme: default_search = self.get_param('default_search') if default_search is None: @@ -2713,59 +2311,59 @@ class GenericIE(InfoExtractor): default_search += ':' return self.url_result(default_search + url) - url, smuggled_data = unsmuggle_url(url) + original_url = url + url, smuggled_data = unsmuggle_url(url, {}) force_videoid = None - is_intentional = smuggled_data and smuggled_data.get('to_generic') - if smuggled_data and 'force_videoid' in smuggled_data: + is_intentional = smuggled_data.get('to_generic') + if 'force_videoid' in smuggled_data: force_videoid = smuggled_data['force_videoid'] video_id = force_videoid else: video_id = self._generic_id(url) - self.to_screen('%s: Requesting header' % video_id) - - head_req = HEADRequest(url) - head_response = self._request_webpage( - head_req, video_id, - note=False, errnote='Could not send HEAD request to %s' % url, - fatal=False) - - if head_response is not False: - # Check for redirect - new_url = head_response.geturl() - if url != new_url: - self.report_following_redirect(new_url) - if force_videoid: - new_url = smuggle_url( - new_url, {'force_videoid': force_videoid}) - return self.url_result(new_url) - - full_response = None - if head_response is False: - request = sanitized_Request(url) - request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) - head_response = full_response + # Some webservers may serve compressed content of rather big size (e.g. gzipped flac) + # making it impossible to download only chunk of the file (yet we need only 512kB to + # test whether it's HTML or not). According to hypervideo default Accept-Encoding + # that will always result in downloading the whole file that is not desirable. + # Therefore for extraction pass we have to override Accept-Encoding to any in order + # to accept raw bytes and being able to download only a chunk. 
+        # It may probably be better to solve this by checking Content-Type for application/octet-stream
+        # after a HEAD request, but not sure if we can rely on this.
+        full_response = self._request_webpage(url, video_id, headers={
+            'Accept-Encoding': '*',
+            **smuggled_data.get('http_headers', {})
+        })
+        new_url = full_response.geturl()
+        if new_url == urllib.parse.urlparse(url)._replace(scheme='https').geturl():
+            url = new_url
+        elif url != new_url:
+            self.report_following_redirect(new_url)
+            if force_videoid:
+                new_url = smuggle_url(new_url, {'force_videoid': force_videoid})
+            return self.url_result(new_url)

        info_dict = {
            'id': video_id,
            'title': self._generic_title(url),
-            'timestamp': unified_timestamp(head_response.headers.get('Last-Modified'))
+            'timestamp': unified_timestamp(full_response.headers.get('Last-Modified'))
        }

        # Check for direct link to a video
-        content_type = head_response.headers.get('Content-Type', '').lower()
+        content_type = full_response.headers.get('Content-Type', '').lower()
        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>[^;\s]+)', content_type)
        if m:
            self.report_detected('direct video link')
-            format_id = compat_str(m.group('format_id'))
+            headers = smuggled_data.get('http_headers', {})
+            format_id = str(m.group('format_id'))
            subtitles = {}
            if format_id.endswith('mpegurl'):
-                formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4')
+                formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers)
+                info_dict.update(self._fragment_query(url))
            elif format_id.endswith('mpd') or format_id.endswith('dash+xml'):
-                formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id)
+                formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers)
+                info_dict.update(self._fragment_query(url))
            elif format_id == 'f4m':
-                formats = self._extract_f4m_formats(url, video_id)
+                formats = self._extract_f4m_formats(url, video_id, headers=headers)
            else:
                formats = [{
                    'format_id': format_id,
@@ -2773,28 +2371,16 @@ class GenericIE(InfoExtractor):
                    'vcodec': 'none' if m.group('type') == 'audio' else None
                }]
                info_dict['direct'] = True
-            self._sort_formats(formats)
-            info_dict['formats'] = formats
-            info_dict['subtitles'] = subtitles
+            info_dict.update({
+                'formats': formats,
+                'subtitles': subtitles,
+                'http_headers': headers,
+            })
            return info_dict

        if not self.get_param('test', False) and not is_intentional:
            force = self.get_param('force_generic_extractor', False)
-            self.report_warning(
-                '%s on generic information extractor.' % ('Forcing' if force else 'Falling back'))
-
-        if not full_response:
-            request = sanitized_Request(url)
-            # Some webservers may serve compressed content of rather big size (e.g. gzipped flac)
-            # making it impossible to download only chunk of the file (yet we need only 512kB to
-            # test whether it's HTML or not). According to hypervideo default Accept-Encoding
-            # that will always result in downloading the whole file that is not desirable.
-            # Therefore for extraction pass we have to override Accept-Encoding to any in order
-            # to accept raw bytes and being able to download only a chunk.
-            # It may probably better to solve this by checking Content-Type for application/octet-stream
-            # after HEAD request finishes, but not sure if we can rely on this.
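# A minimal, self-contained sketch of the sniffing strategy described in the
# comments above: override Accept-Encoding so the server may send raw bytes,
# then read only the first 512 bytes to decide whether the resource is a
# webpage at all. `sniff_is_html` is an illustrative name, not a hypervideo
# API; the extractor itself does the equivalent via `full_response.read(512)`
# just below.
import urllib.request

def sniff_is_html(url):
    req = urllib.request.Request(url, headers={'Accept-Encoding': '*'})
    with urllib.request.urlopen(req) as resp:
        head = resp.read(512).lstrip().lower()  # 512 bytes are enough for an HTML prologue
    return head.startswith(b'<!doctype') or b'<html' in head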
- request.add_header('Accept-Encoding', '*') - full_response = self._request_webpage(request, video_id) + self.report_warning('%s generic information extractor' % ('Forcing' if force else 'Falling back on')) first_bytes = full_response.read(512) @@ -2802,7 +2388,7 @@ class GenericIE(InfoExtractor): if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') - self._sort_formats(info_dict['formats']) + info_dict.update(self._fragment_query(url)) return info_dict # Maybe it's a direct link to a video? @@ -2828,7 +2414,7 @@ class GenericIE(InfoExtractor): try: try: doc = compat_etree_fromstring(webpage) - except compat_xml_parse_error: + except xml.etree.ElementTree.ParseError: doc = compat_etree_fromstring(webpage.encode('utf-8')) if doc.tag == 'rss': self.report_detected('RSS feed') @@ -2836,12 +2422,10 @@ class GenericIE(InfoExtractor): elif doc.tag == 'SmoothStreamingMedia': info_dict['formats'], info_dict['subtitles'] = self._parse_ism_formats_and_subtitles(doc, url) self.report_detected('ISM manifest') - self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): smil = self._parse_smil(doc, url, video_id) self.report_detected('SMIL file') - self._sort_formats(smil['formats']) return smil elif doc.tag == '{http://xspf.org/ns/0/}playlist': self.report_detected('XSPF playlist') @@ -2855,947 +2439,83 @@ class GenericIE(InfoExtractor): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) + info_dict.update(self._fragment_query(url)) self.report_detected('DASH manifest') - self._sort_formats(info_dict['formats']) return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): info_dict['formats'] = self._parse_f4m_formats(doc, url, video_id) self.report_detected('F4M manifest') - self._sort_formats(info_dict['formats']) return info_dict - except compat_xml_parse_error: + except xml.etree.ElementTree.ParseError: pass - # Is it a Camtasia project? - camtasia_res = self._extract_camtasia(url, video_id, webpage) - if camtasia_res is not None: - self.report_detected('Camtasia video') - return camtasia_res + info_dict.update({ + # it's tempting to parse this further, but you would + # have to take into account all the variations like + # Video Title - Site Name + # Site Name | Video Title + # Video Title - Tagline | Site Name + # and so on and so forth; it's just not practical + 'title': self._generic_title('', webpage, default='video'), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'age_limit': self._rta_search(webpage), + }) + + self._downloader.write_debug('Looking for embeds') + embeds = list(self._extract_embeds(original_url, webpage, urlh=full_response, info_dict=info_dict)) + if len(embeds) == 1: + return {**info_dict, **embeds[0]} + elif embeds: + return self.playlist_result(embeds, **info_dict) + raise UnsupportedError(url) + + def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): + """Returns an iterator of video entries""" + info_dict = types.MappingProxyType(info_dict) # Prevents accidental mutation + video_id = traverse_obj(info_dict, 'display_id', 'id') or self._generic_id(url) + url, smuggled_data = unsmuggle_url(url, {}) + actual_url = urlh.geturl() if urlh else url # Sometimes embedded video player is hidden behind percent encoding # (e.g. 
https://github.com/ytdl-org/youtube-dl/issues/2448) # Unescaping the whole page allows to handle those cases in a generic way # FIXME: unescaping the whole page may break URLs, commenting out for now. # There probably should be a second run of generic extractor on unescaped webpage. - # webpage = compat_urllib_parse_unquote(webpage) - - # Unescape squarespace embeds to be detected by generic extractor, - # see https://github.com/ytdl-org/youtube-dl/issues/21294 - webpage = re.sub( - r'<div[^>]+class=[^>]*?\bsqs-video-wrapper\b[^>]*>', - lambda x: unescapeHTML(x.group(0)), webpage) - - # it's tempting to parse this further, but you would - # have to take into account all the variations like - # Video Title - Site Name - # Site Name | Video Title - # Video Title - Tagline | Site Name - # and so on and so forth; it's just not practical - video_title = (self._og_search_title(webpage, default=None) - or self._html_extract_title(webpage, 'video title', default='video')) - - # Try to detect age limit automatically - age_limit = self._rta_search(webpage) - # And then there are the jokers who advertise that they use RTA, - # but actually don't. - AGE_LIMIT_MARKERS = [ - r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', - ] - if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): - age_limit = 18 - - # video uploader is domain name - video_uploader = self._search_regex( - r'^(?:https?://)?([^/]*)/.*', url, 'video uploader') - - video_description = self._og_search_description(webpage, default=None) - video_thumbnail = self._og_search_thumbnail(webpage, default=None) - - info_dict.update({ - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'age_limit': age_limit, - }) - - self._downloader.write_debug('Looking for video embeds') - - # Look for Brightcove Legacy Studio embeds - bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) - if bc_urls: - entries = [{ - '_type': 'url', - 'url': smuggle_url(bc_url, {'Referer': url}), - 'ie_key': 'BrightcoveLegacy' - } for bc_url in bc_urls] - - return { - '_type': 'playlist', - 'title': video_title, - 'id': video_id, - 'entries': entries, - } - - # Look for Brightcove New Studio embeds - bc_urls = BrightcoveNewIE._extract_urls(self, webpage) - if bc_urls: - return self.playlist_from_matches( - bc_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'referrer': url}), - ie='BrightcoveNew') - - # Look for Nexx embeds - nexx_urls = NexxIE._extract_urls(webpage) - if nexx_urls: - return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key()) - - # Look for Nexx iFrame embeds - nexx_embed_urls = NexxEmbedIE._extract_urls(webpage) - if nexx_embed_urls: - return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key()) - - # Look for ThePlatform embeds - tp_urls = ThePlatformIE._extract_urls(webpage) - if tp_urls: - return self.playlist_from_matches(tp_urls, video_id, video_title, ie='ThePlatform') - - arc_urls = ArcPublishingIE._extract_urls(webpage) - if arc_urls: - return self.playlist_from_matches(arc_urls, video_id, video_title, ie=ArcPublishingIE.ie_key()) - - mychannels_urls = MedialaanIE._extract_urls(webpage) - if mychannels_urls: - return self.playlist_from_matches( - mychannels_urls, video_id, video_title, ie=MedialaanIE.ie_key()) - - # Look for embedded rtl.nl player - matches = re.findall( - 
r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', - webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') - - vimeo_urls = VimeoIE._extract_urls(url, webpage) - if vimeo_urls: - return self.playlist_from_matches(vimeo_urls, video_id, video_title, ie=VimeoIE.ie_key()) - - vhx_url = VHXEmbedIE._extract_url(webpage) - if vhx_url: - return self.url_result(vhx_url, VHXEmbedIE.ie_key()) - - # Invidious Instances - # https://github.com/hypervideo/hypervideo/issues/195 - # https://github.com/iv-org/invidious/pull/1730 - youtube_url = self._search_regex( - r'<link rel="alternate" href="(https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"', - webpage, 'youtube link', default=None) - if youtube_url: - return self.url_result(youtube_url, YoutubeIE.ie_key()) - - # Look for YouTube embeds - youtube_urls = YoutubeIE._extract_urls(webpage) - if youtube_urls: - return self.playlist_from_matches( - youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key()) - - matches = DailymotionIE._extract_urls(webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title) - - # Look for embedded Dailymotion playlist player (#3822) - m = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.[a-z]{2,3}/widget/jukebox\?.+?)\1', webpage) - if m: - playlists = re.findall( - r'list\[\]=/playlist/([^/]+)/', unescapeHTML(m.group('url'))) - if playlists: - return self.playlist_from_matches( - playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) - - # Look for DailyMail embeds - dailymail_urls = DailyMailIE._extract_urls(webpage) - if dailymail_urls: - return self.playlist_from_matches( - dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key()) - - # Look for Teachable embeds, must be before Wistia - teachable_url = TeachableIE._extract_url(webpage, url) - if teachable_url: - return self.url_result(teachable_url) - - # Look for embedded Wistia player - wistia_urls = WistiaIE._extract_urls(webpage) - if wistia_urls: - playlist = self.playlist_from_matches(wistia_urls, video_id, video_title, ie=WistiaIE.ie_key()) - for entry in playlist['entries']: - entry.update({ - '_type': 'url_transparent', - 'uploader': video_uploader, - }) - return playlist - - # Look for SVT player - svt_url = SVTIE._extract_url(webpage) - if svt_url: - return self.url_result(svt_url, 'SVT') - - # Look for Bandcamp pages with custom domain - mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) - if mobj is not None: - burl = unescapeHTML(mobj.group(1)) - # Don't set the extractor because it can be a track url or an album - return self.url_result(burl) - - # Look for embedded Vevo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Viddler player - mobj = re.search( - r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NYTimes player - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Libsyn player - mobj = re.search( - 
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Ooyala videos - mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) - or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) - or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) - or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) - if mobj is not None: - embed_token = self._search_regex( - r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', - webpage, 'ooyala embed token', default=None) - return OoyalaIE._build_url_result(smuggle_url( - mobj.group('ec'), { - 'domain': url, - 'embed_token': embed_token, - })) - - # Look for multiple Ooyala embeds on SBN network websites - mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) - if mobj is not None: - embeds = self._parse_json(mobj.group(1), video_id, fatal=False) - if embeds: - return self.playlist_from_matches( - embeds, video_id, video_title, - getter=lambda v: OoyalaIE._url_for_embed_code(smuggle_url(v['provider_video_id'], {'domain': url})), ie='Ooyala') - - # Look for Aparat videos - mobj = re.search(r'<iframe .*?src="(http://www\.aparat\.com/video/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Aparat') - - # Look for MPORA videos - mobj = re.search(r'<iframe .*?src="(http://mpora\.(?:com|de)/videos/[^"]+)"', webpage) - if mobj is not None: - return self.url_result(mobj.group(1), 'Mpora') - - # Look for embedded Facebook player - facebook_urls = FacebookIE._extract_urls(webpage) - if facebook_urls: - return self.playlist_from_matches(facebook_urls, video_id, video_title) - - # Look for embedded VK player - mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'VK') + # webpage = urllib.parse.unquote(webpage) - # Look for embedded Odnoklassniki player - odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage) - if odnoklassniki_url: - return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) - - # Look for sibnet embedded player - sibnet_urls = VKIE._extract_sibnet_urls(webpage) - if sibnet_urls: - return self.playlist_from_matches(sibnet_urls, video_id, video_title) - - # Look for embedded ivi player - mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Ivi') - - # Look for embedded Huffington Post player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'HuffPost') - - # Look for embed.ly - mobj = re.search(r'class=["\']embedly-card["\'][^>]href=["\'](?P<url>[^"\']+)', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - mobj = re.search(r'class=["\']embedly-embed["\'][^>]src=["\'][^"\']*url=(?P<url>[^&]+)', webpage) - if mobj is not None: - return self.url_result(compat_urllib_parse_unquote(mobj.group('url'))) - - # Look for funnyordie embed - matches = re.findall(r'<iframe[^>]+?src="(https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"', webpage) - if 
matches: - return self.playlist_from_matches( - matches, video_id, video_title, getter=unescapeHTML, ie='FunnyOrDie') - - # Look for Simplecast embeds - simplecast_urls = SimplecastIE._extract_urls(webpage) - if simplecast_urls: - return self.playlist_from_matches( - simplecast_urls, video_id, video_title) - - # Look for BBC iPlayer embed - matches = re.findall(r'setPlaylist\("(https?://www\.bbc\.co\.uk/iplayer/[^/]+/[\da-z]{8})"\)', webpage) - if matches: - return self.playlist_from_matches(matches, video_id, video_title, ie='BBCCoUk') - - # Look for embedded RUTV player - rutv_url = RUTVIE._extract_url(webpage) - if rutv_url: - return self.url_result(rutv_url, 'RUTV') - - # Look for embedded TVC player - tvc_url = TVCIE._extract_url(webpage) - if tvc_url: - return self.url_result(tvc_url, 'TVC') - - # Look for embedded SportBox player - sportbox_urls = SportBoxIE._extract_urls(webpage) - if sportbox_urls: - return self.playlist_from_matches(sportbox_urls, video_id, video_title, ie=SportBoxIE.ie_key()) - - # Look for embedded XHamster player - xhamster_urls = XHamsterEmbedIE._extract_urls(webpage) - if xhamster_urls: - return self.playlist_from_matches(xhamster_urls, video_id, video_title, ie='XHamsterEmbed') - - # Look for embedded TNAFlixNetwork player - tnaflix_urls = TNAFlixNetworkEmbedIE._extract_urls(webpage) - if tnaflix_urls: - return self.playlist_from_matches(tnaflix_urls, video_id, video_title, ie=TNAFlixNetworkEmbedIE.ie_key()) - - # Look for embedded PornHub player - pornhub_urls = PornHubIE._extract_urls(webpage) - if pornhub_urls: - return self.playlist_from_matches(pornhub_urls, video_id, video_title, ie=PornHubIE.ie_key()) - - # Look for embedded DrTuber player - drtuber_urls = DrTuberIE._extract_urls(webpage) - if drtuber_urls: - return self.playlist_from_matches(drtuber_urls, video_id, video_title, ie=DrTuberIE.ie_key()) - - # Look for embedded RedTube player - redtube_urls = RedTubeIE._extract_urls(webpage) - if redtube_urls: - return self.playlist_from_matches(redtube_urls, video_id, video_title, ie=RedTubeIE.ie_key()) - - # Look for embedded Tube8 player - tube8_urls = Tube8IE._extract_urls(webpage) - if tube8_urls: - return self.playlist_from_matches(tube8_urls, video_id, video_title, ie=Tube8IE.ie_key()) - - # Look for embedded Mofosex player - mofosex_urls = MofosexEmbedIE._extract_urls(webpage) - if mofosex_urls: - return self.playlist_from_matches(mofosex_urls, video_id, video_title, ie=MofosexEmbedIE.ie_key()) - - # Look for embedded Spankwire player - spankwire_urls = SpankwireIE._extract_urls(webpage) - if spankwire_urls: - return self.playlist_from_matches(spankwire_urls, video_id, video_title, ie=SpankwireIE.ie_key()) - - # Look for embedded YouPorn player - youporn_urls = YouPornIE._extract_urls(webpage) - if youporn_urls: - return self.playlist_from_matches(youporn_urls, video_id, video_title, ie=YouPornIE.ie_key()) - - # Look for embedded Tvigle player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Tvigle') - - # Look for embedded TED player - ted_urls = TedEmbedIE._extract_urls(webpage) - if ted_urls: - return self.playlist_from_matches(ted_urls, video_id, video_title, ie=TedEmbedIE.ie_key()) - - # Look for embedded Ustream videos - ustream_url = UstreamIE._extract_url(webpage) - if ustream_url: - return self.url_result(ustream_url, UstreamIE.ie_key()) - - # Look for embedded arte.tv player - arte_urls = 
ArteTVEmbedIE._extract_urls(webpage) - if arte_urls: - return self.playlist_from_matches(arte_urls, video_id, video_title) - - # Look for embedded francetv player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?://)?embed\.francetv\.fr/\?ue=.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for embedded Myvi.ru player - myvi_url = MyviIE._extract_url(webpage) - if myvi_url: - return self.url_result(myvi_url) - - # Look for embedded soundcloud player - soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage) - if soundcloud_urls: - return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML) - - # Look for tunein player - tunein_urls = TuneInBaseIE._extract_urls(webpage) - if tunein_urls: - return self.playlist_from_matches(tunein_urls, video_id, video_title) - - # Look for embedded mtvservices player - mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) - if mtvservices_url: - return self.url_result(mtvservices_url, ie='MTVServicesEmbedded') - - # Look for embedded yahoo player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Yahoo') - - # Look for embedded sbs.com.au player - mobj = re.search( - r'''(?x) - (?: - <meta\s+property="og:video"\s+content=| - <iframe[^>]+?src= - ) - (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1''', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'SBS') - - # Look for embedded Cinchcast player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.cinchcast\.com/.+?)\1', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Cinchcast') - - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', - webpage) - if not mobj: - mobj = re.search( - r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'MLB') - - mobj = re.search( - r'<(?:iframe|script)[^>]+?src=(["\'])(?P<url>%s)\1' % CondeNastIE.EMBED_URL, - webpage) - if mobj is not None: - return self.url_result(self._proto_relative_url(mobj.group('url'), scheme='http:'), 'CondeNast') - - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"', - webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Livestream') - - # Look for Zapiks embed - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Zapiks') - - # Look for Kaltura embeds - kaltura_urls = KalturaIE._extract_urls(webpage) - if kaltura_urls: - return self.playlist_from_matches( - kaltura_urls, video_id, video_title, - getter=lambda x: smuggle_url(x, {'source_url': url}), - ie=KalturaIE.ie_key()) - - # Look for EaglePlatform embeds - eagleplatform_url = EaglePlatformIE._extract_url(webpage) - if eagleplatform_url: - return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key()) - - # Look for ClipYou (uses EaglePlatform) embeds - mobj = re.search( - r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage) - if mobj is not None: - return 
self.url_result('eagleplatform:%(host)s:%(id)s' % mobj.groupdict(), 'EaglePlatform') - - # Look for Pladform embeds - pladform_url = PladformIE._extract_url(webpage) - if pladform_url: - return self.url_result(pladform_url) - - # Look for Videomore embeds - videomore_url = VideomoreIE._extract_url(webpage) - if videomore_url: - return self.url_result(videomore_url) - - # Look for Webcaster embeds - webcaster_url = WebcasterFeedIE._extract_url(self, webpage) - if webcaster_url: - return self.url_result(webcaster_url, ie=WebcasterFeedIE.ie_key()) - - # Look for Playwire embeds - mobj = re.search( - r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for Crooks and Liars embeds - mobj = re.search( - r'<(?:iframe[^>]+src|param[^>]+value)=(["\'])(?P<url>(?:https?:)?//embed\.crooksandliars\.com/(?:embed|v)/.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url')) - - # Look for NBC Sports VPlayer embeds - nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) - if nbc_sports_url: - return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') - - # Look for NBC News embeds - nbc_news_embed_url = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1', webpage) - if nbc_news_embed_url: - return self.url_result(nbc_news_embed_url.group('url'), 'NBCNews') - - # Look for Google Drive embeds - google_drive_url = GoogleDriveIE._extract_url(webpage) - if google_drive_url: - return self.url_result(google_drive_url, 'GoogleDrive') - - # Look for UDN embeds - mobj = re.search( - r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) - if mobj is not None: - return self.url_result( - compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') - - # Look for Senate ISVP iframe - senate_isvp_url = SenateISVPIE._search_iframe_url(webpage) - if senate_isvp_url: - return self.url_result(senate_isvp_url, 'SenateISVP') - - # Look for Kinja embeds - kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url) - if kinja_embed_urls: - return self.playlist_from_matches( - kinja_embed_urls, video_id, video_title) - - # Look for OnionStudios embeds - onionstudios_url = OnionStudiosIE._extract_url(webpage) - if onionstudios_url: - return self.url_result(onionstudios_url) - - # Look for Blogger embeds - blogger_urls = BloggerIE._extract_urls(webpage) - if blogger_urls: - return self.playlist_from_matches(blogger_urls, video_id, video_title, ie=BloggerIE.ie_key()) - - # Look for ViewLift embeds - viewlift_url = ViewLiftEmbedIE._extract_url(webpage) - if viewlift_url: - return self.url_result(viewlift_url) - - # Look for JWPlatform embeds - jwplatform_urls = JWPlatformIE._extract_urls(webpage) - if jwplatform_urls: - return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) - - # Look for Digiteka embeds - digiteka_url = DigitekaIE._extract_url(webpage) - if digiteka_url: - return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) - - # Look for Arkena embeds - arkena_url = ArkenaIE._extract_url(webpage) - if arkena_url: - return self.url_result(arkena_url, ArkenaIE.ie_key()) - - # Look for Piksel embeds - piksel_url = PikselIE._extract_url(webpage) - if piksel_url: - return self.url_result(piksel_url, PikselIE.ie_key()) - - # Look for Limelight embeds - limelight_urls = LimelightBaseIE._extract_urls(webpage, url) - 
if limelight_urls: - return self.playlist_result( - limelight_urls, video_id, video_title, video_description) - - # Look for Anvato embeds - anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) - if anvato_urls: - return self.playlist_result( - anvato_urls, video_id, video_title, video_description) - - # Look for AdobeTVVideo embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), - 'AdobeTVVideo') - - # Look for Vine embeds - mobj = re.search( - r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') - - # Look for VODPlatform embeds - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1', - webpage) - if mobj is not None: - return self.url_result( - self._proto_relative_url(unescapeHTML(mobj.group('url'))), 'VODPlatform') - - # Look for Mangomolo embeds - mobj = re.search( - r'''(?x)<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?// - (?: - admin\.mangomolo\.com/analytics/index\.php/customers/embed| - player\.mangomolo\.com/v1 - )/ - (?: - video\?.*?\bid=(?P<video_id>\d+)| - (?:index|live)\?.*?\bchannelid=(?P<channel_id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+) - ).+?)\1''', webpage) - if mobj is not None: - info = { - '_type': 'url_transparent', - 'url': self._proto_relative_url(unescapeHTML(mobj.group('url'))), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - video_id = mobj.group('video_id') - if video_id: - info.update({ - 'ie_key': 'MangomoloVideo', - 'id': video_id, - }) - else: - info.update({ - 'ie_key': 'MangomoloLive', - 'id': mobj.group('channel_id'), - }) - return info - - # Look for Instagram embeds - instagram_embed_url = InstagramIE._extract_embed_url(webpage) - if instagram_embed_url is not None: - return self.url_result( - self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key()) - - # Look for 3Q SDN embeds - threeqsdn_url = ThreeQSDNIE._extract_url(webpage) - if threeqsdn_url: - return { - '_type': 'url_transparent', - 'ie_key': ThreeQSDNIE.ie_key(), - 'url': self._proto_relative_url(threeqsdn_url), - 'title': video_title, - 'description': video_description, - 'thumbnail': video_thumbnail, - 'uploader': video_uploader, - } - - # Look for VBOX7 embeds - vbox7_url = Vbox7IE._extract_url(webpage) - if vbox7_url: - return self.url_result(vbox7_url, Vbox7IE.ie_key()) - - # Look for DBTV embeds - dbtv_urls = DBTVIE._extract_urls(webpage) - if dbtv_urls: - return self.playlist_from_matches(dbtv_urls, video_id, video_title, ie=DBTVIE.ie_key()) - - # Look for Videa embeds - videa_urls = VideaIE._extract_urls(webpage) - if videa_urls: - return self.playlist_from_matches(videa_urls, video_id, video_title, ie=VideaIE.ie_key()) - - # Look for 20 minuten embeds - twentymin_urls = TwentyMinutenIE._extract_urls(webpage) - if twentymin_urls: - return self.playlist_from_matches( - twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key()) - - # Look for VideoPress embeds - videopress_urls = VideoPressIE._extract_urls(webpage) - if videopress_urls: - return self.playlist_from_matches( - videopress_urls, video_id, video_title, ie=VideoPressIE.ie_key()) - - # Look for Rutube embeds - 
rutube_urls = RutubeIE._extract_urls(webpage) - if rutube_urls: - return self.playlist_from_matches( - rutube_urls, video_id, video_title, ie=RutubeIE.ie_key()) - - # Look for Glomex embeds - glomex_urls = list(GlomexEmbedIE._extract_urls(webpage, url)) - if glomex_urls: - return self.playlist_from_matches( - glomex_urls, video_id, video_title, ie=GlomexEmbedIE.ie_key()) - - # Look for megatv.com embeds - megatvcom_urls = list(MegaTVComEmbedIE._extract_urls(webpage)) - if megatvcom_urls: - return self.playlist_from_matches( - megatvcom_urls, video_id, video_title, ie=MegaTVComEmbedIE.ie_key()) - - # Look for ant1news.gr embeds - ant1newsgr_urls = list(Ant1NewsGrEmbedIE._extract_urls(webpage)) - if ant1newsgr_urls: - return self.playlist_from_matches( - ant1newsgr_urls, video_id, video_title, ie=Ant1NewsGrEmbedIE.ie_key()) - - # Look for WashingtonPost embeds - wapo_urls = WashingtonPostIE._extract_urls(webpage) - if wapo_urls: - return self.playlist_from_matches( - wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) - - # Look for Mediaset embeds - mediaset_urls = MediasetIE._extract_urls(self, webpage) - if mediaset_urls: - return self.playlist_from_matches( - mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) - - # Look for JOJ.sk embeds - joj_urls = JojIE._extract_urls(webpage) - if joj_urls: - return self.playlist_from_matches( - joj_urls, video_id, video_title, ie=JojIE.ie_key()) - - # Look for megaphone.fm embeds - mpfn_urls = MegaphoneIE._extract_urls(webpage) - if mpfn_urls: - return self.playlist_from_matches( - mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key()) - - # Look for vzaar embeds - vzaar_urls = VzaarIE._extract_urls(webpage) - if vzaar_urls: - return self.playlist_from_matches( - vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) - - channel9_urls = Channel9IE._extract_urls(webpage) - if channel9_urls: - return self.playlist_from_matches( - channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) - - vshare_urls = VShareIE._extract_urls(webpage) - if vshare_urls: - return self.playlist_from_matches( - vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) - - # Look for Mediasite embeds - mediasite_urls = MediasiteIE._extract_urls(webpage) - if mediasite_urls: - entries = [ - self.url_result(smuggle_url( - compat_urlparse.urljoin(url, mediasite_url), - {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) - for mediasite_url in mediasite_urls] - return self.playlist_result(entries, video_id, video_title) - - springboardplatform_urls = SpringboardPlatformIE._extract_urls(webpage) - if springboardplatform_urls: - return self.playlist_from_matches( - springboardplatform_urls, video_id, video_title, - ie=SpringboardPlatformIE.ie_key()) - - yapfiles_urls = YapFilesIE._extract_urls(webpage) - if yapfiles_urls: - return self.playlist_from_matches( - yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) - - vice_urls = ViceIE._extract_urls(webpage) - if vice_urls: - return self.playlist_from_matches( - vice_urls, video_id, video_title, ie=ViceIE.ie_key()) - - xfileshare_urls = XFileShareIE._extract_urls(webpage) - if xfileshare_urls: - return self.playlist_from_matches( - xfileshare_urls, video_id, video_title, ie=XFileShareIE.ie_key()) - - cloudflarestream_urls = CloudflareStreamIE._extract_urls(webpage) - if cloudflarestream_urls: - return self.playlist_from_matches( - cloudflarestream_urls, video_id, video_title, ie=CloudflareStreamIE.ie_key()) - - peertube_urls = PeerTubeIE._extract_urls(webpage, url) - if peertube_urls: - return 
self.playlist_from_matches( - peertube_urls, video_id, video_title, ie=PeerTubeIE.ie_key()) - - indavideo_urls = IndavideoEmbedIE._extract_urls(webpage) - if indavideo_urls: - return self.playlist_from_matches( - indavideo_urls, video_id, video_title, ie=IndavideoEmbedIE.ie_key()) - - apa_urls = APAIE._extract_urls(webpage) - if apa_urls: - return self.playlist_from_matches( - apa_urls, video_id, video_title, ie=APAIE.ie_key()) - - foxnews_urls = FoxNewsIE._extract_urls(webpage) - if foxnews_urls: - return self.playlist_from_matches( - foxnews_urls, video_id, video_title, ie=FoxNewsIE.ie_key()) - - sharevideos_urls = [sharevideos_mobj.group('url') for sharevideos_mobj in re.finditer( - r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1', - webpage)] - if sharevideos_urls: - return self.playlist_from_matches( - sharevideos_urls, video_id, video_title) - - viqeo_urls = ViqeoIE._extract_urls(webpage) - if viqeo_urls: - return self.playlist_from_matches( - viqeo_urls, video_id, video_title, ie=ViqeoIE.ie_key()) - - expressen_urls = ExpressenIE._extract_urls(webpage) - if expressen_urls: - return self.playlist_from_matches( - expressen_urls, video_id, video_title, ie=ExpressenIE.ie_key()) - - zype_urls = ZypeIE._extract_urls(webpage) - if zype_urls: - return self.playlist_from_matches( - zype_urls, video_id, video_title, ie=ZypeIE.ie_key()) - - gedi_urls = GediDigitalIE._extract_urls(webpage) - if gedi_urls: - return self.playlist_from_matches( - gedi_urls, video_id, video_title, ie=GediDigitalIE.ie_key()) - - # Look for RCS media group embeds - rcs_urls = RCSEmbedsIE._extract_urls(webpage) - if rcs_urls: - return self.playlist_from_matches( - rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key()) - - wimtv_urls = WimTVIE._extract_urls(webpage) - if wimtv_urls: - return self.playlist_from_matches( - wimtv_urls, video_id, video_title, ie=WimTVIE.ie_key()) - - bitchute_urls = BitChuteIE._extract_urls(webpage) - if bitchute_urls: - return self.playlist_from_matches( - bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key()) - - rumble_urls = RumbleEmbedIE._extract_urls(webpage) - if len(rumble_urls) == 1: - return self.url_result(rumble_urls[0], RumbleEmbedIE.ie_key()) - if rumble_urls: - return self.playlist_from_matches( - rumble_urls, video_id, video_title, ie=RumbleEmbedIE.ie_key()) - - # Look for (tvopen|ethnos).gr embeds - tvopengr_urls = list(TVOpenGrEmbedIE._extract_urls(webpage)) - if tvopengr_urls: - return self.playlist_from_matches(tvopengr_urls, video_id, video_title, ie=TVOpenGrEmbedIE.ie_key()) - - # Look for ert.gr webtv embeds - ertwebtv_urls = list(ERTWebtvEmbedIE._extract_urls(webpage)) - if len(ertwebtv_urls) == 1: - return self.url_result(self._proto_relative_url(ertwebtv_urls[0]), video_title=video_title, url_transparent=True) - elif ertwebtv_urls: - return self.playlist_from_matches(ertwebtv_urls, video_id, video_title, ie=ERTWebtvEmbedIE.ie_key()) - - tvp_urls = TVPEmbedIE._extract_urls(webpage) - if tvp_urls: - return self.playlist_from_matches(tvp_urls, video_id, video_title, ie=TVPEmbedIE.ie_key()) - - # Look for MainStreaming embeds - mainstreaming_urls = MainStreamingIE._extract_urls(webpage) - if mainstreaming_urls: - return self.playlist_from_matches(mainstreaming_urls, video_id, video_title, ie=MainStreamingIE.ie_key()) - - # Look for Gfycat Embeds - gfycat_urls = GfycatIE._extract_urls(webpage) - if gfycat_urls: - return self.playlist_from_matches(gfycat_urls, video_id, video_title, 
ie=GfycatIE.ie_key()) - - panopto_urls = PanoptoBaseIE._extract_urls(webpage) - if panopto_urls: - return self.playlist_from_matches(panopto_urls, video_id, video_title) - - # Look for Ruutu embeds - ruutu_url = RuutuIE._extract_url(webpage) - if ruutu_url: - return self.url_result(ruutu_url, RuutuIE) - - # Look for HTML5 media - entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') - if entries: - self.report_detected('HTML5 media') - if len(entries) == 1: - entries[0].update({ - 'id': video_id, - 'title': video_title, - }) - else: - for num, entry in enumerate(entries, start=1): - entry.update({ - 'id': '%s-%s' % (video_id, num), - 'title': '%s (%d)' % (video_title, num), - }) - for entry in entries: - self._sort_formats(entry['formats']) - return self.playlist_result(entries, video_id, video_title) + embeds = [] + for ie in self._downloader._ies.values(): + if ie.ie_key() in smuggled_data.get('block_ies', []): + continue + gen = ie.extract_from_webpage(self._downloader, url, webpage) + current_embeds = [] + try: + while True: + current_embeds.append(next(gen)) + except self.StopExtraction: + self.report_detected(f'{ie.IE_NAME} exclusive embed', len(current_embeds), + embeds and 'discarding other embeds') + return current_embeds + except StopIteration: + self.report_detected(f'{ie.IE_NAME} embed', len(current_embeds)) + embeds.extend(current_embeds) + + if embeds: + return embeds jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) if jwplayer_data: if isinstance(jwplayer_data.get('playlist'), str): self.report_detected('JW Player playlist') - return { - **info_dict, - '_type': 'url', - 'ie_key': JWPlatformIE.ie_key(), - 'url': jwplayer_data['playlist'], - } + return [self.url_result(jwplayer_data['playlist'], 'JWPlatform')] try: info = self._parse_jwplayer_data( jwplayer_data, video_id, require_title=False, base_url=url) - self.report_detected('JW Player data') - return merge_dicts(info, info_dict) + if traverse_obj(info, 'formats', ('entries', ..., 'formats')): + self.report_detected('JW Player data') + return [info] except ExtractorError: # See https://github.com/ytdl-org/youtube-dl/pull/16735 pass @@ -3806,24 +2526,21 @@ class GenericIE(InfoExtractor): webpage) if mobj is not None: varname = mobj.group(1) - sources = self._parse_json( - mobj.group(2), video_id, transform_source=js_to_json, - fatal=False) or [] - if not isinstance(sources, list): - sources = [sources] + sources = variadic(self._parse_json( + mobj.group(2), video_id, transform_source=js_to_json, fatal=False) or []) formats = [] subtitles = {} for source in sources: src = source.get('src') - if not src or not isinstance(src, compat_str): + if not src or not isinstance(src, str): continue - src = compat_urlparse.urljoin(url, src) + src = urllib.parse.urljoin(url, src) src_type = source.get('type') - if isinstance(src_type, compat_str): + if isinstance(src_type, str): src_type = src_type.lower() ext = determine_ext(src).lower() if src_type == 'video/youtube': - return self.url_result(src, YoutubeIE.ie_key()) + return [self.url_result(src, YoutubeIE.ie_key())] if src_type == 'application/dash+xml' or ext == 'mpd': fmts, subs = self._extract_mpd_formats_and_subtitles( src, video_id, mpd_id='dash', fatal=False) @@ -3835,13 +2552,16 @@ class GenericIE(InfoExtractor): m3u8_id='hls', fatal=False) formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - else: + for fmt in formats: + fmt.update(self._fragment_query(src)) + + if not formats: 
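# The `embeds` loop above replaces the long chain of per-extractor checks that
# this commit deletes: every extractor may now yield its own embeds from the
# webpage, and can abort the scan entirely when its embed is exclusive. A toy
# model of that protocol (the names here are illustrative, not hypervideo's):

class StopExtraction(Exception):
    """Raised by an extractor that claims the page for itself."""

def collect_embeds(extractors, url, webpage):
    embeds = []
    for ie in extractors:
        gen = ie.extract_from_webpage(url, webpage)  # a generator of entries
        current = []
        try:
            while True:
                current.append(next(gen))
        except StopExtraction:
            return current  # exclusive embed: discard everything gathered so far
        except StopIteration:
            embeds.extend(current)
    return embeds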
                formats.append({
                    'url': src,
                    'ext': (mimetype2ext(src_type)
                            or ext if ext in KNOWN_EXTENSIONS else 'mp4'),
                    'http_headers': {
-                        'Referer': full_response.geturl(),
+                        'Referer': actual_url,
                    },
                })
            # https://docs.videojs.com/player#addRemoteTextTrack
@@ -3853,39 +2573,36 @@ class GenericIE(InfoExtractor):
                    if not src:
                        continue
                    subtitles.setdefault(dict_get(sub, ('language', 'srclang')) or 'und', []).append({
-                        'url': compat_urlparse.urljoin(url, src),
+                        'url': urllib.parse.urljoin(url, src),
                        'name': sub.get('label'),
                        'http_headers': {
-                            'Referer': full_response.geturl(),
+                            'Referer': actual_url,
                        },
                    })
            if formats or subtitles:
                self.report_detected('video.js embed')
-                self._sort_formats(formats)
-                info_dict['formats'] = formats
-                info_dict['subtitles'] = subtitles
-                return info_dict
+                return [{'formats': formats, 'subtitles': subtitles}]

        # Looking for http://schema.org/VideoObject
        json_ld = self._search_json_ld(webpage, video_id, default={})
        if json_ld.get('url') not in (url, None):
            self.report_detected('JSON LD')
-            if determine_ext(json_ld['url']) == 'm3u8':
-                json_ld['formats'], json_ld['subtitles'] = self._extract_m3u8_formats_and_subtitles(
-                    json_ld['url'], video_id, 'mp4')
-                json_ld.pop('url')
-                self._sort_formats(json_ld['formats'])
-            else:
-                json_ld['_type'] = 'url_transparent'
-                json_ld['url'] = smuggle_url(json_ld['url'], {'force_videoid': video_id, 'to_generic': True})
-            return merge_dicts(json_ld, info_dict)
+            is_direct = json_ld.get('ext') not in (None, *MEDIA_EXTENSIONS.manifests)
+            return [merge_dicts({
+                '_type': 'video' if is_direct else 'url_transparent',
+                'url': smuggle_url(json_ld['url'], {
+                    'force_videoid': video_id,
+                    'to_generic': True,
+                    'http_headers': {'Referer': url},
+                }),
+            }, json_ld)]

        def check_video(vurl):
            if YoutubeIE.suitable(vurl):
                return True
            if RtmpIE.suitable(vurl):
                return True
-            vpath = compat_urlparse.urlparse(vurl).path
+            vpath = urllib.parse.urlparse(vurl).path
            vext = determine_ext(vpath, None)
            return vext not in (None, 'swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
@@ -3947,15 +2664,13 @@ class GenericIE(InfoExtractor):
                if not formats[-1].get('height'):
                    formats[-1]['quality'] = 1

-            self._sort_formats(formats)
-
-            return {
+            return [{
                'id': flashvars['video_id'],
                'display_id': display_id,
                'title': title,
                'thumbnail': thumbnail,
                'formats': formats,
-            }
+            }]
        if not found:
            # Broaden the search a little bit
            found = filter_video(re.findall(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage))
@@ -3994,7 +2709,7 @@ class GenericIE(InfoExtractor):
            self.report_detected('Twitter card')
        if not found:
            # We look for Open Graph info:
-            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
+            # We have to match any number of spaces between elements; some sites try to align them, e.g. statigr.am
            m_video_type = re.findall(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
@@ -4009,20 +2724,14 @@ class GenericIE(InfoExtractor):
                webpage)
        if not found:
            # Look also in Refresh HTTP header
-            refresh_header = head_response.headers.get('Refresh')
+            refresh_header = urlh and urlh.headers.get('Refresh')
            if refresh_header:
-                # In python 2 response HTTP headers are bytestrings
-                if sys.version_info < (3, 0) and isinstance(refresh_header, str):
-                    refresh_header = refresh_header.decode('iso-8859-1')
                found = re.search(REDIRECT_REGEX, refresh_header)
            if found:
-                new_url = 
compat_urlparse.urljoin(url, unescapeHTML(found.group(1))) + new_url = urllib.parse.urljoin(url, unescapeHTML(found.group(1))) if new_url != url: self.report_following_redirect(new_url) - return { - '_type': 'url', - 'url': new_url, - } + return [self.url_result(new_url)] else: found = None @@ -4033,34 +2742,35 @@ class GenericIE(InfoExtractor): embed_url = self._html_search_meta('twitter:player', webpage, default=None) if embed_url and embed_url != url: self.report_detected('twitter:player iframe') - return self.url_result(embed_url) + return [self.url_result(embed_url)] if not found: - raise UnsupportedError(url) + return [] + + domain_name = self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader', default=None) entries = [] for video_url in orderedSet(found): video_url = unescapeHTML(video_url) video_url = video_url.replace('\\/', '/') - video_url = compat_urlparse.urljoin(url, video_url) - video_id = compat_urllib_parse_unquote(os.path.basename(video_url)) + video_url = urllib.parse.urljoin(url, video_url) + video_id = urllib.parse.unquote(os.path.basename(video_url)) # Sometimes, jwplayer extraction will result in a YouTube URL if YoutubeIE.suitable(video_url): entries.append(self.url_result(video_url, 'Youtube')) continue - # here's a fun little line of code for you: video_id = os.path.splitext(video_id)[0] headers = { - 'referer': full_response.geturl() + 'referer': actual_url } entry_info_dict = { 'id': video_id, - 'uploader': video_uploader, - 'title': video_title, - 'age_limit': age_limit, + 'uploader': domain_name, + 'title': info_dict['title'], + 'age_limit': info_dict['age_limit'], 'http_headers': headers, } @@ -4077,11 +2787,13 @@ class GenericIE(InfoExtractor): if ext == 'smil': entry_info_dict = {**self._extract_smil_info(video_url, video_id), **entry_info_dict} elif ext == 'xspf': - return self.playlist_result(self._extract_xspf_playlist(video_url, video_id), video_id) + return [self._extract_xspf_playlist(video_url, video_id)] elif ext == 'm3u8': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) + entry_info_dict.update(self._fragment_query(video_url)) elif ext == 'mpd': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers) + entry_info_dict.update(self._fragment_query(video_url)) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: @@ -4102,19 +2814,11 @@ class GenericIE(InfoExtractor): else: entry_info_dict['url'] = video_url - if entry_info_dict.get('formats'): - self._sort_formats(entry_info_dict['formats']) - entries.append(entry_info_dict) - if len(entries) == 1: - return entries[0] - else: + if len(entries) > 1: for num, e in enumerate(entries, start=1): # 'url' results don't have a title if e.get('title') is not None: e['title'] = '%s (%d)' % (e['title'], num) - return { - '_type': 'playlist', - 'entries': entries, - } + return entries diff --git a/hypervideo_dl/extractor/genericembeds.py b/hypervideo_dl/extractor/genericembeds.py new file mode 100644 index 0000000..9b4f14d --- /dev/null +++ b/hypervideo_dl/extractor/genericembeds.py @@ -0,0 +1,114 @@ +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import make_archive_id, unescapeHTML + + +class HTML5MediaEmbedIE(InfoExtractor): + _VALID_URL = False 
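# `_VALID_URL = False` (just above) marks an extractor that is never matched
# against input URLs; it is only driven through `extract_from_webpage` by the
# generic embed loop shown earlier in this diff. A hypothetical minimal
# webpage-only extractor following the same pattern, assuming this file's
# `re` and `InfoExtractor` imports (shown commented to keep it distinct from
# the class body below):
#
#     class ExampleEmbedIE(InfoExtractor):
#         _VALID_URL = False
#         IE_NAME = 'example:embed'
#
#         def _extract_from_webpage(self, url, webpage):
#             for m in re.finditer(r'data-example-id="(\w+)"', webpage):
#                 yield self.url_result(f'https://example.com/v/{m.group(1)}')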
+ IE_NAME = 'html5' + _WEBPAGE_TESTS = [ + { + 'url': 'https://html.com/media/', + 'info_dict': { + 'title': 'HTML5 Media', + 'description': 'md5:933b2d02ceffe7a7a0f3c8326d91cc2a', + }, + 'playlist_count': 2 + } + ] + + def _extract_from_webpage(self, url, webpage): + video_id, title = self._generic_id(url), self._generic_title(url, webpage) + entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') or [] + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': f'{video_id}-{num}', + 'title': f'{title} ({num})', + '_old_archive_ids': [ + make_archive_id('generic', f'{video_id}-{num}' if len(entries) > 1 else video_id), + ], + }) + yield entry + + +class QuotedHTMLIE(InfoExtractor): + """For common cases of quoted/escaped html parts in the webpage""" + _VALID_URL = False + IE_NAME = 'generic:quoted-html' + IE_DESC = False # Do not list + _WEBPAGE_TESTS = [{ + # 2 YouTube embeds in data-html + 'url': 'https://24tv.ua/bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', + 'info_dict': { + 'id': 'bronetransporteri-ozbroyenni-zsu-shho-vidomo-pro-bronovik-wolfhound_n2167966', + 'title': 'Броньовик Wolfhound: гігант, який допомагає ЗСУ знищувати окупантів на фронті', + 'thumbnail': r're:^https?://.*\.jpe?g', + 'timestamp': float, + 'upload_date': str, + 'description': 'md5:6816e1e5a65304bd7898e4c7eb1b26f7', + 'age_limit': 0, + }, + 'playlist_count': 2 + }, { + # Generic iframe embed of TV24UAPlayerIE within data-html + 'url': 'https://24tv.ua/harkivyani-zgaduyut-misto-do-viyni-shhemlive-video_n1887584', + 'info_dict': { + 'id': '1887584', + 'ext': 'mp4', + 'title': 'Харків\'яни згадують місто до війни: щемливе відео', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + 'params': {'skip_download': True} + }, { + # YouTube embeds on Squarespace (data-html): https://github.com/ytdl-org/youtube-dl/issues/21294 + 'url': 'https://www.harvardballetcompany.org/past-productions', + 'info_dict': { + 'id': 'past-productions', + 'title': 'Productions — Harvard Ballet Company', + 'age_limit': 0, + 'description': 'Past Productions', + }, + 'playlist_mincount': 26 + }, { + # Squarespace video embed, 2019-08-28, data-html + 'url': 'http://ootboxford.com', + 'info_dict': { + 'id': 'Tc7b_JGdZfw', + 'title': 'Out of the Blue, at Childish Things 10', + 'ext': 'mp4', + 'description': 'md5:a83d0026666cf5ee970f8bd1cfd69c7f', + 'uploader_id': 'helendouglashouse', + 'uploader': 'Helen & Douglas House', + 'upload_date': '20140328', + 'availability': 'public', + 'view_count': int, + 'channel': 'Helen & Douglas House', + 'comment_count': int, + 'uploader_url': 'http://www.youtube.com/user/helendouglashouse', + 'duration': 253, + 'channel_url': 'https://www.youtube.com/channel/UCTChGezrZVmlYlpMlkmulPA', + 'playable_in_embed': True, + 'age_limit': 0, + 'channel_follower_count': int, + 'channel_id': 'UCTChGezrZVmlYlpMlkmulPA', + 'tags': 'count:6', + 'categories': ['Nonprofits & Activism'], + 'like_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/Tc7b_JGdZfw/hqdefault.jpg', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _extract_from_webpage(self, url, webpage): + combined = '' + for _, html in re.findall(r'(?s)\bdata-html=(["\'])((?:(?!\1).)+)\1', webpage): + # unescapeHTML can handle " etc., unquote can handle percent encoding + unquoted_html = unescapeHTML(urllib.parse.unquote(html)) + if unquoted_html != html: + combined += unquoted_html + if combined: + yield from self._extract_generic_embeds(url, combined) diff --git 
a/hypervideo_dl/extractor/genius.py b/hypervideo_dl/extractor/genius.py new file mode 100644 index 0000000..62f5a28 --- /dev/null +++ b/hypervideo_dl/extractor/genius.py @@ -0,0 +1,127 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + js_to_json, + smuggle_url, + str_or_none, + traverse_obj, + unescapeHTML, +) + + +class GeniusIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?genius\.com/videos/(?P<id>[^?/#]+)' + _TESTS = [{ + 'url': 'https://genius.com/videos/Vince-staples-breaks-down-the-meaning-of-when-sparks-fly', + 'md5': '64c2ad98cfafcfda23bfa0ad0c512f4c', + 'info_dict': { + 'id': '6313303597112', + 'ext': 'mp4', + 'title': 'Vince Staples Breaks Down The Meaning Of “When Sparks Fly”', + 'description': 'md5:bc15e00342c537c0039d414423ae5752', + 'tags': 'count:1', + 'uploader_id': '4863540648001', + 'duration': 388.416, + 'upload_date': '20221005', + 'timestamp': 1664982341, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }, { + 'url': 'https://genius.com/videos/Breaking-down-drakes-certified-lover-boy-kanye-beef-way-2-sexy-cudi', + 'md5': 'b8ed87a5efd1473bd027c20a969d4060', + 'info_dict': { + 'id': '6271792014001', + 'ext': 'mp4', + 'title': 'md5:c6355f7fa8a70bc86492a3963919fc15', + 'description': 'md5:1774638c31548b31b037c09e9b821393', + 'tags': 'count:3', + 'uploader_id': '4863540648001', + 'duration': 2685.099, + 'upload_date': '20210909', + 'timestamp': 1631209167, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + metadata = self._search_json( + r'<meta content="', webpage, 'metadata', display_id, transform_source=unescapeHTML) + video_id = traverse_obj( + metadata, ('video', 'provider_id'), + ('dfp_kv', lambda _, x: x['name'] == 'brightcove_video_id', 'values', 0), get_all=False) + if not video_id: + raise ExtractorError('Brightcove video id not found in webpage') + + config = self._search_json(r'var\s*APP_CONFIG\s*=', webpage, 'config', video_id, default={}) + account_id = config.get('brightcove_account_id', '4863540648001') + player_id = traverse_obj( + config, 'brightcove_standard_web_player_id', 'brightcove_standard_no_autoplay_web_player_id', + 'brightcove_modal_web_player_id', 'brightcove_song_story_web_player_id', default='S1ZcmcOC1x') + + return self.url_result( + smuggle_url( + f'https://players.brightcove.net/{account_id}/{player_id}_default/index.html?videoId={video_id}', + {'referrer': url}), 'BrightcoveNew', video_id) + + +class GeniusLyricsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?genius\.com/(?P<id>[^?/#]+)-lyrics[?/#]?' + _TESTS = [{ + 'url': 'https://genius.com/Lil-baby-heyy-lyrics', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '8454545', + 'title': 'Heyy', + 'description': 'Heyy by Lil Baby', + }, + }, { + 'url': 'https://genius.com/Outkast-two-dope-boyz-in-a-cadillac-lyrics', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '36239', + 'title': 'Two Dope Boyz (In a Cadillac)', + 'description': 'Two Dope Boyz (In a Cadillac) by OutKast', + }, + }, { + 'url': 'https://genius.com/Playboi-carti-rip-lyrics', + 'playlist_mincount': 1, + 'info_dict': { + 'id': '3710582', + 'title': 'R.I.P.', + 'description': 'R.I.P. 
by Playboi Carti', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_string = self._search_json( + r'window\.__PRELOADED_STATE__\s*=\s*JSON\.parse\(', webpage, 'json string', + display_id, transform_source=js_to_json, contains_pattern=r'\'{(?s:.+)}\'') + song_info = self._parse_json(json_string, display_id) + song_id = str_or_none(traverse_obj(song_info, ('songPage', 'song'))) + if not song_id: + raise ExtractorError('Song id not found in webpage') + + title = traverse_obj( + song_info, ('songPage', 'trackingData', lambda _, x: x['key'] == 'Title', 'value'), + get_all=False, default='untitled') + artist = traverse_obj( + song_info, ('songPage', 'trackingData', lambda _, x: x['key'] == 'Primary Artist', 'value'), + get_all=False, default='unknown artist') + media = traverse_obj( + song_info, ('entities', 'songs', song_id, 'media'), expected_type=list, default=[]) + + entries = [] + for m in media: + if m.get('type') in ('video', 'audio') and m.get('url'): + if m.get('provider') == 'spotify': + self.to_screen(f'{song_id}: Skipping Spotify audio embed') + else: + entries.append(self.url_result(m['url'])) + + return self.playlist_result(entries, song_id, title, f'{title} by {artist}') diff --git a/hypervideo_dl/extractor/gettr.py b/hypervideo_dl/extractor/gettr.py index 327a4d0..7795dc5 100644 --- a/hypervideo_dl/extractor/gettr.py +++ b/hypervideo_dl/extractor/gettr.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( bool_or_none, @@ -124,8 +121,6 @@ class GettrIE(GettrBaseIE): 'height': int_or_none(post_data.get('vid_hgt')), }) - self._sort_formats(formats) - return { 'id': post_id, 'title': title, @@ -195,8 +190,6 @@ class GettrStreamingIE(GettrBaseIE): 'url': urljoin(self._MEDIA_BASE_URL, thumbnail), } for thumbnail in try_get(video_info, lambda x: x['postData']['imgs'], list) or []] - self._sort_formats(formats) - return { 'id': video_id, 'title': try_get(video_info, lambda x: x['postData']['ttl'], str), diff --git a/hypervideo_dl/extractor/gfycat.py b/hypervideo_dl/extractor/gfycat.py index 2ad03e2..edc2e56 100644 --- a/hypervideo_dl/extractor/gfycat.py +++ b/hypervideo_dl/extractor/gfycat.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -13,7 +8,8 @@ from ..utils import ( class GfycatIE(InfoExtractor): - _VALID_URL = r'(?i)https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)' + _VALID_URL = r'https?://(?:(?:www|giant|thumbs)\.)?gfycat\.com/(?i:ru/|ifr/|gifs/detail/)?(?P<id>[^-/?#\."\']+)' + _EMBED_REGEX = [rf'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'http://gfycat.com/DeadlyDecisiveGermanpinscher', 'info_dict': { @@ -85,14 +81,6 @@ class GfycatIE(InfoExtractor): 'only_matching': True }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<(?:iframe|source)[^>]+\bsrc=["\'](?P<url>%s)' % GfycatIE._VALID_URL, - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -139,7 +127,6 @@ class GfycatIE(InfoExtractor): 'filesize': filesize, 'quality': quality(format_id), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/giantbomb.py b/hypervideo_dl/extractor/giantbomb.py index 1920923..1125723 100644 --- 
a/hypervideo_dl/extractor/giantbomb.py +++ b/hypervideo_dl/extractor/giantbomb.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -76,8 +74,6 @@ class GiantBombIE(InfoExtractor): if youtube_id: return self.url_result(youtube_id, 'Youtube') - self._sort_formats(formats) - return { 'id': video_id, 'display_id': display_id, diff --git a/hypervideo_dl/extractor/giga.py b/hypervideo_dl/extractor/giga.py index 5a9992a..b59c129 100644 --- a/hypervideo_dl/extractor/giga.py +++ b/hypervideo_dl/extractor/giga.py @@ -1,16 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor -from ..utils import ( - qualities, - compat_str, - parse_duration, - parse_iso8601, - str_to_int, -) +from ..compat import compat_str +from ..utils import parse_duration, parse_iso8601, qualities, str_to_int class GigaIE(InfoExtractor): @@ -67,7 +59,6 @@ class GigaIE(InfoExtractor): 'format_id': '%s-%s' % (fmt['quality'], fmt['type'].split('/')[-1]), 'quality': quality(fmt['quality']), }) - self._sort_formats(formats) title = self._html_search_meta( 'title', webpage, 'title', fatal=True) diff --git a/hypervideo_dl/extractor/gigya.py b/hypervideo_dl/extractor/gigya.py index 4121784..c5bc86b 100644 --- a/hypervideo_dl/extractor/gigya.py +++ b/hypervideo_dl/extractor/gigya.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/glide.py b/hypervideo_dl/extractor/glide.py index 12af859..d114f34 100644 --- a/hypervideo_dl/extractor/glide.py +++ b/hypervideo_dl/extractor/glide.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor @@ -23,7 +20,7 @@ class GlideIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = self._html_extract_title(webpage, default=None) or self._og_search_title(webpage) + title = self._generic_title('', webpage) video_url = self._proto_relative_url(self._search_regex( r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage, 'video URL', default=None, diff --git a/hypervideo_dl/extractor/globo.py b/hypervideo_dl/extractor/globo.py index f6aaae1..a7be2cb 100644 --- a/hypervideo_dl/extractor/globo.py +++ b/hypervideo_dl/extractor/globo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import hashlib import json @@ -142,7 +139,6 @@ class GloboIE(InfoExtractor): fmts, subtitles = self._extract_m3u8_formats_and_subtitles( signed_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) formats.extend(fmts) - self._sort_formats(formats) for resource in video['resources']: if resource.get('type') == 'subtitle': @@ -181,12 +177,12 @@ class GloboArticleIE(InfoExtractor): _VALID_URL = r'https?://.+?\.globo\.com/(?:[^/]+/)*(?P<id>[^/.]+)(?:\.html)?' 
_VIDEOID_REGEXES = [ - r'\bdata-video-id=["\'](\d{7,})', - r'\bdata-player-videosids=["\'](\d{7,})', + r'\bdata-video-id=["\'](\d{7,})["\']', + r'\bdata-player-videosids=["\'](\d{7,})["\']', r'\bvideosIDs\s*:\s*["\']?(\d{7,})', - r'\bdata-id=["\'](\d{7,})', - r'<div[^>]+\bid=["\'](\d{7,})', - r'<bs-player[^>]+\bvideoid=["\'](\d{8,})', + r'\bdata-id=["\'](\d{7,})["\']', + r'<div[^>]+\bid=["\'](\d{7,})["\']', + r'<bs-player[^>]+\bvideoid=["\'](\d{8,})["\']', ] _TESTS = [{ @@ -222,6 +218,14 @@ class GloboArticleIE(InfoExtractor): 'description': 'md5:2d089d036c4c9675117d3a56f8c61739', }, 'playlist_count': 1, + }, { + 'url': 'https://redeglobo.globo.com/rpc/meuparana/noticia/a-producao-de-chocolates-no-parana.ghtml', + 'info_dict': { + 'id': 'a-producao-de-chocolates-no-parana', + 'title': 'A produção de chocolates no Paraná', + 'description': 'md5:f2e3daf00ffd1dc0e9a8a6c7cfb0a89e', + }, + 'playlist_count': 2, }] @classmethod @@ -237,6 +241,6 @@ class GloboArticleIE(InfoExtractor): entries = [ self.url_result('globo:%s' % video_id, GloboIE.ie_key()) for video_id in orderedSet(video_ids)] - title = self._og_search_title(webpage) + title = self._og_search_title(webpage).strip() description = self._html_search_meta('description', webpage) return self.playlist_result(entries, display_id, title, description) diff --git a/hypervideo_dl/extractor/glomex.py b/hypervideo_dl/extractor/glomex.py index d9ef433..22aac0d 100644 --- a/hypervideo_dl/extractor/glomex.py +++ b/hypervideo_dl/extractor/glomex.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import urllib.parse @@ -85,7 +82,6 @@ class GlomexBaseIE(InfoExtractor): if video.get('language'): for fmt in formats: fmt['language'] = video['language'] - self._sort_formats(formats) images = (video.get('images') or []) + [video.get('image') or {}] thumbnails = [{ @@ -177,7 +173,7 @@ class GlomexEmbedIE(GlomexBaseIE): return cls._smuggle_origin_url(f'https:{cls._BASE_PLAYER_URL}?{query_string}', origin_url) @classmethod - def _extract_urls(cls, webpage, origin_url): + def _extract_embed_urls(cls, url, webpage): # https://docs.glomex.com/publisher/video-player-integration/javascript-api/ quot_re = r'["\']' @@ -186,9 +182,9 @@ class GlomexEmbedIE(GlomexBaseIE): (?:https?:)?{cls._BASE_PLAYER_URL_RE}\?(?:(?!(?P=q)).)+ )(?P=q)''' for mobj in re.finditer(regex, webpage): - url = unescapeHTML(mobj.group('url')) - if cls.suitable(url): - yield cls._smuggle_origin_url(url, origin_url) + embed_url = unescapeHTML(mobj.group('url')) + if cls.suitable(embed_url): + yield cls._smuggle_origin_url(embed_url, url) regex = fr'''(?x) <glomex-player [^>]+?>| @@ -196,7 +192,7 @@ class GlomexEmbedIE(GlomexBaseIE): for mobj in re.finditer(regex, webpage): attrs = extract_attributes(mobj.group(0)) if attrs.get('data-integration-id') and attrs.get('data-playlist-id'): - yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], origin_url) + yield cls.build_player_url(attrs['data-playlist-id'], attrs['data-integration-id'], url) # naive parsing of inline scripts for hard-coded integration parameters regex = fr'''(?x) @@ -209,7 +205,7 @@ class GlomexEmbedIE(GlomexBaseIE): continue playlist_id = re.search(regex % 'playlistId', script) if playlist_id: - yield cls.build_player_url(playlist_id, integration_id, origin_url) + yield cls.build_player_url(playlist_id, integration_id, url) def _real_extract(self, url): url, origin_url = self._unsmuggle_origin_url(url) diff --git a/hypervideo_dl/extractor/go.py 
b/hypervideo_dl/extractor/go.py index f92e166..b075a02 100644 --- a/hypervideo_dl/extractor/go.py +++ b/hypervideo_dl/extractor/go.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .adobepass import AdobePassIE @@ -14,6 +11,8 @@ from ..utils import ( try_get, urlencode_postdata, ExtractorError, + unified_timestamp, + traverse_obj, ) @@ -73,7 +72,7 @@ class GoIE(AdobePassIE): }, 'skip': 'This content is no longer available.', }, { - 'url': 'http://watchdisneyxd.go.com/doraemon', + 'url': 'https://disneynow.com/shows/big-hero-6-the-series', 'info_dict': { 'title': 'Doraemon', 'id': 'SH55574025', @@ -83,10 +82,19 @@ class GoIE(AdobePassIE): 'url': 'http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood', 'info_dict': { 'id': 'VDKA3609139', - 'ext': 'mp4', 'title': 'This Guilty Blood', 'description': 'md5:f18e79ad1c613798d95fdabfe96cd292', 'age_limit': 14, + 'episode': 'Episode 1', + 'upload_date': '20170102', + 'season': 'Season 2', + 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/abcf/Shadowhunters/video/201/ae5f75608d86bf88aa4f9f4aa76ab1b7/579x325-Q100_ae5f75608d86bf88aa4f9f4aa76ab1b7.jpg', + 'duration': 2544, + 'season_number': 2, + 'series': 'Shadowhunters', + 'episode_number': 1, + 'timestamp': 1483387200, + 'ext': 'mp4' }, 'params': { 'geo_bypass_ip_block': '3.244.239.0/24', @@ -94,13 +102,22 @@ class GoIE(AdobePassIE): 'skip_download': True, }, }, { - 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet', + 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-04/12-the-knock', 'info_dict': { - 'id': 'VDKA13435179', - 'ext': 'mp4', - 'title': 'The Bet', - 'description': 'md5:c66de8ba2e92c6c5c113c3ade84ab404', + 'id': 'VDKA26050359', + 'title': 'The Knock', + 'description': 'md5:0c2947e3ada4c31f28296db7db14aa64', 'age_limit': 14, + 'ext': 'mp4', + 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/abc/TheRookie/video/412/daf830d06e83b11eaf5c0a299d993ae3/1556x876-Q75_daf830d06e83b11eaf5c0a299d993ae3.jpg', + 'episode': 'Episode 12', + 'season_number': 4, + 'season': 'Season 4', + 'timestamp': 1642975200, + 'episode_number': 12, + 'upload_date': '20220123', + 'series': 'The Rookie', + 'duration': 2572, }, 'params': { 'geo_bypass_ip_block': '3.244.239.0/24', @@ -111,24 +128,18 @@ class GoIE(AdobePassIE): 'url': 'https://fxnow.fxnetworks.com/shows/better-things/video/vdka12782841', 'info_dict': { 'id': 'VDKA12782841', - 'ext': 'mp4', 'title': 'First Look: Better Things - Season 2', 'description': 'md5:fa73584a95761c605d9d54904e35b407', - }, - 'params': { - 'geo_bypass_ip_block': '3.244.239.0/24', - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'https://abc.com/shows/modern-family/episode-guide/season-01/101-pilot', - 'info_dict': { - 'id': 'VDKA22600213', 'ext': 'mp4', - 'title': 'Pilot', - 'description': 'md5:74306df917cfc199d76d061d66bebdb4', + 'age_limit': 14, + 'upload_date': '20170825', + 'duration': 161, + 'series': 'Better Things', + 'thumbnail': 'http://cdn1.edgedatg.com/aws/v2/fx/BetterThings/video/12782841/b6b05e58264121cc2c98811318e6d507/1556x876-Q75_b6b05e58264121cc2c98811318e6d507.jpg', + 'timestamp': 1503661074, }, 'params': { + 'geo_bypass_ip_block': '3.244.239.0/24', # m3u8 download 'skip_download': True, }, @@ -282,7 +293,6 @@ class GoIE(AdobePassIE): 'height': height, }) formats.append(f) - self._sort_formats(formats) for cc in video_data.get('closedcaption', {}).get('src', []): cc_url = cc.get('value') @@ -319,4 +329,5 @@ class GoIE(AdobePassIE): 'thumbnails': 
thumbnails, 'formats': formats, 'subtitles': subtitles, + 'timestamp': unified_timestamp(traverse_obj(video_data, ('airdates', 'airdate', 0))), } diff --git a/hypervideo_dl/extractor/godtube.py b/hypervideo_dl/extractor/godtube.py index 96e68b4..6975401 100644 --- a/hypervideo_dl/extractor/godtube.py +++ b/hypervideo_dl/extractor/godtube.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( parse_duration, diff --git a/hypervideo_dl/extractor/gofile.py b/hypervideo_dl/extractor/gofile.py index 62d778c..ddbce2e 100644 --- a/hypervideo_dl/extractor/gofile.py +++ b/hypervideo_dl/extractor/gofile.py @@ -1,4 +1,5 @@ -# coding: utf-8 +import hashlib + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -19,22 +20,34 @@ class GofileIE(InfoExtractor): 'id': 'de571ac1-5edc-42e2-8ec2-bdac83ad4a31', 'filesize': 928116, 'ext': 'mp4', - 'title': 'nuuh' + 'title': 'nuuh', + 'release_timestamp': 1638338704, + 'release_date': '20211201', } }] - }, { # URL to test mixed file types - 'url': 'https://gofile.io/d/avt34h', + }, { + 'url': 'https://gofile.io/d/is8lKr', 'info_dict': { - 'id': 'avt34h', - }, - 'playlist_mincount': 1, - }, { # URL to test no video/audio error - 'url': 'https://gofile.io/d/aB03lZ', - 'info_dict': { - 'id': 'aB03lZ', + 'id': 'TMjXd9', + 'ext': 'mp4', }, 'playlist_count': 0, 'skip': 'No video/audio found at provided URL.', + }, { + 'url': 'https://gofile.io/d/TMjXd9', + 'info_dict': { + 'id': 'TMjXd9', + }, + 'playlist_count': 1, + }, { + 'url': 'https://gofile.io/d/gqOtRf', + 'info_dict': { + 'id': 'gqOtRf', + }, + 'playlist_mincount': 1, + 'params': { + 'videopassword': 'password', + }, }] _TOKEN = None @@ -50,12 +63,22 @@ class GofileIE(InfoExtractor): self._set_cookie('gofile.io', 'accountToken', self._TOKEN) def _entries(self, file_id): + query_params = { + 'contentId': file_id, + 'token': self._TOKEN, + 'websiteToken': 12345, + } + password = self.get_param('videopassword') + if password: + query_params['password'] = hashlib.sha256(password.encode('utf-8')).hexdigest() files = self._download_json( - f'https://api.gofile.io/getContent?contentId={file_id}&token={self._TOKEN}&websiteToken=websiteToken&cache=true', - 'Gofile', note='Getting filelist') + 'https://api.gofile.io/getContent', file_id, note='Getting filelist', query=query_params) status = files['status'] - if status != 'ok': + if status == 'error-passwordRequired': + raise ExtractorError( + 'This video is protected by a password, use the --video-password option', expected=True) + elif status != 'ok': raise ExtractorError(f'{self.IE_NAME} said: status {status}', expected=True) found_files = False @@ -65,7 +88,7 @@ class GofileIE(InfoExtractor): continue found_files = True - file_url = file.get('directLink') + file_url = file.get('link') if file_url: yield { 'id': file['id'], diff --git a/hypervideo_dl/extractor/golem.py b/hypervideo_dl/extractor/golem.py index 47a068e..c33d950 100644 --- a/hypervideo_dl/extractor/golem.py +++ b/hypervideo_dl/extractor/golem.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -54,7 +51,6 @@ class GolemIE(InfoExtractor): 'filesize': self._int(e.findtext('filesize'), 'filesize'), 'ext': determine_ext(e.findtext('./filename')), }) - self._sort_formats(formats) info['formats'] = formats thumbnails = [] diff --git a/hypervideo_dl/extractor/goodgame.py b/hypervideo_dl/extractor/goodgame.py new file mode 100644 
index 0000000..c17ad56 --- /dev/null +++ b/hypervideo_dl/extractor/goodgame.py @@ -0,0 +1,57 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + str_or_none, + traverse_obj, +) + + +class GoodGameIE(InfoExtractor): + IE_NAME = 'goodgame:stream' + _VALID_URL = r'https?://goodgame\.ru/channel/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://goodgame.ru/channel/Pomi/#autoplay', + 'info_dict': { + 'id': 'pomi', + 'ext': 'mp4', + 'title': r're:Reynor vs Special \(1/2,bo3\) Wardi Spring EU \- playoff \(финальный день\) \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'channel_id': '1644', + 'channel': 'Pomi', + 'channel_url': 'https://goodgame.ru/channel/Pomi/', + 'description': 'md5:4a87b775ee7b2b57bdccebe285bbe171', + 'thumbnail': r're:^https?://.*\.jpg$', + 'live_status': 'is_live', + 'view_count': int, + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'May not be online', + }] + + def _real_extract(self, url): + channel_name = self._match_id(url) + response = self._download_json(f'https://api2.goodgame.ru/v2/streams/{channel_name}', channel_name) + player_id = response['channel']['gg_player_src'] + + formats, subtitles = [], {} + if response.get('status') == 'Live': + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://hls.goodgame.ru/manifest/{player_id}_master.m3u8', + channel_name, 'mp4', live=True) + else: + self.raise_no_formats('User is offline', expected=True, video_id=channel_name) + + return { + 'id': player_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': traverse_obj(response, ('channel', 'title')), + 'channel': channel_name, + 'channel_id': str_or_none(traverse_obj(response, ('channel', 'id'))), + 'channel_url': response.get('url'), + 'description': clean_html(traverse_obj(response, ('channel', 'description'))), + 'thumbnail': traverse_obj(response, ('channel', 'thumb')), + 'is_live': bool(formats), + 'view_count': int_or_none(response.get('viewers')), + 'age_limit': 18 if traverse_obj(response, ('channel', 'adult')) else None, + } diff --git a/hypervideo_dl/extractor/googledrive.py b/hypervideo_dl/extractor/googledrive.py index 7b5bf28..e027ea7 100644 --- a/hypervideo_dl/extractor/googledrive.py +++ b/hypervideo_dl/extractor/googledrive.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -79,13 +77,13 @@ class GoogleDriveIE(InfoExtractor): _caption_formats_ext = [] _captions_xml = None - @staticmethod - def _extract_url(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): mobj = re.search( r'<iframe[^>]+src="https?://(?:video\.google\.com/get_player\?.*?docid=|(?:docs|drive)\.google\.com/file/d/)(?P<id>[a-zA-Z0-9_-]{28,})', webpage) if mobj: - return 'https://drive.google.com/file/d/%s' % mobj.group('id') + yield 'https://drive.google.com/file/d/%s' % mobj.group('id') def _download_subtitles_xml(self, video_id, subtitles_id, hl): if self._captions_xml: @@ -255,8 +253,6 @@ class GoogleDriveIE(InfoExtractor): if not formats and reason: self.raise_no_formats(reason, expected=True) - self._sort_formats(formats) - hl = get_value('hl') subtitles_id = None ttsurl = get_value('ttsurl') @@ -266,7 +262,7 @@ class GoogleDriveIE(InfoExtractor): subtitles_id = ttsurl.encode('utf-8').decode( 'unicode_escape').split('=')[-1] - self._downloader.cookiejar.clear(domain='.google.com', path='/', name='NID') + self.cookiejar.clear(domain='.google.com', path='/', name='NID') return { 'id': video_id, @@ -278,3 +274,59 @@ class GoogleDriveIE(InfoExtractor): 
'automatic_captions': self.extract_automatic_captions( video_id, subtitles_id, hl), } + + +class GoogleDriveFolderIE(InfoExtractor): + IE_NAME = 'GoogleDrive:Folder' + _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})' + _TESTS = [{ + 'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', + 'info_dict': { + 'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI', + 'title': 'Forrest' + }, + 'playlist_count': 3, + }] + _BOUNDARY = '=====vc17a3rwnndj=====' + _REQUEST = "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1" + _DATA = f'''--{_BOUNDARY} +content-type: application/http +content-transfer-encoding: binary + +GET %s + +--{_BOUNDARY} +''' + + def _call_api(self, folder_id, key, data, **kwargs): + response = self._download_webpage( + 'https://clients6.google.com/batch/drive/v2beta', + folder_id, data=data.encode('utf-8'), + headers={ + 'Content-Type': 'text/plain;charset=UTF-8;', + 'Origin': 'https://drive.google.com', + }, query={ + '$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"', + 'key': key + }, **kwargs) + return self._search_json('', response, 'api response', folder_id, **kwargs) or {} + + def _get_folder_items(self, folder_id, key): + page_token = '' + while page_token is not None: + request = self._REQUEST.format(folder_id=folder_id, page_token=page_token, key=key) + page = self._call_api(folder_id, key, self._DATA % request) + yield from page['items'] + page_token = page.get('nextPageToken') + + def _real_extract(self, url): + folder_id = self._match_id(url) + + webpage = self._download_webpage(url, folder_id) + key = self._search_regex(r'"(\w{39})"', webpage, 'key') + + folder_info = self._call_api(folder_id, key, self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal=False) + + return self.playlist_from_matches( + self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'), + ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item["id"]}') diff --git a/hypervideo_dl/extractor/googlepodcasts.py b/hypervideo_dl/extractor/googlepodcasts.py index 25631e2..8b2351b 100644 --- a/hypervideo_dl/extractor/googlepodcasts.py +++ b/hypervideo_dl/extractor/googlepodcasts.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git 
a/hypervideo_dl/extractor/googlesearch.py b/hypervideo_dl/extractor/googlesearch.py index 4b8b1bc..67ca0e5 100644 --- a/hypervideo_dl/extractor/googlesearch.py +++ b/hypervideo_dl/extractor/googlesearch.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import itertools import re diff --git a/hypervideo_dl/extractor/goplay.py b/hypervideo_dl/extractor/goplay.py new file mode 100644 index 0000000..2882b49 --- /dev/null +++ b/hypervideo_dl/extractor/goplay.py @@ -0,0 +1,394 @@ +import base64 +import binascii +import datetime +import hashlib +import hmac +import json +import os + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + traverse_obj, + unescapeHTML, +) + + +class GoPlayIE(InfoExtractor): + _VALID_URL = r'https?://(www\.)?goplay\.be/video/([^/]+/[^/]+/|)(?P<display_id>[^/#]+)' + + _NETRC_MACHINE = 'goplay' + + _TESTS = [{ + 'url': 'https://www.goplay.be/video/de-container-cup/de-container-cup-s3/de-container-cup-s3-aflevering-2#autoplay', + 'info_dict': { + 'id': '9c4214b8-e55d-4e4b-a446-f015f6c6f811', + 'ext': 'mp4', + 'title': 'S3 - Aflevering 2', + 'series': 'De Container Cup', + 'season': 'Season 3', + 'season_number': 3, + 'episode': 'Episode 2', + 'episode_number': 2, + }, + 'skip': 'This video is only available for registered users' + }, { + 'url': 'https://www.goplay.be/video/a-family-for-thr-holidays-s1-aflevering-1#autoplay', + 'info_dict': { + 'id': '74e3ed07-748c-49e4-85a0-393a93337dbf', + 'ext': 'mp4', + 'title': 'A Family for the Holidays', + }, + 'skip': 'This video is only available for registered users' + }] + + _id_token = None + + def _perform_login(self, username, password): + self.report_login() + aws = AwsIdp(ie=self, pool_id='eu-west-1_dViSsKM5Y', client_id='6s1h851s8uplco5h6mqh1jac8m') + self._id_token, _ = aws.authenticate(username=username, password=password) + + def _real_initialize(self): + if not self._id_token: + raise self.raise_login_required(method='password') + + def _real_extract(self, url): + url, display_id = self._match_valid_url(url).group(0, 'display_id') + webpage = self._download_webpage(url, display_id) + video_data_json = self._html_search_regex(r'<div\s+data-hero="([^"]+)"', webpage, 'video_data') + video_data = self._parse_json(unescapeHTML(video_data_json), display_id).get('data') + + movie = video_data.get('movie') + if movie: + video_id = movie['videoUuid'] + info_dict = { + 'title': movie.get('title') + } + else: + episode = traverse_obj(video_data, ('playlists', ..., 'episodes', lambda _, v: v['pageInfo']['url'] == url), get_all=False) + video_id = episode['videoUuid'] + info_dict = { + 'title': episode.get('episodeTitle'), + 'series': traverse_obj(episode, ('program', 'title')), + 'season_number': episode.get('seasonNumber'), + 'episode_number': episode.get('episodeNumber'), + } + + api = self._download_json( + f'https://api.viervijfzes.be/content/{video_id}', + video_id, headers={'Authorization': self._id_token}) + + formats, subs = self._extract_m3u8_formats_and_subtitles( + api['video']['S'], video_id, ext='mp4', m3u8_id='HLS') + + info_dict.update({ + 'id': video_id, + 'formats': formats, + }) + + return info_dict + + +# Taken from https://github.com/add-ons/plugin.video.viervijfzes/blob/master/resources/lib/viervijfzes/auth_awsidp.py +# Released into Public domain by https://github.com/michaelarnauts + +class InvalidLoginException(ExtractorError): + """ The login credentials are invalid """ + + +class AuthenticationException(ExtractorError): + """ Something went wrong while logging in """ + 
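The AwsIdp class added below is a pure-Python port of the SRP-6a handshake ("USER_SRP_AUTH") from amazon-cognito-identity-js: the client sends A = g^a mod N in InitiateAuth, Cognito replies with B, a salt and a secret block, and both sides derive the shared secret S = (B - k*g^x)^(a + u*x) mod N, where x hashes the salted credentials and u hashes both public values. A minimal sketch of that client-side key derivation, for reference while reading the class (helper names here are illustrative, not part of the extractor):

    import hashlib

    def sha256_padded_hex(data):
        # hex digest left-padded to 64 chars, like AwsIdp.__hash_sha256
        return hashlib.sha256(data).hexdigest().zfill(64)

    def hex_hash(hex_string):
        # hash the bytes that the hex string encodes, read back as an integer
        return int(sha256_padded_hex(bytes.fromhex(hex_string)), 16)

    def pad_hex(value):
        # even-length hex, with a '00' prefix when the top bit is set,
        # so the value is always parsed as a positive integer
        h = value if isinstance(value, str) else '%x' % value
        if len(h) % 2:
            h = '0' + h
        elif h[0] in '89abcdefABCDEF':
            h = '00' + h
        return h

    def srp_shared_secret(N, g, k, a, B, salt, pool_part, username, password):
        A = pow(g, a, N)                        # SRP_A sent in InitiateAuth
        u = hex_hash(pad_hex(A) + pad_hex(B))   # scrambling parameter from both publics
        x = hex_hash(pad_hex(salt) + sha256_padded_hex(
            f'{pool_part}{username}:{password}'.encode()))
        # S = (B - k * g^x) ^ (a + u * x) mod N
        return pow(B - k * pow(g, x, N), a + u * x, N)

In the class below, the resulting S is fed through __compute_hkdf to derive the HMAC key that signs the PASSWORD_VERIFIER challenge response.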
+
+class AwsIdp:
+    """ AWS Identity Provider """
+
+    def __init__(self, ie, pool_id, client_id):
+        """
+        :param InfoExtractor ie: The extractor that instantiated this class.
+        :param str pool_id: The AWS user pool to connect to (format: <region>_<poolid>).
+               E.g.: eu-west-1_aLkOfYN3T
+        :param str client_id: The client application ID (the ID of the application connecting)
+        """
+
+        self.ie = ie
+
+        self.pool_id = pool_id
+        if "_" not in self.pool_id:
+            raise ValueError("Invalid pool_id format. Should be <region>_<poolid>.")
+
+        self.client_id = client_id
+        self.region = self.pool_id.split("_")[0]
+        self.url = "https://cognito-idp.%s.amazonaws.com/" % (self.region,)
+
+        # Initialize the values
+        # https://github.com/aws/amazon-cognito-identity-js/blob/master/src/AuthenticationHelper.js#L22
+        self.n_hex = 'FFFFFFFFFFFFFFFFC90FDAA22168C234C4C6628B80DC1CD1' + \
+            '29024E088A67CC74020BBEA63B139B22514A08798E3404DD' + \
+            'EF9519B3CD3A431B302B0A6DF25F14374FE1356D6D51C245' + \
+            'E485B576625E7EC6F44C42E9A637ED6B0BFF5CB6F406B7ED' + \
+            'EE386BFB5A899FA5AE9F24117C4B1FE649286651ECE45B3D' + \
+            'C2007CB8A163BF0598DA48361C55D39A69163FA8FD24CF5F' + \
+            '83655D23DCA3AD961C62F356208552BB9ED529077096966D' + \
+            '670C354E4ABC9804F1746C08CA18217C32905E462E36CE3B' + \
+            'E39E772C180E86039B2783A2EC07A28FB5C55DF06F4C52C9' + \
+            'DE2BCBF6955817183995497CEA956AE515D2261898FA0510' + \
+            '15728E5A8AAAC42DAD33170D04507A33A85521ABDF1CBA64' + \
+            'ECFB850458DBEF0A8AEA71575D060C7DB3970F85A6E1E4C7' + \
+            'ABF5AE8CDB0933D71E8C94E04A25619DCEE3D2261AD2EE6B' + \
+            'F12FFA06D98A0864D87602733EC86A64521F2B18177B200C' + \
+            'BBE117577A615D6C770988C0BAD946E208E24FA074E5AB31' + \
+            '43DB5BFCE0FD108E4B82D120A93AD2CAFFFFFFFFFFFFFFFF'
+
+        # https://github.com/aws/amazon-cognito-identity-js/blob/master/src/AuthenticationHelper.js#L49
+        self.g_hex = '2'
+        self.info_bits = bytearray('Caldera Derived Key', 'utf-8')
+
+        self.big_n = self.__hex_to_long(self.n_hex)
+        self.g = self.__hex_to_long(self.g_hex)
+        self.k = self.__hex_to_long(self.__hex_hash('00' + self.n_hex + '0' + self.g_hex))
+        self.small_a_value = self.__generate_random_small_a()
+        self.large_a_value = self.__calculate_a()
+
+    def authenticate(self, username, password):
+        """ Authenticate with a username and password.
+        """
+        # Step 1: First initiate an authentication request
+        auth_data_dict = self.__get_authentication_request(username)
+        auth_data = json.dumps(auth_data_dict).encode("utf-8")
+        auth_headers = {
+            "X-Amz-Target": "AWSCognitoIdentityProviderService.InitiateAuth",
+            "Accept-Encoding": "identity",
+            "Content-Type": "application/x-amz-json-1.1"
+        }
+        auth_response_json = self.ie._download_json(
+            self.url, None, data=auth_data, headers=auth_headers,
+            note='Authenticating username', errnote='Invalid username')
+        challenge_parameters = auth_response_json.get("ChallengeParameters")
+
+        if auth_response_json.get("ChallengeName") != "PASSWORD_VERIFIER":
+            raise AuthenticationException(auth_response_json["message"])
+
+        # Step 2: Respond to the Challenge with a valid ChallengeResponse
+        challenge_request = self.__get_challenge_response_request(challenge_parameters, password)
+        challenge_data = json.dumps(challenge_request).encode("utf-8")
+        challenge_headers = {
+            "X-Amz-Target": "AWSCognitoIdentityProviderService.RespondToAuthChallenge",
+            "Content-Type": "application/x-amz-json-1.1"
+        }
+        auth_response_json = self.ie._download_json(
+            self.url, None, data=challenge_data, headers=challenge_headers,
+            note='Authenticating password', errnote='Invalid password')
+
+        if 'message' in auth_response_json:
+            raise InvalidLoginException(auth_response_json['message'])
+        return (
+            auth_response_json['AuthenticationResult']['IdToken'],
+            auth_response_json['AuthenticationResult']['RefreshToken']
+        )
+
+    def __get_authentication_request(self, username):
+        """
+
+        :param str username: The username to use
+
+        :return: A full Authorization request.
+        :rtype: dict
+        """
+        auth_request = {
+            "AuthParameters": {
+                "USERNAME": username,
+                "SRP_A": self.__long_to_hex(self.large_a_value)
+            },
+            "AuthFlow": "USER_SRP_AUTH",
+            "ClientId": self.client_id
+        }
+        return auth_request
+
+    def __get_challenge_response_request(self, challenge_parameters, password):
+        """ Create a Challenge Response Request object.
+
+        :param dict[str,str|int] challenge_parameters: The parameters for the challenge.
+        :param str password: The password.
+
+        :return: A valid and full request data object to use as a response for a challenge.
+ :rtype: dict + """ + user_id = challenge_parameters["USERNAME"] + user_id_for_srp = challenge_parameters["USER_ID_FOR_SRP"] + srp_b = challenge_parameters["SRP_B"] + salt = challenge_parameters["SALT"] + secret_block = challenge_parameters["SECRET_BLOCK"] + + timestamp = self.__get_current_timestamp() + + # Get a HKDF key for the password, SrpB and the Salt + hkdf = self.__get_hkdf_key_for_password( + user_id_for_srp, + password, + self.__hex_to_long(srp_b), + salt + ) + secret_block_bytes = base64.standard_b64decode(secret_block) + + # the message is a combo of the pool_id, provided SRP userId, the Secret and Timestamp + msg = \ + bytearray(self.pool_id.split('_')[1], 'utf-8') + \ + bytearray(user_id_for_srp, 'utf-8') + \ + bytearray(secret_block_bytes) + \ + bytearray(timestamp, 'utf-8') + hmac_obj = hmac.new(hkdf, msg, digestmod=hashlib.sha256) + signature_string = base64.standard_b64encode(hmac_obj.digest()).decode('utf-8') + challenge_request = { + "ChallengeResponses": { + "USERNAME": user_id, + "TIMESTAMP": timestamp, + "PASSWORD_CLAIM_SECRET_BLOCK": secret_block, + "PASSWORD_CLAIM_SIGNATURE": signature_string + }, + "ChallengeName": "PASSWORD_VERIFIER", + "ClientId": self.client_id + } + return challenge_request + + def __get_hkdf_key_for_password(self, username, password, server_b_value, salt): + """ Calculates the final hkdf based on computed S value, and computed U value and the key. + + :param str username: Username. + :param str password: Password. + :param int server_b_value: Server B value. + :param int salt: Generated salt. + + :return Computed HKDF value. + :rtype: object + """ + + u_value = self.__calculate_u(self.large_a_value, server_b_value) + if u_value == 0: + raise ValueError('U cannot be zero.') + username_password = '%s%s:%s' % (self.pool_id.split('_')[1], username, password) + username_password_hash = self.__hash_sha256(username_password.encode('utf-8')) + + x_value = self.__hex_to_long(self.__hex_hash(self.__pad_hex(salt) + username_password_hash)) + g_mod_pow_xn = pow(self.g, x_value, self.big_n) + int_value2 = server_b_value - self.k * g_mod_pow_xn + s_value = pow(int_value2, self.small_a_value + u_value * x_value, self.big_n) + hkdf = self.__compute_hkdf( + bytearray.fromhex(self.__pad_hex(s_value)), + bytearray.fromhex(self.__pad_hex(self.__long_to_hex(u_value))) + ) + return hkdf + + def __compute_hkdf(self, ikm, salt): + """ Standard hkdf algorithm + + :param {Buffer} ikm Input key material. + :param {Buffer} salt Salt value. + :return {Buffer} Strong key material. + """ + + prk = hmac.new(salt, ikm, hashlib.sha256).digest() + info_bits_update = self.info_bits + bytearray(chr(1), 'utf-8') + hmac_hash = hmac.new(prk, info_bits_update, hashlib.sha256).digest() + return hmac_hash[:16] + + def __calculate_u(self, big_a, big_b): + """ Calculate the client's value U which is the hash of A and B + + :param int big_a: Large A value. + :param int big_b: Server B value. + + :return Computed U value. + :rtype: int + """ + + u_hex_hash = self.__hex_hash(self.__pad_hex(big_a) + self.__pad_hex(big_b)) + return self.__hex_to_long(u_hex_hash) + + def __generate_random_small_a(self): + """ Helper function to generate a random big integer + + :return a random value. + :rtype: int + """ + random_long_int = self.__get_random(128) + return random_long_int % self.big_n + + def __calculate_a(self): + """ Calculate the client's public value A = g^a%N with the generated random number a + + :return Computed large A. 
+ :rtype: int + """ + + big_a = pow(self.g, self.small_a_value, self.big_n) + # safety check + if (big_a % self.big_n) == 0: + raise ValueError('Safety check for A failed') + return big_a + + @staticmethod + def __long_to_hex(long_num): + return '%x' % long_num + + @staticmethod + def __hex_to_long(hex_string): + return int(hex_string, 16) + + @staticmethod + def __hex_hash(hex_string): + return AwsIdp.__hash_sha256(bytearray.fromhex(hex_string)) + + @staticmethod + def __hash_sha256(buf): + """AuthenticationHelper.hash""" + digest = hashlib.sha256(buf).hexdigest() + return (64 - len(digest)) * '0' + digest + + @staticmethod + def __pad_hex(long_int): + """ Converts a Long integer (or hex string) to hex format padded with zeroes for hashing + + :param int|str long_int: Number or string to pad. + + :return Padded hex string. + :rtype: str + """ + + if not isinstance(long_int, str): + hash_str = AwsIdp.__long_to_hex(long_int) + else: + hash_str = long_int + if len(hash_str) % 2 == 1: + hash_str = '0%s' % hash_str + elif hash_str[0] in '89ABCDEFabcdef': + hash_str = '00%s' % hash_str + return hash_str + + @staticmethod + def __get_random(nbytes): + random_hex = binascii.hexlify(os.urandom(nbytes)) + return AwsIdp.__hex_to_long(random_hex) + + @staticmethod + def __get_current_timestamp(): + """ Creates a timestamp with the correct English format. + + :return: timestamp in format 'Sun Jan 27 19:00:04 UTC 2019' + :rtype: str + """ + + # We need US only data, so we cannot just do a strftime: + # Sun Jan 27 19:00:04 UTC 2019 + months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] + days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] + + time_now = datetime.datetime.utcnow() + format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day) + time_string = datetime.datetime.utcnow().strftime(format_string) + return time_string + + def __str__(self): + return "AWS IDP Client for:\nRegion: %s\nPoolId: %s\nAppId: %s" % ( + self.region, self.pool_id.split("_")[1], self.client_id + ) diff --git a/hypervideo_dl/extractor/gopro.py b/hypervideo_dl/extractor/gopro.py index 10cc1ae..ae96537 100644 --- a/hypervideo_dl/extractor/gopro.py +++ b/hypervideo_dl/extractor/gopro.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -81,8 +78,6 @@ class GoProIE(InfoExtractor): 'height': int_or_none(fmt.get('height')), }) - self._sort_formats(formats) - title = str_or_none( try_get(metadata, lambda x: x['collection']['title']) or self._html_search_meta(['og:title', 'twitter:title'], webpage) diff --git a/hypervideo_dl/extractor/goshgay.py b/hypervideo_dl/extractor/goshgay.py index 377981d..9a1f32b 100644 --- a/hypervideo_dl/extractor/goshgay.py +++ b/hypervideo_dl/extractor/goshgay.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_parse_qs, diff --git a/hypervideo_dl/extractor/gotostage.py b/hypervideo_dl/extractor/gotostage.py index 6aa9610..112293b 100644 --- a/hypervideo_dl/extractor/gotostage.py +++ b/hypervideo_dl/extractor/gotostage.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/gputechconf.py b/hypervideo_dl/extractor/gputechconf.py index 73dc62c..2d13bf4 100644 --- 
a/hypervideo_dl/extractor/gputechconf.py +++ b/hypervideo_dl/extractor/gputechconf.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/gronkh.py b/hypervideo_dl/extractor/gronkh.py index c9f1dd2..b9370e3 100644 --- a/hypervideo_dl/extractor/gronkh.py +++ b/hypervideo_dl/extractor/gronkh.py @@ -1,20 +1,34 @@ -# coding: utf-8 -from __future__ import unicode_literals +import functools from .common import InfoExtractor -from ..utils import unified_strdate +from ..utils import ( + OnDemandPagedList, + traverse_obj, + unified_strdate, +) class GronkhIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?stream/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/(?:watch/)?streams?/(?P<id>\d+)' _TESTS = [{ + 'url': 'https://gronkh.tv/streams/657', + 'info_dict': { + 'id': '657', + 'ext': 'mp4', + 'title': 'H.O.R.D.E. - DAS ZWEiTE ZEiTALTER 🎲 Session 1', + 'view_count': int, + 'thumbnail': 'https://01.cdn.vod.farm/preview/9e2555d3a23bf4e5c5b7c6b3b70a9d84.jpg', + 'upload_date': '20221111' + }, + 'params': {'skip_download': True} + }, { 'url': 'https://gronkh.tv/stream/536', 'info_dict': { 'id': '536', 'ext': 'mp4', 'title': 'GTV0536, 2021-10-01 - MARTHA IS DEAD #FREiAB1830 !FF7 !horde !archiv', - 'view_count': 19491, + 'view_count': int, 'thumbnail': 'https://01.cdn.vod.farm/preview/6436746cce14e25f751260a692872b9b.jpg', 'upload_date': '20211001' }, @@ -34,7 +48,6 @@ class GronkhIE(InfoExtractor): 'url': data_json['vtt_url'], 'ext': 'vtt', }) - self._sort_formats(formats) return { 'id': id, 'title': data_json.get('title'), @@ -44,3 +57,54 @@ class GronkhIE(InfoExtractor): 'formats': formats, 'subtitles': subtitles, } + + +class GronkhFeedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv(?:/feed)?/?(?:#|$)' + IE_NAME = 'gronkh:feed' + + _TESTS = [{ + 'url': 'https://gronkh.tv/feed', + 'info_dict': { + 'id': 'feed', + }, + 'playlist_count': 16, + }, { + 'url': 'https://gronkh.tv', + 'only_matching': True, + }] + + def _entries(self): + for type_ in ('recent', 'views'): + info = self._download_json( + f'https://api.gronkh.tv/v1/video/discovery/{type_}', 'feed', note=f'Downloading {type_} API JSON') + for item in traverse_obj(info, ('discovery', ...)) or []: + yield self.url_result(f'https://gronkh.tv/watch/stream/{item["episode"]}', GronkhIE, item.get('title')) + + def _real_extract(self, url): + return self.playlist_result(self._entries(), 'feed') + + +class GronkhVodsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gronkh\.tv/vods/streams/?(?:#|$)' + IE_NAME = 'gronkh:vods' + + _TESTS = [{ + 'url': 'https://gronkh.tv/vods/streams', + 'info_dict': { + 'id': 'vods', + }, + 'playlist_mincount': 150, + }] + _PER_PAGE = 25 + + def _fetch_page(self, page): + items = traverse_obj(self._download_json( + 'https://api.gronkh.tv/v1/search', 'vods', query={'offset': self._PER_PAGE * page, 'first': self._PER_PAGE}, + note=f'Downloading stream video page {page + 1}'), ('results', 'videos', ...)) + for item in items or []: + yield self.url_result(f'https://gronkh.tv/watch/stream/{item["episode"]}', GronkhIE, item['episode'], item.get('title')) + + def _real_extract(self, url): + entries = OnDemandPagedList(functools.partial(self._fetch_page), self._PER_PAGE) + return self.playlist_result(entries, 'vods') diff --git a/hypervideo_dl/extractor/groupon.py b/hypervideo_dl/extractor/groupon.py index a6da909..362d3ff 100644 --- a/hypervideo_dl/extractor/groupon.py +++ 
b/hypervideo_dl/extractor/groupon.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/harpodeon.py b/hypervideo_dl/extractor/harpodeon.py new file mode 100644 index 0000000..0aa4733 --- /dev/null +++ b/hypervideo_dl/extractor/harpodeon.py @@ -0,0 +1,70 @@ +from .common import InfoExtractor +from ..utils import unified_strdate + + +class HarpodeonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?harpodeon\.com/(?:video|preview)/\w+/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.harpodeon.com/video/The_Smoking_Out_of_Bella_Butts/268068288', + 'md5': '727371564a6a9ebccef2073535b5b6bd', + 'skip': 'Free video could become unavailable', + 'info_dict': { + 'id': '268068288', + 'ext': 'mp4', + 'title': 'The Smoking Out of Bella Butts', + 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77', + 'creator': 'Vitagraph Company of America', + 'release_date': '19150101' + } + }, { + 'url': 'https://www.harpodeon.com/preview/The_Smoking_Out_of_Bella_Butts/268068288', + 'md5': '6dfea5412845f690c7331be703f884db', + 'info_dict': { + 'id': '268068288', + 'ext': 'mp4', + 'title': 'The Smoking Out of Bella Butts', + 'description': 'md5:47e16bdb41fc8a79c83ab83af11c8b77', + 'creator': 'Vitagraph Company of America', + 'release_date': '19150101' + } + }, { + 'url': 'https://www.harpodeon.com/preview/Behind_the_Screen/421838710', + 'md5': '7979df9ca04637282cb7d172ab3a9c3b', + 'info_dict': { + 'id': '421838710', + 'ext': 'mp4', + 'title': 'Behind the Screen', + 'description': 'md5:008972a3dc51fba3965ee517d2ba9155', + 'creator': 'Lone Star Corporation', + 'release_date': '19160101' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title, creator, release_year = self._search_regex( + r'''(?x) + <div[^>]+videoInfo[^<]*<h2[^>]*>(?P<title>[^>]+)</h2> + (?:\s*<p[^>]*>\((?P<creator>.+),\s*)?(?P<release_year>\d{4})?''', + webpage, 'title', group=('title', 'creator', 'release_year'), + fatal=False) or (None, None, None) + + hp_base = self._html_search_regex(r'hpBase\(\s*["\']([^"\']+)', webpage, 'hp_base') + + hp_inject_video, hp_resolution = self._search_regex( + r'''(?x) + hpInjectVideo\([\'\"](?P<hp_inject_video>\w+)[\'\"], + [\'\"](?P<hp_resolution>\d+)[\'\"]''', + webpage, 'hp_inject_video', group=['hp_inject_video', 'hp_resolution']) + + return { + 'id': video_id, + 'title': title, + 'url': f'{hp_base}{hp_inject_video}_{hp_resolution}.mp4', + 'http_headers': {'Referer': url}, + 'description': self._html_search_meta('description', webpage, fatal=False), + 'creator': creator, + 'release_date': unified_strdate(f'{release_year}0101') + } diff --git a/hypervideo_dl/extractor/hbo.py b/hypervideo_dl/extractor/hbo.py index 68df748..530bdb7 100644 --- a/hypervideo_dl/extractor/hbo.py +++ b/hypervideo_dl/extractor/hbo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -115,7 +112,6 @@ class HBOBaseIE(InfoExtractor): 'width': format_info.get('width'), 'height': format_info.get('height'), }) - self._sort_formats(formats) thumbnails = [] card_sizes = xpath_element(video_data, 'titleCardSizes') diff --git a/hypervideo_dl/extractor/hearthisat.py b/hypervideo_dl/extractor/hearthisat.py index a3d6a05..d1a400d 100644 --- a/hypervideo_dl/extractor/hearthisat.py +++ b/hypervideo_dl/extractor/hearthisat.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common 
import InfoExtractor from ..utils import ( determine_ext, @@ -85,7 +81,6 @@ class HearThisAtIE(InfoExtractor): 'acodec': ext, 'quality': 2, # Usually better quality }) - self._sort_formats(formats) return { 'id': track_id, diff --git a/hypervideo_dl/extractor/heise.py b/hypervideo_dl/extractor/heise.py index cbe564a..27d737c 100644 --- a/hypervideo_dl/extractor/heise.py +++ b/hypervideo_dl/extractor/heise.py @@ -1,13 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals +import urllib.parse from .common import InfoExtractor from .kaltura import KalturaIE from .youtube import YoutubeIE from ..utils import ( + NO_DEFAULT, determine_ext, int_or_none, - NO_DEFAULT, parse_iso8601, smuggle_url, xpath_text, @@ -26,6 +25,9 @@ class HeiseIE(InfoExtractor): 'timestamp': 1512734959, 'upload_date': '20171208', 'description': 'md5:c934cbfb326c669c2bcabcbe3d3fcd20', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'duration': 2845, + 'view_count': int, }, 'params': { 'skip_download': True, @@ -37,11 +39,27 @@ class HeiseIE(InfoExtractor): 'info_dict': { 'id': '6kmWbXleKW4', 'ext': 'mp4', - 'title': 'NEU IM SEPTEMBER | Netflix', - 'description': 'md5:2131f3c7525e540d5fd841de938bd452', + 'title': 'Neu im September 2017 | Netflix', + 'description': 'md5:d6852d1f96bb80760608eed3b907437c', 'upload_date': '20170830', 'uploader': 'Netflix Deutschland, Österreich und Schweiz', 'uploader_id': 'netflixdach', + 'categories': ['Entertainment'], + 'tags': 'count:27', + 'age_limit': 0, + 'availability': 'public', + 'comment_count': int, + 'channel_id': 'UCZqgRlLcvO3Fnx_npQJygcQ', + 'thumbnail': 'https://i.ytimg.com/vi_webp/6kmWbXleKW4/maxresdefault.webp', + 'uploader_url': 'http://www.youtube.com/user/netflixdach', + 'playable_in_embed': True, + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCZqgRlLcvO3Fnx_npQJygcQ', + 'view_count': int, + 'channel': 'Netflix Deutschland, Österreich und Schweiz', + 'channel_follower_count': int, + 'like_count': int, + 'duration': 67, }, 'params': { 'skip_download': True, @@ -55,11 +73,15 @@ class HeiseIE(InfoExtractor): 'description': 'md5:47e8ffb6c46d85c92c310a512d6db271', 'timestamp': 1512470717, 'upload_date': '20171205', + 'duration': 786, + 'view_count': int, + 'thumbnail': 're:^https?://.*/thumbnail/.*', }, 'params': { 'skip_download': True, }, }, { + # FIXME: Video m3u8 fails to download; issue with Kaltura extractor 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-20-8-Staubsaugerroboter-Xiaomi-Vacuum-2-AR-Brille-Meta-2-und-Android-rooten-3959893.html', 'info_dict': { 'id': '1_59mk80sf', @@ -73,6 +95,18 @@ class HeiseIE(InfoExtractor): 'skip_download': True, }, }, { + # videout + 'url': 'https://www.heise.de/ct/artikel/c-t-uplink-3-8-Anonyme-SIM-Karten-G-Sync-Monitore-Citizenfour-2440327.html', + 'info_dict': { + 'id': '2440327', + 'ext': 'mp4', + 'title': 'c\'t uplink 3.8: Anonyme SIM-Karten, G-Sync-Monitore, Citizenfour', + 'thumbnail': 'http://www.heise.de/imagine/yxM2qmol0xV3iFB7qFb70dGvXjc/gallery/', + 'description': 'md5:fa164d8c8707dff124a9626d39205f5d', + 'timestamp': 1414825200, + 'upload_date': '20141101', + } + }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, }, { @@ -124,26 +158,28 @@ class HeiseIE(InfoExtractor): if kaltura_id: return _make_kaltura_result('kaltura:2238431:%s' % kaltura_id) - yt_urls = YoutubeIE._extract_urls(webpage) + yt_urls = tuple(YoutubeIE._extract_embed_urls(url, webpage)) if yt_urls: return 
self.playlist_from_matches( yt_urls, video_id, title, ie=YoutubeIE.ie_key()) title = extract_title() - - container_id = self._search_regex( - r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', - webpage, 'container ID') - - sequenz_id = self._search_regex( - r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', - webpage, 'sequenz ID') - - doc = self._download_xml( - 'http://www.heise.de/videout/feed', video_id, query={ + api_params = urllib.parse.parse_qs( + self._search_regex(r'/videout/feed\.json\?([^\']+)', webpage, 'feed params', default=None) or '') + if not api_params or 'container' not in api_params or 'sequenz' not in api_params: + container_id = self._search_regex( + r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', + webpage, 'container ID') + + sequenz_id = self._search_regex( + r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', + webpage, 'sequenz ID') + api_params = { 'container': container_id, 'sequenz': sequenz_id, - }) + } + doc = self._download_xml( + 'http://www.heise.de/videout/feed', video_id, query=api_params) formats = [] for source_node in doc.findall('.//{http://rss.jwpcdn.com/}source'): @@ -158,7 +194,6 @@ class HeiseIE(InfoExtractor): 'format_id': '%s_%s' % (ext, label), 'height': height, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/hellporno.py b/hypervideo_dl/extractor/hellporno.py index 92d32cd..fa32b27 100644 --- a/hypervideo_dl/extractor/hellporno.py +++ b/hypervideo_dl/extractor/hellporno.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -41,7 +39,6 @@ class HellPornoIE(InfoExtractor): title = remove_end(self._html_extract_title(webpage), ' - Hell Porno') info = self._parse_html5_media_entries(url, webpage, display_id)[0] - self._sort_formats(info['formats']) video_id = self._search_regex( (r'chs_object\s*=\s*["\'](\d+)', diff --git a/hypervideo_dl/extractor/helsinki.py b/hypervideo_dl/extractor/helsinki.py index 575fb33..e518cae 100644 --- a/hypervideo_dl/extractor/helsinki.py +++ b/hypervideo_dl/extractor/helsinki.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import js_to_json @@ -33,7 +29,6 @@ class HelsinkiIE(InfoExtractor): 'url': s['file'], 'ext': 'mp4', } for s in params['sources']] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/hentaistigma.py b/hypervideo_dl/extractor/hentaistigma.py index 86a93de..ca5ffc2 100644 --- a/hypervideo_dl/extractor/hentaistigma.py +++ b/hypervideo_dl/extractor/hentaistigma.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/hgtv.py b/hypervideo_dl/extractor/hgtv.py index a4f3325..c40017d 100644 --- a/hypervideo_dl/extractor/hgtv.py +++ b/hypervideo_dl/extractor/hgtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/hidive.py b/hypervideo_dl/extractor/hidive.py index 46d7d62..3a53f2c 100644 --- a/hypervideo_dl/extractor/hidive.py +++ b/hypervideo_dl/extractor/hidive.py @@ -1,4 +1,3 @@ -# coding: utf-8 import re from .common import InfoExtractor @@ -39,7 +38,9 @@ class HiDiveIE(InfoExtractor): webpage = self._download_webpage(self._LOGIN_URL, None) form = self._search_regex( r'(?s)<form[^>]+action="/account/login"[^>]*>(.+?)</form>', - webpage, 'login 
form') + webpage, 'login form', default=None) + if not form: # logged in + return data = self._hidden_inputs(form) data.update({ 'Email': username, @@ -102,7 +103,6 @@ class HiDiveIE(InfoExtractor): f['language'] = audio f['format_note'] = f'{version}, {extra}' formats.extend(frmt) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/historicfilms.py b/hypervideo_dl/extractor/historicfilms.py index 56343e9..c428fee 100644 --- a/hypervideo_dl/extractor/historicfilms.py +++ b/hypervideo_dl/extractor/historicfilms.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import parse_duration diff --git a/hypervideo_dl/extractor/hitbox.py b/hypervideo_dl/extractor/hitbox.py index 0470d0a..f0c6898 100644 --- a/hypervideo_dl/extractor/hitbox.py +++ b/hypervideo_dl/extractor/hitbox.py @@ -1,16 +1,13 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( clean_html, - parse_iso8601, + determine_ext, float_or_none, int_or_none, - compat_str, - determine_ext, + parse_iso8601, ) @@ -121,7 +118,6 @@ class HitboxIE(InfoExtractor): 'tbr': bitrate, 'format_note': label, }) - self._sort_formats(formats) metadata = self._extract_metadata( 'https://www.smashcast.tv/api/media/video', video_id) @@ -130,7 +126,7 @@ class HitboxIE(InfoExtractor): return metadata -class HitboxLiveIE(HitboxIE): +class HitboxLiveIE(HitboxIE): # XXX: Do not subclass from concrete IE IE_NAME = 'hitbox:live' _VALID_URL = r'https?://(?:www\.)?(?:hitbox|smashcast)\.tv/(?P<id>[^/?#&]+)' _TESTS = [{ @@ -203,7 +199,6 @@ class HitboxLiveIE(HitboxIE): 'page_url': url, 'player_url': 'http://www.hitbox.tv/static/player/flowplayer/flowplayer.commercial-3.2.16.swf', }) - self._sort_formats(formats) metadata = self._extract_metadata( 'https://www.smashcast.tv/api/media/live', video_id) diff --git a/hypervideo_dl/extractor/hitrecord.py b/hypervideo_dl/extractor/hitrecord.py index fd5dc29..902af44 100644 --- a/hypervideo_dl/extractor/hitrecord.py +++ b/hypervideo_dl/extractor/hitrecord.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/hketv.py b/hypervideo_dl/extractor/hketv.py index 1f3502b..1087956 100644 --- a/hypervideo_dl/extractor/hketv.py +++ b/hypervideo_dl/extractor/hketv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -140,7 +137,6 @@ class HKETVIE(InfoExtractor): 'width': w, 'height': h, }) - self._sort_formats(formats) subtitles = {} tracks = try_get(playlist0, lambda x: x['tracks'], list) or [] diff --git a/hypervideo_dl/extractor/holodex.py b/hypervideo_dl/extractor/holodex.py new file mode 100644 index 0000000..a2b73ec --- /dev/null +++ b/hypervideo_dl/extractor/holodex.py @@ -0,0 +1,100 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import traverse_obj + + +class HolodexIE(InfoExtractor): + _VALID_URL = r'''(?x)https?://(?:www\.|staging\.)?holodex\.net/(?: + api/v2/playlist/(?P<playlist>\d+)| + watch/(?P<id>[\w-]{11})(?:\?(?:[^#]+&)?playlist=(?P<playlist2>\d+))? 
+ )''' + _TESTS = [{ + 'url': 'https://holodex.net/watch/9kQ2GtvDV3s', + 'md5': 'be5ffce2f0feae8ba4c01553abc0f175', + 'info_dict': { + 'ext': 'mp4', + 'id': '9kQ2GtvDV3s', + 'title': '【おちゃめ機能】ホロライブが吹っ切れた【24人で歌ってみた】', + 'channel_id': 'UCJFZiqLMntJufDCHc6bQixg', + 'playable_in_embed': True, + 'tags': 'count:43', + 'age_limit': 0, + 'live_status': 'not_live', + 'description': 'md5:040e866c09dc4ab899b36479f4b7c7a2', + 'channel_url': 'https://www.youtube.com/channel/UCJFZiqLMntJufDCHc6bQixg', + 'upload_date': '20200406', + 'uploader_url': 'http://www.youtube.com/channel/UCJFZiqLMntJufDCHc6bQixg', + 'view_count': int, + 'channel': 'hololive ホロライブ - VTuber Group', + 'categories': ['Music'], + 'uploader': 'hololive ホロライブ - VTuber Group', + 'channel_follower_count': int, + 'uploader_id': 'UCJFZiqLMntJufDCHc6bQixg', + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi_webp/9kQ2GtvDV3s/maxresdefault.webp', + 'duration': 263, + 'like_count': int, + }, + }, { + 'url': 'https://holodex.net/api/v2/playlist/239', + 'info_dict': { + 'id': '239', + 'title': 'Songs/Videos that made fall into the rabbit hole (from my google activity history)', + }, + 'playlist_count': 14, + }, { + 'url': 'https://holodex.net/watch/_m2mQyaofjI?foo=bar&playlist=69', + 'info_dict': { + 'id': '69', + 'title': '拿著金斧頭的藍髮大姊姊' + }, + 'playlist_count': 3, + }, { + 'url': 'https://holodex.net/watch/_m2mQyaofjI?playlist=69', + 'info_dict': { + 'id': '_m2mQyaofjI', + 'ext': 'mp4', + 'playable_in_embed': True, + 'like_count': int, + 'uploader': 'Ernst / エンスト', + 'duration': 11, + 'uploader_url': 'http://www.youtube.com/channel/UCqSX4PPZY0cyetqKVY_wRVA', + 'categories': ['Entertainment'], + 'title': '【星街すいせい】星街向你獻上晚安', + 'upload_date': '20210705', + 'description': 'md5:8b8ffb157bae77f2d109021a0b577d4a', + 'channel': 'Ernst / エンスト', + 'channel_id': 'UCqSX4PPZY0cyetqKVY_wRVA', + 'channel_follower_count': int, + 'view_count': int, + 'tags': [], + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCqSX4PPZY0cyetqKVY_wRVA', + 'availability': 'public', + 'thumbnail': 'https://i.ytimg.com/vi_webp/_m2mQyaofjI/maxresdefault.webp', + 'age_limit': 0, + 'uploader_id': 'UCqSX4PPZY0cyetqKVY_wRVA', + 'comment_count': int, + }, + 'params': {'noplaylist': True}, + }, { + 'url': 'https://staging.holodex.net/api/v2/playlist/125', + 'only_matching': True, + }, { + 'url': 'https://staging.holodex.net/watch/rJJTJA_T_b0?playlist=25', + 'only_matching': True, + }, { + 'url': 'https://staging.holodex.net/watch/s1ifBeukThg', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, playlist_id, pl_id2 = self._match_valid_url(url).group('id', 'playlist', 'playlist2') + playlist_id = playlist_id or pl_id2 + + if not self._yes_playlist(playlist_id, video_id): + return self.url_result(f'https://www.youtube.com/watch?v={video_id}', YoutubeIE) + + data = self._download_json(f'https://holodex.net/api/v2/playlist/{playlist_id}', playlist_id) + return self.playlist_from_matches( + traverse_obj(data, ('videos', ..., 'id')), playlist_id, data.get('name'), ie=YoutubeIE) diff --git a/hypervideo_dl/extractor/hornbunny.py b/hypervideo_dl/extractor/hornbunny.py deleted file mode 100644 index c458a95..0000000 --- a/hypervideo_dl/extractor/hornbunny.py +++ /dev/null @@ -1,49 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - parse_duration, -) - - -class HornBunnyIE(InfoExtractor): - _VALID_URL = 
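
# Illustrative sketch, not from the commit: what the traverse_obj call in the
# holodex hunk above evaluates to. `...` branches over every list element and
# silently skips entries without the final key (toy data invented here).
from hypervideo_dl.utils import traverse_obj

data = {'videos': [{'id': 'abc'}, {'title': 'no id'}, {'id': 'xyz'}]}
assert traverse_obj(data, ('videos', ..., 'id')) == ['abc', 'xyz']
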
r'http?://(?:www\.)?hornbunny\.com/videos/(?P<title_dash>[a-z-]+)-(?P<id>\d+)\.html' - _TEST = { - 'url': 'http://hornbunny.com/videos/panty-slut-jerk-off-instruction-5227.html', - 'md5': 'e20fd862d1894b67564c96f180f43924', - 'info_dict': { - 'id': '5227', - 'ext': 'mp4', - 'title': 'panty slut jerk off instruction', - 'duration': 550, - 'age_limit': 18, - 'view_count': int, - 'thumbnail': r're:^https?://.*\.jpg$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - title = self._og_search_title(webpage) - info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] - - duration = parse_duration(self._search_regex( - r'<strong>Runtime:</strong>\s*([0-9:]+)</div>', - webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - r'<strong>Views:</strong>\s*(\d+)</div>', - webpage, 'view count', fatal=False)) - - info_dict.update({ - 'id': video_id, - 'title': title, - 'duration': duration, - 'view_count': view_count, - 'age_limit': 18, - }) - - return info_dict diff --git a/hypervideo_dl/extractor/hotnewhiphop.py b/hypervideo_dl/extractor/hotnewhiphop.py index 4703e18..f8570cb 100644 --- a/hypervideo_dl/extractor/hotnewhiphop.py +++ b/hypervideo_dl/extractor/hotnewhiphop.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_b64decode from ..utils import ( diff --git a/hypervideo_dl/extractor/hotstar.py b/hypervideo_dl/extractor/hotstar.py index d55a79b..61eec7b 100644 --- a/hypervideo_dl/extractor/hotstar.py +++ b/hypervideo_dl/extractor/hotstar.py @@ -1,31 +1,33 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import hmac +import json import re import time import uuid -import json from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str -) +from ..compat import compat_HTTPError, compat_str from ..utils import ( - determine_ext, ExtractorError, + determine_ext, int_or_none, + join_nonempty, str_or_none, - try_get, + traverse_obj, url_or_none, ) class HotStarBaseIE(InfoExtractor): + _BASE_URL = 'https://www.hotstar.com' + _API_URL = 'https://api.hotstar.com' _AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee' + def _call_api_v1(self, path, *args, **kwargs): + return self._download_json( + f'{self._API_URL}/o/v1/{path}', *args, **kwargs, + headers={'x-country-code': 'IN', 'x-platform-code': 'PCTV'}) + def _call_api_impl(self, path, video_id, query, st=None, cookies=None): st = int_or_none(st) or int(time.time()) exp = st + 6000 @@ -36,7 +38,7 @@ class HotStarBaseIE(InfoExtractor): token = cookies.get('userUP').value else: token = self._download_json( - 'https://api.hotstar.com/um/v3/users', + f'{self._API_URL}/um/v3/users', video_id, note='Downloading token', data=json.dumps({"device_ids": [{"id": compat_str(uuid.uuid4()), "type": "device_id"}]}).encode('utf-8'), headers={ @@ -46,58 +48,48 @@ class HotStarBaseIE(InfoExtractor): })['user_identity'] response = self._download_json( - 'https://api.hotstar.com/' + path, video_id, headers={ + f'{self._API_URL}/{path}', video_id, query=query, + headers={ 'hotstarauth': auth, 'x-hs-appversion': '6.72.2', 'x-hs-platform': 'web', 'x-hs-usertoken': token, - }, query=query) + }) if response['message'] != "Playback URL's fetched successfully": raise ExtractorError( response['message'], expected=True) return response['data'] - def _call_api(self, path, video_id, 
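
# Illustrative sketch, not from the commit: the hotstar `auth` value built in
# _call_api_impl above is an Akamai-style HMAC token. The signing lines are
# elided by the hunk context, so this reconstruction from the surrounding
# st/exp code is an assumption:
import hashlib
import hmac
import time

_AKAMAI_ENCRYPTION_KEY = b'\x05\xfc\x1a\x01\xca\xc9\x4b\xc4\x12\xfc\x53\x12\x07\x75\xf9\xee'

def build_hotstar_auth(st=None, validity=6000):
    st = int(st or time.time())  # server time from the x-origin-date header, if known
    exp = st + validity
    auth = 'st=%d~exp=%d~acl=/*' % (st, exp)
    return auth + '~hmac=' + hmac.new(
        _AKAMAI_ENCRYPTION_KEY, auth.encode(), hashlib.sha256).hexdigest()
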
query_name='contentId'): - return self._download_json('https://api.hotstar.com/' + path, video_id=video_id, query={ - query_name: video_id, - 'tas': 10000, - }, headers={ - 'x-country-code': 'IN', - 'x-platform-code': 'PCTV', - }) - def _call_api_v2(self, path, video_id, st=None, cookies=None): return self._call_api_impl( - '%s/content/%s' % (path, video_id), video_id, st=st, cookies=cookies, query={ + f'{path}/content/{video_id}', video_id, st=st, cookies=cookies, query={ 'desired-config': 'audio_channel:stereo|container:fmp4|dynamic_range:hdr|encryption:plain|ladder:tv|package:dash|resolution:fhd|subs-tag:HotstarVIP|video_codec:h265', 'device-id': cookies.get('device_id').value if cookies.get('device_id') else compat_str(uuid.uuid4()), 'os-name': 'Windows', 'os-version': '10', }) + def _playlist_entries(self, path, item_id, root=None, **kwargs): + results = self._call_api_v1(path, item_id, **kwargs)['body']['results'] + for video in traverse_obj(results, (('assets', None), 'items', ...)): + if video.get('contentId'): + yield self.url_result( + HotStarIE._video_url(video['contentId'], root=root), HotStarIE, video['contentId']) + class HotStarIE(HotStarBaseIE): IE_NAME = 'hotstar' _VALID_URL = r'''(?x) - (?: - hotstar\:| - https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) - ) - (?: - (?P<type>movies|sports|episode|(?P<tv>tv)) - (?: - \:| - /[^/?#]+/ - (?(tv) - (?:[^/?#]+/){2}| - (?:[^/?#]+/)* - ) - )| - [^/?#]+/ - )? - (?P<id>\d{10}) - ''' + https?://(?:www\.)?hotstar\.com(?:/in)?/(?!in/) + (?: + (?P<type>movies|sports|episode|(?P<tv>tv))/ + (?(tv)(?:[^/?#]+/){2}|[^?#]*) + )? + [^/?#]+/ + (?P<id>\d{10}) + ''' + _TESTS = [{ 'url': 'https://www.hotstar.com/can-you-not-spread-rumours/1000076273', 'info_dict': { @@ -108,38 +100,9 @@ class HotStarIE(HotStarBaseIE): 'timestamp': 1447248600, 'upload_date': '20151111', 'duration': 381, + 'episode': 'Can You Not Spread Rumours?', }, - }, { - 'url': 'hotstar:1000076273', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', - 'info_dict': { - 'id': '1000057157', - 'ext': 'mp4', - 'title': 'Radha Gopalam', - 'description': 'md5:be3bc342cc120bbc95b3b0960e2b0d22', - 'timestamp': 1140805800, - 'upload_date': '20060224', - 'duration': 9182, - }, - }, { - 'url': 'hotstar:movies:1000057157', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/in/sports/cricket/follow-the-blues-2021/recap-eng-fight-back-on-day-2/1260066104', - 'only_matching': True, - }, { - 'url': 'https://www.hotstar.com/in/sports/football/most-costly-pl-transfers-ft-grealish/1260065956', - 'only_matching': True, - }, { - # contentData - 'url': 'hotstar:sports:1260065956', - 'only_matching': True, - }, { - # contentData - 'url': 'hotstar:sports:1260066104', - 'only_matching': True, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://www.hotstar.com/tv/ek-bhram-sarvagun-sampanna/s-2116/janhvi-targets-suman/1000234847', 'info_dict': { @@ -158,12 +121,19 @@ class HotStarIE(HotStarBaseIE): 'season_id': 6771, 'episode': 'Janhvi Targets Suman', 'episode_number': 8, - }, + } }, { - 'url': 'hotstar:episode:1000234847', + 'url': 'https://www.hotstar.com/movies/radha-gopalam/1000057157', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/sports/cricket/follow-the-blues-2021/recap-eng-fight-back-on-day-2/1260066104', + 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/sports/football/most-costly-pl-transfers-ft-grealish/1260065956', 'only_matching': True, }] _GEO_BYPASS = False + _TYPE = { 'movies': 
'movie', 'sports': 'match', @@ -172,41 +142,54 @@ class HotStarIE(HotStarBaseIE): None: 'content', } + _IGNORE_MAP = { + 'res': 'resolution', + 'vcodec': 'video_codec', + 'dr': 'dynamic_range', + } + + @classmethod + def _video_url(cls, video_id, video_type=None, *, slug='ignore_me', root=None): + assert None in (video_type, root) + if not root: + root = join_nonempty(cls._BASE_URL, video_type, delim='/') + return f'{root}/{slug}/{video_id}' + def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - video_type = mobj.group('type') - cookies = self._get_cookies(url) + video_id, video_type = self._match_valid_url(url).group('id', 'type') video_type = self._TYPE.get(video_type, video_type) - video_data = self._call_api(f'o/v1/{video_type}/detail', video_id)['body']['results']['item'] - title = video_data['title'] + cookies = self._get_cookies(url) # Cookies before any request + video_data = self._call_api_v1(f'{video_type}/detail', video_id, + query={'tas': 10000, 'contentId': video_id})['body']['results']['item'] if not self.get_param('allow_unplayable_formats') and video_data.get('drmProtected'): self.report_drm(video_id) - headers = {'Referer': 'https://www.hotstar.com/in'} - formats = [] - subs = {} + # See https://github.com/hypervideo/hypervideo/issues/396 + st = self._download_webpage_handle(f'{self._BASE_URL}/in', video_id)[1].headers.get('x-origin-date') + geo_restricted = False - _, urlh = self._download_webpage_handle('https://www.hotstar.com/in', video_id) - # Required to fix https://github.com/hypervideo/hypervideo/issues/396 - st = urlh.headers.get('x-origin-date') + formats, subs = [], {} + headers = {'Referer': f'{self._BASE_URL}/in'} + # change to v2 in the future playback_sets = self._call_api_v2('play/v1/playback', video_id, st=st, cookies=cookies)['playBackSets'] for playback_set in playback_sets: if not isinstance(playback_set, dict): continue - dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr') + tags = str_or_none(playback_set.get('tagsCombination')) or '' + if any(f'{prefix}:{ignore}' in tags + for key, prefix in self._IGNORE_MAP.items() + for ignore in self._configuration_arg(key)): + continue + format_url = url_or_none(playback_set.get('playbackUrl')) if not format_url: continue - format_url = re.sub( - r'(?<=//staragvod)(\d)', r'web\1', format_url) - tags = str_or_none(playback_set.get('tagsCombination')) or '' - ingored_res, ignored_vcodec, ignored_dr = self._configuration_arg('res'), self._configuration_arg('vcodec'), self._configuration_arg('dr') - if any(f'resolution:{ig_res}' in tags for ig_res in ingored_res) or any(f'video_codec:{ig_vc}' in tags for ig_vc in ignored_vcodec) or any(f'dynamic_range:{ig_dr}' in tags for ig_dr in ignored_dr): - continue + format_url = re.sub(r'(?<=//staragvod)(\d)', r'web\1', format_url) + dr = re.search(r'dynamic_range:(?P<dr>[a-z]+)', playback_set.get('tagsCombination')).group('dr') ext = determine_ext(format_url) + current_formats, current_subs = [], {} try: if 'package:hls' in tags or ext == 'm3u8': @@ -218,8 +201,7 @@ class HotStarIE(HotStarBaseIE): current_formats, current_subs = self._extract_mpd_formats_and_subtitles( format_url, video_id, mpd_id=f'{dr}-dash', headers=headers) elif ext == 'f4m': - # produce broken files - pass + pass # XXX: produce broken files else: current_formats = [{ 'url': format_url, @@ -230,6 +212,7 @@ class HotStarIE(HotStarBaseIE): if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: geo_restricted 
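
# Illustrative sketch, not from the commit: the playback-set filter above,
# reduced to plain Python. `extractor_args` stands in for
# self._configuration_arg(key); the tag strings are invented samples.
_IGNORE_MAP = {'res': 'resolution', 'vcodec': 'video_codec', 'dr': 'dynamic_range'}

def should_skip(tags, extractor_args):
    return any(f'{prefix}:{ignore}' in tags
               for key, prefix in _IGNORE_MAP.items()
               for ignore in extractor_args.get(key, []))

assert should_skip('encryption:plain|resolution:sd', {'res': ['sd']})
assert not should_skip('encryption:plain|resolution:fhd', {'res': ['sd']})
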
= True continue + if tags and 'encryption:plain' not in tags: for f in current_formats: f['has_drm'] = True @@ -238,18 +221,18 @@ class HotStarIE(HotStarBaseIE): for f in current_formats: if not f.get('langauge'): f['language'] = lang + formats.extend(current_formats) subs = self._merge_subtitles(subs, current_subs) + if not formats and geo_restricted: self.raise_geo_restricted(countries=['IN'], metadata_available=True) - self._sort_formats(formats) - for f in formats: f.setdefault('http_headers', {}).update(headers) return { 'id': video_id, - 'title': title, + 'title': video_data.get('title'), 'description': video_data.get('description'), 'duration': int_or_none(video_data.get('duration')), 'timestamp': int_or_none(video_data.get('broadcastDate') or video_data.get('startDate')), @@ -261,17 +244,51 @@ class HotStarIE(HotStarBaseIE): 'season': video_data.get('seasonName'), 'season_number': int_or_none(video_data.get('seasonNo')), 'season_id': video_data.get('seasonId'), - 'episode': title, + 'episode': video_data.get('title'), 'episode_number': int_or_none(video_data.get('episodeNo')), - 'http_headers': { - 'Referer': 'https://www.hotstar.com/in', - } } +class HotStarPrefixIE(InfoExtractor): + """ The "hotstar:" prefix is no longer in use, but this is kept for backward compatibility """ + IE_DESC = False + _VALID_URL = r'hotstar:(?:(?P<type>\w+):)?(?P<id>\d+)$' + _TESTS = [{ + 'url': 'hotstar:1000076273', + 'only_matching': True, + }, { + 'url': 'hotstar:movies:1260009879', + 'info_dict': { + 'id': '1260009879', + 'ext': 'mp4', + 'title': 'Nuvvu Naaku Nachav', + 'description': 'md5:d43701b1314e6f8233ce33523c043b7d', + 'timestamp': 1567525674, + 'upload_date': '20190903', + 'duration': 10787, + 'episode': 'Nuvvu Naaku Nachav', + }, + }, { + 'url': 'hotstar:episode:1000234847', + 'only_matching': True, + }, { + # contentData + 'url': 'hotstar:sports:1260065956', + 'only_matching': True, + }, { + # contentData + 'url': 'hotstar:sports:1260066104', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, video_type = self._match_valid_url(url).group('id', 'type') + return self.url_result(HotStarIE._video_url(video_id, video_type), HotStarIE, video_id) + + class HotStarPlaylistIE(HotStarBaseIE): IE_NAME = 'hotstar:playlist' - _VALID_URL = r'https?://(?:www\.)?hotstar\.com/tv/[^/]+/s-\w+/list/[^/]+/t-(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?hotstar\.com(?:/in)?/tv(?:/[^/]+){2}/list/[^/]+/t-(?P<id>\w+)' _TESTS = [{ 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/popular-clips/t-3_2_26', 'info_dict': { @@ -281,25 +298,49 @@ class HotStarPlaylistIE(HotStarBaseIE): }, { 'url': 'https://www.hotstar.com/tv/savdhaan-india/s-26/list/extras/t-2480', 'only_matching': True, + }, { + 'url': 'https://www.hotstar.com/in/tv/karthika-deepam/15457/list/popular-clips/t-3_2_1272', + 'only_matching': True, }] def _real_extract(self, url): - playlist_id = self._match_id(url) + id_ = self._match_id(url) + return self.playlist_result( + self._playlist_entries('tray/find', id_, query={'tas': 10000, 'uqId': id_}), id_) - collection = self._call_api('o/v1/tray/find', playlist_id, 'uqId')['body']['results'] - entries = [ - self.url_result( - 'https://www.hotstar.com/%s' % video['contentId'], - ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in collection['assets']['items'] - if video.get('contentId')] - return self.playlist_result(entries, playlist_id) +class HotStarSeasonIE(HotStarBaseIE): + IE_NAME = 'hotstar:season' + _VALID_URL = 
r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/\w+)/seasons/[^/]+/ss-(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://www.hotstar.com/tv/radhakrishn/1260000646/seasons/season-2/ss-8028', + 'info_dict': { + 'id': '8028', + }, + 'playlist_mincount': 35, + }, { + 'url': 'https://www.hotstar.com/in/tv/ishqbaaz/9567/seasons/season-2/ss-4357', + 'info_dict': { + 'id': '4357', + }, + 'playlist_mincount': 30, + }, { + 'url': 'https://www.hotstar.com/in/tv/bigg-boss/14714/seasons/season-4/ss-8208/', + 'info_dict': { + 'id': '8208', + }, + 'playlist_mincount': 19, + }] + + def _real_extract(self, url): + url, season_id = self._match_valid_url(url).groups() + return self.playlist_result(self._playlist_entries( + 'season/asset', season_id, url, query={'tao': 0, 'tas': 0, 'size': 10000, 'id': season_id}), season_id) class HotStarSeriesIE(HotStarBaseIE): IE_NAME = 'hotstar:series' - _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com(?:/in)?/tv/[^/]+/(?P<id>\d+))/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://www.hotstar.com/in/tv/radhakrishn/1260000646', 'info_dict': { @@ -317,25 +358,13 @@ class HotStarSeriesIE(HotStarBaseIE): 'info_dict': { 'id': '435', }, - 'playlist_mincount': 269, + 'playlist_mincount': 267, }] def _real_extract(self, url): url, series_id = self._match_valid_url(url).groups() - headers = { - 'x-country-code': 'IN', - 'x-platform-code': 'PCTV', - } - detail_json = self._download_json('https://api.hotstar.com/o/v1/show/detail?contentId=' + series_id, - video_id=series_id, headers=headers) - id = compat_str(try_get(detail_json, lambda x: x['body']['results']['item']['id'], int)) - item_json = self._download_json('https://api.hotstar.com/o/v1/tray/g/1/items?etid=0&tao=0&tas=10000&eid=' + id, - video_id=series_id, headers=headers) - entries = [ - self.url_result( - '%s/ignoreme/%d' % (url, video['contentId']), - ie=HotStarIE.ie_key(), video_id=video['contentId']) - for video in item_json['body']['results']['items'] - if video.get('contentId')] - - return self.playlist_result(entries, series_id) + id_ = self._call_api_v1( + 'show/detail', series_id, query={'contentId': series_id})['body']['results']['item']['id'] + + return self.playlist_result(self._playlist_entries( + 'tray/g/1/items', series_id, url, query={'tao': 0, 'tas': 10000, 'etid': 0, 'eid': id_}), series_id) diff --git a/hypervideo_dl/extractor/howcast.py b/hypervideo_dl/extractor/howcast.py index 7e36b85..59cf80f 100644 --- a/hypervideo_dl/extractor/howcast.py +++ b/hypervideo_dl/extractor/howcast.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import parse_iso8601 diff --git a/hypervideo_dl/extractor/howstuffworks.py b/hypervideo_dl/extractor/howstuffworks.py index cf90ab3..238fc0b 100644 --- a/hypervideo_dl/extractor/howstuffworks.py +++ b/hypervideo_dl/extractor/howstuffworks.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( find_xpath_attr, @@ -77,8 +75,6 @@ class HowStuffWorksIE(InfoExtractor): 'vbr': vbr, }) - self._sort_formats(formats) - return { 'id': '%s' % video_id, 'display_id': display_id, diff --git a/hypervideo_dl/extractor/hrfensehen.py b/hypervideo_dl/extractor/hrfensehen.py index e39ded2..35e9f67 100644 --- a/hypervideo_dl/extractor/hrfensehen.py +++ b/hypervideo_dl/extractor/hrfensehen.py @@ -1,17 +1,19 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re 
-from ..utils import int_or_none, unified_timestamp, unescapeHTML from .common import InfoExtractor +from ..utils import ( + int_or_none, + traverse_obj, + try_call, + unescapeHTML, + unified_timestamp, +) class HRFernsehenIE(InfoExtractor): IE_NAME = 'hrfernsehen' _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html' - _TESTS = [{ 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', 'md5': '5c4e0ba94677c516a2f65a84110fc536', @@ -24,10 +26,11 @@ class HRFernsehenIE(InfoExtractor): 'subtitles': {'de': [{ 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt' }]}, - 'timestamp': 1598470200, + 'timestamp': 1598400000, 'upload_date': '20200826', - 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg', - 'title': 'hessenschau vom 26.08.2020' + 'thumbnail': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg', + 'title': 'hessenschau vom 26.08.2020', + 'duration': 1654 } }, { 'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html', @@ -36,25 +39,18 @@ class HRFernsehenIE(InfoExtractor): _GEO_COUNTRIES = ['DE'] - def extract_airdate(self, loader_data): - airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate') - - if airdate_str is None: - return None - - return unified_timestamp(airdate_str) - def extract_formats(self, loader_data): stream_formats = [] - for stream_obj in loader_data["videoResolutionLevels"]: + data = loader_data['mediaCollection']['streams'][0]['media'] + for inner in data[1:]: stream_format = { - 'format_id': str(stream_obj['verticalResolution']) + "p", - 'height': stream_obj['verticalResolution'], - 'url': stream_obj['url'], + 'format_id': try_call(lambda: f'{inner["maxHResolutionPx"]}p'), + 'height': inner.get('maxHResolutionPx'), + 'url': inner['url'], } quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit', - stream_obj['url']) + inner['url']) if quality_information: stream_format['width'] = int_or_none(quality_information.group(1)) stream_format['height'] = int_or_none(quality_information.group(2)) @@ -62,8 +58,6 @@ class HRFernsehenIE(InfoExtractor): stream_format['tbr'] = int_or_none(quality_information.group(4)) stream_formats.append(stream_format) - - self._sort_formats(stream_formats) return stream_formats def _real_extract(self, url): @@ -75,22 +69,22 @@ class HRFernsehenIE(InfoExtractor): description = self._html_search_meta( ['description'], webpage) - loader_str = unescapeHTML(self._search_regex(r"data-new-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader")) + loader_str = unescapeHTML(self._search_regex(r"data-(?:new-)?hr-mediaplayer-loader='([^']*)'", webpage, 'ardloader')) loader_data = json.loads(loader_str) + subtitle = traverse_obj(loader_data, ('mediaCollection', 'subTitles', 0, 'sources', 0, 'url')) + info = { 'id': video_id, 'title': title, 'description': description, 'formats': self.extract_formats(loader_data), - 'timestamp': self.extract_airdate(loader_data) + 'subtitles': {'de': [{'url': subtitle}]}, + 'timestamp': unified_timestamp(self._search_regex( + r'<time\sdatetime="(\d{4}\W\d{1,2}\W\d{1,2})', webpage, 'datetime', fatal=False)), + 'duration': int_or_none(traverse_obj( + loader_data, ('playerConfig', 'pluginData', 'trackingAti@all', 'richMedia', 'duration'))), + 'thumbnail': 
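
# Illustrative sketch, not from the commit: the hrfensehen quality regex above
# applied to the stream filename that appears in the test's subtitle URL,
# pulling width, height, fps and bitrate out of the URL itself.
import re

m = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit',
              'hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt')
assert m.groups() == ('512', '288', '25', '500')
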
self._search_regex(r'thumbnailUrl\W*([^"]+)', webpage, 'thumbnail', default=None), } - if "subtitle" in loader_data: - info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]} - - thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()])) - if len(thumbnails) > 0: - info["thumbnails"] = [{"url": t} for t in thumbnails] - return info diff --git a/hypervideo_dl/extractor/hrti.py b/hypervideo_dl/extractor/hrti.py index 36d6007..cfec80d 100644 --- a/hypervideo_dl/extractor/hrti.py +++ b/hypervideo_dl/extractor/hrti.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -147,7 +144,6 @@ class HRTiIE(HRTiBaseIE): formats = self._extract_m3u8_formats( m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) description = clean_html(title_info.get('summary_long')) age_limit = parse_age_limit(video.get('parental_control', {}).get('rating')) diff --git a/hypervideo_dl/extractor/hse.py b/hypervideo_dl/extractor/hse.py index 9144ff8..3cb21d2 100644 --- a/hypervideo_dl/extractor/hse.py +++ b/hypervideo_dl/extractor/hse.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -26,7 +25,6 @@ class HSEShowBaseInfoExtractor(InfoExtractor): fmts, subs = self._extract_m3u8_formats_and_subtitles(src['url'], video_id, ext='mp4') formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats) return formats, subtitles diff --git a/hypervideo_dl/extractor/huajiao.py b/hypervideo_dl/extractor/huajiao.py index 4ca275d..c498fa3 100644 --- a/hypervideo_dl/extractor/huajiao.py +++ b/hypervideo_dl/extractor/huajiao.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_duration, diff --git a/hypervideo_dl/extractor/huffpost.py b/hypervideo_dl/extractor/huffpost.py index 54385ba..69fdc34 100644 --- a/hypervideo_dl/extractor/huffpost.py +++ b/hypervideo_dl/extractor/huffpost.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -19,6 +17,7 @@ class HuffPostIE(InfoExtractor): HPLEmbedPlayer/\?segmentId= ) (?P<id>[0-9a-f]+)''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://embed\.live\.huffingtonpost\.com/.+?)\1'] _TEST = { 'url': 'http://live.huffingtonpost.com/r/segment/legalese-it/52dd3e4b02a7602131000677', @@ -80,8 +79,6 @@ class HuffPostIE(InfoExtractor): 'vcodec': 'none' if key.startswith('audio/') else None, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': video_title, diff --git a/hypervideo_dl/extractor/hungama.py b/hypervideo_dl/extractor/hungama.py index 821b16e..2e99396 100644 --- a/hypervideo_dl/extractor/hungama.py +++ b/hypervideo_dl/extractor/hungama.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -23,15 +20,17 @@ class HungamaIE(InfoExtractor): ''' _TESTS = [{ 'url': 'http://www.hungama.com/video/krishna-chants/39349649/', - 'md5': 'a845a6d1ebd08d80c1035126d49bd6a0', + 'md5': '687c5f1e9f832f3b59f44ed0eb1f120a', 'info_dict': { - 'id': '2931166', + 'id': '39349649', 'ext': 'mp4', - 'title': 'Lucky Ali - Kitni Haseen Zindagi', - 'track': 'Kitni Haseen Zindagi', - 'artist': 'Lucky Ali', - 'album': 'Aks', - 'release_year': 2000, + 'title': 'Krishna Chants', + 'description': 'Watch Krishna Chants video now. 
You can also watch other latest videos only at Hungama', + 'upload_date': '20180829', + 'duration': 264, + 'timestamp': 1535500800, + 'view_count': int, + 'thumbnail': 'https://images.hungama.com/c/1/0dc/2ca/39349649/39349649_700x394.jpg', } }, { 'url': 'https://www.hungama.com/movie/kahaani-2/44129919/', @@ -43,12 +42,7 @@ class HungamaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - info = self._search_json_ld(webpage, video_id) - - m3u8_url = self._download_json( + video_json = self._download_json( 'https://www.hungama.com/index.php', video_id, data=urlencode_postdata({'content_id': video_id}), headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', @@ -56,18 +50,24 @@ class HungamaIE(InfoExtractor): }, query={ 'c': 'common', 'm': 'get_video_mdn_url', - })['stream_url'] + }) - formats = self._extract_m3u8_formats( - m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - self._sort_formats(formats) + formats = self._extract_m3u8_formats(video_json['stream_url'], video_id, ext='mp4', m3u8_id='hls') - info.update({ + json_ld = self._search_json_ld( + self._download_webpage(url, video_id, fatal=False) or '', video_id, fatal=False) + + return { + **json_ld, 'id': video_id, 'formats': formats, - }) - return info + 'subtitles': { + 'en': [{ + 'url': video_json['sub_title'], + 'ext': 'vtt', + }] + } if video_json.get('sub_title') else None, + } class HungamaSongIE(InfoExtractor): diff --git a/hypervideo_dl/extractor/huya.py b/hypervideo_dl/extractor/huya.py index 4e96f22..b6e9eec 100644 --- a/hypervideo_dl/extractor/huya.py +++ b/hypervideo_dl/extractor/huya.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import random @@ -9,7 +6,6 @@ from ..compat import compat_urlparse, compat_b64decode from ..utils import ( ExtractorError, int_or_none, - js_to_json, str_or_none, try_get, unescapeHTML, @@ -58,11 +54,7 @@ class HuyaLiveIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id=video_id) - json_stream = self._search_regex(r'"stream":\s+"([a-zA-Z0-9+=/]+)"', webpage, 'stream', default=None) - if not json_stream: - raise ExtractorError('Video is offline', expected=True) - stream_data = self._parse_json(compat_b64decode(json_stream).decode(), video_id=video_id, - transform_source=js_to_json) + stream_data = self._search_json(r'stream:\s', webpage, 'stream', video_id=video_id, default=None) room_info = try_get(stream_data, lambda x: x['data'][0]['gameLiveInfo']) if not room_info: raise ExtractorError('Can not extract the room info', expected=True) @@ -70,6 +62,8 @@ class HuyaLiveIE(InfoExtractor): screen_type = room_info.get('screenType') live_source_type = room_info.get('liveSourceType') stream_info_list = stream_data['data'][0]['gameStreamInfoList'] + if not stream_info_list: + raise ExtractorError('Video is offline', expected=True) formats = [] for stream_info in stream_info_list: stream_url = stream_info.get('sFlvUrl') @@ -99,8 +93,6 @@ class HuyaLiveIE(InfoExtractor): **self._RESOLUTION.get(si.get('sDisplayName'), {}), }) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/hypem.py b/hypervideo_dl/extractor/hypem.py index 9ca28d6..54db7b3 100644 --- a/hypervideo_dl/extractor/hypem.py +++ b/hypervideo_dl/extractor/hypem.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common 
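
# Illustrative sketch, not from the commit: roughly what the _search_json call
# in the huya hunk above replaces. The page snippet and variable name are
# invented, and the one-line regex here is a crude stand-in for _search_json's
# balanced-brace parsing.
import json
import re

webpage = 'window.playerConfig = { stream: {"data":[{"gameLiveInfo":{"nick":"streamer"}}]} };'
raw = re.search(r'stream:\s*(\{.*\})\s*\}', webpage).group(1)
assert json.loads(raw)['data'][0]['gameLiveInfo']['nick'] == 'streamer'
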
import InfoExtractor from ..utils import int_or_none diff --git a/hypervideo_dl/extractor/hytale.py b/hypervideo_dl/extractor/hytale.py new file mode 100644 index 0000000..0f4dcc3 --- /dev/null +++ b/hypervideo_dl/extractor/hytale.py @@ -0,0 +1,58 @@ +import re + +from .common import InfoExtractor +from ..utils import traverse_obj + + +class HytaleIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hytale\.com/news/\d+/\d+/(?P<id>[a-z0-9-]+)' + _TESTS = [{ + 'url': 'https://hytale.com/news/2021/07/summer-2021-development-update', + 'info_dict': { + 'id': 'summer-2021-development-update', + 'title': 'Summer 2021 Development Update', + }, + 'playlist_count': 4, + 'playlist': [{ + 'md5': '0854ebe347d233ee19b86ab7b2ead610', + 'info_dict': { + 'id': 'ed51a2609d21bad6e14145c37c334999', + 'ext': 'mp4', + 'title': 'Avatar Personalization', + 'thumbnail': r're:https://videodelivery\.net/\w+/thumbnails/thumbnail\.jpg', + } + }] + }, { + 'url': 'https://www.hytale.com/news/2019/11/hytale-graphics-update', + 'info_dict': { + 'id': 'hytale-graphics-update', + 'title': 'Hytale graphics update', + }, + 'playlist_count': 2, + }] + + def _real_initialize(self): + media_webpage = self._download_webpage( + 'https://hytale.com/media', None, note='Downloading list of media', fatal=False) or '' + + clips_json = traverse_obj( + self._search_json( + r'window\.__INITIAL_COMPONENTS_STATE__\s*=\s*\[', + media_webpage, 'clips json', None), + ('media', 'clips')) or [] + + self._titles = {clip.get('src'): clip.get('caption') for clip in clips_json} + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + entries = [ + self.url_result( + f'https://cloudflarestream.com/{video_hash}/manifest/video.mpd?parentOrigin=https%3A%2F%2Fhytale.com', + title=self._titles.get(video_hash), url_transparent=True) + for video_hash in re.findall( + r'<stream\s+class\s*=\s*"ql-video\s+cf-stream"\s+src\s*=\s*"([a-f0-9]{32})"', + webpage) + ] + + return self.playlist_result(entries, playlist_id, self._og_search_title(webpage)) diff --git a/hypervideo_dl/extractor/icareus.py b/hypervideo_dl/extractor/icareus.py new file mode 100644 index 0000000..d081cf4 --- /dev/null +++ b/hypervideo_dl/extractor/icareus.py @@ -0,0 +1,179 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + get_element_by_class, + int_or_none, + merge_dicts, + parse_bitrate, + parse_resolution, + remove_end, + str_or_none, + url_or_none, + urlencode_postdata, +) + + +class IcareusIE(InfoExtractor): + _DOMAINS = '|'.join(map(re.escape, ( + 'asahitv.fi', + 'helsinkikanava.fi', + 'hyvinvointitv.fi', + 'inez.fi', + 'permanto.fi', + 'suite.icareus.com', + 'videos.minifiddlers.org', + ))) + _VALID_URL = rf'(?P<base_url>https?://(?:www\.)?(?:{_DOMAINS}))/[^?#]+/player/[^?#]+\?(?:[^#]+&)?(?:assetId|eventId)=(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.helsinkikanava.fi/fi_FI/web/helsinkikanava/player/vod?assetId=68021894', + 'md5': 'ca0b62ffc814a5411dfa6349cf5adb8a', + 'info_dict': { + 'id': '68021894', + 'ext': 'mp4', + 'title': 'Perheiden parhaaksi', + 'description': 'md5:295785ea408e5ac00708766465cc1325', + 'thumbnail': 'https://www.helsinkikanava.fi/image/image_gallery?img_id=68022501', + 'upload_date': '20200924', + 'timestamp': 1600938300, + }, + }, { # Recorded livestream + 'url': 'https://www.helsinkikanava.fi/fi/web/helsinkikanava/player/event/view?eventId=76241489', + 'md5': '014327e69dfa7b949fcc861f6d162d6d', + 'info_dict': { + 'id': 
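
# Illustrative sketch, not from the commit: the <stream> scrape from the new
# hytale extractor above, run on an invented page snippet; each captured
# 32-hex-digit hash becomes a cloudflarestream.com manifest URL.
import re

snippet = '<stream class="ql-video cf-stream" src="0123456789abcdef0123456789abcdef"></stream>'
assert re.findall(
    r'<stream\s+class\s*=\s*"ql-video\s+cf-stream"\s+src\s*=\s*"([a-f0-9]{32})"',
    snippet) == ['0123456789abcdef0123456789abcdef']
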
'76258304', + 'ext': 'mp4', + 'title': 'Helsingin kaupungin ja HUSin tiedotustilaisuus koronaepidemiatilanteesta 24.11.2020', + 'description': 'md5:3129d041c6fbbcdc7fe68d9a938fef1c', + 'thumbnail': 'https://icareus-suite.secure2.footprint.net/image/image_gallery?img_id=76288630', + 'upload_date': '20201124', + 'timestamp': 1606206600, + }, + }, { # Non-m3u8 stream + 'url': 'https://suite.icareus.com/fi/web/westend-indians/player/vod?assetId=47567389', + 'md5': '72fc04ee971bbedc44405cdf16c990b6', + 'info_dict': { + 'id': '47567389', + 'ext': 'mp4', + 'title': 'Omatoiminen harjoittelu - Laukominen', + 'description': '', + 'thumbnail': 'https://suite.icareus.com/image/image_gallery?img_id=47568162', + 'upload_date': '20200319', + 'timestamp': 1584658080, + }, + }, { + 'url': 'https://asahitv.fi/fi/web/asahi/player/vod?assetId=89415818', + 'only_matching': True + }, { + 'url': 'https://hyvinvointitv.fi/fi/web/hyvinvointitv/player/vod?assetId=89149730', + 'only_matching': True + }, { + 'url': 'https://inez.fi/fi/web/inez-media/player/vod?assetId=71328822', + 'only_matching': True + }, { + 'url': 'https://www.permanto.fi/fi/web/alfatv/player/vod?assetId=135497515', + 'only_matching': True + }, { + 'url': 'https://videos.minifiddlers.org/web/international-minifiddlers/player/vod?assetId=1982759', + 'only_matching': True + }] + + def _real_extract(self, url): + base_url, temp_id = self._match_valid_url(url).groups() + webpage = self._download_webpage(url, temp_id) + + video_id = self._search_regex(r"_icareus\['itemId'\]\s*=\s*'(\d+)'", webpage, 'video_id') + organization_id = self._search_regex(r"_icareus\['organizationId'\]\s*=\s*'(\d+)'", webpage, 'organization_id') + + assets = self._download_json( + self._search_regex(r'var\s+publishingServiceURL\s*=\s*"(http[^"]+)";', webpage, 'api_base'), + video_id, data=urlencode_postdata({ + 'version': '03', + 'action': 'getAssetPlaybackUrls', + 'organizationId': organization_id, + 'assetId': video_id, + 'token': self._search_regex(r"_icareus\['token'\]\s*=\s*'([a-f0-9]+)'", webpage, 'icareus_token'), + })) + + subtitles = { + remove_end(sdesc.split(' ')[0], ':'): [{'url': url_or_none(surl)}] + for _, sdesc, surl in assets.get('subtitles') or [] + } + + formats = [{ + 'format': item.get('name'), + 'format_id': 'audio', + 'vcodec': 'none', + 'url': url_or_none(item['url']), + 'tbr': int_or_none(self._search_regex( + r'\((\d+)\s*k\)', item.get('name') or '', 'audio bitrate', default=None)), + } for item in assets.get('audio_urls') or [] if url_or_none(item.get('url'))] + + for item in assets.get('urls') or []: + video_url = url_or_none(item.get('url')) + if video_url is None: + continue + ext = determine_ext(video_url) + if ext == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + fmt = item.get('name') + formats.append({ + 'url': video_url, + 'format': fmt, + 'tbr': parse_bitrate(fmt), + 'format_id': str_or_none(item.get('id')), + **parse_resolution(fmt), + }) + + info, token, live_title = self._search_json_ld(webpage, video_id, default={}), None, None + if not info: + token = self._search_regex( + r'data\s*:\s*{action:"getAsset".*?token:\'([a-f0-9]+)\'}', webpage, 'token', default=None) + if not token: + live_title = get_element_by_class('unpublished-info-item future-event-title', webpage) + + if token: + metadata = self._download_json( + f'{base_url}/icareus-suite-api-portlet/publishing', + video_id, 
fatal=False, data=urlencode_postdata({ + 'version': '03', + 'action': 'getAsset', + 'organizationId': organization_id, + 'assetId': video_id, + 'languageId': 'en_US', + 'userId': '0', + 'token': token, + })) or {} + info = { + 'title': metadata.get('name'), + 'description': metadata.get('description'), + 'timestamp': int_or_none(metadata.get('date'), scale=1000), + 'duration': int_or_none(metadata.get('duration')), + 'thumbnail': url_or_none(metadata.get('thumbnailMedium')), + } + elif live_title: # Recorded livestream + info = { + 'title': live_title, + 'description': get_element_by_class('unpublished-info-item future-event-description', webpage), + 'timestamp': int_or_none(self._search_regex( + r'var startEvent\s*=\s*(\d+);', webpage, 'uploadDate', fatal=False), scale=1000), + } + + thumbnails = info.get('thumbnails') or [{ + 'url': url_or_none(info.get('thumbnail') or assets.get('thumbnail')) + }] + + return merge_dicts({ + 'id': video_id, + 'title': None, + 'formats': formats, + 'subtitles': subtitles, + 'description': clean_html(info.get('description')), + 'thumbnails': thumbnails if thumbnails[0]['url'] else None, + }, info) diff --git a/hypervideo_dl/extractor/ichinanalive.py b/hypervideo_dl/extractor/ichinanalive.py index cb39f82..9d55ddc 100644 --- a/hypervideo_dl/extractor/ichinanalive.py +++ b/hypervideo_dl/extractor/ichinanalive.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ExtractorError, str_or_none, traverse_obj, unified_strdate from ..compat import compat_str @@ -76,8 +73,6 @@ class IchinanaLiveIE(InfoExtractor): 'acodec': 'aac', }) - self._sort_formats(formats) - return { 'id': video_id, 'title': uploader or video_id, @@ -150,8 +145,6 @@ class IchinanaLiveClipIE(InfoExtractor): 'http_headers': {'Referer': url}, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': uploader or video_id, diff --git a/hypervideo_dl/extractor/ign.py b/hypervideo_dl/extractor/ign.py index c826eb3..d4797d3 100644 --- a/hypervideo_dl/extractor/ign.py +++ b/hypervideo_dl/extractor/ign.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -104,8 +102,6 @@ class IGNIE(IGNBaseIE): 'url': mezzanine_url, }) - self._sort_formats(formats) - thumbnails = [] for thumbnail in (video.get('thumbnails') or []): thumbnail_url = thumbnail.get('url') diff --git a/hypervideo_dl/extractor/iheart.py b/hypervideo_dl/extractor/iheart.py index b54c05e..2c6a5b6 100644 --- a/hypervideo_dl/extractor/iheart.py +++ b/hypervideo_dl/extractor/iheart.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/iltalehti.py b/hypervideo_dl/extractor/iltalehti.py new file mode 100644 index 0000000..0e7e82c --- /dev/null +++ b/hypervideo_dl/extractor/iltalehti.py @@ -0,0 +1,51 @@ +from .common import InfoExtractor +from ..utils import js_to_json, traverse_obj + + +class IltalehtiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?iltalehti\.fi/[^/?#]+/a/(?P<id>[^/?#])' + _TESTS = [ + # jwplatform embed main_media + { + 'url': 'https://www.iltalehti.fi/ulkomaat/a/9fbd067f-94e4-46cd-8748-9d958eb4dae2', + 'md5': 'af12d42c539f1f49f0b62d231fe72dcd', + 'info_dict': { + 'id': 'gYjjaf1L', + 'ext': 'mp4', + 'title': 'Sensuroimaton Päivärinta, jakso 227: Vieraana Suomen Venäjän ex-suurlähettiläs René Nyberg ja Kenraalimajuri evp Pekka Toveri', + 'description': 
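
# Illustrative sketch, not from the commit: what urlencode_postdata produces
# for the icareus getAsset call above - ordinary form-encoded bytes for the
# POST body (field set trimmed to a minimal invented subset).
from urllib.parse import urlencode

body = urlencode({'version': '03', 'action': 'getAsset', 'assetId': '47567389'}).encode()
assert body == b'version=03&action=getAsset&assetId=47567389'
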
'', + 'upload_date': '20220928', + 'timestamp': 1664360878, + 'duration': 2089, + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, + # jwplatform embed body + { + 'url': 'https://www.iltalehti.fi/politiikka/a/1ce49d85-1670-428b-8db8-d2479b9950a4', + 'md5': '9e50334b8f8330ce8828b567a82a3c65', + 'info_dict': { + 'id': '18R6zkLi', + 'ext': 'mp4', + 'title': 'Pekka Toverin arvio: Näin Nord Stream -kaasuputken räjäyttäminen on saatettu toteuttaa', + 'description': 'md5:3d1302c9e17e7ffd564143ff58f8de35', + 'upload_date': '20220929', + 'timestamp': 1664435867, + 'duration': 165.0, + 'thumbnail': r're:^https?://.*\.jpg', + }, + }, + ] + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + info = self._search_json( + r'<script>\s*window.App\s*=', webpage, 'json', article_id, + transform_source=js_to_json) + props = traverse_obj(info, ( + 'state', 'articles', ..., 'items', (('main_media', 'properties'), ('body', ..., 'properties')))) + video_ids = traverse_obj(props, (lambda _, v: v['provider'] == 'jwplayer', 'id')) + return self.playlist_from_matches( + video_ids, article_id, ie='JWPlatform', getter=lambda id: f'jwplatform:{id}', + title=traverse_obj(info, ('state', 'articles', ..., 'items', 'canonical_title'), get_all=False)) diff --git a/hypervideo_dl/extractor/imdb.py b/hypervideo_dl/extractor/imdb.py index 96cee2e..557a3b7 100644 --- a/hypervideo_dl/extractor/imdb.py +++ b/hypervideo_dl/extractor/imdb.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import base64 import json import re @@ -102,7 +100,6 @@ class ImdbIE(InfoExtractor): 'ext': ext, 'quality': quality(format_id), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/imggaming.py b/hypervideo_dl/extractor/imggaming.py index ce7b21a..8e220fd 100644 --- a/hypervideo_dl/extractor/imggaming.py +++ b/hypervideo_dl/extractor/imggaming.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -106,7 +103,6 @@ class ImgGamingBaseIE(InfoExtractor): formats.extend(self._extract_mpd_formats( media_url, media_id, mpd_id='dash', fatal=False, headers=self._MANIFEST_HEADERS)) - self._sort_formats(formats) subtitles = {} for subtitle in video_data.get('subtitles', []): diff --git a/hypervideo_dl/extractor/imgur.py b/hypervideo_dl/extractor/imgur.py index c917cf1..061c4cc 100644 --- a/hypervideo_dl/extractor/imgur.py +++ b/hypervideo_dl/extractor/imgur.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -86,8 +84,6 @@ class ImgurIE(InfoExtractor): }, }) - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, @@ -140,7 +136,7 @@ class ImgurGalleryIE(InfoExtractor): return self.url_result('http://imgur.com/%s' % gallery_id, ImgurIE.ie_key(), gallery_id) -class ImgurAlbumIE(ImgurGalleryIE): +class ImgurAlbumIE(ImgurGalleryIE): # XXX: Do not subclass from concrete IE IE_NAME = 'imgur:album' _VALID_URL = r'https?://(?:i\.)?imgur\.com/a/(?P<id>[a-zA-Z0-9]+)' diff --git a/hypervideo_dl/extractor/ina.py b/hypervideo_dl/extractor/ina.py index b3b2683..857013d 100644 --- a/hypervideo_dl/extractor/ina.py +++ b/hypervideo_dl/extractor/ina.py @@ -1,26 +1,19 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor -from ..utils import ( - determine_ext, - int_or_none, - strip_or_none, - xpath_attr, - xpath_text, -) +from ..utils import unified_strdate class 
InaIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)' + _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:[^?#]+/)(?P<id>[\w-]+)' _TESTS = [{ - 'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', - 'md5': 'a667021bf2b41f8dc6049479d9bb38a3', + 'url': 'https://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', + 'md5': 'c5a09e5cb5604ed10709f06e7a377dda', 'info_dict': { 'id': 'I12055569', 'ext': 'mp4', 'title': 'François Hollande "Je crois que c\'est clair"', - 'description': 'md5:3f09eb072a06cb286b8f7e4f77109663', + 'description': 'md5:19f61e2b4844ed4bb2e3df9ab9f527ff', + 'upload_date': '20070712', + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/3c4/I12055569.jpeg', } }, { 'url': 'https://www.ina.fr/video/S806544_001/don-d-organes-des-avancees-mais-d-importants-besoins-video.html', @@ -34,53 +27,58 @@ class InaIE(InfoExtractor): }, { 'url': 'http://m.ina.fr/video/I12055569', 'only_matching': True, + }, { + 'url': 'https://www.ina.fr/ina-eclaire-actu/video/cpb8205116303/les-jeux-electroniques', + 'md5': '4b8284a9a3a184fdc7e744225b8251e7', + 'info_dict': { + 'id': 'CPB8205116303', + 'ext': 'mp4', + 'title': 'Les jeux électroniques', + 'description': 'md5:e09f7683dad1cc60b74950490127d233', + 'upload_date': '19821204', + 'duration': 657, + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/203/CPB8205116303.jpeg', + }, + }, { + 'url': 'https://www.ina.fr/ina-eclaire-actu/arletty-carriere-conseils-actrice-marcel-carne', + 'md5': '743d6f069a00e19dda0da166a54eeccb', + 'info_dict': { + 'id': 'I22203233', + 'ext': 'mp4', + 'title': 'Arletty sur le métier d\'actrice', + 'description': 'md5:3d89b5e419d8514c934f146045ccdbad', + 'upload_date': '19581128', + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/082/I22203233.jpeg', + }, + }, { + 'url': 'https://www.ina.fr/ina-eclaire-actu/chasse-croise-sncf-gare-d-austerlitz-vacances-d-ete', + 'md5': 'a96fb85e9ba3b5c5b2eeb0c5daa55f2f', + 'info_dict': { + 'id': 'CAF91038285', + 'ext': 'mp4', + 'title': 'Les grands départs : les trains', + 'description': 'md5:1630ee819d8d4da97df53459e99f72bb', + 'upload_date': '19740801', + 'thumbnail': 'https://cdn-hub.ina.fr/notice/690x517/2cf/CAF91038285.jpeg', + }, }] def _real_extract(self, url): - video_id = self._match_id(url) - info_doc = self._download_xml( - 'http://player.ina.fr/notices/%s.mrss' % video_id, video_id) - item = info_doc.find('channel/item') - title = xpath_text(item, 'title', fatal=True) - media_ns_xpath = lambda x: self._xpath_ns(x, 'http://search.yahoo.com/mrss/') - content = item.find(media_ns_xpath('content')) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - get_furl = lambda x: xpath_attr(content, media_ns_xpath(x), 'url') - formats = [] - for q, w, h in (('bq', 400, 300), ('mq', 512, 384), ('hq', 768, 576)): - q_url = get_furl(q) - if not q_url: - continue - formats.append({ - 'format_id': q, - 'url': q_url, - 'width': w, - 'height': h, - }) - if not formats: - furl = get_furl('player') or content.attrib['url'] - ext = determine_ext(furl) - formats = [{ - 'url': furl, - 'vcodec': 'none' if ext == 'mp3' else None, - 'ext': ext, - }] + api_url = self._html_search_regex(r'asset-details-url\s*=\s*["\'](?P<api_url>[^"\']+)', webpage, 'api_url') + asset_id = self._search_regex(r'assets/([^?/]+)', api_url, 'asset_id') - thumbnails = [] - for thumbnail in content.findall(media_ns_xpath('thumbnail')): - thumbnail_url = 
thumbnail.get('url') - if not thumbnail_url: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'height': int_or_none(thumbnail.get('height')), - 'width': int_or_none(thumbnail.get('width')), - }) + api_response = self._download_json(api_url.replace(asset_id, f'{asset_id}.json'), asset_id) return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': strip_or_none(xpath_text(item, 'description')), - 'thumbnails': thumbnails, + 'id': asset_id, + 'url': api_response['resourceUrl'], + 'ext': {'video': 'mp4', 'audio': 'mp3'}.get(api_response.get('type')), + 'title': api_response.get('title'), + 'description': api_response.get('description'), + 'upload_date': unified_strdate(api_response.get('dateOfBroadcast')), + 'duration': api_response.get('duration'), + 'thumbnail': api_response.get('resourceThumbnail'), } diff --git a/hypervideo_dl/extractor/inc.py b/hypervideo_dl/extractor/inc.py index d5b258a..9b3fe9a 100644 --- a/hypervideo_dl/extractor/inc.py +++ b/hypervideo_dl/extractor/inc.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from .kaltura import KalturaIE diff --git a/hypervideo_dl/extractor/indavideo.py b/hypervideo_dl/extractor/indavideo.py index 4c16243..4fa97d8 100644 --- a/hypervideo_dl/extractor/indavideo.py +++ b/hypervideo_dl/extractor/indavideo.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -15,6 +10,14 @@ from ..utils import ( class IndavideoEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:embed\.)?indavideo\.hu/player/video/|assets\.indavideo\.hu/swf/player\.swf\?.*\b(?:v(?:ID|id))=)(?P<id>[\da-f]+)' + # Some example URLs covered by generic extractor: + # http://indavideo.hu/video/Vicces_cica_1 + # http://index.indavideo.hu/video/2015_0728_beregszasz + # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko + # http://erotika.indavideo.hu/video/Amator_tini_punci + # http://film.indavideo.hu/video/f_hrom_nagymamm_volt + # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)'] _TESTS = [{ 'url': 'http://indavideo.hu/player/video/1bdc3c6d80/', 'md5': 'c8a507a1c7410685f83a06eaeeaafeab', @@ -40,20 +43,6 @@ class IndavideoEmbedIE(InfoExtractor): 'only_matching': True, }] - # Some example URLs covered by generic extractor: - # http://indavideo.hu/video/Vicces_cica_1 - # http://index.indavideo.hu/video/2015_0728_beregszasz - # http://auto.indavideo.hu/video/Sajat_utanfutoban_a_kis_tacsko - # http://erotika.indavideo.hu/video/Amator_tini_punci - # http://film.indavideo.hu/video/f_hrom_nagymamm_volt - # http://palyazat.indavideo.hu/video/Embertelen_dal_Dodgem_egyuttes - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//embed\.indavideo\.hu/player/video/[\da-f]+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) @@ -100,7 +89,6 @@ class IndavideoEmbedIE(InfoExtractor): 'url': video_url, 'height': height, }) - self._sort_formats(formats) timestamp = video.get('date') if timestamp: diff --git a/hypervideo_dl/extractor/infoq.py b/hypervideo_dl/extractor/infoq.py index 347cc51..192bcfe 100644 --- a/hypervideo_dl/extractor/infoq.py +++ b/hypervideo_dl/extractor/infoq.py @@ -1,15 +1,13 @@ -# coding: utf-8 - -from __future__ import unicode_literals - from 
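
# Illustrative sketch, not from the commit: the URL rewrite in the new ina
# _real_extract above, which turns the scraped asset-details-url attribute
# into its JSON endpoint. The host below is hypothetical; only the asset id
# comes from the extractor's tests.
import re

api_url = 'https://api.example-ina.fr/assets/I12055569'  # hypothetical endpoint
asset_id = re.search(r'assets/([^?/]+)', api_url).group(1)
assert api_url.replace(asset_id, f'{asset_id}.json').endswith('/assets/I12055569.json')
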
..compat import ( compat_b64decode, compat_urllib_parse_unquote, compat_urlparse, ) from ..utils import ( + ExtractorError, determine_ext, update_url_query, + traverse_obj, ) from .bokecc import BokeCCBaseIE @@ -38,6 +36,7 @@ class InfoQIE(BokeCCBaseIE): 'ext': 'flv', 'description': 'md5:308d981fb28fa42f49f9568322c683ff', }, + 'skip': 'Sorry, the page you visited does not exist', }, { 'url': 'https://www.infoq.com/presentations/Simple-Made-Easy', 'md5': '0e34642d4d9ef44bf86f66f6399672db', @@ -90,8 +89,10 @@ class InfoQIE(BokeCCBaseIE): }] def _extract_http_audio(self, webpage, video_id): - fields = self._form_hidden_inputs('mp3Form', webpage) - http_audio_url = fields.get('filename') + try: + http_audio_url = traverse_obj(self._form_hidden_inputs('mp3Form', webpage), 'filename') + except ExtractorError: + http_audio_url = None if not http_audio_url: return [] @@ -127,8 +128,6 @@ class InfoQIE(BokeCCBaseIE): + self._extract_http_video(webpage) + self._extract_http_audio(webpage, video_id)) - self._sort_formats(formats) - return { 'id': video_id, 'title': video_title, diff --git a/hypervideo_dl/extractor/instagram.py b/hypervideo_dl/extractor/instagram.py index 970f2c8..0233513 100644 --- a/hypervideo_dl/extractor/instagram.py +++ b/hypervideo_dl/extractor/instagram.py @@ -1,19 +1,17 @@ -# coding: utf-8 - -import itertools import hashlib +import itertools import json import re import time +import urllib.error from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, -) from ..utils import ( ExtractorError, - format_field, + decode_base_n, + encode_base_n, float_or_none, + format_field, get_element_by_attribute, int_or_none, lowercase_escape, @@ -24,42 +22,59 @@ from ..utils import ( urlencode_postdata, ) +_ENCODING_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' + + +def _pk_to_id(id): + """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" + return encode_base_n(int(id.split('_')[0]), table=_ENCODING_CHARS) + + +def _id_to_pk(shortcode): + """Covert a shortcode to a numeric value""" + return decode_base_n(shortcode[:11], table=_ENCODING_CHARS) + class InstagramBaseIE(InfoExtractor): _NETRC_MACHINE = 'instagram' _IS_LOGGED_IN = False + _API_BASE_URL = 'https://i.instagram.com/api/v1' + _LOGIN_URL = 'https://www.instagram.com/accounts/login' + _API_HEADERS = { + 'X-IG-App-ID': '936619743392459', + 'X-ASBD-ID': '198387', + 'X-IG-WWW-Claim': '0', + 'Origin': 'https://www.instagram.com', + 'Accept': '*/*', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36', + } + def _perform_login(self, username, password): if self._IS_LOGGED_IN: return login_webpage = self._download_webpage( - 'https://www.instagram.com/accounts/login/', None, - note='Downloading login webpage', errnote='Failed to download login webpage') + self._LOGIN_URL, None, note='Downloading login webpage', errnote='Failed to download login webpage') - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - login_webpage, 'shared data', default='{}'), - None) - - login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={ - 'Accept': '*/*', - 'X-IG-App-ID': '936619743392459', - 'X-ASBD-ID': '198387', - 'X-IG-WWW-Claim': '0', - 'X-Requested-With': 'XMLHttpRequest', - 'X-CSRFToken': shared_data['config']['csrf_token'], - 'X-Instagram-AJAX': shared_data['rollout_hash'], - 
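
# Illustrative sketch, not from the commit: the shortcode <-> pk mapping that
# _pk_to_id/_id_to_pk above implement is plain base-64 positional notation
# over Instagram's URL-safe alphabet (the real helpers also strip a "_suffix"
# from media ids first). Self-contained round trip with an invented pk:
CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'

def pk_to_shortcode(pk: int) -> str:
    out = ''
    while pk:
        pk, r = divmod(pk, 64)
        out = CHARS[r] + out
    return out or CHARS[0]

def shortcode_to_pk(code: str) -> int:
    n = 0
    for ch in code:
        n = n * 64 + CHARS.index(ch)
    return n

assert shortcode_to_pk(pk_to_shortcode(1234567890)) == 1234567890
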
'Referer': 'https://www.instagram.com/', - }, data=urlencode_postdata({ - 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', - 'username': username, - 'queryParams': '{}', - 'optIntoOneTap': 'false', - 'stopDeletionNonce': '', - 'trustedDeviceRecords': '{}', - })) + shared_data = self._parse_json(self._search_regex( + r'window\._sharedData\s*=\s*({.+?});', login_webpage, 'shared data', default='{}'), None) + + login = self._download_json( + f'{self._LOGIN_URL}/ajax/', None, note='Logging in', headers={ + **self._API_HEADERS, + 'X-Requested-With': 'XMLHttpRequest', + 'X-CSRFToken': shared_data['config']['csrf_token'], + 'X-Instagram-AJAX': shared_data['rollout_hash'], + 'Referer': 'https://www.instagram.com/', + }, data=urlencode_postdata({ + 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}', + 'username': username, + 'queryParams': '{}', + 'optIntoOneTap': 'false', + 'stopDeletionNonce': '', + 'trustedDeviceRecords': '{}', + })) if not login.get('authenticated'): if login.get('message'): @@ -124,7 +139,7 @@ class InstagramBaseIE(InfoExtractor): } def _extract_product_media(self, product_media): - media_id = product_media.get('code') or product_media.get('id') + media_id = product_media.get('code') or _pk_to_id(product_media.get('pk')) vcodec = product_media.get('video_codec') dash_manifest_raw = product_media.get('video_dash_manifest') videos_list = product_media.get('video_versions') @@ -140,7 +155,6 @@ class InstagramBaseIE(InfoExtractor): } for format in videos_list or []] if dash_manifest_raw: formats.extend(self._parse_mpd_formats(self._parse_xml(dash_manifest_raw, media_id), mpd_id='dash')) - self._sort_formats(formats) thumbnails = [{ 'url': thumbnail.get('url'), @@ -160,7 +174,7 @@ class InstagramBaseIE(InfoExtractor): user_info = product_info.get('user') or {} info_dict = { - 'id': product_info.get('code') or product_info.get('id'), + 'id': _pk_to_id(traverse_obj(product_info, 'pk', 'id', expected_type=str_or_none)[:19]), 'title': product_info.get('title') or f'Video by {user_info.get("username")}', 'description': traverse_obj(product_info, ('caption', 'text'), expected_type=str_or_none), 'timestamp': int_or_none(product_info.get('taken_at')), @@ -170,6 +184,7 @@ class InstagramBaseIE(InfoExtractor): 'view_count': int_or_none(product_info.get('view_count')), 'like_count': int_or_none(product_info.get('like_count')), 'comment_count': int_or_none(product_info.get('comment_count')), + '__post_extractor': self.extract_comments(_pk_to_id(product_info.get('pk'))), 'http_headers': { 'Referer': 'https://www.instagram.com/', } @@ -191,6 +206,23 @@ class InstagramBaseIE(InfoExtractor): **self._extract_product_media(product_info) } + def _get_comments(self, video_id): + comments_info = self._download_json( + f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/comments/?can_support_threading=true&permalink_enabled=false', video_id, + fatal=False, errnote='Comments extraction failed', note='Downloading comments info', headers=self._API_HEADERS) or {} + + comment_data = traverse_obj(comments_info, ('edge_media_to_parent_comment', 'edges'), 'comments') + for comment_dict in comment_data or []: + yield { + 'author': traverse_obj(comment_dict, ('node', 'owner', 'username'), ('user', 'username')), + 'author_id': traverse_obj(comment_dict, ('node', 'owner', 'id'), ('user', 'pk')), + 'author_thumbnail': traverse_obj(comment_dict, ('node', 'owner', 'profile_pic_url'), ('user', 'profile_pic_url'), expected_type=url_or_none), + 'id': 
traverse_obj(comment_dict, ('node', 'id'), 'pk'), + 'text': traverse_obj(comment_dict, ('node', 'text'), 'text'), + 'like_count': traverse_obj(comment_dict, ('node', 'edge_liked_by', 'count'), 'comment_like_count', expected_type=int_or_none), + 'timestamp': traverse_obj(comment_dict, ('node', 'created_at'), 'created_at', expected_type=int_or_none), + } + class InstagramIOSIE(InfoExtractor): IE_DESC = 'IOS instagram:// URL' @@ -216,27 +248,14 @@ class InstagramIOSIE(InfoExtractor): 'add_ie': ['Instagram'] }] - def _get_id(self, id): - """Source: https://stackoverflow.com/questions/24437823/getting-instagram-post-url-from-media-id""" - chrs = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_' - media_id = int(id.split('_')[0]) - shortened_id = '' - while media_id > 0: - r = media_id % 64 - media_id = (media_id - r) // 64 - shortened_id = chrs[r] + shortened_id - return shortened_id - def _real_extract(self, url): - return { - '_type': 'url_transparent', - 'url': f'http://instagram.com/tv/{self._get_id(self._match_id(url))}/', - 'ie_key': 'Instagram', - } + video_id = _pk_to_id(self._match_id(url)) + return self.url_result(f'http://instagram.com/tv/{video_id}', InstagramIE, video_id) class InstagramIE(InstagramBaseIE): _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com(?:/[^/]+)?/(?:p|tv|reel)/(?P<id>[^/?#&]+))' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1'] _TESTS = [{ 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', @@ -246,7 +265,7 @@ class InstagramIE(InstagramBaseIE): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 0, + 'duration': 8.747, 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': '2815873', @@ -256,27 +275,34 @@ class InstagramIE(InstagramBaseIE): 'comment_count': int, 'comments': list, }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { - # missing description - 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', + # reel + 'url': 'https://www.instagram.com/reel/Chunk8-jurw/', + 'md5': 'f6d8277f74515fa3ff9f5791426e42b1', 'info_dict': { - 'id': 'BA-pQFBG8HZ', + 'id': 'Chunk8-jurw', 'ext': 'mp4', - 'title': 'Video by britneyspears', + 'title': 'Video by instagram', + 'description': 'md5:c9cde483606ed6f80fbe9283a6a2b290', 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 0, - 'timestamp': 1453760977, - 'upload_date': '20160125', - 'uploader_id': '12246775', - 'uploader': 'Britney Spears', - 'channel': 'britneyspears', + 'duration': 5.016, + 'timestamp': 1661529231, + 'upload_date': '20220826', + 'uploader_id': '25025320', + 'uploader': 'Instagram', + 'channel': 'instagram', 'like_count': int, 'comment_count': int, 'comments': list, }, - 'params': { - 'skip_download': True, - }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { # multi video post 'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/', @@ -285,18 +311,24 @@ class InstagramIE(InstagramBaseIE): 'id': 'BQ0dSaohpPW', 'ext': 'mp4', 'title': 'Video 1', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }, { 'info_dict': { 'id': 'BQ0dTpOhuHT', 'ext': 'mp4', 'title': 'Video 2', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }, { 'info_dict': { 'id': 'BQ0dT7RBFeF', 'ext': 'mp4', 'title': 
'Video 3', + 'thumbnail': r're:^https?://.*\.jpg', + 'view_count': int, }, }], 'info_dict': { @@ -304,6 +336,10 @@ class InstagramIE(InstagramBaseIE): 'title': 'Post by instagram', 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { # IGTV 'url': 'https://www.instagram.com/tv/BkfuX9UB-eK/', @@ -322,7 +358,11 @@ class InstagramIE(InstagramBaseIE): 'comment_count': int, 'comments': list, 'description': 'Meet Cass Hirst (@cass.fb), a fingerboarding pro who can perform tiny ollies and kickflips while blindfolded.', - } + }, + 'expected_warnings': [ + 'General metadata extraction failed', + 'Main webpage is locked behind the login page', + ], }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -340,59 +380,88 @@ class InstagramIE(InstagramBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_embed_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', - webpage) - if mobj: - return mobj.group('url') - - blockquote_el = get_element_by_attribute( - 'class', 'instagram-media', webpage) - if blockquote_el is None: - return + @classmethod + def _extract_embed_urls(cls, url, webpage): + res = tuple(super()._extract_embed_urls(url, webpage)) + if res: + return res - mobj = re.search( - r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) + mobj = re.search(r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', + get_element_by_attribute('class', 'instagram-media', webpage) or '') if mobj: - return mobj.group('link') + return [mobj.group('link')] def _real_extract(self, url): video_id, url = self._match_valid_url(url).group('id', 'url') - webpage, urlh = self._download_webpage_handle(url, video_id) - if 'www.instagram.com/accounts/login' in urlh.geturl(): - self.report_warning('Main webpage is locked behind the login page. 
' - 'Retrying with embed webpage (Note that some metadata might be missing)') - webpage = self._download_webpage( - 'https://www.instagram.com/p/%s/embed/' % video_id, video_id, note='Downloading embed webpage') - - shared_data = self._parse_json( - self._search_regex( - r'window\._sharedData\s*=\s*({.+?});', - webpage, 'shared data', default='{}'), - video_id, fatal=False) - media = traverse_obj( - shared_data, - ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), - ('entry_data', 'PostPage', 0, 'media'), - expected_type=dict) - - # _sharedData.entry_data.PostPage is empty when authenticated (see - # https://github.com/ytdl-org/youtube-dl/pull/22880) - if not media: - additional_data = self._parse_json( - self._search_regex( - r'window\.__additionalDataLoaded\s*\(\s*[^,]+,\s*({.+?})\s*\);', - webpage, 'additional data', default='{}'), - video_id, fatal=False) - product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) - if product_item: - return self._extract_product(product_item) - media = traverse_obj(additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {} - - if not media and 'www.instagram.com/accounts/login' in urlh.geturl(): - self.raise_login_required('You need to log in to access this content') + media, webpage = {}, '' + + if self._get_cookies(url).get('sessionid'): + info = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/media/{_id_to_pk(video_id)}/info/', video_id, + fatal=False, errnote='Video info extraction failed', + note='Downloading video info', headers=self._API_HEADERS), ('items', 0)) + if info: + media.update(info) + return self._extract_product(media) + + api_check = self._download_json( + f'{self._API_BASE_URL}/web/get_ruling_for_content/?content_type=MEDIA&target_id={_id_to_pk(video_id)}', + video_id, headers=self._API_HEADERS, fatal=False, note='Setting up session', errnote=False) or {} + csrf_token = self._get_cookies('https://www.instagram.com').get('csrftoken') + + if not csrf_token: + self.report_warning('No csrf token set by Instagram API', video_id) + else: + csrf_token = csrf_token.value if api_check.get('status') == 'ok' else None + if not csrf_token: + self.report_warning('Instagram API is not granting access', video_id) + + variables = { + 'shortcode': video_id, + 'child_comment_count': 3, + 'fetch_comment_count': 40, + 'parent_comment_count': 24, + 'has_threaded_comments': True, + } + general_info = self._download_json( + 'https://www.instagram.com/graphql/query/', video_id, fatal=False, errnote=False, + headers={ + **self._API_HEADERS, + 'X-CSRFToken': csrf_token or '', + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': url, + }, query={ + 'query_hash': '9f8827793ef34641b2fb195d4d41151c', + 'variables': json.dumps(variables, separators=(',', ':')), + }) + media.update(traverse_obj(general_info, ('data', 'shortcode_media')) or {}) + + if not general_info: + self.report_warning('General metadata extraction failed (some metadata might be missing).', video_id) + webpage, urlh = self._download_webpage_handle(url, video_id) + shared_data = self._search_json( + r'window\._sharedData\s*=', webpage, 'shared data', video_id, fatal=False) or {} + + if shared_data and self._LOGIN_URL not in urlh.geturl(): + media.update(traverse_obj( + shared_data, ('entry_data', 'PostPage', 0, 'graphql', 'shortcode_media'), + ('entry_data', 'PostPage', 0, 'media'), expected_type=dict) or {}) + else: + self.report_warning('Main webpage is locked behind the login page. 
Retrying with embed webpage (some metadata might be missing).') + webpage = self._download_webpage( + f'{url}/embed/', video_id, note='Downloading embed webpage', fatal=False) + additional_data = self._search_json( + r'window\.__additionalDataLoaded\s*\(\s*[^,]+,', webpage, 'additional data', video_id, fatal=False) + if not additional_data and not media: + self.raise_login_required('Requested content is not available, rate-limit reached or login required') + + product_item = traverse_obj(additional_data, ('items', 0), expected_type=dict) + if product_item: + media.update(product_item) + return self._extract_product(media) + + media.update(traverse_obj( + additional_data, ('graphql', 'shortcode_media'), 'shortcode_media', expected_type=dict) or {}) username = traverse_obj(media, ('owner', 'username')) or self._search_regex( r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', webpage, 'username', fatal=False) @@ -412,7 +481,7 @@ class InstagramIE(InstagramBaseIE): if nodes: return self.playlist_result( self._extract_nodes(nodes, True), video_id, - format_field(username, template='Post by %s'), description) + format_field(username, None, 'Post by %s'), description) video_url = self._og_search_video_url(webpage, secure=False) @@ -424,7 +493,6 @@ class InstagramIE(InstagramBaseIE): dash = traverse_obj(media, ('dash_info', 'video_dash_manifest')) if dash: formats.extend(self._parse_mpd_formats(self._parse_xml(dash, video_id), mpd_id='dash')) - self._sort_formats(formats) comment_data = traverse_obj(media, ('edge_media_to_parent_comment', 'edges')) comments = [{ @@ -521,7 +589,7 @@ class InstagramPlaylistBaseIE(InstagramBaseIE): except ExtractorError as e: # if it's an error caused by a bad query, and there are # more GIS templates to try, ignore it and keep trying - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 403: if gis_tmpl != gis_tmpls[-1]: continue raise @@ -631,41 +699,32 @@ class InstagramStoryIE(InstagramBaseIE): def _real_extract(self, url): username, story_id = self._match_valid_url(url).groups() - - story_info_url = f'{username}/{story_id}/?__a=1' if username == 'highlights' else f'{username}/?__a=1' - story_info = self._download_json(f'https://www.instagram.com/stories/{story_info_url}', story_id, headers={ - 'X-IG-App-ID': 936619743392459, - 'X-ASBD-ID': 198387, - 'X-IG-WWW-Claim': 0, - 'X-Requested-With': 'XMLHttpRequest', - 'Referer': url, - }) - user_id = story_info['user']['id'] - highlight_title = traverse_obj(story_info, ('highlight', 'title')) + story_info = self._download_webpage(url, story_id) + user_info = self._search_json(r'"user":', story_info, 'user info', story_id, fatal=False) + if not user_info: + self.raise_login_required('This content is unreachable') + user_id = user_info.get('id') story_info_url = user_id if username != 'highlights' else f'highlight:{story_id}' - videos = self._download_json(f'https://i.instagram.com/api/v1/feed/reels_media/?reel_ids={story_info_url}', story_id, headers={ - 'X-IG-App-ID': 936619743392459, - 'X-ASBD-ID': 198387, - 'X-IG-WWW-Claim': 0, - })['reels'] - - full_name = traverse_obj(videos, ('user', 'full_name')) - - user_info = {} - if not (username and username != 'highlights' and full_name): - user_info = self._download_json( - f'https://i.instagram.com/api/v1/users/{user_id}/info/', story_id, headers={ - 'User-Agent': 'Mozilla/5.0 (Linux; Android 11; SM-A505F Build/RP1A.200720.012; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 
Chrome/96.0.4664.45 Mobile Safari/537.36 Instagram 214.1.0.29.120 Android (30/11; 450dpi; 1080x2122; samsung; SM-A505F; a50; exynos9610; en_US; 333717274)', - }, note='Downloading user info') + videos = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/feed/reels_media/?reel_ids={story_info_url}', + story_id, errnote=False, fatal=False, headers=self._API_HEADERS), 'reels') + if not videos: + self.raise_login_required('You need to log in to access this content') - username = traverse_obj(user_info, ('user', 'username')) or username - full_name = traverse_obj(user_info, ('user', 'full_name')) or full_name + full_name = traverse_obj(videos, (f'highlight:{story_id}', 'user', 'full_name'), (str(user_id), 'user', 'full_name')) + story_title = traverse_obj(videos, (f'highlight:{story_id}', 'title')) + if not story_title: + story_title = f'Story by {username}' highlights = traverse_obj(videos, (f'highlight:{story_id}', 'items'), (str(user_id), 'items')) - return self.playlist_result([{ - **self._extract_product(highlight), - 'title': f'Story by {username}', - 'uploader': full_name, - 'uploader_id': user_id, - } for highlight in highlights], playlist_id=story_id, playlist_title=highlight_title) + info_data = [] + for highlight in highlights: + highlight_data = self._extract_product(highlight) + if highlight_data.get('formats'): + info_data.append({ + **highlight_data, + 'uploader': full_name, + 'uploader_id': user_id, + }) + return self.playlist_result(info_data, playlist_id=story_id, playlist_title=story_title) diff --git a/hypervideo_dl/extractor/internazionale.py b/hypervideo_dl/extractor/internazionale.py index 45e2af6..1b1cb57 100644 --- a/hypervideo_dl/extractor/internazionale.py +++ b/hypervideo_dl/extractor/internazionale.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unified_timestamp @@ -63,7 +60,6 @@ class InternazionaleIE(InfoExtractor): entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) formats.extend(self._extract_mpd_formats( video_base + 'mpd', display_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) timestamp = unified_timestamp(self._html_search_meta( 'article:published_time', webpage, 'timestamp')) diff --git a/hypervideo_dl/extractor/internetvideoarchive.py b/hypervideo_dl/extractor/internetvideoarchive.py index 880918c..9d2574c 100644 --- a/hypervideo_dl/extractor/internetvideoarchive.py +++ b/hypervideo_dl/extractor/internetvideoarchive.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json import re @@ -50,7 +48,6 @@ class InternetVideoArchiveIE(InfoExtractor): replace_url('.mpd'), video_id, mpd_id='dash', fatal=False)) formats.extend(self._extract_ism_formats( replace_url('Manifest'), video_id, ism_id='mss', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/iprima.py b/hypervideo_dl/extractor/iprima.py index 1a20384..1818205 100644 --- a/hypervideo_dl/extractor/iprima.py +++ b/hypervideo_dl/extractor/iprima.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import time @@ -151,9 +148,8 @@ class IPrimaIE(InfoExtractor): elif manifest_type == 'DASH' or ext == 'mpd': formats += self._extract_mpd_formats( manifest_url, video_id, mpd_id='dash', fatal=False) - self._sort_formats(formats) - final_result = self._search_json_ld(webpage, video_id) or {} + final_result = self._search_json_ld(webpage, video_id, default={}) final_result.update({ 'id': video_id, 
'title': title, @@ -251,8 +247,6 @@ class IPrimaCNNIE(InfoExtractor): if not formats and '>GEO_IP_NOT_ALLOWED<' in playerpage: self.raise_geo_restricted(countries=['CZ'], metadata_available=True) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/iqiyi.py b/hypervideo_dl/extractor/iqiyi.py index d07b39d..c41f6db 100644 --- a/hypervideo_dl/extractor/iqiyi.py +++ b/hypervideo_dl/extractor/iqiyi.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import itertools import re @@ -218,7 +215,6 @@ class IqiyiIE(InfoExtractor): self._sleep(5, video_id) - self._sort_formats(formats) title = (get_element_by_id('widget-videotitle', webpage) or clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) or self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title')) @@ -274,6 +270,7 @@ class IqIE(InfoExtractor): '1': 'zh_CN', '2': 'zh_TW', '3': 'en', + '4': 'kor', '18': 'th', '21': 'my', '23': 'vi', @@ -354,7 +351,7 @@ class IqIE(InfoExtractor): ''' def _extract_vms_player_js(self, webpage, video_id): - player_js_cache = self._downloader.cache.load('iq', 'player_js') + player_js_cache = self.cache.load('iq', 'player_js') if player_js_cache: return player_js_cache webpack_js_url = self._proto_relative_url(self._search_regex( @@ -367,7 +364,7 @@ class IqIE(InfoExtractor): f'https://stc.iqiyipic.com/_next/static/chunks/{webpack_map1.get(module_index, module_index)}.{webpack_map2[module_index]}.js', video_id, note=f'Downloading #{module_index} module JS', errnote='Unable to download module JS', fatal=False) or '' if 'vms request' in module_js: - self._downloader.cache.store('iq', 'player_js', module_js) + self.cache.store('iq', 'player_js', module_js) return module_js raise ExtractorError('Unable to extract player JS') @@ -420,8 +417,9 @@ class IqIE(InfoExtractor): ut_list = ['0'] # bid 0 as an initial format checker - dash_paths = self._parse_json(PhantomJSwrapper(self).get( - url, html='<!DOCTYPE html>', video_id=video_id, note2='Executing signature code', jscode=self._DASH_JS % { + dash_paths = self._parse_json(PhantomJSwrapper(self, timeout=120_000).get( + url, note2='Executing signature code (this may take a couple minutes)', + html='<!DOCTYPE html>', video_id=video_id, jscode=self._DASH_JS % { 'tvid': video_info['tvId'], 'vid': video_info['vid'], 'src': traverse_obj(next_props, ('initialProps', 'pageProps', 'ptid'), @@ -443,7 +441,7 @@ class IqIE(InfoExtractor): preview_time = traverse_obj( initial_format_data, ('boss_ts', (None, 'data'), ('previewTime', 'rtime')), expected_type=float_or_none, get_all=False) if traverse_obj(initial_format_data, ('boss_ts', 'data', 'prv'), expected_type=int_or_none): - self.report_warning('This preview video is limited%s' % format_field(preview_time, template=' to %s seconds')) + self.report_warning('This preview video is limited%s' % format_field(preview_time, None, ' to %s seconds')) # TODO: Extract audio-only formats for bid in set(traverse_obj(initial_format_data, ('program', 'video', ..., 'bid'), expected_type=str_or_none, default=[])): @@ -498,8 +496,6 @@ class IqIE(InfoExtractor): }) formats.extend(extracted_formats) - self._sort_formats(formats) - for sub_format in traverse_obj(initial_format_data, ('program', 'stl', ...), expected_type=dict, default=[]): lang = self._LID_TAGS.get(str_or_none(sub_format.get('lid')), sub_format.get('_name')) subtitles.setdefault(lang, []).extend([{ diff --git 
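A note on the iqiyi hunk above: locating the player JS is expensive (it means downloading webpack chunk modules one by one), so the extractor persists it via the extractor-level cache accessor this commit switches to (self.cache instead of self._downloader.cache). A minimal sketch of that load-else-fetch-and-store pattern, with an in-memory-backed stand-in for the real cache; the on-disk layout in the comment is an assumption, not taken from this commit:

import json
import os


class CacheSketch:
    # Stand-in for the real cache object, which persists JSON-serialisable
    # values keyed by (section, key); a real implementation writes them under
    # a per-section cache directory.
    def __init__(self, root):
        self.root = root

    def _path(self, section, key):
        return os.path.join(self.root, section, f'{key}.json')

    def load(self, section, key):
        try:
            with open(self._path(section, key)) as f:
                return json.load(f)
        except (OSError, ValueError):
            return None

    def store(self, section, key, value):
        os.makedirs(os.path.dirname(self._path(section, key)), exist_ok=True)
        with open(self._path(section, key), 'w') as f:
            json.dump(value, f)


def get_player_js(cache, fetch_module_js):
    # Mirrors _extract_vms_player_js: reuse the cached copy when present,
    # otherwise fetch, validate and store it for the next run.
    cached = cache.load('iq', 'player_js')
    if cached:
        return cached
    module_js = fetch_module_js()  # caller-supplied download callback
    if 'vms request' not in module_js:
        raise ValueError('Unable to extract player JS')
    cache.store('iq', 'player_js', module_js)
    return module_js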
a/hypervideo_dl/extractor/ir90tv.py b/hypervideo_dl/extractor/ir90tv.py deleted file mode 100644 index d5a3f6f..0000000 --- a/hypervideo_dl/extractor/ir90tv.py +++ /dev/null @@ -1,42 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import remove_start - - -class Ir90TvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?90tv\.ir/video/(?P<id>[0-9]+)/.*' - _TESTS = [{ - 'url': 'http://90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', - 'md5': '411dbd94891381960cb9e13daa47a869', - 'info_dict': { - 'id': '95719', - 'ext': 'mp4', - 'title': 'شایعات نقل و انتقالات مهم فوتبال اروپا 94/02/18', - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, { - 'url': 'http://www.90tv.ir/video/95719/%D8%B4%D8%A7%DB%8C%D8%B9%D8%A7%D8%AA-%D9%86%D9%82%D9%84-%D9%88-%D8%A7%D9%86%D8%AA%D9%82%D8%A7%D9%84%D8%A7%D8%AA-%D9%85%D9%87%D9%85-%D9%81%D9%88%D8%AA%D8%A8%D8%A7%D9%84-%D8%A7%D8%B1%D9%88%D9%BE%D8%A7-940218', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = remove_start(self._html_search_regex( - r'<title>([^<]+)</title>', webpage, 'title'), '90tv.ir :: ') - - video_url = self._search_regex( - r'<source[^>]+src="([^"]+)"', webpage, 'video url') - - thumbnail = self._search_regex(r'poster="([^"]+)"', webpage, 'thumbnail url', fatal=False) - - return { - 'url': video_url, - 'id': video_id, - 'title': title, - 'video_url': video_url, - 'thumbnail': thumbnail, - } diff --git a/hypervideo_dl/extractor/islamchannel.py b/hypervideo_dl/extractor/islamchannel.py new file mode 100644 index 0000000..253a846 --- /dev/null +++ b/hypervideo_dl/extractor/islamchannel.py @@ -0,0 +1,81 @@ +import re + +from .common import InfoExtractor +from ..utils import traverse_obj, urljoin + + +class IslamChannelIE(InfoExtractor): + _VALID_URL = r'https?://watch\.islamchannel\.tv/watch/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://watch.islamchannel.tv/watch/38604310', + 'info_dict': { + 'id': '38604310', + 'title': 'Omar - Young Omar', + 'description': 'md5:5cc7ddecef064ea7afe52eb5e0e33b55', + 'thumbnail': r're:https?://.+', + 'ext': 'mp4', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + thumbnail = self._search_regex( + r'data-poster="([^"]+)"', webpage, 'data poster', fatal=False) or \ + self._html_search_meta(('og:image', 'twitter:image'), webpage) + + headers = { + 'Token': self._search_regex(r'data-token="([^"]+)"', webpage, 'data token'), + 'Token-Expiry': self._search_regex(r'data-expiry="([^"]+)"', webpage, 'data expiry'), + 'Uvid': video_id, + } + show_stream = self._download_json( + f'https://v2-streams-elb.simplestreamcdn.com/api/show/stream/{video_id}', video_id, + query={ + 'key': self._search_regex(r'data-key="([^"]+)"', webpage, 'data key'), + 'platform': 'chrome', + }, headers=headers) + # TODO: show_stream['stream'] and show_stream['drm'] may contain something interesting + streams = self._download_json( + traverse_obj(show_stream, ('response', 'tokenization', 'url')), video_id, + headers=headers) + formats, subs = self._extract_m3u8_formats_and_subtitles(traverse_obj(streams, ('Streams', 'Adaptive')), video_id, 'mp4') + + return { + 'id': video_id, + 'title': self._html_search_meta(('og:title', 
'twitter:title'), webpage), + 'description': self._html_search_meta(('og:description', 'twitter:description', 'description'), webpage), + 'formats': formats, + 'subtitles': subs, + 'thumbnails': [{ + 'id': 'unscaled', + 'url': thumbnail.split('?')[0], + 'ext': 'jpg', + 'preference': 2, + }, { + 'id': 'orig', + 'url': thumbnail, + 'ext': 'jpg', + 'preference': 1, + }] if thumbnail else None, + } + + +class IslamChannelSeriesIE(InfoExtractor): + _VALID_URL = r'https?://watch\.islamchannel\.tv/series/(?P<id>[a-f\d-]+)' + _TESTS = [{ + 'url': 'https://watch.islamchannel.tv/series/a6cccef3-3ef1-11eb-bc19-06b69c2357cd', + 'info_dict': { + 'id': 'a6cccef3-3ef1-11eb-bc19-06b69c2357cd', + }, + 'playlist_mincount': 31, + }] + + def _real_extract(self, url): + pl_id = self._match_id(url) + webpage = self._download_webpage(url, pl_id) + + return self.playlist_from_matches( + re.finditer(r'<a\s+href="(/watch/\d+)"[^>]+?data-video-type="show">', webpage), + pl_id, getter=lambda x: urljoin(url, x.group(1)), ie=IslamChannelIE) diff --git a/hypervideo_dl/extractor/israelnationalnews.py b/hypervideo_dl/extractor/israelnationalnews.py new file mode 100644 index 0000000..35040f5 --- /dev/null +++ b/hypervideo_dl/extractor/israelnationalnews.py @@ -0,0 +1,50 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, traverse_obj + + +class IsraelNationalNewsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?israelnationalnews\.com/news/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.israelnationalnews.com/news/354520', + 'info_dict': { + 'id': '354520' + }, + 'playlist': [{ + 'info_dict': { + 'id': 'jA84wQhVvg8', + 'title': 'Even CNN Host Is Shocked by How Bad Biden\'s Approval Ratings Have Gotten | DM CLIPS | Rubin Report', + 'ext': 'mp4', + 'description': 'md5:b7325a3d00c7596337dc3ae37e32d35c', + 'channel': 'The Rubin Report', + 'channel_follower_count': int, + 'comment_count': int, + 'categories': ['News & Politics'], + 'like_count': int, + 'uploader_url': 'http://www.youtube.com/user/RubinReport', + 'uploader_id': 'RubinReport', + 'availability': 'public', + 'view_count': int, + 'duration': 240, + 'thumbnail': 'https://i.ytimg.com/vi_webp/jA84wQhVvg8/maxresdefault.webp', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'age_limit': 0, + 'tags': 'count:29', + 'channel_id': 'UCJdKr0Bgd_5saZYqLCa9mng', + 'channel_url': 'https://www.youtube.com/channel/UCJdKr0Bgd_5saZYqLCa9mng', + 'upload_date': '20220606', + 'uploader': 'The Rubin Report', + } + }] + }] + + def _real_extract(self, url): + news_article_id = self._match_id(url) + article_json = self._download_json( + f'https://www.israelnationalnews.com/Generic/NewAPI/Item?type=0&Item={news_article_id}', news_article_id) + + urls = traverse_obj(article_json, ('Content2', ..., 'content', ..., 'attrs', 'src')) + if not urls: + raise ExtractorError('This article does not have any videos', expected=True) + + return self.playlist_from_matches(urls, news_article_id, ie='Youtube') diff --git a/hypervideo_dl/extractor/itprotv.py b/hypervideo_dl/extractor/itprotv.py index 64cb4e6..4ac1260 100644 --- a/hypervideo_dl/extractor/itprotv.py +++ b/hypervideo_dl/extractor/itprotv.py @@ -1,5 +1,3 @@ -# coding: utf-8 - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/itv.py b/hypervideo_dl/extractor/itv.py index 66705a2..0681050 100644 --- a/hypervideo_dl/extractor/itv.py +++ b/hypervideo_dl/extractor/itv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import 
InfoExtractor @@ -175,7 +172,6 @@ class ITVIE(InfoExtractor): formats.append({ 'url': href, }) - self._sort_formats(formats) info = self._search_json_ld(webpage, video_id, default={}) if not info: json_ld = self._parse_json(self._search_regex( diff --git a/hypervideo_dl/extractor/ivi.py b/hypervideo_dl/extractor/ivi.py index 098ab66..27a222a 100644 --- a/hypervideo_dl/extractor/ivi.py +++ b/hypervideo_dl/extractor/ivi.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -16,6 +13,7 @@ class IviIE(InfoExtractor): IE_DESC = 'ivi.ru' IE_NAME = 'ivi' _VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)' + _EMBED_REGEX = [r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1'] _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] _LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c' @@ -168,7 +166,6 @@ class IviIE(InfoExtractor): 'quality': quality(content_format), 'filesize': int_or_none(f.get('size_in_bytes')), }) - self._sort_formats(formats) compilation = result.get('compilation') episode = title if compilation else None diff --git a/hypervideo_dl/extractor/ivideon.py b/hypervideo_dl/extractor/ivideon.py index 44b2208..7d1e554 100644 --- a/hypervideo_dl/extractor/ivideon.py +++ b/hypervideo_dl/extractor/ivideon.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import ( compat_urllib_parse_urlencode, @@ -71,7 +67,6 @@ class IvideonIE(InfoExtractor): 'ext': 'flv', 'quality': quality(format_id), } for format_id in self._QUALITIES] - self._sort_formats(formats) return { 'id': server_id, diff --git a/hypervideo_dl/extractor/iwara.py b/hypervideo_dl/extractor/iwara.py index c0e01e3..ec3e59c 100644 --- a/hypervideo_dl/extractor/iwara.py +++ b/hypervideo_dl/extractor/iwara.py @@ -1,21 +1,29 @@ -# coding: utf-8 -from __future__ import unicode_literals +import itertools import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlparse from ..utils import ( int_or_none, mimetype2ext, remove_end, - url_or_none, - unified_strdate, strip_or_none, + unified_strdate, + url_or_none, + urljoin, ) -class IwaraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.|ecchi\.)?iwara\.tv/videos/(?P<id>[a-zA-Z0-9]+)' +class IwaraBaseIE(InfoExtractor): + _BASE_REGEX = r'(?P<base_url>https?://(?:www\.|ecchi\.)?iwara\.tv)' + + def _extract_playlist(self, base_url, webpage): + for path in re.findall(r'class="title">\s*<a[^<]+href="([^"]+)', webpage): + yield self.url_result(urljoin(base_url, path)) + + +class IwaraIE(IwaraBaseIE): + _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/videos/(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ 'url': 'http://iwara.tv/videos/amVwUl1EHpAD9RD', # md5 is unstable @@ -60,7 +68,7 @@ class IwaraIE(InfoExtractor): webpage, urlh = self._download_webpage_handle(url, video_id) - hostname = compat_urllib_parse_urlparse(urlh.geturl()).hostname + hostname = urllib.parse.urlparse(urlh.geturl()).hostname # ecchi is 'sexy' in Japanese age_limit = 18 if hostname.split('.')[0] == 'ecchi' else 0 @@ -108,8 +116,6 @@ class IwaraIE(InfoExtractor): 'quality': 1 if format_id == 'Source' else 0, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -120,3 +126,114 @@ class IwaraIE(InfoExtractor): 'upload_date': upload_date, 'description': description, } + + +class IwaraPlaylistIE(IwaraBaseIE): + _VALID_URL = 
fr'{IwaraBaseIE._BASE_REGEX}/playlist/(?P<id>[^/?#&]+)' + IE_NAME = 'iwara:playlist' + + _TESTS = [{ + 'url': 'https://ecchi.iwara.tv/playlist/best-enf', + 'info_dict': { + 'title': 'Best enf', + 'uploader': 'Jared98112', + 'id': 'best-enf', + }, + 'playlist_mincount': 1097, + }, { + # urlencoded + 'url': 'https://ecchi.iwara.tv/playlist/%E3%83%97%E3%83%AC%E3%82%A4%E3%83%AA%E3%82%B9%E3%83%88-2', + 'info_dict': { + 'id': 'プレイリスト-2', + 'title': 'プレイリスト', + 'uploader': 'mainyu', + }, + 'playlist_mincount': 91, + }] + + def _real_extract(self, url): + playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') + playlist_id = urllib.parse.unquote(playlist_id) + webpage = self._download_webpage(url, playlist_id) + + return { + '_type': 'playlist', + 'id': playlist_id, + 'title': self._html_search_regex(r'class="title"[^>]*>([^<]+)', webpage, 'title', fatal=False), + 'uploader': self._html_search_regex(r'<h2>([^<]+)', webpage, 'uploader', fatal=False), + 'entries': self._extract_playlist(base_url, webpage), + } + + +class IwaraUserIE(IwaraBaseIE): + _VALID_URL = fr'{IwaraBaseIE._BASE_REGEX}/users/(?P<id>[^/?#&]+)' + IE_NAME = 'iwara:user' + + _TESTS = [{ + 'note': 'number of all videos page is just 1 page. less than 40 videos', + 'url': 'https://ecchi.iwara.tv/users/infinityyukarip', + 'info_dict': { + 'title': 'Uploaded videos from Infinity_YukariP', + 'id': 'infinityyukarip', + 'uploader': 'Infinity_YukariP', + 'uploader_id': 'infinityyukarip', + }, + 'playlist_mincount': 39, + }, { + 'note': 'no even all videos page. probably less than 10 videos', + 'url': 'https://ecchi.iwara.tv/users/mmd-quintet', + 'info_dict': { + 'title': 'Uploaded videos from mmd quintet', + 'id': 'mmd-quintet', + 'uploader': 'mmd quintet', + 'uploader_id': 'mmd-quintet', + }, + 'playlist_mincount': 6, + }, { + 'note': 'has paging. more than 40 videos', + 'url': 'https://ecchi.iwara.tv/users/theblackbirdcalls', + 'info_dict': { + 'title': 'Uploaded videos from TheBlackbirdCalls', + 'id': 'theblackbirdcalls', + 'uploader': 'TheBlackbirdCalls', + 'uploader_id': 'theblackbirdcalls', + }, + 'playlist_mincount': 420, + }, { + 'note': 'foreign chars in URL. 
there must be foreign characters in URL', + 'url': 'https://ecchi.iwara.tv/users/ぶた丼', + 'info_dict': { + 'title': 'Uploaded videos from ぶた丼', + 'id': 'ぶた丼', + 'uploader': 'ぶた丼', + 'uploader_id': 'ぶた丼', + }, + 'playlist_mincount': 170, + }] + + def _entries(self, playlist_id, base_url): + webpage = self._download_webpage( + f'{base_url}/users/{playlist_id}', playlist_id) + videos_url = self._search_regex(r'<a href="(/users/[^/]+/videos)(?:\?[^"]+)?">', webpage, 'all videos url', default=None) + if not videos_url: + yield from self._extract_playlist(base_url, webpage) + return + + videos_url = urljoin(base_url, videos_url) + + for n in itertools.count(1): + page = self._download_webpage( + videos_url, playlist_id, note=f'Downloading playlist page {n}', + query={'page': str(n - 1)} if n > 1 else {}) + yield from self._extract_playlist( + base_url, page) + + if f'page={n}' not in page: + break + + def _real_extract(self, url): + playlist_id, base_url = self._match_valid_url(url).group('id', 'base_url') + playlist_id = urllib.parse.unquote(playlist_id) + + return self.playlist_result( + self._entries(playlist_id, base_url), playlist_id) diff --git a/hypervideo_dl/extractor/ixigua.py b/hypervideo_dl/extractor/ixigua.py new file mode 100644 index 0000000..1f086d2 --- /dev/null +++ b/hypervideo_dl/extractor/ixigua.py @@ -0,0 +1,83 @@ +import base64 + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + get_element_by_id, + int_or_none, + js_to_json, + str_or_none, + traverse_obj, +) + + +class IxiguaIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)?ixigua\.com/(?:video/)?(?P<id>\d+).+' + _TESTS = [{ + 'url': 'https://www.ixigua.com/6996881461559165471', + 'info_dict': { + 'id': '6996881461559165471', + 'ext': 'mp4', + 'title': '盲目涉水风险大,亲身示范高水位行车注意事项', + 'description': 'md5:8c82f46186299add4a1c455430740229', + 'tags': ['video_car'], + 'like_count': int, + 'dislike_count': int, + 'view_count': int, + 'uploader': '懂车帝原创', + 'uploader_id': '6480145787', + 'thumbnail': r're:^https?://.+\.(avif|webp)', + 'timestamp': 1629088414, + 'duration': 1030, + } + }] + + def _get_json_data(self, webpage, video_id): + js_data = get_element_by_id('SSR_HYDRATED_DATA', webpage) + if not js_data: + if self._cookies_passed: + raise ExtractorError('Failed to get SSR_HYDRATED_DATA') + raise ExtractorError('Cookies (not necessarily logged in) are needed', expected=True) + + return self._parse_json( + js_data.replace('window._SSR_HYDRATED_DATA=', ''), video_id, transform_source=js_to_json) + + def _media_selector(self, json_data): + for path, override in ( + (('video_list', ), {}), + (('dynamic_video', 'dynamic_video_list'), {'acodec': 'none'}), + (('dynamic_video', 'dynamic_audio_list'), {'vcodec': 'none', 'ext': 'm4a'}), + ): + for media in traverse_obj(json_data, (..., *path, lambda _, v: v['main_url'])): + yield { + 'url': base64.b64decode(media['main_url']).decode(), + 'width': int_or_none(media.get('vwidth')), + 'height': int_or_none(media.get('vheight')), + 'fps': int_or_none(media.get('fps')), + 'vcodec': media.get('codec_type'), + 'format_id': str_or_none(media.get('quality_type')), + 'filesize': int_or_none(media.get('size')), + 'ext': 'mp4', + **override, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_data = self._get_json_data(webpage, video_id)['anyVideo']['gidInformation']['packerData']['video'] + + formats = list(self._media_selector(json_data.get('videoResource'))) + return { + 'id': video_id, + 
'title': json_data.get('title'), + 'description': json_data.get('video_abstract'), + 'formats': formats, + 'like_count': json_data.get('video_like_count'), + 'duration': int_or_none(json_data.get('duration')), + 'tags': [json_data.get('tag')], + 'uploader_id': traverse_obj(json_data, ('user_info', 'user_id')), + 'uploader': traverse_obj(json_data, ('user_info', 'name')), + 'view_count': json_data.get('video_watch_count'), + 'dislike_count': json_data.get('video_unlike_count'), + 'timestamp': int_or_none(json_data.get('video_publish_time')), + } diff --git a/hypervideo_dl/extractor/izlesene.py b/hypervideo_dl/extractor/izlesene.py index f8fca6c..5cdf870 100644 --- a/hypervideo_dl/extractor/izlesene.py +++ b/hypervideo_dl/extractor/izlesene.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -81,7 +78,6 @@ class IzleseneIE(InfoExtractor): 'ext': ext, 'height': height, }) - self._sort_formats(formats) description = self._og_search_description(webpage, default=None) thumbnail = video.get('posterURL') or self._proto_relative_url( diff --git a/hypervideo_dl/extractor/jable.py b/hypervideo_dl/extractor/jable.py new file mode 100644 index 0000000..84c3225 --- /dev/null +++ b/hypervideo_dl/extractor/jable.py @@ -0,0 +1,103 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + InAdvancePagedList, + int_or_none, + orderedSet, + unified_strdate, +) + + +class JableIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jable.tv/videos/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://jable.tv/videos/pppd-812/', + 'md5': 'f1537283a9bc073c31ff86ca35d9b2a6', + 'info_dict': { + 'id': 'pppd-812', + 'ext': 'mp4', + 'title': 'PPPD-812 只要表現好巨乳女教師吉根柚莉愛就獎勵學生們在白虎穴內射出精液', + 'description': 'md5:5b6d4199a854f62c5e56e26ccad19967', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + 'like_count': int, + 'view_count': int, + }, + }, { + 'url': 'https://jable.tv/videos/apak-220/', + 'md5': '71f9239d69ced58ab74a816908847cc1', + 'info_dict': { + 'id': 'apak-220', + 'ext': 'mp4', + 'title': 'md5:5c3861b7cf80112a6e2b70bccf170824', + 'description': '', + 'thumbnail': r're:^https?://.*\.jpg$', + 'age_limit': 18, + 'like_count': int, + 'view_count': int, + 'upload_date': '20220319', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + formats = self._extract_m3u8_formats( + self._search_regex(r'var\s+hlsUrl\s*=\s*\'([^\']+)', webpage, 'hls_url'), video_id, 'mp4', m3u8_id='hls') + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=''), + 'thumbnail': self._og_search_thumbnail(webpage, default=None), + 'formats': formats, + 'age_limit': 18, + 'upload_date': unified_strdate(self._search_regex( + r'class="inactive-color">\D+\s+(\d{4}-\d+-\d+)', webpage, 'upload_date', default=None)), + 'view_count': int_or_none(self._search_regex( + r'#icon-eye"></use></svg>\n*<span class="mr-3">([\d ]+)', + webpage, 'view_count', default='').replace(' ', '')), + 'like_count': int_or_none(self._search_regex( + r'#icon-heart"></use></svg><span class="count">(\d+)', webpage, 'link_count', default=None)), + } + + +class JablePlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?jable.tv/(?:categories|models|tags)/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://jable.tv/models/kaede-karen/', + 'info_dict': { + 'id': 'kaede-karen', + 'title': '楓カレン', + }, + 
'playlist_count': 34, + }, { + 'url': 'https://jable.tv/categories/roleplay/', + 'only_matching': True, + }, { + 'url': 'https://jable.tv/tags/girl/', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + def page_func(page_num): + return [ + self.url_result(player_url, JableIE) + for player_url in orderedSet(re.findall( + r'href="(https://jable.tv/videos/[\w-]+/?)"', + self._download_webpage(url, playlist_id, query={ + 'mode': 'async', + 'from': page_num + 1, + 'function': 'get_block', + 'block_id': 'list_videos_common_videos_list', + }, note=f'Downloading page {page_num + 1}')))] + + return self.playlist_result( + InAdvancePagedList(page_func, int_or_none(self._search_regex( + r'from:(\d+)">[^<]+\s*»', webpage, 'last page number', default=1)), 24), + playlist_id, self._search_regex( + r'<h2 class="h3-md mb-1">([^<]+)', webpage, 'playlist title', default=None)) diff --git a/hypervideo_dl/extractor/jamendo.py b/hypervideo_dl/extractor/jamendo.py index 755d970..a2bbba3 100644 --- a/hypervideo_dl/extractor/jamendo.py +++ b/hypervideo_dl/extractor/jamendo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import random @@ -31,10 +28,11 @@ class JamendoIE(InfoExtractor): 'ext': 'flac', # 'title': 'Maya Filipič - Stories from Emona I', 'title': 'Stories from Emona I', - # 'artist': 'Maya Filipič', + 'artist': 'Maya Filipič', + 'album': 'Between two worlds', 'track': 'Stories from Emona I', 'duration': 210, - 'thumbnail': r're:^https?://.*\.jpg', + 'thumbnail': 'https://usercontent.jamendo.com?type=album&id=29279&width=300&trackid=196219', 'timestamp': 1217438117, 'upload_date': '20080730', 'license': 'by-nc-nd', @@ -48,11 +46,11 @@ class JamendoIE(InfoExtractor): 'only_matching': True, }] - def _call_api(self, resource, resource_id): + def _call_api(self, resource, resource_id, fatal=True): path = '/api/%ss' % resource rand = compat_str(random.random()) return self._download_json( - 'https://www.jamendo.com' + path, resource_id, query={ + 'https://www.jamendo.com' + path, resource_id, fatal=fatal, query={ 'id[]': resource_id, }, headers={ 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand) @@ -74,6 +72,8 @@ class JamendoIE(InfoExtractor): # if artist_name: # title = '%s - %s' % (artist_name, title) # album = get_model('album') + artist = self._call_api("artist", track.get('artistId'), fatal=False) + album = self._call_api("album", track.get('albumId'), fatal=False) formats = [{ 'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294' @@ -87,7 +87,6 @@ class JamendoIE(InfoExtractor): ('ogg1', 'ogg', 'ogg'), ('flac', 'flac', 'flac'), ))] - self._sort_formats(formats) urls = [] thumbnails = [] @@ -121,9 +120,9 @@ class JamendoIE(InfoExtractor): 'title': title, 'description': track.get('description'), 'duration': int_or_none(track.get('duration')), - # 'artist': artist_name, + 'artist': artist.get('name'), 'track': track_name, - # 'album': album.get('name'), + 'album': album.get('name'), 'formats': formats, 'license': '-'.join(license) if license else None, 'timestamp': int_or_none(track.get('dateCreated')), @@ -134,7 +133,7 @@ class JamendoIE(InfoExtractor): } -class JamendoAlbumIE(JamendoIE): +class JamendoAlbumIE(JamendoIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://www.jamendo.com/album/121486/duck-on-cover', @@ 
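As context for the jamendo hunk being amended here: _call_api signs every request with an X-Jam-Call header derived from the API path. Restated standalone below; the scheme is read directly off the code above, while the function name and example path are illustrative only:

import hashlib
import random


def jam_call_header(path):
    # '$' + sha1(path + rand) + '*' + rand + '~' -- rand is sent along in the
    # clear, presumably so the server can recompute and compare the digest
    rand = str(random.random())
    digest = hashlib.sha1((path + rand).encode()).hexdigest()
    return {'X-Jam-Call': '$%s*%s~' % (digest, rand)}


# e.g. jam_call_header('/api/tracks') -> {'X-Jam-Call': '$<40 hex digits>*0.8444218515250481~'}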
-148,22 +147,38 @@ class JamendoAlbumIE(JamendoIE): 'info_dict': { 'id': '1032333', 'ext': 'flac', - 'title': 'Shearer - Warmachine', + 'title': 'Warmachine', 'artist': 'Shearer', 'track': 'Warmachine', 'timestamp': 1368089771, 'upload_date': '20130509', + 'view_count': int, + 'thumbnail': 'https://usercontent.jamendo.com?type=album&id=121486&width=300&trackid=1032333', + 'duration': 190, + 'license': 'by', + 'album': 'Duck On Cover', + 'average_rating': 4, + 'tags': ['rock', 'drums', 'bass', 'world', 'punk', 'neutral'], + 'like_count': int, } }, { 'md5': '1f358d7b2f98edfe90fd55dac0799d50', 'info_dict': { 'id': '1032330', 'ext': 'flac', - 'title': 'Shearer - Without Your Ghost', + 'title': 'Without Your Ghost', 'artist': 'Shearer', 'track': 'Without Your Ghost', 'timestamp': 1368089771, 'upload_date': '20130509', + 'duration': 192, + 'tags': ['rock', 'drums', 'bass', 'world', 'punk'], + 'album': 'Duck On Cover', + 'thumbnail': 'https://usercontent.jamendo.com?type=album&id=121486&width=300&trackid=1032330', + 'view_count': int, + 'average_rating': 4, + 'license': 'by', + 'like_count': int, } }], 'params': { diff --git a/hypervideo_dl/extractor/japandiet.py b/hypervideo_dl/extractor/japandiet.py new file mode 100644 index 0000000..6c65056 --- /dev/null +++ b/hypervideo_dl/extractor/japandiet.py @@ -0,0 +1,274 @@ +import re + +from ..utils import ( + ExtractorError, + clean_html, + int_or_none, + join_nonempty, + parse_qs, + smuggle_url, + traverse_obj, + try_call, + unsmuggle_url +) +from .common import InfoExtractor + + +def _parse_japanese_date(text): + if not text: + return None + ERA_TABLE = { + '明治': 1868, + '大正': 1912, + '昭和': 1926, + '平成': 1989, + '令和': 2019, + } + ERA_RE = '|'.join(map(re.escape, ERA_TABLE.keys())) + mobj = re.search(rf'({ERA_RE})?(\d+)年(\d+)月(\d+)日', re.sub(r'[\s\u3000]+', '', text)) + if not mobj: + return None + era, year, month, day = mobj.groups() + year, month, day = map(int, (year, month, day)) + if era: + # example input: 令和5年3月34日 + # even though each era have their end, don't check here + year += ERA_TABLE[era] + return '%04d%02d%02d' % (year, month, day) + + +def _parse_japanese_duration(text): + mobj = re.search(r'(?:(\d+)日間?)?(?:(\d+)時間?)?(?:(\d+)分)?(?:(\d+)秒)?', re.sub(r'[\s\u3000]+', '', text or '')) + if not mobj: + return + days, hours, mins, secs = [int_or_none(x, default=0) for x in mobj.groups()] + return secs + mins * 60 + hours * 60 * 60 + days * 24 * 60 * 60 + + +class ShugiinItvBaseIE(InfoExtractor): + _INDEX_ROOMS = None + + @classmethod + def _find_rooms(cls, webpage): + return [{ + '_type': 'url', + 'id': x.group(1), + 'title': clean_html(x.group(2)).strip(), + 'url': smuggle_url(f'https://www.shugiintv.go.jp/jp/index.php?room_id={x.group(1)}', {'g': x.groups()}), + 'ie_key': ShugiinItvLiveIE.ie_key(), + } for x in re.finditer(r'(?s)<a\s+href="[^"]+\?room_id=(room\d+)"\s*class="play_live".+?class="s12_14">(.+?)</td>', webpage)] + + def _fetch_rooms(self): + if not self._INDEX_ROOMS: + webpage = self._download_webpage( + 'https://www.shugiintv.go.jp/jp/index.php', None, + encoding='euc-jp', note='Downloading proceedings info') + ShugiinItvBaseIE._INDEX_ROOMS = self._find_rooms(webpage) + return self._INDEX_ROOMS + + +class ShugiinItvLiveIE(ShugiinItvBaseIE): + _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)(?:/index\.php)?$' + IE_DESC = '衆議院インターネット審議中継' + + _TESTS = [{ + 'url': 'https://www.shugiintv.go.jp/jp/index.php', + 'info_dict': { + '_type': 'playlist', + 'title': 'All proceedings for today', + }, + # expect at least 
one proceeding to be running
+        'playlist_mincount': 1,
+    }]
+
+    @classmethod
+    def suitable(cls, url):
+        return super().suitable(url) and not any(x.suitable(url) for x in (ShugiinItvLiveRoomIE, ShugiinItvVodIE))
+
+    def _real_extract(self, url):
+        self.to_screen(
+            'Downloading all running proceedings. To specify one proceeding, use a direct link from the website')
+        return self.playlist_result(self._fetch_rooms(), playlist_title='All proceedings for today')
+
+
+class ShugiinItvLiveRoomIE(ShugiinItvBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?room_id=(?P<id>room\d+)'
+    IE_DESC = '衆議院インターネット審議中継 (中継)'
+
+    _TESTS = [{
+        'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room01',
+        'info_dict': {
+            'id': 'room01',
+            'title': '内閣委員会',
+        },
+        'skip': 'this only runs at certain times and not every day',
+    }, {
+        'url': 'https://www.shugiintv.go.jp/jp/index.php?room_id=room11',
+        'info_dict': {
+            'id': 'room11',
+            'title': '外務委員会',
+        },
+        'skip': 'this only runs at certain times and not every day',
+    }]
+
+    def _real_extract(self, url):
+        url, smug = unsmuggle_url(url, default={})
+        if smug.get('g'):
+            room_id, title = smug['g']
+        else:
+            room_id = self._match_id(url)
+            title = traverse_obj(self._fetch_rooms(), (lambda k, v: v['id'] == room_id, 'title'), get_all=False)
+
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            f'https://hlslive.shugiintv.go.jp/{room_id}/amlst:{room_id}/playlist.m3u8',
+            room_id, ext='mp4')
+
+        return {
+            'id': room_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+            'is_live': True,
+        }
+
+
+class ShugiinItvVodIE(ShugiinItvBaseIE):
+    _VALID_URL = r'https?://(?:www\.)?shugiintv\.go\.jp/(?:jp|en)/index\.php\?ex=VL(?:\&[^=]+=[^&]*)*\&deli_id=(?P<id>\d+)'
+    IE_DESC = '衆議院インターネット審議中継 (ビデオライブラリ)'
+    _TESTS = [{
+        'url': 'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id=53846',
+        'info_dict': {
+            'id': '53846',
+            'title': 'ウクライナ大統領国会演説(オンライン)',
+            'release_date': '20220323',
+            'chapters': 'count:4',
+        }
+    }, {
+        'url': 'https://www.shugiintv.go.jp/en/index.php?ex=VL&media_type=&deli_id=53846',
+        'only_matching': True
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(
+            f'https://www.shugiintv.go.jp/jp/index.php?ex=VL&media_type=&deli_id={video_id}', video_id,
+            encoding='euc-jp')
+
+        m3u8_url = self._search_regex(
+            r'id="vtag_src_base_vod"\s*value="(http.+?\.m3u8)"', webpage, 'm3u8 url')
+        m3u8_url = re.sub(r'^http://', 'https://', m3u8_url)
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+            m3u8_url, video_id, ext='mp4')
+
+        title = self._html_search_regex(
+            (r'<td\s+align="left">(.+)\s*\(\d+分\)',
+             r'<TD.+?<IMG\s*src=".+?/spacer\.gif".+?height="15">(.+?)<IMG'), webpage, 'title', fatal=False)
+
+        release_date = _parse_japanese_date(self._html_search_regex(
+            r'開会日</td>\s*<td.+?/td>\s*<TD>(.+?)</TD>',
+            webpage, 'release date', fatal=False))
+
+        chapters = []
+        for chp in re.finditer(r'(?i)<A\s+HREF="([^"]+?)"\s*class="play_vod">(?!<img)(.+)</[Aa]>', webpage):
+            chapters.append({
+                'title': clean_html(chp.group(2)).strip(),
+                'start_time': try_call(lambda: float(parse_qs(chp.group(1))['time'][0].strip())),
+            })
+        # NOTE: there are blank segments at the beginning and the end of the videos,
+        # so the video duration cannot be provided reliably
+        # also, the exact end_time for the last chapter is unknown (at best we get minute granularity)
+        last_tr = re.findall(r'(?s)<TR\s*class="s14_24">(.+?)</TR>', webpage)[-1]
+        if last_tr and 
chapters:
+            last_td = re.findall(r'<TD.+?</TD>', last_tr)[-1]
+            if last_td:
+                chapters[-1]['end_time'] = chapters[-1]['start_time'] + _parse_japanese_duration(clean_html(last_td))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'release_date': release_date,
+            'chapters': chapters,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
+
+
+class SangiinInstructionIE(InfoExtractor):
+    _VALID_URL = r'^https?://www\.webtv\.sangiin\.go\.jp/webtv/index\.php'
+    IE_DESC = False  # this shouldn't be listed as a supported site
+
+    def _real_extract(self, url):
+        raise ExtractorError('Copy the link from the button below the video description or player, and use the link to download. If there is no button in the frame, get the URL of the frame showing the video.', expected=True)
+
+
+class SangiinIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.webtv\.sangiin\.go\.jp/webtv/detail\.php\?sid=(?P<id>\d+)'
+    IE_DESC = '参議院インターネット審議中継 (archive)'
+
+    _TESTS = [{
+        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7052',
+        'info_dict': {
+            'id': '7052',
+            'title': '2022年10月7日 本会議',
+            'description': 'md5:0a5fed523f95c88105a0b0bf1dd71489',
+            'upload_date': '20221007',
+            'ext': 'mp4',
+        },
+    }, {
+        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7037',
+        'info_dict': {
+            'id': '7037',
+            'title': '2022年10月3日 開会式',
+            'upload_date': '20221003',
+            'ext': 'mp4',
+        },
+    }, {
+        'url': 'https://www.webtv.sangiin.go.jp/webtv/detail.php?sid=7076',
+        'info_dict': {
+            'id': '7076',
+            'title': '2022年10月27日 法務委員会',
+            'upload_date': '20221027',
+            'ext': 'mp4',
+            'is_live': True,
+        },
+        'skip': 'this live is turned into an archive after it ends',
+    }, ]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        date = self._html_search_regex(
+            r'<dt[^>]*>\s*開会日\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
+            'date', fatal=False)
+        upload_date = _parse_japanese_date(date)
+
+        title = self._html_search_regex(
+            r'<dt[^>]*>\s*会議名\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
+            'title', fatal=False)
+
+        # some videos don't have this element, so assume it's missing
+        description = self._html_search_regex(
+            r'会議の経過\s*</h3>\s*<span[^>]*>(.+?)</span>', webpage,
+            'description', default=None)
+
+        # this row appears only when it's a livestream
+        is_live = bool(self._html_search_regex(
+            r'<dt[^>]*>\s*公報掲載時刻\s*</dt>\s*<dd[^>]*>\s*(.+?)\s*</dd>', webpage,
+            'is_live', default=None))
+
+        m3u8_url = self._search_regex(
+            r'var\s+videopath\s*=\s*(["\'])([^"\']+)\1', webpage,
+            'm3u8 url', group=2)
+
+        formats, subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4')
+
+        return {
+            'id': video_id,
+            'title': join_nonempty(date, title, delim=' '),
+            'description': description,
+            'upload_date': upload_date,
+            'formats': formats,
+            'subtitles': subs,
+            'is_live': is_live,
+        }
diff --git a/hypervideo_dl/extractor/jeuxvideo.py b/hypervideo_dl/extractor/jeuxvideo.py
index 77c0f52..56ea15c 100644
--- a/hypervideo_dl/extractor/jeuxvideo.py
+++ b/hypervideo_dl/extractor/jeuxvideo.py
@@ -1,8 +1,3 @@
-# coding: utf-8
-
-from __future__ import unicode_literals
-
-
 from .common import InfoExtractor
diff --git a/hypervideo_dl/extractor/jixie.py b/hypervideo_dl/extractor/jixie.py
new file mode 100644
index 0000000..4830e61
--- /dev/null
+++ b/hypervideo_dl/extractor/jixie.py
@@ -0,0 +1,47 @@
+from .common import InfoExtractor
+from ..utils import clean_html, float_or_none, traverse_obj, try_call
+
+
+class JixieBaseIE(InfoExtractor):
+    """
+    API Reference: 
+ https://jixie.atlassian.net/servicedesk/customer/portal/2/article/1339654214?src=-1456335525, + https://scripts.jixie.media/jxvideo.3.1.min.js + """ + + def _extract_data_from_jixie_id(self, display_id, video_id, webpage): + json_data = self._download_json( + 'https://apidam.jixie.io/api/public/stream', display_id, + query={'metadata': 'full', 'video_id': video_id})['data'] + + formats, subtitles = [], {} + for stream in json_data['streams']: + if stream.get('type') == 'HLS': + fmt, sub = self._extract_m3u8_formats_and_subtitles(stream.get('url'), display_id, ext='mp4') + if json_data.get('drm'): + for f in fmt: + f['has_drm'] = True + formats.extend(fmt) + self._merge_subtitles(sub, target=subtitles) + else: + formats.append({ + 'url': stream.get('url'), + 'width': stream.get('width'), + 'height': stream.get('height'), + 'ext': 'mp4', + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': json_data.get('title') or self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'description': (clean_html(traverse_obj(json_data, ('metadata', 'description'))) + or self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage)), + 'thumbnails': traverse_obj(json_data, ('metadata', 'thumbnails')), + 'duration': float_or_none(traverse_obj(json_data, ('metadata', 'duration'))), + 'tags': try_call(lambda: (json_data['metadata']['keywords'] or None).split(',')), + 'categories': try_call(lambda: (json_data['metadata']['categories'] or None).split(',')), + 'uploader_id': json_data.get('owner_id'), + } diff --git a/hypervideo_dl/extractor/joj.py b/hypervideo_dl/extractor/joj.py index 7350f53..9b62284 100644 --- a/hypervideo_dl/extractor/joj.py +++ b/hypervideo_dl/extractor/joj.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -21,6 +16,7 @@ class JojIE(InfoExtractor): ) (?P<id>[^/?#^]+) ''' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1'] _TESTS = [{ 'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932', 'info_dict': { @@ -41,14 +37,6 @@ class JojIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -73,7 +61,7 @@ class JojIE(InfoExtractor): r'(\d+)[pP]\.', format_url, 'height', default=None) formats.append({ 'url': format_url, - 'format_id': format_field(height, template='%sp'), + 'format_id': format_field(height, None, '%sp'), 'height': int(height), }) if not formats: @@ -93,7 +81,6 @@ class JojIE(InfoExtractor): r'(\d+)[pP]', format_id or path, 'height', default=None)), }) - self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) diff --git a/hypervideo_dl/extractor/jove.py b/hypervideo_dl/extractor/jove.py index 4b7dfc5..245fe73 100644 --- a/hypervideo_dl/extractor/jove.py +++ b/hypervideo_dl/extractor/jove.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/jwplatform.py b/hypervideo_dl/extractor/jwplatform.py index 5aa508b..c949689 100644 --- a/hypervideo_dl/extractor/jwplatform.py +++ 
b/hypervideo_dl/extractor/jwplatform.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -8,7 +5,7 @@ from ..utils import unsmuggle_url class JWPlatformIE(InfoExtractor): - _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' + _VALID_URL = r'(?:https?://(?:content\.jwplatform|cdn\.jwplayer)\.com/(?:(?:feed|player|thumb|preview|manifest)s|jw6|v2/media)/|jwplatform:)(?P<id>[a-zA-Z0-9]{8})' _TESTS = [{ 'url': 'http://content.jwplatform.com/players/nPripu9l-ALJ3XQCI.js', 'md5': 'fa8899fa601eb7c83a64e9d568bdf325', @@ -25,21 +22,48 @@ class JWPlatformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - urls = JWPlatformIE._extract_urls(webpage) - return urls[0] if urls else None + _WEBPAGE_TESTS = [{ + # JWPlatform iframe + 'url': 'https://www.covermagazine.co.uk/feature/2465255/business-protection-involved', + 'info_dict': { + 'id': 'AG26UQXM', + 'ext': 'mp4', + 'upload_date': '20160719', + 'timestamp': 1468923808, + 'title': '2016_05_18 Cover L&G Business Protection V1 FINAL.mp4', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/AG26UQXM/poster.jpg?width=720', + 'description': '', + 'duration': 294.0, + }, + }, { + # Player URL not surrounded by quotes + 'url': 'https://www.deutsche-kinemathek.de/en/online/streaming/darling-berlin', + 'info_dict': { + 'id': 'R10NQdhY', + 'title': 'Playgirl', + 'ext': 'mp4', + 'upload_date': '20220624', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/R10NQdhY/poster.jpg?width=720', + 'timestamp': 1656064800, + 'description': 'BRD 1966, Will Tremper', + 'duration': 5146.0, + }, + 'params': {'allowed_extractors': ['generic', 'jwplatform']}, + }] - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): for tag, key in ((r'(?:script|iframe)', 'src'), ('input', 'value')): # <input value=URL> is used by hyland.com # if we find <iframe>, don't look for <input> ret = re.findall( - r'<%s[^>]+?%s=["\']((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key), + r'<%s[^>]+?%s=["\']?((?:https?:)?//(?:content\.jwplatform|cdn\.jwplayer)\.com/players/[a-zA-Z0-9]{8})' % (tag, key), webpage) if ret: return ret + mobj = re.search(r'<div\b[^>]* data-video-jw-id="([a-zA-Z0-9]{8})"', webpage) + if mobj: + return [f'jwplatform:{mobj.group(1)}'] def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) diff --git a/hypervideo_dl/extractor/kakao.py b/hypervideo_dl/extractor/kakao.py index 483ab71..1f0f0a5 100644 --- a/hypervideo_dl/extractor/kakao.py +++ b/hypervideo_dl/extractor/kakao.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -109,6 +105,7 @@ class KakaoIE(InfoExtractor): resp = self._parse_json(e.cause.read().decode(), video_id) if resp.get('code') == 'GeoBlocked': self.raise_geo_restricted() + raise fmt_url = traverse_obj(fmt_url_json, ('videoLocation', 'url')) if not fmt_url: @@ -123,7 +120,6 @@ class KakaoIE(InfoExtractor): 'filesize': int_or_none(fmt.get('filesize')), 'tbr': int_or_none(fmt.get('kbps')), }) - self._sort_formats(formats) thumbs = [] for thumb in clip.get('clipChapterThumbnailList') or []: diff --git a/hypervideo_dl/extractor/kaltura.py b/hypervideo_dl/extractor/kaltura.py index f6dfc9c..95e2dee 100--- 
a/hypervideo_dl/extractor/kaltura.py +++ b/hypervideo_dl/extractor/kaltura.py @@ -1,8 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re import base64 +import json +import re from .common import InfoExtractor from ..compat import ( @@ -16,13 +14,15 @@ from ..utils import ( int_or_none, unsmuggle_url, smuggle_url, + traverse_obj, + remove_start ) class KalturaIE(InfoExtractor): _VALID_URL = r'''(?x) (?: - kaltura:(?P<partner_id>\d+):(?P<id>[0-9a-z_]+)| + kaltura:(?P<partner_id>\w+):(?P<id>\w+)(?::(?P<player_type>\w+))?| https?:// (:?(?:www|cdnapi(?:sec)?)\.)?kaltura\.com(?::\d+)?/ (?: @@ -36,7 +36,7 @@ class KalturaIE(InfoExtractor): ) ''' _SERVICE_URL = 'http://cdnapi.kaltura.com' - _SERVICE_BASE = '/api_v3/index.php' + _SERVICE_BASE = '/api_v3/service/multirequest' # See https://github.com/kaltura/server/blob/master/plugins/content/caption/base/lib/model/enums/CaptionType.php _CAPTION_TYPES = { 1: 'srt', @@ -57,6 +57,7 @@ class KalturaIE(InfoExtractor): 'thumbnail': 're:^https?://.*/thumbnail/.*', 'timestamp': int, }, + 'skip': 'The access to this service is forbidden since the specified partner is blocked' }, { 'url': 'http://www.kaltura.com/index.php/kwidget/cache_st/1300318621/wid/_269692/uiconf_id/3873291/entry_id/1_1jc2y3e4', @@ -109,16 +110,85 @@ class KalturaIE(InfoExtractor): # unavailable source format 'url': 'kaltura:513551:1_66x4rg7o', 'only_matching': True, + }, + { + # html5lib URL using kwidget player + 'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.46/mwEmbedFrame.php/p/691292/uiconf_id/20499062/entry_id/0_c076mna6?wid=_691292&iframeembed=true&playerId=kaltura_player_1420508608&entry_id=0_c076mna6&flashvars%5BakamaiHD.loadingPolicy%5D=preInitialize&flashvars%5BakamaiHD.asyncInit%5D=true&flashvars%5BstreamerType%5D=hdnetwork', + 'info_dict': { + 'id': '0_c076mna6', + 'ext': 'mp4', + 'title': 'md5:4883e7acbcbf42583a2dddc97dee4855', + 'duration': 3608, + 'uploader_id': 'commons@swinburne.edu.au', + 'timestamp': 1408086874, + 'view_count': int, + 'upload_date': '20140815', + 'thumbnail': 'http://cfvod.kaltura.com/p/691292/sp/69129200/thumbnail/entry_id/0_c076mna6/version/100022', + } + }, + { + # html5lib playlist URL using kwidget player + 'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.89/mwEmbedFrame.php/p/2019031/uiconf_id/40436601?wid=1_4j3m32cv&iframeembed=true&playerId=kaltura_player_&flashvars[playlistAPI.kpl0Id]=1_jovey5nu&flashvars[ks]=&&flashvars[imageDefaultDuration]=30&flashvars[localizationCode]=en&flashvars[leadWithHTML5]=true&flashvars[forceMobileHTML5]=true&flashvars[nextPrevBtn.plugin]=true&flashvars[hotspots.plugin]=true&flashvars[sideBarContainer.plugin]=true&flashvars[sideBarContainer.position]=left&flashvars[sideBarContainer.clickToClose]=true&flashvars[chapters.plugin]=true&flashvars[chapters.layout]=vertical&flashvars[chapters.thumbnailRotator]=false&flashvars[streamSelector.plugin]=true&flashvars[EmbedPlayer.SpinnerTarget]=videoHolder&flashvars[dualScreen.plugin]=true&flashvars[playlistAPI.playlistUrl]=https://canvasgatechtest.kaf.kaltura.com/playlist/details/{playlistAPI.kpl0Id}/categoryid/126428551', + 'info_dict': { + 'id': '1_jovey5nu', + 'title': '00-00 Introduction' + }, + 'playlist': [ + { + 'info_dict': { + 'id': '1_b1y5hlvx', + 'ext': 'mp4', + 'title': 'CS7646_00-00 Introductio_Introduction', + 'duration': 91, + 'thumbnail': 'http://cfvod.kaltura.com/p/2019031/sp/201903100/thumbnail/entry_id/1_b1y5hlvx/version/100001', + 'view_count': int, + 'timestamp': 1533154447, + 'upload_date': '20180801', + 
'uploader_id': 'djoyner3', + } + }, { + 'info_dict': { + 'id': '1_jfb7mdpn', + 'ext': 'mp4', + 'title': 'CS7646_00-00 Introductio_Three parts to the course', + 'duration': 63, + 'thumbnail': 'http://cfvod.kaltura.com/p/2019031/sp/201903100/thumbnail/entry_id/1_jfb7mdpn/version/100001', + 'view_count': int, + 'timestamp': 1533154489, + 'upload_date': '20180801', + 'uploader_id': 'djoyner3', + } + }, { + 'info_dict': { + 'id': '1_8xflxdp7', + 'ext': 'mp4', + 'title': 'CS7646_00-00 Introductio_Textbooks', + 'duration': 37, + 'thumbnail': 'http://cfvod.kaltura.com/p/2019031/sp/201903100/thumbnail/entry_id/1_8xflxdp7/version/100001', + 'view_count': int, + 'timestamp': 1533154512, + 'upload_date': '20180801', + 'uploader_id': 'djoyner3', + } + }, { + 'info_dict': { + 'id': '1_3hqew8kn', + 'ext': 'mp4', + 'title': 'CS7646_00-00 Introductio_Prerequisites', + 'duration': 49, + 'thumbnail': 'http://cfvod.kaltura.com/p/2019031/sp/201903100/thumbnail/entry_id/1_3hqew8kn/version/100001', + 'view_count': int, + 'timestamp': 1533154536, + 'upload_date': '20180801', + 'uploader_id': 'djoyner3', + } + } + ] } ] - @staticmethod - def _extract_url(webpage): - urls = KalturaIE._extract_urls(webpage) - return urls[0] if urls else None - - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site finditer = ( list(re.finditer( @@ -160,63 +230,80 @@ class KalturaIE(InfoExtractor): for k, v in embed_info.items(): if v: embed_info[k] = v.strip() - url = 'kaltura:%(partner_id)s:%(id)s' % embed_info + embed_url = 'kaltura:%(partner_id)s:%(id)s' % embed_info escaped_pid = re.escape(embed_info['partner_id']) service_mobj = re.search( r'<script[^>]+src=(["\'])(?P<id>(?:https?:)?//(?:(?!\1).)+)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), webpage) if service_mobj: - url = smuggle_url(url, {'service_url': service_mobj.group('id')}) - urls.append(url) + embed_url = smuggle_url(embed_url, {'service_url': service_mobj.group('id')}) + urls.append(embed_url) return urls def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): params = actions[0] - if len(actions) > 1: - for i, a in enumerate(actions[1:], start=1): - for k, v in a.items(): - params['%d:%s' % (i, k)] = v + params.update({i: a for i, a in enumerate(actions[1:], start=1)}) data = self._download_json( (service_url or self._SERVICE_URL) + self._SERVICE_BASE, - video_id, query=params, *args, **kwargs) + video_id, data=json.dumps(params).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + 'Accept-Encoding': 'gzip, deflate, br', + }, *args, **kwargs) - status = data if len(actions) == 1 else data[0] - if status.get('objectType') == 'KalturaAPIException': - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, status['message'])) + for idx, status in enumerate(data): + if not isinstance(status, dict): + continue + if status.get('objectType') == 'KalturaAPIException': + raise ExtractorError( + '%s said: %s (%d)' % (self.IE_NAME, status['message'], idx)) + + data[1] = traverse_obj(data, (1, 'objects', 0)) return data - def _get_video_info(self, video_id, partner_id, service_url=None): + def _get_video_info(self, video_id, partner_id, service_url=None, player_type='html5'): + assert player_type in ('html5', 'kwidget') + if player_type == 'kwidget': + return self._get_video_info_kwidget(video_id, partner_id, service_url) + + return self._get_video_info_html5(video_id, partner_id, 
service_url) + + def _get_video_info_html5(self, video_id, partner_id, service_url=None): actions = [ { - 'action': 'null', - 'apiVersion': '3.1.5', - 'clientTag': 'kdp:v3.8.5', + 'apiVersion': '3.3.0', + 'clientTag': 'html5:v3.1.0', 'format': 1, # JSON, 2 = XML, 3 = PHP - 'service': 'multirequest', + 'ks': '', + 'partnerId': partner_id, }, { 'expiry': 86400, 'service': 'session', 'action': 'startWidgetSession', - 'widgetId': '_%s' % partner_id, + 'widgetId': self._build_widget_id(partner_id), }, + # info { - 'action': 'get', - 'entryId': video_id, + 'action': 'list', + 'filter': {'redirectFromEntryId': video_id}, 'service': 'baseentry', 'ks': '{1:result:ks}', - 'responseProfile:fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId', - 'responseProfile:type': 1, + 'responseProfile': { + 'type': 1, + 'fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId', + }, }, + # flavor_assets { 'action': 'getbyentryid', 'entryId': video_id, 'service': 'flavorAsset', 'ks': '{1:result:ks}', }, + # captions { 'action': 'list', 'filter:entryIdEqual': video_id, @@ -225,17 +312,85 @@ class KalturaIE(InfoExtractor): }, ] return self._kaltura_api_call( - video_id, actions, service_url, note='Downloading video info JSON') + video_id, actions, service_url, note='Downloading video info JSON (Kaltura html5 player)') + + def _get_video_info_kwidget(self, video_id, partner_id, service_url=None): + actions = [ + { + 'service': 'multirequest', + 'apiVersion': '3.1', + 'expiry': 86400, + 'clientTag': 'kwidget:v2.89', + 'format': 1, # JSON, 2 = XML, 3 = PHP + 'ignoreNull': 1, + 'action': 'null', + }, + # header + { + 'expiry': 86400, + 'service': 'session', + 'action': 'startWidgetSession', + 'widgetId': self._build_widget_id(partner_id), + }, + # (empty) + { + 'expiry': 86400, + 'service': 'session', + 'action': 'startwidgetsession', + 'widgetId': self._build_widget_id(partner_id), + 'format': 9, + 'apiVersion': '3.1', + 'clientTag': 'kwidget:v2.89', + 'ignoreNull': 1, + 'ks': '{1:result:ks}' + }, + # info + { + 'action': 'list', + 'filter': {'redirectFromEntryId': video_id}, + 'service': 'baseentry', + 'ks': '{1:result:ks}', + 'responseProfile': { + 'type': 1, + 'fields': 'createdAt,dataUrl,duration,name,plays,thumbnailUrl,userId', + }, + }, + # flavor_assets + { + 'action': 'getbyentryid', + 'entryId': video_id, + 'service': 'flavorAsset', + 'ks': '{1:result:ks}', + }, + # captions + { + 'action': 'list', + 'filter:entryIdEqual': video_id, + 'service': 'caption_captionasset', + 'ks': '{1:result:ks}', + }, + ] + # second object (representing the second start widget session) is None + header, _, _info, flavor_assets, captions = self._kaltura_api_call( + video_id, actions, service_url, note='Downloading video info JSON (Kaltura kwidget player)') + info = _info['objects'][0] + return header, info, flavor_assets, captions + + def _build_widget_id(self, partner_id): + return partner_id if '_' in partner_id else f'_{partner_id}' + + IFRAME_PACKAGE_DATA_REGEX = r'window\.kalturaIframePackageData\s*=' def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) mobj = self._match_valid_url(url) - partner_id, entry_id = mobj.group('partner_id', 'id') - ks = None - captions = None + partner_id, entry_id, player_type = mobj.group('partner_id', 'id', 'player_type') + ks, captions = None, None + if not player_type: + player_type = 'kwidget' if 'html5lib/v2' in url else 'html5' if partner_id and entry_id: - _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, 
smuggled_data.get('service_url')) + _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url'), player_type=player_type) else: path, query = mobj.group('path', 'query') if not path and not query: @@ -247,7 +402,7 @@ class KalturaIE(InfoExtractor): splitted_path = path.split('/') params.update(dict((zip(splitted_path[::2], [[v] for v in splitted_path[1::2]])))) if 'wid' in params: - partner_id = params['wid'][0][1:] + partner_id = remove_start(params['wid'][0], '_') elif 'p' in params: partner_id = params['p'][0] elif 'partner_id' in params: @@ -256,14 +411,13 @@ class KalturaIE(InfoExtractor): raise ExtractorError('Invalid URL', expected=True) if 'entry_id' in params: entry_id = params['entry_id'][0] - _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id) + _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, player_type=player_type) elif 'uiconf_id' in params and 'flashvars[referenceId]' in params: reference_id = params['flashvars[referenceId]'][0] webpage = self._download_webpage(url, reference_id) - entry_data = self._parse_json(self._search_regex( - r'window\.kalturaIframePackageData\s*=\s*({.*});', - webpage, 'kalturaIframePackageData'), - reference_id)['entryResult'] + entry_data = self._search_json( + self.IFRAME_PACKAGE_DATA_REGEX, webpage, + 'kalturaIframePackageData', reference_id)['entryResult'] info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] entry_id = info['id'] # Unfortunately, data returned in kalturaIframePackageData lacks @@ -271,16 +425,29 @@ class KalturaIE(InfoExtractor): # regular approach since we now know the entry_id try: _, info, flavor_assets, captions = self._get_video_info( - entry_id, partner_id) + entry_id, partner_id, player_type=player_type) except ExtractorError: # Regular scenario failed but we already have everything # extracted apart from captions and can process at least # with this pass + elif 'uiconf_id' in params and 'flashvars[playlistAPI.kpl0Id]' in params: + playlist_id = params['flashvars[playlistAPI.kpl0Id]'][0] + webpage = self._download_webpage(url, playlist_id) + playlist_data = self._search_json( + self.IFRAME_PACKAGE_DATA_REGEX, webpage, + 'kalturaIframePackageData', playlist_id)['playlistResult'] + return self.playlist_from_matches( + traverse_obj(playlist_data, (playlist_id, 'items', ..., 'id')), + playlist_id, traverse_obj(playlist_data, (playlist_id, 'name')), + ie=KalturaIE, getter=lambda x: f'kaltura:{partner_id}:{x}:{player_type}') else: raise ExtractorError('Invalid URL', expected=True) ks = params.get('flashvars[ks]', [None])[0] + return self._per_video_extract(smuggled_data, entry_id, info, ks, flavor_assets, captions) + + def _per_video_extract(self, smuggled_data, entry_id, info, ks, flavor_assets, captions): source_url = smuggled_data.get('source_url') if source_url: referrer = base64.b64encode( @@ -351,8 +518,6 @@ class KalturaIE(InfoExtractor): formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) - self._sort_formats(formats) - if captions: for caption in captions.get('objects', []): # Continue if caption is not ready @@ -376,5 +541,5 @@ class KalturaIE(InfoExtractor): 'duration': info.get('duration'), 'timestamp': info.get('createdAt'), 'uploader_id': format_field(info, 'userId', ignore=('None', None)), - 'view_count': info.get('plays'), + 'view_count': int_or_none(info.get('plays')), } diff --git a/hypervideo_dl/extractor/kanal2.py b/hypervideo_dl/extractor/kanal2.py new file 
mode 100644 index 0000000..3c0efe5 --- /dev/null +++ b/hypervideo_dl/extractor/kanal2.py @@ -0,0 +1,66 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + join_nonempty, + traverse_obj, + unified_timestamp, + update_url_query, +) + + +class Kanal2IE(InfoExtractor): + _VALID_URL = r'https?://kanal2\.postimees\.ee/[^?#]+\?([^#]+&)?id=(?P<id>\d+)' + _TESTS = [{ + 'note': 'Test standard url (#5575)', + 'url': 'https://kanal2.postimees.ee/pluss/video/?id=40792', + 'md5': '7ea7b16266ec1798743777df241883dd', + 'info_dict': { + 'id': '40792', + 'ext': 'mp4', + 'title': 'Aedniku aabits / Osa 53 (05.08.2016 20:00)', + 'thumbnail': r're:https?://.*\.jpg$', + 'description': 'md5:53cabf3c5d73150d594747f727431248', + 'upload_date': '20160805', + 'timestamp': 1470420000, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + playlist = self._download_json( + f'https://kanal2.postimees.ee/player/playlist/{video_id}', + video_id, query={'type': 'episodes'}, + headers={'X-Requested-With': 'XMLHttpRequest'}) + + return { + 'id': video_id, + 'title': join_nonempty(*traverse_obj(playlist, ('info', ('title', 'subtitle'))), delim=' / '), + 'description': traverse_obj(playlist, ('info', 'description')), + 'thumbnail': traverse_obj(playlist, ('data', 'image')), + 'formats': self.get_formats(playlist, video_id), + 'timestamp': unified_timestamp(self._search_regex( + r'\((\d{2}\.\d{2}\.\d{4}\s\d{2}:\d{2})\)$', + traverse_obj(playlist, ('info', 'subtitle')), 'timestamp', default='') + ' +0200'), + } + + def get_formats(self, playlist, video_id): + path = traverse_obj(playlist, ('data', 'path')) + if not path: + raise ExtractorError('Path value not found in playlist JSON response') + session = self._download_json( + 'https://sts.postimees.ee/session/register', + video_id, note='Creating session', errnote='Error creating session', + headers={ + 'X-Original-URI': path, + 'Accept': 'application/json', + }) + if session.get('reason') != 'OK' or not session.get('session'): + reason = session.get('reason', 'unknown error') + raise ExtractorError(f'Unable to obtain session: {reason}') + + formats = [] + for stream in traverse_obj(playlist, ('data', 'streams', ..., 'file')): + formats.extend(self._extract_m3u8_formats( + update_url_query(stream, {'s': session['session']}), video_id, 'mp4')) + + return formats diff --git a/hypervideo_dl/extractor/kanalplay.py b/hypervideo_dl/extractor/kanalplay.py deleted file mode 100644 index 5e24f7e..0000000 --- a/hypervideo_dl/extractor/kanalplay.py +++ /dev/null @@ -1,96 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - srt_subtitles_timecode, -) - - -class KanalPlayIE(InfoExtractor): - IE_DESC = 'Kanal 5/9/11 Play' - _VALID_URL = r'https?://(?:www\.)?kanal(?P<channel_id>5|9|11)play\.se/(?:#!/)?(?:play/)?program/\d+/video/(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.kanal5play.se/#!/play/program/3060212363/video/3270012277', - 'info_dict': { - 'id': '3270012277', - 'ext': 'flv', - 'title': 'Saknar både dusch och avlopp', - 'description': 'md5:6023a95832a06059832ae93bc3c7efb7', - 'duration': 2636.36, - }, - 'params': { - # rtmp download - 'skip_download': True, - } - }, { - 'url': 'http://www.kanal9play.se/#!/play/program/335032/video/246042', - 'only_matching': True, - }, { - 'url': 'http://www.kanal11play.se/#!/play/program/232835958/video/367135199', - 'only_matching': True, - }] - - def _fix_subtitles(self, subs): - 
return '\r\n\r\n'.join( - '%s\r\n%s --> %s\r\n%s' - % ( - num, - srt_subtitles_timecode(item['startMillis'] / 1000.0), - srt_subtitles_timecode(item['endMillis'] / 1000.0), - item['text'], - ) for num, item in enumerate(subs, 1)) - - def _get_subtitles(self, channel_id, video_id): - subs = self._download_json( - 'http://www.kanal%splay.se/api/subtitles/%s' % (channel_id, video_id), - video_id, 'Downloading subtitles JSON', fatal=False) - return {'sv': [{'ext': 'srt', 'data': self._fix_subtitles(subs)}]} if subs else {} - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - channel_id = mobj.group('channel_id') - - video = self._download_json( - 'http://www.kanal%splay.se/api/getVideo?format=FLASH&videoId=%s' % (channel_id, video_id), - video_id) - - reasons_for_no_streams = video.get('reasonsForNoStreams') - if reasons_for_no_streams: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, '\n'.join(reasons_for_no_streams)), - expected=True) - - title = video['title'] - description = video.get('description') - duration = float_or_none(video.get('length'), 1000) - thumbnail = video.get('posterUrl') - - stream_base_url = video['streamBaseUrl'] - - formats = [{ - 'url': stream_base_url, - 'play_path': stream['source'], - 'ext': 'flv', - 'tbr': float_or_none(stream.get('bitrate'), 1000), - 'rtmp_real_time': True, - } for stream in video['streams']] - self._sort_formats(formats) - - subtitles = {} - if video.get('hasSubtitle'): - subtitles = self.extract_subtitles(channel_id, video_id) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'subtitles': subtitles, - } diff --git a/hypervideo_dl/extractor/kankan.py b/hypervideo_dl/extractor/kankan.py deleted file mode 100644 index a677ff4..0000000 --- a/hypervideo_dl/extractor/kankan.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import unicode_literals - -import re -import hashlib - -from .common import InfoExtractor - -_md5 = lambda s: hashlib.md5(s.encode('utf-8')).hexdigest() - - -class KankanIE(InfoExtractor): - _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml' - - _TEST = { - 'url': 'http://yinyue.kankan.com/vod/48/48863.shtml', - 'md5': '29aca1e47ae68fc28804aca89f29507e', - 'info_dict': { - 'id': '48863', - 'ext': 'flv', - 'title': 'Ready To Go', - }, - 'skip': 'Only available from China', - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._search_regex(r'(?:G_TITLE=|G_MOVIE_TITLE = )[\'"](.+?)[\'"]', webpage, 'video title') - surls = re.search(r'surls:\[\'.+?\'\]|lurl:\'.+?\.flv\'', webpage).group(0) - gcids = re.findall(r'http://.+?/.+?/(.+?)/', surls) - gcid = gcids[-1] - - info_url = 'http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid - video_info_page = self._download_webpage( - info_url, video_id, 'Downloading video url info') - ip = self._search_regex(r'ip:"(.+?)"', video_info_page, 'video url ip') - path = self._search_regex(r'path:"(.+?)"', video_info_page, 'video url path') - param1 = self._search_regex(r'param1:(\d+)', video_info_page, 'param1') - param2 = self._search_regex(r'param2:(\d+)', video_info_page, 'param2') - key = _md5('xl_mp43651' + param1 + param2) - video_url = 'http://%s%s?key=%s&key1=%s' % (ip, path, key, param2) - - return { - 'id': video_id, - 'title': title, - 'url': video_url, - } diff --git a/hypervideo_dl/extractor/karaoketv.py 
b/hypervideo_dl/extractor/karaoketv.py index bfccf89..381dc00 100644 --- a/hypervideo_dl/extractor/karaoketv.py +++ b/hypervideo_dl/extractor/karaoketv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/karrierevideos.py b/hypervideo_dl/extractor/karrierevideos.py index 7b291e0..28d4841 100644 --- a/hypervideo_dl/extractor/karrierevideos.py +++ b/hypervideo_dl/extractor/karrierevideos.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( diff --git a/hypervideo_dl/extractor/keezmovies.py b/hypervideo_dl/extractor/keezmovies.py index 06dbcbb..b50da42 100644 --- a/hypervideo_dl/extractor/keezmovies.py +++ b/hypervideo_dl/extractor/keezmovies.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -7,7 +5,6 @@ from ..aes import aes_decrypt_text from ..compat import compat_urllib_parse_unquote from ..utils import ( determine_ext, - ExtractorError, format_field, int_or_none, str_to_int, @@ -70,7 +67,7 @@ class KeezMoviesIE(InfoExtractor): video_url, title, 32).decode('utf-8') formats.append({ 'url': format_url, - 'format_id': format_field(height, template='%dp'), + 'format_id': format_field(height, None, '%dp'), 'height': height, 'tbr': tbr, }) @@ -105,12 +102,6 @@ class KeezMoviesIE(InfoExtractor): self.raise_no_formats( 'Video %s is no longer available' % video_id, expected=True) - try: - self._sort_formats(formats) - except ExtractorError: - if fatal: - raise - if not title: title = self._html_search_regex( r'<h1[^>]*>([^<]+)', webpage, 'title') diff --git a/hypervideo_dl/extractor/kelbyone.py b/hypervideo_dl/extractor/kelbyone.py index 20c26cf..2ca9ad4 100644 --- a/hypervideo_dl/extractor/kelbyone.py +++ b/hypervideo_dl/extractor/kelbyone.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import int_or_none @@ -62,7 +59,6 @@ class KelbyOneIE(InfoExtractor): subtitles.setdefault('en', []).append({ 'url': track['file'], }) - self._sort_formats(formats) yield { 'id': video_id, 'title': item['title'], diff --git a/hypervideo_dl/extractor/ketnet.py b/hypervideo_dl/extractor/ketnet.py index e0599d0..ab62767 100644 --- a/hypervideo_dl/extractor/ketnet.py +++ b/hypervideo_dl/extractor/ketnet.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .canvas import CanvasIE from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote diff --git a/hypervideo_dl/extractor/khanacademy.py b/hypervideo_dl/extractor/khanacademy.py index 87e5203..5333036 100644 --- a/hypervideo_dl/extractor/khanacademy.py +++ b/hypervideo_dl/extractor/khanacademy.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -27,16 +25,21 @@ class KhanAcademyBaseIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - component_props = self._parse_json(self._download_json( - 'https://www.khanacademy.org/api/internal/graphql', + content = self._download_json( + 'https://www.khanacademy.org/api/internal/graphql/FetchContentData', display_id, query={ - 'hash': 1604303425, + 'fastly_cacheable': 'persist_until_publish', + 'hash': '4134764944', + 'lang': 'en', 'variables': json.dumps({ 'path': display_id, - 'queryParams': '', + 'queryParams': 'lang=en', + 'isModal': False, + 
'followRedirects': True, + 'countryCode': 'US', + }), - })['data']['contentJson'], display_id)['componentProps'] - return self._parse_component_props(component_props) + })['data']['contentJson'] + return self._parse_component_props(self._parse_json(content, display_id)['componentProps']) class KhanAcademyIE(KhanAcademyBaseIE): diff --git a/hypervideo_dl/extractor/kicker.py b/hypervideo_dl/extractor/kicker.py new file mode 100644 index 0000000..a2c7dd4 --- /dev/null +++ b/hypervideo_dl/extractor/kicker.py @@ -0,0 +1,55 @@ +from .common import InfoExtractor +from .dailymotion import DailymotionIE + + +class KickerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kicker\.de/(?P<id>[\w-]+)/video' + _TESTS = [{ + 'url': 'https://www.kicker.de/pogba-dembel-co-die-top-11-der-abloesefreien-spieler-905049/video', + 'info_dict': { + 'id': 'km04mrK0DrRAVxy2GcA', + 'title': 'md5:b91d145bac5745ac58d5479d8347a875', + 'ext': 'mp4', + 'duration': 350, + 'description': 'md5:a5a3dd77dbb6550dbfb997be100b9998', + 'uploader_id': 'x2dfupo', + 'timestamp': 1654677626, + 'like_count': int, + 'uploader': 'kicker.de', + 'view_count': int, + 'age_limit': 0, + 'thumbnail': r're:https://s\d+\.dmcdn\.net/v/T-x741YeYAx8aSZ0Z/x1080', + 'tags': ['published', 'category.InternationalSoccer'], + 'upload_date': '20220608' + } + }, { + 'url': 'https://www.kicker.de/ex-unioner-in-der-bezirksliga-felix-kroos-vereinschallenge-in-pankow-902825/video', + 'info_dict': { + 'id': 'k2omNsJKdZ3TxwxYSFJ', + 'title': 'md5:72ec24d7f84b8436fe1e89d198152adf', + 'ext': 'mp4', + 'uploader_id': 'x2dfupo', + 'duration': 331, + 'timestamp': 1652966015, + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/TxU4Z1YYCmtisTbMq/x1080', + 'tags': ['FELIX KROOS', 'EINFACH MAL LUPPEN', 'KROOS', 'FSV FORTUNA PANKOW', 'published', 'category.Amateurs', 'marketingpreset.Spreekick'], + 'age_limit': 0, + 'view_count': int, + 'upload_date': '20220519', + 'uploader': 'kicker.de', + 'description': 'md5:0c2060c899a91c8bf40f578f78c5846f', + 'like_count': int, + } + }] + + def _real_extract(self, url): + video_slug = self._match_id(url) + + webpage = self._download_webpage(url, video_slug) + dailymotion_video_id = self._search_regex( + r'data-dmprivateid\s*=\s*[\'"](?P<video_id>\w+)', webpage, + 'video id', group='video_id') + + return self.url_result( + f'https://www.dailymotion.com/video/{dailymotion_video_id}', + ie=DailymotionIE, video_title=self._html_extract_title(webpage)) diff --git a/hypervideo_dl/extractor/kickstarter.py b/hypervideo_dl/extractor/kickstarter.py index d4da8f4..c0d851d 100644 --- a/hypervideo_dl/extractor/kickstarter.py +++ b/hypervideo_dl/extractor/kickstarter.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import smuggle_url diff --git a/hypervideo_dl/extractor/kinja.py b/hypervideo_dl/extractor/kinja.py index 1be8b48..df1386f 100644 --- a/hypervideo_dl/extractor/kinja.py +++ b/hypervideo_dl/extractor/kinja.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import ( compat_str, @@ -13,8 +8,6 @@ from ..utils import ( parse_iso8601, strip_or_none, try_get, - unescapeHTML, - urljoin, ) @@ -58,6 +51,7 @@ class KinjaEmbedIE(InfoExtractor): vine| youtube-(?:list|video) )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX) + _EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//{_DOMAIN_REGEX})?{_COMMON_REGEX}(?:(?!\1).)+)\1'] _TESTS = [{ 'url': 
'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621', 'only_matching': True, @@ -122,12 +116,6 @@ class KinjaEmbedIE(InfoExtractor): 'youtube-video': ('youtube.com/embed/', 'Youtube'), } - @staticmethod - def _extract_urls(webpage, url): - return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer( - r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX), - webpage)] - def _real_extract(self, url): video_type, video_id = self._match_valid_url(url).groups() @@ -159,7 +147,6 @@ class KinjaEmbedIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) thumbnail = None poster = data.get('poster') or {} @@ -207,8 +194,6 @@ class KinjaEmbedIE(InfoExtractor): 'url': fallback_rendition_url, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/kinopoisk.py b/hypervideo_dl/extractor/kinopoisk.py index cdbb642..5db9083 100644 --- a/hypervideo_dl/extractor/kinopoisk.py +++ b/hypervideo_dl/extractor/kinopoisk.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( dict_get, @@ -47,7 +44,6 @@ class KinoPoiskIE(InfoExtractor): formats = self._extract_m3u8_formats( data['playlistEntity']['uri'], video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) description = dict_get( film, ('descriptscription', 'description', diff --git a/hypervideo_dl/extractor/kompas.py b/hypervideo_dl/extractor/kompas.py new file mode 100644 index 0000000..8bad961 --- /dev/null +++ b/hypervideo_dl/extractor/kompas.py @@ -0,0 +1,26 @@ +from .jixie import JixieBaseIE + + +class KompasVideoIE(JixieBaseIE): + _VALID_URL = r'https?://video\.kompas\.com/\w+/(?P<id>\d+)/(?P<slug>[\w-]+)' + _TESTS = [{ + 'url': 'https://video.kompas.com/watch/164474/kim-jong-un-siap-kirim-nuklir-lawan-as-dan-korsel', + 'info_dict': { + 'id': '164474', + 'ext': 'mp4', + 'title': 'Kim Jong Un Siap Kirim Nuklir Lawan AS dan Korsel', + 'description': 'md5:262530c4fb7462398235f9a5dba92456', + 'uploader_id': '9262bf2590d558736cac4fff7978fcb1', + 'display_id': 'kim-jong-un-siap-kirim-nuklir-lawan-as-dan-korsel', + 'duration': 85.066667, + 'categories': ['news'], + 'thumbnail': 'https://video.jixie.media/1001/164474/164474_1280x720.jpg', + 'tags': 'count:9', + } + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'slug') + webpage = self._download_webpage(url, display_id) + + return self._extract_data_from_jixie_id(display_id, video_id, webpage) diff --git a/hypervideo_dl/extractor/konserthusetplay.py b/hypervideo_dl/extractor/konserthusetplay.py index dd42bb2..10767f1 100644 --- a/hypervideo_dl/extractor/konserthusetplay.py +++ b/hypervideo_dl/extractor/konserthusetplay.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -98,8 +95,6 @@ class KonserthusetPlayIE(InfoExtractor): 'url': fallback_url, }) - self._sort_formats(formats) - title = player_config.get('title') or media['title'] description = player_config.get('mediaInfo', {}).get('description') thumbnail = media.get('image') diff --git a/hypervideo_dl/extractor/koo.py b/hypervideo_dl/extractor/koo.py index 2d6ed3b..6616ccd 100644 --- a/hypervideo_dl/extractor/koo.py 
+++ b/hypervideo_dl/extractor/koo.py @@ -1,5 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( clean_html, @@ -103,7 +101,6 @@ class KooIE(InfoExtractor): if not formats: self.raise_no_formats('No video/audio found at the provided url.', expected=True) - self._sort_formats(formats) return { 'id': id, 'title': clean_html(item_json.get('title')), diff --git a/hypervideo_dl/extractor/krasview.py b/hypervideo_dl/extractor/krasview.py index d27d052..4323aa4 100644 --- a/hypervideo_dl/extractor/krasview.py +++ b/hypervideo_dl/extractor/krasview.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/kth.py b/hypervideo_dl/extractor/kth.py new file mode 100644 index 0000000..e17c6db --- /dev/null +++ b/hypervideo_dl/extractor/kth.py @@ -0,0 +1,28 @@ +from .common import InfoExtractor +from ..utils import smuggle_url + + +class KTHIE(InfoExtractor): + _VALID_URL = r'https?://play\.kth\.se/(?:[^/]+/)+(?P<id>[a-z0-9_]+)' + _TEST = { + 'url': 'https://play.kth.se/media/Lunch+breakA+De+nya+aff%C3%A4rerna+inom+Fordonsdalen/0_uoop6oz9', + 'md5': 'd83ada6d00ca98b73243a88efe19e8a6', + 'info_dict': { + 'id': '0_uoop6oz9', + 'ext': 'mp4', + 'title': 'md5:bd1d6931facb6828762a33e6ce865f37', + 'thumbnail': 're:https?://.+/thumbnail/.+', + 'duration': 3516, + 'timestamp': 1647345358, + 'upload_date': '20220315', + 'uploader_id': 'md5:0ec23e33a89e795a4512930c8102509f', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + result = self.url_result( + smuggle_url('kaltura:308:%s' % video_id, { + 'service_url': 'https://api.kaltura.nordu.net'}), + 'Kaltura') + return result diff --git a/hypervideo_dl/extractor/ku6.py b/hypervideo_dl/extractor/ku6.py index a574408..31b4ea0 100644 --- a/hypervideo_dl/extractor/ku6.py +++ b/hypervideo_dl/extractor/ku6.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/kusi.py b/hypervideo_dl/extractor/kusi.py index 707fe18..a23ad89 100644 --- a/hypervideo_dl/extractor/kusi.py +++ b/hypervideo_dl/extractor/kusi.py @@ -1,13 +1,10 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus from ..utils import ( - int_or_none, float_or_none, + int_or_none, timeconvert, update_url_query, xpath_text, @@ -69,12 +66,11 @@ class KUSIIE(InfoExtractor): formats = [] for quality in quality_options: formats.append({ - 'url': compat_urllib_parse_unquote_plus(quality.attrib['url']), + 'url': urllib.parse.unquote_plus(quality.attrib['url']), 'height': int_or_none(quality.attrib.get('height')), 'width': int_or_none(quality.attrib.get('width')), 'vbr': float_or_none(quality.attrib.get('bitratebits'), scale=1000), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/kuwo.py b/hypervideo_dl/extractor/kuwo.py index 460a425..cfec1c5 100644 --- a/hypervideo_dl/extractor/kuwo.py +++ b/hypervideo_dl/extractor/kuwo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -107,7 +104,6 @@ class KuwoIE(KuwoBaseIE): lrc_content = None formats = self._get_formats(song_id) - self._sort_formats(formats) album_id = self._html_search_regex( r'<a[^>]+href="http://www\.kuwo\.cn/album/(\d+)/"', @@ -342,8 
+338,6 @@ class KuwoMvIE(KuwoBaseIE): 'format_id': 'mv', }) - self._sort_formats(formats) - return { 'id': song_id, 'title': song_name, diff --git a/hypervideo_dl/extractor/la7.py b/hypervideo_dl/extractor/la7.py index de985e4..68dc1d4 100644 --- a/hypervideo_dl/extractor/la7.py +++ b/hypervideo_dl/extractor/la7.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -81,8 +78,6 @@ class LA7IE(InfoExtractor): if http_f: formats.append(http_f) - self._sort_formats(formats) - return { 'id': video_id, 'title': self._og_search_title(webpage, default=None), @@ -139,7 +134,6 @@ class LA7PodcastEpisodeIE(InfoExtractor): 'format_id': ext, 'ext': ext, }] - self._sort_formats(formats) title = self._html_search_regex( (r'<div class="title">(?P<title>.+?)</', @@ -197,7 +191,7 @@ class LA7PodcastEpisodeIE(InfoExtractor): return self._extract_info(webpage, video_id) -class LA7PodcastIE(LA7PodcastEpisodeIE): +class LA7PodcastIE(LA7PodcastEpisodeIE): # XXX: Do not subclass from concrete IE IE_NAME = 'la7.it:podcast' _VALID_URL = r'(https?://)?(www\.)?la7\.it/(?P<id>[^/]+)/podcast/?(?:$|[#?])' diff --git a/hypervideo_dl/extractor/laola1tv.py b/hypervideo_dl/extractor/laola1tv.py index b5d27c2..416dd7e 100644 --- a/hypervideo_dl/extractor/laola1tv.py +++ b/hypervideo_dl/extractor/laola1tv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -52,7 +49,6 @@ class Laola1TvEmbedIE(InfoExtractor): formats = self._extract_akamai_formats( '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), video_id) - self._sort_formats(formats) return formats def _real_extract(self, url): @@ -121,7 +117,7 @@ class Laola1TvEmbedIE(InfoExtractor): } -class Laola1TvBaseIE(Laola1TvEmbedIE): +class Laola1TvBaseIE(Laola1TvEmbedIE): # XXX: Do not subclass from concrete IE def _extract_video(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) diff --git a/hypervideo_dl/extractor/lastfm.py b/hypervideo_dl/extractor/lastfm.py index 5215717..f14198c 100644 --- a/hypervideo_dl/extractor/lastfm.py +++ b/hypervideo_dl/extractor/lastfm.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -18,7 +15,7 @@ class LastFMPlaylistBaseIE(InfoExtractor): for page_number in range(start_page_number, (last_page_number or start_page_number) + 1): webpage = self._download_webpage( url, playlist_id, - note='Downloading page %d%s' % (page_number, format_field(last_page_number, template=' of %d')), + note='Downloading page %d%s' % (page_number, format_field(last_page_number, None, ' of %d')), query={'page': page_number}) page_entries = [ self.url_result(player_url, 'Youtube') diff --git a/hypervideo_dl/extractor/lbry.py b/hypervideo_dl/extractor/lbry.py index 5d5457c..b5def1e 100644 --- a/hypervideo_dl/extractor/lbry.py +++ b/hypervideo_dl/extractor/lbry.py @@ -1,23 +1,18 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import json from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_unquote, -) +from ..compat import compat_str, compat_urllib_parse_unquote from ..utils import ( - determine_ext, ExtractorError, + HEADRequest, + OnDemandPagedList, + UnsupportedError, + determine_ext, int_or_none, mimetype2ext, parse_qs, - OnDemandPagedList, try_get, - UnsupportedError, urljoin, ) @@ -29,10 +24,14 @@ class LBRYBaseIE(InfoExtractor): 
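# The hunk below extends LBRYBaseIE._call_api_proxy to forward the odysee.com
# auth_token cookie as an x-lbry-auth-token header. A minimal standalone sketch
# of the same JSON-RPC exchange follows; the endpoint, content type and header
# name are taken from that hunk, while the 'resolve' method, its params and the
# token value are illustrative assumptions, not part of this commit:
import json
import urllib.request

token = '...'  # hypothetical: the value of the odysee.com auth_token cookie
headers = {'Content-Type': 'application/json-rpc'}
if token:
    headers['x-lbry-auth-token'] = token
req = urllib.request.Request(
    'https://api.lbry.tv/api/v1/proxy',
    data=json.dumps({
        'method': 'resolve',  # assumed method name for illustration
        'params': {'urls': ['lbry://@Mantega:1/First-day-LBRY:1']},
    }).encode(),
    headers=headers)
print(json.load(urllib.request.urlopen(req)))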
_SUPPORTED_STREAM_TYPES = ['video', 'audio'] def _call_api_proxy(self, method, display_id, params, resource): + headers = {'Content-Type': 'application/json-rpc'} + token = try_get(self._get_cookies('https://odysee.com'), lambda x: x['auth_token'].value) + if token: + headers['x-lbry-auth-token'] = token response = self._download_json( 'https://api.lbry.tv/api/v1/proxy', display_id, 'Downloading %s JSON metadata' % resource, - headers={'Content-Type': 'application/json-rpc'}, + headers=headers, data=json.dumps({ 'method': method, 'params': params, @@ -94,7 +93,7 @@ class LBRYIE(LBRYBaseIE): _TESTS = [{ # Video 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1', - 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9', + 'md5': 'fffd15d76062e9a985c22c7c7f2f4805', 'info_dict': { 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d', 'ext': 'mp4', @@ -106,6 +105,19 @@ class LBRYIE(LBRYBaseIE): 'release_date': '20200721', 'width': 1280, 'height': 720, + 'thumbnail': 'https://spee.ch/7/67f2d809c263288c.png', + 'license': 'None', + 'duration': 346, + 'channel': 'LBRY/Odysee rats united!!!', + 'channel_id': '1c8ad6a2ab4e889a71146ae4deeb23bb92dab627', + 'channel_url': 'https://lbry.tv/@Mantega:1c8ad6a2ab4e889a71146ae4deeb23bb92dab627', + 'tags': [ + 'first day in lbry', + 'lbc', + 'lbry', + 'start', + 'tutorial' + ], } }, { # Audio @@ -126,11 +138,13 @@ class LBRYIE(LBRYBaseIE): 'channel_id': '0ed629d2b9c601300cacf7eabe9da0be79010212', 'channel_url': 'https://lbry.tv/@LBRYFoundation:0ed629d2b9c601300cacf7eabe9da0be79010212', 'vcodec': 'none', + 'thumbnail': 'https://spee.ch/d/0bc63b0e6bf1492d.png', + 'license': 'None', } }, { # HLS 'url': 'https://odysee.com/@gardeningincanada:b/plants-i-will-never-grow-again.-the:e', - 'md5': 'fc82f45ea54915b1495dd7cb5cc1289f', + 'md5': '25049011f3c8bc2f8b60ad88a031837e', 'info_dict': { 'id': 'e51671357333fe22ae88aad320bde2f6f96b1410', 'ext': 'mp4', @@ -146,12 +160,37 @@ class LBRYIE(LBRYBaseIE): 'channel_id': 'b8be0e93b423dad221abe29545fbe8ec36e806bc', 'channel_url': 'https://odysee.com/@gardeningincanada:b8be0e93b423dad221abe29545fbe8ec36e806bc', 'formats': 'mincount:3', + 'thumbnail': 'https://thumbnails.lbry.com/AgHSc_HzrrE', + 'license': 'Copyrighted (contact publisher)', } }, { + # HLS live stream (might expire) + 'url': 'https://odysee.com/@RT:fd/livestream_RT:d', + 'info_dict': { + 'id': 'fdd11cb3ab75f95efb7b3bc2d726aa13ac915b66', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': 'startswith:RT News | Livestream 24/7', + 'description': 'md5:fe68d0056dfe79c1a6b8ce8c34d5f6fa', + 'timestamp': int, + 'upload_date': str, + 'release_timestamp': int, + 'release_date': str, + 'tags': list, + 'duration': None, + 'channel': 'RT', + 'channel_id': 'fdd11cb3ab75f95efb7b3bc2d726aa13ac915b66', + 'channel_url': 'https://odysee.com/@RT:fdd11cb3ab75f95efb7b3bc2d726aa13ac915b66', + 'formats': 'mincount:1', + 'thumbnail': 'startswith:https://thumb', + 'license': 'None', + }, + 'params': {'skip_download': True} + }, { 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e', 'only_matching': True, }, { - 'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b", + 'url': 'https://odysee.com/@ScammerRevolts:b0/I-SYSKEY\'D-THE-SAME-SCAMMERS-3-TIMES!:b', 'only_matching': True, }, { 'url': 'https://lbry.tv/Episode-1:e7d93d772bd87e2b62d5ab993c1c3ced86ebb396', @@ -185,20 +224,24 @@ class LBRYIE(LBRYBaseIE): display_id = compat_urllib_parse_unquote(display_id) uri = 'lbry://' + display_id result = self._resolve_url(uri, display_id, 
'stream') + headers = {'Referer': 'https://odysee.com/'} if result['value'].get('stream_type') in self._SUPPORTED_STREAM_TYPES: - claim_id, is_live, headers = result['claim_id'], False, None + claim_id, is_live = result['claim_id'], False streaming_url = self._call_api_proxy( 'get', claim_id, {'uri': uri}, 'streaming url')['streaming_url'] final_url = self._request_webpage( - streaming_url, display_id, note='Downloading streaming redirect url info').geturl() + HEADRequest(streaming_url), display_id, headers=headers, + note='Downloading streaming redirect url info').geturl() elif result.get('value_type') == 'stream': claim_id, is_live = result['signing_channel']['claim_id'], True - headers = {'referer': 'https://player.odysee.live/'} live_data = self._download_json( - f'https://api.live.odysee.com/v1/odysee/live/{claim_id}', claim_id, + 'https://api.odysee.live/livestream/is_live', claim_id, + query={'channel_claim_id': claim_id}, note='Downloading livestream JSON metadata')['data'] - streaming_url = final_url = live_data.get('url') - if not final_url and not live_data.get('live'): + streaming_url = final_url = live_data.get('VideoURL') + # Upcoming videos may still give VideoURL + if not live_data.get('Live'): + streaming_url = final_url = None self.raise_no_formats('This stream is not live', True, claim_id) else: raise UnsupportedError(url) @@ -207,7 +250,6 @@ class LBRYIE(LBRYBaseIE): if determine_ext(final_url) == 'm3u8': info['formats'] = self._extract_m3u8_formats( final_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls', live=is_live, headers=headers) - self._sort_formats(info['formats']) else: info['url'] = streaming_url return { @@ -229,7 +271,7 @@ class LBRYChannelIE(LBRYBaseIE): 'title': 'The LBRY Foundation', 'description': 'Channel for the LBRY Foundation. 
Follow for updates and news.', }, - 'playlist_count': 29, + 'playlist_mincount': 29, }, { 'url': 'https://lbry.tv/@LBRYFoundation', 'only_matching': True, diff --git a/hypervideo_dl/extractor/lci.py b/hypervideo_dl/extractor/lci.py index 920872f..e7d2f8a 100644 --- a/hypervideo_dl/extractor/lci.py +++ b/hypervideo_dl/extractor/lci.py @@ -1,26 +1,28 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor class LCIIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?lci\.fr/[^/]+/[\w-]+-(?P<id>\d+)\.html' - _TEST = { - 'url': 'http://www.lci.fr/international/etats-unis-a-j-62-hillary-clinton-reste-sans-voix-2001679.html', - 'md5': '2fdb2538b884d4d695f9bd2bde137e6c', + _VALID_URL = r'https?://(?:www\.)?(?:lci|tf1info)\.fr/[^/]+/[\w-]+-(?P<id>\d+)\.html' + _TESTS = [{ + 'url': 'https://www.tf1info.fr/politique/election-presidentielle-2022-second-tour-j-2-marine-le-pen-et-emmanuel-macron-en-interview-de-lci-vendredi-soir-2217486.html', 'info_dict': { - 'id': '13244802', + 'id': '13875948', 'ext': 'mp4', - 'title': 'Hillary Clinton et sa quinte de toux, en plein meeting', - 'description': 'md5:a4363e3a960860132f8124b62f4a01c9', - } - } + 'title': 'md5:660df5481fd418bc3bbb0d070e6fdb5a', + 'thumbnail': 'https://photos.tf1.fr/1280/720/presidentielle-2022-marine-le-pen-et-emmanuel-macron-invites-de-lci-ce-vendredi-9c0e73-e1a036-0@1x.jpg', + 'upload_date': '20220422', + 'duration': 33, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.lci.fr/politique/election-presidentielle-2022-second-tour-j-2-marine-le-pen-et-emmanuel-macron-en-interview-de-lci-vendredi-soir-2217486.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - wat_id = self._search_regex( - (r'data-watid=[\'"](\d+)', r'idwat["\']?\s*:\s*["\']?(\d+)'), - webpage, 'wat id') + wat_id = self._search_regex(r'watId["\']?\s*:\s*["\']?(\d+)', webpage, 'wat id') return self.url_result('wat:' + wat_id, 'Wat', wat_id) diff --git a/hypervideo_dl/extractor/lcp.py b/hypervideo_dl/extractor/lcp.py index ade27a9..9846319 100644 --- a/hypervideo_dl/extractor/lcp.py +++ b/hypervideo_dl/extractor/lcp.py @@ -1,11 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .arkena import ArkenaIE -class LcpPlayIE(ArkenaIE): +class LcpPlayIE(ArkenaIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://play\.lcp\.fr/embed/(?P<id>[^/]+)/(?P<account_id>[^/]+)/[^/]+/[^/]+' _TESTS = [{ 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', diff --git a/hypervideo_dl/extractor/lecture2go.py b/hypervideo_dl/extractor/lecture2go.py index 81b5d41..3a9b30a 100644 --- a/hypervideo_dl/extractor/lecture2go.py +++ b/hypervideo_dl/extractor/lecture2go.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -52,8 +49,6 @@ class Lecture2GoIE(InfoExtractor): 'url': url, }) - self._sort_formats(formats) - creator = self._html_search_regex( r'<div[^>]+id="description">([^<]+)</div>', webpage, 'creator', fatal=False) duration = parse_duration(self._html_search_regex( diff --git a/hypervideo_dl/extractor/lecturio.py b/hypervideo_dl/extractor/lecturio.py index 0ee1eeb..973764c 100644 --- a/hypervideo_dl/extractor/lecturio.py +++ b/hypervideo_dl/extractor/lecturio.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor 
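# A pattern repeated throughout this commit, including the lecturio.py hunk
# just below: explicit self._sort_formats(formats) calls are deleted because
# format sorting now runs automatically after extraction. Extractors with
# custom criteria declare them in the returned info dict instead, as the
# leeco.py hunk further below does. A minimal sketch of the new shape; the
# URL and ids are hypothetical:
def _real_extract(self, url):
    formats = [{'url': 'https://example.com/v.mp4', 'height': 720}]
    return {
        'id': 'example',
        'title': 'example',
        'formats': formats,  # no self._sort_formats(formats) call needed
        '_format_sort_fields': ('res', 'quality'),  # replaces the old sort argument
    }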
@@ -140,7 +137,6 @@ class LecturioIE(LecturioBaseIE): 'height': int(mobj.group(1)), }) formats.append(f) - self._sort_formats(formats) subtitles = {} automatic_captions = {} diff --git a/hypervideo_dl/extractor/leeco.py b/hypervideo_dl/extractor/leeco.py index d5e1142..85033b8 100644 --- a/hypervideo_dl/extractor/leeco.py +++ b/hypervideo_dl/extractor/leeco.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import datetime import hashlib import re @@ -185,7 +182,6 @@ class LeIE(InfoExtractor): f['height'] = int_or_none(format_id[:-1]) formats.append(f) - self._sort_formats(formats, ('res', 'quality')) publish_time = parse_iso8601(self._html_search_regex( r'发布时间 ([^<>]+) ', page, 'publish time', default=None), @@ -199,6 +195,7 @@ class LeIE(InfoExtractor): 'thumbnail': playurl['pic'], 'description': description, 'timestamp': publish_time, + '_format_sort_fields': ('res', 'quality'), } @@ -359,7 +356,6 @@ class LetvCloudIE(InfoExtractor): media_id = uu + '_' + vu formats = self._get_formats('flash', uu, vu, media_id) + self._get_formats('html5', uu, vu, media_id) - self._sort_formats(formats) return { 'id': media_id, diff --git a/hypervideo_dl/extractor/lego.py b/hypervideo_dl/extractor/lego.py index 901f43b..811b447 100644 --- a/hypervideo_dl/extractor/lego.py +++ b/hypervideo_dl/extractor/lego.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import uuid from .common import InfoExtractor @@ -116,7 +113,6 @@ class LEGOIE(InfoExtractor): 'width': quality[2], }), formats.append(f) - self._sort_formats(formats) subtitles = {} sub_file_id = video.get('SubFileId') diff --git a/hypervideo_dl/extractor/lemonde.py b/hypervideo_dl/extractor/lemonde.py index 3306892..c916791 100644 --- a/hypervideo_dl/extractor/lemonde.py +++ b/hypervideo_dl/extractor/lemonde.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/lenta.py b/hypervideo_dl/extractor/lenta.py index 2ebd4e5..10aac98 100644 --- a/hypervideo_dl/extractor/lenta.py +++ b/hypervideo_dl/extractor/lenta.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/libraryofcongress.py b/hypervideo_dl/extractor/libraryofcongress.py index 03f2051..b76ca09 100644 --- a/hypervideo_dl/extractor/libraryofcongress.py +++ b/hypervideo_dl/extractor/libraryofcongress.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -129,8 +126,6 @@ class LibraryOfCongressIE(InfoExtractor): 'filesize_approx': parse_filesize(m.group('size')), }) - self._sort_formats(formats) - duration = float_or_none(data.get('duration')) view_count = int_or_none(data.get('viewCount')) diff --git a/hypervideo_dl/extractor/libsyn.py b/hypervideo_dl/extractor/libsyn.py index d1fcda4..29bbb03 100644 --- a/hypervideo_dl/extractor/libsyn.py +++ b/hypervideo_dl/extractor/libsyn.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( clean_html, @@ -14,6 +10,7 @@ from ..utils import ( class LibsynIE(InfoExtractor): _VALID_URL = r'(?P<mainurl>https?://html5-player\.libsyn\.com/embed/episode/id/(?P<id>[0-9]+))' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//html5-player\.libsyn\.com/embed/.+?)\1'] _TESTS = [{ 'url': 'http://html5-player.libsyn.com/embed/episode/id/6385796/', diff --git 
a/hypervideo_dl/extractor/lifenews.py b/hypervideo_dl/extractor/lifenews.py index 49a0a59..919cfcb 100644 --- a/hypervideo_dl/extractor/lifenews.py +++ b/hypervideo_dl/extractor/lifenews.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -226,8 +223,6 @@ class LifeEmbedIE(InfoExtractor): else: extract_original(video_url) - self._sort_formats(formats) - thumbnail = thumbnail or self._search_regex( r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None) diff --git a/hypervideo_dl/extractor/likee.py b/hypervideo_dl/extractor/likee.py new file mode 100644 index 0000000..74ee2be --- /dev/null +++ b/hypervideo_dl/extractor/likee.py @@ -0,0 +1,192 @@ +import json + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + js_to_json, + parse_iso8601, + str_or_none, + traverse_obj, +) + + +class LikeeIE(InfoExtractor): + IE_NAME = 'likee' + _VALID_URL = r'(?x)https?://(www\.)?likee\.video/(?:(?P<channel_name>[^/]+)/video/|v/)(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://likee.video/@huynh_hong_quan_/video/7093444807096327263', + 'info_dict': { + 'id': '7093444807096327263', + 'ext': 'mp4', + 'title': '🤴🤴🤴', + 'description': 'md5:9a7ebe816f0e78722ee5ed76f75983b4', + 'thumbnail': r're:^https?://.+\.jpg', + 'uploader': 'Huỳnh Hồng Quân ', + 'play_count': int, + 'download_count': int, + 'artist': 'Huỳnh Hồng Quân ', + 'timestamp': 1651571320, + 'upload_date': '20220503', + 'view_count': int, + 'uploader_id': 'huynh_hong_quan_', + 'duration': 12374, + 'comment_count': int, + 'like_count': int, + }, + }, { + 'url': 'https://likee.video/@649222262/video/7093167848050058862', + 'info_dict': { + 'id': '7093167848050058862', + 'ext': 'mp4', + 'title': 'likee video #7093167848050058862', + 'description': 'md5:3f971c8c6ee8a216f2b1a9094c5de99f', + 'thumbnail': r're:^https?://.+\.jpg', + 'comment_count': int, + 'like_count': int, + 'uploader': 'Vương Phước Nhi', + 'download_count': int, + 'timestamp': 1651506835, + 'upload_date': '20220502', + 'duration': 60024, + 'play_count': int, + 'artist': 'Vương Phước Nhi', + 'uploader_id': '649222262', + 'view_count': int, + }, + }, { + 'url': 'https://likee.video/@fernanda_rivasg/video/6932224568407629502', + 'info_dict': { + 'id': '6932224568407629502', + 'ext': 'mp4', + 'title': 'Un trend viejito🔥 #LIKEE #Ferlovers #trend ', + 'description': 'md5:c42b903a72a99d6d8b73e3d1126fbcef', + 'thumbnail': r're:^https?://.+\.jpg', + 'comment_count': int, + 'duration': 9684, + 'uploader_id': 'fernanda_rivasg', + 'view_count': int, + 'play_count': int, + 'artist': 'La Cami La✨', + 'download_count': int, + 'like_count': int, + 'uploader': 'Fernanda Rivas🎶', + 'timestamp': 1614034308, + 'upload_date': '20210222', + }, + }, { + 'url': 'https://likee.video/v/k6QcOp', + 'info_dict': { + 'id': 'k6QcOp', + 'ext': 'mp4', + 'title': '#AguaChallenge tú ya lo intentaste?😱🤩', + 'description': 'md5:b0cc462689d4ff2b624daa4dba7640d9', + 'thumbnail': r're:^https?://.+\.jpg', + 'comment_count': int, + 'duration': 18014, + 'play_count': int, + 'view_count': int, + 'timestamp': 1611694774, + 'like_count': int, + 'uploader': 'Fernanda Rivas🎶', + 'uploader_id': 'fernanda_rivasg', + 'download_count': int, + 'artist': 'ʟᴇʀɪᴋ_ᴜɴɪᴄᴏʀɴ♡︎', + 'upload_date': '20210126', + }, + }, { + 'url': 'https://www.likee.video/@649222262/video/7093167848050058862', + 'only_matching': True, + }, { + 'url': 'https://www.likee.video/v/k6QcOp', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = 
self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = self._parse_json( + self._search_regex(r'window\.data\s*=\s*({.+?});', webpage, 'video info'), + video_id, transform_source=js_to_json) + video_url = traverse_obj(info, 'video_url', ('originVideoInfo', 'video_url')) + if not video_url: + self.raise_no_formats('Video was deleted', expected=True) + formats = [] if not video_url else [{ # raise_no_formats only warns under --ignore-no-formats-error, so video_url may still be None here + 'format_id': 'mp4-with-watermark', + 'url': video_url, + 'height': info.get('video_height'), + 'width': info.get('video_width'), + }, { + 'format_id': 'mp4-without-watermark', + 'url': video_url.replace('_4', ''), + 'height': info.get('video_height'), + 'width': info.get('video_width'), + 'quality': 1, + }] + return { + 'id': video_id, + 'title': info.get('msgText'), + 'description': info.get('share_desc'), + 'view_count': int_or_none(info.get('video_count')), + 'like_count': int_or_none(info.get('likeCount')), + 'play_count': int_or_none(info.get('play_count')), + 'download_count': int_or_none(info.get('download_count')), + 'comment_count': int_or_none(info.get('comment_count')), + 'uploader': str_or_none(info.get('nick_name')), + 'uploader_id': str_or_none(info.get('likeeId')), + 'artist': str_or_none(traverse_obj(info, ('sound', 'owner_name'))), + 'timestamp': parse_iso8601(info.get('uploadDate')), + 'thumbnail': info.get('coverUrl'), + 'duration': int_or_none(traverse_obj(info, ('option_data', 'dur'))), + 'formats': formats, + } + + +class LikeeUserIE(InfoExtractor): + IE_NAME = 'likee:user' + _VALID_URL = r'https?://(www\.)?likee\.video/(?P<id>[^/]+)/?$' + _TESTS = [{ + 'url': 'https://likee.video/@fernanda_rivasg', + 'info_dict': { + 'id': '925638334', + 'title': 'fernanda_rivasg', + }, + 'playlist_mincount': 500, + }, { + 'url': 'https://likee.video/@may_hmoob', + 'info_dict': { + 'id': '2943949041', + 'title': 'may_hmoob', + }, + 'playlist_mincount': 80, + }] + _PAGE_SIZE = 50 + _API_GET_USER_VIDEO = 'https://api.like-video.com/likee-activity-flow-micro/videoApi/getUserVideo' + + def _entries(self, user_name, user_id): + last_post_id = '' + while True: + user_videos = self._download_json( + self._API_GET_USER_VIDEO, user_name, + data=json.dumps({ + 'uid': user_id, + 'count': self._PAGE_SIZE, + 'lastPostId': last_post_id, + 'tabType': 0, + }).encode('utf-8'), + headers={'content-type': 'application/json'}, + note=f'Get user info with lastPostId #{last_post_id}') + items = traverse_obj(user_videos, ('data', 'videoList')) + if not items: + break + for item in items: + last_post_id = item['postId'] + yield self.url_result(f'https://likee.video/{user_name}/video/{last_post_id}') + + def _real_extract(self, url): + user_name = self._match_id(url) + webpage = self._download_webpage(url, user_name) + info = self._parse_json( + self._search_regex(r'window\.data\s*=\s*({.+?});', webpage, 'user info'), + user_name, transform_source=js_to_json) + user_id = traverse_obj(info, ('userinfo', 'uid')) + return self.playlist_result(self._entries(user_name, user_id), user_id, traverse_obj(info, ('userinfo', 'user_name'))) diff --git a/hypervideo_dl/extractor/limelight.py b/hypervideo_dl/extractor/limelight.py index b20681a..e11ec43 100644 --- a/hypervideo_dl/extractor/limelight.py +++ b/hypervideo_dl/extractor/limelight.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -20,7 +17,7 @@ class LimelightBaseIE(InfoExtractor): _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' @classmethod - def _extract_urls(cls,
webpage, source_url): + def _extract_embed_urls(cls, url, webpage): lm = { 'Media': 'media', 'Channel': 'channel', @@ -28,7 +25,7 @@ class LimelightBaseIE(InfoExtractor): } def smuggle(url): - return smuggle_url(url, {'source_url': source_url}) + return smuggle_url(url, {'source_url': url}) entries = [] for kind, video_id in re.findall( @@ -182,8 +179,6 @@ class LimelightBaseIE(InfoExtractor): 'ext': ext, }) - self._sort_formats(formats) - subtitles = {} for flag in mobile_item.get('flags'): if flag == 'ClosedCaptions': diff --git a/hypervideo_dl/extractor/line.py b/hypervideo_dl/extractor/line.py index 987c434..3fab9c8 100644 --- a/hypervideo_dl/extractor/line.py +++ b/hypervideo_dl/extractor/line.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -38,7 +34,7 @@ class LineLiveBaseIE(InfoExtractor): 'timestamp': int_or_none(item.get('createdAt')), 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': format_field(channel_id, template='https://live.line.me/channels/%s'), + 'channel_url': format_field(channel_id, None, 'https://live.line.me/channels/%s'), 'duration': int_or_none(item.get('archiveDuration')), 'view_count': int_or_none(item.get('viewerCount')), 'comment_count': int_or_none(item.get('chatCount')), @@ -102,7 +98,6 @@ class LineLiveIE(LineLiveBaseIE): archive_status = item.get('archiveStatus') if archive_status != 'ARCHIVED': self.raise_no_formats('this video has been ' + archive_status.lower(), expected=True) - self._sort_formats(formats) info['formats'] = formats return info diff --git a/hypervideo_dl/extractor/linkedin.py b/hypervideo_dl/extractor/linkedin.py index 0f57bfa..2bf2e9a 100644 --- a/hypervideo_dl/extractor/linkedin.py +++ b/hypervideo_dl/extractor/linkedin.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from itertools import zip_longest import re @@ -114,8 +111,6 @@ class LinkedInIE(LinkedInBaseIE): 'tbr': float_or_none(source.get('data-bitrate'), scale=1000), } for source in sources] - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, @@ -190,10 +185,6 @@ class LinkedInLearningIE(LinkedInLearningBaseIE): streaming_url, video_slug, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - # It seems like this would be correctly handled by default - # However, unless someone can confirm this, the old - # behaviour is being kept as-is - self._sort_formats(formats, ('res', 'source_preference')) subtitles = {} duration = int_or_none(video_data.get('durationInSeconds')) transcript_lines = try_get(video_data, lambda x: x['transcript']['lines'], expected_type=list) @@ -211,6 +202,10 @@ class LinkedInLearningIE(LinkedInLearningBaseIE): 'timestamp': float_or_none(video_data.get('publishedOn'), 1000), 'duration': duration, 'subtitles': subtitles, + # It seems like this would be correctly handled by default + # However, unless someone can confirm this, the old + # behaviour is being kept as-is + '_format_sort_fields': ('res', 'source_preference') } diff --git a/hypervideo_dl/extractor/linuxacademy.py b/hypervideo_dl/extractor/linuxacademy.py index 6aff88e..a570248 100644 --- a/hypervideo_dl/extractor/linuxacademy.py +++ b/hypervideo_dl/extractor/linuxacademy.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json import random @@ -220,7 +218,6 @@ class LinuxAcademyIE(InfoExtractor): formats = self._extract_m3u8_formats( m3u8_url, item_id, 'mp4', 
entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) info = { 'id': item_id, 'formats': formats, diff --git a/hypervideo_dl/extractor/liputan6.py b/hypervideo_dl/extractor/liputan6.py new file mode 100644 index 0000000..c4477b9 --- /dev/null +++ b/hypervideo_dl/extractor/liputan6.py @@ -0,0 +1,64 @@ +from .common import InfoExtractor +from .vidio import VidioIE + + +class Liputan6IE(InfoExtractor): + _VALID_URL = r'https?://www\.liputan6\.com/\w+/read/\d+/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.liputan6.com/news/read/5007510/video-duh-perawat-rs-di-medan-diduga-salah-berikan-obat-ke-pasien', + 'info_dict': { + 'id': '7082548', + 'ext': 'mp4', + 'title': 'Duh, Perawat RS di Medan Diduga Salah Berikan Obat Ke Pasien', + 'thumbnail': 'https://thumbor.prod.vidiocdn.com/lOz5pStm9X-jjlTa_VQQUelOPtw=/640x360/filters:quality(70)/vidio-web-prod-video/uploads/video/image/7082548/duh-perawat-rs-di-medan-diduga-salah-berikan-obat-ke-pasien-ca1125.jpg', + 'channel_id': '185693', + 'uploader': 'Liputan6.com', + 'duration': 104, + 'uploader_url': 'https://www.vidio.com/@liputan6', + 'description': 'md5:3b58ecff10ec3a41d4304cf98228435a', + 'timestamp': 1657159427, + 'uploader_id': 'liputan6', + 'display_id': 'video-duh-perawat-rs-di-medan-diduga-salah-berikan-obat-ke-pasien', + 'like_count': int, + 'view_count': int, + 'comment_count': int, + 'tags': ['perawat indonesia', 'rumah sakit', 'Medan', 'viral hari ini', 'viral', 'enamplus'], + 'channel': 'Default Channel', + 'dislike_count': int, + 'upload_date': '20220707' + } + }, { + 'url': 'https://www.liputan6.com/tv/read/5007719/video-program-minyakita-minyak-goreng-kemasan-sederhana-seharga-rp-14-ribu', + 'info_dict': { + 'id': '7082543', + 'ext': 'mp4', + 'title': 'md5:ecb7b3c598b97798bfd0eb50c6233b8c', + 'channel_id': '604054', + 'dislike_count': int, + 'comment_count': int, + 'timestamp': 1657159211, + 'upload_date': '20220707', + 'tags': ['minyakita', 'minyak goreng', 'liputan 6', 'sctv'], + 'uploader_url': 'https://www.vidio.com/@sctv', + 'display_id': 'video-program-minyakita-minyak-goreng-kemasan-sederhana-seharga-rp-14-ribu', + 'like_count': int, + 'uploader': 'SCTV', + 'description': 'md5:6c374d82589b71fb98b3d550edb6873f', + 'duration': 99, + 'uploader_id': 'sctv', + 'thumbnail': 'https://thumbor.prod.vidiocdn.com/AAIOjz-64hKojjdw5hr0oNNEeJg=/640x360/filters:quality(70)/vidio-web-prod-video/uploads/video/image/7082543/program-minyakita-minyak-goreng-kemasan-sederhana-seharga-rp14-ribu-_-liputan-6-7d9fbb.jpg', + 'channel': 'Liputan 6 Pagi', + 'view_count': int, + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + json_data = self._search_json( + r'window.kmklabs.gtm\s*=', webpage, 'json_data', display_id) + video_id = json_data['videos']['video_1']['video_id'] + + return self.url_result( + f'https://www.vidio.com/watch/{video_id}-{display_id}', ie=VidioIE, video_id=display_id) diff --git a/hypervideo_dl/extractor/listennotes.py b/hypervideo_dl/extractor/listennotes.py new file mode 100644 index 0000000..4ebc9be --- /dev/null +++ b/hypervideo_dl/extractor/listennotes.py @@ -0,0 +1,86 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + extract_attributes, + get_element_by_class, + get_element_html_by_id, + get_element_text_and_html_by_tag, + parse_duration, + strip_or_none, + traverse_obj, + try_call, +) + + +class ListenNotesIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?listennotes\.com/podcasts/[^/]+/[^/]+-(?P<id>.+)/' + _TESTS = [{ + 'url': 'https://www.listennotes.com/podcasts/thriving-on-overload/tim-oreilly-on-noticing-KrDgvNb_u1n/', + 'md5': '5b91a32f841e5788fb82b72a1a8af7f7', + 'info_dict': { + 'id': 'KrDgvNb_u1n', + 'ext': 'mp3', + 'title': 'md5:32236591a921adf17bbdbf0441b6c0e9', + 'description': 'md5:c581ed197eeddcee55a67cdb547c8cbd', + 'duration': 2148.0, + 'channel': 'Thriving on Overload', + 'channel_id': 'ed84wITivxF', + 'episode_id': 'e1312583fa7b4e24acfbb5131050be00', + 'thumbnail': 'https://production.listennotes.com/podcasts/thriving-on-overload-ross-dawson-1wb_KospA3P-ed84wITivxF.300x300.jpg', + 'channel_url': 'https://www.listennotes.com/podcasts/thriving-on-overload-ross-dawson-ed84wITivxF/', + 'cast': ['Tim O’Reilly', 'Cookie Monster', 'Lao Tzu', 'Wallace Steven', 'Eric Raymond', 'Christine Peterson', 'John Maynard Keyne', 'Ross Dawson'], + } + }, { + 'url': 'https://www.listennotes.com/podcasts/ask-noah-show/episode-177-wireguard-with-lwEA3154JzG/', + 'md5': '62fb4ffe7fc525632a1138bf72a5ce53', + 'info_dict': { + 'id': 'lwEA3154JzG', + 'ext': 'mp3', + 'title': 'Episode 177: WireGuard with Jason Donenfeld', + 'description': 'md5:24744f36456a3e95f83c1193a3458594', + 'duration': 3861.0, + 'channel': 'Ask Noah Show', + 'channel_id': '4DQTzdS5-j7', + 'episode_id': '8c8954b95e0b4859ad1eecec8bf6d3a4', + 'channel_url': 'https://www.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-4DQTzdS5-j7/', + 'thumbnail': 'https://production.listennotes.com/podcasts/ask-noah-show-noah-j-chelliah-cfbRUw9Gs3F-4DQTzdS5-j7.300x300.jpg', + 'cast': ['noah showlink', 'noah show', 'noah dashboard', 'jason donenfeld'], + } + }] + + def _clean_description(self, description): + return clean_html(re.sub(r'(</?(div|p)>\s*)+', '<br/><br/>', description or '')) + + def _real_extract(self, url): + audio_id = self._match_id(url) + webpage = self._download_webpage(url, audio_id) + data = self._search_json( + r'<script id="original-content"[^>]+\btype="application/json">', webpage, 'content', audio_id) + data.update(extract_attributes(get_element_html_by_id( + r'episode-play-button-toolbar|episode-no-play-button-toolbar', webpage, escape_value=False))) + + duration, description = self._search_regex( + r'(?P<duration>[\d:]+)\s*-\s*(?P<description>.+)', + self._html_search_meta(['og:description', 'description', 'twitter:description'], webpage), + 'description', fatal=False, group=('duration', 'description')) or (None, None) + + return { + 'id': audio_id, + 'url': data['audio'], + 'title': (data.get('data-title') + or try_call(lambda: get_element_text_and_html_by_tag('h1', webpage)[0]) + or self._html_search_meta(('og:title', 'title', 'twitter:title'), webpage, 'title')), + 'description': (self._clean_description(get_element_by_class('ln-text-p', webpage)) + or strip_or_none(description)), + 'duration': parse_duration(traverse_obj(data, 'audio_length', 'data-duration') or duration), + 'episode_id': traverse_obj(data, 'uuid', 'data-episode-uuid'), + **traverse_obj(data, { + 'thumbnail': 'data-image', + 'channel': 'data-channel-title', + 'cast': ('nlp_entities', ..., 'name'), + 'channel_url': 'channel_url', + 'channel_id': 'channel_short_uuid', + }) + } diff --git a/hypervideo_dl/extractor/litv.py b/hypervideo_dl/extractor/litv.py index 16b475a..31826ac 100644 --- a/hypervideo_dl/extractor/litv.py +++ b/hypervideo_dl/extractor/litv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import 
InfoExtractor diff --git a/hypervideo_dl/extractor/livejournal.py b/hypervideo_dl/extractor/livejournal.py index 3a9f455..96bd8b2 100644 --- a/hypervideo_dl/extractor/livejournal.py +++ b/hypervideo_dl/extractor/livejournal.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import int_or_none diff --git a/hypervideo_dl/extractor/liveleak.py b/hypervideo_dl/extractor/liveleak.py deleted file mode 100644 index 4ac437c..0000000 --- a/hypervideo_dl/extractor/liveleak.py +++ /dev/null @@ -1,191 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class LiveLeakIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?.*?\b[it]=(?P<id>[\w_]+)' - _TESTS = [{ - 'url': 'http://www.liveleak.com/view?i=757_1364311680', - 'md5': '0813c2430bea7a46bf13acf3406992f4', - 'info_dict': { - 'id': '757_1364311680', - 'ext': 'mp4', - 'description': 'extremely bad day for this guy..!', - 'uploader': 'ljfriel2', - 'title': 'Most unlucky car accident', - 'thumbnail': r're:^https?://.*\.jpg$' - } - }, { - 'url': 'http://www.liveleak.com/view?i=f93_1390833151', - 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf', - 'info_dict': { - 'id': 'f93_1390833151', - 'ext': 'mp4', - 'description': 'German Television Channel NDR does an exclusive interview with Edward Snowden.\r\nUploaded on LiveLeak cause German Television thinks the rest of the world isn\'t intereseted in Edward Snowden.', - 'uploader': 'ARD_Stinkt', - 'title': 'German Television does first Edward Snowden Interview (ENGLISH)', - 'thumbnail': r're:^https?://.*\.jpg$' - } - }, { - # Prochan embed - 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', - 'md5': '42c6d97d54f1db107958760788c5f48f', - 'info_dict': { - 'id': '4f7_1392687779', - 'ext': 'mp4', - 'description': "The guy with the cigarette seems amazingly nonchalant about the whole thing... I really hope my friends' reactions would be a bit stronger.\r\n\r\nAction-go to 0:55.", - 'uploader': 'CapObveus', - 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck', - 'age_limit': 18, - }, - 'skip': 'Video is dead', - }, { - # Covers https://github.com/ytdl-org/youtube-dl/pull/5983 - # Multiple resolutions - 'url': 'http://www.liveleak.com/view?i=801_1409392012', - 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b', - 'info_dict': { - 'id': '801_1409392012', - 'ext': 'mp4', - 'description': 'Happened on 27.7.2014. 
\r\nAt 0:53 you can see people still swimming at near beach.', - 'uploader': 'bony333', - 'title': 'Crazy Hungarian tourist films close call waterspout in Croatia', - 'thumbnail': r're:^https?://.*\.jpg$' - } - }, { - # Covers https://github.com/ytdl-org/youtube-dl/pull/10664#issuecomment-247439521 - 'url': 'http://m.liveleak.com/view?i=763_1473349649', - 'add_ie': ['Youtube'], - 'info_dict': { - 'id': '763_1473349649', - 'ext': 'mp4', - 'title': 'Reporters and public officials ignore epidemic of black on asian violence in Sacramento | Colin Flaherty', - 'description': 'Colin being the warrior he is and showing the injustice Asians in Sacramento are being subjected to.', - 'uploader': 'Ziz', - 'upload_date': '20160908', - 'uploader_id': 'UCEbta5E_jqlZmEJsriTEtnw' - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.liveleak.com/view?i=677_1439397581', - 'info_dict': { - 'id': '677_1439397581', - 'title': 'Fuel Depot in China Explosion caught on video', - }, - 'playlist_count': 3, - }, { - 'url': 'https://www.liveleak.com/view?t=HvHi_1523016227', - 'only_matching': True, - }, { - # No original video - 'url': 'https://www.liveleak.com/view?t=C26ZZ_1558612804', - 'only_matching': True, - }] - - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[ift]=[\w_]+[^"]+)"', - webpage) - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip() - video_description = self._og_search_description(webpage) - video_uploader = self._html_search_regex( - r'By:.*?(\w+)</a>', webpage, 'uploader', fatal=False) - age_limit = int_or_none(self._search_regex( - r'you confirm that you are ([0-9]+) years and over.', - webpage, 'age limit', default=None)) - video_thumbnail = self._og_search_thumbnail(webpage) - - entries = self._parse_html5_media_entries(url, webpage, video_id) - if not entries: - # Maybe an embed? 
- embed_url = self._search_regex( - r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', - webpage, 'embed URL') - return { - '_type': 'url_transparent', - 'url': embed_url, - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - } - - for idx, info_dict in enumerate(entries): - formats = [] - for a_format in info_dict['formats']: - if not a_format.get('height'): - a_format['height'] = int_or_none(self._search_regex( - r'([0-9]+)p\.mp4', a_format['url'], 'height label', - default=None)) - formats.append(a_format) - - # Removing '.*.mp4' gives the raw video, which is essentially - # the same video without the LiveLeak logo at the top (see - # https://github.com/ytdl-org/youtube-dl/pull/4768) - orig_url = re.sub(r'\.mp4\.[^.]+', '', a_format['url']) - if a_format['url'] != orig_url: - format_id = a_format.get('format_id') - format_id = 'original' + ('-' + format_id if format_id else '') - if self._is_valid_url(orig_url, video_id, format_id): - formats.append({ - 'format_id': format_id, - 'url': orig_url, - 'preference': 1, - }) - self._sort_formats(formats) - info_dict['formats'] = formats - - # Don't append entry ID for one-video pages to keep backward compatibility - if len(entries) > 1: - info_dict['id'] = '%s_%s' % (video_id, idx + 1) - else: - info_dict['id'] = video_id - - info_dict.update({ - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - 'thumbnail': video_thumbnail, - }) - - return self.playlist_result(entries, video_id, video_title) - - -class LiveLeakEmbedIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[ift])=(?P<id>[\w_]+)' - - # See generic.py for actual test cases - _TESTS = [{ - 'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191', - 'only_matching': True, - }, { - 'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1', - 'only_matching': True, - }] - - def _real_extract(self, url): - kind, video_id = re.match(self._VALID_URL, url).groups() - - if kind == 'f': - webpage = self._download_webpage(url, video_id) - liveleak_url = self._search_regex( - r'(?:logourl\s*:\s*|window\.open\()(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL, - webpage, 'LiveLeak URL', group='url') - else: - liveleak_url = 'http://www.liveleak.com/view?%s=%s' % (kind, video_id) - - return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key()) diff --git a/hypervideo_dl/extractor/livestream.py b/hypervideo_dl/extractor/livestream.py index 45bf26d..d883eaf 100644 --- a/hypervideo_dl/extractor/livestream.py +++ b/hypervideo_dl/extractor/livestream.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re import itertools @@ -25,6 +23,8 @@ from ..utils import ( class LivestreamIE(InfoExtractor): IE_NAME = 'livestream' _VALID_URL = r'https?://(?:new\.)?livestream\.com/(?:accounts/(?P<account_id>\d+)|(?P<account_name>[^/]+))/(?:events/(?P<event_id>\d+)|(?P<event_name>[^/]+))(?:/videos/(?P<id>\d+))?' 
+ _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:new\.)?livestream\.com/[^"]+/player[^"]+)"'] + _TESTS = [{ 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', 'md5': '53274c76ba7754fb0e8d072716f2292b', @@ -126,7 +126,6 @@ class LivestreamIE(InfoExtractor): if f4m_url: formats.extend(self._extract_f4m_formats( f4m_url, video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) comments = [{ 'author_id': comment.get('author_id'), @@ -171,7 +170,6 @@ class LivestreamIE(InfoExtractor): 'url': rtsp_url, 'format_id': 'rtsp', }) - self._sort_formats(formats) return { 'id': broadcast_id, @@ -300,7 +298,6 @@ class LivestreamOriginalIE(InfoExtractor): 'format_id': 'rtsp', }) - self._sort_formats(formats) return formats def _extract_folder(self, url, folder_id): diff --git a/hypervideo_dl/extractor/livestreamfails.py b/hypervideo_dl/extractor/livestreamfails.py new file mode 100644 index 0000000..0df6384 --- /dev/null +++ b/hypervideo_dl/extractor/livestreamfails.py @@ -0,0 +1,37 @@ +from .common import InfoExtractor +from ..utils import format_field, traverse_obj, unified_timestamp + + +class LivestreamfailsIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?livestreamfails\.com/(?:clip|post)/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'https://livestreamfails.com/clip/139200', + 'md5': '8a03aea1a46e94a05af6410337463102', + 'info_dict': { + 'id': '139200', + 'ext': 'mp4', + 'display_id': 'ConcernedLitigiousSalmonPeteZaroll-O8yo9W2L8OZEKhV2', + 'title': 'Streamer jumps off a trampoline at full speed', + 'creator': 'paradeev1ch', + 'thumbnail': r're:^https?://.+', + 'timestamp': 1656271785, + 'upload_date': '20220626', + } + }, { + 'url': 'https://livestreamfails.com/post/139200', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + api_response = self._download_json(f'https://api.livestreamfails.com/clip/{video_id}', video_id) + + return { + 'id': video_id, + 'display_id': api_response.get('sourceId'), + 'timestamp': unified_timestamp(api_response.get('createdAt')), + 'url': f'https://livestreamfails-video-prod.b-cdn.net/video/{api_response["videoId"]}', + 'title': api_response.get('label'), + 'creator': traverse_obj(api_response, ('streamer', 'label')), + 'thumbnail': format_field(api_response, 'imageId', 'https://livestreamfails-image-prod.b-cdn.net/image/%s') + } diff --git a/hypervideo_dl/extractor/lnkgo.py b/hypervideo_dl/extractor/lnkgo.py index bd2dffa..6282d2e 100644 --- a/hypervideo_dl/extractor/lnkgo.py +++ b/hypervideo_dl/extractor/lnkgo.py @@ -1,11 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( clean_html, - compat_str, format_field, int_or_none, parse_iso8601, @@ -71,7 +67,6 @@ class LnkGoIE(InfoExtractor): formats = self._extract_m3u8_formats( self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''), video_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) return { 'id': video_id, @@ -153,7 +148,6 @@ class LnkIE(InfoExtractor): formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats) return { 'id': id, 'title': video_json.get('title'), diff --git a/hypervideo_dl/extractor/localnews8.py b/hypervideo_dl/extractor/localnews8.py index c3e9d10..6f3f02c 100644 --- a/hypervideo_dl/extractor/localnews8.py +++ b/hypervideo_dl/extractor/localnews8.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import 
unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/lovehomeporn.py b/hypervideo_dl/extractor/lovehomeporn.py index ca4b5f3..ba5a13a 100644 --- a/hypervideo_dl/extractor/lovehomeporn.py +++ b/hypervideo_dl/extractor/lovehomeporn.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .nuevo import NuevoBaseIE diff --git a/hypervideo_dl/extractor/lrt.py b/hypervideo_dl/extractor/lrt.py index 4024aef..80d4d1c 100644 --- a/hypervideo_dl/extractor/lrt.py +++ b/hypervideo_dl/extractor/lrt.py @@ -1,21 +1,58 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( clean_html, merge_dicts, + traverse_obj, + url_or_none, ) -class LRTIE(InfoExtractor): - IE_NAME = 'lrt.lt' +class LRTBaseIE(InfoExtractor): + def _extract_js_var(self, webpage, var_name, default=None): + return self._search_regex( + fr'{var_name}\s*=\s*(["\'])((?:(?!\1).)+)\1', + webpage, var_name.replace('_', ' '), default, group=2) + + +class LRTStreamIE(LRTBaseIE): + _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/tiesiogiai/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.lrt.lt/mediateka/tiesiogiai/lrt-opus', + 'info_dict': { + 'id': 'lrt-opus', + 'live_status': 'is_live', + 'title': 're:^LRT Opus.+$', + 'ext': 'mp4' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + streams_data = self._download_json(self._extract_js_var(webpage, 'tokenURL'), video_id) + + formats, subtitles = [], {} + for stream_url in traverse_obj(streams_data, ( + 'response', 'data', lambda k, _: k.startswith('content')), expected_type=url_or_none): + fmts, subs = self._extract_m3u8_formats_and_subtitles(stream_url, video_id, 'mp4', m3u8_id='hls', live=True) + formats.extend(fmts) + subtitles = self._merge_subtitles(subtitles, subs) + + stream_title = self._extract_js_var(webpage, 'video_title', 'LRT') + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'is_live': True, + 'title': f'{self._og_search_title(webpage)} - {stream_title}' + } + + +class LRTVODIE(LRTBaseIE): _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))' _TESTS = [{ # m3u8 download 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene', - 'md5': '85cb2bb530f31d91a9c65b479516ade4', 'info_dict': { 'id': '2000127261', 'ext': 'mp4', @@ -24,6 +61,8 @@ class LRTIE(InfoExtractor): 'duration': 3035, 'timestamp': 1604079000, 'upload_date': '20201030', + 'tags': ['LRT TELEVIZIJA', 'Beatos virtuvė', 'Beata Nicholson', 'Makaronai', 'Baklažanai', 'Vakarienė', 'Receptas'], + 'thumbnail': 'https://www.lrt.lt/img/2020/10/30/764041-126478-1287x836.jpg' }, }, { # direct mp3 download @@ -40,11 +79,6 @@ class LRTIE(InfoExtractor): }, }] - def _extract_js_var(self, webpage, var_name, default): - return self._search_regex( - r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name, - webpage, var_name.replace('_', ' '), default, group=2) - def _real_extract(self, url): path, video_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, video_id) diff --git a/hypervideo_dl/extractor/lynda.py b/hypervideo_dl/extractor/lynda.py index ce30474..768ce91 100644 --- a/hypervideo_dl/extractor/lynda.py +++ b/hypervideo_dl/extractor/lynda.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -159,7 +157,6 @@ class 
LyndaIE(LyndaBaseIE): 'format_id': '%s-%s' % (cdn, format_id) if cdn else format_id, 'height': int_or_none(format_id), }) - self._sort_formats(formats) conviva = self._download_json( 'https://www.lynda.com/ajax/player/conviva', video_id, @@ -209,7 +206,6 @@ class LyndaIE(LyndaBaseIE): } for format_id, video_url in prioritized_stream.items()]) self._check_formats(formats, video_id) - self._sort_formats(formats) subtitles = self.extract_subtitles(video_id) diff --git a/hypervideo_dl/extractor/m6.py b/hypervideo_dl/extractor/m6.py index 9806875..9dcc601 100644 --- a/hypervideo_dl/extractor/m6.py +++ b/hypervideo_dl/extractor/m6.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/magentamusik360.py b/hypervideo_dl/extractor/magentamusik360.py index 5c27490..5d0cb3b 100644 --- a/hypervideo_dl/extractor/magentamusik360.py +++ b/hypervideo_dl/extractor/magentamusik360.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/mailru.py b/hypervideo_dl/extractor/mailru.py index 5d9f80b..387d211 100644 --- a/hypervideo_dl/extractor/mailru.py +++ b/hypervideo_dl/extractor/mailru.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import json import re @@ -163,7 +160,6 @@ class MailRuIE(InfoExtractor): 'height': height, 'http_headers': headers, }) - self._sort_formats(formats) meta_data = video_data['meta'] title = remove_end(meta_data['title'], '.mp4') diff --git a/hypervideo_dl/extractor/mainstreaming.py b/hypervideo_dl/extractor/mainstreaming.py index 0f349a7..fe5589d 100644 --- a/hypervideo_dl/extractor/mainstreaming.py +++ b/hypervideo_dl/extractor/mainstreaming.py @@ -1,4 +1,3 @@ -# coding: utf-8 import re from .common import InfoExtractor @@ -15,6 +14,7 @@ from ..utils import ( class MainStreamingIE(InfoExtractor): _VALID_URL = r'https?://(?:webtools-?)?(?P<host>[A-Za-z0-9-]*\.msvdn.net)/(?:embed|amp_embed|content)/(?P<id>\w+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=["\']?(?P<url>{_VALID_URL})["\']?'] IE_DESC = 'MainStreaming Player' _TESTS = [ @@ -103,13 +103,6 @@ class MainStreamingIE(InfoExtractor): } ] - @staticmethod - def _extract_urls(webpage): - mobj = re.findall( - r'<iframe[^>]+?src=["\']?(?P<url>%s)["\']?' 
% MainStreamingIE._VALID_URL, webpage) - if mobj: - return [group[0] for group in mobj] - def _playlist_entries(self, host, playlist_content): for entry in playlist_content: content_id = entry.get('contentID') @@ -204,8 +197,6 @@ class MainStreamingIE(InfoExtractor): subtitles = self._merge_subtitles(m3u8_subs, mpd_subs) formats.extend(m3u8_formats + mpd_formats) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/malltv.py b/hypervideo_dl/extractor/malltv.py index fadfd93..e1031d8 100644 --- a/hypervideo_dl/extractor/malltv.py +++ b/hypervideo_dl/extractor/malltv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, @@ -17,7 +14,7 @@ class MallTVIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|sk)\.)?mall\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.mall.tv/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', - 'md5': '1c4a37f080e1f3023103a7b43458e518', + 'md5': 'cd69ce29176f6533b65bff69ed9a5f2a', 'info_dict': { 'id': 't0zzt0', 'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', @@ -28,6 +25,11 @@ class MallTVIE(InfoExtractor): 'timestamp': 1538870400, 'upload_date': '20181007', 'view_count': int, + 'comment_count': int, + 'thumbnail': 'https://cdn.vpplayer.tech/agmipnzv/encode/vjsnigfq/thumbnails/retina.jpg', + 'average_rating': 9.060869565217391, + 'dislike_count': int, + 'like_count': int, } }, { 'url': 'https://www.mall.tv/kdo-to-plati/18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice', @@ -35,6 +37,24 @@ class MallTVIE(InfoExtractor): }, { 'url': 'https://sk.mall.tv/gejmhaus/reklamacia-nehreje-vyrobnik-tepla-alebo-spekacka', 'only_matching': True, + }, { + 'url': 'https://www.mall.tv/zivoty-slavnych/nadeje-vychodu-i-zapadu-jak-michail-gorbacov-zmenil-politickou-mapu-sveta-a-ziskal-za-to-nobelovu-cenu-miru', + 'info_dict': { + 'id': 'yx010y', + 'ext': 'mp4', + 'dislike_count': int, + 'description': 'md5:aee02bee5a8d072c6a8207b91d1905a9', + 'thumbnail': 'https://cdn.vpplayer.tech/agmipnzv/encode/vjsnjdeu/thumbnails/retina.jpg', + 'comment_count': int, + 'display_id': 'md5:0ec2afa94d2e2b7091c019cef2a43a9b', + 'like_count': int, + 'duration': 752, + 'timestamp': 1646956800, + 'title': 'md5:fe79385daaf16d74c12c1ec4a26687af', + 'view_count': int, + 'upload_date': '20220311', + 'average_rating': 9.685714285714285, + } }] def _real_extract(self, url): @@ -46,13 +66,12 @@ class MallTVIE(InfoExtractor): video = self._parse_json(self._search_regex( r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);', webpage, 'video object'), display_id) - video_source = video['VideoSource'] + video_id = self._search_regex( - r'/([\da-z]+)/index\b', video_source, 'video id') + r'<input\s*id\s*=\s*player-id-name\s*[^>]+value\s*=\s*(\w+)', webpage, 'video id') formats = self._extract_m3u8_formats( - video_source + '.m3u8', video_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) + video['VideoSource'], video_id, 'mp4', 'm3u8_native') subtitles = {} for s in (video.get('Subtitles') or {}): @@ -72,7 +91,7 @@ class MallTVIE(InfoExtractor): info = self._search_json_ld(webpage, video_id, default={}) return merge_dicts({ - 'id': video_id, + 'id': str(video_id), 'display_id': display_id, 'title': video.get('Title'), 'description': clean_html(video.get('Description')), diff --git a/hypervideo_dl/extractor/mangomolo.py 
b/hypervideo_dl/extractor/mangomolo.py index 68ce138..efaf66f 100644 --- a/hypervideo_dl/extractor/mangomolo.py +++ b/hypervideo_dl/extractor/mangomolo.py @@ -1,16 +1,31 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_b64decode, compat_urllib_parse_unquote, ) -from ..utils import int_or_none +from ..utils import classproperty, int_or_none class MangomoloBaseIE(InfoExtractor): - _BASE_REGEX = r'https?://(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)' + _BASE_REGEX = r'(?:https?:)?//(?:admin\.mangomolo\.com/analytics/index\.php/customers/embed/|player\.mangomolo\.com/v1/)' + _SLUG = None + + @classproperty + def _VALID_URL(cls): + return f'{cls._BASE_REGEX}{cls._SLUG}' + + @classproperty + def _EMBED_REGEX(cls): + return [rf'<iframe[^>]+src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1'] + + def _extract_from_webpage(self, url, webpage): + for res in super()._extract_from_webpage(url, webpage): + yield { + **res, + '_type': 'url_transparent', + 'id': self._search_regex(self._SLUG, res['url'], 'id', group='id'), + 'uploader': self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader'), + } def _get_real_id(self, page_id): return page_id @@ -29,7 +44,6 @@ class MangomoloBaseIE(InfoExtractor): ], webpage, 'format url') formats = self._extract_wowza_formats( format_url, page_id, m3u8_entry_protocol, ['smil']) - self._sort_formats(formats) return { 'id': page_id, @@ -44,14 +58,15 @@ class MangomoloBaseIE(InfoExtractor): class MangomoloVideoIE(MangomoloBaseIE): _TYPE = 'video' IE_NAME = 'mangomolo:' + _TYPE - _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'video\?.*?\bid=(?P<id>\d+)' + _SLUG = r'video\?.*?\bid=(?P<id>\d+)' + _IS_LIVE = False class MangomoloLiveIE(MangomoloBaseIE): _TYPE = 'live' IE_NAME = 'mangomolo:' + _TYPE - _VALID_URL = MangomoloBaseIE._BASE_REGEX + r'(live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)' + _SLUG = r'(?:live|index)\?.*?\bchannelid=(?P<id>(?:[A-Za-z0-9+/=]|%2B|%2F|%3D)+)' _IS_LIVE = True def _get_real_id(self, page_id): diff --git a/hypervideo_dl/extractor/manoto.py b/hypervideo_dl/extractor/manoto.py index d12aa5f..2792e6e 100644 --- a/hypervideo_dl/extractor/manoto.py +++ b/hypervideo_dl/extractor/manoto.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, @@ -57,7 +54,6 @@ class ManotoTVIE(InfoExtractor): episode_json = self._download_json(_API_URL.format('showmodule', 'episodedetails', video_id), video_id) details = episode_json.get('details', {}) formats = self._extract_m3u8_formats(details.get('videoM3u8Url'), video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, 'series': details.get('showTitle'), @@ -129,7 +125,6 @@ class ManotoTVLiveIE(InfoExtractor): details = json.get('details', {}) video_url = details.get('liveUrl') formats = self._extract_m3u8_formats(video_url, video_id, 'mp4', live=True) - self._sort_formats(formats) return { 'id': video_id, 'title': 'Manoto TV Live', diff --git a/hypervideo_dl/extractor/manyvids.py b/hypervideo_dl/extractor/manyvids.py index bd24f88..7417453 100644 --- a/hypervideo_dl/extractor/manyvids.py +++ b/hypervideo_dl/extractor/manyvids.py @@ -1,11 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals +import re from .common import InfoExtractor from ..utils import ( determine_ext, + extract_attributes, int_or_none, str_to_int, + url_or_none, urlencode_postdata, 
) @@ -20,17 +21,20 @@ class ManyVidsIE(InfoExtractor): 'id': '133957', 'ext': 'mp4', 'title': 'everthing about me (Preview)', + 'uploader': 'ellyxxix', 'view_count': int, 'like_count': int, }, }, { # full video 'url': 'https://www.manyvids.com/Video/935718/MY-FACE-REVEAL/', - 'md5': 'f3e8f7086409e9b470e2643edb96bdcc', + 'md5': 'bb47bab0e0802c2a60c24ef079dfe60f', 'info_dict': { 'id': '935718', 'ext': 'mp4', 'title': 'MY FACE REVEAL', + 'description': 'md5:ec5901d41808b3746fed90face161612', + 'uploader': 'Sarah Calanthe', 'view_count': int, 'like_count': int, }, @@ -39,17 +43,50 @@ class ManyVidsIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + real_url = 'https://www.manyvids.com/video/%s/gtm.js' % (video_id, ) + try: + webpage = self._download_webpage(real_url, video_id) + except Exception: + # probably useless fallback + webpage = self._download_webpage(url, video_id) - video_url = self._search_regex( - r'data-(?:video-filepath|meta-video)\s*=s*(["\'])(?P<url>(?:(?!\1).)+)\1', - webpage, 'video URL', group='url') + info = self._search_regex( + r'''(<div\b[^>]*\bid\s*=\s*(['"])pageMetaDetails\2[^>]*>)''', + webpage, 'meta details', default='') + info = extract_attributes(info) - title = self._html_search_regex( - (r'<span[^>]+class=["\']item-title[^>]+>([^<]+)', - r'<h2[^>]+class=["\']h2 m-0["\'][^>]*>([^<]+)'), - webpage, 'title', default=None) or self._html_search_meta( - 'twitter:title', webpage, 'title', fatal=True) + player = self._search_regex( + r'''(<div\b[^>]*\bid\s*=\s*(['"])rmpPlayerStream\2[^>]*>)''', + webpage, 'player details', default='') + player = extract_attributes(player) + + video_urls_and_ids = ( + (info.get('data-meta-video'), 'video'), + (player.get('data-video-transcoded'), 'transcoded'), + (player.get('data-video-filepath'), 'filepath'), + (self._og_search_video_url(webpage, secure=False, default=None), 'og_video'), + ) + + def txt_or_none(s, default=None): + return (s.strip() or default) if isinstance(s, str) else default + + uploader = txt_or_none(info.get('data-meta-author')) + + def mung_title(s): + if uploader: + s = re.sub(r'^\s*%s\s+[|-]' % (re.escape(uploader), ), '', s) + return txt_or_none(s) + + title = ( + mung_title(info.get('data-meta-title')) + or self._html_search_regex( + (r'<span[^>]+class=["\']item-title[^>]+>([^<]+)', + r'<h2[^>]+class=["\']h2 m-0["\'][^>]*>([^<]+)'), + webpage, 'title', default=None) + or self._html_search_meta( + 'twitter:title', webpage, 'title', fatal=True)) + + title = re.sub(r'\s*[|-]\s+ManyVids\s*$', '', title) or title if any(p in webpage for p in ('preview_videos', '_preview.mp4')): title += ' (Preview)' @@ -62,7 +99,8 @@ class ManyVidsIE(InfoExtractor): # Sets some cookies self._download_webpage( 'https://www.manyvids.com/includes/ajax_repository/you_had_me_at_hello.php', - video_id, fatal=False, data=urlencode_postdata({ + video_id, note='Setting format cookies', fatal=False, + data=urlencode_postdata({ 'mvtoken': mv_token, 'vid': video_id, }), headers={ @@ -70,24 +108,54 @@ class ManyVidsIE(InfoExtractor): 'X-Requested-With': 'XMLHttpRequest' }) - if determine_ext(video_url) == 'm3u8': - formats = self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - else: - formats = [{'url': video_url}] + formats = [] + for v_url, fmt in video_urls_and_ids: + v_url = url_or_none(v_url) + if not v_url: + continue + if determine_ext(v_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + 
v_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls')) + else: + formats.append({ + 'url': v_url, + 'format_id': fmt, + }) + + self._remove_duplicate_formats(formats) + + for f in formats: + if f.get('height') is None: + f['height'] = int_or_none( + self._search_regex(r'_(\d{2,3}[02468])_', f['url'], 'video height', default=None)) + if '/preview/' in f['url']: + f['format_id'] = '_'.join(filter(None, (f.get('format_id'), 'preview'))) + f['preference'] = -10 + if 'transcoded' in f['format_id']: + f['preference'] = f.get('preference', -1) - 1 + + def get_likes(): + likes = self._search_regex( + r'''(<a\b[^>]*\bdata-id\s*=\s*(['"])%s\2[^>]*>)''' % (video_id, ), + webpage, 'likes', default='') + likes = extract_attributes(likes) + return int_or_none(likes.get('data-likes')) - like_count = int_or_none(self._search_regex( - r'data-likes=["\'](\d+)', webpage, 'like count', default=None)) - view_count = str_to_int(self._html_search_regex( - r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage, - 'view count', default=None)) + def get_views(): + return str_to_int(self._html_search_regex( + r'''(?s)<span\b[^>]*\bclass\s*=["']views-wrapper\b[^>]+>.+?<span\b[^>]+>\s*(\d[\d,.]*)\s*</span>''', + webpage, 'view count', default=None)) return { 'id': video_id, 'title': title, - 'view_count': view_count, - 'like_count': like_count, 'formats': formats, - 'uploader': self._html_search_regex(r'<meta[^>]+name="author"[^>]*>([^<]+)', webpage, 'uploader'), + 'description': txt_or_none(info.get('data-meta-description')), + 'uploader': txt_or_none(info.get('data-meta-author')), + 'thumbnail': ( + url_or_none(info.get('data-meta-image')) + or url_or_none(player.get('data-video-screenshot'))), + 'view_count': get_views(), + 'like_count': get_likes(), } diff --git a/hypervideo_dl/extractor/maoritv.py b/hypervideo_dl/extractor/maoritv.py index 0d23fec..67780ea 100644 --- a/hypervideo_dl/extractor/maoritv.py +++ b/hypervideo_dl/extractor/maoritv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/markiza.py b/hypervideo_dl/extractor/markiza.py index def960a..53ed791 100644 --- a/hypervideo_dl/extractor/markiza.py +++ b/hypervideo_dl/extractor/markiza.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/massengeschmacktv.py b/hypervideo_dl/extractor/massengeschmacktv.py index b381d31..7dacb43 100644 --- a/hypervideo_dl/extractor/massengeschmacktv.py +++ b/hypervideo_dl/extractor/massengeschmacktv.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -67,8 +65,6 @@ class MassengeschmackTVIE(InfoExtractor): 'vcodec': 'none' if format_id.startswith('Audio') else None, }) - self._sort_formats(formats) - return { 'id': episode, 'title': title, diff --git a/hypervideo_dl/extractor/masters.py b/hypervideo_dl/extractor/masters.py new file mode 100644 index 0000000..716f1c9 --- /dev/null +++ b/hypervideo_dl/extractor/masters.py @@ -0,0 +1,37 @@ +from .common import InfoExtractor +from ..utils import ( + traverse_obj, + unified_strdate, +) + + +class MastersIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?masters\.com/en_US/watch/(?P<date>\d{4}-\d{2}-\d{2})/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.masters.com/en_US/watch/2022-04-07/16493755593805191/sungjae_im_thursday_interview_2022.html',
'info_dict': { + 'id': '16493755593805191', + 'ext': 'mp4', + 'title': 'Sungjae Im: Thursday Interview 2022', + 'upload_date': '20220407', + 'thumbnail': r're:^https?://.*\.jpg$', + } + }] + + def _real_extract(self, url): + video_id, upload_date = self._match_valid_url(url).group('id', 'date') + content_resp = self._download_json( + f'https://www.masters.com/relatedcontent/rest/v2/masters_v1/en/content/masters_v1_{video_id}_en', + video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(traverse_obj(content_resp, ('media', 'm3u8')), video_id, 'mp4') + + thumbnails = [{'id': name, 'url': url} for name, url in traverse_obj(content_resp, ('images', 0), default={}).items()] + + return { + 'id': video_id, + 'title': content_resp.get('title'), + 'formats': formats, + 'subtitles': subtitles, + 'upload_date': unified_strdate(upload_date), + 'thumbnails': thumbnails, + } diff --git a/hypervideo_dl/extractor/matchtv.py b/hypervideo_dl/extractor/matchtv.py index e003b8d..a67fa9f 100644 --- a/hypervideo_dl/extractor/matchtv.py +++ b/hypervideo_dl/extractor/matchtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random from .common import InfoExtractor @@ -46,7 +43,6 @@ class MatchTVIE(InfoExtractor): })['data']['videoUrl'] f4m_url = xpath_text(self._download_xml(video_url, video_id), './to') formats = self._extract_f4m_formats(f4m_url, video_id) - self._sort_formats(formats) return { 'id': video_id, 'title': 'Матч ТВ - Прямой эфир', diff --git a/hypervideo_dl/extractor/mdr.py b/hypervideo_dl/extractor/mdr.py index 3ca174c..49f5b49 100644 --- a/hypervideo_dl/extractor/mdr.py +++ b/hypervideo_dl/extractor/mdr.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( @@ -165,8 +162,6 @@ class MDRIE(InfoExtractor): formats.append(f) - self._sort_formats(formats) - description = xpath_text(doc, './broadcast/broadcastDescription', 'description') timestamp = parse_iso8601( xpath_text( diff --git a/hypervideo_dl/extractor/medaltv.py b/hypervideo_dl/extractor/medaltv.py index 59cc307..82be823 100644 --- a/hypervideo_dl/extractor/medaltv.py +++ b/hypervideo_dl/extractor/medaltv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -11,15 +8,33 @@ from ..utils import ( float_or_none, int_or_none, str_or_none, - try_get, + traverse_obj, ) class MedalTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?medal\.tv/(?P<path>games/[^/?#&]+/clips)/(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'https://medal.tv/clips/2mA60jWAGQCBH', - 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa', + 'url': 'https://medal.tv/games/valorant/clips/jTBFnLKdLy15K', + 'md5': '6930f8972914b6b9fdc2bb3918098ba0', + 'info_dict': { + 'id': 'jTBFnLKdLy15K', + 'ext': 'mp4', + 'title': "Mornu's clutch", + 'description': '', + 'uploader': 'Aciel', + 'timestamp': 1651628243, + 'upload_date': '20220504', + 'uploader_id': '19335460', + 'uploader_url': 'https://medal.tv/users/19335460', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'duration': 13, + } + }, { + 'url': 'https://medal.tv/games/cod%20cold%20war/clips/2mA60jWAGQCBH', + 'md5': '3d19d426fe0b2d91c26e412684e66a06', 'info_dict': { 'id': '2mA60jWAGQCBH', 'ext': 'mp4', @@ -29,9 +44,15 @@ class MedalTVIE(InfoExtractor): 'timestamp': 1603165266, 'upload_date': '20201020', 
'uploader_id': '10619174', + 'thumbnail': 'https://cdn.medal.tv/10619174/thumbnail-34934644-720p.jpg?t=1080p&c=202042&missing', + 'uploader_url': 'https://medal.tv/users/10619174', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'duration': 23, } }, { - 'url': 'https://medal.tv/clips/2um24TWdty0NA', + 'url': 'https://medal.tv/games/cod%20cold%20war/clips/2um24TWdty0NA', 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148', 'info_dict': { 'id': '2um24TWdty0NA', @@ -42,25 +63,42 @@ class MedalTVIE(InfoExtractor): 'timestamp': 1605580939, 'upload_date': '20201117', 'uploader_id': '5156321', + 'thumbnail': 'https://cdn.medal.tv/5156321/thumbnail-36787208-360p.jpg?t=1080p&c=202046&missing', + 'uploader_url': 'https://medal.tv/users/5156321', + 'comment_count': int, + 'view_count': int, + 'like_count': int, + 'duration': 9, } }, { - 'url': 'https://medal.tv/clips/37rMeFpryCC-9', + 'url': 'https://medal.tv/games/valorant/clips/37rMeFpryCC-9', 'only_matching': True, }, { - 'url': 'https://medal.tv/clips/2WRj40tpY_EU9', + 'url': 'https://medal.tv/games/valorant/clips/2WRj40tpY_EU9', 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + path = self._match_valid_url(url).group('path') + webpage = self._download_webpage(url, video_id) - hydration_data = self._parse_json(self._search_regex( - r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>', - webpage, 'hydration data', default='{}'), video_id) + next_data = self._search_json( + '<script[^>]*__NEXT_DATA__[^>]*>', webpage, + 'next data', video_id, end_pattern='</script>', fatal=False) - clip = try_get( - hydration_data, lambda x: x['clips'][video_id], dict) or {} + build_id = next_data.get('buildId') + if not build_id: + raise ExtractorError( + 'Could not find build ID.', video_id=video_id) + + locale = next_data.get('locale', 'en') + + api_response = self._download_json( + f'https://medal.tv/_next/data/{build_id}/{locale}/{path}/{video_id}.json', video_id) + + clip = traverse_obj(api_response, ('pageProps', 'clip')) or {} if not clip: raise ExtractorError( 'Could not find video information.', video_id=video_id) @@ -112,14 +150,11 @@ class MedalTVIE(InfoExtractor): 'An unknown error occurred ({0}).'.format(error), video_id=video_id) - self._sort_formats(formats) - # Necessary because the id of the author is not known in advance. # Won't raise an issue if no profile can be found as this is optional. 
- author = try_get( - hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {} - author_id = str_or_none(author.get('id')) - author_url = format_field(author_id, template='https://medal.tv/users/%s') + author = traverse_obj(api_response, ('pageProps', 'profile')) or {} + author_id = str_or_none(author.get('userId')) + author_url = format_field(author_id, None, 'https://medal.tv/users/%s') return { 'id': video_id, diff --git a/hypervideo_dl/extractor/mediaite.py b/hypervideo_dl/extractor/mediaite.py index b670f0d..0f9079b 100644 --- a/hypervideo_dl/extractor/mediaite.py +++ b/hypervideo_dl/extractor/mediaite.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/mediaklikk.py b/hypervideo_dl/extractor/mediaklikk.py index 18ff3be..4636508 100644 --- a/hypervideo_dl/extractor/mediaklikk.py +++ b/hypervideo_dl/extractor/mediaklikk.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from ..utils import ( unified_strdate ) @@ -92,7 +89,6 @@ class MediaKlikkIE(InfoExtractor): formats = self._extract_wowza_formats( playlist_url, video_id, skip_protocols=['f4m', 'smil', 'dash']) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/medialaan.py b/hypervideo_dl/extractor/medialaan.py index 788acf7..bce20dc 100644 --- a/hypervideo_dl/extractor/medialaan.py +++ b/hypervideo_dl/extractor/medialaan.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -71,8 +69,8 @@ class MedialaanIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): entries = [] for element in re.findall(r'(<div[^>]+data-mychannels-type="video"[^>]*>)', webpage): mychannels_id = extract_attributes(element).get('data-mychannels-id') @@ -102,7 +100,6 @@ class MedialaanIE(InfoExtractor): 'ext': ext, 'url': src, }) - self._sort_formats(formats) return { 'id': production_id, diff --git a/hypervideo_dl/extractor/mediaset.py b/hypervideo_dl/extractor/mediaset.py index d6b456c..61bdb2a 100644 --- a/hypervideo_dl/extractor/mediaset.py +++ b/hypervideo_dl/extractor/mediaset.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import re @@ -23,10 +20,10 @@ class MediasetIE(ThePlatformBaseIE): (?: mediaset:| https?:// - (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ + (?:\w+\.)+mediaset\.it/ (?: (?:video|on-demand|movie)/(?:[^/]+/)+[^/]+_| - player/index\.html\?.*?\bprogramGuid= + player/(?:v\d+/)?index\.html\?.*?\bprogramGuid= ) )(?P<id>[0-9A-Z]{16,}) ''' @@ -145,6 +142,10 @@ class MediasetIE(ThePlatformBaseIE): 'url': 'https://static3.mediasetplay.mediaset.it/player/index.html?appKey=5ad3966b1de1c4000d5cec48&programGuid=FAFU000000665104&id=665104', 'only_matching': True, }, { + # embedUrl (from https://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/) + 'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323&autoplay=true&purl=http://www.wittytv.it/amici/est-ce-que-tu-maimes-gabriele-5-dicembre-copia/', + 'only_matching': True, + }, { 'url': 'mediaset:FAFU000000665924', 'only_matching': True, }, { @@ -162,36 +163,36 @@ class MediasetIE(ThePlatformBaseIE): }, { 'url': 'https://www.mediasetplay.mediaset.it/movie/herculeslaleggendahainizio/hercules-la-leggenda-ha-inizio_F305927501000102', 'only_matching': 
True, + }, { + 'url': 'https://mediasetinfinity.mediaset.it/video/braveandbeautiful/episodio-113_F310948005000402', + 'only_matching': True, + }, { + 'url': 'https://static3.mediasetplay.mediaset.it/player/v2/index.html?partnerId=wittytv&configId=&programGuid=FD00000000153323', + 'only_matching': True, }] - @staticmethod - def _extract_urls(ie, webpage): - def _qs(url): - return parse_qs(url) - + def _extract_from_webpage(self, url, webpage): def _program_guid(qs): return qs.get('programGuid', [None])[0] - entries = [] for mobj in re.finditer( r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml.*?)\1', webpage): embed_url = mobj.group('url') - embed_qs = _qs(embed_url) + embed_qs = parse_qs(embed_url) program_guid = _program_guid(embed_qs) if program_guid: - entries.append(embed_url) + yield self.url_result(embed_url) continue + video_id = embed_qs.get('id', [None])[0] if not video_id: continue - urlh = ie._request_webpage( - embed_url, video_id, note='Following embed URL redirect') + urlh = self._request_webpage(embed_url, video_id, note='Following embed URL redirect') embed_url = urlh.geturl() - program_guid = _program_guid(_qs(embed_url)) + program_guid = _program_guid(parse_qs(embed_url)) if program_guid: - entries.append(embed_url) - return entries + yield self.url_result(embed_url) def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): for video in smil.findall(self._xpath_ns('.//video', namespace)): @@ -246,8 +247,6 @@ class MediasetIE(ThePlatformBaseIE): if (first_e or geo_e) and not formats: raise geo_e or first_e - self._sort_formats(formats) - feed_data = self._download_json( 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2/guid/-/' + guid, guid, fatal=False) @@ -285,11 +284,11 @@ class MediasetIE(ThePlatformBaseIE): return info -class MediasetShowIE(MediasetIE): +class MediasetShowIE(MediasetIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'''(?x) (?: https?:// - (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/ + (\w+\.)+mediaset\.it/ (?: (?:fiction|programmi-tv|serie-tv|kids)/(?:.+?/)? 
(?:[a-z-]+)_SE(?P<id>\d{12}) diff --git a/hypervideo_dl/extractor/mediasite.py b/hypervideo_dl/extractor/mediasite.py index fbf9223..fe549c4 100644 --- a/hypervideo_dl/extractor/mediasite.py +++ b/hypervideo_dl/extractor/mediasite.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import json @@ -16,7 +13,7 @@ from ..utils import ( str_or_none, try_call, try_get, - unescapeHTML, + smuggle_url, unsmuggle_url, url_or_none, urljoin, @@ -28,6 +25,7 @@ _ID_RE = r'(?:[0-9a-f]{32,34}|[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0 class MediasiteIE(InfoExtractor): _VALID_URL = r'(?xi)https?://[^/]+/Mediasite/(?:Play|Showcase/[^/#?]+/Presentation)/(?P<id>%s)(?P<query>\?[^#]+|)' % _ID_RE + _EMBED_REGEX = [r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE] _TESTS = [ { 'url': 'https://hitsmediaweb.h-its.org/mediasite/Play/2db6c271681e4f199af3c60d1f82869b1d', @@ -115,13 +113,10 @@ class MediasiteIE(InfoExtractor): 5: 'video3', } - @staticmethod - def _extract_urls(webpage): - return [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer( - r'(?xi)<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:(?:https?:)?//[^/]+)?/Mediasite/Play/%s(?:\?.*?)?)\1' % _ID_RE, - webpage)] + @classmethod + def _extract_embed_urls(cls, url, webpage): + for embed_url in super()._extract_embed_urls(url, webpage): + yield smuggle_url(embed_url, {'UrlReferrer': url}) def __extract_slides(self, *, stream_id, snum, Stream, duration, images): slide_base_url = Stream['SlideBaseUrl'] @@ -269,8 +264,6 @@ class MediasiteIE(InfoExtractor): }) formats.extend(stream_formats) - self._sort_formats(formats) - # XXX: Presentation['Presenters'] # XXX: Presentation['Transcript'] diff --git a/hypervideo_dl/extractor/mediaworksnz.py b/hypervideo_dl/extractor/mediaworksnz.py new file mode 100644 index 0000000..62e37d2 --- /dev/null +++ b/hypervideo_dl/extractor/mediaworksnz.py @@ -0,0 +1,103 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + bug_reports_message, + float_or_none, + traverse_obj, + unified_timestamp, +) + + +class MediaWorksNZVODIE(InfoExtractor): + _VALID_URL_BASE_RE = r'https?://vodupload-api\.mediaworks\.nz/library/asset/published/' + _VALID_URL_ID_RE = r'(?P<id>[A-Za-z0-9-]+)' + _VALID_URL = rf'{_VALID_URL_BASE_RE}{_VALID_URL_ID_RE}' + _TESTS = [{ + 'url': 'https://vodupload-api.mediaworks.nz/library/asset/published/VID00359', + 'info_dict': { + 'id': 'VID00359', + 'ext': 'mp4', + 'title': 'GRG Jacinda Ardern safe drug testing 1920x1080', + 'description': 'md5:d4d7dc366742e86d8130b257dcb520ba', + 'duration': 142.76, + 'timestamp': 1604268608, + 'upload_date': '20201101', + 'thumbnail': r're:^https?://.*\.jpg$', + 'channel': 'George FM' + } + }, { + # has audio-only format + 'url': 'https://vodupload-api.mediaworks.nz/library/asset/published/VID02627', + 'info_dict': { + 'id': 'VID02627', + 'ext': 'mp3', + 'title': 'Tova O\'Brien meets Ukraine President Volodymyr Zelensky', + 'channel': 'Today FM', + 'description': 'Watch in full the much anticipated interview of Volodymyr Zelensky', + 'duration': 2061.16, + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20220822', + 'timestamp': 1661152289, + }, + 'params': {'format': 'ba[ext=mp3]'} + }] + + _WEBPAGE_TESTS = [{ + 'url': 'https://www.rova.nz/home/podcasts/socrates-walks-into-a-bar/the-trolley-problem---episode-1.html', + 'info_dict': { + 'id': 'VID02494', + 'ext': 'mp4', + 'title': 'The Trolley Problem', + 'duration': 2843.56, + 'channel': 'Other', 
+ 'timestamp': 1658356489, + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Socrates Walks Into A Bar Podcast Episode 1', + 'upload_date': '20220720', + } + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + for mobj in re.finditer( + rf'''(?x)<div\s+\bid=["']Player-Attributes-JWID[^>]+\b + data-request-url=["']{cls._VALID_URL_BASE_RE}["'][^>]+\b + data-asset-id=["']{cls._VALID_URL_ID_RE}["']''', webpage + ): + yield f'https://vodupload-api.mediaworks.nz/library/asset/published/{mobj.group("id")}' + + def _real_extract(self, url): + video_id = self._match_id(url) + asset = self._download_json(url, video_id)['asset'] + + if asset.get('drm') not in ('NonDRM', None): + self.report_drm(video_id) + + content_type = asset.get('type') + if content_type and content_type != 'video': + self.report_warning(f'Unknown content type: {content_type}' + bug_reports_message(), video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles(asset['streamingUrl'], video_id) + + audio_streaming_url = traverse_obj( + asset, 'palyoutPathAudio', 'playoutpathaudio', expected_type=str) + if audio_streaming_url: + audio_formats = self._extract_m3u8_formats(audio_streaming_url, video_id, fatal=False, ext='mp3') + for audio_format in audio_formats: + # all the audio streams appear to be aac + audio_format.setdefault('vcodec', 'none') + audio_format.setdefault('acodec', 'aac') + formats.append(audio_format) + + return { + 'id': video_id, + 'title': asset.get('title'), + 'description': asset.get('description'), + 'duration': float_or_none(asset.get('duration')), + 'timestamp': unified_timestamp(asset.get('dateadded')), + 'channel': asset.get('brand'), + 'thumbnails': [{'url': thumbnail_url} for thumbnail_url in asset.get('thumbnails') or []], + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/medici.py b/hypervideo_dl/extractor/medici.py index cd91023..328ccd2 100644 --- a/hypervideo_dl/extractor/medici.py +++ b/hypervideo_dl/extractor/medici.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( unified_strdate, diff --git a/hypervideo_dl/extractor/megaphone.py b/hypervideo_dl/extractor/megaphone.py index 5bafa6c..af80523 100644 --- a/hypervideo_dl/extractor/megaphone.py +++ b/hypervideo_dl/extractor/megaphone.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import js_to_json @@ -11,6 +6,7 @@ class MegaphoneIE(InfoExtractor): IE_NAME = 'megaphone.fm' IE_DESC = 'megaphone.fm embedded players' _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)' + _EMBED_REGEX = [rf'<iframe[^>]*?\ssrc=["\'](?P<url>{_VALID_URL})'] _TEST = { 'url': 'https://player.megaphone.fm/GLT9749789991?"', 'md5': '4816a0de523eb3e972dc0dda2c191f96', @@ -48,8 +44,3 @@ class MegaphoneIE(InfoExtractor): 'duration': episode_data['duration'], 'formats': formats, } - - @classmethod - def _extract_urls(cls, webpage): - return [m[0] for m in re.findall( - r'<iframe[^>]*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)] diff --git a/hypervideo_dl/extractor/megatvcom.py b/hypervideo_dl/extractor/megatvcom.py index 0d6793a..2f3f11f 100644 --- a/hypervideo_dl/extractor/megatvcom.py +++ b/hypervideo_dl/extractor/megatvcom.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -90,7 +87,6 @@ class MegaTVComIE(MegaTVComBaseIE): formats, subs = 
[{'url': source}], {} if player_attrs.get('subs'): self._merge_subtitles({'und': [{'url': player_attrs['subs']}]}, target=subs) - self._sort_formats(formats) return { 'id': video_id, 'display_id': display_id, @@ -107,7 +103,7 @@ class MegaTVComEmbedIE(MegaTVComBaseIE): IE_NAME = 'megatvcom:embed' IE_DESC = 'megatv.com embedded videos' _VALID_URL = r'(?:https?:)?//(?:www\.)?megatv\.com/embed/?\?p=(?P<id>\d+)' - _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''') + _EMBED_REGEX = [rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)'''] _TESTS = [{ 'url': 'https://www.megatv.com/embed/?p=2020520979', @@ -137,11 +133,6 @@ class MegaTVComEmbedIE(MegaTVComBaseIE): }, }] - @classmethod - def _extract_urls(cls, webpage): - for mobj in cls._EMBED_RE.finditer(webpage): - yield unescapeHTML(mobj.group('url')) - def _match_canonical_url(self, webpage): LINK_RE = r'''(?x) <link(?: diff --git a/hypervideo_dl/extractor/meipai.py b/hypervideo_dl/extractor/meipai.py index 2445b8b..1a6f3cd 100644 --- a/hypervideo_dl/extractor/meipai.py +++ b/hypervideo_dl/extractor/meipai.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -51,9 +48,7 @@ class MeipaiIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = self._og_search_title( - webpage, default=None) or self._html_search_regex( - r'<title[^>]*>([^<]+)</title>', webpage, 'title') + title = self._generic_title('', webpage) formats = [] diff --git a/hypervideo_dl/extractor/melonvod.py b/hypervideo_dl/extractor/melonvod.py index bd8cf13..1d3fff8 100644 --- a/hypervideo_dl/extractor/melonvod.py +++ b/hypervideo_dl/extractor/melonvod.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -47,7 +44,6 @@ class MelonVODIE(InfoExtractor): formats = self._extract_m3u8_formats( stream_info['encUrl'], video_id, 'mp4', m3u8_id='hls') - self._sort_formats(formats) artist_list = play_info.get('artistList') artist = None diff --git a/hypervideo_dl/extractor/meta.py b/hypervideo_dl/extractor/meta.py index cdb46e1..7c11e60 100644 --- a/hypervideo_dl/extractor/meta.py +++ b/hypervideo_dl/extractor/meta.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .pladform import PladformIE from ..utils import ( diff --git a/hypervideo_dl/extractor/metacafe.py b/hypervideo_dl/extractor/metacafe.py index 7b2d4a0..d7f5def 100644 --- a/hypervideo_dl/extractor/metacafe.py +++ b/hypervideo_dl/extractor/metacafe.py @@ -1,19 +1,14 @@ -from __future__ import unicode_literals - import json import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse, - compat_urllib_parse_unquote, -) +from ..compat import compat_parse_qs, compat_urllib_parse_unquote from ..utils import ( - determine_ext, ExtractorError, - int_or_none, + determine_ext, get_element_by_attribute, + int_or_none, mimetype2ext, ) @@ -145,7 +140,7 @@ class MetacafeIE(InfoExtractor): headers = { # Disable family filter - 'Cookie': 'user=%s; ' % compat_urllib_parse.quote(json.dumps({'ffilter': False})) + 'Cookie': 'user=%s; ' % urllib.parse.quote(json.dumps({'ffilter': False})) } # AnyClip videos require the flashversion cookie so that we get the link @@ -272,7 +267,6 @@ class MetacafeIE(InfoExtractor): 'url': 
video_url, 'ext': video_ext, }] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/metacritic.py b/hypervideo_dl/extractor/metacritic.py index 1424288..1441054 100644 --- a/hypervideo_dl/extractor/metacritic.py +++ b/hypervideo_dl/extractor/metacritic.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -51,7 +49,6 @@ class MetacriticIE(InfoExtractor): 'format_id': rate_str, 'tbr': int(rate_str), }) - self._sort_formats(formats) description = self._html_search_regex(r'<b>Description:</b>(.*?)</p>', webpage, 'description', flags=re.DOTALL) diff --git a/hypervideo_dl/extractor/mgoon.py b/hypervideo_dl/extractor/mgoon.py index 184c311..2388a71 100644 --- a/hypervideo_dl/extractor/mgoon.py +++ b/hypervideo_dl/extractor/mgoon.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -72,7 +68,6 @@ class MgoonIE(InfoExtractor): 'ext': fmt['format'], }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/mgtv.py b/hypervideo_dl/extractor/mgtv.py index 4ac70ea..edc92b3 100644 --- a/hypervideo_dl/extractor/mgtv.py +++ b/hypervideo_dl/extractor/mgtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import time import uuid @@ -70,7 +67,7 @@ class MGTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) tk2 = base64.urlsafe_b64encode( - f'did={compat_str(uuid.uuid4()).encode()}|pno=1030|ver=0.3.0301|clit={int(time.time())}'.encode())[::-1] + f'did={str(uuid.uuid4())}|pno=1030|ver=0.3.0301|clit={int(time.time())}'.encode())[::-1] try: api_data = self._download_json( 'https://pcweb.api.mgtv.com/player/video', video_id, query={ @@ -120,7 +117,6 @@ class MGTVIE(InfoExtractor): }, 'format_note': stream.get('name'), }) - self._sort_formats(formats) return { 'id': video_id, @@ -140,14 +136,15 @@ class MGTVIE(InfoExtractor): url_sub = sub.get('url') if not url_sub: continue - locale = sub.get('captionCountrySimpleName') + locale = sub.get('captionSimpleName') or 'en' sub = self._download_json(f'{domain}{url_sub}', video_id, fatal=False, note=f'Download subtitle for locale {sub.get("name")} ({locale})') or {} sub_url = url_or_none(sub.get('info')) if not sub_url: continue - subtitles.setdefault(locale or 'en', []).append({ + subtitles.setdefault(locale.lower(), []).append({ 'url': sub_url, + 'name': sub.get('name'), 'ext': 'srt' }) return subtitles diff --git a/hypervideo_dl/extractor/miaopai.py b/hypervideo_dl/extractor/miaopai.py index cf0610b..329ce36 100644 --- a/hypervideo_dl/extractor/miaopai.py +++ b/hypervideo_dl/extractor/miaopai.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/microsoftembed.py b/hypervideo_dl/extractor/microsoftembed.py new file mode 100644 index 0000000..f71ab3e --- /dev/null +++ b/hypervideo_dl/extractor/microsoftembed.py @@ -0,0 +1,65 @@ +from .common import InfoExtractor +from ..utils import int_or_none, traverse_obj, unified_timestamp + + +class MicrosoftEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?microsoft\.com/(?:[^/]+/)?videoplayer/embed/(?P<id>[a-z0-9A-Z]+)' + + _TESTS = [{ + 'url': 'https://www.microsoft.com/en-us/videoplayer/embed/RWL07e', + 'md5': 'eb0ae9007f9b305f9acd0a03e74cb1a9', + 'info_dict': { + 'id': 'RWL07e', + 'title': 'Microsoft for Public Health 
and Social Services', + 'ext': 'mp4', + 'thumbnail': 'http://img-prod-cms-rt-microsoft-com.akamaized.net/cms/api/am/imageFileData/RWL7Ju?ver=cae5', + 'age_limit': 0, + 'timestamp': 1631658316, + 'upload_date': '20210914' + } + }] + _API_URL = 'https://prod-video-cms-rt-microsoft-com.akamaized.net/vhs/api/videos/' + + def _real_extract(self, url): + video_id = self._match_id(url) + metadata = self._download_json(self._API_URL + video_id, video_id) + + formats = [] + for source_type, source in metadata['streams'].items(): + if source_type == 'smooth_Streaming': + formats.extend(self._extract_ism_formats(source['url'], video_id, 'mss')) + elif source_type == 'apple_HTTP_Live_Streaming': + formats.extend(self._extract_m3u8_formats(source['url'], video_id, 'mp4')) + elif source_type == 'mPEG_DASH': + formats.extend(self._extract_mpd_formats(source['url'], video_id)) + else: + formats.append({ + 'format_id': source_type, + 'url': source['url'], + 'height': source.get('heightPixels'), + 'width': source.get('widthPixels'), + }) + + subtitles = { + lang: [{ + 'url': data.get('url'), + 'ext': 'vtt', + }] for lang, data in traverse_obj(metadata, 'captions', default={}).items() + } + + thumbnails = [{ + 'url': thumb.get('url'), + 'width': thumb.get('width') or None, + 'height': thumb.get('height') or None, + } for thumb in traverse_obj(metadata, ('snippet', 'thumbnails', ...))] + self._remove_duplicate_formats(thumbnails) + + return { + 'id': video_id, + 'title': traverse_obj(metadata, ('snippet', 'title')), + 'timestamp': unified_timestamp(traverse_obj(metadata, ('snippet', 'activeStartDate'))), + 'age_limit': int_or_none(traverse_obj(metadata, ('snippet', 'minimumAge'))) or 0, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + } diff --git a/hypervideo_dl/extractor/microsoftstream.py b/hypervideo_dl/extractor/microsoftstream.py index 4d5a9df..9b50996 100644 --- a/hypervideo_dl/extractor/microsoftstream.py +++ b/hypervideo_dl/extractor/microsoftstream.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from base64 import b64decode from .common import InfoExtractor @@ -104,7 +101,6 @@ class MicrosoftStreamIE(InfoExtractor): playlist['playbackUrl'], video_id, ism_id='mss', fatal=False, headers=headers)) formats = [merge_dicts(f, {'language': language}) for f in formats] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/microsoftvirtualacademy.py b/hypervideo_dl/extractor/microsoftvirtualacademy.py index 46abd2a..b759b18 100644 --- a/hypervideo_dl/extractor/microsoftvirtualacademy.py +++ b/hypervideo_dl/extractor/microsoftvirtualacademy.py @@ -1,11 +1,6 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..compat import ( - compat_xpath, -) from ..utils import ( int_or_none, parse_duration, @@ -70,9 +65,9 @@ class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): formats = [] - for sources in settings.findall(compat_xpath('.//MediaSources')): + for sources in settings.findall('.//MediaSources'): sources_type = sources.get('videoType') - for source in sources.findall(compat_xpath('./MediaSource')): + for source in sources.findall('./MediaSource'): video_url = source.text if not video_url or not video_url.startswith('http'): continue @@ -98,10 +93,9 @@ class MicrosoftVirtualAcademyIE(MicrosoftVirtualAcademyBaseIE): 'acodec': acodec, 'vcodec': vcodec, }) - self._sort_formats(formats) subtitles = {} - for source in 
settings.findall(compat_xpath('.//MarkerResourceSource')): + for source in settings.findall('.//MarkerResourceSource'): subtitle_url = source.text if not subtitle_url: continue diff --git a/hypervideo_dl/extractor/mildom.py b/hypervideo_dl/extractor/mildom.py index 5f2df29..f64d575 100644 --- a/hypervideo_dl/extractor/mildom.py +++ b/hypervideo_dl/extractor/mildom.py @@ -1,8 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import json +import uuid from .common import InfoExtractor from ..utils import ( @@ -11,7 +9,6 @@ from ..utils import ( ExtractorError, float_or_none, OnDemandPagedList, - random_uuidv4, traverse_obj, ) @@ -21,7 +18,7 @@ class MildomBaseIE(InfoExtractor): def _call_api(self, url, video_id, query=None, note='Downloading JSON metadata', body=None): if not self._GUEST_ID: - self._GUEST_ID = f'pc-gp-{random_uuidv4()}' + self._GUEST_ID = f'pc-gp-{str(uuid.uuid4())}' content = self._download_json( url, video_id, note=note, data=json.dumps(body).encode() if body else None, @@ -77,8 +74,6 @@ class MildomIE(MildomBaseIE): for fmt in formats: fmt.setdefault('http_headers', {})['Referer'] = 'https://www.mildom.com/' - self._sort_formats(formats) - return { 'id': result_video_id, 'title': self._html_search_meta('twitter:description', webpage, default=None) or traverse_obj(enterstudio, 'anchor_intro'), @@ -169,8 +164,6 @@ class MildomVodIE(MildomBaseIE): 'ext': 'mp4' }) - self._sort_formats(formats) - return { 'id': video_id, 'title': self._html_search_meta(('og:description', 'description'), webpage, default=None) or autoplay.get('title'), diff --git a/hypervideo_dl/extractor/minds.py b/hypervideo_dl/extractor/minds.py index 9da0720..2fb1792 100644 --- a/hypervideo_dl/extractor/minds.py +++ b/hypervideo_dl/extractor/minds.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -79,7 +76,7 @@ class MindsIE(MindsBaseIE): else: return self.url_result(entity['perma_url']) else: - assert(entity['subtype'] == 'video') + assert entity['subtype'] == 'video' video_id = entity_id # 1080p and webm formats available only on the sources array video = self._call_api( @@ -95,7 +92,6 @@ class MindsIE(MindsBaseIE): 'height': int_or_none(source.get('size')), 'url': src, }) - self._sort_formats(formats) entity = video.get('entity') or entity owner = entity.get('ownerObj') or {} @@ -121,7 +117,7 @@ class MindsIE(MindsBaseIE): 'timestamp': int_or_none(entity.get('time_created')), 'uploader': strip_or_none(owner.get('name')), 'uploader_id': uploader_id, - 'uploader_url': format_field(uploader_id, template='https://www.minds.com/%s'), + 'uploader_url': format_field(uploader_id, None, 'https://www.minds.com/%s'), 'view_count': int_or_none(entity.get('play:count')), 'like_count': int_or_none(entity.get('thumbs:up:count')), 'dislike_count': int_or_none(entity.get('thumbs:down:count')), diff --git a/hypervideo_dl/extractor/ministrygrid.py b/hypervideo_dl/extractor/ministrygrid.py index 8ad9239..053c672 100644 --- a/hypervideo_dl/extractor/ministrygrid.py +++ b/hypervideo_dl/extractor/ministrygrid.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/minoto.py b/hypervideo_dl/extractor/minoto.py index 603ce94..8d18179 100644 --- a/hypervideo_dl/extractor/minoto.py +++ b/hypervideo_dl/extractor/minoto.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from 
__future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -39,7 +35,6 @@ class MinotoIE(InfoExtractor): 'height': int_or_none(fmt.get('height')), **parse_codecs(fmt.get('codecs')), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/miomio.py b/hypervideo_dl/extractor/miomio.py index 40f72d6..a0a041e 100644 --- a/hypervideo_dl/extractor/miomio.py +++ b/hypervideo_dl/extractor/miomio.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/mirrativ.py b/hypervideo_dl/extractor/mirrativ.py index 2111de6..0a8ee0c 100644 --- a/hypervideo_dl/extractor/mirrativ.py +++ b/hypervideo_dl/extractor/mirrativ.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -57,7 +55,6 @@ class MirrativIE(MirrativBaseIE): hls_url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls', live=is_live) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/mirrorcouk.py b/hypervideo_dl/extractor/mirrorcouk.py new file mode 100644 index 0000000..7b4f95b --- /dev/null +++ b/hypervideo_dl/extractor/mirrorcouk.py @@ -0,0 +1,98 @@ +from .common import InfoExtractor +from ..utils import unescapeHTML + + +class MirrorCoUKIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mirror\.co\.uk/[/+[\w-]+-(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.mirror.co.uk/tv/tv-news/love-island-fans-baffled-after-27163139', + 'info_dict': { + 'id': 'voyyS7SV', + 'ext': 'mp4', + 'title': 'Love Island: Gemma Owen enters the villa', + 'description': 'Love Island: Michael Owen\'s daughter Gemma Owen enters the villa.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/voyyS7SV/poster.jpg?width=720', + 'display_id': '27163139', + 'timestamp': 1654547895, + 'duration': 57.0, + 'upload_date': '20220606', + }, + }, { + 'url': 'https://www.mirror.co.uk/3am/celebrity-news/michael-jacksons-son-blankets-new-25344890', + 'info_dict': { + 'id': 'jyXpdvxp', + 'ext': 'mp4', + 'title': 'Michael Jackson’s son Bigi calls for action on climate change', + 'description': 'md5:d39ceaba2b7a615b4ca6557e7bc40222', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/jyXpdvxp/poster.jpg?width=720', + 'display_id': '25344890', + 'timestamp': 1635749907, + 'duration': 56.0, + 'upload_date': '20211101', + }, + }, { + 'url': 'https://www.mirror.co.uk/sport/football/news/antonio-conte-next-tottenham-manager-25346042', + 'info_dict': { + 'id': 'q6FkKa4p', + 'ext': 'mp4', + 'title': 'Nuno sacked by Tottenham after fifth Premier League defeat of the season', + 'description': 'Nuno Espirito Santo has been sacked as Tottenham boss after only four months in charge.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/q6FkKa4p/poster.jpg?width=720', + 'display_id': '25346042', + 'timestamp': 1635763157, + 'duration': 40.0, + 'upload_date': '20211101', + }, + }, { + 'url': 'https://www.mirror.co.uk/3am/celebrity-news/johnny-depp-splashes-50k-curry-27160737', + 'info_dict': { + 'id': 'IT0oa1nH', + 'ext': 'mp4', + 'title': 'Johnny Depp Leaves The Grand Hotel in Birmingham', + 'description': 'Johnny Depp Leaves The Grand Hotel in Birmingham.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/IT0oa1nH/poster.jpg?width=720', + 'display_id': '27160737', + 'timestamp': 1654524120, + 'duration': 65.0, + 'upload_date': '20220606', + }, + }, { + 'url': 
'https://www.mirror.co.uk/tv/tv-news/love-islands-liam-could-first-27162602', + 'info_dict': { + 'id': 'EaPr5Z2j', + 'ext': 'mp4', + 'title': 'Love Island: Davide reveals plot twist after receiving text', + 'description': 'Love Island: Davide reveals plot twist after receiving text', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/EaPr5Z2j/poster.jpg?width=720', + 'display_id': '27162602', + 'timestamp': 1654552597, + 'duration': 23.0, + 'upload_date': '20220606', + }, + }, { + 'url': 'https://www.mirror.co.uk/news/uk-news/william-kate-sent-message-george-27160572', + 'info_dict': { + 'id': 'ygtceXIu', + 'ext': 'mp4', + 'title': 'Prince William and Kate arrive in Wales with George and Charlotte', + 'description': 'Prince William and Kate Middleton arrive in Wales with children Prince George and Princess Charlotte.', + 'thumbnail': 'https://cdn.jwplayer.com/v2/media/ygtceXIu/poster.jpg?width=720', + 'display_id': '27160572', + 'timestamp': 1654349678, + 'duration': 106.0, + 'upload_date': '20220604', + }, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + data = self._search_json(r'div\s+class="json-placeholder"\s+data-json="', + webpage, 'data', display_id, transform_source=unescapeHTML)['videoData'] + + return { + '_type': 'url_transparent', + 'url': f'jwplatform:{data["videoId"]}', + 'ie_key': 'JWPlatform', + 'display_id': display_id, + } diff --git a/hypervideo_dl/extractor/mit.py b/hypervideo_dl/extractor/mit.py index 60e4569..38cc0c2 100644 --- a/hypervideo_dl/extractor/mit.py +++ b/hypervideo_dl/extractor/mit.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re import json diff --git a/hypervideo_dl/extractor/mitele.py b/hypervideo_dl/extractor/mitele.py index b593723..ea29986 100644 --- a/hypervideo_dl/extractor/mitele.py +++ b/hypervideo_dl/extractor/mitele.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .telecinco import TelecincoIE from ..utils import ( int_or_none, @@ -8,7 +5,7 @@ from ..utils import ( ) -class MiTeleIE(TelecincoIE): +class MiTeleIE(TelecincoIE): # XXX: Do not subclass from concrete IE IE_DESC = 'mitele.es' _VALID_URL = r'https?://(?:www\.)?mitele\.es/(?:[^/]+/)+(?P<id>[^/]+)/player' diff --git a/hypervideo_dl/extractor/mixch.py b/hypervideo_dl/extractor/mixch.py index 31f450d..3f430a7 100644 --- a/hypervideo_dl/extractor/mixch.py +++ b/hypervideo_dl/extractor/mixch.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/mixcloud.py b/hypervideo_dl/extractor/mixcloud.py index c2dd078..fb5a08c 100644 --- a/hypervideo_dl/extractor/mixcloud.py +++ b/hypervideo_dl/extractor/mixcloud.py @@ -1,15 +1,11 @@ -from __future__ import unicode_literals - import itertools from .common import InfoExtractor from ..compat import ( compat_b64decode, - compat_chr, compat_ord, compat_str, compat_urllib_parse_unquote, - compat_zip ) from ..utils import ( ExtractorError, @@ -75,8 +71,8 @@ class MixcloudIE(MixcloudBaseIE): def _decrypt_xor_cipher(key, ciphertext): """Encrypt/Decrypt XOR cipher. 
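Applying the same keystream twice is the identity: (c ^ k) ^ k == c ^ (k ^ k) == c ^ 0 == c.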
Both ways are possible because it's XOR.""" return ''.join([ - compat_chr(compat_ord(ch) ^ compat_ord(k)) - for ch, k in compat_zip(ciphertext, itertools.cycle(key))]) + chr(compat_ord(ch) ^ compat_ord(k)) + for ch, k in zip(ciphertext, itertools.cycle(key))]) def _real_extract(self, url): username, slug = self._match_valid_url(url).groups() @@ -163,6 +159,7 @@ class MixcloudIE(MixcloudBaseIE): formats.append({ 'format_id': 'http', 'url': decrypted, + 'vcodec': 'none', 'downloader_options': { # Mixcloud starts throttling at >~5M 'http_chunk_size': 5242880, @@ -172,8 +169,6 @@ class MixcloudIE(MixcloudBaseIE): if not formats and cloudcast.get('isExclusive'): self.raise_login_required(metadata_available=True) - self._sort_formats(formats) - comments = [] for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []): node = edge.get('node') or {} diff --git a/hypervideo_dl/extractor/mlb.py b/hypervideo_dl/extractor/mlb.py index b69301d..72057dc 100644 --- a/hypervideo_dl/extractor/mlb.py +++ b/hypervideo_dl/extractor/mlb.py @@ -1,13 +1,15 @@ -from __future__ import unicode_literals - import re +import urllib.parse +import uuid from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + join_nonempty, parse_duration, parse_iso8601, + traverse_obj, try_get, ) @@ -52,7 +54,6 @@ class MLBBaseIE(InfoExtractor): 'width': int(mobj.group(1)), }) formats.append(f) - self._sort_formats(formats) thumbnails = [] for cut in (try_get(feed, lambda x: x['image']['cuts'], list) or []): @@ -94,6 +95,10 @@ class MLBIE(MLBBaseIE): (?P<id>\d+) ) ''' + _EMBED_REGEX = [ + r'<iframe[^>]+?src=(["\'])(?P<url>https?://m(?:lb)?\.mlb\.com/shared/video/embed/embed\.html\?.+?)\1', + r'data-video-link=["\'](?P<url>http://m\.mlb\.com/video/[^"\']+)', + ] _TESTS = [ { 'url': 'https://www.mlb.com/mariners/video/ackleys-spectacular-catch/c-34698933', @@ -265,3 +270,112 @@ class MLBVideoIE(MLBBaseIE): } }''' % display_id, })['data']['mediaPlayback'][0] + + +class MLBTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?mlb\.com/tv/g(?P<id>\d{6})' + _NETRC_MACHINE = 'mlb' + + _TESTS = [{ + 'url': 'https://www.mlb.com/tv/g661581/vee2eff5f-a7df-4c20-bdb4-7b926fa12638', + 'info_dict': { + 'id': '661581', + 'ext': 'mp4', + 'title': '2022-07-02 - St. 
Louis Cardinals @ Philadelphia Phillies', + }, + 'params': { + 'skip_download': True, + }, + }] + _access_token = None + + def _real_initialize(self): + if not self._access_token: + self.raise_login_required( + 'All videos are only available to registered users', method='password') + + def _perform_login(self, username, password): + data = f'grant_type=password&username={urllib.parse.quote(username)}&password={urllib.parse.quote(password)}&scope=openid offline_access&client_id=0oa3e1nutA1HLzAKG356' + access_token = self._download_json( + 'https://ids.mlb.com/oauth2/aus1m088yK07noBfh356/v1/token', None, + headers={ + 'User-Agent': 'okhttp/3.12.1', + 'Content-Type': 'application/x-www-form-urlencoded' + }, data=data.encode())['access_token'] + + entitlement = self._download_webpage( + f'https://media-entitlement.mlb.com/api/v3/jwt?os=Android&appname=AtBat&did={str(uuid.uuid4())}', None, + headers={ + 'User-Agent': 'okhttp/3.12.1', + 'Authorization': f'Bearer {access_token}' + }) + + data = f'grant_type=urn:ietf:params:oauth:grant-type:token-exchange&subject_token={entitlement}&subject_token_type=urn:ietf:params:oauth:token-type:jwt&platform=android-tv' + self._access_token = self._download_json( + 'https://us.edge.bamgrid.com/token', None, + headers={ + 'Accept': 'application/json', + 'Authorization': 'Bearer bWxidHYmYW5kcm9pZCYxLjAuMA.6LZMbH2r--rbXcgEabaDdIslpo4RyZrlVfWZhsAgXIk', + 'Content-Type': 'application/x-www-form-urlencoded' + }, data=data.encode())['access_token'] + + def _real_extract(self, url): + video_id = self._match_id(url) + airings = self._download_json( + f'https://search-api-mlbtv.mlb.com/svc/search/v2/graphql/persisted/query/core/Airings?variables=%7B%22partnerProgramIds%22%3A%5B%22{video_id}%22%5D%2C%22applyEsniMediaRightsLabels%22%3Atrue%7D', + video_id)['data']['Airings'] + + formats, subtitles = [], {} + for airing in airings: + m3u8_url = self._download_json( + airing['playbackUrls'][0]['href'].format(scenario='browser~csai'), video_id, + headers={ + 'Authorization': self._access_token, + 'Accept': 'application/vnd.media-service+json; version=2' + })['stream']['complete'] + f, s = self._extract_m3u8_formats_and_subtitles( + m3u8_url, video_id, 'mp4', m3u8_id=join_nonempty(airing.get('feedType'), airing.get('feedLanguage'))) + formats.extend(f) + self._merge_subtitles(s, target=subtitles) + + return { + 'id': video_id, + 'title': traverse_obj(airings, (..., 'titles', 0, 'episodeName'), get_all=False), + 'is_live': traverse_obj(airings, (..., 'mediaConfig', 'productType'), get_all=False) == 'LIVE', + 'formats': formats, + 'subtitles': subtitles, + 'http_headers': {'Authorization': f'Bearer {self._access_token}'}, + } + + +class MLBArticleIE(InfoExtractor): + _VALID_URL = r'https?://www\.mlb\.com/news/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.mlb.com/news/manny-machado-robs-guillermo-heredia-reacts', + 'info_dict': { + 'id': '36db7394-343c-4ea3-b8ca-ead2e61bca9a', + 'title': 'Machado\'s grab draws hilarious irate reaction', + 'modified_timestamp': 1650130737, + 'description': 'md5:a19d4eb0487b2cb304e9a176f6b67676', + 'modified_date': '20220416', + }, + 'playlist_count': 2, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + apollo_cache_json = self._search_json(r'window\.initState\s*=', webpage, 'window.initState', display_id)['apolloCache'] + + content_data_id = traverse_obj( + apollo_cache_json, ('ROOT_QUERY', lambda k, _: k.startswith('getForgeContent'), 'id'), get_all=False) + + 
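# window.initState holds a normalized Apollo GraphQL cache: ROOT_QUERY maps the
# getForgeContent query to a reference id, the referenced article record sits at
# the top level of the cache under that id, and its 'Video' parts are resolved
# into watch URLs via their slugs below.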
content_real_info = apollo_cache_json[content_data_id] + + return self.playlist_from_matches( + traverse_obj(content_real_info, ('parts', lambda _, v: v['typename'] == 'Video', 'id')), + getter=lambda x: f'https://www.mlb.com/video/{apollo_cache_json[x]["slug"]}', + ie=MLBVideoIE, playlist_id=content_real_info.get('_translationId'), + title=self._html_search_meta('og:title', webpage), + description=content_real_info.get('summary'), + modified_timestamp=parse_iso8601(content_real_info.get('lastUpdatedDate'))) diff --git a/hypervideo_dl/extractor/mlssoccer.py b/hypervideo_dl/extractor/mlssoccer.py index 1d6d4b8..9383f13 100644 --- a/hypervideo_dl/extractor/mlssoccer.py +++ b/hypervideo_dl/extractor/mlssoccer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/mnet.py b/hypervideo_dl/extractor/mnet.py index 0e26ca1..98bab2e 100644 --- a/hypervideo_dl/extractor/mnet.py +++ b/hypervideo_dl/extractor/mnet.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -62,7 +59,6 @@ class MnetIE(InfoExtractor): m3u8_url += '?' + token formats = self._extract_wowza_formats( m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp', 'f4m']) - self._sort_formats(formats) description = info.get('ment') duration = parse_duration(info.get('time')) diff --git a/hypervideo_dl/extractor/mocha.py b/hypervideo_dl/extractor/mocha.py new file mode 100644 index 0000000..5f72b81 --- /dev/null +++ b/hypervideo_dl/extractor/mocha.py @@ -0,0 +1,64 @@ +from .common import InfoExtractor +from ..utils import int_or_none, traverse_obj + + +class MochaVideoIE(InfoExtractor): + _VALID_URL = r'https?://video.mocha.com.vn/(?P<video_slug>[\w-]+)' + _TESTS = [{ + 'url': 'http://video.mocha.com.vn/chuyen-meo-gia-su-tu-thong-diep-cuoc-song-v18694039', + 'info_dict': { + 'id': '18694039', + 'title': 'Chuyện mèo giả sư tử | Thông điệp cuộc sống', + 'ext': 'mp4', + 'view_count': int, + 'like_count': int, + 'dislike_count': int, + 'display_id': 'chuyen-meo-gia-su-tu-thong-diep-cuoc-song', + 'thumbnail': 'http://mcvideomd1fr.keeng.net/playnow/images/20220505/ad0a055d-2f69-42ca-b888-4790041fe6bc_640x480.jpg', + 'description': '', + 'duration': 70, + 'timestamp': 1652254203, + 'upload_date': '20220511', + 'comment_count': int, + 'categories': ['Kids'] + } + }] + + def _real_extract(self, url): + video_slug = self._match_valid_url(url).group('video_slug') + json_data = self._download_json( + 'http://apivideo.mocha.com.vn:8081/onMediaBackendBiz/mochavideo/getVideoDetail', + video_slug, query={'url': url, 'token': ''})['data']['videoDetail'] + video_id = str(json_data['id']) + video_urls = (json_data.get('list_resolution') or []) + [json_data.get('original_path')] + + formats, subtitles = [], {} + for video in video_urls: + if isinstance(video, str): + formats.extend([{'url': video, 'ext': 'mp4'}]) + else: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video.get('video_path'), video_id, ext='mp4') + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': video_id, + 'display_id': json_data.get('slug') or video_slug, + 'title': json_data.get('name'), + 'formats': formats, + 'subtitles': subtitles, + 'description': json_data.get('description'), + 'duration': json_data.get('durationS'), + 'view_count': json_data.get('total_view'), + 'like_count': json_data.get('total_like'), + 'dislike_count': 
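# count fields map straight from the API; publish_time below looks like epoch milliseconds, hence scale=1000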
json_data.get('total_unlike'), + 'thumbnail': json_data.get('image_path_thumb'), + 'timestamp': int_or_none(json_data.get('publish_time'), scale=1000), + 'is_live': json_data.get('isLive'), + 'channel': traverse_obj(json_data, ('channels', '0', 'name')), + 'channel_id': traverse_obj(json_data, ('channels', '0', 'id')), + 'channel_follower_count': traverse_obj(json_data, ('channels', '0', 'numfollow')), + 'categories': traverse_obj(json_data, ('categories', ..., 'categoryname')), + 'comment_count': json_data.get('total_comment'), + } diff --git a/hypervideo_dl/extractor/moevideo.py b/hypervideo_dl/extractor/moevideo.py index a3f1b38..fda08ca 100644 --- a/hypervideo_dl/extractor/moevideo.py +++ b/hypervideo_dl/extractor/moevideo.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/mofosex.py b/hypervideo_dl/extractor/mofosex.py index 5234cac..9cb6980 100644 --- a/hypervideo_dl/extractor/mofosex.py +++ b/hypervideo_dl/extractor/mofosex.py @@ -1,7 +1,3 @@ -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -11,7 +7,7 @@ from ..utils import ( from .keezmovies import KeezMoviesIE -class MofosexIE(KeezMoviesIE): +class MofosexIE(KeezMoviesIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P<id>\d+)/(?P<display_id>[^/?#&.]+)\.html' _TESTS = [{ 'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html', @@ -61,17 +57,12 @@ class MofosexIE(KeezMoviesIE): class MofosexEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)'] _TESTS = [{ 'url': 'https://www.mofosex.com/embed/?videoid=318131&referrer=KM', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?mofosex\.com/embed/?\?.*?\bvideoid=\d+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( diff --git a/hypervideo_dl/extractor/mojvideo.py b/hypervideo_dl/extractor/mojvideo.py index 16d9405..d47ad07 100644 --- a/hypervideo_dl/extractor/mojvideo.py +++ b/hypervideo_dl/extractor/mojvideo.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/morningstar.py b/hypervideo_dl/extractor/morningstar.py index 71a22a6..e9fcfe3 100644 --- a/hypervideo_dl/extractor/morningstar.py +++ b/hypervideo_dl/extractor/morningstar.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/motherless.py b/hypervideo_dl/extractor/motherless.py index 111c7c5..c24ef9b 100644 --- a/hypervideo_dl/extractor/motherless.py +++ b/hypervideo_dl/extractor/motherless.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import datetime import re @@ -71,7 +69,7 @@ class MotherlessIE(InfoExtractor): 'title': 'a/ Hot Teens', 'categories': list, 'upload_date': '20210104', - 'uploader_id': 'yonbiw', + 'uploader_id': 'anonymous', 'thumbnail': r're:https?://.*\.jpg', 'age_limit': 18, }, @@ -125,11 +123,12 @@ class MotherlessIE(InfoExtractor): kwargs = 
{_AGO_UNITS.get(uploaded_ago[-1]): delta} upload_date = (datetime.datetime.utcnow() - datetime.timedelta(**kwargs)).strftime('%Y%m%d') - comment_count = webpage.count('class="media-comment-contents"') + comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage)) uploader_id = self._html_search_regex( - (r'"media-meta-member">\s+<a href="/m/([^"]+)"', - r'<span\b[^>]+\bclass="username">([^<]+)</span>'), + (r'''<span\b[^>]+\bclass\s*=\s*["']username\b[^>]*>([^<]+)</span>''', + r'''(?s)['"](?:media-meta-member|thumb-member-username)\b[^>]+>\s*<a\b[^>]+\bhref\s*=\s*['"]/m/([^"']+)'''), webpage, 'uploader_id', fatal=False) + categories = self._html_search_meta('keywords', webpage, default=None) if categories: categories = [cat.strip() for cat in categories.split(',')] @@ -219,23 +218,23 @@ class MotherlessGroupIE(InfoExtractor): r'<title>([\w\s]+\w)\s+-', webpage, 'title', fatal=False) description = self._html_search_meta( 'description', webpage, fatal=False) - page_count = self._int(self._search_regex( - r'(\d+)</(?:a|span)><(?:a|span)[^>]+rel="next">', - webpage, 'page_count', default=0), 'page_count') + page_count = str_to_int(self._search_regex( + r'(\d+)\s*</(?:a|span)>\s*<(?:a|span)[^>]+(?:>\s*NEXT|\brel\s*=\s*["\']?next)\b', + webpage, 'page_count', default=0)) if not page_count: message = self._search_regex( - r'class="error-page"[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*', + r'''class\s*=\s*['"]error-page\b[^>]*>\s*<p[^>]*>\s*(?P<error_msg>[^<]+)(?<=\S)\s*''', webpage, 'error_msg', default=None) or 'This group has no videos.' self.report_warning(message, group_id) + page_count = 1 PAGE_SIZE = 80 def _get_page(idx): - if not page_count: - return - webpage = self._download_webpage( - page_url, group_id, query={'page': idx + 1}, - note='Downloading page %d/%d' % (idx + 1, page_count) - ) + if idx > 0: + webpage = self._download_webpage( + page_url, group_id, query={'page': idx + 1}, + note='Downloading page %d/%d' % (idx + 1, page_count) + ) for entry in self._extract_entries(webpage, url): yield entry diff --git a/hypervideo_dl/extractor/motorsport.py b/hypervideo_dl/extractor/motorsport.py index c9d1ab6..efb087d 100644 --- a/hypervideo_dl/extractor/motorsport.py +++ b/hypervideo_dl/extractor/motorsport.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_urlparse, @@ -34,8 +31,13 @@ class MotorsportIE(InfoExtractor): webpage = self._download_webpage(url, display_id) iframe_path = self._html_search_regex( - r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage, - 'iframe path') + r'<iframe id="player_iframe"[^>]+src="([^"]+)"', webpage, 'iframe path', default=None) + + if iframe_path is None: + iframe_path = self._html_search_regex( + r'<iframe [^>]*\bsrc="(https://motorsport\.tv/embed/[^"]+)', webpage, 'embed iframe path') + return self.url_result(iframe_path) + iframe = self._download_webpage( compat_urlparse.urljoin(url, iframe_path), display_id, 'Downloading iframe') diff --git a/hypervideo_dl/extractor/movieclips.py b/hypervideo_dl/extractor/movieclips.py index 5453da1..4777f44 100644 --- a/hypervideo_dl/extractor/movieclips.py +++ b/hypervideo_dl/extractor/movieclips.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( smuggle_url, diff --git a/hypervideo_dl/extractor/moviepilot.py b/hypervideo_dl/extractor/moviepilot.py new file mode 100644 index 0000000..ca54156 --- 
/dev/null +++ b/hypervideo_dl/extractor/moviepilot.py @@ -0,0 +1,112 @@ +from .dailymotion import DailymotionIE +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + try_get, +) + +import re + + +class MoviepilotIE(InfoExtractor): + _IE_NAME = 'moviepilot' + _IE_DESC = 'Moviepilot trailer' + _VALID_URL = r'https?://(?:www\.)?moviepilot\.de/movies/(?P<id>[^/]+)' + + _TESTS = [{ + 'url': 'https://www.moviepilot.de/movies/interstellar-2/', + 'info_dict': { + 'id': 'x7xdut5', + 'display_id': 'interstellar-2', + 'ext': 'mp4', + 'title': 'Interstellar', + 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaXev1VvzitVZMFsR/x720', + 'timestamp': 1400491705, + 'description': 'md5:7dfc5c1758e7322a7346934f1f0c489c', + 'uploader': 'Moviepilot', + 'like_count': int, + 'view_count': int, + 'uploader_id': 'x6nd9k', + 'upload_date': '20140519', + 'duration': 140, + 'age_limit': 0, + 'tags': ['Alle Trailer', 'Movie', 'Third Party'], + }, + }, { + 'url': 'https://www.moviepilot.de/movies/interstellar-2/trailer', + 'only_matching': True, + }, { + 'url': 'https://www.moviepilot.de/movies/interstellar-2/kinoprogramm/berlin', + 'only_matching': True, + }, { + 'url': 'https://www.moviepilot.de/movies/queen-slim/trailer', + 'info_dict': { + 'id': 'x7xj6o7', + 'display_id': 'queen-slim', + 'title': 'Queen & Slim', + 'ext': 'mp4', + 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SbUM71WtomSjVmI_q/x720', + 'timestamp': 1571838685, + 'description': 'md5:73058bcd030aa12d991e4280d65fbebe', + 'uploader': 'Moviepilot', + 'like_count': int, + 'view_count': int, + 'uploader_id': 'x6nd9k', + 'upload_date': '20191023', + 'duration': 138, + 'age_limit': 0, + 'tags': ['Movie', 'Verleih', 'Neue Trailer'], + }, + }, { + 'url': 'https://www.moviepilot.de/movies/der-geiger-von-florenz/trailer', + 'info_dict': { + 'id': 'der-geiger-von-florenz', + 'title': 'Der Geiger von Florenz', + 'ext': 'mp4', + }, + 'skip': 'No trailer for this movie.', + }, { + 'url': 'https://www.moviepilot.de/movies/muellers-buero/', + 'info_dict': { + 'id': 'x7xcw1i', + 'display_id': 'muellers-buero', + 'title': 'Müllers Büro', + 'ext': 'mp4', + 'description': 'md5:57501251c05cdc61ca314b7633e0312e', + 'timestamp': 1287584475, + 'age_limit': 0, + 'duration': 82, + 'upload_date': '20101020', + 'thumbnail': r're:https://\w+\.dmcdn\.net/v/SaMes1WfAm1d6maq_/x720', + 'uploader': 'Moviepilot', + 'like_count': int, + 'view_count': int, + 'tags': ['Alle Trailer', 'Movie', 'Verleih'], + 'uploader_id': 'x6nd9k', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(f'https://www.moviepilot.de/movies/{video_id}/trailer', video_id) + + duration = try_get( + re.match(r'P(?P<hours>\d+)H(?P<mins>\d+)M(?P<secs>\d+)S', + self._html_search_meta('duration', webpage, fatal=False) or ''), + lambda mobj: sum(float(x) * y for x, y in zip(mobj.groups(), (3600, 60, 1)))) + # _html_search_meta is not used since we don't want name=description to match + description = self._html_search_regex( + '<meta[^>]+itemprop="description"[^>]+content="([^>"]+)"', webpage, 'description', fatal=False) + + return { + '_type': 'url_transparent', + 'ie_key': DailymotionIE.ie_key(), + 'display_id': video_id, + 'title': self._og_search_title(webpage), + 'url': self._html_search_meta('embedURL', webpage), + 'thumbnail': self._html_search_meta('thumbnailURL', webpage), + 'description': description, + 'duration': duration, + 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage), delimiter=' ') + } diff --git 
a/hypervideo_dl/extractor/moview.py b/hypervideo_dl/extractor/moview.py new file mode 100644 index 0000000..678b2eb --- /dev/null +++ b/hypervideo_dl/extractor/moview.py @@ -0,0 +1,43 @@ +from .jixie import JixieBaseIE + + +class MoviewPlayIE(JixieBaseIE): + _VALID_URL = r'https?://www\.moview\.id/play/\d+/(?P<id>[\w-]+)' + _TESTS = [ + { + # drm hls, only use direct link + 'url': 'https://www.moview.id/play/174/Candy-Monster', + 'info_dict': { + 'id': '146182', + 'ext': 'mp4', + 'display_id': 'Candy-Monster', + 'uploader_id': 'Mo165qXUUf', + 'duration': 528.2, + 'title': 'Candy Monster', + 'description': 'Mengapa Candy Monster ingin mengambil permen Chloe?', + 'thumbnail': 'https://video.jixie.media/1034/146182/146182_1280x720.jpg', + } + }, { + # non-drm hls + 'url': 'https://www.moview.id/play/75/Paris-Van-Java-Episode-16', + 'info_dict': { + 'id': '28210', + 'ext': 'mp4', + 'duration': 2595.666667, + 'display_id': 'Paris-Van-Java-Episode-16', + 'uploader_id': 'Mo165qXUUf', + 'thumbnail': 'https://video.jixie.media/1003/28210/28210_1280x720.jpg', + 'description': 'md5:2a5e18d98eef9b39d7895029cac96c63', + 'title': 'Paris Van Java Episode 16', + } + } + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + video_id = self._search_regex( + r'video_id\s*=\s*"(?P<video_id>[^"]+)', webpage, 'video_id') + + return self._extract_data_from_jixie_id(display_id, video_id, webpage) diff --git a/hypervideo_dl/extractor/moviezine.py b/hypervideo_dl/extractor/moviezine.py index 730da4b..cffcdcf 100644 --- a/hypervideo_dl/extractor/moviezine.py +++ b/hypervideo_dl/extractor/moviezine.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor @@ -33,8 +29,6 @@ class MoviezineIE(InfoExtractor): 'ext': 'mp4', }] - self._sort_formats(formats) - return { 'id': video_id, 'title': self._search_regex(r'title: "(.+?)",', jsplayer, 'title'), diff --git a/hypervideo_dl/extractor/movingimage.py b/hypervideo_dl/extractor/movingimage.py index 4f62d62..cdd8ba4 100644 --- a/hypervideo_dl/extractor/movingimage.py +++ b/hypervideo_dl/extractor/movingimage.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( unescapeHTML, diff --git a/hypervideo_dl/extractor/msn.py b/hypervideo_dl/extractor/msn.py index f34e210..f91c53e 100644 --- a/hypervideo_dl/extractor/msn.py +++ b/hypervideo_dl/extractor/msn.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -134,7 +131,6 @@ class MSNIE(InfoExtractor): 'vbr': int_or_none(self._search_regex(r'_(\d+)\.mp4', format_url, 'vbr', default=None)), 'quality': 1 if format_id == '1001' else None, }) - self._sort_formats(formats) subtitles = {} for file_ in video.get('files', []): diff --git a/hypervideo_dl/extractor/mtv.py b/hypervideo_dl/extractor/mtv.py index be5de0a..d91be62 100644 --- a/hypervideo_dl/extractor/mtv.py +++ b/hypervideo_dl/extractor/mtv.py @@ -1,13 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_xpath, -) +from ..compat import compat_str from ..utils import ( ExtractorError, find_xpath_attr, @@ -108,8 +102,6 @@ class MTVServicesInfoExtractor(InfoExtractor): }]) except (KeyError, TypeError): raise ExtractorError('Invalid rendition field.') - if formats: - self._sort_formats(formats) return formats def 
_extract_subtitles(self, mdoc, mtvn_id): @@ -167,9 +159,9 @@ class MTVServicesInfoExtractor(InfoExtractor): itemdoc, './/{http://search.yahoo.com/mrss/}category', 'scheme', 'urn:mtvn:video_title') if title_el is None: - title_el = itemdoc.find(compat_xpath('.//{http://search.yahoo.com/mrss/}title')) + title_el = itemdoc.find('.//{http://search.yahoo.com/mrss/}title') if title_el is None: - title_el = itemdoc.find(compat_xpath('.//title')) + title_el = itemdoc.find('.//title') if title_el.text is None: title_el = None @@ -208,8 +200,6 @@ class MTVServicesInfoExtractor(InfoExtractor): if not formats: return None - self._sort_formats(formats) - return { 'title': title, 'formats': formats, @@ -337,6 +327,7 @@ class MTVServicesInfoExtractor(InfoExtractor): class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): IE_NAME = 'mtvservices:embedded' _VALID_URL = r'https?://media\.mtvnservices\.com/embed/(?P<mgid>.+?)(\?|/|$)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1'] _TEST = { # From http://www.thewrap.com/peter-dinklage-sums-up-game-of-thrones-in-45-seconds-video/ @@ -352,13 +343,6 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): }, } - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//media\.mtvnservices\.com/embed/.+?)\1', webpage) - if mobj: - return mobj.group('url') - def _get_feed_url(self, uri, url=None): video_id = self._id_from_uri(uri) config = self._download_json( @@ -548,7 +532,7 @@ class MTVItaliaIE(MTVServicesInfoExtractor): } -class MTVItaliaProgrammaIE(MTVItaliaIE): +class MTVItaliaProgrammaIE(MTVItaliaIE): # XXX: Do not subclass from concrete IE IE_NAME = 'mtv.it:programma' _VALID_URL = r'https?://(?:www\.)?mtv\.it/(?:programmi|playlist)/(?P<id>[0-9a-z]+)' _TESTS = [{ diff --git a/hypervideo_dl/extractor/muenchentv.py b/hypervideo_dl/extractor/muenchentv.py index a53929e..36a2d46 100644 --- a/hypervideo_dl/extractor/muenchentv.py +++ b/hypervideo_dl/extractor/muenchentv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -63,7 +60,6 @@ class MuenchenTVIE(InfoExtractor): 'format_id': format_id, 'preference': -100 if '.smil' in s['file'] else 0, # Strictly inferior than all other formats? }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/murrtube.py b/hypervideo_dl/extractor/murrtube.py index 1eb5de6..6cdbbda 100644 --- a/hypervideo_dl/extractor/murrtube.py +++ b/hypervideo_dl/extractor/murrtube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import json @@ -102,7 +99,7 @@ query Medium($id: ID!) 
{ } -class MurrtubeUserIE(MurrtubeIE): +class MurrtubeUserIE(MurrtubeIE): # XXX: Do not subclass from concrete IE IE_DESC = 'Murrtube user profile' _VALID_URL = r'https?://murrtube\.net/(?P<id>[^/]+)$' _TEST = { diff --git a/hypervideo_dl/extractor/musescore.py b/hypervideo_dl/extractor/musescore.py index 09fadf8..289ae57 100644 --- a/hypervideo_dl/extractor/musescore.py +++ b/hypervideo_dl/extractor/musescore.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/musicdex.py b/hypervideo_dl/extractor/musicdex.py index 05f7220..48f2970 100644 --- a/hypervideo_dl/extractor/musicdex.py +++ b/hypervideo_dl/extractor/musicdex.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( date_from_str, @@ -100,7 +97,7 @@ class MusicdexAlbumIE(MusicdexBaseIE): } -class MusicdexPageIE(MusicdexBaseIE): +class MusicdexPageIE(MusicdexBaseIE): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor def _entries(self, id): next_page_url = self._API_URL % id while next_page_url: diff --git a/hypervideo_dl/extractor/mwave.py b/hypervideo_dl/extractor/mwave.py index a672765..efbfd9d 100644 --- a/hypervideo_dl/extractor/mwave.py +++ b/hypervideo_dl/extractor/mwave.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -49,7 +47,6 @@ class MwaveIE(InfoExtractor): continue formats.extend( self._extract_f4m_formats(f4m_url + '&hdcore=3.0.3', video_id, f4m_id=stream_name)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/mxplayer.py b/hypervideo_dl/extractor/mxplayer.py index 3c2afd8..1fdb08e 100644 --- a/hypervideo_dl/extractor/mxplayer.py +++ b/hypervideo_dl/extractor/mxplayer.py @@ -1,9 +1,11 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str -from ..utils import try_get +from ..utils import ( + int_or_none, + traverse_obj, + try_get, + urljoin, +) class MxplayerIE(InfoExtractor): @@ -12,6 +14,7 @@ class MxplayerIE(InfoExtractor): 'url': 'https://www.mxplayer.in/show/watch-my-girlfriend-is-an-alien-hindi-dubbed/season-1/episode-1-online-9d2013d31d5835bb8400e3b3c5e7bb72', 'info_dict': { 'id': '9d2013d31d5835bb8400e3b3c5e7bb72', + 'display_id': 'episode-1-online', 'ext': 'mp4', 'title': 'Episode 1', 'description': 'md5:62ed43eb9fec5efde5cf3bd1040b7670', @@ -20,7 +23,6 @@ class MxplayerIE(InfoExtractor): 'duration': 2451, 'season': 'Season 1', 'series': 'My Girlfriend Is An Alien (Hindi Dubbed)', - 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/9d2013d31d5835bb8400e3b3c5e7bb72/en/16x9/320x180/9562f5f8df42cad09c9a9c4e69eb1567_1920x1080.webp', 'episode': 'Episode 1' }, 'params': { @@ -31,21 +33,17 @@ class MxplayerIE(InfoExtractor): 'url': 'https://www.mxplayer.in/movie/watch-knock-knock-hindi-dubbed-movie-online-b9fa28df3bfb8758874735bbd7d2655a?watch=true', 'info_dict': { 'id': 'b9fa28df3bfb8758874735bbd7d2655a', + 'display_id': 'episode-1-online', 'ext': 'mp4', 'title': 'Knock Knock (Hindi Dubbed)', - 'description': 'md5:b195ba93ff1987309cfa58e2839d2a5b', - 'season_number': 0, - 'episode_number': 0, + 'description': 'md5:4160f2dfc3b87c524261366f6b736329', 'duration': 5970, - 'season': 'Season 0', - 'series': None, - 'thumbnail': 
'https://qqcdnpictest.mxplay.com/pic/b9fa28df3bfb8758874735bbd7d2655a/en/16x9/320x180/test_pic1588676032011.webp', - 'episode': 'Episode 0' }, 'params': { 'format': 'bv', 'skip_download': True, }, + 'skip': 'No longer available', }, { 'url': 'https://www.mxplayer.in/show/watch-shaitaan/season-1/the-infamous-taxi-gang-of-meerut-online-45055d5bcff169ad48f2ad7552a83d6c', 'info_dict': { @@ -58,26 +56,26 @@ class MxplayerIE(InfoExtractor): 'duration': 2332, 'season': 'Season 1', 'series': 'Shaitaan', - 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/45055d5bcff169ad48f2ad7552a83d6c/en/16x9/320x180/voot_8e7d5f8d8183340869279c732c1e3a43.webp', 'episode': 'Episode 1' }, 'params': { 'format': 'best', 'skip_download': True, }, + 'skip': 'No longer available.' }, { 'url': 'https://www.mxplayer.in/show/watch-aashram/chapter-1/duh-swapna-online-d445579792b0135598ba1bc9088a84cb', 'info_dict': { 'id': 'd445579792b0135598ba1bc9088a84cb', + 'display_id': 'duh-swapna-online', 'ext': 'mp4', 'title': 'Duh Swapna', 'description': 'md5:35ff39c4bdac403c53be1e16a04192d8', 'season_number': 1, 'episode_number': 3, 'duration': 2568, - 'season': 'Chapter 1', + 'season': 'Season 1', 'series': 'Aashram', - 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/d445579792b0135598ba1bc9088a84cb/en/4x3/1600x1200/test_pic1624819307993.webp', 'episode': 'Episode 3' }, 'params': { @@ -88,6 +86,7 @@ class MxplayerIE(InfoExtractor): 'url': 'https://www.mxplayer.in/show/watch-dangerous/season-1/chapter-1-online-5a351b4f9fb69436f6bd6ae3a1a75292', 'info_dict': { 'id': '5a351b4f9fb69436f6bd6ae3a1a75292', + 'display_id': 'chapter-1-online', 'ext': 'mp4', 'title': 'Chapter 1', 'description': 'md5:233886b8598bc91648ac098abe1d288f', @@ -96,7 +95,6 @@ class MxplayerIE(InfoExtractor): 'duration': 1305, 'season': 'Season 1', 'series': 'Dangerous', - 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/5a351b4f9fb69436f6bd6ae3a1a75292/en/4x3/1600x1200/test_pic1624706302350.webp', 'episode': 'Episode 1' }, 'params': { @@ -110,72 +108,93 @@ class MxplayerIE(InfoExtractor): 'ext': 'mp4', 'title': 'The Attacks of 26/11', 'description': 'md5:689bacd29e97b3f31eaf519eb14127e5', - 'season_number': 0, - 'episode_number': 0, 'duration': 6085, - 'season': 'Season 0', - 'series': None, - 'thumbnail': 'https://qqcdnpictest.mxplay.com/pic/0452f0d80226c398d63ce7e3ea40fa2d/en/16x9/320x180/00c8955dab5e5d340dbde643f9b1f6fd_1920x1080.webp', - 'episode': 'Episode 0' }, 'params': { 'format': 'best', 'skip_download': True, }, + 'skip': 'No longer available. 
Cannot be played on browser' + }, { + 'url': 'https://www.mxplayer.in/movie/watch-kitne-door-kitne-paas-movie-online-a9e9c76c566205955f70d8b2cb88a6a2', + 'info_dict': { + 'id': 'a9e9c76c566205955f70d8b2cb88a6a2', + 'display_id': 'watch-kitne-door-kitne-paas-movie-online', + 'title': 'Kitne Door Kitne Paas', + 'duration': 8458, + 'ext': 'mp4', + 'description': 'md5:fb825f3c542513088024dcafef0921b4', + }, + 'params': { + 'format': 'bv', + 'skip_download': True, + }, + }, { + 'url': 'https://www.mxplayer.in/show/watch-ek-thi-begum-hindi/season-2/game-of-power-online-5e5305c28f1409847cdc4520b6ad77cf', + 'info_dict': { + 'id': '5e5305c28f1409847cdc4520b6ad77cf', + 'display_id': 'game-of-power-online', + 'title': 'Game Of Power', + 'duration': 1845, + 'ext': 'mp4', + 'description': 'md5:1d0948d2a5312d7013792d53542407f9', + 'series': 'Ek Thi Begum (Hindi)', + 'season': 'Season 2', + 'season_number': 2, + 'episode': 'Episode 2', + 'episode_number': 2, + }, + 'params': { + 'format': 'bv', + 'skip_download': True, + }, + }, { + 'url': 'https://www.mxplayer.in/movie/watch-deewane-huye-paagal-movie-online-4f9175c40a11c3994182a65afdd37ec6?watch=true', + 'info_dict': { + 'id': '4f9175c40a11c3994182a65afdd37ec6', + 'display_id': 'watch-deewane-huye-paagal-movie-online', + 'title': 'Deewane Huye Paagal', + 'duration': 9037, + 'ext': 'mp4', + 'description': 'md5:d17bd5c651016c4ed2e6f8a4ace15534', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): - type, display_id, video_id = self._match_valid_url(url).groups() - type = 'movie_film' if type == 'movie' else 'tvshow_episode' - API_URL = 'https://androidapi.mxplay.com/v1/detail/' - headers = { - 'X-Av-Code': '23', - 'X-Country': 'IN', - 'X-Platform': 'android', - 'X-App-Version': '1370001318', - 'X-Resolution': '3840x2160', - } - data_json = self._download_json(f'{API_URL}{type}/{video_id}', display_id, headers=headers)['profile'] + video_type, display_id, video_id = self._match_valid_url(url).group('type', 'display_id', 'id') + if 'show' in video_type: + video_type = 'episode' - season, series = None, None - for dct in data_json.get('levelInfos', []): - if dct.get('type') == 'tvshow_season': - season = dct.get('name') - elif dct.get('type') == 'tvshow_show': - series = dct.get('name') - thumbnails = [] - for thumb in data_json.get('poster', []): - thumbnails.append({ - 'url': thumb.get('url'), - 'width': thumb.get('width'), - 'height': thumb.get('height'), - }) + data_json = self._download_json( + f'https://api.mxplay.com/v1/web/detail/video?type={video_type}&id={video_id}', display_id) - formats = [] - subtitles = {} - for dct in data_json.get('playInfo', []): - if dct.get('extension') == 'mpd': - frmt, subs = self._extract_mpd_formats_and_subtitles(dct.get('playUrl'), display_id, fatal=False) - formats.extend(frmt) - subtitles = self._merge_subtitles(subtitles, subs) - elif dct.get('extension') == 'm3u8': - frmt, subs = self._extract_m3u8_formats_and_subtitles(dct.get('playUrl'), display_id, fatal=False) - formats.extend(frmt) - subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats) + formats, subtitles = [], {} + m3u8_url = urljoin('https://llvod.mxplay.com/', traverse_obj( + data_json, ('stream', (('thirdParty', 'hlsUrl'), ('hls', 'high'))), get_all=False)) + if m3u8_url: + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, display_id, 'mp4', fatal=False) + mpd_url = urljoin('https://llvod.mxplay.com/', traverse_obj( + data_json, ('stream', (('thirdParty', 'dashUrl'), ('dash', 
'high'))), get_all=False)) + if mpd_url: + fmts, subs = self._extract_mpd_formats_and_subtitles(mpd_url, display_id, fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + season = traverse_obj(data_json, ('container', 'title')) return { 'id': video_id, + 'title': data_json.get('title'), + 'formats': formats, + 'subtitles': subtitles, 'display_id': display_id, - 'title': data_json.get('name') or display_id, - 'description': data_json.get('description'), - 'season_number': data_json.get('seasonNum'), - 'episode_number': data_json.get('episodeNum'), 'duration': data_json.get('duration'), + 'series': traverse_obj(data_json, ('container', 'container', 'title')), + 'description': data_json.get('description'), 'season': season, - 'series': series, - 'thumbnails': thumbnails, - 'formats': formats, - 'subtitles': subtitles, + 'season_number': int_or_none( + self._search_regex(r'Season (\d+)', season, 'Season Number', default=None)), + 'episode_number': data_json.get('sequence') or None, } diff --git a/hypervideo_dl/extractor/mychannels.py b/hypervideo_dl/extractor/mychannels.py index d820d4e..8a70c1f 100644 --- a/hypervideo_dl/extractor/mychannels.py +++ b/hypervideo_dl/extractor/mychannels.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/myspace.py b/hypervideo_dl/extractor/myspace.py index 4227d42..3451098 100644 --- a/hypervideo_dl/extractor/myspace.py +++ b/hypervideo_dl/extractor/myspace.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -125,7 +122,6 @@ class MySpaceIE(InfoExtractor): else: raise ExtractorError( 'Found song but don\'t know how to download it') - self._sort_formats(formats) return { 'id': video_id, 'title': self._og_search_title(webpage), @@ -143,7 +139,6 @@ class MySpaceIE(InfoExtractor): video.get('streamUrl'), video.get('hlsStreamUrl'), video.get('mp4StreamUrl'), int_or_none(video.get('width')), int_or_none(video.get('height'))) - self._sort_formats(formats) return { 'id': video_id, 'title': video['title'], diff --git a/hypervideo_dl/extractor/myspass.py b/hypervideo_dl/extractor/myspass.py index 1775d5f..28ac982 100644 --- a/hypervideo_dl/extractor/myspass.py +++ b/hypervideo_dl/extractor/myspass.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/myvi.py b/hypervideo_dl/extractor/myvi.py index 75d2863..df7200b 100644 --- a/hypervideo_dl/extractor/myvi.py +++ b/hypervideo_dl/extractor/myvi.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from .vimple import SprutoBaseIE @@ -29,6 +24,7 @@ class MyviIE(SprutoBaseIE): ) (?P<id>[\da-zA-Z_-]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1'] _TESTS = [{ 'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', 'md5': '571bbdfba9f9ed229dc6d34cc0f335bf', @@ -59,13 +55,6 @@ class MyviIE(SprutoBaseIE): 'only_matching': True, }] - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//myvi\.(?:ru/player|tv)/(?:embed/html|flash)/[^"]+)\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, 
url): video_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/myvideoge.py b/hypervideo_dl/extractor/myvideoge.py index 0a1d7d0..513d4cb 100644 --- a/hypervideo_dl/extractor/myvideoge.py +++ b/hypervideo_dl/extractor/myvideoge.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import js_to_json diff --git a/hypervideo_dl/extractor/myvidster.py b/hypervideo_dl/extractor/myvidster.py index 2117d30..c91f294 100644 --- a/hypervideo_dl/extractor/myvidster.py +++ b/hypervideo_dl/extractor/myvidster.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/n1.py b/hypervideo_dl/extractor/n1.py index fdb7f32..55345f3 100644 --- a/hypervideo_dl/extractor/n1.py +++ b/hypervideo_dl/extractor/n1.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -27,8 +24,6 @@ class N1InfoAssetIE(InfoExtractor): formats = self._extract_m3u8_formats( url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - self._sort_formats(formats) - return { 'id': video_id, 'title': video_id, diff --git a/hypervideo_dl/extractor/nate.py b/hypervideo_dl/extractor/nate.py index 072faf6..5e74caa 100644 --- a/hypervideo_dl/extractor/nate.py +++ b/hypervideo_dl/extractor/nate.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -71,7 +68,6 @@ class NateIE(InfoExtractor): 'height': self._QUALITY.get(f_url[-2:]), 'quality': int_or_none(f_url[-2:]), } for f_url in video_data.get('smcUriList') or []] - self._sort_formats(formats) return { 'id': id, 'title': video_data.get('clipTitle'), diff --git a/hypervideo_dl/extractor/nationalgeographic.py b/hypervideo_dl/extractor/nationalgeographic.py index ee12e2b..ad525c2 100644 --- a/hypervideo_dl/extractor/nationalgeographic.py +++ b/hypervideo_dl/extractor/nationalgeographic.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from .fox import FOXIE from ..utils import ( @@ -61,7 +59,7 @@ class NationalGeographicVideoIE(InfoExtractor): } -class NationalGeographicTVIE(FOXIE): +class NationalGeographicTVIE(FOXIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?nationalgeographic\.com/tv/watch/(?P<id>[\da-fA-F]+)' _TESTS = [{ 'url': 'https://www.nationalgeographic.com/tv/watch/6a875e6e734b479beda26438c9f21138/', diff --git a/hypervideo_dl/extractor/naver.py b/hypervideo_dl/extractor/naver.py index a6821ba..e2e6e97 100644 --- a/hypervideo_dl/extractor/naver.py +++ b/hypervideo_dl/extractor/naver.py @@ -1,16 +1,19 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import itertools import re +from urllib.parse import urlparse, parse_qs from .common import InfoExtractor from ..utils import ( + ExtractorError, clean_html, dict_get, - ExtractorError, int_or_none, + join_nonempty, + merge_dicts, parse_duration, + traverse_obj, try_get, + unified_timestamp, update_url_query, ) @@ -65,19 +68,16 @@ class NaverBaseIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( update_url_query(stream_url, query), video_id, 'mp4', 'm3u8_native', m3u8_id=stream_type, fatal=False)) - self._sort_formats(formats) replace_ext = lambda x, y: re.sub(self._CAPTION_EXT_RE, '.' 
+ y, x) def get_subs(caption_url): if re.search(self._CAPTION_EXT_RE, caption_url): - return [{ - 'url': replace_ext(caption_url, 'ttml'), - }, { - 'url': replace_ext(caption_url, 'vtt'), - }] - else: - return [{'url': caption_url}] + return [ + replace_ext(caption_url, 'ttml'), + replace_ext(caption_url, 'vtt'), + ] + return [caption_url] automatic_captions = {} subtitles = {} @@ -86,7 +86,13 @@ class NaverBaseIE(InfoExtractor): if not caption_url: continue sub_dict = automatic_captions if caption.get('type') == 'auto' else subtitles - sub_dict.setdefault(dict_get(caption, ('locale', 'language')), []).extend(get_subs(caption_url)) + lang = caption.get('locale') or join_nonempty('language', 'country', from_dict=caption) or 'und' + if caption.get('type') == 'fan': + lang += '_fan%d' % next(i for i in itertools.count(1) if f'{lang}_fan{i}' not in sub_dict) + sub_dict.setdefault(lang, []).extend({ + 'url': sub_url, + 'name': join_nonempty('label', 'fanName', from_dict=caption, delim=' - '), + } for sub_url in get_subs(caption_url)) user = meta.get('user', {}) @@ -237,7 +243,6 @@ class NaverLiveIE(InfoExtractor): quality.get('url'), video_id, 'mp4', m3u8_id=quality.get('qualityId'), live=True )) - self._sort_formats(formats) return { 'id': video_id, @@ -250,3 +255,142 @@ class NaverLiveIE(InfoExtractor): 'categories': [meta.get('categoryId')], 'is_live': True } + + +class NaverNowIE(NaverBaseIE): + IE_NAME = 'navernow' + _VALID_URL = r'https?://now\.naver\.com/s/now\.(?P<id>\w+)' + _API_URL = 'https://apis.naver.com/now_web/oldnow_web/v4' + _TESTS = [{ + 'url': 'https://now.naver.com/s/now.4759?shareReplayId=26331132#replay=', + 'md5': 'e05854162c21c221481de16b2944a0bc', + 'info_dict': { + 'id': '4759-26331132', + 'title': '아이키X노제\r\n💖꽁냥꽁냥💖(1)', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1650369600, + 'upload_date': '20220419', + 'uploader_id': 'now', + 'view_count': int, + 'uploader_url': 'https://now.naver.com/show/4759', + 'uploader': '아이키의 떰즈업', + }, + 'params': { + 'noplaylist': True, + } + }, { + 'url': 'https://now.naver.com/s/now.4759?shareHightlight=26601461#highlight=', + 'md5': '9f6118e398aa0f22b2152f554ea7851b', + 'info_dict': { + 'id': '4759-26601461', + 'title': '아이키: 나 리정한테 흔들렸어,,, 질투 폭발하는 노제 여보😾 [아이키의 떰즈업]ㅣ네이버 NOW.', + 'ext': 'mp4', + 'thumbnail': r're:^https?://.*\.jpg', + 'upload_date': '20220504', + 'timestamp': 1651648311, + 'uploader_id': 'now', + 'view_count': int, + 'uploader_url': 'https://now.naver.com/show/4759', + 'uploader': '아이키의 떰즈업', + }, + 'params': { + 'noplaylist': True, + }, + }, { + 'url': 'https://now.naver.com/s/now.4759', + 'info_dict': { + 'id': '4759', + 'title': '아이키의 떰즈업', + }, + 'playlist_mincount': 101 + }, { + 'url': 'https://now.naver.com/s/now.4759?shareReplayId=26331132#replay', + 'info_dict': { + 'id': '4759', + 'title': '아이키의 떰즈업', + }, + 'playlist_mincount': 101, + }, { + 'url': 'https://now.naver.com/s/now.4759?shareHightlight=26601461#highlight=', + 'info_dict': { + 'id': '4759', + 'title': '아이키의 떰즈업', + }, + 'playlist_mincount': 101, + }, { + 'url': 'https://now.naver.com/s/now.kihyunplay?shareReplayId=30573291#replay', + 'only_matching': True, + }] + + def _extract_replay(self, show_id, replay_id): + vod_info = self._download_json(f'{self._API_URL}/shows/now.{show_id}/vod/{replay_id}', replay_id) + in_key = self._download_json(f'{self._API_URL}/shows/now.{show_id}/vod/{replay_id}/inkey', replay_id)['inKey'] + return merge_dicts({ + 'id': f'{show_id}-{replay_id}', + 'title': traverse_obj(vod_info, ('episode', 
'title')), + 'timestamp': unified_timestamp(traverse_obj(vod_info, ('episode', 'start_time'))), + 'thumbnail': vod_info.get('thumbnail_image_url'), + }, self._extract_video_info(replay_id, vod_info['video_id'], in_key)) + + def _extract_show_replays(self, show_id): + page_size = 15 + page = 1 + while True: + show_vod_info = self._download_json( + f'{self._API_URL}/vod-shows/now.{show_id}', show_id, + query={'page': page, 'page_size': page_size}, + note=f'Downloading JSON vod list for show {show_id} - page {page}' + )['response']['result'] + for v in show_vod_info.get('vod_list') or []: + yield self._extract_replay(show_id, v['id']) + + if len(show_vod_info.get('vod_list') or []) < page_size: + break + page += 1 + + def _extract_show_highlights(self, show_id, highlight_id=None): + page_size = 10 + page = 1 + while True: + highlights_videos = self._download_json( + f'{self._API_URL}/shows/now.{show_id}/highlights/videos/', show_id, + query={'page': page, 'page_size': page_size}, + note=f'Downloading JSON highlights for show {show_id} - page {page}') + + for highlight in highlights_videos.get('results') or []: + if highlight_id and highlight.get('clip_no') != int(highlight_id): + continue + yield merge_dicts({ + 'id': f'{show_id}-{highlight["clip_no"]}', + 'title': highlight.get('title'), + 'timestamp': unified_timestamp(highlight.get('regdate')), + 'thumbnail': highlight.get('thumbnail_url'), + }, self._extract_video_info(highlight['clip_no'], highlight['video_id'], highlight['video_inkey'])) + + if len(highlights_videos.get('results') or []) < page_size: + break + page += 1 + + def _extract_highlight(self, show_id, highlight_id): + try: + return next(self._extract_show_highlights(show_id, highlight_id)) + except StopIteration: + raise ExtractorError(f'Unable to find highlight {highlight_id} for show {show_id}') + + def _real_extract(self, url): + show_id = self._match_id(url) + qs = parse_qs(urlparse(url).query) + + if not self._yes_playlist(show_id, qs.get('shareHightlight')): + return self._extract_highlight(show_id, qs['shareHightlight'][0]) + elif not self._yes_playlist(show_id, qs.get('shareReplayId')): + return self._extract_replay(show_id, qs['shareReplayId'][0]) + + show_info = self._download_json( + f'{self._API_URL}/shows/now.{show_id}/', show_id, + note=f'Downloading JSON vod list for show {show_id}') + + return self.playlist_result( + itertools.chain(self._extract_show_replays(show_id), self._extract_show_highlights(show_id)), + show_id, show_info.get('title')) diff --git a/hypervideo_dl/extractor/nba.py b/hypervideo_dl/extractor/nba.py index 359cc52..d8fc824 100644 --- a/hypervideo_dl/extractor/nba.py +++ b/hypervideo_dl/extractor/nba.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import functools import re @@ -94,7 +92,6 @@ class NBAWatchBaseIE(NBACVPBaseIE): formats.extend(cvp_info['formats']) info = merge_dicts(info, cvp_info) - self._sort_formats(formats) info['formats'] = formats return info @@ -320,7 +317,6 @@ class NBABaseIE(NBACVPBaseIE): subtitles = self._merge_subtitles(subtitles, cvp_info['subtitles']) info = merge_dicts(info, cvp_info) - self._sort_formats(formats) else: info.update(self._embed_url_result(team, video['videoId'])) diff --git a/hypervideo_dl/extractor/nbc.py b/hypervideo_dl/extractor/nbc.py index 1094034..1ea6355 100644 --- a/hypervideo_dl/extractor/nbc.py +++ b/hypervideo_dl/extractor/nbc.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import base64 import json import re @@ -9,18 +7,24 @@ from .theplatform import 
ThePlatformIE from .adobepass import AdobePassIE from ..compat import compat_urllib_parse_unquote from ..utils import ( + ExtractorError, int_or_none, parse_age_limit, parse_duration, RegexNotFoundError, smuggle_url, + str_or_none, + traverse_obj, try_get, + unified_strdate, unified_timestamp, update_url_query, + url_basename, + variadic, ) -class NBCIE(ThePlatformIE): +class NBCIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))' _TESTS = [ @@ -186,6 +190,7 @@ class NBCIE(ThePlatformIE): class NBCSportsVPlayerIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:vplayer\.nbcsports\.com|(?:www\.)?nbcsports\.com/vplayer)/' _VALID_URL = _VALID_URL_BASE + r'(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)' + _EMBED_REGEX = [r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % _VALID_URL_BASE] _TESTS = [{ 'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/9CsDKds0kvHI', @@ -209,13 +214,6 @@ class NBCSportsVPlayerIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - video_urls = re.search( - r'(?:iframe[^>]+|var video|div[^>]+data-(?:mpx-)?)[sS]rc\s?=\s?"(?P<url>%s[^\"]+)' % NBCSportsVPlayerIE._VALID_URL_BASE, webpage) - if video_urls: - return video_urls.group('url') - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -307,7 +305,6 @@ class NBCSportsStreamIE(AdobePassIE): 'resourceId': base64.b64encode(resource.encode()).decode(), }).encode())['tokenizedUrl'] formats = self._extract_m3u8_formats(tokenized_url, video_id, 'mp4') - self._sort_formats(formats) return { 'id': video_id, 'title': title, @@ -317,8 +314,9 @@ class NBCSportsStreamIE(AdobePassIE): } -class NBCNewsIE(ThePlatformIE): +class NBCNewsIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'(?x)https?://(?:www\.)?(?:nbcnews|today|msnbc)\.com/([^/]+/)*(?:.*-)?(?P<id>[^/?]+)' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//www\.nbcnews\.com/widget/video-embed/[^"\']+)\1'] _TESTS = [ { @@ -438,7 +436,6 @@ class NBCNewsIE(ThePlatformIE): 'tbr': tbr, 'ext': 'mp4', }) - self._sort_formats(formats) subtitles = {} closed_captioning = video_data.get('closedCaptioning') @@ -581,8 +578,7 @@ class NBCOlympicsStreamIE(AdobePassIE): for f in formats: # -http_seekable requires ffmpeg 4.3+ but it doesnt seem possible to # download with ffmpeg without this option - f['_ffmpeg_args'] = ['-seekable', '0', '-http_seekable', '0', '-icy', '0'] - self._sort_formats(formats) + f['downloader_options'] = {'ffmpeg_args': ['-seekable', '0', '-http_seekable', '0', '-icy', '0']} return { 'id': pid, @@ -591,3 +587,168 @@ class NBCOlympicsStreamIE(AdobePassIE): 'formats': formats, 'is_live': is_live, } + + +class NBCStationsIE(InfoExtractor): + _DOMAIN_RE = '|'.join(map(re.escape, ( + 'nbcbayarea', 'nbcboston', 'nbcchicago', 'nbcconnecticut', 'nbcdfw', 'nbclosangeles', + 'nbcmiami', 'nbcnewyork', 'nbcphiladelphia', 'nbcsandiego', 'nbcwashington', + 'necn', 'telemundo52', 'telemundoarizona', 'telemundochicago', 'telemundonuevainglaterra', + ))) + _VALID_URL = rf'https?://(?:www\.)?(?P<site>{_DOMAIN_RE})\.com/(?:[^/?#]+/)*(?P<id>[^/?#]+)/?(?:$|[#?])' + + _TESTS = [{ + 'url': 'https://www.nbclosangeles.com/news/local/large-structure-fire-in-downtown-la-prompts-smoke-odor-advisory/2968618/', + 'md5': '462041d91bd762ef5a38b7d85d6dc18f', + 'info_dict': { + 'id': '2968618', + 'ext': 'mp4', + 
'title': 'Large Structure Fire in Downtown LA Prompts Smoke Odor Advisory', + 'description': None, + 'timestamp': 1661135892, + 'upload_date': '20220821', + 'uploader': 'NBC 4', + 'uploader_id': 'KNBC', + 'channel': 'nbclosangeles', + }, + }, { + 'url': 'https://www.telemundoarizona.com/responde/huracan-complica-reembolso-para-televidente-de-tucson/2247002/', + 'md5': '0917dcf7885be1023a9220630d415f67', + 'info_dict': { + 'id': '2247002', + 'ext': 'mp4', + 'title': 'Huracán complica que televidente de Tucson reciba reembolso', + 'description': 'md5:af298dc73aab74d4fca6abfb12acb6cf', + 'timestamp': 1660886507, + 'upload_date': '20220819', + 'uploader': 'Telemundo Arizona', + 'uploader_id': 'KTAZ', + 'channel': 'telemundoarizona', + }, + }] + + _RESOLUTIONS = { + '1080': '1920', + '720': '1280', + '540': '960', + '360': '640', + '234': '416', + } + + def _real_extract(self, url): + channel, video_id = self._match_valid_url(url).group('site', 'id') + webpage = self._download_webpage(url, video_id) + + nbc_data = self._search_json( + r'<script>var\s*nbc\s*=', webpage, 'NBC JSON data', video_id) + pdk_acct = nbc_data.get('pdkAcct') or 'Yh1nAC' + fw_ssid = traverse_obj(nbc_data, ('video', 'fwSSID')) + fw_network_id = traverse_obj(nbc_data, ('video', 'fwNetworkID'), default='382114') + + video_data = self._parse_json(self._html_search_regex( + r'data-videos="([^"]*)"', webpage, 'video data', default='{}'), video_id) + video_data = variadic(video_data)[0] + video_data.update(self._parse_json(self._html_search_regex( + r'data-meta="([^"]*)"', webpage, 'metadata', default='{}'), video_id)) + + formats = [] + + if video_data.get('mpx_is_livestream') == '1': + live = True + player_id = traverse_obj( + video_data, 'mpx_m3upid', ('video', 'meta', 'mpx_m3upid'), 'mpx_pid', + ('video', 'meta', 'mpx_pid'), 'pid_streaming_web_medium') + query = { + 'mbr': 'true', + 'assetTypes': 'LegacyRelease', + 'fwsitesection': fw_ssid, + 'fwNetworkID': fw_network_id, + 'pprofile': 'ots_desktop_html', + 'sensitive': 'false', + 'w': '1920', + 'h': '1080', + 'rnd': '1660303', + 'mode': 'LIVE', + 'format': 'SMIL', + 'tracking': 'true', + 'formats': 'M3U+none,MPEG-DASH+none,MPEG4,MP3', + 'vpaid': 'script', + 'schema': '2.0', + 'SDK': 'PDK+6.1.3', + } + info = { + 'title': f'{channel} livestream', + } + + else: + live = False + player_id = traverse_obj( + video_data, ('video', 'meta', 'pid_streaming_web_high'), 'pid_streaming_web_high', + ('video', 'meta', 'mpx_pid'), 'mpx_pid') + + date_string = traverse_obj(video_data, 'date_string', 'date_gmt') + if date_string: + date_string = self._search_regex( + r'datetime="([^"]+)"', date_string, 'date string', fatal=False) + else: + date_string = traverse_obj( + nbc_data, ('dataLayer', 'adobe', 'prop70'), ('dataLayer', 'adobe', 'eVar70'), + ('dataLayer', 'adobe', 'eVar59')) + + video_url = traverse_obj(video_data, ('video', 'meta', 'mp4_url'), 'mp4_url') + if video_url: + height = url_basename(video_url).split('-')[1].split('p')[0] + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'width': int_or_none(self._RESOLUTIONS.get(height)), + 'height': int_or_none(height), + 'format_id': f'http-{height}', + }) + + query = { + 'mbr': 'true', + 'assetTypes': 'LegacyRelease', + 'fwsitesection': fw_ssid, + 'fwNetworkID': fw_network_id, + 'format': 'redirect', + 'manifest': 'm3u', + 'Tracking': 'true', + 'Embedded': 'true', + 'formats': 'MPEG4', + } + info = { + 'title': video_data.get('title') or traverse_obj( + nbc_data, ('dataLayer', 'contenttitle'), ('dataLayer', 'title'), + 
('dataLayer', 'adobe', 'prop22'), ('dataLayer', 'id')), + 'description': traverse_obj(video_data, 'summary', 'excerpt', 'video_hero_text'), + 'upload_date': str_or_none(unified_strdate(date_string)), + 'timestamp': int_or_none(unified_timestamp(date_string)), + } + + if not player_id: + raise ExtractorError( + 'No video player ID or livestream player ID found in webpage', expected=True) + + headers = {'Origin': f'https://www.{channel}.com'} + manifest, urlh = self._download_webpage_handle( + f'https://link.theplatform.com/s/{pdk_acct}/{player_id}', video_id, + headers=headers, query=query, note='Downloading manifest') + if live: + manifest_url = self._search_regex(r'<video src="([^"]*)', manifest, 'manifest URL') + else: + manifest_url = urlh.geturl() + + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', headers=headers, m3u8_id='hls', + fatal=live, live=live, errnote='No HLS formats found')) + + return { + 'id': str_or_none(video_id), + 'channel': channel, + 'uploader': str_or_none(nbc_data.get('on_air_name')), + 'uploader_id': str_or_none(nbc_data.get('callLetters')), + 'formats': formats, + 'is_live': live, + **info, + } diff --git a/hypervideo_dl/extractor/ndr.py b/hypervideo_dl/extractor/ndr.py index 1917254..41ea362 100644 --- a/hypervideo_dl/extractor/ndr.py +++ b/hypervideo_dl/extractor/ndr.py @@ -1,14 +1,15 @@ -# coding: utf-8 -from __future__ import unicode_literals +import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_urlparse from ..utils import ( determine_ext, + ExtractorError, int_or_none, - parse_duration, + merge_dicts, + parse_iso8601, qualities, try_get, - unified_strdate, urljoin, ) @@ -17,120 +18,139 @@ class NDRBaseIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) display_id = next(group for group in mobj.groups() if group) - id = mobj.group('id') webpage = self._download_webpage(url, display_id) - return self._extract_embed(webpage, display_id, id) + return self._extract_embed(webpage, display_id, url) class NDRIE(NDRBaseIE): IE_NAME = 'ndr' IE_DESC = 'NDR.de - Norddeutscher Rundfunk' - _VALID_URL = r'https?://(?:www\.)?(?:daserste\.)?ndr\.de/(?:[^/]+/)*(?P<display_id>[^/?#]+),(?P<id>[\da-z]+)\.html' + _VALID_URL = r'https?://(?:\w+\.)*ndr\.de/(?:[^/]+/)*(?P<id>[^/?#]+),[\da-z]+\.html' _TESTS = [{ + # httpVideo, same content id 'url': 'http://www.ndr.de/fernsehen/Party-Poette-und-Parade,hafengeburtstag988.html', + 'md5': '6515bc255dc5c5f8c85bbc38e035a659', 'info_dict': { 'id': 'hafengeburtstag988', + 'display_id': 'Party-Poette-und-Parade', 'ext': 'mp4', 'title': 'Party, Pötte und Parade', - 'thumbnail': 'https://www.ndr.de/fernsehen/hafengeburtstag990_v-contentxl.jpg', 'description': 'md5:ad14f9d2f91d3040b6930c697e5f6b4c', - 'series': None, - 'channel': 'NDR Fernsehen', - 'upload_date': '20150508', + 'uploader': 'ndrtv', + 'timestamp': 1431255671, + 'upload_date': '20150510', 'duration': 3498, }, - }, { - 'url': 'https://www.ndr.de/sport/fussball/Rostocks-Matchwinner-Froede-Ein-Hansa-Debuet-wie-im-Maerchen,hansa10312.html', - 'only_matching': True - }, { - 'url': 'https://www.ndr.de/nachrichten/niedersachsen/kommunalwahl_niedersachsen_2021/Grosse-Parteien-zufrieden-mit-Ergebnissen-der-Kommunalwahl,kommunalwahl1296.html', - 'info_dict': { - 'id': 'kommunalwahl1296', - 'ext': 'mp4', - 'title': 'Die Spitzenrunde: Die Wahl aus Sicht der Landespolitik', - 'thumbnail': 'https://www.ndr.de/fernsehen/screenshot1194912_v-contentxl.jpg', - 'description': 
'md5:5c6e2ad744cef499135735a1036d7aa7', - 'series': 'Hallo Niedersachsen', - 'channel': 'NDR Fernsehen', - 'upload_date': '20210913', - 'duration': 438, + 'params': { + 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { - 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + # httpVideo, different content id + 'url': 'http://www.ndr.de/sport/fussball/40-Osnabrueck-spielt-sich-in-einen-Rausch,osna270.html', + 'md5': '1043ff203eab307f0c51702ec49e9a71', 'info_dict': { - 'id': 'sendung1091858', + 'id': 'osna272', + 'display_id': '40-Osnabrueck-spielt-sich-in-einen-Rausch', 'ext': 'mp4', - 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', - 'thumbnail': 'https://www.ndr.de/fernsehen/screenshot983938_v-contentxl.jpg', - 'description': 'md5:700f6de264010585012a72f97b0ac0c9', - 'series': 'extra 3', - 'channel': 'NDR Fernsehen', - 'upload_date': '20201111', - 'duration': 1749, - } + 'title': 'Osnabrück - Wehen Wiesbaden: Die Highlights', + 'description': 'md5:32e9b800b3d2d4008103752682d5dc01', + 'uploader': 'ndrtv', + 'timestamp': 1442059200, + 'upload_date': '20150912', + 'duration': 510, + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'No longer available', }, { + # httpAudio, same content id 'url': 'http://www.ndr.de/info/La-Valette-entgeht-der-Hinrichtung,audio51535.html', + 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', 'info_dict': { 'id': 'audio51535', + 'display_id': 'La-Valette-entgeht-der-Hinrichtung', 'ext': 'mp3', 'title': 'La Valette entgeht der Hinrichtung', - 'thumbnail': 'https://www.ndr.de/mediathek/mediathekbild140_v-podcast.jpg', 'description': 'md5:22f9541913a40fe50091d5cdd7c9f536', - 'upload_date': '20140729', - 'duration': 884.0, + 'uploader': 'ndrinfo', + 'timestamp': 1631711863, + 'upload_date': '20210915', + 'duration': 884, + }, + 'params': { + 'skip_download': True, + }, + }, { + # with subtitles + 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html', + 'info_dict': { + 'id': 'extra18674', + 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring', + 'ext': 'mp4', + 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring', + 'description': 'md5:700f6de264010585012a72f97b0ac0c9', + 'uploader': 'ndrtv', + 'upload_date': '20201207', + 'timestamp': 1614349457, + 'duration': 1749, + 'subtitles': { + 'de': [{ + 'ext': 'ttml', + 'url': r're:^https://www\.ndr\.de.+', + }], + }, }, - 'expected_warnings': ['unable to extract json url'], + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Unable to download f4m manifest'], + }, { + 'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html', + 'only_matching': True, }] - def _extract_embed(self, webpage, display_id, id): - formats = [] - base_url = 'https://www.ndr.de' - json_url = self._search_regex(r'<iframe[^>]+src=\"([^\"]+)_theme-ndrde[^\.]*\.html\"', webpage, - 'json url', fatal=False) - if json_url: - data_json = self._download_json(base_url + json_url.replace('ardplayer_image', 'ardjson_image') + '.json', - id, fatal=False) - info_json = data_json.get('_info', {}) - media_json = try_get(data_json, lambda x: x['_mediaArray'][0]['_mediaStreamArray']) - for media in media_json: - if media.get('_quality') == 'auto': - formats.extend(self._extract_m3u8_formats(media['_stream'], id)) - subtitles = {} - sub_url = data_json.get('_subtitleUrl') - if sub_url: - subtitles.setdefault('de', 
[]).append({ - 'url': base_url + sub_url, - }) - self._sort_formats(formats) - return { - 'id': id, - 'title': info_json.get('clipTitle'), - 'thumbnail': base_url + data_json.get('_previewImage'), - 'description': info_json.get('clipDescription'), - 'series': info_json.get('seriesTitle') or None, - 'channel': info_json.get('channelTitle'), - 'upload_date': unified_strdate(info_json.get('clipDate')), - 'duration': data_json.get('_duration'), - 'formats': formats, - 'subtitles': subtitles, - } - else: - json_url = base_url + self._search_regex(r'apiUrl\s?=\s?\'([^\']+)\'', webpage, 'json url').replace( - '_belongsToPodcast-', '') - data_json = self._download_json(json_url, id, fatal=False) - return { - 'id': id, - 'title': data_json.get('title'), - 'thumbnail': base_url + data_json.get('poster'), - 'description': data_json.get('summary'), - 'upload_date': unified_strdate(data_json.get('publicationDate')), - 'duration': parse_duration(data_json.get('duration')), - 'formats': [{ - 'url': try_get(data_json, (lambda x: x['audio'][0]['url'], lambda x: x['files'][0]['url'])), - 'vcodec': 'none', - 'ext': 'mp3', - }], - } + def _extract_embed(self, webpage, display_id, url): + embed_url = ( + self._html_search_meta( + 'embedURL', webpage, 'embed URL', + default=None) + or self._search_regex( + r'\bembedUrl["\']\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url', default=None) + or self._search_regex( + r'\bvar\s*sophoraID\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'embed URL', group='url', default='')) + # some more work needed if we only found sophoraID + if re.match(r'^[a-z]+\d+$', embed_url): + # get the initial part of the URL path, e.g. /panorama/archiv/2022/ + parsed_url = compat_urllib_parse_urlparse(url) + path = self._search_regex(r'(.+/)%s' % display_id, parsed_url.path or '', 'embed URL', default='') + # find tell-tale image with the actual ID + ndr_id = self._search_regex(r'%s([a-z]+\d+)(?!\.)\b' % (path, ), webpage, 'embed URL', default=None) + # or try to use special knowledge!
+ NDR_INFO_URL_TPL = 'https://www.ndr.de/info/%s-player.html' + embed_url = 'ndr:%s' % (ndr_id, ) if ndr_id else NDR_INFO_URL_TPL % (embed_url, ) + if not embed_url: + raise ExtractorError('Unable to extract embedUrl') + + description = self._search_regex( + r'<p[^>]+itemprop="description">([^<]+)</p>', + webpage, 'description', default=None) or self._og_search_description(webpage) + timestamp = parse_iso8601( + self._search_regex( + (r'<span[^>]+itemprop="(?:datePublished|uploadDate)"[^>]+content="(?P<cont>[^"]+)"', + r'\bvar\s*pdt\s*=\s*(?P<q>["\'])(?P<cont>(?:(?!(?P=q)).)+)(?P=q)', ), + webpage, 'upload date', group='cont', default=None)) + info = self._search_json_ld(webpage, display_id, default={}) + return merge_dicts({ + '_type': 'url_transparent', + 'url': embed_url, + 'display_id': display_id, + 'description': description, + 'timestamp': timestamp, + }, info) class NJoyIE(NDRBaseIE): @@ -154,19 +174,19 @@ class NJoyIE(NDRBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpVideo, different content id 'url': 'http://www.n-joy.de/musik/Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-,felixjaehn168.html', 'md5': '417660fffa90e6df2fda19f1b40a64d8', 'info_dict': { - 'id': 'dockville882', + 'id': 'livestream283', 'display_id': 'Das-frueheste-DJ-Set-des-Nordens-live-mit-Felix-Jaehn-', - 'ext': 'mp4', - 'title': '"Ich hab noch nie" mit Felix Jaehn', - 'description': 'md5:85dd312d53be1b99e1f998a16452a2f3', + 'ext': 'mp3', + 'title': 'Das frueheste DJ Set des Nordens live mit Felix Jaehn', + 'description': 'md5:681698f527b8601e511e7b79edde7d2c', 'uploader': 'njoy', - 'upload_date': '20150822', - 'duration': 211, + 'upload_date': '20210830', }, 'params': { 'skip_download': True, @@ -176,22 +196,29 @@ class NJoyIE(NDRBaseIE): 'only_matching': True, }] - def _extract_embed(self, webpage, display_id, id): + def _extract_embed(self, webpage, display_id, url=None): + # find tell-tale URL with the actual ID, or ... 
video_id = self._search_regex( - r'<iframe[^>]+id="pp_([\da-z]+)"', webpage, 'embed id') - description = self._search_regex( - r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>', - webpage, 'description', fatal=False) + (r'''\bsrc\s*=\s*["']?(?:/\w+)+/([a-z]+\d+)(?!\.)\b''', + r'<iframe[^>]+id="pp_([\da-z]+)"', ), + webpage, 'NDR id', default=None) + + description = ( + self._html_search_meta('description', webpage) + or self._search_regex( + r'<div[^>]+class="subline"[^>]*>[^<]+</div>\s*<p>([^<]+)</p>', + webpage, 'description', fatal=False)) return { '_type': 'url_transparent', 'ie_key': 'NDREmbedBase', 'url': 'ndr:%s' % video_id, 'display_id': display_id, 'description': description, + 'title': display_id.replace('-', ' ').strip(), } -class NDREmbedBaseIE(InfoExtractor): +class NDREmbedBaseIE(InfoExtractor): # XXX: Conventionally, Concrete class names do not end in BaseIE IE_NAME = 'ndr:embed:base' _VALID_URL = r'(?:ndr:(?P<id_s>[\da-z]+)|https?://www\.ndr\.de/(?P<id>[\da-z]+)-ppjson\.json)' _TESTS = [{ @@ -239,7 +266,6 @@ class NDREmbedBaseIE(InfoExtractor): ff['vcodec'] = 'none' ff['ext'] = ext or 'mp3' formats.append(ff) - self._sort_formats(formats) config = playlist['config'] @@ -288,9 +314,9 @@ class NDREmbedBaseIE(InfoExtractor): } -class NDREmbedIE(NDREmbedBaseIE): +class NDREmbedIE(NDREmbedBaseIE): # XXX: Do not subclass from concrete IE IE_NAME = 'ndr:embed' - _VALID_URL = r'https?://(?:www\.)?(?:daserste\.)?ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)\.html' + _VALID_URL = r'https?://(?:\w+\.)*ndr\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:(?:ard)?player|externalPlayer)\.html' _TESTS = [{ 'url': 'http://www.ndr.de/fernsehen/sendungen/ndr_aktuell/ndraktuell28488-player.html', 'md5': '8b9306142fe65bbdefb5ce24edb6b0a9', @@ -303,6 +329,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'upload_date': '20150907', 'duration': 132, }, + 'skip': 'No longer available', }, { 'url': 'http://www.ndr.de/ndr2/events/soundcheck/soundcheck3366-player.html', 'md5': '002085c44bae38802d94ae5802a36e78', @@ -318,6 +345,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { 'url': 'http://www.ndr.de/info/audio51535-player.html', 'md5': 'bb3cd38e24fbcc866d13b50ca59307b8', @@ -327,7 +355,7 @@ class NDREmbedIE(NDREmbedBaseIE): 'title': 'La Valette entgeht der Hinrichtung', 'is_live': False, 'uploader': 'ndrinfo', - 'upload_date': '20140729', + 'upload_date': '20210915', 'duration': 884, }, 'params': { @@ -348,15 +376,17 @@ class NDREmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpVideoLive 'url': 'http://www.ndr.de/fernsehen/livestream/livestream217-externalPlayer.html', 'info_dict': { 'id': 'livestream217', - 'ext': 'flv', + 'ext': 'mp4', 'title': r're:^NDR Fernsehen Niedersachsen \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, - 'upload_date': '20150910', + 'upload_date': '20210409', + 'uploader': 'ndrtv', }, 'params': { 'skip_download': True, @@ -382,7 +412,7 @@ class NDREmbedIE(NDREmbedBaseIE): }] -class NJoyEmbedIE(NDREmbedBaseIE): +class NJoyEmbedIE(NDREmbedBaseIE): # XXX: Do not subclass from concrete IE IE_NAME = 'njoy:embed' _VALID_URL = r'https?://(?:www\.)?n-joy\.de/(?:[^/]+/)*(?P<id>[\da-z]+)-(?:player|externalPlayer)_[^/]+\.html' _TESTS = [{ @@ -394,9 +424,10 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'ext': 'mp4', 'title': 'Zehn Jahre Reeperbahn Festival - die Doku', 'is_live': False, - 'upload_date': '20150807', + 'upload_date': '20200826', 'duration': 1011, }, + 
'expected_warnings': ['Unable to download f4m manifest'], }, { # httpAudio 'url': 'http://www.n-joy.de/news_wissen/stefanrichter100-player_image-d5e938b1-f21a-4b9a-86b8-aaba8bca3a13_theme-n-joy.html', @@ -413,6 +444,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'params': { 'skip_download': True, }, + 'skip': 'No longer available', }, { # httpAudioLive, no explicit ext 'url': 'http://www.n-joy.de/news_wissen/webradioweltweit100-player_image-3fec0484-2244-4565-8fb8-ed25fd28b173_theme-n-joy.html', @@ -422,7 +454,7 @@ class NJoyEmbedIE(NDREmbedBaseIE): 'title': r're:^N-JOY Weltweit \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', 'is_live': True, 'uploader': 'njoy', - 'upload_date': '20150810', + 'upload_date': '20210830', }, 'params': { 'skip_download': True, diff --git a/hypervideo_dl/extractor/ndtv.py b/hypervideo_dl/extractor/ndtv.py index bc3eb91..bfe52f7 100644 --- a/hypervideo_dl/extractor/ndtv.py +++ b/hypervideo_dl/extractor/ndtv.py @@ -1,16 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote_plus -) -from ..utils import ( - parse_duration, - remove_end, - unified_strdate, - urljoin -) +from ..utils import parse_duration, remove_end, unified_strdate, urljoin class NDTVIE(InfoExtractor): @@ -83,7 +74,7 @@ class NDTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) # '__title' does not contain extra words such as sub-site name, "Video" etc. - title = compat_urllib_parse_unquote_plus( + title = urllib.parse.unquote_plus( self._search_regex(r"__title\s*=\s*'([^']+)'", webpage, 'title', default=None) or self._og_search_title(webpage)) diff --git a/hypervideo_dl/extractor/nebula.py b/hypervideo_dl/extractor/nebula.py index 77f2535..861fcb1 100644 --- a/hypervideo_dl/extractor/nebula.py +++ b/hypervideo_dl/extractor/nebula.py @@ -1,17 +1,13 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import json import time -import urllib +import urllib.error +import urllib.parse -from ..utils import ( - ExtractorError, - parse_iso8601, - try_get, -) from .common import InfoExtractor +from ..utils import ExtractorError, parse_iso8601, try_get + +_BASE_URL_RE = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app|nebula\.tv)' class NebulaBaseIE(InfoExtractor): @@ -21,9 +17,8 @@ class NebulaBaseIE(InfoExtractor): _nebula_bearer_token = None _zype_access_token = None - def _perform_nebula_auth(self): - username, password = self._get_login_info() - if not (username and password): + def _perform_nebula_auth(self, username, password): + if not username or not password: self.raise_login_required() data = json.dumps({'email': username, 'password': password}).encode('utf8') @@ -54,7 +49,7 @@ class NebulaBaseIE(InfoExtractor): return response['key'] - def _retrieve_nebula_api_token(self): + def _retrieve_nebula_api_token(self, username=None, password=None): """ Check cookie jar for valid token. Try to authenticate using credentials if no valid token can be found in the cookie jar. 
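The token flow that docstring describes is: reuse a token already persisted in the cookie jar, and only fall back to a credential login when none is found. A minimal standalone sketch of the pattern (the cookie name and helper functions here are illustrative stand-ins, not Nebula's actual ones):

    import http.cookiejar

    def retrieve_api_token(cookie_jar, username=None, password=None):
        # Prefer a token cached in the cookie jar by an earlier session
        token = next((c.value for c in cookie_jar if c.name == 'auth-token' and c.value), None)  # hypothetical cookie name
        if token:
            return token
        # No cached token: authenticate with credentials instead
        if not (username and password):
            raise PermissionError('login required: no cached token and no credentials given')
        return perform_auth(username, password)

    def perform_auth(username, password):
        # stand-in: the real login POSTs the credentials and returns response['key']
        return 'token-for-%s' % username

    print(retrieve_api_token(http.cookiejar.CookieJar(), 'user@example.com', 'hunter2'))

Passing username/password down into this helper, rather than re-reading them inside it, is exactly what the refactored _retrieve_nebula_api_token above does to resolve the old FIXME.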
@@ -68,7 +63,7 @@ class NebulaBaseIE(InfoExtractor): if nebula_api_token: return nebula_api_token - return self._perform_nebula_auth() + return self._perform_nebula_auth(username, password) def _call_nebula_api(self, url, video_id=None, method='GET', auth_type='api', note=''): assert method in ('GET', 'POST',) @@ -149,18 +144,17 @@ class NebulaBaseIE(InfoExtractor): } def _perform_login(self, username=None, password=None): - # FIXME: username should be passed from here to inner functions - self._nebula_api_token = self._retrieve_nebula_api_token() + self._nebula_api_token = self._retrieve_nebula_api_token(username, password) self._nebula_bearer_token = self._fetch_nebula_bearer_token() self._zype_access_token = self._fetch_zype_access_token() class NebulaIE(NebulaBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/videos/(?P<id>[-\w]+)' + _VALID_URL = rf'{_BASE_URL_RE}/videos/(?P<id>[-\w]+)' _TESTS = [ { 'url': 'https://nebula.app/videos/that-time-disney-remade-beauty-and-the-beast', - 'md5': 'fe79c4df8b3aa2fea98a93d027465c7e', + 'md5': '14944cfee8c7beeea106320c47560efc', 'info_dict': { 'id': '5c271b40b13fd613090034fd', 'ext': 'mp4', @@ -172,14 +166,21 @@ class NebulaIE(NebulaBaseIE): 'channel_id': 'lindsayellis', 'uploader': 'Lindsay Ellis', 'uploader_id': 'lindsayellis', - }, - 'params': { - 'usenetrc': True, + 'timestamp': 1533009600, + 'uploader_url': 'https://nebula.app/lindsayellis', + 'series': 'Lindsay Ellis', + 'average_rating': int, + 'display_id': 'that-time-disney-remade-beauty-and-the-beast', + 'channel_url': 'https://nebula.app/lindsayellis', + 'creator': 'Lindsay Ellis', + 'duration': 2212, + 'view_count': int, + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', }, }, { 'url': 'https://nebula.app/videos/the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', - 'md5': '6d4edd14ce65720fa63aba5c583fb328', + 'md5': 'd05739cf6c38c09322422f696b569c23', 'info_dict': { 'id': '5e7e78171aaf320001fbd6be', 'ext': 'mp4', @@ -191,14 +192,20 @@ class NebulaIE(NebulaBaseIE): 'channel_id': 'realengineering', 'uploader': 'Real Engineering', 'uploader_id': 'realengineering', - }, - 'params': { - 'usenetrc': True, + 'view_count': int, + 'series': 'Real Engineering', + 'average_rating': int, + 'display_id': 'the-logistics-of-d-day-landing-craft-how-the-allies-got-ashore', + 'creator': 'Real Engineering', + 'duration': 841, + 'channel_url': 'https://nebula.app/realengineering', + 'uploader_url': 'https://nebula.app/realengineering', + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', }, }, { 'url': 'https://nebula.app/videos/money-episode-1-the-draw', - 'md5': '8c7d272910eea320f6f8e6d3084eecf5', + 'md5': 'ebe28a7ad822b9ee172387d860487868', 'info_dict': { 'id': '5e779ebdd157bc0001d1c75a', 'ext': 'mp4', @@ -210,9 +217,15 @@ class NebulaIE(NebulaBaseIE): 'channel_id': 'tom-scott-presents-money', 'uploader': 'Tom Scott Presents: Money', 'uploader_id': 'tom-scott-presents-money', - }, - 'params': { - 'usenetrc': True, + 'uploader_url': 'https://nebula.app/tom-scott-presents-money', + 'duration': 825, + 'channel_url': 'https://nebula.app/tom-scott-presents-money', + 'view_count': int, + 'series': 'Tom Scott Presents: Money', + 'display_id': 'money-episode-1-the-draw', + 'thumbnail': r're:https://\w+\.cloudfront\.net/[\w-]+\.jpeg?.*', + 'average_rating': int, + 'creator': 'Tom Scott Presents: Money', }, }, { @@ -233,9 +246,37 @@ class NebulaIE(NebulaBaseIE): return self._build_video_info(video) -class NebulaCollectionIE(NebulaBaseIE): - 
IE_NAME = 'nebula:collection' - _VALID_URL = r'https?://(?:www\.)?(?:watchnebula\.com|nebula\.app)/(?!videos/)(?P<id>[-\w]+)' +class NebulaSubscriptionsIE(NebulaBaseIE): + IE_NAME = 'nebula:subscriptions' + _VALID_URL = rf'{_BASE_URL_RE}/myshows' + _TESTS = [ + { + 'url': 'https://nebula.app/myshows', + 'playlist_mincount': 1, + 'info_dict': { + 'id': 'myshows', + }, + }, + ] + + def _generate_playlist_entries(self): + next_url = 'https://content.watchnebula.com/library/video/?page_size=100' + page_num = 1 + while next_url: + channel = self._call_nebula_api(next_url, 'myshows', auth_type='bearer', + note=f'Retrieving subscriptions page {page_num}') + for episode in channel['results']: + yield self._build_video_info(episode) + next_url = channel['next'] + page_num += 1 + + def _real_extract(self, url): + return self.playlist_result(self._generate_playlist_entries(), 'myshows') + + +class NebulaChannelIE(NebulaBaseIE): + IE_NAME = 'nebula:channel' + _VALID_URL = rf'{_BASE_URL_RE}/(?!myshows|videos/)(?P<id>[-\w]+)' _TESTS = [ { 'url': 'https://nebula.app/tom-scott-presents-money', @@ -245,9 +286,6 @@ class NebulaCollectionIE(NebulaBaseIE): 'description': 'Tom Scott hosts a series all about trust, negotiation and money.', }, 'playlist_count': 5, - 'params': { - 'usenetrc': True, - }, }, { 'url': 'https://nebula.app/lindsayellis', 'info_dict': { @@ -256,9 +294,6 @@ class NebulaCollectionIE(NebulaBaseIE): 'description': 'Enjoy these hottest of takes on Disney, Transformers, and Musicals.', }, 'playlist_mincount': 100, - 'params': { - 'usenetrc': True, - }, }, ] diff --git a/hypervideo_dl/extractor/nerdcubed.py b/hypervideo_dl/extractor/nerdcubed.py index 9feccc6..7c801b5 100644 --- a/hypervideo_dl/extractor/nerdcubed.py +++ b/hypervideo_dl/extractor/nerdcubed.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import datetime from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/neteasemusic.py b/hypervideo_dl/extractor/neteasemusic.py index 57b4774..5957098 100644 --- a/hypervideo_dl/extractor/neteasemusic.py +++ b/hypervideo_dl/extractor/neteasemusic.py @@ -1,20 +1,25 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from hashlib import md5 +import itertools +import json +import re +import time from base64 import b64encode +from binascii import hexlify from datetime import datetime -import re +from hashlib import md5 +from random import randint from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_str, - compat_itertools_count, -) +from ..aes import aes_ecb_encrypt, pkcs7_padding +from ..compat import compat_urllib_parse_urlencode from ..utils import ( - sanitized_Request, + ExtractorError, + bytes_to_intlist, + error_to_compat_str, float_or_none, + int_or_none, + intlist_to_bytes, + sanitized_Request, + try_get, ) @@ -26,7 +31,7 @@ class NetEaseMusicBaseIE(InfoExtractor): @classmethod def _encrypt(cls, dfsid): salt_bytes = bytearray(cls._NETEASE_SALT.encode('utf-8')) - string_bytes = bytearray(compat_str(dfsid).encode('ascii')) + string_bytes = bytearray(str(dfsid).encode('ascii')) salt_len = len(salt_bytes) for i in range(len(string_bytes)): string_bytes[i] = string_bytes[i] ^ salt_bytes[i % salt_len] @@ -35,32 +40,105 @@ class NetEaseMusicBaseIE(InfoExtractor): result = b64encode(m.digest()).decode('ascii') return result.replace('/', '_').replace('+', '-') + def make_player_api_request_data_and_headers(self, song_id, bitrate): + KEY = b'e82ckenh8dichen8' + URL = 
'/api/song/enhance/player/url' + now = int(time.time() * 1000) + rand = randint(0, 1000) + cookie = { + 'osver': None, + 'deviceId': None, + 'appver': '8.0.0', + 'versioncode': '140', + 'mobilename': None, + 'buildver': '1623435496', + 'resolution': '1920x1080', + '__csrf': '', + 'os': 'pc', + 'channel': None, + 'requestId': '{0}_{1:04}'.format(now, rand), + } + request_text = json.dumps( + {'ids': '[{0}]'.format(song_id), 'br': bitrate, 'header': cookie}, + separators=(',', ':')) + message = 'nobody{0}use{1}md5forencrypt'.format( + URL, request_text).encode('latin1') + msg_digest = md5(message).hexdigest() + + data = '{0}-36cd479b6b5-{1}-36cd479b6b5-{2}'.format( + URL, request_text, msg_digest) + data = pkcs7_padding(bytes_to_intlist(data)) + encrypted = intlist_to_bytes(aes_ecb_encrypt(data, bytes_to_intlist(KEY))) + encrypted_params = hexlify(encrypted).decode('ascii').upper() + + cookie = '; '.join( + ['{0}={1}'.format(k, v if v is not None else 'undefined') + for [k, v] in cookie.items()]) + + headers = { + 'User-Agent': self.extractor.get_param('http_headers')['User-Agent'], + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': 'https://music.163.com', + 'Cookie': cookie, + } + return ('params={0}'.format(encrypted_params), headers) + + def _call_player_api(self, song_id, bitrate): + url = 'https://interface3.music.163.com/eapi/song/enhance/player/url' + data, headers = self.make_player_api_request_data_and_headers(song_id, bitrate) + try: + msg = 'empty result' + result = self._download_json( + url, song_id, data=data.encode('ascii'), headers=headers) + if result: + return result + except ExtractorError as e: + if type(e.cause) in (ValueError, TypeError): + # JSON load failure + raise + except Exception as e: + msg = error_to_compat_str(e) + self.report_warning('%s API call (%s) failed: %s' % ( + song_id, bitrate, msg)) + return {} + def extract_formats(self, info): + err = 0 formats = [] + song_id = info['id'] for song_format in self._FORMATS: details = info.get(song_format) if not details: continue - song_file_path = '/%s/%s.%s' % ( - self._encrypt(details['dfsId']), details['dfsId'], details['extension']) - - # 203.130.59.9, 124.40.233.182, 115.231.74.139, etc is a reverse proxy-like feature - # from NetEase's CDN provider that can be used if m5.music.126.net does not - # work, especially for users outside of Mainland China - # via: https://github.com/JixunMoe/unblock-163/issues/3#issuecomment-163115880 - for host in ('http://m5.music.126.net', 'http://115.231.74.139/m1.music.126.net', - 'http://124.40.233.182/m1.music.126.net', 'http://203.130.59.9/m1.music.126.net'): - song_url = host + song_file_path + + bitrate = int_or_none(details.get('bitrate')) or 999000 + data = self._call_player_api(song_id, bitrate) + for song in try_get(data, lambda x: x['data'], list) or []: + song_url = try_get(song, lambda x: x['url']) + if not song_url: + continue if self._is_valid_url(song_url, info['id'], 'song'): formats.append({ 'url': song_url, 'ext': details.get('extension'), - 'abr': float_or_none(details.get('bitrate'), scale=1000), + 'abr': float_or_none(song.get('br'), scale=1000), 'format_id': song_format, - 'filesize': details.get('size'), - 'asr': details.get('sr') + 'filesize': int_or_none(song.get('size')), + 'asr': int_or_none(details.get('sr')), }) - break + elif err == 0: + err = try_get(song, lambda x: x['code'], int) + + if not formats: + msg = 'No media links found' + if err != 0 and (err < 200 or err >= 400): + raise ExtractorError( + '%s (site code %d)' % (msg, 
err, ), expected=True) + else: + self.raise_geo_restricted( + msg + ': probably this video is not available from your location due to geo restriction.', + countries=['CN']) + return formats @classmethod @@ -76,33 +154,19 @@ class NetEaseMusicBaseIE(InfoExtractor): class NetEaseMusicIE(NetEaseMusicBaseIE): IE_NAME = 'netease:song' IE_DESC = '网易云音乐' - _VALID_URL = r'https?://music\.163\.com/(#/)?song\?id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(y\.)?music\.163\.com/(?:[#m]/)?song\?.*?\bid=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://music.163.com/#/song?id=32102397', - 'md5': 'f2e97280e6345c74ba9d5677dd5dcb45', + 'md5': '3e909614ce09b1ccef4a3eb205441190', 'info_dict': { 'id': '32102397', 'ext': 'mp3', - 'title': 'Bad Blood (feat. Kendrick Lamar)', + 'title': 'Bad Blood', 'creator': 'Taylor Swift / Kendrick Lamar', - 'upload_date': '20150517', - 'timestamp': 1431878400, - 'description': 'md5:a10a54589c2860300d02e1de821eb2ef', + 'upload_date': '20150516', + 'timestamp': 1431792000, + 'description': 'md5:25fc5f27e47aad975aa6d36382c7833c', }, - 'skip': 'Blocked outside Mainland China', - }, { - 'note': 'No lyrics translation.', - 'url': 'http://music.163.com/#/song?id=29822014', - 'info_dict': { - 'id': '29822014', - 'ext': 'mp3', - 'title': '听见下雨的声音', - 'creator': '周杰伦', - 'upload_date': '20141225', - 'timestamp': 1419523200, - 'description': 'md5:a4d8d89f44656af206b7b2555c0bce6c', - }, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'No lyrics.', 'url': 'http://music.163.com/song?id=17241424', @@ -112,9 +176,9 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'title': 'Opus 28', 'creator': 'Dustin O\'Halloran', 'upload_date': '20080211', + 'description': 'md5:f12945b0f6e0365e3b73c5032e1b0ff4', 'timestamp': 1202745600, }, - 'skip': 'Blocked outside Mainland China', }, { 'note': 'Has translated name.', 'url': 'http://music.163.com/#/song?id=22735043', @@ -128,7 +192,18 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): 'timestamp': 1264608000, 'alt_title': '说出愿望吧(Genie)', }, - 'skip': 'Blocked outside Mainland China', + }, { + 'url': 'https://y.music.163.com/m/song?app_version=8.8.45&id=95670&uct2=sKnvS4+0YStsWkqsPhFijw%3D%3D&dlt=0846', + 'md5': '95826c73ea50b1c288b22180ec9e754d', + 'info_dict': { + 'id': '95670', + 'ext': 'mp3', + 'title': '国际歌', + 'creator': '马备', + 'upload_date': '19911130', + 'timestamp': 691516800, + 'description': 'md5:1ba2f911a2b0aa398479f595224f2141', + }, }] def _process_lyrics(self, lyrics_info): @@ -161,7 +236,6 @@ class NetEaseMusicIE(NetEaseMusicBaseIE): song_id, 'Downloading song info')['songs'][0] formats = self.extract_formats(info) - self._sort_formats(formats) lyrics_info = self.query_api( 'song/lyric?id=%s&lv=-1&tv=-1' % song_id, @@ -337,7 +411,6 @@ class NetEaseMusicMvIE(NetEaseMusicBaseIE): {'url': mv_url, 'ext': 'mp4', 'format_id': '%sp' % brs, 'height': int(brs)} for brs, mv_url in info['brs'].items() ] - self._sort_formats(formats) return { 'id': mv_id, @@ -407,7 +480,6 @@ class NetEaseMusicProgramIE(NetEaseMusicBaseIE): if not self._yes_playlist(info['songs'] and program_id, info['mainSong']['id']): formats = self.extract_formats(info['mainSong']) - self._sort_formats(formats) return { 'id': info['mainSong']['id'], @@ -452,7 +524,7 @@ class NetEaseMusicDjRadioIE(NetEaseMusicBaseIE): name = None desc = None entries = [] - for offset in compat_itertools_count(start=0, step=self._PAGE_SIZE): + for offset in itertools.count(start=0, step=self._PAGE_SIZE): info = self.query_api( 'dj/program/byradio?asc=false&limit=%d&radioId=%s&offset=%d' % (self._PAGE_SIZE, 
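Condensed, the eapi signing done by make_player_api_request_data_and_headers above is: serialize the request compactly, md5-digest it inside a fixed sentinel string, interleave URL, payload and digest with the '-36cd479b6b5-' separator, then AES-ECB-encrypt with the static key and hex-encode. A sketch of just those steps, reusing the package's own helpers that the new imports pull in; the song id and bitrate are example values and the 'header' cookie dict is shortened to empty for brevity:

    import json
    from binascii import hexlify
    from hashlib import md5

    from hypervideo_dl.aes import aes_ecb_encrypt, pkcs7_padding
    from hypervideo_dl.utils import bytes_to_intlist, intlist_to_bytes

    KEY = b'e82ckenh8dichen8'                    # static eapi key from the patch
    URL = '/api/song/enhance/player/url'
    payload = json.dumps({'ids': '[32102397]', 'br': 999000, 'header': {}},
                         separators=(',', ':'))
    digest = md5('nobody{0}use{1}md5forencrypt'.format(URL, payload).encode('latin1')).hexdigest()
    plaintext = '{0}-36cd479b6b5-{1}-36cd479b6b5-{2}'.format(URL, payload, digest)
    encrypted = intlist_to_bytes(aes_ecb_encrypt(
        pkcs7_padding(bytes_to_intlist(plaintext)), bytes_to_intlist(KEY)))
    # POST body then sent to interface3.music.163.com/eapi/song/enhance/player/url:
    body = 'params=' + hexlify(encrypted).decode('ascii').upper()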
dj_id, offset), diff --git a/hypervideo_dl/extractor/netverse.py b/hypervideo_dl/extractor/netverse.py new file mode 100644 index 0000000..719a9da --- /dev/null +++ b/hypervideo_dl/extractor/netverse.py @@ -0,0 +1,176 @@ +from .common import InfoExtractor +from .dailymotion import DailymotionIE +from ..utils import smuggle_url, traverse_obj + + +class NetverseBaseIE(InfoExtractor): + _ENDPOINTS = { + 'watch': 'watchvideo', + 'video': 'watchvideo', + 'webseries': 'webseries', + 'season': 'webseason_videos', + } + + def _call_api(self, slug, endpoint, query={}, season_id='', display_id=None): + return self._download_json( + f'https://api.netverse.id/medias/api/v2/{self._ENDPOINTS[endpoint]}/{slug}/{season_id}', + display_id or slug, query=query) + + +class NetverseIE(NetverseBaseIE): + _VALID_URL = r'https?://(?:\w+\.)?netverse\.id/(?P<type>watch|video)/(?P<display_id>[^/?#&]+)' + _TESTS = [{ + # Watch video + 'url': 'https://www.netverse.id/watch/waktu-indonesia-bercanda-edisi-spesial-lebaran-2016', + 'info_dict': { + 'id': 'k4yhqUwINAGtmHx3NkL', + 'title': 'Waktu Indonesia Bercanda - Edisi Spesial Lebaran 2016', + 'ext': 'mp4', + 'season': 'Season 2016', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/T7aV31Y0eGRWBbwkK/x1080', + 'episode_number': 22, + 'episode': 'Episode 22', + 'uploader_id': 'x2ir3vq', + 'age_limit': 0, + 'tags': [], + 'view_count': int, + 'display_id': 'waktu-indonesia-bercanda-edisi-spesial-lebaran-2016', + 'duration': 2990, + 'upload_date': '20210722', + 'timestamp': 1626919804, + 'like_count': int, + 'uploader': 'Net Prime', + } + }, { + # series + 'url': 'https://www.netverse.id/watch/jadoo-seorang-model', + 'info_dict': { + 'id': 'x88izwc', + 'title': 'Jadoo Seorang Model', + 'ext': 'mp4', + 'season': 'Season 2', + 'description': 'md5:8a74f70812cca267e19ee0635f0af835', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/Thwuy1YURicFmGu0v/x1080', + 'episode_number': 2, + 'episode': 'Episode 2', + 'view_count': int, + 'like_count': int, + 'display_id': 'jadoo-seorang-model', + 'uploader_id': 'x2ir3vq', + 'duration': 635, + 'timestamp': 1646372927, + 'tags': ['PG069497-hellojadooseason2eps2'], + 'upload_date': '20220304', + 'uploader': 'Net Prime', + 'age_limit': 0, + }, + 'skip': 'video get Geo-blocked for some country' + }, { + # non www host + 'url': 'https://netverse.id/watch/tetangga-baru', + 'info_dict': { + 'id': 'k4CNGz7V0HJ7vfwZbXy', + 'ext': 'mp4', + 'title': 'Tetangga Baru', + 'season': 'Season 1', + 'description': 'md5:23fcf70e97d461d3029d25d59b2ccfb9', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/T3Ogm1YEnnyjVKAFF/x1080', + 'episode_number': 1, + 'episode': 'Episode 1', + 'timestamp': 1624538169, + 'view_count': int, + 'upload_date': '20210624', + 'age_limit': 0, + 'uploader_id': 'x2ir3vq', + 'like_count': int, + 'uploader': 'Net Prime', + 'tags': ['PG008534', 'tetangga', 'Baru'], + 'display_id': 'tetangga-baru', + 'duration': 1406, + }, + }, { + # /video url + 'url': 'https://www.netverse.id/video/pg067482-hellojadoo-season1', + 'title': 'Namaku Choi Jadoo', + 'info_dict': { + 'id': 'x887jzz', + 'ext': 'mp4', + 'thumbnail': r're:https?://s\d+\.dmcdn\.net/v/TfuZ_1Y6PboJ5An_s/x1080', + 'season': 'Season 1', + 'episode_number': 1, + 'description': 'md5:d4f627b3e7a3f9acdc55f6cdd5ea41d5', + 'title': 'Namaku Choi Jadoo', + 'episode': 'Episode 1', + 'age_limit': 0, + 'like_count': int, + 'view_count': int, + 'tags': ['PG067482', 'PG067482-HelloJadoo-season1'], + 'duration': 780, + 'display_id': 
'pg067482-hellojadoo-season1', + 'uploader_id': 'x2ir3vq', + 'uploader': 'Net Prime', + 'timestamp': 1645764984, + 'upload_date': '20220225', + }, + 'skip': 'This video get Geo-blocked for some country' + }] + + def _real_extract(self, url): + display_id, sites_type = self._match_valid_url(url).group('display_id', 'type') + program_json = self._call_api(display_id, sites_type) + videos = program_json['response']['videos'] + + return { + '_type': 'url_transparent', + 'ie_key': DailymotionIE.ie_key(), + 'url': smuggle_url(videos['dailymotion_url'], {'query': {'embedder': 'https://www.netverse.id'}}), + 'display_id': display_id, + 'title': videos.get('title'), + 'season': videos.get('season_name'), + 'thumbnail': traverse_obj(videos, ('program_detail', 'thumbnail_image')), + 'description': traverse_obj(videos, ('program_detail', 'description')), + 'episode_number': videos.get('episode_order'), + } + + +class NetversePlaylistIE(NetverseBaseIE): + _VALID_URL = r'https?://(?:\w+\.)?netverse\.id/(?P<type>webseries)/(?P<display_id>[^/?#&]+)' + _TESTS = [{ + # multiple season + 'url': 'https://netverse.id/webseries/tetangga-masa-gitu', + 'info_dict': { + 'id': 'tetangga-masa-gitu', + 'title': 'Tetangga Masa Gitu', + }, + 'playlist_count': 519, + }, { + # single season + 'url': 'https://netverse.id/webseries/kelas-internasional', + 'info_dict': { + 'id': 'kelas-internasional', + 'title': 'Kelas Internasional', + }, + 'playlist_count': 203, + }] + + def parse_playlist(self, json_data, playlist_id): + slug_sample = traverse_obj(json_data, ('related', 'data', ..., 'slug'))[0] + for season in traverse_obj(json_data, ('seasons', ..., 'id')): + playlist_json = self._call_api( + slug_sample, 'season', display_id=playlist_id, season_id=season) + + for current_page in range(playlist_json['response']['season_list']['last_page']): + playlist_json = self._call_api(slug_sample, 'season', query={'page': current_page + 1}, + season_id=season, display_id=playlist_id) + for slug in traverse_obj(playlist_json, ('response', ..., 'data', ..., 'slug')): + yield self.url_result(f'https://www.netverse.id/video/{slug}', NetverseIE) + + def _real_extract(self, url): + playlist_id, sites_type = self._match_valid_url(url).group('display_id', 'type') + playlist_data = self._call_api(playlist_id, sites_type) + + return self.playlist_result( + self.parse_playlist(playlist_data['response'], playlist_id), + traverse_obj(playlist_data, ('response', 'webseries_info', 'slug')), + traverse_obj(playlist_data, ('response', 'webseries_info', 'title'))) diff --git a/hypervideo_dl/extractor/netzkino.py b/hypervideo_dl/extractor/netzkino.py index 4ad0d8e..9c314e2 100644 --- a/hypervideo_dl/extractor/netzkino.py +++ b/hypervideo_dl/extractor/netzkino.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( clean_html, @@ -76,7 +72,6 @@ class NetzkinoIE(InfoExtractor): 'ext': 'mp4', 'url': tpl.replace('{}', film_fn) + suffix[key], } for key, tpl in templates.items()] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/newgrounds.py b/hypervideo_dl/extractor/newgrounds.py index 6525a6d..9e3286d 100644 --- a/hypervideo_dl/extractor/newgrounds.py +++ b/hypervideo_dl/extractor/newgrounds.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import re @@ -175,7 +172,6 @@ class NewgroundsIE(InfoExtractor): if video_type_description == 'Audio File': formats[0]['vcodec'] = 'none' 
self._check_formats(formats, media_id) - self._sort_formats(formats) return { 'id': media_id, diff --git a/hypervideo_dl/extractor/newspicks.py b/hypervideo_dl/extractor/newspicks.py new file mode 100644 index 0000000..b6334dc --- /dev/null +++ b/hypervideo_dl/extractor/newspicks.py @@ -0,0 +1,53 @@ +import re + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class NewsPicksIE(InfoExtractor): + _VALID_URL = r'https://newspicks\.com/movie-series/(?P<channel_id>\d+)\?movieId=(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://newspicks.com/movie-series/11?movieId=1813', + 'info_dict': { + 'id': '1813', + 'title': '日本の課題を破壊せよ【ゲスト:成田悠輔】', + 'description': 'md5:09397aad46d6ded6487ff13f138acadf', + 'channel': 'HORIE ONE', + 'channel_id': '11', + 'release_date': '20220117', + 'thumbnail': r're:https://.+jpg', + 'ext': 'mp4', + }, + }] + + def _real_extract(self, url): + video_id, channel_id = self._match_valid_url(url).group('id', 'channel_id') + webpage = self._download_webpage(url, video_id) + entries = self._parse_html5_media_entries( + url, webpage.replace('movie-for-pc', 'movie'), video_id, 'hls') + if not entries: + raise ExtractorError('No HTML5 media elements found') + info = entries[0] + + title = self._html_search_meta('og:title', webpage, fatal=False) + description = self._html_search_meta( + ('og:description', 'twitter:title'), webpage, fatal=False) + channel = self._html_search_regex( + r'value="11".+?<div\s+class="title">(.+?)</div', webpage, 'channel name', fatal=False) + if not title or not channel: + title, channel = re.split(r'\s*｜\s*', self._html_extract_title(webpage)) + + release_date = self._search_regex( + r'<span\s+class="on-air-date">\s*(\d+)年(\d+)月(\d+)日\s*</span>', + webpage, 'release date', fatal=False, group=(1, 2, 3)) + + info.update({ + 'id': video_id, + 'title': title, + 'description': description, + 'channel': channel, + 'channel_id': channel_id, + 'release_date': ('%04d%02d%02d' % tuple(map(int, release_date))) if release_date else None, + }) + return info diff --git a/hypervideo_dl/extractor/newstube.py b/hypervideo_dl/extractor/newstube.py index 479141a..820eb4b 100644 --- a/hypervideo_dl/extractor/newstube.py +++ b/hypervideo_dl/extractor/newstube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import hashlib @@ -67,7+64,6 @@ class NewstubeIE(InfoExtractor): formats.append(f) self._check_formats(formats, video_guid) - self._sort_formats(formats) return { 'id': video_guid, diff --git a/hypervideo_dl/extractor/newsy.py b/hypervideo_dl/extractor/newsy.py index cf31641..a5a7b16 100644 --- a/hypervideo_dl/extractor/newsy.py +++ b/hypervideo_dl/extractor/newsy.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( js_to_json, @@ -39,7 +36,6 @@ class NewsyIE(InfoExtractor): fmts, subs = self._extract_m3u8_formats_and_subtitles(data_json['stream'], display_id) formats.extend(fmts) subtitles = self._merge_subtitles(subtitles, subs) - self._sort_formats(formats) return merge_dicts(ld_json, { 'id': data_json['id'], 'display_id': display_id, diff --git a/hypervideo_dl/extractor/nextmedia.py b/hypervideo_dl/extractor/nextmedia.py index 7bd1290..0e47a4d 100644 --- a/hypervideo_dl/extractor/nextmedia.py +++ b/hypervideo_dl/extractor/nextmedia.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( @@ -80,7 +77,7 @@
class NextMediaIE(InfoExtractor): return self._og_search_property('description', page) -class NextMediaActionNewsIE(NextMediaIE): +class NextMediaActionNewsIE(NextMediaIE): # XXX: Do not subclass from concrete IE IE_DESC = '蘋果日報 - 動新聞' _VALID_URL = r'https?://hk\.dv\.nextmedia\.com/actionnews/[^/]+/(?P<date>\d+)/(?P<id>\d+)/\d+' _TESTS = [{ @@ -105,7 +102,7 @@ class NextMediaActionNewsIE(NextMediaIE): return self._extract_from_nextmedia_page(news_id, url, article_page) -class AppleDailyIE(NextMediaIE): +class AppleDailyIE(NextMediaIE): # XXX: Do not subclass from concrete IE IE_DESC = '臺灣蘋果日報' _VALID_URL = r'https?://(www|ent)\.appledaily\.com\.tw/[^/]+/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?' _TESTS = [{ diff --git a/hypervideo_dl/extractor/nexx.py b/hypervideo_dl/extractor/nexx.py index a521bb6..b4874c8 100644 --- a/hypervideo_dl/extractor/nexx.py +++ b/hypervideo_dl/extractor/nexx.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import random import re @@ -117,8 +114,8 @@ class NexxIE(InfoExtractor): webpage) return mobj.group('id') if mobj else None - @staticmethod - def _extract_urls(webpage): + @classmethod + def _extract_embed_urls(cls, url, webpage): # Reference: # 1. https://nx-s.akamaized.net/files/201510/44.pdf @@ -138,10 +135,6 @@ class NexxIE(InfoExtractor): return entries - @staticmethod - def _extract_url(webpage): - return NexxIE._extract_urls(webpage)[0] - def _handle_error(self, response): if traverse_obj(response, ('metadata', 'notice'), expected_type=str): self.report_warning('%s said: %s' % (self.IE_NAME, response['metadata']['notice'])) @@ -459,8 +452,6 @@ class NexxIE(InfoExtractor): else: self.raise_no_formats(f'{cdn} formats are currently not supported', video_id) - self._sort_formats(formats) - subtitles = {} for sub in video.get('captiondata') or []: if sub.get('data'): @@ -501,6 +492,8 @@ class NexxIE(InfoExtractor): class NexxEmbedIE(InfoExtractor): _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)' + # Reference. https://nx-s.akamaized.net/files/201510/44.pdf + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1'] _TESTS = [{ 'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1', 'md5': '16746bfc28c42049492385c989b26c4a', @@ -524,16 +517,6 @@ class NexxEmbedIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - # Reference: - # 1. 
https://nx-s.akamaized.net/files/201510/44.pdf - - # iFrame Embed Integration - return [mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1', - webpage)] - def _real_extract(self, url): embed_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/nfb.py b/hypervideo_dl/extractor/nfb.py index a12e503..38e068a 100644 --- a/hypervideo_dl/extractor/nfb.py +++ b/hypervideo_dl/extractor/nfb.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import int_or_none @@ -38,7 +35,6 @@ class NFBIE(InfoExtractor): player, 'source', default=None, fatal=True) formats, subtitles = self._extract_m3u8_formats_and_subtitles(source, video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/nfhsnetwork.py b/hypervideo_dl/extractor/nfhsnetwork.py index 802f6ca..febad8f 100644 --- a/hypervideo_dl/extractor/nfhsnetwork.py +++ b/hypervideo_dl/extractor/nfhsnetwork.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor @@ -127,7 +124,6 @@ class NFHSNetworkIE(InfoExtractor): video_id).get('video_url') formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', live=isLive) - self._sort_formats(formats, ['res', 'tbr']) return { 'id': video_id, @@ -140,5 +136,6 @@ class NFHSNetworkIE(InfoExtractor): 'uploader_url': uploaderPage, 'location': location, 'upload_date': upload_date, - 'is_live': isLive + 'is_live': isLive, + '_format_sort_fields': ('res', 'tbr'), } diff --git a/hypervideo_dl/extractor/nfl.py b/hypervideo_dl/extractor/nfl.py index 821276a..29c53d5 100644 --- a/hypervideo_dl/extractor/nfl.py +++ b/hypervideo_dl/extractor/nfl.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -56,8 +53,7 @@ class NFLBaseIE(InfoExtractor): ) )/ ''' - _VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+})' - _WORKING = False + _VIDEO_CONFIG_REGEX = r'<script[^>]+id="[^"]*video-config-[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12}[^"]*"[^>]*>\s*({.+});?\s*</script>' def _parse_video_config(self, video_config, display_id): video_config = self._parse_json(video_config, display_id) @@ -69,13 +65,12 @@ class NFLBaseIE(InfoExtractor): 'Anvato', mcp_id) else: media_id = item.get('id') or item['entityId'] - title = item['title'] + title = item.get('title') item_url = item['url'] info = {'id': media_id} ext = determine_ext(item_url) if ext == 'm3u8': info['formats'] = self._extract_m3u8_formats(item_url, media_id, 'mp4') - self._sort_formats(info['formats']) else: info['url'] = item_url if item.get('audio') is True: @@ -111,6 +106,9 @@ class NFLIE(NFLBaseIE): 'timestamp': 1608009755, 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'NFL', + 'tags': 'count:6', + 'duration': 157, + 'categories': 'count:3', } }, { 'url': 'https://www.chiefs.com/listen/patrick-mahomes-travis-kelce-react-to-win-over-dolphins-the-breakdown', @@ -120,7 +118,8 @@ class NFLIE(NFLBaseIE): 'ext': 'mp3', 'title': 'Patrick Mahomes, Travis Kelce React to Win Over Dolphins | The Breakdown', 'description': 'md5:12ada8ee70e6762658c30e223e095075', - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'https://www.buffalobills.com/video/buffalo-bills-military-recognition-week-14', 'only_matching': True, diff --git 
a/hypervideo_dl/extractor/nhk.py b/hypervideo_dl/extractor/nhk.py index 3b8efc3..59702b2 100644 --- a/hypervideo_dl/extractor/nhk.py +++ b/hypervideo_dl/extractor/nhk.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -13,7 +11,7 @@ from ..utils import ( class NhkBaseIE(InfoExtractor): - _API_URL_TEMPLATE = 'https://api.nhk.or.jp/nhkworld/%sod%slist/v7a/%s/%s/%s/all%s.json' + _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json' _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand' _TYPE_REGEX = r'/(?P<type>video|audio)/' @@ -29,7 +27,7 @@ class NhkBaseIE(InfoExtractor): def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None lang, m_type, episode_id = NhkVodIE._match_valid_url(url).groups() - if episode_id.isdigit(): + if len(episode_id) == 7: episode_id = episode_id[:4] + '-' + episode_id[4:] is_video = m_type == 'video' @@ -80,7 +78,6 @@ class NhkBaseIE(InfoExtractor): m3u8_id='hls', fatal=False) for f in info['formats']: f['language'] = lang - self._sort_formats(info['formats']) else: info.update({ '_type': 'url_transparent', @@ -91,7 +88,8 @@ class NhkBaseIE(InfoExtractor): class NhkVodIE(NhkBaseIE): - _VALID_URL = r'%s%s(?P<id>\d{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) + # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg + _VALID_URL = r'%s%s(?P<id>[0-9a-z]{7}|[^/]+?-\d{8}-[0-9a-z]+)' % (NhkBaseIE._BASE_URL_REGEX, NhkBaseIE._TYPE_REGEX) # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ @@ -131,6 +129,19 @@ class NhkVodIE(NhkBaseIE): }, { 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/audio/j_art-20150903-1/', 'only_matching': True, + }, { + # video, alphabetic character in ID #29670 + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a34/', + 'only_matching': True, + 'info_dict': { + 'id': 'qfjay6cg', + 'ext': 'mp4', + 'title': 'DESIGN TALKS plus - Fishermen’s Finery', + 'description': 'md5:8a8f958aaafb0d7cb59d38de53f1e448', + 'thumbnail': r're:^https?:/(/[a-z0-9.-]+)+\.jpg\?w=1920&h=1080$', + 'upload_date': '20210615', + 'timestamp': 1623722008, + } }] def _real_extract(self, url): @@ -228,7 +239,6 @@ class NhkForSchoolBangumiIE(InfoExtractor): formats = self._extract_m3u8_formats( f'https://nhks-vh.akamaihd.net/i/das/{video_id[0:8]}/{video_id}_V_000.f4v/master.m3u8', video_id, ext='mp4', m3u8_id='hls') - self._sort_formats(formats) duration = parse_duration(base_values.get('r_duration')) @@ -309,8 +319,7 @@ class NhkForSchoolProgramListIE(InfoExtractor): webpage = self._download_webpage(f'https://www.nhk.or.jp/school/{program_id}/', program_id) - title = (self._og_search_title(webpage) - or self._html_extract_title(webpage) + title = (self._generic_title('', webpage) or self._html_search_regex(r'<h3>([^<]+?)とは?\s*</h3>', webpage, 'title', fatal=False)) title = re.sub(r'\s*\|\s*NHK\s+for\s+School\s*$', '', title) if title else None description = self._html_search_regex( diff --git a/hypervideo_dl/extractor/nhl.py b/hypervideo_dl/extractor/nhl.py index d3a5e17..2521c40 100644 --- a/hypervideo_dl/extractor/nhl.py +++ b/hypervideo_dl/extractor/nhl.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -51,7 +48,6 @@ class NHLBaseIE(InfoExtractor): 
'height': height, 'tbr': int_or_none(self._search_regex(r'_(\d+)[kK]', playback_url, 'bitrate', default=None)), }) - self._sort_formats(formats) thumbnails = [] cuts = video_data.get('image', {}).get('cuts') or [] diff --git a/hypervideo_dl/extractor/nick.py b/hypervideo_dl/extractor/nick.py index ba7da76..de22cb8 100644 --- a/hypervideo_dl/extractor/nick.py +++ b/hypervideo_dl/extractor/nick.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .mtv import MTVServicesInfoExtractor from ..utils import update_url_query @@ -192,7 +188,7 @@ class NickDeIE(MTVServicesInfoExtractor): return self._remove_template_parameter(config['feedWithQueryParams']) -class NickNightIE(NickDeIE): +class NickNightIE(NickDeIE): # XXX: Do not subclass from concrete IE IE_NAME = 'nicknight' _VALID_URL = r'https?://(?:www\.)(?P<host>nicknight\.(?:de|at|tv))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ diff --git a/hypervideo_dl/extractor/niconico.py b/hypervideo_dl/extractor/niconico.py index 4eb6ed0..2103037 100644 --- a/hypervideo_dl/extractor/niconico.py +++ b/hypervideo_dl/extractor/niconico.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import datetime import functools import itertools @@ -10,8 +7,6 @@ import time from .common import InfoExtractor, SearchInfoExtractor from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlparse, compat_HTTPError, ) from ..utils import ( @@ -35,6 +30,7 @@ from ..utils import ( update_url_query, url_or_none, urlencode_postdata, + urljoin, ) @@ -195,7 +191,7 @@ class NiconicoIE(InfoExtractor): self._request_webpage( 'https://account.nicovideo.jp/login', None, note='Acquiring Login session') - urlh = self._request_webpage( + page = self._download_webpage( 'https://account.nicovideo.jp/login/redirector?show_button_twitter=1&site=niconico&show_button_facebook=1', None, note='Logging in', errnote='Unable to log in', data=urlencode_postdata(login_form_strs), @@ -203,26 +199,39 @@ class NiconicoIE(InfoExtractor): 'Referer': 'https://account.nicovideo.jp/login', 'Content-Type': 'application/x-www-form-urlencoded', }) - if urlh is False: - login_ok = False - else: - parts = compat_urllib_parse_urlparse(urlh.geturl()) - if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login': - login_ok = False + if 'oneTimePw' in page: + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>.+?)\1', page, 'post url', group='url') + page = self._download_webpage( + urljoin('https://account.nicovideo.jp', post_url), None, + note='Performing MFA', errnote='Unable to complete MFA', + data=urlencode_postdata({ + 'otp': self._get_tfa_info('6 digits code') + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + if 'oneTimePw' in page or 'formError' in page: + err_msg = self._html_search_regex( + r'formError["\']+>(.*?)</div>', page, 'form_error', + default='There\'s an error but the message can\'t be parsed.', + flags=re.DOTALL) + self.report_warning(f'Unable to log in: MFA challenge failed, "{err_msg}"') + return False + login_ok = 'class="notice error"' not in page if not login_ok: - self.report_warning('unable to log in: bad username or password') + self.report_warning('Unable to log in: bad username or password') return login_ok def _get_heartbeat_info(self, info_dict): video_id, video_src_id, audio_src_id = info_dict['url'].split(':')[1].split('/') - dmc_protocol = info_dict['_expected_protocol'] + dmc_protocol = info_dict['expected_protocol'] api_data = ( 
info_dict.get('_api_data') or self._parse_json( self._html_search_regex( 'data-api-data="([^"]+)"', - self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id), + self._download_webpage('https://www.nicovideo.jp/watch/' + video_id, video_id), 'API data', default='{}'), video_id)) @@ -369,7 +378,7 @@ class NiconicoIE(InfoExtractor): 'width': traverse_obj(video_quality, ('metadata', 'resolution', 'width')), 'quality': -2 if 'low' in video_quality['id'] else None, 'protocol': 'niconico_dmc', - '_expected_protocol': dmc_protocol, + 'expected_protocol': dmc_protocol, # XXX: This is not a documented field 'http_headers': { 'Origin': 'https://www.nicovideo.jp', 'Referer': 'https://www.nicovideo.jp/watch/' + video_id, @@ -381,7 +390,7 @@ class NiconicoIE(InfoExtractor): try: webpage, handle = self._download_webpage_handle( - 'http://www.nicovideo.jp/watch/' + video_id, video_id) + 'https://www.nicovideo.jp/watch/' + video_id, video_id) if video_id.startswith('so'): video_id = self._match_id(handle.geturl()) @@ -416,8 +425,6 @@ class NiconicoIE(InfoExtractor): if fmt: formats.append(fmt) - self._sort_formats(formats) - # Start extracting information tags = None if webpage: @@ -548,8 +555,7 @@ class NiconicoPlaylistBaseIE(InfoExtractor): } def _call_api(self, list_id, resource, query): - "Implement this in child class" - pass + raise NotImplementedError('Must be implemented in subclasses') @staticmethod def _parse_owner(item): @@ -638,14 +644,14 @@ class NiconicoSeriesIE(InfoExtractor): 'id': '110226', 'title': 'ご立派ァ!のシリーズ', }, - 'playlist_mincount': 10, # as of 2021/03/17 + 'playlist_mincount': 10, }, { 'url': 'https://www.nicovideo.jp/series/12312/', 'info_dict': { 'id': '12312', 'title': 'バトルスピリッツ お勧めカード紹介(調整中)', }, - 'playlist_mincount': 97, # as of 2021/03/17 + 'playlist_mincount': 103, }, { 'url': 'https://nico.ms/series/203559', 'only_matching': True, @@ -663,7 +669,7 @@ class NiconicoSeriesIE(InfoExtractor): title = unescapeHTML(title) playlist = [ self.url_result(f'https://www.nicovideo.jp/watch/{v_id}', video_id=v_id) - for v_id in re.findall(r'href="/watch/([a-z0-9]+)" data-href="/watch/\1', webpage)] + for v_id in re.findall(r'data-href=[\'"](?:https://www\.nicovideo\.jp)?/watch/([a-z0-9]+)', webpage)] return self.playlist_result(playlist, list_id, title) @@ -720,7 +726,7 @@ class NicovideoSearchBaseIE(InfoExtractor): webpage = self._download_webpage(url, item_id, query=query, note=note % {'page': page_num}) results = re.findall(r'(?<=data-video-id=)["\']?(?P<videoid>.*?)(?=["\'])', webpage) for item in results: - yield self.url_result(f'http://www.nicovideo.jp/watch/{item}', 'Niconico', item) + yield self.url_result(f'https://www.nicovideo.jp/watch/{item}', 'Niconico', item) if not results: break diff --git a/hypervideo_dl/extractor/ninecninemedia.py b/hypervideo_dl/extractor/ninecninemedia.py index 7818427..31df42f 100644 --- a/hypervideo_dl/extractor/ninecninemedia.py +++ b/hypervideo_dl/extractor/ninecninemedia.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -46,7 +43,6 @@ class NineCNineMediaIE(InfoExtractor): formats.extend(self._extract_mpd_formats( manifest_base_url + 'mpd', content_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) thumbnails = [] for image in (content.get('Images') or []): diff --git a/hypervideo_dl/extractor/ninegag.py b/hypervideo_dl/extractor/ninegag.py index 1439082..865ad99 100644 --- a/hypervideo_dl/extractor/ninegag.py 
+++ b/hypervideo_dl/extractor/ninegag.py @@ -1,11 +1,9 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, determine_ext, int_or_none, - try_get, + traverse_obj, unescapeHTML, url_or_none, ) @@ -13,18 +11,20 @@ from ..utils import ( class NineGagIE(InfoExtractor): IE_NAME = '9gag' + IE_DESC = '9GAG' _VALID_URL = r'https?://(?:www\.)?9gag\.com/gag/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://9gag.com/gag/ae5Ag7B', 'info_dict': { 'id': 'ae5Ag7B', - 'ext': 'mp4', + 'ext': 'webm', 'title': 'Capybara Agility Training', 'upload_date': '20191108', 'timestamp': 1573237208, + 'thumbnail': 'https://img-9gag-fun.9cache.com/photo/ae5Ag7B_460s.jpg', 'categories': ['Awesome'], - 'tags': ['Weimaraner', 'American Pit Bull Terrier'], + 'tags': ['Awesome'], 'duration': 44, 'like_count': int, 'dislike_count': int, @@ -34,6 +34,26 @@ class NineGagIE(InfoExtractor): # HTML escaped title 'url': 'https://9gag.com/gag/av5nvyb', 'only_matching': True, + }, { + # Non Anonymous Uploader + 'url': 'https://9gag.com/gag/ajgp66G', + 'info_dict': { + 'id': 'ajgp66G', + 'ext': 'webm', + 'title': 'Master Shifu! Or Splinter! You decide:', + 'upload_date': '20220806', + 'timestamp': 1659803411, + 'thumbnail': 'https://img-9gag-fun.9cache.com/photo/ajgp66G_460s.jpg', + 'categories': ['Funny'], + 'tags': ['Funny'], + 'duration': 26, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'uploader': 'Peter Klaus', + 'uploader_id': 'peterklaus12', + 'uploader_url': 'https://9gag.com/u/peterklaus12', + } }] def _real_extract(self, url): @@ -48,8 +68,6 @@ class NineGagIE(InfoExtractor): 'The given url does not contain a video', expected=True) - title = unescapeHTML(post['title']) - duration = None formats = [] thumbnails = [] @@ -98,9 +116,8 @@ class NineGagIE(InfoExtractor): 'format_id': image_id, }) formats.append(common) - self._sort_formats(formats) - section = try_get(post, lambda x: x['postSection']['name']) + section = traverse_obj(post, ('postSection', 'name')) tags = None post_tags = post.get('tags') @@ -112,18 +129,19 @@ class NineGagIE(InfoExtractor): continue tags.append(tag_key) - get_count = lambda x: int_or_none(post.get(x + 'Count')) - return { 'id': post_id, - 'title': title, + 'title': unescapeHTML(post.get('title')), 'timestamp': int_or_none(post.get('creationTs')), 'duration': duration, + 'uploader': traverse_obj(post, ('creator', 'fullName')), + 'uploader_id': traverse_obj(post, ('creator', 'username')), + 'uploader_url': url_or_none(traverse_obj(post, ('creator', 'profileUrl'))), 'formats': formats, 'thumbnails': thumbnails, - 'like_count': get_count('upVote'), - 'dislike_count': get_count('downVote'), - 'comment_count': get_count('comments'), + 'like_count': int_or_none(post.get('upVoteCount')), + 'dislike_count': int_or_none(post.get('downVoteCount')), + 'comment_count': int_or_none(post.get('commentsCount')), 'age_limit': 18 if post.get('nsfw') == 1 else None, 'categories': [section] if section else None, 'tags': tags, diff --git a/hypervideo_dl/extractor/ninenow.py b/hypervideo_dl/extractor/ninenow.py index 6043674..b970f8c 100644 --- a/hypervideo_dl/extractor/ninenow.py +++ b/hypervideo_dl/extractor/ninenow.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/nintendo.py b/hypervideo_dl/extractor/nintendo.py index ff8f70b..ed839af 100644 --- 
a/hypervideo_dl/extractor/nintendo.py +++ b/hypervideo_dl/extractor/nintendo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/nitter.py b/hypervideo_dl/extractor/nitter.py index 8bb709c..251bf44 100644 --- a/hypervideo_dl/extractor/nitter.py +++ b/hypervideo_dl/extractor/nitter.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( diff --git a/hypervideo_dl/extractor/njpwworld.py b/hypervideo_dl/extractor/njpwworld.py index 68c8c8e..7b8a526 100644 --- a/hypervideo_dl/extractor/njpwworld.py +++ b/hypervideo_dl/extractor/njpwworld.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -72,8 +69,6 @@ class NJPWWorldIE(InfoExtractor): formats += self._extract_m3u8_formats( player_url, video_id, 'mp4', 'm3u8_native', m3u8_id=kind, fatal=False, quality=int(kind == 'high')) - self._sort_formats(formats) - tag_block = get_element_by_class('tag-block', webpage) tags = re.findall( r'<a[^>]+class="tag-[^"]+"[^>]*>([^<]+)</a>', tag_block diff --git a/hypervideo_dl/extractor/nobelprize.py b/hypervideo_dl/extractor/nobelprize.py index 4dfdb09..1aa9705 100644 --- a/hypervideo_dl/extractor/nobelprize.py +++ b/hypervideo_dl/extractor/nobelprize.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( js_to_json, @@ -51,7 +48,6 @@ class NobelPrizeIE(InfoExtractor): formats.append({ 'url': source_src, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/noco.py b/hypervideo_dl/extractor/noco.py deleted file mode 100644 index 28af909..0000000 --- a/hypervideo_dl/extractor/noco.py +++ /dev/null @@ -1,228 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import time -import hashlib - -from .common import InfoExtractor -from ..compat import ( - compat_str, -) -from ..utils import ( - clean_html, - ExtractorError, - int_or_none, - float_or_none, - parse_iso8601, - parse_qs, - sanitized_Request, - urlencode_postdata, -) - - -class NocoIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:www\.)?noco\.tv/emission/|player\.noco\.tv/\?idvideo=)(?P<id>\d+)' - _LOGIN_URL = 'https://noco.tv/do.php' - _API_URL_TEMPLATE = 'https://api.noco.tv/1.1/%s?ts=%s&tk=%s' - _SUB_LANG_TEMPLATE = '&sub_lang=%s' - _NETRC_MACHINE = 'noco' - - _TESTS = [ - { - 'url': 'http://noco.tv/emission/11538/nolife/ami-ami-idol-hello-france/', - 'md5': '0a993f0058ddbcd902630b2047ef710e', - 'info_dict': { - 'id': '11538', - 'ext': 'mp4', - 'title': 'Ami Ami Idol - Hello! 
France', - 'description': 'md5:4eaab46ab68fa4197a317a88a53d3b86', - 'upload_date': '20140412', - 'uploader': 'Nolife', - 'uploader_id': 'NOL', - 'duration': 2851.2, - }, - 'skip': 'Requires noco account', - }, - { - 'url': 'http://noco.tv/emission/12610/lbl42/the-guild/s01e01-wake-up-call', - 'md5': 'c190f1f48e313c55838f1f412225934d', - 'info_dict': { - 'id': '12610', - 'ext': 'mp4', - 'title': 'The Guild #1 - Wake-Up Call', - 'timestamp': 1403863200, - 'upload_date': '20140627', - 'uploader': 'LBL42', - 'uploader_id': 'LBL', - 'duration': 233.023, - }, - 'skip': 'Requires noco account', - } - ] - - def _perform_login(self, username, password): - login = self._download_json( - self._LOGIN_URL, None, 'Logging in', - data=urlencode_postdata({ - 'a': 'login', - 'cookie': '1', - 'username': username, - 'password': password, - }), - headers={ - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - }) - - if 'erreur' in login: - raise ExtractorError('Unable to login: %s' % clean_html(login['erreur']), expected=True) - - @staticmethod - def _ts(): - return int(time.time() * 1000) - - def _call_api(self, path, video_id, note, sub_lang=None): - ts = compat_str(self._ts() + self._ts_offset) - tk = hashlib.md5((hashlib.md5(ts.encode('ascii')).hexdigest() + '#8S?uCraTedap6a').encode('ascii')).hexdigest() - url = self._API_URL_TEMPLATE % (path, ts, tk) - if sub_lang: - url += self._SUB_LANG_TEMPLATE % sub_lang - - request = sanitized_Request(url) - request.add_header('Referer', self._referer) - - resp = self._download_json(request, video_id, note) - - if isinstance(resp, dict) and resp.get('error'): - self._raise_error(resp['error'], resp['description']) - - return resp - - def _raise_error(self, error, description): - raise ExtractorError( - '%s returned error: %s - %s' % (self.IE_NAME, error, description), - expected=True) - - def _real_extract(self, url): - video_id = self._match_id(url) - - # Timestamp adjustment offset between server time and local time - # must be calculated in order to use timestamps closest to server's - # in all API requests (see https://github.com/ytdl-org/youtube-dl/issues/7864) - webpage = self._download_webpage(url, video_id) - - player_url = self._search_regex( - r'(["\'])(?P<player>https?://noco\.tv/(?:[^/]+/)+NocoPlayer.+?\.swf.*?)\1', - webpage, 'noco player', group='player', - default='http://noco.tv/cdata/js/player/NocoPlayer-v1.2.40.swf') - - qs = parse_qs(player_url) - ts = int_or_none(qs.get('ts', [None])[0]) - self._ts_offset = ts - self._ts() if ts else 0 - self._referer = player_url - - medias = self._call_api( - 'shows/%s/medias' % video_id, - video_id, 'Downloading video JSON') - - show = self._call_api( - 'shows/by_id/%s' % video_id, - video_id, 'Downloading show JSON')[0] - - options = self._call_api( - 'users/init', video_id, - 'Downloading user options JSON')['options'] - audio_lang_pref = options.get('audio_language') or options.get('language', 'fr') - - if audio_lang_pref == 'original': - audio_lang_pref = show['original_lang'] - if len(medias) == 1: - audio_lang_pref = list(medias.keys())[0] - elif audio_lang_pref not in medias: - audio_lang_pref = 'fr' - - qualities = self._call_api( - 'qualities', - video_id, 'Downloading qualities JSON') - - formats = [] - - for audio_lang, audio_lang_dict in medias.items(): - preference = 1 if audio_lang == audio_lang_pref else 0 - for sub_lang, lang_dict in audio_lang_dict['video_list'].items(): - for format_id, fmt in lang_dict['quality_list'].items(): - format_id_extended = 'audio-%s_sub-%s_%s' % 
(audio_lang, sub_lang, format_id) - - video = self._call_api( - 'shows/%s/video/%s/%s' % (video_id, format_id.lower(), audio_lang), - video_id, 'Downloading %s video JSON' % format_id_extended, - sub_lang if sub_lang != 'none' else None) - - file_url = video['file'] - if not file_url: - continue - - if file_url in ['forbidden', 'not found']: - popmessage = video['popmessage'] - self._raise_error(popmessage['title'], popmessage['message']) - - formats.append({ - 'url': file_url, - 'format_id': format_id_extended, - 'width': int_or_none(fmt.get('res_width')), - 'height': int_or_none(fmt.get('res_lines')), - 'abr': int_or_none(fmt.get('audiobitrate'), 1000), - 'vbr': int_or_none(fmt.get('videobitrate'), 1000), - 'filesize': int_or_none(fmt.get('filesize')), - 'format_note': qualities[format_id].get('quality_name'), - 'quality': qualities[format_id].get('priority'), - 'language_preference': preference, - }) - - self._sort_formats(formats) - - timestamp = parse_iso8601(show.get('online_date_start_utc'), ' ') - - if timestamp is not None and timestamp < 0: - timestamp = None - - uploader = show.get('partner_name') - uploader_id = show.get('partner_key') - duration = float_or_none(show.get('duration_ms'), 1000) - - thumbnails = [] - for thumbnail_key, thumbnail_url in show.items(): - m = re.search(r'^screenshot_(?P<width>\d+)x(?P<height>\d+)$', thumbnail_key) - if not m: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'width': int(m.group('width')), - 'height': int(m.group('height')), - }) - - episode = show.get('show_TT') or show.get('show_OT') - family = show.get('family_TT') or show.get('family_OT') - episode_number = show.get('episode_number') - - title = '' - if family: - title += family - if episode_number: - title += ' #' + compat_str(episode_number) - if episode: - title += ' - ' + compat_str(episode) - - description = show.get('show_resume') or show.get('family_resume') - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnails': thumbnails, - 'timestamp': timestamp, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'formats': formats, - } diff --git a/hypervideo_dl/extractor/nonktube.py b/hypervideo_dl/extractor/nonktube.py index ca1424e..f191be3 100644 --- a/hypervideo_dl/extractor/nonktube.py +++ b/hypervideo_dl/extractor/nonktube.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .nuevo import NuevoBaseIE diff --git a/hypervideo_dl/extractor/noodlemagazine.py b/hypervideo_dl/extractor/noodlemagazine.py index 2f170bb..e620895 100644 --- a/hypervideo_dl/extractor/noodlemagazine.py +++ b/hypervideo_dl/extractor/noodlemagazine.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_duration, @@ -50,8 +47,6 @@ class NoodleMagazineIE(InfoExtractor): 'ext': source.get('type'), } for source in playlist_info.get('sources')] - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, diff --git a/hypervideo_dl/extractor/noovo.py b/hypervideo_dl/extractor/noovo.py index b40770d..acbb74c 100644 --- a/hypervideo_dl/extractor/noovo.py +++ b/hypervideo_dl/extractor/noovo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .brightcove import BrightcoveNewIE from .common import InfoExtractor from ..compat import compat_str diff --git a/hypervideo_dl/extractor/normalboots.py b/hypervideo_dl/extractor/normalboots.py index 61fe571..07babcd 100644 --- 
a/hypervideo_dl/extractor/normalboots.py +++ b/hypervideo_dl/extractor/normalboots.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .jwplatform import JWPlatformIE diff --git a/hypervideo_dl/extractor/nosnl.py b/hypervideo_dl/extractor/nosnl.py new file mode 100644 index 0000000..eba94c4 --- /dev/null +++ b/hypervideo_dl/extractor/nosnl.py @@ -0,0 +1,95 @@ +from .common import InfoExtractor +from ..utils import parse_duration, parse_iso8601, traverse_obj + + +class NOSNLArticleIE(InfoExtractor): + _VALID_URL = r'https?://nos\.nl/((?!video)(\w+/)?\w+/)\d+-(?P<display_id>[\w-]+)' + _TESTS = [ + { + # only 1 video + 'url': 'https://nos.nl/nieuwsuur/artikel/2440353-verzakking-door-droogte-dreigt-tot-een-miljoen-kwetsbare-huizen', + 'info_dict': { + 'id': '2440340', + 'ext': 'mp4', + 'description': 'md5:5f83185d902ac97af3af4bed7ece3db5', + 'title': '\'We hebben een huis vol met scheuren\'', + 'duration': 95.0, + 'thumbnail': 'https://cdn.nos.nl/image/2022/08/12/887149/3840x2160a.jpg', + } + }, { + # more than 1 video + 'url': 'https://nos.nl/artikel/2440409-vannacht-sliepen-weer-enkele-honderden-asielzoekers-in-ter-apel-buiten', + 'info_dict': { + 'id': '2440409', + 'title': 'Vannacht sliepen weer enkele honderden asielzoekers in Ter Apel buiten', + 'description': 'Er werd wel geprobeerd om kwetsbare migranten onderdak te bieden, zegt het COA.', + 'tags': ['aanmeldcentrum', 'Centraal Orgaan opvang asielzoekers', 'COA', 'asielzoekers', 'Ter Apel'], + 'modified_timestamp': 1660452773, + 'modified_date': '20220814', + 'upload_date': '20220813', + 'thumbnail': 'https://cdn.nos.nl/image/2022/07/18/880346/1024x576a.jpg', + 'timestamp': 1660401384, + }, + 'playlist_count': 2, + }, { + # audio + video + 'url': 'https://nos.nl/artikel/2440789-wekdienst-16-8-groningse-acties-tien-jaar-na-zware-aardbeving-femke-bol-in-actie-op-ek-atletiek', + 'info_dict': { + 'id': '2440789', + 'title': 'Wekdienst 16/8: Groningse acties tien jaar na zware aardbeving • Femke Bol in actie op EK atletiek ', + 'description': 'Nieuws, weer, verkeer: met dit overzicht begin je geïnformeerd aan de dag.', + 'tags': ['wekdienst'], + 'modified_date': '20220816', + 'modified_timestamp': 1660625449, + 'timestamp': 1660625449, + 'upload_date': '20220816', + 'thumbnail': 'https://cdn.nos.nl/image/2022/08/16/888178/1024x576a.jpg', + }, + 'playlist_count': 2, + } + ] + + def _entries(self, nextjs_json, display_id): + for item in nextjs_json['items']: + if item.get('type') == 'video': + formats, subtitle = self._extract_m3u8_formats_and_subtitles( + traverse_obj(item, ('source', 'url')), display_id, ext='mp4') + yield { + 'id': str(item['id']), + 'title': item.get('title'), + 'description': item.get('description'), + 'formats': formats, + 'subtitles': subtitle, + 'duration': parse_duration(item.get('duration')), + 'thumbnails': [{ + 'url': traverse_obj(image, ('url', ...), get_all=False), + 'width': image.get('width'), + 'height': image.get('height') + } for image in traverse_obj(item, ('imagesByRatio', ...))[0]], + } + + elif item.get('type') == 'audio': + yield { + 'id': str(item['id']), + 'title': item.get('title'), + 'url': traverse_obj(item, ('media', 'src')), + 'ext': 'mp3', + } + + def _real_extract(self, url): + display_id = self._match_valid_url(url).group('display_id') + webpage = self._download_webpage(url, display_id) + + nextjs_json = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['data'] + return { + '_type': 'playlist', + 'entries': 
self._entries(nextjs_json, display_id), + 'id': str(nextjs_json['id']), + 'title': nextjs_json.get('title') or self._html_search_meta(['title', 'og:title', 'twitter:title'], webpage), + 'description': (nextjs_json.get('description') + or self._html_search_meta(['description', 'twitter:description', 'og:description'], webpage)), + 'tags': nextjs_json.get('keywords'), + 'modified_timestamp': parse_iso8601(nextjs_json.get('modifiedAt')), + 'thumbnail': nextjs_json.get('shareImageSrc') or self._html_search_meta(['og:image', 'twitter:image'], webpage), + 'timestamp': parse_iso8601(nextjs_json.get('publishedAt')) + } diff --git a/hypervideo_dl/extractor/nosvideo.py b/hypervideo_dl/extractor/nosvideo.py index 53c500c..b6d3ea4 100644 --- a/hypervideo_dl/extractor/nosvideo.py +++ b/hypervideo_dl/extractor/nosvideo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/nova.py b/hypervideo_dl/extractor/nova.py index 00a64f8..8bd3fd4 100644 --- a/hypervideo_dl/extractor/nova.py +++ b/hypervideo_dl/extractor/nova.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -126,7 +123,6 @@ class NovaEmbedIE(InfoExtractor): if not formats and has_drm: self.report_drm(video_id) - self._sort_formats(formats) title = self._og_search_title( webpage, default=None) or self._search_regex( @@ -311,7 +307,6 @@ class NovaIE(InfoExtractor): formats = [{ 'url': video_url, }] - self._sort_formats(formats) title = mediafile.get('meta', {}).get('title') or self._og_search_title(webpage) thumbnail = config.get('poster') diff --git a/hypervideo_dl/extractor/novaplay.py b/hypervideo_dl/extractor/novaplay.py index bfb2c87..92d1d13 100644 --- a/hypervideo_dl/extractor/novaplay.py +++ b/hypervideo_dl/extractor/novaplay.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor from ..utils import int_or_none, parse_duration, parse_iso8601 @@ -7,46 +6,55 @@ class NovaPlayIE(InfoExtractor): _VALID_URL = r'https://play.nova\.bg/video/.*/(?P<id>\d+)' _TESTS = [ { - 'url': 'https://play.nova.bg/video/bratya/season-3/bratq-2021-10-08/548677', - 'md5': 'b1127a84e61bed1632b7c2ca9cbb4153', + 'url': 'https://play.nova.bg/video/ochakvaite/season-0/ochakvaite-2022-07-22-sybudi-se-sat/606627', + 'md5': 'd79dff2d09d196c595a7290f48e33399', 'info_dict': { - 'id': '548677', + 'id': '606627', 'ext': 'mp4', - 'title': 'Братя', - 'alt_title': 'bratya/season-3/bratq-2021-10-08', - 'duration': 1603.0, - 'timestamp': 1633724150, - 'upload_date': '20211008', - 'thumbnail': 'https://nbg-img.fite.tv/img/548677_460x260.jpg', - 'description': 'Сезон 3 Епизод 25' + 'title': 'Събуди се - събота по NOVA (23.07.2022)', + 'alt_title': 'ochakvaite/season-0/ochakvaite-2022-07-22-sybudi-se-sat', + 'duration': 29.0, + 'timestamp': 1658491547, + 'upload_date': '20220722', + 'thumbnail': 'https://nbg-img.fite.tv/img/606627_460x260.jpg', + 'description': '29 сек', + 'view_count': False }, }, { - 'url': 'https://play.nova.bg/video/igri-na-volqta/season-3/igri-na-volqta-2021-09-20-1/548227', - 'md5': '5fd61b8ecbe582fc021019d570965d58', + 'url': 'https://play.nova.bg/video/ochakvaite/season-0/ochakvaite-2022-07-22-cherry-tazi/606609', + 'md5': 'f3e973e2ed1a5b9b3f498b1ab82d01b3', 'info_dict': { - 'id': '548227', + 'id': '606609', 'ext': 'mp4', - 'title': 'Игри на волята: България (20.09.2021) - част 1', - 'alt_title': 'gri-na-volqta/season-3/igri-na-volqta-2021-09-20-1', - 'duration': 
4060.0, - 'timestamp': 1632167564, - 'upload_date': '20210920', - 'thumbnail': 'https://nbg-img.fite.tv/img/548227_460x260.jpg', - 'description': 'Сезон 3 Епизод 13' + 'title': 'Черешката на тортата - тази вечер по NOVA (22.07.2022)', + 'alt_title': 'ochakvaite/season-0/ochakvaite-2022-07-22-cherry-tazi', + 'duration': 29.0, + 'timestamp': 1658476303, + 'upload_date': '20220722', + 'thumbnail': 'https://nbg-img.fite.tv/img/606609_460x260.jpg', + 'description': '29 сек', + 'view_count': False }, } ] + _access_token = None + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + self._access_token = self._access_token or self._download_json( + 'https://play.nova.bg/api/client', None, note='Fetching access token')['accessToken'] video_props = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['video'] m3u8_url = self._download_json( f'https://nbg-api.fite.tv/api/v2/videos/{video_id}/streams', - video_id, headers={'x-flipps-user-agent': 'Flipps/75/9.7'})[0]['url'] + video_id, headers={ + 'x-flipps-user-agent': 'Flipps/75/9.7', + 'x-flipps-version': '2022-05-17', + 'Authorization': f'Bearer {self._access_token}' + })[0]['links']['play']['href'] formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4', m3u8_id='hls') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/nowness.py b/hypervideo_dl/extractor/nowness.py index 20ef4cd..18bb880 100644 --- a/hypervideo_dl/extractor/nowness.py +++ b/hypervideo_dl/extractor/nowness.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .brightcove import ( BrightcoveLegacyIE, BrightcoveNewIE, diff --git a/hypervideo_dl/extractor/noz.py b/hypervideo_dl/extractor/noz.py index ccafd77..59d259f 100644 --- a/hypervideo_dl/extractor/noz.py +++ b/hypervideo_dl/extractor/noz.py @@ -1,17 +1,11 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_xpath, -) from ..utils import ( int_or_none, find_xpath_attr, xpath_text, update_url_query, ) +from ..compat import compat_urllib_parse_unquote class NozIE(InfoExtractor): @@ -50,7 +44,7 @@ class NozIE(InfoExtractor): duration = int_or_none(xpath_text( doc, './/article/movie/file/duration')) formats = [] - for qnode in doc.findall(compat_xpath('.//article/movie/file/qualities/qual')): + for qnode in doc.findall('.//article/movie/file/qualities/qual'): http_url_ele = find_xpath_attr( qnode, './html_urls/video_url', 'format', 'video/mp4') http_url = http_url_ele.text if http_url_ele is not None else None @@ -77,7 +71,6 @@ class NozIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/npo.py b/hypervideo_dl/extractor/npo.py index a8aaef6..f18cb9e 100644 --- a/hypervideo_dl/extractor/npo.py +++ b/hypervideo_dl/extractor/npo.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -249,8 +247,6 @@ class NPOIE(NPOBaseIE): if not self.get_param('allow_unplayable_formats') and drm: self.report_drm(video_id) - self._sort_formats(formats) - info = { 'id': video_id, 'title': video_id, @@ -456,8 +452,6 @@ class NPOIE(NPOBaseIE): 'quality': stream.get('kwaliteit'), }) - self._sort_formats(formats) - subtitles = {} if metadata.get('tt888') == 'ja': subtitles['nl'] = [{ 
@@ -601,7 +595,7 @@ class NPORadioFragmentIE(InfoExtractor): } -class NPODataMidEmbedIE(InfoExtractor): +class NPODataMidEmbedIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) @@ -655,7 +649,7 @@ class HetKlokhuisIE(NPODataMidEmbedIE): } -class NPOPlaylistBaseIE(NPOIE): +class NPOPlaylistBaseIE(NPOIE): # XXX: Do not subclass from concrete IE def _real_extract(self, url): playlist_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/npr.py b/hypervideo_dl/extractor/npr.py index 49f062d..4b6855c 100644 --- a/hypervideo_dl/extractor/npr.py +++ b/hypervideo_dl/extractor/npr.py @@ -1,11 +1,5 @@ -from __future__ import unicode_literals - from .common import InfoExtractor -from ..utils import ( - int_or_none, - qualities, - url_or_none, -) +from ..utils import int_or_none, qualities, traverse_obj, url_or_none class NprIE(InfoExtractor): @@ -53,6 +47,15 @@ class NprIE(InfoExtractor): # multimedia, no formats, stream 'url': 'https://www.npr.org/2020/02/14/805476846/laura-stevenson-tiny-desk-concert', 'only_matching': True, + }, { + 'url': 'https://www.npr.org/2022/03/15/1084896560/bonobo-tiny-desk-home-concert', + 'info_dict': { + 'id': '1086468851', + 'ext': 'mp4', + 'title': 'Bonobo: Tiny Desk (Home) Concert', + 'duration': 1061, + 'thumbnail': r're:^https?://media.npr.org/assets/img/.*\.jpg$', + }, }] def _real_extract(self, url): @@ -112,7 +115,11 @@ class NprIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( stream_url, stream_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) + + if not formats: + raw_json_ld = self._yield_json_ld(self._download_webpage(url, playlist_id), playlist_id, fatal=False) + m3u8_url = traverse_obj(list(raw_json_ld), (..., 'subjectOf', ..., 'embedUrl'), get_all=False) + formats = self._extract_m3u8_formats(m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False) entries.append({ 'id': media_id, diff --git a/hypervideo_dl/extractor/nrk.py b/hypervideo_dl/extractor/nrk.py index 4d723e8..88d08e5 100644 --- a/hypervideo_dl/extractor/nrk.py +++ b/hypervideo_dl/extractor/nrk.py @@ -1,22 +1,19 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import random import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import compat_HTTPError, compat_str from ..utils import ( - compat_HTTPError, - determine_ext, ExtractorError, + determine_ext, int_or_none, parse_duration, + parse_iso8601, str_or_none, try_get, - urljoin, url_or_none, + urljoin, ) @@ -61,8 +58,7 @@ class NRKBaseIE(InfoExtractor): return self._download_json( urljoin('https://psapi.nrk.no/', path), video_id, note or 'Downloading %s JSON' % item, - fatal=fatal, query=query, - headers={'Accept-Encoding': 'gzip, deflate, br'}) + fatal=fatal, query=query) class NRKIE(NRKBaseIE): @@ -184,7 +180,6 @@ class NRKIE(NRKBaseIE): 'format_id': asset_format, 'vcodec': 'none', }) - self._sort_formats(formats) data = call_playback_api('metadata') @@ -247,6 +242,7 @@ class NRKIE(NRKBaseIE): 'age_limit': age_limit, 'formats': formats, 'subtitles': subtitles, + 'timestamp': parse_iso8601(try_get(manifest, lambda x: x['availability']['onDemand']['from'], str)) } if is_series: @@ -738,7 +734,7 @@ class NRKTVSeriesIE(NRKTVSerieBaseIE): entries, series_id, titles.get('title'), titles.get('subtitle')) -class NRKTVDirekteIE(NRKTVIE): +class NRKTVDirekteIE(NRKTVIE): # XXX: Do 
not subclass from concrete IE IE_DESC = 'NRK TV Direkte and NRK Radio Direkte' _VALID_URL = r'https?://(?:tv|radio)\.nrk\.no/direkte/(?P<id>[^/?#&]+)' @@ -797,7 +793,7 @@ class NRKPlaylistBaseIE(InfoExtractor): for video_id in re.findall(self._ITEM_RE, webpage) ] - playlist_title = self. _extract_title(webpage) + playlist_title = self._extract_title(webpage) playlist_description = self._extract_description(webpage) return self.playlist_result( diff --git a/hypervideo_dl/extractor/nrl.py b/hypervideo_dl/extractor/nrl.py index 0bd5086..798d034 100644 --- a/hypervideo_dl/extractor/nrl.py +++ b/hypervideo_dl/extractor/nrl.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/ntvcojp.py b/hypervideo_dl/extractor/ntvcojp.py index c9af911..422ec6e 100644 --- a/hypervideo_dl/extractor/ntvcojp.py +++ b/hypervideo_dl/extractor/ntvcojp.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/ntvde.py b/hypervideo_dl/extractor/ntvde.py index 035582e..6d7ea3d 100644 --- a/hypervideo_dl/extractor/ntvde.py +++ b/hypervideo_dl/extractor/ntvde.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -63,7 +60,6 @@ class NTVDeIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', quality=1, m3u8_id='hls', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/ntvru.py b/hypervideo_dl/extractor/ntvru.py index c47d1df..8d5877d 100644 --- a/hypervideo_dl/extractor/ntvru.py +++ b/hypervideo_dl/extractor/ntvru.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -118,7 +115,6 @@ class NTVRuIE(InfoExtractor): 'url': file_, 'filesize': int_or_none(xpath_text(video, './%ssize' % format_id)), }) - self._sort_formats(formats) return { 'id': xpath_text(video, './id'), diff --git a/hypervideo_dl/extractor/nuevo.py b/hypervideo_dl/extractor/nuevo.py index be1e09d..ec54041 100644 --- a/hypervideo_dl/extractor/nuevo.py +++ b/hypervideo_dl/extractor/nuevo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/nuvid.py b/hypervideo_dl/extractor/nuvid.py index 84fb97d..6ac351c 100644 --- a/hypervideo_dl/extractor/nuvid.py +++ b/hypervideo_dl/extractor/nuvid.py @@ -1,5 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals import re from .common import InfoExtractor @@ -82,7 +80,6 @@ class NuvidIE(InfoExtractor): } for quality, source in video_data.get('files').items() if source] self._check_formats(formats, video_id) - self._sort_formats(formats) duration = parse_duration(traverse_obj(video_data, 'duration', 'duration_format')) thumbnails = [ diff --git a/hypervideo_dl/extractor/nytimes.py b/hypervideo_dl/extractor/nytimes.py index 9996473..2e21edb 100644 --- a/hypervideo_dl/extractor/nytimes.py +++ b/hypervideo_dl/extractor/nytimes.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hmac import hashlib import base64 @@ -75,7 +72,6 @@ class NYTimesBaseIE(InfoExtractor): 'tbr': int_or_none(video.get('bitrate'), 1000) or None, 'ext': ext, }) - 
self._sort_formats(formats) thumbnails = [] for image in video_data.get('images', []): @@ -106,6 +102,7 @@ class NYTimesBaseIE(InfoExtractor): class NYTimesIE(NYTimesBaseIE): _VALID_URL = r'https?://(?:(?:www\.)?nytimes\.com/video/(?:[^/]+/)+?|graphics8\.nytimes\.com/bcvideo/\d+(?:\.\d+)?/iframe/embed\.html\?videoId=)(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//graphics8\.nytimes\.com/bcvideo/[^/]+/iframe/embed\.html.+?)\1>'] _TESTS = [{ 'url': 'http://www.nytimes.com/video/opinion/100000002847155/verbatim-what-is-a-photocopier.html?playlistId=100000001150263', diff --git a/hypervideo_dl/extractor/nzherald.py b/hypervideo_dl/extractor/nzherald.py index e5601b4..062f9a8 100644 --- a/hypervideo_dl/extractor/nzherald.py +++ b/hypervideo_dl/extractor/nzherald.py @@ -1,9 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals +import json from .brightcove import BrightcoveNewIE from .common import InfoExtractor - from ..compat import compat_str from ..utils import ( ExtractorError, @@ -16,17 +14,20 @@ class NZHeraldIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?nzherald\.co\.nz/[\w\/-]+\/(?P<id>[A-Z0-9]+)' _TESTS = [ { - 'url': 'https://www.nzherald.co.nz/nz/weather-heavy-rain-gales-across-nz-most-days-this-week/PTG7QWY4E2225YHZ5NAIRBTYTQ/', + # Video accessible under 'video' key + 'url': 'https://www.nzherald.co.nz/nz/queen-elizabeth-death-nz-public-holiday-announced-for-september-26/CEOPBSXO2JDCLNK3H7E3BIE2FA/', 'info_dict': { - 'id': '6271084466001', + 'id': '6312191736112', 'ext': 'mp4', - 'title': 'MetService severe weather warning: September 6th - 7th', - 'timestamp': 1630891576, - 'upload_date': '20210906', + 'title': 'Focus: PM holds post-Cabinet press conference', + 'duration': 238.08, + 'upload_date': '20220912', 'uploader_id': '1308227299001', - 'description': 'md5:db6ca335a22e2cdf37ab9d2bcda52902' + 'timestamp': 1662957159, + 'tags': [], + 'thumbnail': r're:https?://.*\.jpg$', + 'description': 'md5:2f17713fcbfcfbe38bb9e7dfccbb0f2e', } - }, { # Webpage has brightcove embed player url 'url': 'https://www.nzherald.co.nz/travel/pencarrow-coastal-trail/HDVTPJEPP46HJ2UEMK4EGD2DFI/', @@ -37,9 +38,11 @@ class NZHeraldIE(InfoExtractor): 'timestamp': 1625102897, 'upload_date': '20210701', 'uploader_id': '1308227299001', - 'description': 'md5:d361aaa0c6498f7ac1bc4fc0a0aec1e4' + 'description': 'md5:d361aaa0c6498f7ac1bc4fc0a0aec1e4', + 'thumbnail': r're:https?://.*\.jpg$', + 'tags': ['travel', 'video'], + 'duration': 43.627, } - }, { # two video embeds of the same video 'url': 'https://www.nzherald.co.nz/nz/truck-driver-captured-cutting-off-motorist-on-state-highway-1-in-canterbury/FIHNJB7PLLPHWQPK4S7ZBDUC4I/', @@ -51,6 +54,22 @@ class NZHeraldIE(InfoExtractor): 'upload_date': '20210429', 'uploader_id': '1308227299001', 'description': 'md5:4cae7dfb7613ac4c73b9e73a75c6b5d7' + }, + 'skip': 'video removed', + }, { + # customVideo embed requiring additional API call + 'url': 'https://www.nzherald.co.nz/nz/politics/reserve-bank-rejects-political-criticisms-stands-by-review/2JO5Q4WLZRCBBNWTLACZMOP4RA/', + 'info_dict': { + 'id': '6315123873112', + 'ext': 'mp4', + 'timestamp': 1667862725, + 'title': 'Focus: Luxon on re-appointment of Reserve Bank governor Adrian Orr', + 'upload_date': '20221107', + 'description': 'md5:df2f1f7033a8160c66e28e4743f5d934', + 'uploader_id': '1308227299001', + 'tags': ['video', 'nz herald focus', 'politics', 'politics videos'], + 'thumbnail': r're:https?://.*\.jpg$', + 'duration': 99.584, } }, { 'url': 
'https://www.nzherald.co.nz/kahu/kaupapa-companies-my-taiao-supporting-maori-in-study-and-business/PQBO2J25WCG77VGRX7W7BVYEAI/', @@ -83,6 +102,12 @@ class NZHeraldIE(InfoExtractor): self._search_regex(r'Fusion\.globalContent\s*=\s*({.+?})\s*;', webpage, 'fusion metadata'), article_id) video_metadata = fusion_metadata.get('video') + if not video_metadata: + custom_video_id = traverse_obj(fusion_metadata, ('customVideo', 'embed', 'id'), expected_type=str) + if custom_video_id: + video_metadata = self._download_json( + 'https://www.nzherald.co.nz/pf/api/v3/content/fetch/full-content-by-id', article_id, + query={'query': json.dumps({'id': custom_video_id, 'site': 'nzh'}), '_website': 'nzh'}) bc_video_id = traverse_obj( video_metadata or fusion_metadata, # fusion metadata is the video metadata for video-only pages 'brightcoveId', ('content_elements', ..., 'referent', 'id'), diff --git a/hypervideo_dl/extractor/nzz.py b/hypervideo_dl/extractor/nzz.py index 61ee77a..ac3b731 100644 --- a/hypervideo_dl/extractor/nzz.py +++ b/hypervideo_dl/extractor/nzz.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/odatv.py b/hypervideo_dl/extractor/odatv.py index 314527f..24ab939 100644 --- a/hypervideo_dl/extractor/odatv.py +++ b/hypervideo_dl/extractor/odatv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/odnoklassniki.py b/hypervideo_dl/extractor/odnoklassniki.py index 293f1aa..4f325f0 100644 --- a/hypervideo_dl/extractor/odnoklassniki.py +++ b/hypervideo_dl/extractor/odnoklassniki.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import ( compat_etree_fromstring, @@ -13,10 +8,12 @@ from ..compat import ( from ..utils import ( ExtractorError, float_or_none, - unified_strdate, int_or_none, qualities, + smuggle_url, unescapeHTML, + unified_strdate, + unsmuggle_url, urlencode_postdata, ) @@ -27,13 +24,14 @@ class OdnoklassnikiIE(InfoExtractor): (?:(?:www|m|mobile)\.)? 
(?:odnoklassniki|ok)\.ru/ (?: - video(?:embed)?/| + video(?P<embed>embed)?/| web-api/video/moviePlayer/| live/| dk\?.*?st\.mvId= ) (?P<id>[\d-]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1'] _TESTS = [{ 'note': 'Coub embedded', 'url': 'http://ok.ru/video/1484130554189', @@ -42,7 +40,7 @@ class OdnoklassnikiIE(InfoExtractor): 'ext': 'mp4', 'timestamp': 1545580896, 'view_count': int, - 'thumbnail': 'https://coub-anubis-a.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg', + 'thumbnail': 'https://coub-attachments.akamaized.net/coub_storage/coub/simple/cw_image/c5ac87553bd/608e806a1239c210ab692/1545580913_00026.jpg', 'title': 'Народная забава', 'uploader': 'Nevata', 'upload_date': '20181223', @@ -69,11 +67,12 @@ class OdnoklassnikiIE(InfoExtractor): }, { # metadata in JSON 'url': 'http://ok.ru/video/20079905452', - 'md5': '0b62089b479e06681abaaca9d204f152', + 'md5': '5d2b64756e2af296e3b383a0bc02a6aa', 'info_dict': { 'id': '20079905452', 'ext': 'mp4', 'title': 'Культура меняет нас (прекрасный ролик!))', + 'thumbnail': str, 'duration': 100, 'upload_date': '20141207', 'uploader_id': '330537914540', @@ -84,11 +83,12 @@ class OdnoklassnikiIE(InfoExtractor): }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', - 'md5': '6ff470ea2dd51d5d18c295a355b0b6bc', + 'md5': 'f8c951122516af72e6e6ffdd3c41103b', 'info_dict': { 'id': '63567059965189-0', 'ext': 'mp4', 'title': 'Девушка без комплексов ...', + 'thumbnail': str, 'duration': 191, 'upload_date': '20150518', 'uploader_id': '534380003155', @@ -99,18 +99,32 @@ class OdnoklassnikiIE(InfoExtractor): }, }, { # YouTube embed (metadataUrl, provider == USER_YOUTUBE) - 'url': 'http://ok.ru/video/64211978996595-1', - 'md5': '2f206894ffb5dbfcce2c5a14b909eea5', + 'url': 'https://ok.ru/video/3952212382174', + 'md5': '91749d0bd20763a28d083fa335bbd37a', 'info_dict': { - 'id': 'V_VztHT5BzY', + 'id': '5axVgHHDBvU', 'ext': 'mp4', - 'title': 'Космическая среда от 26 августа 2015', - 'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0', - 'duration': 440, - 'upload_date': '20150826', - 'uploader_id': 'tvroscosmos', - 'uploader': 'Телестудия Роскосмоса', + 'title': 'Youtube-dl 101: What is it and HOW to use it! 
Full Download Walkthrough and Guide', + 'description': 'md5:b57209eeb9d5c2f20c984dfb58862097', + 'uploader': 'Lod Mer', + 'uploader_id': '575186401502', + 'duration': 1529, 'age_limit': 0, + 'upload_date': '20210405', + 'comment_count': int, + 'live_status': 'not_live', + 'view_count': int, + 'thumbnail': 'https://i.mycdn.me/i?r=AEHujHvw2RjEbemUCNEorZbxYpb_p_9AcN2FmGik64Krkcmz37YtlY093oAM5-HIEAt7Zi9s0CiBOSDmbngC-I-k&fn=external_8', + 'uploader_url': 'http://www.youtube.com/user/MrKewlkid94', + 'channel_follower_count': int, + 'tags': ['youtube-dl', 'youtube playlists', 'download videos', 'download audio'], + 'channel_id': 'UCVGtvURtEURYHtJFUegdSug', + 'like_count': int, + 'availability': 'public', + 'channel_url': 'https://www.youtube.com/channel/UCVGtvURtEURYHtJFUegdSug', + 'categories': ['Education'], + 'playable_in_embed': True, + 'channel': 'BornToReact', }, }, { # YouTube embed (metadata, provider == USER_YOUTUBE, no metadata.movie.title field) @@ -130,10 +144,12 @@ class OdnoklassnikiIE(InfoExtractor): }, 'skip': 'Video has not been found', }, { + # TODO: HTTP Error 400: Bad Request, it only works if there's no cookies when downloading 'note': 'Only available in mobile webpage', 'url': 'https://m.ok.ru/video/2361249957145', 'info_dict': { 'id': '2361249957145', + 'ext': 'mp4', 'title': 'Быковское крещение', 'duration': 3038.181, }, @@ -162,14 +178,36 @@ class OdnoklassnikiIE(InfoExtractor): # Paid video 'url': 'https://ok.ru/video/954886983203', 'only_matching': True, + }, { + 'url': 'https://ok.ru/videoembed/2932705602075', + 'info_dict': { + 'id': '2932705602075', + 'ext': 'mp4', + 'thumbnail': 'https://i.mycdn.me/videoPreview?id=1369902483995&type=37&idx=2&tkn=fqlnoQD_xwq5ovIlKfgNyU08qmM&fn=external_8', + 'title': 'Boosty для тебя!', + 'uploader_id': '597811038747', + 'like_count': 0, + 'duration': 35, + }, + }] + + _WEBPAGE_TESTS = [{ + 'url': 'https://boosty.to/ikakprosto/posts/56cedaca-b56a-4dfd-b3ed-98c79cfa0167', + 'info_dict': { + 'id': '3950343629563', + 'ext': 'mp4', + 'thumbnail': 'https://i.mycdn.me/videoPreview?id=2776238394107&type=37&idx=11&tkn=F3ejkUFcpuI4DnMRxrDGcH5YcmM&fn=external_8', + 'title': 'Заяц Бусти.mp4', + 'uploader_id': '571368965883', + 'like_count': 0, + 'duration': 10444, + }, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage) - if mobj: - return mobj.group('url') + @classmethod + def _extract_embed_urls(cls, url, webpage): + for x in super()._extract_embed_urls(url, webpage): + yield smuggle_url(x, {'referrer': url}) def _real_extract(self, url): try: @@ -185,16 +223,23 @@ class OdnoklassnikiIE(InfoExtractor): start_time = int_or_none(compat_parse_qs( compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0]) - video_id = self._match_id(url) + url, smuggled = unsmuggle_url(url, {}) + video_id, is_embed = self._match_valid_url(url).group('id', 'embed') + mode = 'videoembed' if is_embed else 'video' webpage = self._download_webpage( - 'http://ok.ru/video/%s' % video_id, video_id, - note='Downloading desktop webpage') + f'https://ok.ru/{mode}/{video_id}', video_id, + note='Downloading desktop webpage', + headers={'Referer': smuggled['referrer']} if smuggled.get('referrer') else {}) error = self._search_regex( r'[^>]+class="vp_video_stub_txt"[^>]*>([^<]+)<', webpage, 'error', default=None) - if error: + # Direct link from boosty + if (error == 'The author of this video has not been found or is blocked' + and not 
smuggled.get('referrer') and mode == 'videoembed'): + return self._extract_desktop(smuggle_url(url, {'referrer': 'https://boosty.to'})) + elif error: raise ExtractorError(error, expected=True) player = self._parse_json( @@ -281,7 +326,7 @@ class OdnoklassnikiIE(InfoExtractor): if provider == 'LIVE_TV_APP': info['title'] = title - quality = qualities(('4', '0', '1', '2', '3', '5')) + quality = qualities(('4', '0', '1', '2', '3', '5', '6', '7')) formats = [{ 'url': f['url'], @@ -325,8 +370,6 @@ class OdnoklassnikiIE(InfoExtractor): if payment_info: self.raise_no_formats('This video is paid, subscribe to download it', expected=True) - self._sort_formats(formats) - info['formats'] = formats return info diff --git a/hypervideo_dl/extractor/oftv.py b/hypervideo_dl/extractor/oftv.py new file mode 100644 index 0000000..3ae7278 --- /dev/null +++ b/hypervideo_dl/extractor/oftv.py @@ -0,0 +1,54 @@ +from .common import InfoExtractor +from .zype import ZypeIE +from ..utils import traverse_obj + + +class OfTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?of.tv/video/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://of.tv/video/627d7d95b353db0001dadd1a', + 'md5': 'cb9cd5db3bb9ee0d32bfd7e373d6ef0a', + 'info_dict': { + 'id': '627d7d95b353db0001dadd1a', + 'ext': 'mp4', + 'title': 'E1: Jacky vs Eric', + 'thumbnail': r're:^https?://.*\.jpg', + 'average_rating': 0, + 'description': 'md5:dd16e3e2a8d27d922e7a989f85986853', + 'display_id': '', + 'duration': 1423, + 'timestamp': 1652391300, + 'upload_date': '20220512', + 'view_count': 0, + 'creator': 'This is Fire' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info = next(ZypeIE.extract_from_webpage(self._downloader, url, webpage)) + info['_type'] = 'url_transparent' + info['creator'] = self._search_regex(r'<a[^>]+class=\"creator-name\"[^>]+>([^<]+)', webpage, 'creator') + return info + + +class OfTVPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?of.tv/creators/(?P<id>[a-zA-Z0-9-]+)/.?' 
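# Aside, a sketch rather than lines from this commit: OfTVIE above returns a
# 'url_transparent' result, so any field it sets (here 'creator') overlays
# whatever the delegated Zype extraction fills in. The extractor name, domain
# and regex below are illustrative assumptions only:
from .common import InfoExtractor


class ExampleCreatorIE(InfoExtractor):
    _VALID_URL = r'https?://example\.com/watch/(?P<id>\w+)'

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        return {
            '_type': 'url_transparent',  # delegate to the embedded player URL
            'url': self._og_search_video_url(webpage),
            'id': video_id,
            # kept even if the delegated extractor reports its own metadata
            'creator': self._html_search_regex(
                r'class="creator-name"[^>]*>([^<]+)', webpage, 'creator', default=None),
        }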
+ _TESTS = [{ + 'url': 'https://of.tv/creators/this-is-fire/', + 'playlist_count': 8, + 'info_dict': { + 'id': 'this-is-fire' + } + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + + json_match = self._search_json( + r'var\s*remaining_videos\s*=', webpage, 'oftv playlists', playlist_id, contains_pattern=r'\[.+\]') + + return self.playlist_from_matches( + traverse_obj(json_match, (..., 'discovery_url')), playlist_id) diff --git a/hypervideo_dl/extractor/oktoberfesttv.py b/hypervideo_dl/extractor/oktoberfesttv.py index 2765674..e0ac856 100644 --- a/hypervideo_dl/extractor/oktoberfesttv.py +++ b/hypervideo_dl/extractor/oktoberfesttv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/olympics.py b/hypervideo_dl/extractor/olympics.py index 784f282..61d1f40 100644 --- a/hypervideo_dl/extractor/olympics.py +++ b/hypervideo_dl/extractor/olympics.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -56,8 +53,7 @@ class OlympicsReplayIE(InfoExtractor): }) m3u8_url = self._download_json( f'https://olympics.com/tokenGenerator?url={m3u8_url}', uuid, note='Downloading m3u8 url') - formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, m3u8_id='hls') - self._sort_formats(formats) + formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, uuid, 'mp4', m3u8_id='hls') return { 'id': uuid, diff --git a/hypervideo_dl/extractor/on24.py b/hypervideo_dl/extractor/on24.py index d4d8244..9a4abc9 100644 --- a/hypervideo_dl/extractor/on24.py +++ b/hypervideo_dl/extractor/on24.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -79,7 +76,6 @@ class On24IE(InfoExtractor): 'vcodec': 'none', 'acodec': 'wav' }) - self._sort_formats(formats) return { 'id': event_id, diff --git a/hypervideo_dl/extractor/once.py b/hypervideo_dl/extractor/once.py index 3e44b78..989f10a 100644 --- a/hypervideo_dl/extractor/once.py +++ b/hypervideo_dl/extractor/once.py @@ -1,12 +1,9 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -class OnceIE(InfoExtractor): +class OnceIE(InfoExtractor): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor _VALID_URL = r'https?://.+?\.unicornmedia\.com/now/(?:ads/vmap/)?[^/]+/[^/]+/(?P<domain_id>[^/]+)/(?P<application_id>[^/]+)/(?:[^/]+/)?(?P<media_item_id>[^/]+)/content\.(?:once|m3u8|mp4)' ADAPTIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/master/playlist/%s/%s/%s/content.m3u8' PROGRESSIVE_URL_TEMPLATE = 'http://once.unicornmedia.com/now/media/progressive/%s/%s/%s/%s/content.mp4' diff --git a/hypervideo_dl/extractor/ondemandkorea.py b/hypervideo_dl/extractor/ondemandkorea.py index e933ea2..dd7d1d7 100644 --- a/hypervideo_dl/extractor/ondemandkorea.py +++ b/hypervideo_dl/extractor/ondemandkorea.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -14,11 +11,11 @@ class OnDemandKoreaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?ondemandkorea\.com/(?P<id>[^/]+)\.html' _GEO_COUNTRIES = ['US', 'CA'] _TESTS = [{ - 'url': 'https://www.ondemandkorea.com/ask-us-anything-e43.html', + 'url': 'https://www.ondemandkorea.com/ask-us-anything-e351.html', 
'info_dict': { - 'id': 'ask-us-anything-e43', + 'id': 'ask-us-anything-e351', 'ext': 'mp4', - 'title': 'Ask Us Anything : Gain, Ji Soo - 09/24/2016', + 'title': 'Ask Us Anything : Jung Sung-ho, Park Seul-gi, Kim Bo-min, Yang Seung-won - 09/24/2022', 'description': 'A talk show/game show with a school theme where celebrity guests appear as “transfer students.”', 'thumbnail': r're:^https?://.*\.jpg$', }, @@ -26,13 +23,13 @@ class OnDemandKoreaIE(InfoExtractor): 'skip_download': 'm3u8 download' } }, { - 'url': 'https://www.ondemandkorea.com/confession-e01-1.html', + 'url': 'https://www.ondemandkorea.com/work-later-drink-now-e1.html', 'info_dict': { - 'id': 'confession-e01-1', + 'id': 'work-later-drink-now-e1', 'ext': 'mp4', - 'title': 'Confession : E01', - 'description': 'Choi Do-hyun, a criminal attorney, is the son of a death row convict. Ever since Choi Pil-su got arrested for murder, Do-hyun has wanted to solve his ', - 'thumbnail': r're:^https?://.*\.jpg$', + 'title': 'Work Later, Drink Now : E01', + 'description': 'Work Later, Drink First follows three women who find solace in a glass of liquor at the end of the day. So-hee, who gets comfort from a cup of soju af', + 'thumbnail': r're:^https?://.*\.png$', 'subtitles': { 'English': 'mincount:1', }, @@ -72,9 +69,11 @@ class OnDemandKoreaIE(InfoExtractor): webpage, 'episode_title', fatal=False) or self._og_search_title(webpage) jw_config = self._parse_json( - self._search_regex( + self._search_regex(( + r'(?P<options>{\s*[\'"]tracks[\'"].*?})[)\];]+$', r'playlist\s*=\s*\[(?P<options>.+)];?$', - webpage, 'jw config', flags=re.MULTILINE, group='options'), + r'odkPlayer\.init.*?(?P<options>{[^;]+}).*?;', + ), webpage, 'jw config', flags=re.MULTILINE | re.DOTALL, group='options'), video_id, transform_source=js_to_json) info = self._parse_jwplayer_data( jw_config, video_id, require_title=False, m3u8_id='hls', diff --git a/hypervideo_dl/extractor/onefootball.py b/hypervideo_dl/extractor/onefootball.py index 826faad..591d157 100644 --- a/hypervideo_dl/extractor/onefootball.py +++ b/hypervideo_dl/extractor/onefootball.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor @@ -39,7 +36,6 @@ class OneFootballIE(InfoExtractor): data_json = self._search_json_ld(webpage, id) m3u8_url = self._html_search_regex(r'(https://cdn\.jwplayer\.com/manifests/.+\.m3u8)', webpage, 'm3u8_url') formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, id) - self._sort_formats(formats) return { 'id': id, 'title': data_json.get('title'), diff --git a/hypervideo_dl/extractor/onenewsnz.py b/hypervideo_dl/extractor/onenewsnz.py new file mode 100644 index 0000000..a46211e --- /dev/null +++ b/hypervideo_dl/extractor/onenewsnz.py @@ -0,0 +1,111 @@ +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + traverse_obj +) + + +class OneNewsNZIE(InfoExtractor): + IE_NAME = '1News' + IE_DESC = '1news.co.nz article videos' + _VALID_URL = r'https?://(?:www\.)?(?:1|one)news\.co\.nz/\d+/\d+/\d+/(?P<id>[^/?#&]+)' + _TESTS = [ + { # Brightcove video + 'url': 'https://www.1news.co.nz/2022/09/29/cows-painted-green-on-parliament-lawn-in-climate-protest/', + 'info_dict': { + 'id': 'cows-painted-green-on-parliament-lawn-in-climate-protest', + 'title': '\'Cows\' painted green on Parliament lawn in climate protest', + }, + 'playlist': [{ + 'info_dict': { + 'id': '6312993358112', + 'title': 'Activists dressed as cows painted green outside Parliament in 
climate protest', + 'ext': 'mp4', + 'tags': 'count:6', + 'uploader_id': '963482464001', + 'timestamp': 1664416255, + 'upload_date': '20220929', + 'duration': 38.272, + 'thumbnail': r're:^https?://.*\.jpg$', + 'description': 'Greenpeace accused the Government of "greenwashing" instead of taking climate action.', + } + }] + }, { + # YouTube video + 'url': 'https://www.1news.co.nz/2022/09/30/now-is-the-time-to-care-about-womens-rugby/', + 'info_dict': { + 'id': 'now-is-the-time-to-care-about-womens-rugby', + 'title': 'Now is the time to care about women\'s rugby', + }, + 'playlist': [{ + 'info_dict': { + 'id': 's4wEB9neTfU', + 'title': 'Why I love women’s rugby: Black Fern Ruahei Demant', + 'ext': 'mp4', + 'channel_follower_count': int, + 'channel_url': 'https://www.youtube.com/channel/UC2BQ3U9IxoYIJyulv0bN5PQ', + 'tags': 'count:12', + 'uploader': 'Re: News', + 'upload_date': '20211215', + 'uploader_id': 'UC2BQ3U9IxoYIJyulv0bN5PQ', + 'uploader_url': 'http://www.youtube.com/channel/UC2BQ3U9IxoYIJyulv0bN5PQ', + 'channel_id': 'UC2BQ3U9IxoYIJyulv0bN5PQ', + 'channel': 'Re: News', + 'like_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/s4wEB9neTfU/maxresdefault.jpg', + 'age_limit': 0, + 'view_count': int, + 'categories': ['Sports'], + 'duration': 222, + 'description': 'md5:8874410e5740ed1d8fd0df839f849813', + 'availability': 'public', + 'playable_in_embed': True, + 'live_status': 'not_live', + } + }] + }, { + # 2 Brightcove videos + 'url': 'https://www.1news.co.nz/2022/09/29/raw-videos-capture-hurricane-ians-fury-as-it-slams-florida/', + 'info_dict': { + 'id': 'raw-videos-capture-hurricane-ians-fury-as-it-slams-florida', + 'title': 'Raw videos capture Hurricane Ian\'s fury as it slams Florida', + }, + 'playlist_mincount': 2, + }, { + 'url': 'https://www.onenews.co.nz/2022/09/29/cows-painted-green-on-parliament-lawn-in-climate-protest/', + 'only_matching': True, + }] + + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/0xpHIR6IB_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + fusion_metadata = self._search_json(r'Fusion\.globalContent\s*=', webpage, 'fusion metadata', display_id) + + entries = [] + for item in traverse_obj(fusion_metadata, 'content_elements') or []: + item_type = traverse_obj(item, 'subtype') + if item_type == 'video': + brightcove_config = traverse_obj(item, ('embed', 'config')) + brightcove_url = self.BRIGHTCOVE_URL_TEMPLATE % ( + traverse_obj(brightcove_config, 'brightcoveAccount') or '963482464001', + traverse_obj(brightcove_config, 'brightcoveVideoId') + ) + entries.append(self.url_result(brightcove_url, BrightcoveNewIE)) + elif item_type == 'youtube': + video_id_or_url = traverse_obj(item, ('referent', 'id'), ('raw_oembed', '_id')) + if video_id_or_url: + entries.append(self.url_result(video_id_or_url, ie='Youtube')) + + if not entries: + raise ExtractorError('This article does not have a video.', expected=True) + + playlist_title = ( + traverse_obj(fusion_metadata, ('headlines', 'basic')) + or self._generic_title('', webpage) + ) + return self.playlist_result(entries, display_id, playlist_title) diff --git a/hypervideo_dl/extractor/onet.py b/hypervideo_dl/extractor/onet.py index 95177a2..0d59e8c 100644 --- a/hypervideo_dl/extractor/onet.py +++ b/hypervideo_dl/extractor/onet.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -83,7 +80,6 @@ class OnetBaseIE(InfoExtractor): 'vbr': 
float_or_none(f.get('video_bitrate')), }) formats.append(http_f) - self._sort_formats(formats) meta = video.get('meta', {}) diff --git a/hypervideo_dl/extractor/onionstudios.py b/hypervideo_dl/extractor/onionstudios.py index cf5c39e..5fa49e1 100644 --- a/hypervideo_dl/extractor/onionstudios.py +++ b/hypervideo_dl/extractor/onionstudios.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import js_to_json @@ -10,6 +5,7 @@ from ..utils import js_to_json class OnionStudiosIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?onionstudios\.com/(?:video(?:s/[^/]+-|/)|embed\?.*\bid=)(?P<id>\d+)(?!-)' + _EMBED_REGEX = [r'(?s)<(?:iframe|bulbs-video)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/(?:embed.+?|video/\d+\.json))\1'] _TESTS = [{ 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', @@ -32,13 +28,6 @@ class OnionStudiosIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'(?s)<(?:iframe|bulbs-video)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?onionstudios\.com/(?:embed.+?|video/\d+\.json))\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/ooyala.py b/hypervideo_dl/extractor/ooyala.py index 20cfa0a..65afccd 100644 --- a/hypervideo_dl/extractor/ooyala.py +++ b/hypervideo_dl/extractor/ooyala.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import base64 import re @@ -12,6 +10,7 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + smuggle_url, try_get, unsmuggle_url, ) @@ -86,7 +85,6 @@ class OoyalaBaseIE(InfoExtractor): if not formats and not auth_data.get('authorized'): self.raise_no_formats('%s said: %s' % ( self.IE_NAME, auth_data['message']), expected=True) - self._sort_formats(formats) subtitles = {} for lang, sub in metadata.get('closed_captions_vtt', {}).get('captions', {}).items(): @@ -153,6 +151,29 @@ class OoyalaIE(OoyalaBaseIE): } ] + def _extract_from_webpage(self, url, webpage): + mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) + or re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) + or re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) + or re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) + or re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage)) + if mobj is not None: + embed_token = self._search_regex( + r'embedToken[\'"]?\s*:\s*[\'"]([^\'"]+)', + webpage, 'ooyala embed token', default=None) + yield self._build_url_result(smuggle_url( + mobj.group('ec'), { + 'domain': url, + 'embed_token': embed_token, + })) + return + + # Look for multiple Ooyala embeds on SBN network websites + mobj = re.search(r'SBN\.VideoLinkset\.entryGroup\((\[.*?\])', webpage) + if mobj is not None: + for v in self._parse_json(mobj.group(1), self._generic_id(url), fatal=False) or []: + yield self._build_url_result(smuggle_url(v['provider_video_id'], {'domain': url})) + @staticmethod def _url_for_embed_code(embed_code): return 'http://player.ooyala.com/player.js?embedCode=%s' % embed_code diff --git a/hypervideo_dl/extractor/opencast.py b/hypervideo_dl/extractor/opencast.py index cf8d917..fa46757 100644 --- 
a/hypervideo_dl/extractor/opencast.py +++ b/hypervideo_dl/extractor/opencast.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -95,8 +92,6 @@ class OpencastBaseIE(InfoExtractor): }) formats.append(track_obj) - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, diff --git a/hypervideo_dl/extractor/openload.py b/hypervideo_dl/extractor/openload.py index fe4740a..56b8330 100644 --- a/hypervideo_dl/extractor/openload.py +++ b/hypervideo_dl/extractor/openload.py @@ -1,22 +1,19 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import collections +import contextlib import json import os import subprocess import tempfile -from ..compat import ( - compat_urlparse, - compat_kwargs, -) +from ..compat import compat_urlparse from ..utils import ( - check_executable, - encodeArgument, ExtractorError, + Popen, + check_executable, + format_field, get_exe_version, is_outdated_version, - Popen, + shell_quote, ) @@ -37,13 +34,11 @@ def cookie_to_dict(cookie): cookie_dict['secure'] = cookie.secure if cookie.discard is not None: cookie_dict['discard'] = cookie.discard - try: + with contextlib.suppress(TypeError): if (cookie.has_nonstandard_attr('httpOnly') or cookie.has_nonstandard_attr('httponly') or cookie.has_nonstandard_attr('HttpOnly')): cookie_dict['httponly'] = True - except TypeError: - pass return cookie_dict @@ -51,13 +46,15 @@ def cookie_jar_to_list(cookie_jar): return [cookie_to_dict(cookie) for cookie in cookie_jar] -class PhantomJSwrapper(object): +class PhantomJSwrapper: """PhantomJS wrapper class This class is experimental. """ - _TEMPLATE = r''' + INSTALL_HINT = 'Please download it from https://phantomjs.org/download.html' + + _BASE_JS = R''' phantom.onError = function(msg, trace) {{ var msgStack = ['PHANTOM ERROR: ' + msg]; if(trace && trace.length) {{ @@ -70,6 +67,9 @@ class PhantomJSwrapper(object): console.error(msgStack.join('\n')); phantom.exit(1); }}; + ''' + + _TEMPLATE = R''' var page = require('webpage').create(); var fs = require('fs'); var read = {{ mode: 'r', charset: 'utf-8' }}; @@ -112,9 +112,7 @@ class PhantomJSwrapper(object): self.exe = check_executable('phantomjs', ['-v']) if not self.exe: - raise ExtractorError('PhantomJS executable not found in PATH, ' - 'download it from http://phantomjs.org', - expected=True) + raise ExtractorError(f'PhantomJS not found, {self.INSTALL_HINT}', expected=True) self.extractor = extractor @@ -125,23 +123,25 @@ class PhantomJSwrapper(object): 'Your copy of PhantomJS is outdated, update it to version ' '%s or newer if you encounter any errors.' 
% required_version) - self.options = { - 'timeout': timeout, - } for name in self._TMP_FILE_NAMES: tmp = tempfile.NamedTemporaryFile(delete=False) tmp.close() self._TMP_FILES[name] = tmp + self.options = collections.ChainMap({ + 'timeout': timeout, + }, { + x: self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') + for x in self._TMP_FILE_NAMES + }) + def __del__(self): for name in self._TMP_FILE_NAMES: - try: + with contextlib.suppress(OSError, KeyError): os.remove(self._TMP_FILES[name].name) - except (IOError, OSError, KeyError): - pass def _save_cookies(self, url): - cookies = cookie_jar_to_list(self.extractor._downloader.cookiejar) + cookies = cookie_jar_to_list(self.extractor.cookiejar) for cookie in cookies: if 'path' not in cookie: cookie['path'] = '/' @@ -158,7 +158,7 @@ class PhantomJSwrapper(object): cookie['rest'] = {'httpOnly': None} if 'expiry' in cookie: cookie['expire_time'] = cookie['expiry'] - self.extractor._set_cookie(**compat_kwargs(cookie)) + self.extractor._set_cookie(**cookie) def get(self, url, html=None, video_id=None, note=None, note2='Executing JS on webpage', headers={}, jscode='saveAndExit();'): """ @@ -180,7 +180,7 @@ class PhantomJSwrapper(object): In most cases you don't need to add any `jscode`. It is executed in `page.onLoadFinished`. `saveAndExit();` is mandatory, use it instead of `phantom.exit()` - It is possible to wait for some element on the webpage, for example: + It is possible to wait for some element on the webpage, e.g. var check = function() { var elementFound = page.evaluate(function() { return document.querySelector('#b.done') !== null; @@ -205,33 +205,39 @@ class PhantomJSwrapper(object): self._save_cookies(url) - replaces = self.options - replaces['url'] = url user_agent = headers.get('User-Agent') or self.extractor.get_param('http_headers')['User-Agent'] - replaces['ua'] = user_agent.replace('"', '\\"') - replaces['jscode'] = jscode - - for x in self._TMP_FILE_NAMES: - replaces[x] = self._TMP_FILES[x].name.replace('\\', '\\\\').replace('"', '\\"') - - with open(self._TMP_FILES['script'].name, 'wb') as f: - f.write(self._TEMPLATE.format(**replaces).encode('utf-8')) - - if video_id is None: - self.extractor.to_screen('%s' % (note2,)) - else: - self.extractor.to_screen('%s: %s' % (video_id, note2)) - - p = Popen( - [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name], - stdout=subprocess.PIPE, stderr=subprocess.PIPE) - out, err = p.communicate_or_kill() - if p.returncode != 0: - raise ExtractorError( - 'Executing JS failed\n:' + encodeArgument(err)) + jscode = self._TEMPLATE.format_map(self.options.new_child({ + 'url': url, + 'ua': user_agent.replace('"', '\\"'), + 'jscode': jscode, + })) + + stdout = self.execute(jscode, video_id, note=note2) + with open(self._TMP_FILES['html'].name, 'rb') as f: html = f.read().decode('utf-8') - self._load_cookies() - return (html, encodeArgument(out)) + return html, stdout + + def execute(self, jscode, video_id=None, *, note='Executing JS'): + """Execute JS and return stdout""" + if 'phantom.exit();' not in jscode: + jscode += ';\nphantom.exit();' + jscode = self._BASE_JS + jscode + + with open(self._TMP_FILES['script'].name, 'w', encoding='utf-8') as f: + f.write(jscode) + self.extractor.to_screen(f'{format_field(video_id, None, "%s: ")}{note}') + + cmd = [self.exe, '--ssl-protocol=any', self._TMP_FILES['script'].name] + self.extractor.write_debug(f'PhantomJS command line: {shell_quote(cmd)}') + try: + stdout, stderr, returncode = Popen.run(cmd, timeout=self.options['timeout'] / 
1000, + text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + except Exception as e: + raise ExtractorError(f'{note} failed: Unable to run PhantomJS binary', cause=e) + if returncode: + raise ExtractorError(f'{note} failed with returncode {returncode}:\n{stderr.strip()}') + + return stdout diff --git a/hypervideo_dl/extractor/openrec.py b/hypervideo_dl/extractor/openrec.py index 5eb1cdb..86dc9bb 100644 --- a/hypervideo_dl/extractor/openrec.py +++ b/hypervideo_dl/extractor/openrec.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -38,8 +35,8 @@ class OpenRecBaseIE(InfoExtractor): raise ExtractorError(f'Failed to extract {name} info') formats = list(self._expand_media(video_id, get_first(movie_stores, 'media'))) - if not formats and is_live: - # archived livestreams + if not formats: + # archived livestreams or subscriber-only videos cookies = self._get_cookies('https://www.openrec.tv/') detail = self._download_json( f'https://apiv5.openrec.tv/api/v5/movies/{video_id}/detail', video_id, @@ -53,8 +50,6 @@ class OpenRecBaseIE(InfoExtractor): formats = list(self._expand_media(video_id, new_media)) is_live = False - self._sort_formats(formats) - return { 'id': video_id, 'title': get_first(movie_stores, 'title'), @@ -116,7 +111,6 @@ class OpenRecCaptureIE(OpenRecBaseIE): formats = self._extract_m3u8_formats( capture_data.get('source'), video_id, ext='mp4') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/ora.py b/hypervideo_dl/extractor/ora.py index 422d0b3..d49909d 100644 --- a/hypervideo_dl/extractor/ora.py +++ b/hypervideo_dl/extractor/ora.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor from ..compat import compat_urlparse @@ -57,7 +54,6 @@ class OraTVIE(InfoExtractor): 'format_id': q, 'quality': preference(q), }) - self._sort_formats(formats) else: return self.url_result(self._search_regex( r'"youtube_id"\s*:\s*"([^"]+)', webpage, 'youtube id'), 'Youtube') diff --git a/hypervideo_dl/extractor/orf.py b/hypervideo_dl/extractor/orf.py index 0628977..e9d23a4 100644 --- a/hypervideo_dl/extractor/orf.py +++ b/hypervideo_dl/extractor/orf.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import re @@ -15,9 +12,10 @@ from ..utils import ( join_nonempty, orderedSet, remove_end, + make_archive_id, smuggle_url, - str_or_none, strip_jsonp, + try_call, unescapeHTML, unified_strdate, unsmuggle_url, @@ -136,8 +134,6 @@ class ORFTVthekIE(InfoExtractor): HEADRequest(http_url), video_id, fatal=False, note='Testing for geoblocking', errnote=f'This video seems to be blocked outside of {geo_str}. 
You may want to try the streaming-* formats') - self._sort_formats(formats) - subtitles = {} for sub in sd.get('subtitles', []): sub_src = sub.get('src') @@ -203,208 +199,99 @@ class ORFTVthekIE(InfoExtractor): class ORFRadioIE(InfoExtractor): - def _real_extract(self, url): - mobj = self._match_valid_url(url) - show_date = mobj.group('date') - show_id = mobj.group('show') + IE_NAME = 'orf:radio' + + STATION_INFO = { + 'fm4': ('fm4', 'fm4', 'orffm4'), + 'noe': ('noe', 'oe2n', 'orfnoe'), + 'wien': ('wie', 'oe2w', 'orfwie'), + 'burgenland': ('bgl', 'oe2b', 'orfbgl'), + 'ooe': ('ooe', 'oe2o', 'orfooe'), + 'steiermark': ('stm', 'oe2st', 'orfstm'), + 'kaernten': ('ktn', 'oe2k', 'orfktn'), + 'salzburg': ('sbg', 'oe2s', 'orfsbg'), + 'tirol': ('tir', 'oe2t', 'orftir'), + 'vorarlberg': ('vbg', 'oe2v', 'orfvbg'), + 'oe3': ('oe3', 'oe3', 'orfoe3'), + 'oe1': ('oe1', 'oe1', 'orfoe1'), + } + _STATION_RE = '|'.join(map(re.escape, STATION_INFO.keys())) - data = self._download_json( - 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' - % (self._API_STATION, show_id, show_date), show_id) + _VALID_URL = rf'''(?x) + https?://(?: + (?P<station>{_STATION_RE})\.orf\.at/player| + radiothek\.orf\.at/(?P<station2>{_STATION_RE}) + )/(?P<date>[0-9]+)/(?P<show>\w+)''' - entries = [] - for info in data['streams']: - loop_stream_id = str_or_none(info.get('loopStreamId')) - if not loop_stream_id: - continue - title = str_or_none(data.get('title')) - if not title: - continue - start = int_or_none(info.get('start'), scale=1000) - end = int_or_none(info.get('end'), scale=1000) - duration = end - start if end and start else None - entries.append({ - 'id': loop_stream_id.replace('.mp3', ''), - 'url': 'https://loopstream01.apa.at/?channel=%s&id=%s' % (self._LOOP_STATION, loop_stream_id), - 'title': title, - 'description': clean_html(data.get('subtitle')), - 'duration': duration, - 'timestamp': start, + _TESTS = [{ + 'url': 'https://radiothek.orf.at/ooe/20220801/OGMO', + 'info_dict': { + 'id': 'OGMO', + 'title': 'Guten Morgen OÖ', + 'description': 'md5:a3f6083399ef92b8cbe2d421b180835a', + }, + 'playlist': [{ + 'md5': 'f33147d954a326e338ea52572c2810e8', + 'info_dict': { + 'id': '2022-08-01_0459_tl_66_7DaysMon1_319062', 'ext': 'mp3', - 'series': data.get('programTitle'), - }) - - return { - '_type': 'playlist', - 'id': show_id, - 'title': data.get('title'), - 'description': clean_html(data.get('subtitle')), - 'entries': entries, - } - - -class ORFFM4IE(ORFRadioIE): - IE_NAME = 'orf:fm4' - IE_DESC = 'radio FM4' - _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>4\w+)' - _API_STATION = 'fm4' - _LOOP_STATION = 'fm4' - - _TEST = { - 'url': 'http://fm4.orf.at/player/20170107/4CC', - 'md5': '2b0be47375432a7ef104453432a19212', + 'title': 'Guten Morgen OÖ', + 'upload_date': '20220801', + 'duration': 18000, + 'timestamp': 1659322789, + 'description': 'md5:a3f6083399ef92b8cbe2d421b180835a', + } + }] + }, { + 'url': 'https://ooe.orf.at/player/20220801/OGMO', 'info_dict': { - 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295', - 'ext': 'mp3', - 'title': 'Solid Steel Radioshow', - 'description': 'Die Mixshow von Coldcut und Ninja Tune.', - 'duration': 3599, - 'timestamp': 1483819257, - 'upload_date': '20170107', + 'id': 'OGMO', + 'title': 'Guten Morgen OÖ', + 'description': 'md5:a3f6083399ef92b8cbe2d421b180835a', }, - 'skip': 'Shows from ORF radios are only available for 7 days.', + 'playlist': [{ + 'md5': 'f33147d954a326e338ea52572c2810e8', + 'info_dict': { + 'id': 
'2022-08-01_0459_tl_66_7DaysMon1_319062', + 'ext': 'mp3', + 'title': 'Guten Morgen OÖ', + 'upload_date': '20220801', + 'duration': 18000, + 'timestamp': 1659322789, + 'description': 'md5:a3f6083399ef92b8cbe2d421b180835a', + } + }] + }, { + 'url': 'http://fm4.orf.at/player/20170107/4CC', 'only_matching': True, - } - - -class ORFNOEIE(ORFRadioIE): - IE_NAME = 'orf:noe' - IE_DESC = 'Radio Niederösterreich' - _VALID_URL = r'https?://(?P<station>noe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'noe' - _LOOP_STATION = 'oe2n' - - _TEST = { + }, { 'url': 'https://noe.orf.at/player/20200423/NGM', 'only_matching': True, - } - - -class ORFWIEIE(ORFRadioIE): - IE_NAME = 'orf:wien' - IE_DESC = 'Radio Wien' - _VALID_URL = r'https?://(?P<station>wien)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'wie' - _LOOP_STATION = 'oe2w' - - _TEST = { + }, { 'url': 'https://wien.orf.at/player/20200423/WGUM', 'only_matching': True, - } - - -class ORFBGLIE(ORFRadioIE): - IE_NAME = 'orf:burgenland' - IE_DESC = 'Radio Burgenland' - _VALID_URL = r'https?://(?P<station>burgenland)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'bgl' - _LOOP_STATION = 'oe2b' - - _TEST = { + }, { 'url': 'https://burgenland.orf.at/player/20200423/BGM', 'only_matching': True, - } - - -class ORFOOEIE(ORFRadioIE): - IE_NAME = 'orf:oberoesterreich' - IE_DESC = 'Radio Oberösterreich' - _VALID_URL = r'https?://(?P<station>ooe)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'ooe' - _LOOP_STATION = 'oe2o' - - _TEST = { - 'url': 'https://ooe.orf.at/player/20200423/OGMO', - 'only_matching': True, - } - - -class ORFSTMIE(ORFRadioIE): - IE_NAME = 'orf:steiermark' - IE_DESC = 'Radio Steiermark' - _VALID_URL = r'https?://(?P<station>steiermark)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'stm' - _LOOP_STATION = 'oe2st' - - _TEST = { + }, { 'url': 'https://steiermark.orf.at/player/20200423/STGMS', 'only_matching': True, - } - - -class ORFKTNIE(ORFRadioIE): - IE_NAME = 'orf:kaernten' - IE_DESC = 'Radio Kärnten' - _VALID_URL = r'https?://(?P<station>kaernten)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'ktn' - _LOOP_STATION = 'oe2k' - - _TEST = { + }, { 'url': 'https://kaernten.orf.at/player/20200423/KGUMO', 'only_matching': True, - } - - -class ORFSBGIE(ORFRadioIE): - IE_NAME = 'orf:salzburg' - IE_DESC = 'Radio Salzburg' - _VALID_URL = r'https?://(?P<station>salzburg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'sbg' - _LOOP_STATION = 'oe2s' - - _TEST = { + }, { 'url': 'https://salzburg.orf.at/player/20200423/SGUM', 'only_matching': True, - } - - -class ORFTIRIE(ORFRadioIE): - IE_NAME = 'orf:tirol' - IE_DESC = 'Radio Tirol' - _VALID_URL = r'https?://(?P<station>tirol)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'tir' - _LOOP_STATION = 'oe2t' - - _TEST = { + }, { 'url': 'https://tirol.orf.at/player/20200423/TGUMO', 'only_matching': True, - } - - -class ORFVBGIE(ORFRadioIE): - IE_NAME = 'orf:vorarlberg' - IE_DESC = 'Radio Vorarlberg' - _VALID_URL = r'https?://(?P<station>vorarlberg)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'vbg' - _LOOP_STATION = 'oe2v' - - _TEST = { + }, { 'url': 'https://vorarlberg.orf.at/player/20200423/VGUM', 'only_matching': True, - } - - -class ORFOE3IE(ORFRadioIE): - IE_NAME = 'orf:oe3' - IE_DESC = 'Radio Österreich 3' - _VALID_URL = r'https?://(?P<station>oe3)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'oe3' - _LOOP_STATION = 'oe3' 
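# Aside, a sketch rather than lines from this commit: the per-station ORF
# radio classes removed around this point collapse into the STATION_INFO
# table of the consolidated ORFRadioIE added above, with a single URL regex
# built from the table keys instead of one class per station. The station
# names and domains below are hypothetical:
import re

STATION_INFO = {'alpha': ('api_a', 'loop_a'), 'beta': ('api_b', 'loop_b')}
_STATION_RE = '|'.join(map(re.escape, STATION_INFO))
_VALID_URL = rf'https?://(?P<station>{_STATION_RE})\.example\.at/player/(?P<date>\d+)/(?P<show>\w+)'


def api_url(station, show_id, show_date):
    # the consolidated extractor performs the same per-request table lookup
    api_station, _ = STATION_INFO[station]
    return f'http://audioapi.example.at/{api_station}/api/json/current/broadcast/{show_id}/{show_date}'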
- - _TEST = { + }, { 'url': 'https://oe3.orf.at/player/20200424/3WEK', 'only_matching': True, - } - - -class ORFOE1IE(ORFRadioIE): - IE_NAME = 'orf:oe1' - IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' - _API_STATION = 'oe1' - _LOOP_STATION = 'oe1' - - _TEST = { + }, { 'url': 'http://oe1.orf.at/player/20170108/456544', 'md5': '34d8a6e67ea888293741c86a099b745b', 'info_dict': { @@ -416,7 +303,35 @@ class ORFOE1IE(ORFRadioIE): 'upload_date': '20170108', }, 'skip': 'Shows from ORF radios are only available for 7 days.' - } + }] + + def _entries(self, data, station): + _, loop_station, old_ie = self.STATION_INFO[station] + for info in data['streams']: + item_id = info.get('loopStreamId') + if not item_id: + continue + video_id = item_id.replace('.mp3', '') + yield { + 'id': video_id, + 'ext': 'mp3', + 'url': f'https://loopstream01.apa.at/?channel={loop_station}&id={item_id}', + '_old_archive_ids': [make_archive_id(old_ie, video_id)], + 'title': data.get('title'), + 'description': clean_html(data.get('subtitle')), + 'duration': try_call(lambda: (info['end'] - info['start']) / 1000), + 'timestamp': int_or_none(info.get('start'), scale=1000), + 'series': data.get('programTitle'), + } + + def _real_extract(self, url): + station, station2, show_date, show_id = self._match_valid_url(url).group('station', 'station2', 'date', 'show') + api_station, _, _ = self.STATION_INFO[station or station2] + data = self._download_json( + f'http://audioapi.orf.at/{api_station}/api/json/current/broadcast/{show_id}/{show_date}', show_id) + + return self.playlist_result( + self._entries(data, station or station2), show_id, data.get('title'), clean_html(data.get('subtitle'))) class ORFIPTVIE(InfoExtractor): @@ -490,7 +405,6 @@ class ORFIPTVIE(InfoExtractor): format_url, video_id, 'mp4', m3u8_id=format_id)) else: continue - self._sort_formats(formats) title = remove_end(self._og_search_title(webpage), ' - iptv.ORF.at') description = self._og_search_description(webpage) @@ -590,7 +504,6 @@ class ORFFM4StoryIE(InfoExtractor): format_url, video_id, 'mp4', m3u8_id=format_id)) else: continue - self._sort_formats(formats) title = remove_end(self._og_search_title(webpage), ' - fm4.ORF.at') if idx >= 1: diff --git a/hypervideo_dl/extractor/outsidetv.py b/hypervideo_dl/extractor/outsidetv.py index c5333b0..b1fcbd6 100644 --- a/hypervideo_dl/extractor/outsidetv.py +++ b/hypervideo_dl/extractor/outsidetv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/packtpub.py b/hypervideo_dl/extractor/packtpub.py index 62c52cd..51778d8 100644 --- a/hypervideo_dl/extractor/packtpub.py +++ b/hypervideo_dl/extractor/packtpub.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/palcomp3.py b/hypervideo_dl/extractor/palcomp3.py index d0a62fb..4b0801c 100644 --- a/hypervideo_dl/extractor/palcomp3.py +++ b/hypervideo_dl/extractor/palcomp3.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/pandoratv.py b/hypervideo_dl/extractor/pandoratv.py index 6230053..ccc78da 100644 --- a/hypervideo_dl/extractor/pandoratv.py +++ b/hypervideo_dl/extractor/pandoratv.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from 
.common import InfoExtractor from ..compat import ( compat_str, @@ -116,7 +112,6 @@ class PandoraTVIE(InfoExtractor): 'url': format_url, 'height': int(height), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/panopto.py b/hypervideo_dl/extractor/panopto.py index 3388f7f..32c103b 100644 --- a/hypervideo_dl/extractor/panopto.py +++ b/hypervideo_dl/extractor/panopto.py @@ -1,4 +1,3 @@ -import re import calendar import json import functools @@ -73,15 +72,10 @@ class PanoptoBaseIE(InfoExtractor): def _parse_fragment(url): return {k: json.loads(v[0]) for k, v in compat_urlparse.parse_qs(compat_urllib_parse_urlparse(url).fragment).items()} - @staticmethod - def _extract_urls(webpage): - return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=["\'](?P<url>%s/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)' % PanoptoIE.BASE_URL_RE, - webpage)] - class PanoptoIE(PanoptoBaseIE): _VALID_URL = PanoptoBaseIE.BASE_URL_RE + r'/Pages/(Viewer|Embed)\.aspx.*(?:\?|&)id=(?P<id>[a-f0-9-]+)' + _EMBED_REGEX = [rf'<iframe[^>]+src=["\'](?P<url>{PanoptoBaseIE.BASE_URL_RE}/Pages/(Viewer|Embed|Sessions/List)\.aspx[^"\']+)'] _TESTS = [ { 'url': 'https://demo.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=26b3ae9e-4a48-4dcc-96ba-0befba08a0fb', @@ -413,7 +407,6 @@ class PanoptoIE(PanoptoBaseIE): subtitles = self._merge_subtitles( podcast_subtitles, streams_subtitles, self.extract_subtitles(base_url, video_id, delivery)) - self._sort_formats(formats) self.mark_watched(base_url, video_id, delivery_info) return { diff --git a/hypervideo_dl/extractor/paramountplus.py b/hypervideo_dl/extractor/paramountplus.py index 94a9319..7e472a6 100644 --- a/hypervideo_dl/extractor/paramountplus.py +++ b/hypervideo_dl/extractor/paramountplus.py @@ -1,9 +1,9 @@ -from __future__ import unicode_literals import itertools from .common import InfoExtractor from .cbs import CBSBaseIE from ..utils import ( + ExtractorError, int_or_none, url_or_none, ) @@ -25,10 +25,17 @@ class ParamountPlusIE(CBSBaseIE): 'ext': 'mp4', 'title': 'CatDog - Climb Every CatDog/The Canine Mutiny', 'description': 'md5:7ac835000645a69933df226940e3c859', - 'duration': 1418, + 'duration': 1426, 'timestamp': 920264400, 'upload_date': '19990301', 'uploader': 'CBSI-NEW', + 'episode_number': 5, + 'thumbnail': r're:https?://.+\.jpg$', + 'season': 'Season 2', + 'chapters': 'count:3', + 'episode': 'Episode 5', + 'season_number': 2, + 'series': 'CatDog', }, 'params': { 'skip_download': 'm3u8', @@ -44,6 +51,13 @@ class ParamountPlusIE(CBSBaseIE): 'timestamp': 1627063200, 'upload_date': '20210723', 'uploader': 'CBSI-NEW', + 'episode_number': 81, + 'thumbnail': r're:https?://.+\.jpg$', + 'season': 'Season 2', + 'chapters': 'count:4', + 'episode': 'Episode 81', + 'season_number': 2, + 'series': 'Tooning Out The News', }, 'params': { 'skip_download': 'm3u8', @@ -55,14 +69,18 @@ class ParamountPlusIE(CBSBaseIE): 'ext': 'mp4', 'title': 'Daddy\'s Home', 'upload_date': '20151225', - 'description': 'md5:a0beaf24e8d3b0e81b2ee41d47c06f33', + 'description': 'md5:9a6300c504d5e12000e8707f20c54745', 'uploader': 'CBSI-NEW', 'timestamp': 1451030400, + 'thumbnail': r're:https?://.+\.jpg$', + 'chapters': 'count:0', + 'duration': 5761, + 'series': 'Paramount+ Movies', }, 'params': { 'skip_download': 'm3u8', }, - 'expected_warnings': ['Ignoring subtitle tracks'], # TODO: Investigate this + 'skip': 'DRM', }, { 'url': 'https://www.paramountplus.com/movies/video/5EKDXPOzdVf9voUqW6oRuocyAEeJGbEc/', 'info_dict': { @@ -73,11 +91,15 @@ class 
ParamountPlusIE(CBSBaseIE): 'timestamp': 1577865600, 'title': 'Sonic the Hedgehog', 'upload_date': '20200101', + 'thumbnail': r're:https?://.+\.jpg$', + 'chapters': 'count:0', + 'duration': 5932, + 'series': 'Paramount+ Movies', }, 'params': { 'skip_download': 'm3u8', }, - 'expected_warnings': ['Ignoring subtitle tracks'], + 'skip': 'DRM', }, { 'url': 'https://www.paramountplus.com/shows/the-real-world/video/mOVeHeL9ub9yWdyzSZFYz8Uj4ZBkVzQg/the-real-world-reunion/', 'only_matching': True, @@ -94,24 +116,51 @@ class ParamountPlusIE(CBSBaseIE): def _extract_video_info(self, content_id, mpx_acc=2198311517): items_data = self._download_json( - 'https://www.paramountplus.com/apps-api/v2.0/androidtv/video/cid/%s.json' % content_id, - content_id, query={'locale': 'en-us', 'at': 'ABCqWNNSwhIqINWIIAG+DFzcFUvF8/vcN6cNyXFFfNzWAIvXuoVgX+fK4naOC7V8MLI='}, headers=self.geo_verification_headers()) + f'https://www.paramountplus.com/apps-api/v2.0/androidtv/video/cid/{content_id}.json', + content_id, query={ + 'locale': 'en-us', + 'at': 'ABCXgPuoStiPipsK0OHVXIVh68zNys+G4f7nW9R6qH68GDOcneW6Kg89cJXGfiQCsj0=', + }, headers=self.geo_verification_headers()) asset_types = { item.get('assetType'): { 'format': 'SMIL', - 'formats': 'MPEG4,M3U', + 'formats': 'M3U+none,MPEG4', # '+none' specifies ProtectionScheme (no DRM) } for item in items_data['itemList'] } item = items_data['itemList'][-1] - return self._extract_common_video_info(content_id, asset_types, mpx_acc, extra_info={ + + info, error = {}, None + metadata = { 'title': item.get('title'), 'series': item.get('seriesTitle'), 'season_number': int_or_none(item.get('seasonNum')), 'episode_number': int_or_none(item.get('episodeNum')), 'duration': int_or_none(item.get('duration')), 'thumbnail': url_or_none(item.get('thumbnail')), - }) + } + try: + info = self._extract_common_video_info(content_id, asset_types, mpx_acc, extra_info=metadata) + except ExtractorError as e: + error = e + + # Check for DRM formats to give appropriate error + if not info.get('formats'): + for query in asset_types.values(): + query['formats'] = 'MPEG-DASH,M3U,MPEG4' # allows DRM formats + + try: + drm_info = self._extract_common_video_info(content_id, asset_types, mpx_acc, extra_info=metadata) + except ExtractorError: + if error: + raise error from None + raise + if drm_info['formats']: + self.report_drm(content_id) + elif error: + raise error + + return info class ParamountPlusSeriesIE(InfoExtractor): diff --git a/hypervideo_dl/extractor/parler.py b/hypervideo_dl/extractor/parler.py new file mode 100644 index 0000000..68a60bc --- /dev/null +++ b/hypervideo_dl/extractor/parler.py @@ -0,0 +1,111 @@ +from .common import InfoExtractor +from .youtube import YoutubeIE +from ..utils import ( + clean_html, + format_field, + int_or_none, + strip_or_none, + traverse_obj, + unified_timestamp, + urlencode_postdata, +) + + +class ParlerIE(InfoExtractor): + IE_DESC = 'Posts on parler.com' + _VALID_URL = r'https://parler\.com/feed/(?P<id>[0-9a-f]{8}-(?:[0-9a-f]{4}-){3}[0-9a-f]{12})' + _TESTS = [ + { + 'url': 'https://parler.com/feed/df79fdba-07cc-48fe-b085-3293897520d7', + 'md5': '16e0f447bf186bb3cf64de5bbbf4d22d', + 'info_dict': { + 'id': 'df79fdba-07cc-48fe-b085-3293897520d7', + 'ext': 'mp4', + 'thumbnail': 'https://bl-images.parler.com/videos/6ce7cdf3-a27a-4d72-bf9c-d3e17ce39a66/thumbnail.jpeg', + 'title': 'Parler video #df79fdba-07cc-48fe-b085-3293897520d7', + 'description': 'md5:6f220bde2df4a97cbb89ac11f1fd8197', + 'timestamp': 1659744000, + 'upload_date': '20220806', + 'uploader': 'Tulsi Gabbard', 
+ 'uploader_id': 'TulsiGabbard', + 'uploader_url': 'https://parler.com/TulsiGabbard', + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + { + 'url': 'https://parler.com/feed/a7406eb4-91e5-4793-b5e3-ade57a24e287', + 'md5': '11687e2f5bb353682cee338d181422ed', + 'info_dict': { + 'id': 'a7406eb4-91e5-4793-b5e3-ade57a24e287', + 'ext': 'mp4', + 'thumbnail': 'https://bl-images.parler.com/videos/317827a8-1e48-4cbc-981f-7dd17d4c1183/thumbnail.jpeg', + 'title': 'Parler video #a7406eb4-91e5-4793-b5e3-ade57a24e287', + 'description': 'This man should run for office', + 'timestamp': 1659657600, + 'upload_date': '20220805', + 'uploader': 'Benny Johnson', + 'uploader_id': 'BennyJohnson', + 'uploader_url': 'https://parler.com/BennyJohnson', + 'view_count': int, + 'comment_count': int, + 'repost_count': int, + }, + }, + { + 'url': 'https://parler.com/feed/f23b85c1-6558-470f-b9ff-02c145f28da5', + 'md5': 'eaba1ff4a10fe281f5ce74e930ab2cb4', + 'info_dict': { + 'id': 'r5vkSaz8PxQ', + 'ext': 'mp4', + 'thumbnail': 'https://i.ytimg.com/vi_webp/r5vkSaz8PxQ/maxresdefault.webp', + 'title': 'Tom MacDonald Names Reaction', + 'description': 'md5:33c21f0d35ae6dc2edf3007d6696baea', + 'upload_date': '20220716', + 'duration': 1267, + 'uploader': 'Mahesh Chookolingo', + 'uploader_id': 'maheshchookolingo', + 'uploader_url': 'http://www.youtube.com/user/maheshchookolingo', + 'channel': 'Mahesh Chookolingo', + 'channel_id': 'UCox6YeMSY1PQInbCtTaZj_w', + 'channel_url': 'https://www.youtube.com/channel/UCox6YeMSY1PQInbCtTaZj_w', + 'categories': ['Entertainment'], + 'tags': list, + 'availability': 'public', + 'live_status': 'not_live', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + 'channel_follower_count': int, + 'age_limit': 0, + 'playable_in_embed': True, + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + data = self._download_json( + 'https://parler.com/open-api/ParleyDetailEndpoint.php', video_id, + data=urlencode_postdata({'uuid': video_id}))['data'][0] + primary = data['primary'] + + embed = self._parse_json(primary.get('V2LINKLONG') or '', video_id, fatal=False) + if embed: + return self.url_result(embed[0], YoutubeIE) + + return { + 'id': video_id, + 'url': traverse_obj(primary, ('video_data', 'videoSrc')), + 'thumbnail': traverse_obj(primary, ('video_data', 'thumbnailUrl')), + 'title': '', + 'description': strip_or_none(clean_html(primary.get('full_body'))) or None, + 'timestamp': unified_timestamp(primary.get('date_created')), + 'uploader': strip_or_none(primary.get('name')), + 'uploader_id': strip_or_none(primary.get('username')), + 'uploader_url': format_field(strip_or_none(primary.get('username')), None, 'https://parler.com/%s'), + 'view_count': int_or_none(primary.get('view_count')), + 'comment_count': int_or_none(traverse_obj(data, ('engagement', 'commentCount'))), + 'repost_count': int_or_none(traverse_obj(data, ('engagement', 'echoCount'))), + } diff --git a/hypervideo_dl/extractor/parliamentliveuk.py b/hypervideo_dl/extractor/parliamentliveuk.py deleted file mode 100644 index 974d654..0000000 --- a/hypervideo_dl/extractor/parliamentliveuk.py +++ /dev/null @@ -1,80 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json -import uuid - -from .common import InfoExtractor -from ..utils import ( - unified_timestamp, - try_get, -) - - -class ParliamentLiveUKIE(InfoExtractor): - IE_NAME = 'parliamentlive.tv' - IE_DESC = 'UK parliament videos' - _VALID_URL = 
r'(?i)https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - - _TESTS = [{ - 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', - 'info_dict': { - 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', - 'ext': 'mp4', - 'title': 'Home Affairs Committee', - 'timestamp': 1395153872, - 'upload_date': '20140318', - }, - }, { - 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - video_info = self._download_json(f'https://www.parliamentlive.tv/Event/GetShareVideo/{video_id}', video_id) - _DEVICE_ID = str(uuid.uuid4()) - auth = 'Bearer ' + self._download_json( - 'https://exposure.api.redbee.live/v2/customer/UKParliament/businessunit/ParliamentLive/auth/anonymous', - video_id, headers={ - 'Origin': 'https://videoplayback.parliamentlive.tv', - 'Accept': 'application/json, text/plain, */*', - 'Content-Type': 'application/json;charset=utf-8' - }, data=json.dumps({ - 'deviceId': _DEVICE_ID, - 'device': { - 'deviceId': _DEVICE_ID, - 'width': 653, - 'height': 368, - 'type': 'WEB', - 'name': ' Mozilla Firefox 91' - } - }).encode('utf-8'))['sessionToken'] - - video_urls = self._download_json( - f'https://exposure.api.redbee.live/v2/customer/UKParliament/businessunit/ParliamentLive/entitlement/{video_id}/play', - video_id, headers={'Authorization': auth, 'Accept': 'application/json, text/plain, */*'})['formats'] - - formats = [] - for format in video_urls: - if not format.get('mediaLocator'): - continue - if format.get('format') == 'DASH': - formats.extend(self._extract_mpd_formats( - format['mediaLocator'], video_id, mpd_id='dash', fatal=False)) - elif format.get('format') == 'SMOOTHSTREAMING': - formats.extend(self._extract_ism_formats( - format['mediaLocator'], video_id, ism_id='ism', fatal=False)) - elif format.get('format') == 'HLS': - formats.extend(self._extract_m3u8_formats( - format['mediaLocator'], video_id, m3u8_id='hls', fatal=False)) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'formats': formats, - 'title': video_info['event']['title'], - 'timestamp': unified_timestamp(try_get(video_info, lambda x: x['event']['publishedStartTime'])), - 'thumbnail': video_info.get('thumbnailUrl'), - } diff --git a/hypervideo_dl/extractor/parlview.py b/hypervideo_dl/extractor/parlview.py index c85eaa7..0b54791 100644 --- a/hypervideo_dl/extractor/parlview.py +++ b/hypervideo_dl/extractor/parlview.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -47,7 +44,6 @@ class ParlviewIE(InfoExtractor): elif stream.get('streamType') != 'VOD': self.raise_no_formats('Unknown type of stream was detected: "%s"' % str(stream.get('streamType'))) formats = self._extract_m3u8_formats(stream['url'], video_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) media_info = self._download_webpage( self._MEDIA_INFO_URL % video_id, video_id, note='Downloading media info', fatal=False) diff --git a/hypervideo_dl/extractor/patreon.py b/hypervideo_dl/extractor/patreon.py index 963a0d6..4dc0298 100644 --- a/hypervideo_dl/extractor/patreon.py +++ b/hypervideo_dl/extractor/patreon.py @@ -1,7 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools +from urllib.error import HTTPError from .common import InfoExtractor from .vimeo import VimeoIE @@ -10,17 +8,45 @@ 
from ..compat import compat_urllib_parse_unquote from ..utils import ( clean_html, determine_ext, + ExtractorError, int_or_none, KNOWN_EXTENSIONS, mimetype2ext, parse_iso8601, str_or_none, + traverse_obj, try_get, url_or_none, ) -class PatreonIE(InfoExtractor): +class PatreonBaseIE(InfoExtractor): + USER_AGENT = 'Patreon/7.6.28 (Android; Android 11; Scale/2.10)' + + def _call_api(self, ep, item_id, query=None, headers=None, fatal=True, note=None): + if headers is None: + headers = {} + if 'User-Agent' not in headers: + headers['User-Agent'] = self.USER_AGENT + if query: + query.update({'json-api-version': 1.0}) + + try: + return self._download_json( + f'https://www.patreon.com/api/{ep}', + item_id, note='Downloading API JSON' if not note else note, + query=query, fatal=fatal, headers=headers) + except ExtractorError as e: + if not isinstance(e.cause, HTTPError) or mimetype2ext(e.cause.headers.get('Content-Type')) != 'json': + raise + err_json = self._parse_json(self._webpage_read_content(e.cause, None, item_id), item_id, fatal=False) + err_message = traverse_obj(err_json, ('errors', ..., 'detail'), get_all=False) + if err_message: + raise ExtractorError(f'Patreon said: {err_message}', expected=True) + raise + + +class PatreonIE(PatreonBaseIE): _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?:creation\?hid=|posts/(?:[\w-]+-)?)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.patreon.com/creation?hid=743933', @@ -29,12 +55,18 @@ class PatreonIE(InfoExtractor): 'id': '743933', 'ext': 'mp3', 'title': 'Episode 166: David Smalley of Dogma Debate', - 'description': 'md5:713b08b772cd6271b9f3906683cfacdf', + 'description': 'md5:34d207dd29aa90e24f1b3f58841b81c7', 'uploader': 'Cognitive Dissonance Podcast', 'thumbnail': 're:^https?://.*$', 'timestamp': 1406473987, 'upload_date': '20140727', 'uploader_id': '87145', + 'like_count': int, + 'comment_count': int, + 'uploader_url': 'https://www.patreon.com/dissonancepod', + 'channel_id': '80642', + 'channel_url': 'https://www.patreon.com/dissonancepod', + 'channel_follower_count': int, }, }, { 'url': 'http://www.patreon.com/creation?hid=754133', @@ -45,6 +77,9 @@ class PatreonIE(InfoExtractor): 'title': 'CD 167 Extra', 'uploader': 'Cognitive Dissonance Podcast', 'thumbnail': 're:^https?://.*$', + 'like_count': int, + 'comment_count': int, + 'uploader_url': 'https://www.patreon.com/dissonancepod', }, 'skip': 'Patron-only content', }, { @@ -56,8 +91,23 @@ class PatreonIE(InfoExtractor): 'uploader': 'TraciJHines', 'thumbnail': 're:^https?://.*$', 'upload_date': '20150211', - 'description': 'md5:c5a706b1f687817a3de09db1eb93acd4', + 'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364', 'uploader_id': 'TraciJHines', + 'categories': ['Entertainment'], + 'duration': 282, + 'view_count': int, + 'tags': 'count:39', + 'age_limit': 0, + 'channel': 'TraciJHines', + 'channel_url': 'https://www.youtube.com/channel/UCGLim4T2loE5rwCMdpCIPVg', + 'live_status': 'not_live', + 'like_count': int, + 'channel_id': 'UCGLim4T2loE5rwCMdpCIPVg', + 'availability': 'public', + 'channel_follower_count': int, + 'playable_in_embed': True, + 'uploader_url': 'http://www.youtube.com/user/TraciJHines', + 'comment_count': int, }, 'params': { 'noplaylist': True, @@ -83,38 +133,62 @@ class PatreonIE(InfoExtractor): 'uploader_id': '14936315', }, 'skip': 'Patron-only content' - }] - - # Currently Patreon exposes download URL via hidden CSS, so login is not - # needed. Keeping this commented for when this inevitably changes. 
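[Editor's note] The new PatreonBaseIE._call_api above surfaces the API's own error message instead of a bare HTTP error: on an HTTPError with a JSON body it reads errors[].detail and re-raises with that text. A minimal sketch of the same pattern against a generic JSON API — call_api, the plain-urllib handling and the RuntimeError wrapper are illustrative stand-ins, not part of this diff:

import json
import urllib.error
import urllib.request


def call_api(url):
    # Sketch of the error-surfacing pattern used by PatreonBaseIE._call_api.
    try:
        with urllib.request.urlopen(url) as resp:
            return json.load(resp)
    except urllib.error.HTTPError as e:
        # Only attempt to parse the body when the server says it is JSON.
        if (e.headers.get('Content-Type') or '').startswith('application/json'):
            err = json.load(e)  # HTTPError is file-like, so it can be read
            detail = next((x.get('detail') for x in (err.get('errors') or [])
                           if isinstance(x, dict) and x.get('detail')), None)
            if detail:
                raise RuntimeError(f'API said: {detail}') from e
        raise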
- ''' - def _perform_login(self, username, password): - login_form = { - 'redirectUrl': 'http://www.patreon.com/', - 'email': username, - 'password': password, + }, { + # m3u8 video (https://github.com/hypervideo/hypervideo/issues/2277) + 'url': 'https://www.patreon.com/posts/video-sketchbook-32452882', + 'info_dict': { + 'id': '32452882', + 'ext': 'mp4', + 'comment_count': int, + 'uploader_id': '4301314', + 'like_count': int, + 'timestamp': 1576696962, + 'upload_date': '20191218', + 'thumbnail': r're:^https?://.*$', + 'uploader_url': 'https://www.patreon.com/loish', + 'description': 'md5:e2693e97ee299c8ece47ffdb67e7d9d2', + 'title': 'VIDEO // sketchbook flipthrough', + 'uploader': 'Loish ', + 'tags': ['sketchbook', 'video'], + 'channel_id': '1641751', + 'channel_url': 'https://www.patreon.com/loish', + 'channel_follower_count': int, } - - request = sanitized_Request( - 'https://www.patreon.com/processLogin', - compat_urllib_parse_urlencode(login_form).encode('utf-8') - ) - login_page = self._download_webpage(request, None, note='Logging in') - - if re.search(r'onLoginFailed', login_page): - raise ExtractorError('Unable to login, incorrect username and/or password', expected=True) - - ''' + }, { + # bad videos under media (if media is included). Real one is under post_file + 'url': 'https://www.patreon.com/posts/premium-access-70282931', + 'info_dict': { + 'id': '70282931', + 'ext': 'mp4', + 'title': '[Premium Access + Uncut] The Office - 2x6 The Fight - Group Reaction', + 'channel_url': 'https://www.patreon.com/thenormies', + 'channel_id': '573397', + 'uploader_id': '2929435', + 'uploader': 'The Normies', + 'description': 'md5:79c9fd8778e2cef84049a94c058a5e23', + 'comment_count': int, + 'upload_date': '20220809', + 'thumbnail': r're:^https?://.*$', + 'channel_follower_count': int, + 'like_count': int, + 'timestamp': 1660052820, + 'tags': ['The Office', 'early access', 'uncut'], + 'uploader_url': 'https://www.patreon.com/thenormies', + }, + 'skip': 'Patron-only content', + }] def _real_extract(self, url): video_id = self._match_id(url) - post = self._download_json( - 'https://www.patreon.com/api/posts/' + video_id, video_id, query={ + post = self._call_api( + f'posts/{video_id}', video_id, query={ 'fields[media]': 'download_url,mimetype,size_bytes', - 'fields[post]': 'comment_count,content,embed,image,like_count,post_file,published_at,title', + 'fields[post]': 'comment_count,content,embed,image,like_count,post_file,published_at,title,current_user_can_view', 'fields[user]': 'full_name,url', + 'fields[post_tag]': 'value', + 'fields[campaign]': 'url,name,patron_count', 'json-api-use-default-includes': 'false', - 'include': 'media,user', + 'include': 'audio,user,user_defined_tags,campaign,attachments_media', }) attributes = post['data']['attributes'] title = attributes['title'].strip() @@ -128,6 +202,9 @@ class PatreonIE(InfoExtractor): 'like_count': int_or_none(attributes.get('like_count')), 'comment_count': int_or_none(attributes.get('comment_count')), } + can_view_post = traverse_obj(attributes, 'current_user_can_view') + if can_view_post and info['comment_count']: + info['__post_extractor'] = self.extract_comments(video_id) for i in post.get('included', []): i_type = i.get('type') @@ -135,12 +212,18 @@ class PatreonIE(InfoExtractor): media_attributes = i.get('attributes') or {} download_url = media_attributes.get('download_url') ext = mimetype2ext(media_attributes.get('mimetype')) - if download_url and ext in KNOWN_EXTENSIONS: - info.update({ + + # if size_bytes is None, this media file is 
likely unavailable + # See: https://github.com/hypervideo/hypervideo/issues/4608 + size_bytes = int_or_none(media_attributes.get('size_bytes')) + if download_url and ext in KNOWN_EXTENSIONS and size_bytes is not None: + # XXX: what happens if there are multiple attachments? + return { + **info, 'ext': ext, - 'filesize': int_or_none(media_attributes.get('size_bytes')), + 'filesize': size_bytes, 'url': download_url, - }) + } elif i_type == 'user': user_attributes = i.get('attributes') if user_attributes: @@ -150,87 +233,222 @@ class PatreonIE(InfoExtractor): 'uploader_url': user_attributes.get('url'), }) - if not info.get('url'): - # handle Vimeo embeds - if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo': - embed_html = try_get(attributes, lambda x: x['embed']['html']) - v_url = url_or_none(compat_urllib_parse_unquote( - self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False))) - if v_url: - info.update({ - '_type': 'url_transparent', - 'url': VimeoIE._smuggle_referrer(v_url, 'https://patreon.com'), - 'ie_key': 'Vimeo', - }) + elif i_type == 'post_tag': + info.setdefault('tags', []).append(traverse_obj(i, ('attributes', 'value'))) - if not info.get('url'): - embed_url = try_get(attributes, lambda x: x['embed']['url']) - if embed_url: + elif i_type == 'campaign': info.update({ - '_type': 'url', - 'url': embed_url, + 'channel': traverse_obj(i, ('attributes', 'title')), + 'channel_id': str_or_none(i.get('id')), + 'channel_url': traverse_obj(i, ('attributes', 'url')), + 'channel_follower_count': int_or_none(traverse_obj(i, ('attributes', 'patron_count'))), }) - if not info.get('url'): - post_file = attributes['post_file'] - ext = determine_ext(post_file.get('name')) + # handle Vimeo embeds + if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo': + embed_html = try_get(attributes, lambda x: x['embed']['html']) + v_url = url_or_none(compat_urllib_parse_unquote( + self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False))) + if v_url: + return { + **info, + '_type': 'url_transparent', + 'url': VimeoIE._smuggle_referrer(v_url, 'https://patreon.com'), + 'ie_key': 'Vimeo', + } + + embed_url = try_get(attributes, lambda x: x['embed']['url']) + if embed_url: + return { + **info, + '_type': 'url', + 'url': embed_url, + } + + post_file = traverse_obj(attributes, 'post_file') + if post_file: + name = post_file.get('name') + ext = determine_ext(name) if ext in KNOWN_EXTENSIONS: - info.update({ + return { + **info, 'ext': ext, 'url': post_file['url'], - }) + } + elif name == 'video': + formats, subtitles = self._extract_m3u8_formats_and_subtitles(post_file['url'], video_id) + return { + **info, + 'formats': formats, + 'subtitles': subtitles, + } + if can_view_post is False: + self.raise_no_formats('You do not have access to this post', video_id=video_id, expected=True) + else: + self.raise_no_formats('No supported media found in this post', video_id=video_id, expected=True) return info + def _get_comments(self, post_id): + cursor = None + count = 0 + params = { + 'page[count]': 50, + 'include': 'parent.commenter.campaign,parent.post.user,parent.post.campaign.creator,parent.replies.parent,parent.replies.commenter.campaign,parent.replies.post.user,parent.replies.post.campaign.creator,commenter.campaign,post.user,post.campaign.creator,replies.parent,replies.commenter.campaign,replies.post.user,replies.post.campaign.creator,on_behalf_of_campaign', 
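+ # (Editor's note, not part of the commit) This endpoint speaks JSON:API:
+ # the 'include' value above asks the server to embed related records
+ # (commenters, parent comments, campaigns) in the response's 'included'
+ # array, the total comment count arrives in meta.count, and the loop
+ # below pages by setting page[cursor] to the id of the last comment seen.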
+ 'fields[comment]': 'body,created,is_by_creator', + 'fields[user]': 'image_url,full_name,url', + 'filter[flair]': 'image_tiny_url,name', + 'sort': '-created', + 'json-api-version': 1.0, + 'json-api-use-default-includes': 'false', + } + + for page in itertools.count(1): -class PatreonUserIE(InfoExtractor): + params.update({'page[cursor]': cursor} if cursor else {}) + response = self._call_api( + f'posts/{post_id}/comments', post_id, query=params, note='Downloading comments page %d' % page) + + cursor = None + for comment in traverse_obj(response, (('data', ('included', lambda _, v: v['type'] == 'comment')), ...), default=[]): + count += 1 + comment_id = comment.get('id') + attributes = comment.get('attributes') or {} + if comment_id is None: + continue + author_id = traverse_obj(comment, ('relationships', 'commenter', 'data', 'id')) + author_info = traverse_obj( + response, ('included', lambda _, v: v['id'] == author_id and v['type'] == 'user', 'attributes'), + get_all=False, expected_type=dict, default={}) - _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?P<id>[-\w]+)' + yield { + 'id': comment_id, + 'text': attributes.get('body'), + 'timestamp': parse_iso8601(attributes.get('created')), + 'parent': traverse_obj(comment, ('relationships', 'parent', 'data', 'id'), default='root'), + 'author_is_uploader': attributes.get('is_by_creator'), + 'author_id': author_id, + 'author': author_info.get('full_name'), + 'author_thumbnail': author_info.get('image_url'), + } + if count < traverse_obj(response, ('meta', 'count')): + cursor = traverse_obj(response, ('data', -1, 'id')) + + if cursor is None: + break + + +class PatreonCampaignIE(PatreonBaseIE): + + _VALID_URL = r'https?://(?:www\.)?patreon\.com/(?!rss)(?:(?:m/(?P<campaign_id>\d+))|(?P<vanity>[-\w]+))' _TESTS = [{ 'url': 'https://www.patreon.com/dissonancepod/', 'info_dict': { - 'title': 'dissonancepod', + 'title': 'Cognitive Dissonance Podcast', + 'channel_url': 'https://www.patreon.com/dissonancepod', + 'id': '80642', + 'description': 'md5:eb2fa8b83da7ab887adeac34da6b7af7', + 'channel_id': '80642', + 'channel': 'Cognitive Dissonance Podcast', + 'age_limit': 0, + 'channel_follower_count': int, + 'uploader_id': '87145', + 'uploader_url': 'https://www.patreon.com/dissonancepod', + 'uploader': 'Cognitive Dissonance Podcast', + 'thumbnail': r're:^https?://.*$', }, 'playlist_mincount': 68, - 'expected_warnings': 'Post not viewable by current user! 
Skipping!', + }, { + 'url': 'https://www.patreon.com/m/4767637/posts', + 'info_dict': { + 'title': 'Not Just Bikes', + 'channel_follower_count': int, + 'id': '4767637', + 'channel_id': '4767637', + 'channel_url': 'https://www.patreon.com/notjustbikes', + 'description': 'md5:595c6e7dca76ae615b1d38c298a287a1', + 'age_limit': 0, + 'channel': 'Not Just Bikes', + 'uploader_url': 'https://www.patreon.com/notjustbikes', + 'uploader': 'Not Just Bikes', + 'uploader_id': '37306634', + 'thumbnail': r're:^https?://.*$', + }, + 'playlist_mincount': 71 }, { 'url': 'https://www.patreon.com/dissonancepod/posts', 'only_matching': True - }, ] + }, { + 'url': 'https://www.patreon.com/m/5932659', + 'only_matching': True + }] @classmethod def suitable(cls, url): - return False if PatreonIE.suitable(url) else super(PatreonUserIE, cls).suitable(url) + return False if PatreonIE.suitable(url) else super(PatreonCampaignIE, cls).suitable(url) - def _entries(self, campaign_id, user_id): + def _entries(self, campaign_id): cursor = None params = { - 'fields[campaign]': 'show_audio_post_download_links,name,url', - 'fields[post]': 'current_user_can_view,embed,image,is_paid,post_file,published_at,patreon_url,url,post_type,thumbnail_url,title', + 'fields[post]': 'patreon_url,url', 'filter[campaign_id]': campaign_id, 'filter[is_draft]': 'false', 'sort': '-published_at', - 'json-api-version': 1.0, 'json-api-use-default-includes': 'false', } for page in itertools.count(1): params.update({'page[cursor]': cursor} if cursor else {}) - posts_json = self._download_json('https://www.patreon.com/api/posts', user_id, note='Downloading posts page %d' % page, query=params, headers={'Cookie': '.'}) - - cursor = try_get(posts_json, lambda x: x['meta']['pagination']['cursors']['next']) + posts_json = self._call_api('posts', campaign_id, query=params, note='Downloading posts page %d' % page) + cursor = traverse_obj(posts_json, ('meta', 'pagination', 'cursors', 'next')) for post in posts_json.get('data') or []: - yield self.url_result(url_or_none(try_get(post, lambda x: x['attributes']['patreon_url'])), 'Patreon') + yield self.url_result(url_or_none(traverse_obj(post, ('attributes', 'patreon_url'))), 'Patreon') if cursor is None: break def _real_extract(self, url): - user_id = self._match_id(url) - webpage = self._download_webpage(url, user_id, headers={'Cookie': '.'}) - campaign_id = self._search_regex(r'https://www.patreon.com/api/campaigns/(\d+)/?', webpage, 'Campaign ID') - return self.playlist_result(self._entries(campaign_id, user_id), playlist_title=user_id) + campaign_id, vanity = self._match_valid_url(url).group('campaign_id', 'vanity') + if campaign_id is None: + webpage = self._download_webpage(url, vanity, headers={'User-Agent': self.USER_AGENT}) + campaign_id = self._search_regex(r'https://www.patreon.com/api/campaigns/(\d+)/?', webpage, 'Campaign ID') + + params = { + 'json-api-use-default-includes': 'false', + 'fields[user]': 'full_name,url', + 'fields[campaign]': 'name,summary,url,patron_count,creation_count,is_nsfw,avatar_photo_url', + 'include': 'creator' + } + + campaign_response = self._call_api( + f'campaigns/{campaign_id}', campaign_id, + note='Downloading campaign info', fatal=False, + query=params) or {} + + campaign_info = campaign_response.get('data') or {} + channel_name = traverse_obj(campaign_info, ('attributes', 'name')) + user_info = traverse_obj( + campaign_response, ('included', lambda _, v: v['type'] == 'user'), + default={}, expected_type=dict, get_all=False) + + return { + '_type': 'playlist', + 'id': 
campaign_id, + 'title': channel_name, + 'entries': self._entries(campaign_id), + 'description': clean_html(traverse_obj(campaign_info, ('attributes', 'summary'))), + 'channel_url': traverse_obj(campaign_info, ('attributes', 'url')), + 'channel_follower_count': int_or_none(traverse_obj(campaign_info, ('attributes', 'patron_count'))), + 'channel_id': campaign_id, + 'channel': channel_name, + 'uploader_url': traverse_obj(user_info, ('attributes', 'url')), + 'uploader_id': str_or_none(user_info.get('id')), + 'uploader': traverse_obj(user_info, ('attributes', 'full_name')), + 'playlist_count': traverse_obj(campaign_info, ('attributes', 'creation_count')), + 'age_limit': 18 if traverse_obj(campaign_info, ('attributes', 'is_nsfw')) else 0, + 'thumbnail': url_or_none(traverse_obj(campaign_info, ('attributes', 'avatar_photo_url'))), + } diff --git a/hypervideo_dl/extractor/pbs.py b/hypervideo_dl/extractor/pbs.py index e48a2b8..5bdf561 100644 --- a/hypervideo_dl/extractor/pbs.py +++ b/hypervideo_dl/extractor/pbs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -663,7 +660,6 @@ class PBSIE(InfoExtractor): for f in formats: if (f.get('format_note') or '').endswith(' AD'): # Audio description f['language_preference'] = -10 - self._sort_formats(formats) rating_str = info.get('rating') if rating_str is not None: diff --git a/hypervideo_dl/extractor/pearvideo.py b/hypervideo_dl/extractor/pearvideo.py index 1d77722..e27e5a7 100644 --- a/hypervideo_dl/extractor/pearvideo.py +++ b/hypervideo_dl/extractor/pearvideo.py @@ -1,12 +1,10 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor from ..utils import ( qualities, unified_timestamp, + traverse_obj, ) @@ -39,7 +37,14 @@ class PearVideoIE(InfoExtractor): } for mobj in re.finditer( r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2', webpage)] - self._sort_formats(formats) + if not formats: + info = self._download_json( + 'https://www.pearvideo.com/videoStatus.jsp', video_id=video_id, + query={'contId': video_id}, headers={'Referer': url}) + formats = [{ + 'format_id': k, + 'url': v.replace(info['systemTime'], f'cont-{video_id}') if k == 'srcUrl' else v + } for k, v in traverse_obj(info, ('videoInfo', 'videos'), default={}).items() if v] title = self._search_regex( (r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)', diff --git a/hypervideo_dl/extractor/peekvids.py b/hypervideo_dl/extractor/peekvids.py index 4bf6855..2d9b9a7 100644 --- a/hypervideo_dl/extractor/peekvids.py +++ b/hypervideo_dl/extractor/peekvids.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor @@ -43,7 +40,6 @@ class PeekVidsIE(InfoExtractor): } for name, url in srcs.items() if len(name) > 8 and name.startswith('data-src')] if not formats: formats = [{'url': url} for url in srcs.values()] - self._sort_formats(formats) info = self._search_json_ld(webpage, video_id, expected_type='VideoObject') info.update({ @@ -54,7 +50,7 @@ class PeekVidsIE(InfoExtractor): return info -class PlayVidsIE(PeekVidsIE): +class PlayVidsIE(PeekVidsIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?playvids\.com/(?:embed/|[^/]{2}/)?(?P<id>[^/?#]*)' _TESTS = [{ 'url': 'https://www.playvids.com/U3pBrYhsjXM/pc/dane-jones-cute-redhead-with-perfect-tits-with-mini-vamp', diff --git a/hypervideo_dl/extractor/peertube.py b/hypervideo_dl/extractor/peertube.py index 9d6b821..68e1573 
100644 --- a/hypervideo_dl/extractor/peertube.py +++ b/hypervideo_dl/extractor/peertube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import re @@ -1060,6 +1057,7 @@ class PeerTubeIE(InfoExtractor): ) (?P<id>%s) ''' % (_INSTANCES_RE, _UUID_RE) + _EMBED_REGEX = [r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//{_INSTANCES_RE}/videos/embed/{cls._UUID_RE})'''] _TESTS = [{ 'url': 'https://framatube.org/videos/watch/9c9de5e8-0a1e-484a-b099-e80766180a6d', 'md5': '8563064d245a4be5705bddb22bb00a28', @@ -1161,16 +1159,15 @@ class PeerTubeIE(InfoExtractor): '>We are sorry but it seems that PeerTube is not compatible with your web browser.<')): return 'peertube:%s:%s' % mobj.group('host', 'id') - @staticmethod - def _extract_urls(webpage, source_url): - entries = re.findall( - r'''(?x)<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//%s/videos/embed/%s)''' - % (PeerTubeIE._INSTANCES_RE, PeerTubeIE._UUID_RE), webpage) - if not entries: - peertube_url = PeerTubeIE._extract_peertube_url(webpage, source_url) - if peertube_url: - entries = [peertube_url] - return entries + @classmethod + def _extract_embed_urls(cls, url, webpage): + embeds = tuple(super()._extract_embed_urls(url, webpage)) + if embeds: + return embeds + + peertube_url = cls._extract_peertube_url(webpage, url) + if peertube_url: + return [peertube_url] def _call_api(self, host, video_id, path, note=None, errnote=None, fatal=True): return self._download_json( @@ -1236,7 +1233,6 @@ class PeerTubeIE(InfoExtractor): else: f['fps'] = int_or_none(file_.get('fps')) formats.append(f) - self._sort_formats(formats) description = video.get('description') if description and len(description) >= 250: diff --git a/hypervideo_dl/extractor/peertv.py b/hypervideo_dl/extractor/peertv.py index 002d33a..a709e21 100644 --- a/hypervideo_dl/extractor/peertv.py +++ b/hypervideo_dl/extractor/peertv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import js_to_json @@ -46,8 +43,6 @@ class PeerTVIE(InfoExtractor): formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls') - self._sort_formats(formats) - return { 'id': video_id, 'title': self._html_search_regex(r'<h1>(.+?)</h1>', webpage, 'title').replace('\xa0', ' '), diff --git a/hypervideo_dl/extractor/peloton.py b/hypervideo_dl/extractor/peloton.py index 7d83225..4835822 100644 --- a/hypervideo_dl/extractor/peloton.py +++ b/hypervideo_dl/extractor/peloton.py @@ -1,14 +1,9 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse, -) +from ..compat import compat_HTTPError from ..utils import ( ExtractorError, float_or_none, @@ -128,7 +123,7 @@ class PelotonIE(InfoExtractor): is_live = False if ride_data.get('content_format') == 'audio': - url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('vod_stream_url'), compat_urllib_parse.quote(token)) + url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('vod_stream_url'), urllib.parse.quote(token)) formats = [{ 'url': url, 'ext': 'm4a', @@ -141,9 +136,9 @@ class PelotonIE(InfoExtractor): url = 'https://members.onepeloton.com/.netlify/functions/m3u8-proxy?displayLanguage=en&acceptedSubtitles=%s&url=%s?hdnea=%s' % ( ','.join([re.sub('^([a-z]+)-([A-Z]+)$', r'\1', caption) for caption in ride_data['captions']]), ride_data['vod_stream_url'], - 
compat_urllib_parse.quote(compat_urllib_parse.quote(token))) + urllib.parse.quote(urllib.parse.quote(token))) elif ride_data.get('live_stream_url'): - url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('live_stream_url'), compat_urllib_parse.quote(token)) + url = self._MANIFEST_URL_TEMPLATE % (ride_data.get('live_stream_url'), urllib.parse.quote(token)) is_live = True else: raise ExtractorError('Missing video URL') @@ -162,7 +157,6 @@ class PelotonIE(InfoExtractor): 'title': segment.get('name') } for segment in traverse_obj(metadata, ('segments', 'segment_list'))] - self._sort_formats(formats) return { 'id': video_id, 'title': ride_data.get('title'), diff --git a/hypervideo_dl/extractor/people.py b/hypervideo_dl/extractor/people.py index 6ca9571..c5143c3 100644 --- a/hypervideo_dl/extractor/people.py +++ b/hypervideo_dl/extractor/people.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/performgroup.py b/hypervideo_dl/extractor/performgroup.py index c00d393..f4d7f22 100644 --- a/hypervideo_dl/extractor/performgroup.py +++ b/hypervideo_dl/extractor/performgroup.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import int_or_none @@ -69,7 +65,6 @@ class PerformGroupIE(InfoExtractor): 'vbr': int_or_none(c.get('videoRate'), 1000), 'abr': int_or_none(c.get('audioRate'), 1000), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/periscope.py b/hypervideo_dl/extractor/periscope.py index 1a292b8..84bcf15 100644 --- a/hypervideo_dl/extractor/periscope.py +++ b/hypervideo_dl/extractor/periscope.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -70,6 +65,7 @@ class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' + _EMBED_REGEX = [r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1'] # Alive example URLs can be found here https://www.periscope.tv/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', @@ -95,13 +91,6 @@ class PeriscopeIE(PeriscopeBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): token = self._match_id(url) @@ -138,7 +127,6 @@ class PeriscopeIE(PeriscopeBaseIE): } self._add_width_and_height(rtmp_format) formats.append(rtmp_format) - self._sort_formats(formats) info['formats'] = formats return info diff --git a/hypervideo_dl/extractor/philharmoniedeparis.py b/hypervideo_dl/extractor/philharmoniedeparis.py index 9f4899c..e8494a0 100644 --- a/hypervideo_dl/extractor/philharmoniedeparis.py +++ b/hypervideo_dl/extractor/philharmoniedeparis.py @@ -1,12 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str -from ..utils import ( - try_get, - urljoin, -) +from ..utils import try_get class PhilharmonieDeParisIE(InfoExtractor): @@ -15,27 +9,29 @@ class PhilharmonieDeParisIE(InfoExtractor): https?:// (?: 
live\.philharmoniedeparis\.fr/(?:[Cc]oncert/|embed(?:app)?/|misc/Playlist\.ashx\?id=)| - pad\.philharmoniedeparis\.fr/doc/CIMU/ + pad\.philharmoniedeparis\.fr/(?:doc/CIMU/|player\.aspx\?id=)| + philharmoniedeparis\.fr/fr/live/concert/| + otoplayer\.philharmoniedeparis\.fr/fr/embed/ ) (?P<id>\d+) ''' _TESTS = [{ - 'url': 'http://pad.philharmoniedeparis.fr/doc/CIMU/1086697/jazz-a-la-villette-knower', - 'md5': 'a0a4b195f544645073631cbec166a2c2', + 'url': 'https://philharmoniedeparis.fr/fr/live/concert/1129666-danses-symphoniques', + 'md5': '24bdb7e86c200c107680e1f7770330ae', 'info_dict': { - 'id': '1086697', + 'id': '1129666', 'ext': 'mp4', - 'title': 'Jazz à la Villette : Knower', + 'title': 'Danses symphoniques. Orchestre symphonique Divertimento - Zahia Ziouani. Bizet, de Falla, Stravinski, Moussorgski, Saint-Saëns', }, }, { - 'url': 'http://live.philharmoniedeparis.fr/concert/1032066.html', + 'url': 'https://philharmoniedeparis.fr/fr/live/concert/1032066-akademie-fur-alte-musik-berlin-rias-kammerchor-rene-jacobs-passion-selon-saint-jean-de-johann', 'info_dict': { 'id': '1032066', - 'title': 'md5:0a031b81807b3593cffa3c9a87a167a0', + 'title': 'Akademie für alte Musik Berlin, Rias Kammerchor, René Jacobs : Passion selon saint Jean de Johann Sebastian Bach', }, 'playlist_mincount': 2, }, { - 'url': 'http://live.philharmoniedeparis.fr/Concert/1030324.html', + 'url': 'https://philharmoniedeparis.fr/fr/live/concert/1030324-orchestre-philharmonique-de-radio-france-myung-whun-chung-renaud-capucon-pascal-dusapin-johannes', 'only_matching': True, }, { 'url': 'http://live.philharmoniedeparis.fr/misc/Playlist.ashx?id=1030324&track=&lang=fr', @@ -44,16 +40,15 @@ class PhilharmonieDeParisIE(InfoExtractor): 'url': 'https://live.philharmoniedeparis.fr/embedapp/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR', 'only_matching': True, }, { - 'url': 'https://live.philharmoniedeparis.fr/embed/1098406/berlioz-fantastique-lelio-les-siecles-national-youth-choir-of.html?lang=fr-FR', + 'url': 'https://otoplayer.philharmoniedeparis.fr/fr/embed/1098406?lang=fr-FR', 'only_matching': True, }] - _LIVE_URL = 'https://live.philharmoniedeparis.fr' def _real_extract(self, url): video_id = self._match_id(url) config = self._download_json( - '%s/otoPlayer/config.ashx' % self._LIVE_URL, video_id, query={ + 'https://otoplayer.philharmoniedeparis.fr/fr/config/%s.json' % video_id, video_id, query={ 'id': video_id, 'lang': 'fr-FR', }) @@ -75,31 +70,27 @@ class PhilharmonieDeParisIE(InfoExtractor): if not format_url or format_url in format_urls: continue format_urls.add(format_url) - m3u8_url = urljoin(self._LIVE_URL, format_url) formats.extend(self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + format_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) if not formats and not self.get_param('ignore_no_formats'): return - self._sort_formats(formats) return { 'title': title, 'formats': formats, + 'thumbnail': files.get('thumbnail'), } - - thumbnail = urljoin(self._LIVE_URL, config.get('image')) - info = extract_entry(config) if info: info.update({ 'id': video_id, - 'thumbnail': thumbnail, }) return info - entries = [] for num, chapter in enumerate(config['chapters'], start=1): entry = extract_entry(chapter) + if entry is None: + continue entry['id'] = '%s-%d' % (video_id, num) entries.append(entry) diff --git a/hypervideo_dl/extractor/phoenix.py b/hypervideo_dl/extractor/phoenix.py index e3ea014..5fa133a 100644 --- 
a/hypervideo_dl/extractor/phoenix.py +++ b/hypervideo_dl/extractor/phoenix.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .youtube import YoutubeIE diff --git a/hypervideo_dl/extractor/photobucket.py b/hypervideo_dl/extractor/photobucket.py index 53aebe2..71e9a48 100644 --- a/hypervideo_dl/extractor/photobucket.py +++ b/hypervideo_dl/extractor/photobucket.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/piapro.py b/hypervideo_dl/extractor/piapro.py index c4eb491..d8d9c78 100644 --- a/hypervideo_dl/extractor/piapro.py +++ b/hypervideo_dl/extractor/piapro.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( @@ -27,6 +24,18 @@ class PiaproIE(InfoExtractor): 'title': '裏表ラバーズ', 'thumbnail': r're:^https?://.*\.jpg$', } + }, { + 'note': 'There are break lines in description, mandating (?s) flag', + 'url': 'https://piapro.jp/t/9cSd', + 'md5': '952bb6d1e8de95050206408a87790676', + 'info_dict': { + 'id': '9cSd', + 'ext': 'mp3', + 'title': '青に溶けた風船 / 初音ミク', + 'description': 'md5:d395a9bd151447631a5a1460bc7f9132', + 'uploader': 'シアン・キノ', + 'uploader_id': 'cyankino', + } }] _login_status = False @@ -81,7 +90,7 @@ class PiaproIE(InfoExtractor): return { 'id': video_id, 'title': self._html_search_regex(r'<h1\s+class="cd_works-title">(.+?)</h1>', webpage, 'title', fatal=False), - 'description': self._html_search_regex(r'<p\s+class="cd_dtl_cap">(.+?)</p>\s*<div', webpage, 'description', fatal=False), + 'description': self._html_search_regex(r'(?s)<p\s+class="cd_dtl_cap">(.+?)</p>\s*<div', webpage, 'description', fatal=False), 'uploader': uploader, 'uploader_id': uploader_id, 'timestamp': unified_timestamp(create_date, False), diff --git a/hypervideo_dl/extractor/picarto.py b/hypervideo_dl/extractor/picarto.py index adf21fd..36a062d 100644 --- a/hypervideo_dl/extractor/picarto.py +++ b/hypervideo_dl/extractor/picarto.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -67,7 +64,6 @@ class PicartoIE(InfoExtractor): formats.append({ 'url': source_url, }) - self._sort_formats(formats) mature = metadata.get('adult') if mature is None: @@ -117,7 +113,6 @@ class PicartoVodIE(InfoExtractor): formats = self._extract_m3u8_formats( vod_info['vod'], video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/piksel.py b/hypervideo_dl/extractor/piksel.py index 84c3de2..cc60b30 100644 --- a/hypervideo_dl/extractor/piksel.py +++ b/hypervideo_dl/extractor/piksel.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -33,6 +30,7 @@ class PikselIE(InfoExtractor): )\.jp| vidego\.baltimorecity\.gov )/v/(?:refid/(?P<refid>[^/]+)/prefid/)?(?P<id>[\w-]+)''' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)'] _TESTS = [ { 'url': 'http://player.piksel.com/v/ums2867l', @@ -65,14 +63,6 @@ class PikselIE(InfoExtractor): } ] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//player\.piksel\.com/v/[a-z0-9]+)', - webpage) - if mobj: - return mobj.group('url') - def _call_api(self, app_token, resource, display_id, 
query, fatal=True): response = (self._download_json( 'http://player.piksel.com/ws/ws_%s/api/%s/mode/json/apiv/5' % (resource, app_token), @@ -163,8 +153,6 @@ class PikselIE(InfoExtractor): re.sub(r'/od/[^/]+/', '/od/http/', smil_url), video_id, transform_source=transform_source, fatal=False)) - self._sort_formats(formats, ('tbr', )) # Incomplete resolution information - subtitles = {} for caption in video_data.get('captions', []): caption_url = caption.get('url') @@ -180,4 +168,5 @@ class PikselIE(InfoExtractor): 'timestamp': parse_iso8601(video_data.get('dateadd')), 'formats': formats, 'subtitles': subtitles, + '_format_sort_fields': ('tbr', ), # Incomplete resolution information } diff --git a/hypervideo_dl/extractor/pinkbike.py b/hypervideo_dl/extractor/pinkbike.py index 9f3501f..e4e1caa 100644 --- a/hypervideo_dl/extractor/pinkbike.py +++ b/hypervideo_dl/extractor/pinkbike.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -52,7 +49,6 @@ class PinkbikeIE(InfoExtractor): 'format_id': format_id, 'height': height, }) - self._sort_formats(formats) title = remove_end(self._og_search_title(webpage), ' Video - Pinkbike') description = self._html_search_regex( diff --git a/hypervideo_dl/extractor/pinterest.py b/hypervideo_dl/extractor/pinterest.py index 80e9cd0..2c6cd6d 100644 --- a/hypervideo_dl/extractor/pinterest.py +++ b/hypervideo_dl/extractor/pinterest.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -55,7 +52,6 @@ class PinterestBaseIE(InfoExtractor): 'height': int_or_none(format_dict.get('height')), 'duration': duration, }) - self._sort_formats(formats) description = data.get('description') or data.get('description_html') or data.get('seo_description') timestamp = unified_timestamp(data.get('created_at')) diff --git a/hypervideo_dl/extractor/pixivsketch.py b/hypervideo_dl/extractor/pixivsketch.py index f0ad0b2..850c6f2 100644 --- a/hypervideo_dl/extractor/pixivsketch.py +++ b/hypervideo_dl/extractor/pixivsketch.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -74,7 +71,6 @@ class PixivSketchIE(PixivSketchBaseIE): formats = self._extract_m3u8_formats( m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/pladform.py b/hypervideo_dl/extractor/pladform.py index 99ade85..dcf18e1 100644 --- a/hypervideo_dl/extractor/pladform.py +++ b/hypervideo_dl/extractor/pladform.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -27,6 +22,7 @@ class PladformIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1'] _TESTS = [{ 'url': 'http://out.pladform.ru/player?pl=18079&type=html5&videoid=100231282', 'info_dict': { @@ -64,13 +60,6 @@ class PladformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): video_id = self._match_id(url) @@ -122,8 +111,6 @@ class PladformIE(InfoExtractor): if error: fail(error) - 
self._sort_formats(formats) - webpage = self._download_webpage( 'http://video.pladform.ru/catalog/video/videoid/%s' % video_id, video_id) diff --git a/hypervideo_dl/extractor/planetmarathi.py b/hypervideo_dl/extractor/planetmarathi.py index 07ac15b..25753fe 100644 --- a/hypervideo_dl/extractor/planetmarathi.py +++ b/hypervideo_dl/extractor/planetmarathi.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( try_get, @@ -60,7 +57,6 @@ class PlanetMarathiIE(InfoExtractor): asset_title = id.replace('-', ' ') asset_id = f'{asset["sk"]}_{id}'.replace('#', '-') formats, subtitles = self._extract_m3u8_formats_and_subtitles(asset['mediaAssetURL'], asset_id) - self._sort_formats(formats) entries.append({ 'id': asset_id, 'title': asset_title, diff --git a/hypervideo_dl/extractor/platzi.py b/hypervideo_dl/extractor/platzi.py index 17f52e7..b8a4414 100644 --- a/hypervideo_dl/extractor/platzi.py +++ b/hypervideo_dl/extractor/platzi.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_b64decode, @@ -130,7 +127,6 @@ class PlatziIE(PlatziBaseIE): format_url, lecture_id, mpd_id=format_id, note='Downloading %s MPD manifest' % server_id, fatal=False)) - self._sort_formats(formats) content = str_or_none(desc.get('content')) description = (clean_html(compat_b64decode(content).decode('utf-8')) diff --git a/hypervideo_dl/extractor/playfm.py b/hypervideo_dl/extractor/playfm.py index 4298cbe..e895ba4 100644 --- a/hypervideo_dl/extractor/playfm.py +++ b/hypervideo_dl/extractor/playfm.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/playplustv.py b/hypervideo_dl/extractor/playplustv.py index cad2c3a..316f220 100644 --- a/hypervideo_dl/extractor/playplustv.py +++ b/hypervideo_dl/extractor/playplustv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -82,7 +79,6 @@ class PlayPlusTVIE(InfoExtractor): 'width': int_or_none(file_info.get('width')), 'height': int_or_none(file_info.get('height')), }) - self._sort_formats(formats) thumbnails = [] for thumb in media.get('thumbs', []): diff --git a/hypervideo_dl/extractor/plays.py b/hypervideo_dl/extractor/plays.py index ddfc6f1..9371f7b 100644 --- a/hypervideo_dl/extractor/plays.py +++ b/hypervideo_dl/extractor/plays.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -41,7 +38,6 @@ class PlaysTVIE(InfoExtractor): 'format_id': 'http-' + format_id, 'height': int_or_none(height), }) - self._sort_formats(formats) info.update({ 'id': video_id, diff --git a/hypervideo_dl/extractor/playstuff.py b/hypervideo_dl/extractor/playstuff.py index 5a32995..b424ba1 100644 --- a/hypervideo_dl/extractor/playstuff.py +++ b/hypervideo_dl/extractor/playstuff.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/playsuisse.py b/hypervideo_dl/extractor/playsuisse.py new file mode 100644 index 0000000..a635ac9 --- /dev/null +++ b/hypervideo_dl/extractor/playsuisse.py @@ -0,0 +1,147 @@ +import json + +from .common import InfoExtractor +from ..utils import int_or_none, traverse_obj + + +class 
PlaySuisseIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?playsuisse\.ch/watch/(?P<id>[0-9]+)' + _TESTS = [ + { + 'url': 'https://www.playsuisse.ch/watch/763211/0', + 'md5': '82df2a470b2dfa60c2d33772a8a60cf8', + 'info_dict': { + 'id': '763211', + 'ext': 'mp4', + 'title': 'Knochen', + 'description': 'md5:8ea7a8076ba000cd9e8bc132fd0afdd8', + 'duration': 3344, + 'series': 'Wilder', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Knochen', + 'episode_number': 1, + 'thumbnail': 'md5:9260abe0c0ec9b69914d0a10d54c5878' + } + }, + { + 'url': 'https://www.playsuisse.ch/watch/808675/0', + 'md5': '818b94c1d2d7c4beef953f12cb8f3e75', + 'info_dict': { + 'id': '808675', + 'ext': 'mp4', + 'title': 'Der Läufer', + 'description': 'md5:9f61265c7e6dcc3e046137a792b275fd', + 'duration': 5280, + 'episode': 'Der Läufer', + 'thumbnail': 'md5:44af7d65ee02bbba4576b131868bb783' + } + }, + { + 'url': 'https://www.playsuisse.ch/watch/817193/0', + 'md5': '1d6c066f92cd7fffd8b28a53526d6b59', + 'info_dict': { + 'id': '817193', + 'ext': 'mp4', + 'title': 'Die Einweihungsparty', + 'description': 'md5:91ebf04d3a42cb3ab70666acf750a930', + 'duration': 1380, + 'series': 'Nr. 47', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Die Einweihungsparty', + 'episode_number': 1, + 'thumbnail': 'md5:637585fb106e3a4bcd991958924c7e44' + } + } + ] + + _GRAPHQL_QUERY = ''' + query AssetWatch($assetId: ID!) { + assetV2(id: $assetId) { + ...Asset + episodes { + ...Asset + } + } + } + fragment Asset on AssetV2 { + id + name + description + duration + episodeNumber + seasonNumber + seriesName + medias { + type + url + } + thumbnail16x9 { + ...ImageDetails + } + thumbnail2x3 { + ...ImageDetails + } + thumbnail16x9WithTitle { + ...ImageDetails + } + thumbnail2x3WithTitle { + ...ImageDetails + } + } + fragment ImageDetails on AssetImage { + id + url + }''' + + def _get_media_data(self, media_id): + # NOTE: In the web app, the "locale" header is used to switch between languages. + # However, this doesn't seem to take effect when passing the header here.
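[Editor's note] The call below is a plain GraphQL POST. A hedged sketch of the same request using only the standard library — the endpoint, operation name, body shape and headers are taken verbatim from this diff, while the function name is illustrative:

import json
import urllib.request


def asset_watch(asset_id, graphql_query):
    # graphql_query is the _GRAPHQL_QUERY document defined above.
    req = urllib.request.Request(
        'https://4bbepzm4ef.execute-api.eu-central-1.amazonaws.com/prod/graphql',
        data=json.dumps({
            'operationName': 'AssetWatch',
            'query': graphql_query,
            'variables': {'assetId': asset_id},
        }).encode(),
        headers={'Content-Type': 'application/json', 'locale': 'de'})
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)['data']['assetV2']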
+ response = self._download_json( + 'https://4bbepzm4ef.execute-api.eu-central-1.amazonaws.com/prod/graphql', + media_id, data=json.dumps({ + 'operationName': 'AssetWatch', + 'query': self._GRAPHQL_QUERY, + 'variables': {'assetId': media_id} + }).encode('utf-8'), + headers={'Content-Type': 'application/json', 'locale': 'de'}) + + return response['data']['assetV2'] + + def _real_extract(self, url): + media_id = self._match_id(url) + media_data = self._get_media_data(media_id) + info = self._extract_single(media_data) + if media_data.get('episodes'): + info.update({ + '_type': 'playlist', + 'entries': map(self._extract_single, media_data['episodes']), + }) + return info + + def _extract_single(self, media_data): + thumbnails = traverse_obj(media_data, lambda k, _: k.startswith('thumbnail')) + + formats, subtitles = [], {} + for media in traverse_obj(media_data, 'medias', default=[]): + if not media.get('url') or media.get('type') != 'HLS': + continue + f, subs = self._extract_m3u8_formats_and_subtitles( + media['url'], media_data['id'], 'mp4', m3u8_id='HLS', fatal=False) + formats.extend(f) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': media_data['id'], + 'title': media_data.get('name'), + 'description': media_data.get('description'), + 'thumbnails': thumbnails, + 'duration': int_or_none(media_data.get('duration')), + 'formats': formats, + 'subtitles': subtitles, + 'series': media_data.get('seriesName'), + 'season_number': int_or_none(media_data.get('seasonNumber')), + 'episode': media_data.get('name'), + 'episode_number': int_or_none(media_data.get('episodeNumber')), + } diff --git a/hypervideo_dl/extractor/playtvak.py b/hypervideo_dl/extractor/playtvak.py index 30c8a59..c418f88 100644 --- a/hypervideo_dl/extractor/playtvak.py +++ b/hypervideo_dl/extractor/playtvak.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_urlparse, @@ -163,7 +160,6 @@ class PlaytvakIE(InfoExtractor): 'quality': quality(fmt.get('quality')), 'preference': preference, }) - self._sort_formats(formats) title = item['title'] is_live = item['type'] == 'stream' diff --git a/hypervideo_dl/extractor/playvid.py b/hypervideo_dl/extractor/playvid.py index e1c406b..1e0989d 100644 --- a/hypervideo_dl/extractor/playvid.py +++ b/hypervideo_dl/extractor/playvid.py @@ -1,16 +1,9 @@ -from __future__ import unicode_literals - import re +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_unquote_plus, -) -from ..utils import ( - clean_html, - ExtractorError, -) +from ..compat import compat_urllib_parse_unquote +from ..utils import ExtractorError, clean_html class PlayvidIE(InfoExtractor): @@ -64,7 +57,7 @@ class PlayvidIE(InfoExtractor): val = videovars_match.group(2) if key == 'title': - video_title = compat_urllib_parse_unquote_plus(val) + video_title = urllib.parse.unquote_plus(val) if key == 'duration': try: duration = int(val) @@ -81,7 +74,6 @@ class PlayvidIE(InfoExtractor): 'height': height, 'url': val, }) - self._sort_formats(formats) # Extract title - should be in the flashvars; if not, look elsewhere if video_title is None: diff --git a/hypervideo_dl/extractor/playwire.py b/hypervideo_dl/extractor/playwire.py index 9c9e597..1057bff 100644 --- a/hypervideo_dl/extractor/playwire.py +++ b/hypervideo_dl/extractor/playwire.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import 
( dict_get, @@ -10,6 +7,8 @@ from ..utils import ( class PlaywireIE(InfoExtractor): _VALID_URL = r'https?://(?:config|cdn)\.playwire\.com(?:/v2)?/(?P<publisher_id>\d+)/(?:videos/v2|embed|config)/(?P<id>\d+)' + _EMBED_REGEX = [r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1'] + _TESTS = [{ 'url': 'http://config.playwire.com/14907/videos/v2/3353705/player.json', 'md5': 'e6398701e3595888125729eaa2329ed9', @@ -63,7 +62,6 @@ class PlaywireIE(InfoExtractor): for a_format in formats: if not dict_get(a_format, ['tbr', 'width', 'height']): a_format['quality'] = 1 if '-hd.' in a_format['url'] else 0 - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/pluralsight.py b/hypervideo_dl/extractor/pluralsight.py index 2a5e0e4..809b656 100644 --- a/hypervideo_dl/extractor/pluralsight.py +++ b/hypervideo_dl/extractor/pluralsight.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import collections import json import os @@ -412,8 +410,6 @@ query viewClip { }) formats.append(clip_f) - self._sort_formats(formats) - duration = int_or_none( clip.get('duration')) or parse_duration(clip.get('formattedDuration')) diff --git a/hypervideo_dl/extractor/plutotv.py b/hypervideo_dl/extractor/plutotv.py index 26aff1a..71a05cc 100644 --- a/hypervideo_dl/extractor/plutotv.py +++ b/hypervideo_dl/extractor/plutotv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import uuid @@ -138,7 +135,6 @@ class PlutoTVIE(InfoExtractor): subtitles = self._merge_subtitles(subtitles, subs) formats, subtitles = self._to_ad_free_formats(video_id, formats, subtitles) - self._sort_formats(formats) info = { 'id': video_id, diff --git a/hypervideo_dl/extractor/podbayfm.py b/hypervideo_dl/extractor/podbayfm.py new file mode 100644 index 0000000..2a26fd2 --- /dev/null +++ b/hypervideo_dl/extractor/podbayfm.py @@ -0,0 +1,75 @@ +from .common import InfoExtractor +from ..utils import OnDemandPagedList, int_or_none, jwt_decode_hs256, try_call + + +def result_from_props(props, episode_id=None): + return { + 'id': props.get('podcast_id') or episode_id, + 'title': props.get('title'), + 'url': props['mediaURL'], + 'ext': 'mp3', + 'thumbnail': try_call(lambda: jwt_decode_hs256(props['image'])['url']), + 'timestamp': props.get('timestamp'), + 'duration': int_or_none(props.get('duration')), + } + + +class PodbayFMIE(InfoExtractor): + _VALID_URL = r'https?://podbay\.fm/p/[^/]*/e/(?P<id>[^/]*)/?(?:[\?#].*)?$' + _TESTS = [{ + 'url': 'https://podbay.fm/p/behind-the-bastards/e/1647338400', + 'md5': '98b41285dcf7989d105a4ed0404054cf', + 'info_dict': { + 'id': '1647338400', + 'title': 'Part One: Kissinger', + 'ext': 'mp3', + 'thumbnail': r're:^https?://.*\.jpg', + 'timestamp': 1647338400, + 'duration': 5001, + 'upload_date': '20220315', + }, + }] + + def _real_extract(self, url): + episode_id = self._match_id(url) + webpage = self._download_webpage(url, episode_id) + data = self._search_nextjs_data(webpage, episode_id) + return result_from_props(data['props']['pageProps']['episode'], episode_id) + + +class PodbayFMChannelIE(InfoExtractor): + _VALID_URL = r'https?://podbay\.fm/p/(?P<id>[^/]*)/?(?:[\?#].*)?$' + _TESTS = [{ + 'url': 'https://podbay.fm/p/behind-the-bastards', + 'info_dict': { + 'id': 'behind-the-bastards', + 'title': 'Behind the Bastards', + }, + }] + _PAGE_SIZE = 10 + + def _fetch_page(self, channel_id, pagenum): + return self._download_json( + f'https://podbay.fm/api/podcast?reverse=true&page={pagenum}&slug={channel_id}', + 
channel_id)['podcast'] + + @staticmethod + def _results_from_page(channel_id, page): + return [{ + **result_from_props(e), + 'extractor': PodbayFMIE.IE_NAME, + 'extractor_key': PodbayFMIE.ie_key(), + # somehow they use timestamps as the episode identifier + 'webpage_url': f'https://podbay.fm/p/{channel_id}/e/{e["timestamp"]}', + } for e in page['episodes']] + + def _real_extract(self, url): + channel_id = self._match_id(url) + + first_page = self._fetch_page(channel_id, 0) + entries = OnDemandPagedList( + lambda pagenum: self._results_from_page( + channel_id, self._fetch_page(channel_id, pagenum) if pagenum else first_page), + self._PAGE_SIZE) + + return self.playlist_result(entries, channel_id, first_page.get('title')) diff --git a/hypervideo_dl/extractor/podchaser.py b/hypervideo_dl/extractor/podchaser.py new file mode 100644 index 0000000..290c488 --- /dev/null +++ b/hypervideo_dl/extractor/podchaser.py @@ -0,0 +1,97 @@ +import functools +import json + +from .common import InfoExtractor +from ..utils import ( + OnDemandPagedList, + float_or_none, + str_or_none, + str_to_int, + traverse_obj, + unified_timestamp, +) + + +class PodchaserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?podchaser\.com/podcasts/[\w-]+-(?P<podcast_id>\d+)(?:/episodes/[\w-]+-(?P<id>\d+))?' + _PAGE_SIZE = 100 + _TESTS = [{ + 'url': 'https://www.podchaser.com/podcasts/cum-town-36924/episodes/ep-285-freeze-me-off-104365585', + 'info_dict': { + 'id': '104365585', + 'title': 'Ep. 285 – freeze me off', + 'description': 'cam ahn', + 'thumbnail': r're:^https?://.*\.jpg$', + 'ext': 'mp3', + 'categories': ['Comedy'], + 'tags': ['comedy', 'dark humor'], + 'series': 'Cum Town', + 'duration': 3708, + 'timestamp': 1636531259, + 'upload_date': '20211110', + 'rating': 4.0 + } + }, { + 'url': 'https://www.podchaser.com/podcasts/the-bone-zone-28853', + 'info_dict': { + 'id': '28853', + 'title': 'The Bone Zone', + 'description': 'Podcast by The Bone Zone', + }, + 'playlist_count': 275 + }, { + 'url': 'https://www.podchaser.com/podcasts/sean-carrolls-mindscape-scienc-699349/episodes', + 'info_dict': { + 'id': '699349', + 'title': 'Sean Carroll\'s Mindscape: Science, Society, Philosophy, Culture, Arts, and Ideas', + 'description': 'md5:2cbd8f4749891a84dc8235342e0b5ff1' + }, + 'playlist_mincount': 225 + }] + + @staticmethod + def _parse_episode(episode, podcast): + return { + 'id': str(episode.get('id')), + 'title': episode.get('title'), + 'description': episode.get('description'), + 'url': episode.get('audio_url'), + 'thumbnail': episode.get('image_url'), + 'duration': str_to_int(episode.get('length')), + 'timestamp': unified_timestamp(episode.get('air_date')), + 'rating': float_or_none(episode.get('rating')), + 'categories': list(set(traverse_obj(podcast, (('summary', None), 'categories', ..., 'text')))), + 'tags': traverse_obj(podcast, ('tags', ..., 'text')), + 'series': podcast.get('title'), + } + + def _call_api(self, path, *args, **kwargs): + return self._download_json(f'https://api.podchaser.com/{path}', *args, **kwargs) + + def _fetch_page(self, podcast_id, podcast, page): + json_response = self._call_api( + 'list/episode', podcast_id, + headers={'Content-Type': 'application/json;charset=utf-8'}, + data=json.dumps({ + 'start': page * self._PAGE_SIZE, + 'count': self._PAGE_SIZE, + 'sort_order': 'SORT_ORDER_RECENT', + 'filters': { + 'podcast_id': podcast_id + }, + 'options': {} + }).encode()) + + for episode in json_response['entities']: + yield self._parse_episode(episode, podcast) + + def _real_extract(self, url): + 
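[Editor's note] Both new podcast extractors above delegate paging to OnDemandPagedList, which fetches a page only when one of its entries is actually requested; PodbayFMChannelIE additionally prefetches page 0 so the channel title is known up front. A small usage sketch — the two-page source and fetch_page are made up for illustration, OnDemandPagedList is the real hypervideo_dl.utils class:

import functools

from hypervideo_dl.utils import OnDemandPagedList

pages = {0: ['episode-a', 'episode-b'], 1: ['episode-c']}  # fake paginated API


def fetch_page(source, pagenum):
    print(f'fetching page {pagenum}')  # demonstrates on-demand fetching
    return source.get(pagenum, [])


entries = OnDemandPagedList(functools.partial(fetch_page, pages), 2)  # page size 2
print(entries[0])     # only page 0 is fetched here
print(list(entries))  # page 1 is fetched once iteration reaches it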
podcast_id, episode_id = self._match_valid_url(url).group('podcast_id', 'id') + podcast = self._call_api(f'podcasts/{podcast_id}', episode_id or podcast_id) + if not episode_id: + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, podcast_id, podcast), self._PAGE_SIZE), + str_or_none(podcast.get('id')), podcast.get('title'), podcast.get('description')) + + episode = self._call_api(f'episodes/{episode_id}', episode_id) + return self._parse_episode(episode, podcast) diff --git a/hypervideo_dl/extractor/podomatic.py b/hypervideo_dl/extractor/podomatic.py index 673a3ab..985bfae 100644 --- a/hypervideo_dl/extractor/podomatic.py +++ b/hypervideo_dl/extractor/podomatic.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/pokemon.py b/hypervideo_dl/extractor/pokemon.py index b411390..0911893 100644 --- a/hypervideo_dl/extractor/pokemon.py +++ b/hypervideo_dl/extractor/pokemon.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -139,42 +134,3 @@ class PokemonWatchIE(InfoExtractor): 'episode': video_data.get('title'), 'episode_number': int_or_none(video_data.get('episode')), }) - - -class PokemonSoundLibraryIE(InfoExtractor): - _VALID_URL = r'https?://soundlibrary\.pokemon\.co\.jp' - - _TESTS = [{ - 'url': 'https://soundlibrary.pokemon.co.jp/', - 'info_dict': { - 'title': 'Pokémon Diamond and Pearl Sound Tracks', - }, - 'playlist_mincount': 149, - }] - - def _real_extract(self, url): - musicbox_webpage = self._download_webpage( - 'https://soundlibrary.pokemon.co.jp/musicbox', None, - 'Downloading list of songs') - song_titles = [x.group(1) for x in re.finditer(r'<span>([^>]+?)</span><br/>をてもち曲に加えます。', musicbox_webpage)] - song_titles = song_titles[4::2] - - # each songs don't have permalink; instead we return all songs at once - song_entries = [{ - 'id': f'pokemon-soundlibrary-{song_id}', - 'url': f'https://soundlibrary.pokemon.co.jp/api/assets/signing/sounds/wav/{song_id}.wav', - # note: the server always serves MP3 files, despite its extension of the URL above - 'ext': 'mp3', - 'acodec': 'mp3', - 'vcodec': 'none', - 'title': song_title, - 'track': song_title, - 'artist': 'Nintendo / Creatures Inc. 
/ GAME FREAK inc.', - 'uploader': 'Pokémon', - 'release_year': 2006, - 'release_date': '20060928', - 'track_number': song_id, - 'album': 'Pokémon Diamond and Pearl', - } for song_id, song_title in enumerate(song_titles, 1)] - - return self.playlist_result(song_entries, playlist_title='Pokémon Diamond and Pearl Sound Tracks') diff --git a/hypervideo_dl/extractor/pokergo.py b/hypervideo_dl/extractor/pokergo.py index c9e2fed..5c7baad 100644 --- a/hypervideo_dl/extractor/pokergo.py +++ b/hypervideo_dl/extractor/pokergo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/polsatgo.py b/hypervideo_dl/extractor/polsatgo.py index 1e3f46c..1524a1f 100644 --- a/hypervideo_dl/extractor/polsatgo.py +++ b/hypervideo_dl/extractor/polsatgo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from uuid import uuid4 import json @@ -45,7 +42,6 @@ class PolsatGoIE(InfoExtractor): formats = list(self._extract_formats( try_get(media, lambda x: x['playback']['mediaSources']), video_id)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/polskieradio.py b/hypervideo_dl/extractor/polskieradio.py index b2b3eb2..99244f6 100644 --- a/hypervideo_dl/extractor/polskieradio.py +++ b/hypervideo_dl/extractor/polskieradio.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import json import math @@ -298,8 +295,6 @@ class PolskieRadioPlayerIE(InfoExtractor): 'url': stream_url, }) - self._sort_formats(formats) - return { 'id': compat_str(channel['id']), 'formats': formats, diff --git a/hypervideo_dl/extractor/popcorntimes.py b/hypervideo_dl/extractor/popcorntimes.py index 5f9d0e7..ddc5ec8 100644 --- a/hypervideo_dl/extractor/popcorntimes.py +++ b/hypervideo_dl/extractor/popcorntimes.py @@ -1,12 +1,5 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_chr, -) +from ..compat import compat_b64decode from ..utils import int_or_none @@ -54,7 +47,7 @@ class PopcorntimesIE(InfoExtractor): c_ord += 13 if upper < c_ord: c_ord -= 26 - loc_b64 += compat_chr(c_ord) + loc_b64 += chr(c_ord) video_url = compat_b64decode(loc_b64).decode('utf-8') diff --git a/hypervideo_dl/extractor/popcorntv.py b/hypervideo_dl/extractor/popcorntv.py index 66d2e50..7798462 100644 --- a/hypervideo_dl/extractor/popcorntv.py +++ b/hypervideo_dl/extractor/popcorntv.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( extract_attributes, diff --git a/hypervideo_dl/extractor/porn91.py b/hypervideo_dl/extractor/porn91.py index 20eac64..af4a0dc 100644 --- a/hypervideo_dl/extractor/porn91.py +++ b/hypervideo_dl/extractor/porn91.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_duration, diff --git a/hypervideo_dl/extractor/porncom.py b/hypervideo_dl/extractor/porncom.py index 83df221..c8ef240 100644 --- a/hypervideo_dl/extractor/porncom.py +++ b/hypervideo_dl/extractor/porncom.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -75,8 +73,6 @@ class PornComIE(InfoExtractor): thumbnail = None duration = None - self._sort_formats(formats) - view_count = str_to_int(self._search_regex( (r'Views:\s*</span>\s*<span>\s*([\d,.]+)', 
r'class=["\']views["\'][^>]*><p>([\d,.]+)'), webpage, diff --git a/hypervideo_dl/extractor/pornez.py b/hypervideo_dl/extractor/pornez.py index 713dc00..df0e44a 100644 --- a/hypervideo_dl/extractor/pornez.py +++ b/hypervideo_dl/extractor/pornez.py @@ -1,5 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals from .common import InfoExtractor from ..utils import int_or_none diff --git a/hypervideo_dl/extractor/pornflip.py b/hypervideo_dl/extractor/pornflip.py index accf452..51a9cf3 100644 --- a/hypervideo_dl/extractor/pornflip.py +++ b/hypervideo_dl/extractor/pornflip.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -63,7 +60,6 @@ class PornFlipIE(InfoExtractor): r'class="btn btn-down-rating[^>]*>[^<]*<i[^>]*>[^<]*</i>[^>]*<span[^>]*>[^0-9]*([0-9]+)[^<0-9]*<', webpage, 'dislike_count', fatal=False) mpd_url = self._search_regex(r'"([^"]+userscontent.net/dash/[0-9]+/manifest.mpd[^"]*)"', webpage, 'mpd_url').replace('&', '&') formats = self._extract_mpd_formats(mpd_url, video_id, mpd_id='dash') - self._sort_formats(formats) return { 'age_limit': 18, diff --git a/hypervideo_dl/extractor/pornhd.py b/hypervideo_dl/extractor/pornhd.py index 9dbd72f..c8a1ec8 100644 --- a/hypervideo_dl/extractor/pornhd.py +++ b/hypervideo_dl/extractor/pornhd.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -87,7 +84,6 @@ class PornHdIE(InfoExtractor): }) if formats: info['formats'] = formats - self._sort_formats(info['formats']) description = self._html_search_regex( (r'(?s)<section[^>]+class=["\']video-description[^>]+>(?P<value>.+?)</section>', diff --git a/hypervideo_dl/extractor/pornhub.py b/hypervideo_dl/extractor/pornhub.py index 17c8c91..5d8d7c1 100644 --- a/hypervideo_dl/extractor/pornhub.py +++ b/hypervideo_dl/extractor/pornhub.py @@ -1,33 +1,28 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import itertools import math import operator import re +import urllib.request from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, - compat_urllib_request, -) from .openload import PhantomJSwrapper +from ..compat import compat_HTTPError, compat_str from ..utils import ( + NO_DEFAULT, + ExtractorError, clean_html, determine_ext, - ExtractorError, format_field, int_or_none, merge_dicts, - NO_DEFAULT, orderedSet, remove_quotes, + remove_start, str_to_int, update_url_query, - urlencode_postdata, url_or_none, + urlencode_postdata, ) @@ -52,7 +47,7 @@ class PornHubBaseIE(InfoExtractor): r'document\.location\.reload\(true\)')): url_or_request = args[0] url = (url_or_request.get_full_url() - if isinstance(url_or_request, compat_urllib_request.Request) + if isinstance(url_or_request, urllib.request.Request) else url_or_request) phantom = PhantomJSwrapper(self, required_version='2.0') phantom.get(url, html=webpage) @@ -133,6 +128,7 @@ class PornHubIE(PornHubBaseIE): ) (?P<id>[\da-z]+) ''' % PornHubBaseIE._PORNHUB_HOST_RE + _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)'] _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': 'a6391306d050e4547f62b3f485dd9ba9', @@ -202,6 +198,16 @@ class PornHubIE(PornHubBaseIE): }, 'skip': 'This video has been disabled', }, { + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph601dc30bae19a', + 'info_dict': { + 
'id': 'ph601dc30bae19a', + 'uploader': 'Projekt Melody', + 'uploader_id': 'projekt-melody', + 'upload_date': '20210205', + 'title': '"Welcome to My Pussy Mansion" - CB Stream (02/03/21)', + 'thumbnail': r're:https?://.+', + }, + }, { 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, }, { @@ -252,12 +258,6 @@ class PornHubIE(PornHubBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?pornhub(?:premium)?\.(?:com|net|org)/embed/[\da-z]+)', - webpage) - def _extract_count(self, pattern, webpage, name): return str_to_int(self._search_regex(pattern, webpage, '%s count' % name, default=None)) @@ -432,7 +432,7 @@ class PornHubIE(PornHubBaseIE): default=None)) formats.append({ 'url': format_url, - 'format_id': format_field(height, template='%dp'), + 'format_id': format_field(height, None, '%dp'), 'height': height, }) @@ -456,13 +456,11 @@ class PornHubIE(PornHubBaseIE): continue add_format(video_url) - # field_preference is unnecessary here, but kept for code-similarity with youtube-dl - self._sort_formats( - formats, field_preference=('height', 'width', 'fps', 'format_id')) - + model_profile = self._search_json( + r'var\s+MODEL_PROFILE\s*=', webpage, 'model profile', video_id, fatal=False) video_uploader = self._html_search_regex( r'(?s)From: .+?<(?:a\b[^>]+\bhref=["\']/(?:(?:user|channel)s|model|pornstar)/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<', - webpage, 'uploader', default=None) + webpage, 'uploader', default=None) or model_profile.get('username') def extract_vote_count(kind, name): return self._extract_count( @@ -491,6 +489,7 @@ class PornHubIE(PornHubBaseIE): return merge_dicts({ 'id': video_id, 'uploader': video_uploader, + 'uploader_id': remove_start(model_profile.get('modelProfileLink'), '/model/'), 'upload_date': upload_date, 'title': title, 'thumbnail': thumbnail, diff --git a/hypervideo_dl/extractor/pornotube.py b/hypervideo_dl/extractor/pornotube.py index 1b5b9a3..e0960f4 100644 --- a/hypervideo_dl/extractor/pornotube.py +++ b/hypervideo_dl/extractor/pornotube.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/pornovoisines.py b/hypervideo_dl/extractor/pornovoisines.py index 18459fc..aa48da0 100644 --- a/hypervideo_dl/extractor/pornovoisines.py +++ b/hypervideo_dl/extractor/pornovoisines.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -59,7 +55,6 @@ class PornoVoisinesIE(InfoExtractor): 'height': item.get('height'), 'bitrate': item.get('bitrate'), }) - self._sort_formats(formats) webpage = self._download_webpage(url, video_id) diff --git a/hypervideo_dl/extractor/pornoxo.py b/hypervideo_dl/extractor/pornoxo.py index 489dc2b..5104d8a 100644 --- a/hypervideo_dl/extractor/pornoxo.py +++ b/hypervideo_dl/extractor/pornoxo.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( str_to_int, diff --git a/hypervideo_dl/extractor/prankcast.py b/hypervideo_dl/extractor/prankcast.py new file mode 100644 index 0000000..0eb5f98 --- /dev/null +++ b/hypervideo_dl/extractor/prankcast.py @@ -0,0 +1,66 @@ +from .common import InfoExtractor +from ..utils import parse_iso8601, traverse_obj, try_call + + +class PrankCastIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?prankcast\.com/[^/?#]+/showreel/(?P<id>\d+)-(?P<display_id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://prankcast.com/Devonanustart/showreel/1561-Beverly-is-back-like-a-heart-attack-', + 'info_dict': { + 'id': '1561', + 'ext': 'mp3', + 'title': 'Beverly is back like a heart attack!', + 'display_id': 'Beverly-is-back-like-a-heart-attack-', + 'timestamp': 1661391575, + 'uploader': 'Devonanustart', + 'channel_id': 4, + 'duration': 7918, + 'cast': ['Devonanustart', 'Phonelosers'], + 'description': '', + 'categories': ['prank'], + 'tags': ['prank call', 'prank'], + 'upload_date': '20220825' + } + }, { + 'url': 'https://prankcast.com/phonelosers/showreel/2048-NOT-COOL', + 'info_dict': { + 'id': '2048', + 'ext': 'mp3', + 'title': 'NOT COOL', + 'display_id': 'NOT-COOL', + 'timestamp': 1665028364, + 'uploader': 'phonelosers', + 'channel_id': 6, + 'duration': 4044, + 'cast': ['phonelosers'], + 'description': '', + 'categories': ['prank'], + 'tags': ['prank call', 'prank'], + 'upload_date': '20221006' + } + }] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') + + webpage = self._download_webpage(url, video_id) + json_info = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['ssr_data_showreel'] + + uploader = json_info.get('user_name') + guests_json = self._parse_json(json_info.get('guests_json') or '{}', video_id) + start_date = parse_iso8601(json_info.get('start_date')) + + return { + 'id': video_id, + 'title': json_info.get('broadcast_title') or self._og_search_title(webpage), + 'display_id': display_id, + 'url': f'{json_info["broadcast_url"]}{json_info["recording_hash"]}.mp3', + 'timestamp': start_date, + 'uploader': uploader, + 'channel_id': json_info.get('user_id'), + 'duration': try_call(lambda: parse_iso8601(json_info['end_date']) - start_date), + 'cast': list(filter(None, [uploader] + traverse_obj(guests_json, (..., 'name')))), + 'description': json_info.get('broadcast_description'), + 'categories': [json_info.get('broadcast_category')], + 'tags': self._parse_json(json_info.get('broadcast_tags') or '{}', video_id) + } diff --git a/hypervideo_dl/extractor/premiershiprugby.py b/hypervideo_dl/extractor/premiershiprugby.py new file mode 100644 index 0000000..67d41fd --- /dev/null +++ b/hypervideo_dl/extractor/premiershiprugby.py @@ -0,0 +1,39 @@ +from .common import InfoExtractor +from ..utils import int_or_none, traverse_obj + + +class PremiershipRugbyIE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.)premiershiprugby\.(?:com)/watch/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.premiershiprugby.com/watch/full-match-harlequins-v-newcastle-falcons', + 'info_dict': { + 'id': '0_mbkb7ldt', + 'title': 'Full Match: Harlequins v Newcastle Falcons', + 'ext': 'mp4', + 'thumbnail': 'https://open.http.mp.streamamg.com/p/3000914/sp/300091400/thumbnail/entry_id/0_mbkb7ldt//width/960/height/540/type/1/quality/75', + 'duration': 6093.0, + 'tags': ['video'], + 'categories': ['Full Match', 'Harlequins', 'Newcastle Falcons', 'gallaher premiership'], + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + json_data = self._download_json( + f'https://article-cms-api.incrowdsports.com/v2/articles/slug/{display_id}', + display_id, query={'clientId': 'PRL'})['data']['article'] + + formats, subs = self._extract_m3u8_formats_and_subtitles( + json_data['heroMedia']['content']['videoLink'], display_id) + + return { + 'id': json_data['heroMedia']['content']['sourceSystemId'], + 'display_id': 
display_id, + 'title': traverse_obj(json_data, ('heroMedia', 'title')), + 'formats': formats, + 'subtitles': subs, + 'thumbnail': traverse_obj(json_data, ('heroMedia', 'content', 'videoThumbnail')), + 'duration': int_or_none(traverse_obj(json_data, ('heroMedia', 'content', 'metadata', 'msDuration')), scale=1000), + 'tags': json_data.get('tags'), + 'categories': traverse_obj(json_data, ('categories', ..., 'text')), + } diff --git a/hypervideo_dl/extractor/presstv.py b/hypervideo_dl/extractor/presstv.py index bfb2eb7..26ce74a 100644 --- a/hypervideo_dl/extractor/presstv.py +++ b/hypervideo_dl/extractor/presstv.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import remove_start diff --git a/hypervideo_dl/extractor/projectveritas.py b/hypervideo_dl/extractor/projectveritas.py index 9e9867b..0e029ce 100644 --- a/hypervideo_dl/extractor/projectveritas.py +++ b/hypervideo_dl/extractor/projectveritas.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -45,7 +42,6 @@ class ProjectVeritasIE(InfoExtractor): raise ExtractorError('No video on the provided url.', expected=True) playback_id = traverse_obj(mux_asset, 'playbackId', ('en-US', 'playbackId')) formats = self._extract_m3u8_formats(f'https://stream.mux.com/{playback_id}.m3u8', video_id) - self._sort_formats(formats) return { 'id': video_id, 'title': main_data['title'], diff --git a/hypervideo_dl/extractor/prosiebensat1.py b/hypervideo_dl/extractor/prosiebensat1.py index e89bbfd..46e2e8a 100644 --- a/hypervideo_dl/extractor/prosiebensat1.py +++ b/hypervideo_dl/extractor/prosiebensat1.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from hashlib import sha1 @@ -159,7 +156,6 @@ class ProSiebenSat1BaseIE(InfoExtractor): 'tbr': tbr, 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), }) - self._sort_formats(formats) return { 'duration': float_or_none(video.get('duration')), diff --git a/hypervideo_dl/extractor/prx.py b/hypervideo_dl/extractor/prx.py index 80561b8..5bb1832 100644 --- a/hypervideo_dl/extractor/prx.py +++ b/hypervideo_dl/extractor/prx.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/puhutv.py b/hypervideo_dl/extractor/puhutv.py index ca71665..482e570 100644 --- a/hypervideo_dl/extractor/puhutv.py +++ b/hypervideo_dl/extractor/puhutv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_HTTPError, @@ -114,7 +111,6 @@ class PuhuTVIE(InfoExtractor): format_id += '-%sp' % quality f['format_id'] = format_id formats.append(f) - self._sort_formats(formats) creator = try_get( show, lambda x: x['producer']['name'], compat_str) diff --git a/hypervideo_dl/extractor/puls4.py b/hypervideo_dl/extractor/puls4.py index 80091b8..38c5d11 100644 --- a/hypervideo_dl/extractor/puls4.py +++ b/hypervideo_dl/extractor/puls4.py @@ -1,12 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .prosiebensat1 import ProSiebenSat1BaseIE -from ..utils import ( - unified_strdate, - parse_duration, - compat_str, -) +from ..compat import compat_str +from ..utils import parse_duration, unified_strdate class Puls4IE(ProSiebenSat1BaseIE): diff --git a/hypervideo_dl/extractor/pyvideo.py 
b/hypervideo_dl/extractor/pyvideo.py index 8696197..7b25166 100644 --- a/hypervideo_dl/extractor/pyvideo.py +++ b/hypervideo_dl/extractor/pyvideo.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/qingting.py b/hypervideo_dl/extractor/qingting.py new file mode 100644 index 0000000..aa690d4 --- /dev/null +++ b/hypervideo_dl/extractor/qingting.py @@ -0,0 +1,47 @@ +from .common import InfoExtractor + +from ..utils import traverse_obj + + +class QingTingIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.|m\.)?(?:qingting\.fm|qtfm\.cn)/v?channels/(?P<channel>\d+)/programs/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.qingting.fm/channels/378005/programs/22257411/', + 'md5': '47e6a94f4e621ed832c316fd1888fb3c', + 'info_dict': { + 'id': '22257411', + 'title': '用了十年才修改,谁在乎教科书?', + 'channel_id': '378005', + 'channel': '睡前消息', + 'uploader': '马督工', + 'ext': 'm4a', + } + }, { + 'url': 'https://m.qtfm.cn/vchannels/378005/programs/23023573/', + 'md5': '2703120b6abe63b5fa90b975a58f4c0e', + 'info_dict': { + 'id': '23023573', + 'title': '【睡前消息488】重庆山火之后,有图≠真相', + 'channel_id': '378005', + 'channel': '睡前消息', + 'uploader': '马督工', + 'ext': 'm4a', + } + }] + + def _real_extract(self, url): + channel_id, pid = self._match_valid_url(url).group('channel', 'id') + webpage = self._download_webpage( + f'https://m.qtfm.cn/vchannels/{channel_id}/programs/{pid}/', pid) + info = self._search_json(r'window\.__initStores\s*=', webpage, 'program info', pid) + return { + 'id': pid, + 'title': traverse_obj(info, ('ProgramStore', 'programInfo', 'title')), + 'channel_id': channel_id, + 'channel': traverse_obj(info, ('ProgramStore', 'channelInfo', 'title')), + 'uploader': traverse_obj(info, ('ProgramStore', 'podcasterInfo', 'podcaster', 'nickname')), + 'url': traverse_obj(info, ('ProgramStore', 'programInfo', 'audioUrl')), + 'vcodec': 'none', + 'acodec': 'm4a', + 'ext': 'm4a', + } diff --git a/hypervideo_dl/extractor/qqmusic.py b/hypervideo_dl/extractor/qqmusic.py index 0106d16..9285825 100644 --- a/hypervideo_dl/extractor/qqmusic.py +++ b/hypervideo_dl/extractor/qqmusic.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random import re import time @@ -125,7 +122,6 @@ class QQMusicIE(InfoExtractor): 'abr': details.get('abr'), }) self._check_formats(formats, mid) - self._sort_formats(formats) actual_lrc_lyrics = ''.join( line + '\n' for line in re.findall( diff --git a/hypervideo_dl/extractor/r7.py b/hypervideo_dl/extractor/r7.py index e2202d6..f067a05 100644 --- a/hypervideo_dl/extractor/r7.py +++ b/hypervideo_dl/extractor/r7.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import int_or_none @@ -69,7 +66,6 @@ class R7IE(InfoExtractor): f_copy['protocol'] = 'http' f = f_copy formats.append(f) - self._sort_formats(formats) description = video.get('description') thumbnail = video.get('thumb') diff --git a/hypervideo_dl/extractor/radiko.py b/hypervideo_dl/extractor/radiko.py index 1e60de1..f102922 100644 --- a/hypervideo_dl/extractor/radiko.py +++ b/hypervideo_dl/extractor/radiko.py @@ -1,29 +1,22 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re import base64 -import calendar -import datetime +import re +import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, - update_url_query, clean_html, + time_seconds, + try_call, unified_timestamp, + update_url_query, ) -from 
..compat import compat_urllib_parse class RadikoBaseIE(InfoExtractor): _FULL_KEY = None def _auth_client(self): - auth_cache = self._downloader.cache.load('radiko', 'auth_data') - if auth_cache: - return auth_cache - _, auth1_handle = self._download_webpage_handle( 'https://radiko.jp/v2/api/auth1', None, 'Downloading authentication page', headers={ @@ -50,7 +43,7 @@ class RadikoBaseIE(InfoExtractor): }).split(',')[0] auth_data = (auth_token, area_id) - self._downloader.cache.store('radiko', 'auth_data', auth_data) + self.cache.store('radiko', 'auth_data', auth_data) return auth_data def _extract_full_key(self): @@ -92,8 +85,8 @@ class RadikoBaseIE(InfoExtractor): def _extract_formats(self, video_id, station, is_onair, ft, cursor, auth_token, area_id, query): m3u8_playlist_data = self._download_xml( - 'https://radiko.jp/v3/station/stream/pc_html5/%s.xml' % station, video_id, - note='Downloading m3u8 information') + f'https://radiko.jp/v3/station/stream/pc_html5/{station}.xml', video_id, + note='Downloading stream information') m3u8_urls = m3u8_playlist_data.findall('.//url') formats = [] @@ -105,7 +98,7 @@ class RadikoBaseIE(InfoExtractor): 'station_id': station, **query, 'l': '15', - 'lsid': '77d0678df93a1034659c14d6fc89f018', + 'lsid': '88ecea37e968c1f17d5413312d9f8003', 'type': 'b', }) if playlist_url in found: @@ -115,23 +108,23 @@ class RadikoBaseIE(InfoExtractor): time_to_skip = None if is_onair else cursor - ft + domain = urllib.parse.urlparse(playlist_url).netloc subformats = self._extract_m3u8_formats( playlist_url, video_id, ext='m4a', - live=True, fatal=False, m3u8_id=None, + live=True, fatal=False, m3u8_id=domain, + note=f'Downloading m3u8 information from {domain}', headers={ 'X-Radiko-AreaId': area_id, 'X-Radiko-AuthToken': auth_token, }) for sf in subformats: - domain = sf['format_id'] = compat_urllib_parse.urlparse(sf['url']).netloc - if re.match(r'^[cf]-radiko\.smartstream\.ne\.jp$', domain): + if re.fullmatch(r'[cf]-radiko\.smartstream\.ne\.jp', domain): # Prioritize live radio vs playback based on extractor sf['preference'] = 100 if is_onair else -100 if not is_onair and url_attrib['timefree'] == '1' and time_to_skip: - sf['_ffmpeg_args'] = ['-ss', time_to_skip] + sf['downloader_options'] = {'ffmpeg_args': ['-ss', time_to_skip]} formats.extend(subformats) - self._sort_formats(formats) return formats @@ -154,31 +147,29 @@ class RadikoIE(RadikoBaseIE): def _real_extract(self, url): station, video_id = self._match_valid_url(url).groups() vid_int = unified_timestamp(video_id, False) - - auth_token, area_id = self._auth_client() - prog, station_program, ft, radio_begin, radio_end = self._find_program(video_id, station, vid_int) - title = prog.find('title').text - description = clean_html(prog.find('info').text) - station_name = station_program.find('.//name').text - - formats = self._extract_formats( - video_id=video_id, station=station, is_onair=False, - ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, - query={ - 'start_at': radio_begin, - 'ft': radio_begin, - 'end_at': radio_end, - 'to': radio_end, - 'seek': video_id, - }) + auth_cache = self.cache.load('radiko', 'auth_data') + for attempt in range(2): + auth_token, area_id = (not attempt and auth_cache) or self._auth_client() + formats = self._extract_formats( + video_id=video_id, station=station, is_onair=False, + ft=ft, cursor=vid_int, auth_token=auth_token, area_id=area_id, + query={ + 'start_at': radio_begin, + 'ft': radio_begin, + 'end_at': radio_end, + 'to': radio_end, + 'seek': video_id, + }) + if 
formats: + break return { 'id': video_id, - 'title': title, - 'description': description, - 'uploader': station_name, + 'title': try_call(lambda: prog.find('title').text), + 'description': clean_html(try_call(lambda: prog.find('info').text)), + 'uploader': try_call(lambda: station_program.find('.//name').text), 'uploader_id': station, 'timestamp': vid_int, 'formats': formats, @@ -208,8 +199,7 @@ class RadikoRadioIE(RadikoBaseIE): auth_token, area_id = self._auth_client() # get current time in JST (GMT+9:00 w/o DST) - vid_now = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=9))) - vid_now = calendar.timegm(vid_now.timetuple()) + vid_now = time_seconds(hours=9) prog, station_program, ft, _, _ = self._find_program(station, station, vid_now) diff --git a/hypervideo_dl/extractor/radiobremen.py b/hypervideo_dl/extractor/radiobremen.py index 2c35f98..99ba050 100644 --- a/hypervideo_dl/extractor/radiobremen.py +++ b/hypervideo_dl/extractor/radiobremen.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/radiocanada.py b/hypervideo_dl/extractor/radiocanada.py index 4b4445c..72c21d5 100644 --- a/hypervideo_dl/extractor/radiocanada.py +++ b/hypervideo_dl/extractor/radiocanada.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -117,7 +113,6 @@ class RadioCanadaIE(InfoExtractor): raise ExtractorError( '%s said: %s' % (self.IE_NAME, error), expected=True) formats = self._extract_m3u8_formats(v_url, video_id, 'mp4') - self._sort_formats(formats) subtitles = {} closed_caption_url = get_meta('closedCaption') or get_meta('closedCaptionHTML5') diff --git a/hypervideo_dl/extractor/radiode.py b/hypervideo_dl/extractor/radiode.py index 0382873..32c36d5 100644 --- a/hypervideo_dl/extractor/radiode.py +++ b/hypervideo_dl/extractor/radiode.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor @@ -40,7 +38,6 @@ class RadioDeIE(InfoExtractor): 'abr': stream['bitRate'], 'asr': stream['sampleRate'] } for stream in broadcast['streamUrls']] - self._sort_formats(formats) return { 'id': radio_id, diff --git a/hypervideo_dl/extractor/radiofrance.py b/hypervideo_dl/extractor/radiofrance.py index 082238b..11765d0 100644 --- a/hypervideo_dl/extractor/radiofrance.py +++ b/hypervideo_dl/extractor/radiofrance.py @@ -1,9 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor +from ..utils import parse_duration, unified_strdate class RadioFranceIE(InfoExtractor): @@ -48,7 +46,6 @@ class RadioFranceIE(InfoExtractor): for i, fm in enumerate(re.findall(r"([a-z0-9]+)\s*:\s*'([^']+)'", formats_str)) ] - self._sort_formats(formats) return { 'id': video_id, @@ -57,3 +54,51 @@ class RadioFranceIE(InfoExtractor): 'description': description, 'uploader': uploader, } + + +class FranceCultureIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?radiofrance\.fr/(?:franceculture|fip|francemusique|mouv|franceinter)/podcasts/(?:[^?#]+/)?(?P<display_id>[^?#]+)-(?P<id>\d+)($|[?#])' + _TESTS = [ + { + 'url': 'https://www.radiofrance.fr/franceculture/podcasts/science-en-questions/la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau-8440487', + 'info_dict': { + 'id': '8440487', + 'display_id': 'la-physique-d-einstein-aiderait-elle-a-comprendre-le-cerveau', + 'ext': 'mp3', + 'title': 'La physique 
d’Einstein aiderait-elle à comprendre le cerveau ?', + 'description': 'Existerait-il un pont conceptuel entre la physique de l’espace-temps et les neurosciences ?', + 'thumbnail': 'https://cdn.radiofrance.fr/s3/cruiser-production/2022/05/d184e7a3-4827-4494-bf94-04ed7b120db4/1200x630_gettyimages-200171095-001.jpg', + 'upload_date': '20220514', + 'duration': 2750, + }, + }, + { + 'url': 'https://www.radiofrance.fr/franceinter/podcasts/la-rafle-du-vel-d-hiv-une-affaire-d-etat/les-racines-du-crime-episode-1-3715507', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + video_id, display_id = self._match_valid_url(url).group('id', 'display_id') + webpage = self._download_webpage(url, display_id) + + # _search_json_ld doesn't correctly handle this. See https://github.com/hypervideo/hypervideo/pull/3874#discussion_r891903846 + video_data = self._search_json('', webpage, 'audio data', display_id, contains_pattern=r'{\s*"@type"\s*:\s*"AudioObject".+}') + + return { + 'id': video_id, + 'display_id': display_id, + 'url': video_data['contentUrl'], + 'ext': video_data.get('encodingFormat'), + 'vcodec': 'none' if video_data.get('encodingFormat') == 'mp3' else None, + 'duration': parse_duration(video_data.get('duration')), + 'title': self._html_search_regex(r'(?s)<h1[^>]*itemprop="[^"]*name[^"]*"[^>]*>(.+?)</h1>', + webpage, 'title', default=self._og_search_title(webpage)), + 'description': self._html_search_regex( + r'(?s)<meta name="description"\s*content="([^"]+)', webpage, 'description', default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader': self._html_search_regex( + r'(?s)<span class="author">(.*?)</span>', webpage, 'uploader', default=None), + 'upload_date': unified_strdate(self._search_regex( + r'"datePublished"\s*:\s*"([^"]+)', webpage, 'timestamp', fatal=False)) + } diff --git a/hypervideo_dl/extractor/radiojavan.py b/hypervideo_dl/extractor/radiojavan.py index 3f74f0c..6a91394 100644 --- a/hypervideo_dl/extractor/radiojavan.py +++ b/hypervideo_dl/extractor/radiojavan.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -52,7 +50,6 @@ class RadioJavanIE(InfoExtractor): 'format_id': format_id, }) formats.append(f) - self._sort_formats(formats) title = self._og_search_title(webpage) thumbnail = self._og_search_thumbnail(webpage) diff --git a/hypervideo_dl/extractor/radiokapital.py b/hypervideo_dl/extractor/radiokapital.py index 2e93e03..8f9737a 100644 --- a/hypervideo_dl/extractor/radiokapital.py +++ b/hypervideo_dl/extractor/radiokapital.py @@ -1,5 +1,3 @@ -# coding: utf-8 - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/radiozet.py b/hypervideo_dl/extractor/radiozet.py index 2e1ff36..6752017 100644 --- a/hypervideo_dl/extractor/radiozet.py +++ b/hypervideo_dl/extractor/radiozet.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor from ..utils import ( traverse_obj, diff --git a/hypervideo_dl/extractor/radlive.py b/hypervideo_dl/extractor/radlive.py index dc98973..9bcbb11 100644 --- a/hypervideo_dl/extractor/radlive.py +++ b/hypervideo_dl/extractor/radlive.py @@ -62,7 +62,6 @@ class RadLiveIE(InfoExtractor): raise ExtractorError('Unable to extract video info, make sure the URL is valid') formats = self._extract_m3u8_formats(video_info['assets']['videos'][0]['url'], video_id) - self._sort_formats(formats) data = video_info.get('structured_data', {}) @@ -80,7 +79,7 @@ class RadLiveIE(InfoExtractor): 'release_timestamp': 
release_date, 'channel': channel.get('name'), 'channel_id': channel_id, - 'channel_url': format_field(channel_id, template='https://rad.live/content/channel/%s'), + 'channel_url': format_field(channel_id, None, 'https://rad.live/content/channel/%s'), } if content_type == 'episode': @@ -94,7 +93,7 @@ class RadLiveIE(InfoExtractor): return result -class RadLiveSeasonIE(RadLiveIE): +class RadLiveSeasonIE(RadLiveIE): # XXX: Do not subclass from concrete IE IE_NAME = 'radlive:season' _VALID_URL = r'https?://(?:www\.)?rad\.live/content/season/(?P<id>[a-f0-9-]+)' _TESTS = [{ @@ -134,7 +133,7 @@ class RadLiveSeasonIE(RadLiveIE): return self.playlist_result(entries, season_id, video_info.get('title')) -class RadLiveChannelIE(RadLiveIE): +class RadLiveChannelIE(RadLiveIE): # XXX: Do not subclass from concrete IE IE_NAME = 'radlive:channel' _VALID_URL = r'https?://(?:www\.)?rad\.live/content/channel/(?P<id>[a-f0-9-]+)' _TESTS = [{ diff --git a/hypervideo_dl/extractor/rai.py b/hypervideo_dl/extractor/rai.py index 6864129..cab12cc 100644 --- a/hypervideo_dl/extractor/rai.py +++ b/hypervideo_dl/extractor/rai.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -9,6 +6,7 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( + clean_html, determine_ext, ExtractorError, filter_dict, @@ -48,11 +46,14 @@ class RaiBaseIE(InfoExtractor): for platform in ('mon', 'flash', 'native'): relinker = self._download_xml( relinker_url, video_id, - note='Downloading XML metadata for platform %s' % platform, + note=f'Downloading XML metadata for platform {platform}', transform_source=fix_xml_ampersands, query={'output': 45, 'pl': platform}, headers=self.geo_verification_headers()) + if xpath_text(relinker, './license_url', default='{}') != '{}': + self.report_drm(video_id) + if not geoprotection: geoprotection = xpath_text( relinker, './geoprotection', default=None) == 'Y' @@ -102,7 +103,7 @@ class RaiBaseIE(InfoExtractor): formats.append({ 'url': media_url, 'tbr': bitrate if bitrate > 0 else None, - 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + 'format_id': f'http-{bitrate if bitrate > 0 else "http"}', }) if not formats and geoprotection is True: @@ -155,7 +156,7 @@ class RaiBaseIE(InfoExtractor): br = int_or_none(tbr) if len(fmts) == 1 and not br: br = fmts[0].get('tbr') - if br > 300: + if br and br > 300: tbr = compat_str(math.floor(br / 100) * 100) else: tbr = '250' @@ -174,11 +175,11 @@ class RaiBaseIE(InfoExtractor): 'vcodec': format_copy.get('vcodec'), 'acodec': format_copy.get('acodec'), 'fps': format_copy.get('fps'), - 'format_id': 'https-%s' % tbr, + 'format_id': f'https-{tbr}', } if format_copy else { 'width': _QUALITY[tbr][0], 'height': _QUALITY[tbr][1], - 'format_id': 'https-%s' % tbr, + 'format_id': f'https-{tbr}', 'tbr': int(tbr), } @@ -201,8 +202,8 @@ class RaiBaseIE(InfoExtractor): 'url': _MP4_TMPL % (relinker_url, q), 'protocol': 'https', 'ext': 'mp4', + **get_format_info(q) } - fmt.update(get_format_info(q)) formats.append(fmt) return formats @@ -233,7 +234,7 @@ class RaiBaseIE(InfoExtractor): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE + _VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)' _TESTS = [{ 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ 
-251,6 +252,10 @@ class RaiPlayIE(RaiBaseIE): 'subtitles': { 'it': 'count:4', }, + 'release_year': 2022, + 'episode': 'Espresso nel caffè - 07/04/2014', + 'timestamp': 1396919880, + 'upload_date': '20140408', }, 'params': { 'skip_download': True, @@ -270,6 +275,12 @@ class RaiPlayIE(RaiBaseIE): 'duration': 6493, 'series': 'Blanca', 'season': 'Season 1', + 'episode_number': 1, + 'release_year': 2021, + 'season_number': 1, + 'episode': 'Senza occhi', + 'timestamp': 1637318940, + 'upload_date': '20211119', }, }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', @@ -280,7 +291,7 @@ class RaiPlayIE(RaiBaseIE): 'only_matching': True, }, { # DRM protected - 'url': 'https://www.raiplay.it/video/2020/09/Lo-straordinario-mondo-di-Zoey-S1E1-Lo-straordinario-potere-di-Zoey-ed493918-1d32-44b7-8454-862e473d00ff.html', + 'url': 'https://www.raiplay.it/video/2021/06/Lo-straordinario-mondo-di-Zoey-S2E1-Lo-straordinario-ritorno-di-Zoey-3ba992de-2332-41ad-9214-73e32ab209f4.html', 'only_matching': True, }] @@ -302,7 +313,6 @@ class RaiPlayIE(RaiBaseIE): video = media['video'] relinker_info = self._extract_relinker_info(video['content_url'], video_id) - self._sort_formats(relinker_info['formats']) thumbnails = [] for _, value in media.get('images', {}).items(): @@ -323,13 +333,13 @@ class RaiPlayIE(RaiBaseIE): alt_title = join_nonempty(media.get('subtitle'), media.get('toptitle'), delim=' - ') - info = { + return { 'id': remove_start(media.get('id'), 'ContentItem-') or video_id, 'display_id': video_id, 'title': title, - 'alt_title': strip_or_none(alt_title), + 'alt_title': strip_or_none(alt_title or None), 'description': media.get('description'), - 'uploader': strip_or_none(media.get('channel')), + 'uploader': strip_or_none(media.get('channel') or None), 'creator': strip_or_none(media.get('editor') or None), 'duration': parse_duration(video.get('duration')), 'timestamp': unified_timestamp(date_published), @@ -340,13 +350,12 @@ class RaiPlayIE(RaiBaseIE): 'episode': media.get('episode_title'), 'episode_number': int_or_none(media.get('episode')), 'subtitles': subtitles, + 'release_year': int_or_none(traverse_obj(media, ('track_info', 'edit_year'))), + **relinker_info } - info.update(relinker_info) - return info - -class RaiPlayLiveIE(RaiPlayIE): +class RaiPlayLiveIE(RaiPlayIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))' _TESTS = [{ 'url': 'http://www.raiplay.it/dirette/rainews24', @@ -359,6 +368,9 @@ class RaiPlayLiveIE(RaiPlayIE): 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, + 'live_status': 'is_live', + 'upload_date': '20090502', + 'timestamp': 1241276220, }, 'params': { 'skip_download': True, @@ -409,7 +421,7 @@ class RaiPlayPlaylistIE(InfoExtractor): if not s_id: continue medias = self._download_json( - '%s/%s.json' % (base, s_id), s_id, + f'{base}/{s_id}.json', s_id, 'Downloading content set JSON', fatal=False) if not medias: continue @@ -428,7 +440,7 @@ class RaiPlayPlaylistIE(InfoExtractor): class RaiPlaySoundIE(RaiBaseIE): - _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE + _VALID_URL = rf'(?P<base>https?://(?:www\.)?raiplaysound\.it/.+?-(?P<id>{RaiBaseIE._UUID_RE}))\.(?:html|json)' _TESTS = [{ 'url': 'https://www.raiplaysound.it/audio/2021/12/IL-RUGGITO-DEL-CONIGLIO-1ebae2a7-7cdb-42bb-842e-fe0d193e9707.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', @@ -436,11 +448,16 @@ class 
RaiPlaySoundIE(RaiBaseIE): 'id': '1ebae2a7-7cdb-42bb-842e-fe0d193e9707', 'ext': 'mp3', 'title': 'Il Ruggito del Coniglio del 10/12/2021', + 'alt_title': 'md5:0e6476cd57858bb0f3fcc835d305b455', 'description': 'md5:2a17d2107e59a4a8faa0e18334139ee2', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'rai radio 2', 'duration': 5685, 'series': 'Il Ruggito del Coniglio', + 'episode': 'Il Ruggito del Coniglio del 10/12/2021', + 'creator': 'rai radio 2', + 'timestamp': 1638346620, + 'upload_date': '20211201', }, 'params': { 'skip_download': True, @@ -472,7 +489,7 @@ class RaiPlaySoundIE(RaiBaseIE): 'id': uid or audio_id, 'display_id': audio_id, 'title': traverse_obj(media, 'title', 'episode_title'), - 'alt_title': traverse_obj(media, ('track_info', 'media_name')), + 'alt_title': traverse_obj(media, ('track_info', 'media_name'), expected_type=strip_or_none), 'description': media.get('description'), 'uploader': traverse_obj(media, ('track_info', 'channel'), expected_type=strip_or_none), 'creator': traverse_obj(media, ('track_info', 'editor'), expected_type=strip_or_none), @@ -486,7 +503,7 @@ class RaiPlaySoundIE(RaiBaseIE): } -class RaiPlaySoundLiveIE(RaiPlaySoundIE): +class RaiPlaySoundLiveIE(RaiPlaySoundIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplaysound\.it/(?P<id>[^/?#&]+)$)' _TESTS = [{ 'url': 'https://www.raiplaysound.it/radio2', @@ -494,10 +511,13 @@ class RaiPlaySoundLiveIE(RaiPlaySoundIE): 'id': 'b00a50e6-f404-4af6-8f8c-ff3b9af73a44', 'display_id': 'radio2', 'ext': 'mp4', - 'title': 'Rai Radio 2', + 'title': r're:Rai Radio 2 \d+-\d+-\d+ \d+:\d+', + 'thumbnail': r're:https://www.raiplaysound.it/dl/img/.+?png', 'uploader': 'rai radio 2', + 'series': 'Rai Radio 2', 'creator': 'raiplaysound', 'is_live': True, + 'live_status': 'is_live', }, 'params': { 'skip_download': 'live', @@ -546,11 +566,11 @@ class RaiPlaySoundPlaylistIE(InfoExtractor): class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE + _VALID_URL = rf'https?://[^/]+\.(?:rai\.(?:it|tv))/.+?-(?P<id>{RaiBaseIE._UUID_RE})(?:-.+?)?\.html' _TESTS = [{ # var uniquename = "ContentItem-..." # data-id="ContentItem-..." 
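# (both snippets above show where the ContentItem UUID appears in page markup; _real_extract matches them via the regexes further down in this class)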
- 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'url': 'https://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'mp4', @@ -561,20 +581,8 @@ class RaiIE(RaiBaseIE): }, 'skip': 'This content is available only in Italy', }, { - # with ContentItem in many metas - 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', - 'info_dict': { - 'id': '1632c009-c843-4836-bb65-80c33084a64b', - 'ext': 'mp4', - 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', - 'description': 'I film in uscita questa settimana.', - 'thumbnail': r're:^https?://.*\.png$', - 'duration': 833, - 'upload_date': '20161103', - } - }, { # with ContentItem in og:url - 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', + 'url': 'https://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', 'md5': '06345bd97c932f19ffb129973d07a020', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', @@ -583,42 +591,17 @@ class RaiIE(RaiBaseIE): 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 2214, - 'upload_date': '20161103', + 'upload_date': '20161103' } }, { - # initEdizione('ContentItem-...' - 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', - 'info_dict': { - 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', - 'ext': 'mp4', - 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', - 'duration': 2274, - 'upload_date': '20170401', - }, - 'skip': 'Changes daily', - }, { - # HLS live stream with ContentItem in og:url - 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', - 'info_dict': { - 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', - 'ext': 'mp4', - 'title': 'La diretta di Rainews24', - }, - 'params': { - 'skip_download': True, - }, - }, { # Direct MMS URL 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-b63a4089-ac28-48cf-bca5-9f5b5bc46df5.html', 'only_matching': True, - }, { - 'url': 'https://www.rainews.it/tgr/marche/notiziari/video/2019/02/ContentItem-6ba945a2-889c-4a80-bdeb-8489c70a8db9.html', - 'only_matching': True, }] def _extract_from_content_id(self, content_id, url): media = self._download_json( - 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, + f'https://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-{content_id}.html?json', content_id, 'Downloading video JSON') title = media['name'].strip() @@ -637,8 +620,6 @@ class RaiIE(RaiBaseIE): else: raise ExtractorError('not a media file') - self._sort_formats(relinker_info['formats']) - thumbnails = [] for image_type in ('image', 'image_medium', 'image_300'): thumbnail_url = media.get(image_type) @@ -649,21 +630,18 @@ class RaiIE(RaiBaseIE): subtitles = self._extract_subtitles(url, media) - info = { + return { 'id': content_id, 'title': title, - 'description': strip_or_none(media.get('desc')), + 'description': strip_or_none(media.get('desc') or None), 'thumbnails': thumbnails, - 'uploader': media.get('author'), + 'uploader': strip_or_none(media.get('author') or None), 'upload_date': 
unified_strdate(media.get('date')), 'duration': parse_duration(media.get('length')), 'subtitles': subtitles, + **relinker_info } - info.update(relinker_info) - - return info - def _real_extract(self, url): video_id = self._match_id(url) @@ -676,20 +654,20 @@ class RaiIE(RaiBaseIE): 'twitter:player', 'jsonlink'), webpage, default=None) if content_item_url: content_item_id = self._search_regex( - r'ContentItem-(%s)' % self._UUID_RE, content_item_url, + rf'ContentItem-({self._UUID_RE})', content_item_url, 'content item id', default=None) if not content_item_id: content_item_id = self._search_regex( - r'''(?x) + rf'''(?x) (?: (?:initEdizione|drawMediaRaiTV)\(| <(?:[^>]+\bdata-id|var\s+uniquename)=| <iframe[^>]+\bsrc= ) (["\']) - (?:(?!\1).)*\bContentItem-(?P<id>%s) - ''' % self._UUID_RE, + (?:(?!\1).)*\bContentItem-(?P<id>{self._UUID_RE}) + ''', webpage, 'content item id', default=None, group='id') content_item_ids = set() @@ -722,18 +700,121 @@ class RaiIE(RaiBaseIE): relinker_info = self._extract_relinker_info( urljoin(url, relinker_url), video_id) - self._sort_formats(relinker_info['formats']) title = self._search_regex( r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', webpage, 'title', group='title', default=None) or self._og_search_title(webpage) - info = { + return { 'id': video_id, 'title': title, + **relinker_info } - info.update(relinker_info) - return info +class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE + _VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P<id>{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' + _EMBED_REGEX = [rf'<iframe[^>]+data-src="(?P<url>/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)'] + _TESTS = [{ + # new rainews player (#3911) + 'url': 'https://www.rainews.it/rubriche/24mm/video/2022/05/24mm-del-29052022-12cf645d-1ffd-4220-b27c-07c226dbdecf.html', + 'info_dict': { + 'id': '12cf645d-1ffd-4220-b27c-07c226dbdecf', + 'ext': 'mp4', + 'title': 'Puntata del 29/05/2022', + 'duration': 1589, + 'upload_date': '20220529', + 'uploader': 'rainews', + } + }, { + # old content with fallback method to extract media urls + 'url': 'https://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', + 'info_dict': { + 'id': '1632c009-c843-4836-bb65-80c33084a64b', + 'ext': 'mp4', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', + 'description': 'I film in uscita questa settimana.', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 833, + 'upload_date': '20161103' + }, + 'expected_warnings': ['unable to extract player_data'], + }, { + # iframe + drm + 'url': 'https://www.rainews.it/iframe/video/2022/07/euro2022-europei-calcio-femminile-italia-belgio-gol-0-1-video-4de06a69-de75-4e32-a657-02f0885f8118.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + player_data = self._search_json( + r'<rainews-player\s*data=\'', webpage, 'player_data', video_id, + transform_source=clean_html, fatal=False) + track_info = player_data.get('track_info') + relinker_url = traverse_obj(player_data, 'mediapolis', 'content_url') + + if not relinker_url: + # fallback on old implementation for some old content + try: + return self._extract_from_content_id(video_id, url) + except GeoRestrictedError: + raise + except ExtractorError as e: + raise ExtractorError('Relinker URL not found', cause=e) + + relinker_info = 
self._extract_relinker_info(urljoin(url, relinker_url), video_id) + + return { + 'id': video_id, + 'title': track_info.get('title') or self._og_search_title(webpage), + 'upload_date': unified_strdate(track_info.get('date')), + 'uploader': strip_or_none(track_info.get('editor') or None), + **relinker_info + } + + +class RaiSudtirolIE(RaiBaseIE): + _VALID_URL = r'https?://raisudtirol\.rai\.it/.+?media=(?P<id>[TP]tv\d+)' + _TESTS = [{ + 'url': 'https://raisudtirol.rai.it/la/index.php?media=Ptv1619729460', + 'info_dict': { + 'id': 'Ptv1619729460', + 'ext': 'mp4', + 'title': 'Euro: trasmisciun d\'economia - 29-04-2021 20:51', + 'series': 'Euro: trasmisciun d\'economia', + 'upload_date': '20210429', + 'thumbnail': r're:https://raisudtirol\.rai\.it/img/.+?\.jpg', + 'uploader': 'raisudtirol', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_date = self._html_search_regex(r'<span class="med_data">(.+?)</span>', webpage, 'video_date', fatal=False) + video_title = self._html_search_regex(r'<span class="med_title">(.+?)</span>', webpage, 'video_title', fatal=False) + video_url = self._html_search_regex(r'sources:\s*\[\{file:\s*"(.+?)"\}\]', webpage, 'video_url') + video_thumb = self._html_search_regex(r'image: \'(.+?)\'', webpage, 'video_thumb', fatal=False) + + return { + 'id': video_id, + 'title': join_nonempty(video_title, video_date, delim=' - '), + 'series': video_title, + 'upload_date': unified_strdate(video_date), + 'thumbnail': urljoin('https://raisudtirol.rai.it/', video_thumb), + 'uploader': 'raisudtirol', + 'formats': [{ + 'format_id': 'https-mp4', + 'url': self._proto_relative_url(video_url), + 'width': 1024, + 'height': 576, + 'fps': 25, + 'vcodec': 'h264', + 'acodec': 'aac', + }], + } diff --git a/hypervideo_dl/extractor/raywenderlich.py b/hypervideo_dl/extractor/raywenderlich.py index f04d51f..e0e3c3e 100644 --- a/hypervideo_dl/extractor/raywenderlich.py +++ b/hypervideo_dl/extractor/raywenderlich.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/rbmaradio.py b/hypervideo_dl/extractor/rbmaradio.py index 9642fbb..86c63db 100644 --- a/hypervideo_dl/extractor/rbmaradio.py +++ b/hypervideo_dl/extractor/rbmaradio.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/rcs.py b/hypervideo_dl/extractor/rcs.py index ace611b..b905f8d 100644 --- a/hypervideo_dl/extractor/rcs.py +++ b/hypervideo_dl/extractor/rcs.py @@ -1,14 +1,11 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor from ..utils import ( - clean_html, ExtractorError, - js_to_json, base_url, + clean_html, + js_to_json, url_basename, urljoin, ) @@ -199,7 +196,6 @@ class RCSBaseIE(InfoExtractor): 'format_id': 'http-mp4', 'url': urls['mp4'] }) - self._sort_formats(formats) return formats def _real_extract(self, url): @@ -284,6 +280,20 @@ class RCSEmbedsIE(RCSBaseIE): (?:gazzanet\.)?gazzetta )\.it) /video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)''' + _EMBED_REGEX = [r'''(?x) + (?: + data-frame-src=| + <iframe[^\n]+src= + ) + (["']) + (?P<url>(?:https?:)?//video\. + (?: + rcs| + (?:corriere\w+\.)?corriere| + (?:gazzanet\.)?gazzetta + ) + \.it/video-embed/.+?) 
+ \1'''] _TESTS = [{ 'url': 'https://video.rcs.it/video-embed/iodonna-0001585037', 'md5': '623ecc8ffe7299b2d0c1046d8331a9df', @@ -324,30 +334,9 @@ class RCSEmbedsIE(RCSBaseIE): urls[i] = urljoin(base_url(e), url_basename(e)) return urls - @staticmethod - def _extract_urls(webpage): - entries = [ - mobj.group('url') - for mobj in re.finditer(r'''(?x) - (?: - data-frame-src=| - <iframe[^\n]+src= - ) - (["']) - (?P<url>(?:https?:)?//video\. - (?: - rcs| - (?:corriere\w+\.)?corriere| - (?:gazzanet\.)?gazzetta - ) - \.it/video-embed/.+?) - \1''', webpage)] - return RCSEmbedsIE._sanitize_urls(entries) - - @staticmethod - def _extract_url(webpage): - urls = RCSEmbedsIE._extract_urls(webpage) - return urls[0] if urls else None + @classmethod + def _extract_embed_urls(cls, url, webpage): + return cls._sanitize_urls(list(super()._extract_embed_urls(url, webpage))) class RCSIE(RCSBaseIE): diff --git a/hypervideo_dl/extractor/rcti.py b/hypervideo_dl/extractor/rcti.py index ac42e58..27b4ad7 100644 --- a/hypervideo_dl/extractor/rcti.py +++ b/hypervideo_dl/extractor/rcti.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import random import time @@ -197,8 +194,6 @@ class RCTIPlusIE(RCTIPlusBaseIE): if 'akamaized' in f['url'] or 'cloudfront' in f['url']: f.setdefault('http_headers', {})['Referer'] = 'https://www.rctiplus.com/' # Referer header is required for akamai/cloudfront CDNs - self._sort_formats(formats) - return { 'id': video_meta.get('product_id') or video_json.get('product_id'), 'title': dict_get(video_meta, ('title', 'name')) or dict_get(video_json, ('content_name', 'assets_name')), diff --git a/hypervideo_dl/extractor/rds.py b/hypervideo_dl/extractor/rds.py index 0c49785..9a2e0d9 100644 --- a/hypervideo_dl/extractor/rds.py +++ b/hypervideo_dl/extractor/rds.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_duration, diff --git a/hypervideo_dl/extractor/redbee.py b/hypervideo_dl/extractor/redbee.py new file mode 100644 index 0000000..eb40a81 --- /dev/null +++ b/hypervideo_dl/extractor/redbee.py @@ -0,0 +1,379 @@ +import json +import re +import time +import urllib.parse +import uuid + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, + strip_or_none, + traverse_obj, + try_call, + unified_timestamp, +) + + +class RedBeeBaseIE(InfoExtractor): + _DEVICE_ID = str(uuid.uuid4()) + + @property + def _API_URL(self): + """ + Ref: https://apidocs.emp.ebsd.ericsson.net + Subclasses must set _REDBEE_CUSTOMER, _REDBEE_BUSINESS_UNIT + """ + return f'https://exposure.api.redbee.live/v2/customer/{self._REDBEE_CUSTOMER}/businessunit/{self._REDBEE_BUSINESS_UNIT}' + + def _get_bearer_token(self, asset_id, jwt=None): + request = { + 'deviceId': self._DEVICE_ID, + 'device': { + 'deviceId': self._DEVICE_ID, + 'name': 'Mozilla Firefox 102', + 'type': 'WEB', + }, + } + if jwt: + request['jwt'] = jwt + + return self._download_json( + f'{self._API_URL}/auth/{"gigyaLogin" if jwt else "anonymous"}', + asset_id, data=json.dumps(request).encode('utf-8'), headers={ + 'Content-Type': 'application/json;charset=utf-8' + })['sessionToken'] + + def _get_formats_and_subtitles(self, asset_id, **kwargs): + bearer_token = self._get_bearer_token(asset_id, **kwargs) + api_response = self._download_json( + f'{self._API_URL}/entitlement/{asset_id}/play', + asset_id, headers={ + 'Authorization': f'Bearer {bearer_token}', + 'Accept': 'application/json, 
text/plain, */*' + }) + + formats, subtitles = [], {} + for format in api_response['formats']: + if not format.get('mediaLocator'): + continue + + fmts, subs = [], {} + if format.get('format') == 'DASH': + fmts, subs = self._extract_mpd_formats_and_subtitles( + format['mediaLocator'], asset_id, fatal=False) + elif format.get('format') == 'SMOOTHSTREAMING': + fmts, subs = self._extract_ism_formats_and_subtitles( + format['mediaLocator'], asset_id, fatal=False) + elif format.get('format') == 'HLS': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + format['mediaLocator'], asset_id, fatal=False) + + if format.get('drm'): + for f in fmts: + f['has_drm'] = True + + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return formats, subtitles + + +class ParliamentLiveUKIE(RedBeeBaseIE): + IE_NAME = 'parliamentlive.tv' + IE_DESC = 'UK parliament videos' + _VALID_URL = r'(?i)https?://(?:www\.)?parliamentlive\.tv/Event/Index/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + + _REDBEE_CUSTOMER = 'UKParliament' + _REDBEE_BUSINESS_UNIT = 'ParliamentLive' + + _TESTS = [{ + 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'info_dict': { + 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'ext': 'mp4', + 'title': 'Home Affairs Committee', + 'timestamp': 1395153872, + 'upload_date': '20140318', + 'thumbnail': r're:https?://[^?#]+c1e9d44d-fd6c-4263-b50f-97ed26cc998b[^/]*/thumbnail', + }, + }, { + 'url': 'http://parliamentlive.tv/event/index/3f24936f-130f-40bf-9a5d-b3d6479da6a4', + 'only_matching': True, + }, { + 'url': 'https://parliamentlive.tv/Event/Index/27cf25e4-e77b-42a3-93c5-c815cd6d7377', + 'info_dict': { + 'id': '27cf25e4-e77b-42a3-93c5-c815cd6d7377', + 'ext': 'mp4', + 'title': 'House of Commons', + 'timestamp': 1658392447, + 'upload_date': '20220721', + 'thumbnail': r're:https?://[^?#]+27cf25e4-e77b-42a3-93c5-c815cd6d7377[^/]*/thumbnail', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + formats, subtitles = self._get_formats_and_subtitles(video_id) + + video_info = self._download_json( + f'https://www.parliamentlive.tv/Event/GetShareVideo/{video_id}', video_id, fatal=False) + + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': traverse_obj(video_info, ('event', 'title')), + 'thumbnail': traverse_obj(video_info, 'thumbnailUrl'), + 'timestamp': traverse_obj( + video_info, ('event', 'publishedStartTime'), expected_type=unified_timestamp), + '_format_sort_fields': ('res', 'proto'), + } + + +class RTBFIE(RedBeeBaseIE): + _VALID_URL = r'''(?x) + https?://(?:www\.)?rtbf\.be/ + (?: + video/[^?]+\?.*\bid=| + ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| + auvio/[^/]+\?.*\b(?P<live>l)?id= + )(?P<id>\d+)''' + _NETRC_MACHINE = 'rtbf' + + _REDBEE_CUSTOMER = 'RTBF' + _REDBEE_BUSINESS_UNIT = 'Auvio' + + _TESTS = [{ + 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', + 'md5': '8c876a1cceeb6cf31b476461ade72384', + 'info_dict': { + 'id': '1921274', + 'ext': 'mp4', + 'title': 'Les Diables au coeur (épisode 2)', + 'description': '(du 25/04/2014)', + 'duration': 3099.54, + 'upload_date': '20140425', + 'timestamp': 1398456300, + }, + 'skip': 'No longer available', + }, { + # geo restricted + 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', + 'only_matching': True, + }, { + 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', + 'only_matching': True, + }, { + 'url': 
'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', + 'only_matching': True, + }, { + # Live + 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', + 'only_matching': True, + }, { + # Audio + 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', + 'only_matching': True, + }, { + # With Subtitle + 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', + 'only_matching': True, + }, { + 'url': 'https://www.rtbf.be/auvio/detail_investigation?id=2921926', + 'md5': 'd5d11bb62169fef38d7ce7ac531e034f', + 'info_dict': { + 'id': '2921926', + 'ext': 'mp4', + 'title': 'Le handicap un confinement perpétuel - Maladie de Lyme', + 'description': 'md5:dcbd5dcf6015488c9069b057c15ccc52', + 'duration': 5258.8, + 'upload_date': '20220727', + 'timestamp': 1658934000, + 'series': '#Investigation', + 'thumbnail': r're:^https?://[^?&]+\.jpg$', + }, + }, { + 'url': 'https://www.rtbf.be/auvio/detail_la-belgique-criminelle?id=2920492', + 'md5': '054f9f143bc79c89647c35e5a7d35fa8', + 'info_dict': { + 'id': '2920492', + 'ext': 'mp4', + 'title': '04 - Le crime de la rue Royale', + 'description': 'md5:0c3da1efab286df83f2ab3f8f96bd7a6', + 'duration': 1574.6, + 'upload_date': '20220723', + 'timestamp': 1658596887, + 'series': 'La Belgique criminelle - TV', + 'thumbnail': r're:^https?://[^?&]+\.jpg$', + }, + }] + + _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' + _PROVIDERS = { + 'YOUTUBE': 'Youtube', + 'DAILYMOTION': 'Dailymotion', + 'VIMEO': 'Vimeo', + } + _QUALITIES = [ + ('mobile', 'SD'), + ('web', 'MD'), + ('high', 'HD'), + ] + _LOGIN_URL = 'https://login.rtbf.be/accounts.login' + _GIGYA_API_KEY = '3_kWKuPgcdAybqnqxq_MvHVk0-6PN8Zk8pIIkJM_yXOu-qLPDDsGOtIDFfpGivtbeO' + _LOGIN_COOKIE_ID = f'glt_{_GIGYA_API_KEY}' + + def _perform_login(self, username, password): + if self._get_cookies(self._LOGIN_URL).get(self._LOGIN_COOKIE_ID): + return + + self._set_cookie('.rtbf.be', 'gmid', 'gmid.ver4', secure=True, expire_time=time.time() + 3600) + + login_response = self._download_json( + self._LOGIN_URL, None, data=urllib.parse.urlencode({ + 'loginID': username, + 'password': password, + 'APIKey': self._GIGYA_API_KEY, + 'targetEnv': 'jssdk', + 'sessionExpiration': '-2', + }).encode('utf-8'), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + if login_response['statusCode'] != 200: + raise ExtractorError('Login failed. 
Server message: %s' % login_response['errorMessage'], expected=True) + + self._set_cookie('.rtbf.be', self._LOGIN_COOKIE_ID, login_response['sessionInfo']['login_token'], + secure=True, expire_time=time.time() + 3600) + + def _get_formats_and_subtitles(self, url, media_id): + login_token = self._get_cookies(url).get(self._LOGIN_COOKIE_ID) + if not login_token: + self.raise_login_required() + + session_jwt = try_call(lambda: self._get_cookies(url)['rtbf_jwt'].value) or self._download_json( + 'https://login.rtbf.be/accounts.getJWT', media_id, query={ + 'login_token': login_token.value, + 'APIKey': self._GIGYA_API_KEY, + 'sdk': 'js_latest', + 'authMode': 'cookie', + 'pageURL': url, + 'sdkBuild': '13273', + 'format': 'json', + })['id_token'] + + return super()._get_formats_and_subtitles(media_id, jwt=session_jwt) + + def _real_extract(self, url): + live, media_id = self._match_valid_url(url).groups() + embed_page = self._download_webpage( + 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), + media_id, query={'id': media_id}) + + media_data = self._html_search_regex(r'data-media="([^"]+)"', embed_page, 'media data', fatal=False) + if not media_data: + if re.search(r'<div[^>]+id="js-error-expired"[^>]+class="(?![^"]*hidden)', embed_page): + raise ExtractorError('Livestream has ended.', expected=True) + if re.search(r'<div[^>]+id="js-sso-connect"[^>]+class="(?![^"]*hidden)', embed_page): + self.raise_login_required() + + raise ExtractorError('Could not find media data') + + data = self._parse_json(media_data, media_id) + + error = data.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + provider = data.get('provider') + if provider in self._PROVIDERS: + return self.url_result(data['url'], self._PROVIDERS[provider]) + + title = traverse_obj(data, 'subtitle', 'title') + is_live = data.get('isLive') + height_re = r'-(\d+)p\.' + formats, subtitles = [], {} + + # The old api still returns m3u8 and mpd manifest for livestreams, but these are 'fake' + # since all they contain is a 20s video that is completely unrelated. + # https://github.com/hypervideo/hypervideo/issues/4656#issuecomment-1214461092 + m3u8_url = None if data.get('isLive') else traverse_obj(data, 'urlHlsAes128', 'urlHls') + if m3u8_url: + fmts, subs = self._extract_m3u8_formats_and_subtitles( + m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x + http_url = data.get('url') + if formats and http_url and re.search(height_re, http_url): + http_url = fix_url(http_url) + for m3u8_f in formats[:]: + height = m3u8_f.get('height') + if not height: + continue + f = m3u8_f.copy() + del f['protocol'] + f.update({ + 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), + 'url': re.sub(height_re, '-%dp.' 
% height, http_url), + }) + formats.append(f) + else: + sources = data.get('sources') or {} + for key, format_id in self._QUALITIES: + format_url = sources.get(key) + if not format_url: + continue + height = int_or_none(self._search_regex( + height_re, format_url, 'height', default=None)) + formats.append({ + 'format_id': format_id, + 'url': fix_url(format_url), + 'height': height, + }) + + mpd_url = None if data.get('isLive') else data.get('urlDash') + if mpd_url and (self.get_param('allow_unplayable_formats') or not data.get('drm')): + fmts, subs = self._extract_mpd_formats_and_subtitles( + mpd_url, media_id, mpd_id='dash', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + audio_url = data.get('urlAudio') + if audio_url: + formats.append({ + 'format_id': 'audio', + 'url': audio_url, + 'vcodec': 'none', + }) + + for track in (data.get('tracks') or {}).values(): + sub_url = track.get('url') + if not sub_url: + continue + subtitles.setdefault(track.get('lang') or 'fr', []).append({ + 'url': sub_url, + }) + + if not formats: + fmts, subs = self._get_formats_and_subtitles(url, f'live_{media_id}' if is_live else media_id) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + + return { + 'id': media_id, + 'formats': formats, + 'title': title, + 'description': strip_or_none(data.get('description')), + 'thumbnail': data.get('thumbnail'), + 'duration': float_or_none(data.get('realDuration')), + 'timestamp': int_or_none(data.get('liveFrom')), + 'series': data.get('programLabel'), + 'subtitles': subtitles, + 'is_live': is_live, + '_format_sort_fields': ('res', 'proto'), + } diff --git a/hypervideo_dl/extractor/redbulltv.py b/hypervideo_dl/extractor/redbulltv.py index 756a366..a01bc84 100644 --- a/hypervideo_dl/extractor/redbulltv.py +++ b/hypervideo_dl/extractor/redbulltv.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -84,7 +80,6 @@ class RedBullTVIE(InfoExtractor): formats, subtitles = self._extract_m3u8_formats_and_subtitles( 'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token), video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') - self._sort_formats(formats) for resource in video.get('resources', []): if resource.startswith('closed_caption_'): @@ -114,7 +109,7 @@ class RedBullTVIE(InfoExtractor): return self.extract_info(video_id) -class RedBullEmbedIE(RedBullTVIE): +class RedBullEmbedIE(RedBullTVIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?redbull\.com/embed/(?P<id>rrn:content:[^:]+:[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}:[a-z]{2}-[A-Z]{2,3})' _TESTS = [{ # HLS manifest accessible only using assetId diff --git a/hypervideo_dl/extractor/reddit.py b/hypervideo_dl/extractor/reddit.py index a042a59..f1a5c85 100644 --- a/hypervideo_dl/extractor/reddit.py +++ b/hypervideo_dl/extractor/reddit.py @@ -1,14 +1,15 @@ import random +import urllib.parse from .common import InfoExtractor from ..utils import ( ExtractorError, - int_or_none, float_or_none, + int_or_none, + traverse_obj, try_get, unescapeHTML, url_or_none, - traverse_obj ) @@ -19,6 +20,7 @@ class RedditIE(InfoExtractor): 'info_dict': { 'id': 'zv89llsvexdz', 'ext': 'mp4', + 'display_id': '6rrwyj', 'title': 'That small heart attack.', 'thumbnail': r're:^https?://.*\.(?:jpg|png)', 'thumbnails': 'count:4', @@ -35,6 +37,34 @@ class RedditIE(InfoExtractor): 'skip_download': True, }, }, { + # 
1080p fallback format + 'url': 'https://www.reddit.com/r/aww/comments/90bu6w/heat_index_was_110_degrees_so_we_offered_him_a/', + 'md5': '8b5902cfda3006bf90faea7adf765a49', + 'info_dict': { + 'id': 'gyh95hiqc0b11', + 'ext': 'mp4', + 'display_id': '90bu6w', + 'title': 'Heat index was 110 degrees so we offered him a cold drink. He went for a full body soak instead', + 'thumbnail': r're:^https?://.*\.(?:jpg|png)', + 'thumbnails': 'count:7', + 'timestamp': 1532051078, + 'upload_date': '20180720', + 'uploader': 'FootLoosePickleJuice', + 'duration': 14, + 'like_count': int, + 'dislike_count': int, + 'comment_count': int, + 'age_limit': 0, + }, + }, { + # videos embedded in reddit text post + 'url': 'https://www.reddit.com/r/KamenRider/comments/wzqkxp/finale_kamen_rider_revice_episode_50_family_to/', + 'playlist_count': 2, + 'info_dict': { + 'id': 'wzqkxp', + 'title': 'md5:72d3d19402aa11eff5bd32fc96369b37', + }, + }, { 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj', 'only_matching': True, }, { @@ -80,10 +110,6 @@ class RedditIE(InfoExtractor): data = data[0]['data']['children'][0]['data'] video_url = data['url'] - # Avoid recursing into the same reddit URL - if 'reddit.com/' in video_url and '/%s/' % video_id in video_url: - raise ExtractorError('No media found', expected=True) - over_18 = data.get('over_18') if over_18 is True: age_limit = 18 @@ -126,6 +152,32 @@ class RedditIE(InfoExtractor): 'age_limit': age_limit, } + parsed_url = urllib.parse.urlparse(video_url) + + # Check for embeds in text posts, or else raise to avoid recursing into the same reddit URL + if 'reddit.com' in parsed_url.netloc and f'/{video_id}/' in parsed_url.path: + entries = [] + for media in traverse_obj(data, ('media_metadata', ...), expected_type=dict): + if not media.get('id') or media.get('e') != 'RedditVideo': + continue + formats = [] + if media.get('hlsUrl'): + formats.extend(self._extract_m3u8_formats( + unescapeHTML(media['hlsUrl']), video_id, 'mp4', m3u8_id='hls', fatal=False)) + if media.get('dashUrl'): + formats.extend(self._extract_mpd_formats( + unescapeHTML(media['dashUrl']), video_id, mpd_id='dash', fatal=False)) + if formats: + entries.append({ + 'id': media['id'], + 'display_id': video_id, + 'formats': formats, + **info, + }) + if entries: + return self.playlist_result(entries, video_id, info.get('title')) + raise ExtractorError('No media found', expected=True) + # Check if media is hosted on reddit: reddit_video = traverse_obj(data, (('media', 'secure_media'), 'reddit_video'), get_all=False) if reddit_video: @@ -143,12 +195,21 @@ class RedditIE(InfoExtractor): dash_playlist_url = playlist_urls[0] or f'https://v.redd.it/{video_id}/DASHPlaylist.mpd' hls_playlist_url = playlist_urls[1] or f'https://v.redd.it/{video_id}/HLSPlaylist.m3u8' - formats = self._extract_m3u8_formats( - hls_playlist_url, display_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) + formats = [{ + 'url': unescapeHTML(reddit_video['fallback_url']), + 'height': int_or_none(reddit_video.get('height')), + 'width': int_or_none(reddit_video.get('width')), + 'tbr': int_or_none(reddit_video.get('bitrate_kbps')), + 'acodec': 'none', + 'vcodec': 'h264', + 'ext': 'mp4', + 'format_id': 'fallback', + 'format_note': 'DASH video, mp4_dash', + }] + formats.extend(self._extract_m3u8_formats( + hls_playlist_url, display_id, 'mp4', m3u8_id='hls', fatal=False)) formats.extend(self._extract_mpd_formats( dash_playlist_url, display_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) return { **info, @@ -158,6 
+219,14 @@ class RedditIE(InfoExtractor): 'duration': int_or_none(reddit_video.get('duration')), } + if parsed_url.netloc == 'v.redd.it': + self.raise_no_formats('This video is processing', expected=True, video_id=video_id) + return { + **info, + 'id': parsed_url.path.split('/')[1], + 'display_id': video_id, + } + # Not hosted on reddit, must continue extraction return { **info, diff --git a/hypervideo_dl/extractor/redgifs.py b/hypervideo_dl/extractor/redgifs.py index 55196b7..098fb81 100644 --- a/hypervideo_dl/extractor/redgifs.py +++ b/hypervideo_dl/extractor/redgifs.py @@ -1,5 +1,5 @@ -# coding: utf-8 import functools +import urllib from .common import InfoExtractor from ..compat import compat_parse_qs @@ -19,6 +19,12 @@ class RedGifsBaseInfoExtractor(InfoExtractor): 'hd': None, } + _API_HEADERS = { + 'referer': 'https://www.redgifs.com/', + 'origin': 'https://www.redgifs.com', + 'content-type': 'application/json', + } + def _parse_gif_data(self, gif_data): video_id = gif_data.get('id') quality = qualities(tuple(self._FORMATS.keys())) @@ -39,12 +45,11 @@ class RedGifsBaseInfoExtractor(InfoExtractor): 'height': height, 'quality': quality(format_id), }) - self._sort_formats(formats) return { 'id': video_id, 'webpage_url': f'https://redgifs.com/watch/{video_id}', - 'ie_key': RedGifsIE.ie_key(), + 'extractor_key': RedGifsIE.ie_key(), 'extractor': 'RedGifs', 'title': ' '.join(gif_data.get('tags') or []) or 'RedGifs', 'timestamp': int_or_none(gif_data.get('createDate')), @@ -58,9 +63,30 @@ class RedGifsBaseInfoExtractor(InfoExtractor): 'formats': formats, } + def _fetch_oauth_token(self, video_id): + # https://github.com/Redgifs/api/wiki/Temporary-tokens + auth = self._download_json('https://api.redgifs.com/v2/auth/temporary', + video_id, note='Fetching temporary token') + if not auth.get('token'): + raise ExtractorError('Unable to get temporary token') + self._API_HEADERS['authorization'] = f'Bearer {auth["token"]}' + def _call_api(self, ep, video_id, *args, **kwargs): - data = self._download_json( - f'https://api.redgifs.com/v2/{ep}', video_id, *args, **kwargs) + for first_attempt in True, False: + if 'authorization' not in self._API_HEADERS: + self._fetch_oauth_token(video_id) + try: + headers = dict(self._API_HEADERS) + headers['x-customheader'] = f'https://www.redgifs.com/watch/{video_id}' + data = self._download_json( + f'https://api.redgifs.com/v2/{ep}', video_id, headers=headers, *args, **kwargs) + break + except ExtractorError as e: + if first_attempt and isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 401: + del self._API_HEADERS['authorization'] # refresh the token + continue + raise + if 'error' in data: raise ExtractorError(f'RedGifs said: {data["error"]}', expected=True, video_id=video_id) return data @@ -103,6 +129,7 @@ class RedGifsIE(RedGifsBaseInfoExtractor): 'like_count': int, 'categories': list, 'age_limit': 18, + 'tags': list, } }, { 'url': 'https://thumbs2.redgifs.com/SqueakyHelplessWisent-mobile.mp4#t=0', @@ -118,13 +145,14 @@ class RedGifsIE(RedGifsBaseInfoExtractor): 'like_count': int, 'categories': list, 'age_limit': 18, + 'tags': list, } }] def _real_extract(self, url): video_id = self._match_id(url).lower() video_info = self._call_api( - f'gifs/{video_id}', video_id, note='Downloading video info') + f'gifs/{video_id}?views=yes', video_id, note='Downloading video info') return self._parse_gif_data(video_info['gif']) diff --git a/hypervideo_dl/extractor/redtube.py b/hypervideo_dl/extractor/redtube.py index 7fee54f..49076cc 100644 --- 
a/hypervideo_dl/extractor/redtube.py +++ b/hypervideo_dl/extractor/redtube.py @@ -1,7 +1,3 @@ -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -16,6 +12,7 @@ from ..utils import ( class RedTubeIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:\w+\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)'] _TESTS = [{ 'url': 'https://www.redtube.com/38864951', 'md5': '4fba70cbca3aefd25767ab4b523c9878', @@ -39,12 +36,6 @@ class RedTubeIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//embed\.redtube\.com/\?.*?\bid=\d+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( @@ -119,7 +110,6 @@ class RedTubeIE(InfoExtractor): video_url = self._html_search_regex( r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL') formats.append({'url': video_url, 'ext': 'mp4'}) - self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._search_regex( diff --git a/hypervideo_dl/extractor/regiotv.py b/hypervideo_dl/extractor/regiotv.py index e250a52..6114841 100644 --- a/hypervideo_dl/extractor/regiotv.py +++ b/hypervideo_dl/extractor/regiotv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( diff --git a/hypervideo_dl/extractor/rentv.py b/hypervideo_dl/extractor/rentv.py index 7c8909d..fdde317 100644 --- a/hypervideo_dl/extractor/rentv.py +++ b/hypervideo_dl/extractor/rentv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -50,7 +47,6 @@ class RENTVIE(InfoExtractor): formats.append({ 'url': src, }) - self._sort_formats(formats) return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/restudy.py b/hypervideo_dl/extractor/restudy.py index d47fb45..6d03256 100644 --- a/hypervideo_dl/extractor/restudy.py +++ b/hypervideo_dl/extractor/restudy.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor @@ -34,7 +31,6 @@ class RestudyIE(InfoExtractor): formats = self._extract_smil_formats( 'https://cdn.portal.restudy.dk/dynamic/themes/front/awsmedia/SmilDirectory/video_%s.xml' % video_id, video_id) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/reuters.py b/hypervideo_dl/extractor/reuters.py index 9dc482d..6919425 100644 --- a/hypervideo_dl/extractor/reuters.py +++ b/hypervideo_dl/extractor/reuters.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -58,7 +55,6 @@ class ReutersIE(InfoExtractor): 'ext': ext, 'container': container if method != 'mobile' else None, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/reverbnation.py b/hypervideo_dl/extractor/reverbnation.py index 4cb99c2..06b6c3c 100644 --- a/hypervideo_dl/extractor/reverbnation.py +++ b/hypervideo_dl/extractor/reverbnation.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( qualities, diff --git a/hypervideo_dl/extractor/rice.py 
b/hypervideo_dl/extractor/rice.py index cf2bb1b..3dd4d31 100644 --- a/hypervideo_dl/extractor/rice.py +++ b/hypervideo_dl/extractor/rice.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -91,7 +88,6 @@ class RICEIE(InfoExtractor): 'ext': 'flv', }) formats.append(fmt) - self._sort_formats(formats) thumbnails = [] for content_asset in content_data.findall('.//contentAssets'): diff --git a/hypervideo_dl/extractor/rmcdecouverte.py b/hypervideo_dl/extractor/rmcdecouverte.py index 8bfce34..8d29b30 100644 --- a/hypervideo_dl/extractor/rmcdecouverte.py +++ b/hypervideo_dl/extractor/rmcdecouverte.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from .brightcove import BrightcoveLegacyIE from ..compat import ( diff --git a/hypervideo_dl/extractor/ro220.py b/hypervideo_dl/extractor/ro220.py deleted file mode 100644 index 69934ef..0000000 --- a/hypervideo_dl/extractor/ro220.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote - - -class Ro220IE(InfoExtractor): - IE_NAME = '220.ro' - _VALID_URL = r'(?x)(?:https?://)?(?:www\.)?220\.ro/(?P<category>[^/]+)/(?P<shorttitle>[^/]+)/(?P<id>[^/]+)' - _TEST = { - 'url': 'http://www.220.ro/sport/Luati-Le-Banii-Sez-4-Ep-1/LYV6doKo7f/', - 'md5': '03af18b73a07b4088753930db7a34add', - 'info_dict': { - 'id': 'LYV6doKo7f', - 'ext': 'mp4', - 'title': 'Luati-le Banii sez 4 ep 1', - 'description': r're:^Iata-ne reveniti dupa o binemeritata vacanta\. +Va astept si pe Facebook cu pareri si comentarii.$', - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - url = compat_urllib_parse_unquote(self._search_regex( - r'(?s)clip\s*:\s*{.*?url\s*:\s*\'([^\']+)\'', webpage, 'url')) - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) - - formats = [{ - 'format_id': 'sd', - 'url': url, - 'ext': 'mp4', - }] - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - } diff --git a/hypervideo_dl/extractor/rockstargames.py b/hypervideo_dl/extractor/rockstargames.py index cd6904b..c491aaf 100644 --- a/hypervideo_dl/extractor/rockstargames.py +++ b/hypervideo_dl/extractor/rockstargames.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -57,8 +54,6 @@ class RockstarGamesIE(InfoExtractor): if youtube_id: return self.url_result(youtube_id, 'Youtube') - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/rokfin.py b/hypervideo_dl/extractor/rokfin.py index 0fd65db..ade3cd0 100644 --- a/hypervideo_dl/extractor/rokfin.py +++ b/hypervideo_dl/extractor/rokfin.py @@ -1,26 +1,33 @@ -# coding: utf-8 import itertools +import json +import re +import urllib.parse from datetime import datetime -from .common import InfoExtractor +from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( - determine_ext, ExtractorError, + determine_ext, float_or_none, format_field, int_or_none, str_or_none, traverse_obj, + try_get, + unescapeHTML, unified_timestamp, url_or_none, + urlencode_postdata, ) - _API_BASE_URL = 
'https://prod-api-v2.production.rokfin.com/api/v2/public/' class RokfinIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rokfin\.com/(?P<id>(?P<type>post|stream)/\d+)' + _NETRC_MACHINE = 'rokfin' + _AUTH_BASE = 'https://secure.rokfin.com/auth/realms/rokfin-web/protocol/openid-connect' + _access_mgmt_tokens = {} # OAuth 2.0: RFC 6749, Sec. 1.4-5 _TESTS = [{ 'url': 'https://www.rokfin.com/post/57548/Mitt-Romneys-Crazy-Solution-To-Climate-Change', 'info_dict': { @@ -84,8 +91,7 @@ class RokfinIE(InfoExtractor): def _real_extract(self, url): video_id, video_type = self._match_valid_url(url).group('id', 'type') - - metadata = self._download_json(f'{_API_BASE_URL}{video_id}', video_id) + metadata = self._download_json_using_access_token(f'{_API_BASE_URL}{video_id}', video_id) scheduled = unified_timestamp(metadata.get('scheduledAt')) live_status = ('was_live' if metadata.get('stoppedAt') @@ -104,9 +110,8 @@ class RokfinIE(InfoExtractor): self.raise_login_required('This video is only available to premium users', True, method='cookies') elif scheduled: self.raise_no_formats( - f'Stream is offline; sheduled for {datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}', + f'Stream is offline; scheduled for {datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}', video_id=video_id, expected=True) - self._sort_formats(formats) uploader = traverse_obj(metadata, ('createdBy', 'username'), ('creator', 'username')) timestamp = (scheduled or float_or_none(metadata.get('postedAtMilli'), 1000) @@ -140,7 +145,7 @@ class RokfinIE(InfoExtractor): for page_n in itertools.count(): raw_comments = self._download_json( f'{_API_BASE_URL}comment?postId={video_id[5:]}&page={page_n}&size=50', - video_id, note=f'Downloading viewer comments page {page_n + 1}{format_field(pages_total, template=" of %s")}', + video_id, note=f'Downloading viewer comments page {page_n + 1}{format_field(pages_total, None, " of %s")}', fatal=False) or {} for comment in raw_comments.get('content') or []: @@ -160,6 +165,79 @@ class RokfinIE(InfoExtractor): if not raw_comments.get('content') or is_last or (page_n > pages_total if pages_total else is_last is not False): return + def _perform_login(self, username, password): + # https://openid.net/specs/openid-connect-core-1_0.html#CodeFlowAuth (Sec. 
3.1) + login_page = self._download_webpage( + f'{self._AUTH_BASE}/auth?client_id=web&redirect_uri=https%3A%2F%2Frokfin.com%2Ffeed&response_mode=fragment&response_type=code&scope=openid', + None, note='loading login page', errnote='error loading login page') + authentication_point_url = unescapeHTML(self._search_regex( + r'<form\s+[^>]+action\s*=\s*"(https://secure\.rokfin\.com/auth/realms/rokfin-web/login-actions/authenticate\?[^"]+)"', + login_page, name='Authentication URL')) + + resp_body = self._download_webpage( + authentication_point_url, None, note='logging in', fatal=False, expected_status=404, + data=urlencode_postdata({'username': username, 'password': password, 'rememberMe': 'off', 'credentialId': ''})) + if not self._authentication_active(): + if re.search(r'(?i)(invalid\s+username\s+or\s+password)', resp_body or ''): + raise ExtractorError('invalid username/password', expected=True) + raise ExtractorError('Login failed') + + urlh = self._request_webpage( + f'{self._AUTH_BASE}/auth', None, + note='granting user authorization', errnote='user authorization rejected by Rokfin', + query={ + 'client_id': 'web', + 'prompt': 'none', + 'redirect_uri': 'https://rokfin.com/silent-check-sso.html', + 'response_mode': 'fragment', + 'response_type': 'code', + 'scope': 'openid', + }) + self._access_mgmt_tokens = self._download_json( + f'{self._AUTH_BASE}/token', None, + note='getting access credentials', errnote='error getting access credentials', + data=urlencode_postdata({ + 'code': urllib.parse.parse_qs(urllib.parse.urldefrag(urlh.geturl()).fragment).get('code')[0], + 'client_id': 'web', + 'grant_type': 'authorization_code', + 'redirect_uri': 'https://rokfin.com/silent-check-sso.html' + })) + + def _authentication_active(self): + return not ( + {'KEYCLOAK_IDENTITY', 'KEYCLOAK_IDENTITY_LEGACY', 'KEYCLOAK_SESSION', 'KEYCLOAK_SESSION_LEGACY'} + - set(self._get_cookies(self._AUTH_BASE))) + + def _get_auth_token(self): + return try_get(self._access_mgmt_tokens, lambda x: ' '.join([x['token_type'], x['access_token']])) + + def _download_json_using_access_token(self, url_or_request, video_id, headers={}, query={}): + assert 'authorization' not in headers + headers = headers.copy() + auth_token = self._get_auth_token() + refresh_token = self._access_mgmt_tokens.get('refresh_token') + if auth_token: + headers['authorization'] = auth_token + + json_string, urlh = self._download_webpage_handle( + url_or_request, video_id, headers=headers, query=query, expected_status=401) + if not auth_token or urlh.code != 401 or refresh_token is None: + return self._parse_json(json_string, video_id) + + self._access_mgmt_tokens = self._download_json( + f'{self._AUTH_BASE}/token', video_id, + note='User authorization expired or canceled by Rokfin. 
Re-authorizing ...', errnote='Failed to re-authorize', + data=urlencode_postdata({ + 'grant_type': 'refresh_token', + 'refresh_token': refresh_token, + 'client_id': 'web' + })) + headers['authorization'] = self._get_auth_token() + if headers['authorization'] is None: + raise ExtractorError('User authorization lost', expected=True) + + return self._download_json(url_or_request, video_id, headers=headers, query=query) + class RokfinPlaylistBaseIE(InfoExtractor): _TYPES = { @@ -183,6 +261,7 @@ class RokfinPlaylistBaseIE(InfoExtractor): class RokfinStackIE(RokfinPlaylistBaseIE): IE_NAME = 'rokfin:stack' + IE_DESC = 'Rokfin Stacks' _VALID_URL = r'https?://(?:www\.)?rokfin\.com/stack/(?P<id>[^/]+)' _TESTS = [{ 'url': 'https://www.rokfin.com/stack/271/Tulsi-Gabbard-Portsmouth-Townhall-FULL--Feb-9-2020', @@ -200,6 +279,7 @@ class RokfinStackIE(RokfinPlaylistBaseIE): class RokfinChannelIE(RokfinPlaylistBaseIE): IE_NAME = 'rokfin:channel' + IE_DESC = 'Rokfin Channels' _VALID_URL = r'https?://(?:www\.)?rokfin\.com/(?!((feed/?)|(discover/?)|(channels/?))$)(?P<id>[^/]+)/?$' _TESTS = [{ 'url': 'https://rokfin.com/TheConvoCouch', @@ -237,7 +317,7 @@ class RokfinChannelIE(RokfinPlaylistBaseIE): data_url = f'{_API_BASE_URL}post/search/{tab}?page={page_n}&size=50&creator={channel_id}' metadata = self._download_json( data_url, channel_name, - note=f'Downloading video metadata page {page_n + 1}{format_field(pages_total, template=" of %s")}') + note=f'Downloading video metadata page {page_n + 1}{format_field(pages_total, None, " of %s")}') yield from self._get_video_data(metadata) pages_total = int_or_none(metadata.get('totalPages')) or None @@ -254,3 +334,76 @@ class RokfinChannelIE(RokfinPlaylistBaseIE): return self.playlist_result( self._entries(channel_id, channel_name, self._TABS[tab]), f'{channel_id}-{tab}', f'{channel_name} - {tab.title()}', str_or_none(channel_info.get('description'))) + + +class RokfinSearchIE(SearchInfoExtractor): + IE_NAME = 'rokfin:search' + IE_DESC = 'Rokfin Search' + _SEARCH_KEY = 'rkfnsearch' + _TYPES = { + 'video': (('id', 'raw'), 'post'), + 'audio': (('id', 'raw'), 'post'), + 'stream': (('content_id', 'raw'), 'stream'), + 'dead_stream': (('content_id', 'raw'), 'stream'), + 'stack': (('content_id', 'raw'), 'stack'), + } + _TESTS = [{ + 'url': 'rkfnsearch5:"zelenko"', + 'playlist_count': 5, + 'info_dict': { + 'id': '"zelenko"', + 'title': '"zelenko"', + } + }] + _db_url = None + _db_access_key = None + + def _real_initialize(self): + self._db_url, self._db_access_key = self.cache.load(self.ie_key(), 'auth', default=(None, None)) + if not self._db_url: + self._get_db_access_credentials() + + def _search_results(self, query): + total_pages = None + for page_number in itertools.count(1): + search_results = self._run_search_query( + query, data={'query': query, 'page': {'size': 100, 'current': page_number}}, + note=f'Downloading page {page_number}{format_field(total_pages, None, " of ~%s")}') + total_pages = traverse_obj(search_results, ('meta', 'page', 'total_pages'), expected_type=int_or_none) + + for result in search_results.get('results') or []: + video_id_key, video_type = self._TYPES.get(traverse_obj(result, ('content_type', 'raw')), (None, None)) + video_id = traverse_obj(result, video_id_key, expected_type=int_or_none) + if video_id and video_type: + yield self.url_result(url=f'https://rokfin.com/{video_type}/{video_id}') + if not search_results.get('results'): + return + + def _run_search_query(self, video_id, data, **kwargs): + data = json.dumps(data).encode() + for 
attempt in range(2): + search_results = self._download_json( + self._db_url, video_id, data=data, fatal=(attempt == 1), + headers={'authorization': self._db_access_key}, **kwargs) + if search_results: + return search_results + self.write_debug('Updating access credentials') + self._get_db_access_credentials(video_id) + + def _get_db_access_credentials(self, video_id=None): + auth_data = {'SEARCH_KEY': None, 'ENDPOINT_BASE': None} + notfound_err_page = self._download_webpage( + 'https://rokfin.com/discover', video_id, expected_status=404, note='Downloading home page') + for js_file_path in re.findall(r'<script\b[^>]*\ssrc\s*=\s*"(/static/js/[^">]+)"', notfound_err_page): + js_content = self._download_webpage( + f'https://rokfin.com{js_file_path}', video_id, note='Downloading JavaScript file', fatal=False) + auth_data.update(re.findall( + rf'REACT_APP_({"|".join(auth_data.keys())})\s*:\s*"([^"]+)"', js_content or '')) + if not all(auth_data.values()): + continue + + self._db_url = url_or_none(f'{auth_data["ENDPOINT_BASE"]}/api/as/v1/engines/rokfin-search/search.json') + self._db_access_key = f'Bearer {auth_data["SEARCH_KEY"]}' + self.cache.store(self.ie_key(), 'auth', (self._db_url, self._db_access_key)) + return + raise ExtractorError('Unable to extract access credentials') diff --git a/hypervideo_dl/extractor/roosterteeth.py b/hypervideo_dl/extractor/roosterteeth.py index a55dd4f..776fbfb 100644 --- a/hypervideo_dl/extractor/roosterteeth.py +++ b/hypervideo_dl/extractor/roosterteeth.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -147,7 +146,6 @@ class RoosterTeethIE(RoosterTeethBaseIE): formats, subtitles = self._extract_m3u8_formats_and_subtitles( m3u8_url, display_id, 'mp4', 'm3u8_native', m3u8_id='hls') - self._sort_formats(formats) episode = self._download_json( api_episode_url, display_id, diff --git a/hypervideo_dl/extractor/rottentomatoes.py b/hypervideo_dl/extractor/rottentomatoes.py index 14c8e82..f133c85 100644 --- a/hypervideo_dl/extractor/rottentomatoes.py +++ b/hypervideo_dl/extractor/rottentomatoes.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from .internetvideoarchive import InternetVideoArchiveIE diff --git a/hypervideo_dl/extractor/roxwel.py b/hypervideo_dl/extractor/roxwel.py deleted file mode 100644 index 84bb1aa..0000000 --- a/hypervideo_dl/extractor/roxwel.py +++ /dev/null @@ -1,52 +0,0 @@ -from __future__ import unicode_literals - - -from .common import InfoExtractor -from ..utils import unified_strdate, determine_ext - - -class RoxwelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)' - - _TEST = { - 'url': 'http://www.roxwel.com/player/passionpittakeawalklive.html', - 'info_dict': { - 'id': 'passionpittakeawalklive', - 'ext': 'flv', - 'title': 'Take A Walk (live)', - 'uploader': 'Passion Pit', - 'uploader_id': 'passionpit', - 'upload_date': '20120928', - 'description': 'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. 
', - }, - 'params': { - # rtmp download - 'skip_download': True, - } - } - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - filename = mobj.group('filename') - info_url = 'http://www.roxwel.com/api/videos/%s' % filename - info = self._download_json(info_url, filename) - - rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')]) - best_rate = rtmp_rates[-1] - url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate) - rtmp_url = self._download_webpage(url_page_url, filename, 'Downloading video url') - ext = determine_ext(rtmp_url) - if ext == 'f4v': - rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename) - - return { - 'id': filename, - 'title': info['title'], - 'url': rtmp_url, - 'ext': 'flv', - 'description': info['description'], - 'thumbnail': info.get('player_image_url') or info.get('image_url_large'), - 'uploader': info['artist'], - 'uploader_id': info['artistname'], - 'upload_date': unified_strdate(info['dbdate']), - } diff --git a/hypervideo_dl/extractor/rozhlas.py b/hypervideo_dl/extractor/rozhlas.py index fccf694..a818967 100644 --- a/hypervideo_dl/extractor/rozhlas.py +++ b/hypervideo_dl/extractor/rozhlas.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/rtbf.py b/hypervideo_dl/extractor/rtbf.py deleted file mode 100644 index 4b61fdb..0000000 --- a/hypervideo_dl/extractor/rtbf.py +++ /dev/null @@ -1,159 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - strip_or_none, -) - - -class RTBFIE(InfoExtractor): - _VALID_URL = r'''(?x) - https?://(?:www\.)?rtbf\.be/ - (?: - video/[^?]+\?.*\bid=| - ouftivi/(?:[^/]+/)*[^?]+\?.*\bvideoId=| - auvio/[^/]+\?.*\b(?P<live>l)?id= - )(?P<id>\d+)''' - _TESTS = [{ - 'url': 'https://www.rtbf.be/video/detail_les-diables-au-coeur-episode-2?id=1921274', - 'md5': '8c876a1cceeb6cf31b476461ade72384', - 'info_dict': { - 'id': '1921274', - 'ext': 'mp4', - 'title': 'Les Diables au coeur (épisode 2)', - 'description': '(du 25/04/2014)', - 'duration': 3099.54, - 'upload_date': '20140425', - 'timestamp': 1398456300, - } - }, { - # geo restricted - 'url': 'http://www.rtbf.be/ouftivi/heros/detail_scooby-doo-mysteres-associes?id=1097&videoId=2057442', - 'only_matching': True, - }, { - 'url': 'http://www.rtbf.be/ouftivi/niouzz?videoId=2055858', - 'only_matching': True, - }, { - 'url': 'http://www.rtbf.be/auvio/detail_jeudi-en-prime-siegfried-bracke?id=2102996', - 'only_matching': True, - }, { - # Live - 'url': 'https://www.rtbf.be/auvio/direct_pure-fm?lid=134775', - 'only_matching': True, - }, { - # Audio - 'url': 'https://www.rtbf.be/auvio/detail_cinq-heures-cinema?id=2360811', - 'only_matching': True, - }, { - # With Subtitle - 'url': 'https://www.rtbf.be/auvio/detail_les-carnets-du-bourlingueur?id=2361588', - 'only_matching': True, - }] - _IMAGE_HOST = 'http://ds1.ds.static.rtbf.be' - _PROVIDERS = { - 'YOUTUBE': 'Youtube', - 'DAILYMOTION': 'Dailymotion', - 'VIMEO': 'Vimeo', - } - _QUALITIES = [ - ('mobile', 'SD'), - ('web', 'MD'), - ('high', 'HD'), - ] - - def _real_extract(self, url): - live, media_id = self._match_valid_url(url).groups() - embed_page = self._download_webpage( - 'https://www.rtbf.be/auvio/embed/' + ('direct' if live else 'media'), - media_id, query={'id': media_id}) 
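The rtbf.py module whose deletion begins here read a data-media JSON blob out of the Auvio embed page and built formats directly from it. Its replacement, the Red Bee-based RTBFIE in redbee.py above, goes through the Red Bee "exposure" API instead. Below is a minimal standalone sketch of that flow, using only the endpoints, payloads and headers visible in the new code (customer 'RTBF', business unit 'Auvio'); the asset id is a placeholder and error handling is omitted:

import json
import urllib.request
import uuid

# Values from the new RTBFIE in redbee.py above:
# _REDBEE_CUSTOMER = 'RTBF', _REDBEE_BUSINESS_UNIT = 'Auvio'
API = 'https://exposure.api.redbee.live/v2/customer/RTBF/businessunit/Auvio'

def anonymous_session_token():
    # Step 1: POST a device descriptor to /auth/anonymous; the response
    # carries the 'sessionToken' used as a bearer token afterwards
    device_id = str(uuid.uuid4())
    payload = json.dumps({
        'deviceId': device_id,
        'device': {'deviceId': device_id, 'name': 'Mozilla Firefox 102', 'type': 'WEB'},
    }).encode('utf-8')
    req = urllib.request.Request(
        f'{API}/auth/anonymous', data=payload,
        headers={'Content-Type': 'application/json;charset=utf-8'})
    with urllib.request.urlopen(req) as resp:
        return json.load(resp)['sessionToken']

def playable_media(asset_id):
    # Step 2: /entitlement/<asset>/play lists the stream descriptors; each
    # usable entry has a 'format' (DASH/HLS/SMOOTHSTREAMING) and a 'mediaLocator'
    req = urllib.request.Request(
        f'{API}/entitlement/{asset_id}/play',
        headers={'Authorization': f'Bearer {anonymous_session_token()}',
                 'Accept': 'application/json, text/plain, */*'})
    with urllib.request.urlopen(req) as resp:
        return [f for f in json.load(resp)['formats'] if f.get('mediaLocator')]

Authenticated playback differs only in step 1: the new code attaches a Gigya JWT and posts to /auth/gigyaLogin instead (see _get_bearer_token in redbee.py). The remainder of the deleted module follows.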
- data = self._parse_json(self._html_search_regex( - r'data-media="([^"]+)"', embed_page, 'media data'), media_id) - - error = data.get('error') - if error: - raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) - - provider = data.get('provider') - if provider in self._PROVIDERS: - return self.url_result(data['url'], self._PROVIDERS[provider]) - - title = data['title'] - is_live = data.get('isLive') - height_re = r'-(\d+)p\.' - formats = [] - - m3u8_url = data.get('urlHlsAes128') or data.get('urlHls') - if m3u8_url: - formats.extend(self._extract_m3u8_formats( - m3u8_url, media_id, 'mp4', m3u8_id='hls', fatal=False)) - - fix_url = lambda x: x.replace('//rtbf-vod.', '//rtbf.') if '/geo/drm/' in x else x - http_url = data.get('url') - if formats and http_url and re.search(height_re, http_url): - http_url = fix_url(http_url) - for m3u8_f in formats[:]: - height = m3u8_f.get('height') - if not height: - continue - f = m3u8_f.copy() - del f['protocol'] - f.update({ - 'format_id': m3u8_f['format_id'].replace('hls-', 'http-'), - 'url': re.sub(height_re, '-%dp.' % height, http_url), - }) - formats.append(f) - else: - sources = data.get('sources') or {} - for key, format_id in self._QUALITIES: - format_url = sources.get(key) - if not format_url: - continue - height = int_or_none(self._search_regex( - height_re, format_url, 'height', default=None)) - formats.append({ - 'format_id': format_id, - 'url': fix_url(format_url), - 'height': height, - }) - - mpd_url = data.get('urlDash') - if mpd_url and (self.get_param('allow_unplayable_formats') or not data.get('drm')): - formats.extend(self._extract_mpd_formats( - mpd_url, media_id, mpd_id='dash', fatal=False)) - - audio_url = data.get('urlAudio') - if audio_url: - formats.append({ - 'format_id': 'audio', - 'url': audio_url, - 'vcodec': 'none', - }) - self._sort_formats(formats) - - subtitles = {} - for track in (data.get('tracks') or {}).values(): - sub_url = track.get('url') - if not sub_url: - continue - subtitles.setdefault(track.get('lang') or 'fr', []).append({ - 'url': sub_url, - }) - - return { - 'id': media_id, - 'formats': formats, - 'title': title, - 'description': strip_or_none(data.get('description')), - 'thumbnail': data.get('thumbnail'), - 'duration': float_or_none(data.get('realDuration')), - 'timestamp': int_or_none(data.get('liveFrom')), - 'series': data.get('programLabel'), - 'subtitles': subtitles, - 'is_live': is_live, - } diff --git a/hypervideo_dl/extractor/rte.py b/hypervideo_dl/extractor/rte.py index 1fbc729..aedaa5b 100644 --- a/hypervideo_dl/extractor/rte.py +++ b/hypervideo_dl/extractor/rte.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -97,8 +94,6 @@ class RteBaseIE(InfoExtractor): formats.extend(self._extract_f4m_formats( hds_url, item_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) - info_dict['formats'] = formats return info_dict diff --git a/hypervideo_dl/extractor/rtl2.py b/hypervideo_dl/extractor/rtl2.py index e291714..056cf87 100644 --- a/hypervideo_dl/extractor/rtl2.py +++ b/hypervideo_dl/extractor/rtl2.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -97,8 +94,6 @@ class RTL2IE(InfoExtractor): if m3u8_url: formats.extend(self._extract_akamai_formats(m3u8_url, display_id)) - self._sort_formats(formats) - return { 'id': display_id, 'title': title, @@ -145,7 +140,6 @@ class RTL2YouIE(RTL2YouBaseIE): raise ExtractorError('video not 
found', expected=True)

        formats = self._extract_m3u8_formats(stream_url.decode(), video_id, 'mp4', 'm3u8_native')
-        self._sort_formats(formats)

        video_data = self._download_json(
            self._BACKWERK_BASE_URL + 'video/' + video_id, video_id)
diff --git a/hypervideo_dl/extractor/rtlnl.py b/hypervideo_dl/extractor/rtlnl.py
index 9eaa06f..724cb64 100644
--- a/hypervideo_dl/extractor/rtlnl.py
+++ b/hypervideo_dl/extractor/rtlnl.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 from ..utils import (
     int_or_none,
@@ -11,6 +8,7 @@ from ..utils import (
 class RtlNlIE(InfoExtractor):
     IE_NAME = 'rtl.nl'
     IE_DESC = 'rtl.nl and rtlxl.nl'
+    _EMBED_REGEX = [r'<iframe[^>]+?\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)(?P=q1)']
     _VALID_URL = r'''(?x)
         https?://(?:(?:www|static)\.)?
         (?:
@@ -118,7 +116,6 @@ class RtlNlIE(InfoExtractor):
         formats = self._extract_m3u8_formats(
             m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False)
-        self._sort_formats(formats)

         thumbnails = []

@@ -144,3 +141,154 @@ class RtlNlIE(InfoExtractor):
             'duration': parse_duration(material.get('duration')),
             'thumbnails': thumbnails,
         }
+
+
+class RTLLuBaseIE(InfoExtractor):
+    _MEDIA_REGEX = {
+        'video': r'<rtl-player\s[^>]*\bhls\s*=\s*"([^"]+)',
+        'audio': r'<rtl-audioplayer\s[^>]*\bsrc\s*=\s*"([^"]+)',
+        'thumbnail': r'<rtl-player\s[^>]*\bposter\s*=\s*"([^"]+)',
+    }
+
+    def get_media_url(self, webpage, video_id, media_type):
+        return self._search_regex(self._MEDIA_REGEX[media_type], webpage, f'{media_type} url', default=None)
+
+    def get_formats_and_subtitles(self, webpage, video_id):
+        video_url, audio_url = self.get_media_url(webpage, video_id, 'video'), self.get_media_url(webpage, video_id, 'audio')
+
+        formats, subtitles = [], {}
+        if video_url is not None:
+            formats, subtitles = self._extract_m3u8_formats_and_subtitles(video_url, video_id)
+        if audio_url is not None:
+            formats.append({'url': audio_url, 'ext': 'mp3', 'vcodec': 'none'})
+
+        return formats, subtitles
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        is_live = video_id in ('live', 'live-2', 'lauschteren')
+
+        # TODO: extract comments from https://www.rtl.lu/comments?status=1&order=desc&context=news|article|<video_id>
+        # the context can be taken from the <rtl-comments context=<context>> element in the webpage
+        webpage = self._download_webpage(url, video_id)
+
+        formats, subtitles = self.get_formats_and_subtitles(webpage, video_id)
+
+        return {
+            'id': video_id,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage, default=None),
+            'formats': formats,
+            'subtitles': subtitles,
+            'thumbnail': self.get_media_url(webpage, video_id, 'thumbnail') or self._og_search_thumbnail(webpage, default=None),
+            'is_live': is_live,
+        }
+
+
+class RTLLuTeleVODIE(RTLLuBaseIE):
+    IE_NAME = 'rtl.lu:tele-vod'
+    _VALID_URL = r'https?://(?:www\.)?rtl\.lu/(tele/(?P<slug>[\w-]+)/v/|video/)(?P<id>\d+)(\.html)?'
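RTLLuBaseIE above finds media by scraping the site's custom <rtl-player>/<rtl-audioplayer> elements rather than calling an API. A quick self-contained check of those patterns; the regexes are copied from _MEDIA_REGEX, while the HTML snippet is invented for illustration and not captured from rtl.lu:

import re

# Patterns as defined in RTLLuBaseIE._MEDIA_REGEX above
MEDIA_REGEX = {
    'video': r'<rtl-player\s[^>]*\bhls\s*=\s*"([^"]+)',
    'audio': r'<rtl-audioplayer\s[^>]*\bsrc\s*=\s*"([^"]+)',
    'thumbnail': r'<rtl-player\s[^>]*\bposter\s*=\s*"([^"]+)',
}

# Hypothetical markup, loosely modelled on what the extractor expects
sample = ('<rtl-player poster="https://example.invalid/poster.jpg" '
          'hls="https://example.invalid/master.m3u8"></rtl-player>')

for kind, pattern in MEDIA_REGEX.items():
    match = re.search(pattern, sample)
    print(kind, match.group(1) if match else None)
# video https://example.invalid/master.m3u8
# audio None
# thumbnail https://example.invalid/poster.jpg

An audio-only page instead carries an <rtl-audioplayer src="..."> element, which get_formats_and_subtitles maps to a single vcodec='none' MP3 format.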
+ _TESTS = [{ + 'url': 'https://www.rtl.lu/tele/de-journal-vun-der-tele/v/3266757.html', + 'info_dict': { + 'id': '3266757', + 'title': 'Informatiounsversammlung Héichwaasser', + 'ext': 'mp4', + 'thumbnail': 'https://replay-assets.rtl.lu/2021/11/16/d3647fc4-470d-11ec-adc2-3a00abd6e90f_00008.jpg', + 'description': 'md5:b1db974408cc858c9fd241812e4a2a14', + } + }, { + 'url': 'https://www.rtl.lu/video/3295215', + 'info_dict': { + 'id': '3295215', + 'title': 'Kulturassisen iwwer d\'Bestandsopnam vum Lëtzebuerger Konscht', + 'ext': 'mp4', + 'thumbnail': 'https://replay-assets.rtl.lu/2022/06/28/0000_3295215_0000.jpg', + 'description': 'md5:85bcd4e0490aa6ec969d9bf16927437b', + } + }] + + +class RTLLuArticleIE(RTLLuBaseIE): + IE_NAME = 'rtl.lu:article' + _VALID_URL = r'https?://(?:(www|5minutes|today)\.)rtl\.lu/(?:[\w-]+)/(?:[\w-]+)/a/(?P<id>\d+)\.html' + _TESTS = [{ + # Audio-only + 'url': 'https://www.rtl.lu/sport/news/a/1934360.html', + 'info_dict': { + 'id': '1934360', + 'ext': 'mp3', + 'thumbnail': 'https://static.rtl.lu/rtl2008.lu/nt/p/2022/06/28/19/e4b37d66ddf00bab4c45617b91a5bb9b.jpeg', + 'description': 'md5:5eab4a2a911c1fff7efc1682a38f9ef7', + 'title': 'md5:40aa85f135578fbd549d3c9370321f99', + } + }, { + # 5minutes + 'url': 'https://5minutes.rtl.lu/espace-frontaliers/frontaliers-en-questions/a/1853173.html', + 'info_dict': { + 'id': '1853173', + 'ext': 'mp4', + 'description': 'md5:ac031da0740e997a5cf4633173634fee', + 'title': 'md5:87e17722ed21af0f24be3243f4ec0c46', + 'thumbnail': 'https://replay-assets.rtl.lu/2022/01/26/screenshot_20220126104933_3274749_12b249833469b0d6e4440a1dec83cdfa.jpg', + } + }, { + # today.lu + 'url': 'https://today.rtl.lu/entertainment/news/a/1936203.html', + 'info_dict': { + 'id': '1936203', + 'ext': 'mp4', + 'title': 'Once Upon A Time...zu Lëtzebuerg: The Three Witches\' Tower', + 'description': 'The witchy theme continues in the latest episode of Once Upon A Time...', + 'thumbnail': 'https://replay-assets.rtl.lu/2022/07/02/screenshot_20220702122859_3290019_412dc5185951b7f6545a4039c8be9235.jpg', + } + }] + + +class RTLLuLiveIE(RTLLuBaseIE): + _VALID_URL = r'https?://www\.rtl\.lu/(?:tele|radio)/(?P<id>live(?:-\d+)?|lauschteren)' + _TESTS = [{ + # Tele:live + 'url': 'https://www.rtl.lu/tele/live', + 'info_dict': { + 'id': 'live', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'thumbnail': 'https://static.rtl.lu/livestream/channel1.jpg', + } + }, { + # Tele:live-2 + 'url': 'https://www.rtl.lu/tele/live-2', + 'info_dict': { + 'id': 'live-2', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': r're:RTL - Télé LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'thumbnail': 'https://static.rtl.lu/livestream/channel2.jpg', + } + }, { + # Radio:lauschteren + 'url': 'https://www.rtl.lu/radio/lauschteren', + 'info_dict': { + 'id': 'lauschteren', + 'ext': 'mp4', + 'live_status': 'is_live', + 'title': r're:RTL - Radio LIVE \d{4}-\d{2}-\d{2} \d{2}:\d{2}', + 'thumbnail': 'https://static.rtl.lu/livestream/rtlradiowebtv.jpg', + } + }] + + +class RTLLuRadioIE(RTLLuBaseIE): + _VALID_URL = r'https?://www\.rtl\.lu/radio/(?:[\w-]+)/s/(?P<id>\d+)(\.html)?' 
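All four rtl.lu extractors added in this file share RTLLuBaseIE._real_extract and differ only in their _VALID_URL. A small sanity check of the (?P<id>...) groups, run against URLs taken from the _TESTS entries above and below:

import re

# _VALID_URL patterns copied from the four classes in this diff
PATTERNS = {
    'tele-vod': r'https?://(?:www\.)?rtl\.lu/(tele/(?P<slug>[\w-]+)/v/|video/)(?P<id>\d+)(\.html)?',
    'article': r'https?://(?:(www|5minutes|today)\.)rtl\.lu/(?:[\w-]+)/(?:[\w-]+)/a/(?P<id>\d+)\.html',
    'live': r'https?://www\.rtl\.lu/(?:tele|radio)/(?P<id>live(?:-\d+)?|lauschteren)',
    'radio': r'https?://www\.rtl\.lu/radio/(?:[\w-]+)/s/(?P<id>\d+)(\.html)?',
}

# URLs from the accompanying tests
CASES = {
    'tele-vod': 'https://www.rtl.lu/tele/de-journal-vun-der-tele/v/3266757.html',
    'article': 'https://www.rtl.lu/sport/news/a/1934360.html',
    'live': 'https://www.rtl.lu/radio/lauschteren',
    'radio': 'https://www.rtl.lu/radio/5-vir-12/s/4033058.html',
}

for name, url in CASES.items():
    print(name, re.match(PATTERNS[name], url).group('id'))
# tele-vod 3266757, article 1934360, live lauschteren, radio 4033058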
+ _TESTS = [{ + 'url': 'https://www.rtl.lu/radio/5-vir-12/s/4033058.html', + 'info_dict': { + 'id': '4033058', + 'ext': 'mp3', + 'description': 'md5:f855a4f3e3235393ae47ed1db5d934b9', + 'title': '5 vir 12 - Stau um Stau', + 'thumbnail': 'https://static.rtl.lu/rtlg//2022/06/24/c9c19e5694a14be46a3647a3760e1f62.jpg', + } + }] diff --git a/hypervideo_dl/extractor/rtnews.py b/hypervideo_dl/extractor/rtnews.py index 68b6044..6be9945 100644 --- a/hypervideo_dl/extractor/rtnews.py +++ b/hypervideo_dl/extractor/rtnews.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/rtp.py b/hypervideo_dl/extractor/rtp.py index c165ade..5928a20 100644 --- a/hypervideo_dl/extractor/rtp.py +++ b/hypervideo_dl/extractor/rtp.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import js_to_json import re diff --git a/hypervideo_dl/extractor/rtrfm.py b/hypervideo_dl/extractor/rtrfm.py index 93d51e8..7381d82 100644 --- a/hypervideo_dl/extractor/rtrfm.py +++ b/hypervideo_dl/extractor/rtrfm.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/rts.py b/hypervideo_dl/extractor/rts.py index 865a730..81c4d7c 100644 --- a/hypervideo_dl/extractor/rts.py +++ b/hypervideo_dl/extractor/rts.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .srgssr import SRGSSRIE @@ -15,7 +12,7 @@ from ..utils import ( ) -class RTSIE(SRGSSRIE): +class RTSIE(SRGSSRIE): # XXX: Do not subclass from concrete IE IE_DESC = 'RTS.ch' _VALID_URL = r'rts:(?P<rts_id>\d+)|https?://(?:.+?\.)?rts\.ch/(?:[^/]+/){2,}(?P<id>[0-9]+)-(?P<display_id>.+?)\.html' @@ -215,7 +212,6 @@ class RTSIE(SRGSSRIE): }) self._check_formats(formats, media_id) - self._sort_formats(formats) duration = info.get('duration') or info.get('cutout') or info.get('cutduration') if isinstance(duration, compat_str): diff --git a/hypervideo_dl/extractor/rtve.py b/hypervideo_dl/extractor/rtve.py index 7a1dc6f..a99a266 100644 --- a/hypervideo_dl/extractor/rtve.py +++ b/hypervideo_dl/extractor/rtve.py @@ -1,18 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import io -import sys +import struct from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_struct_unpack, -) +from ..compat import compat_b64decode from ..utils import ( - determine_ext, ExtractorError, + determine_ext, float_or_none, qualities, remove_end, @@ -20,8 +14,6 @@ from ..utils import ( try_get, ) -_bytes_to_chr = (lambda x: x) if sys.version_info[0] == 2 else (lambda x: map(chr, x)) - class RTVEALaCartaIE(InfoExtractor): IE_NAME = 'rtve.es:alacarta' @@ -79,7 +71,7 @@ class RTVEALaCartaIE(InfoExtractor): def _decrypt_url(png): encrypted_data = io.BytesIO(compat_b64decode(png)[8:]) while True: - length = compat_struct_unpack('!I', encrypted_data.read(4))[0] + length = struct.unpack('!I', encrypted_data.read(4))[0] chunk_type = encrypted_data.read(4) if chunk_type == b'IEND': break @@ -90,7 +82,7 @@ class RTVEALaCartaIE(InfoExtractor): alphabet = [] e = 0 d = 0 - for l in _bytes_to_chr(alphabet_data): + for l in alphabet_data.decode('iso-8859-1'): if d == 0: alphabet.append(l) d = e = (e + 1) % 4 @@ -100,7 +92,7 @@ class RTVEALaCartaIE(InfoExtractor): f = 0 e = 3 b = 1 - for letter in _bytes_to_chr(url_data): + for letter in url_data.decode('iso-8859-1'): if f == 0: l = 
int(letter) * 10 f = 1 @@ -138,7 +130,6 @@ class RTVEALaCartaIE(InfoExtractor): 'quality': q(quality), 'url': video_url, }) - self._sort_formats(formats) return formats def _real_extract(self, url): @@ -178,7 +169,7 @@ class RTVEALaCartaIE(InfoExtractor): for s in subs) -class RTVEAudioIE(RTVEALaCartaIE): +class RTVEAudioIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE IE_NAME = 'rtve.es:audio' IE_DESC = 'RTVE audio' _VALID_URL = r'https?://(?:www\.)?rtve\.es/(alacarta|play)/audios/[^/]+/[^/]+/(?P<id>[0-9]+)' @@ -246,7 +237,6 @@ class RTVEAudioIE(RTVEALaCartaIE): 'quality': q(quality), 'url': audio_url, }) - self._sort_formats(formats) return formats def _real_extract(self, url): @@ -265,7 +255,7 @@ class RTVEAudioIE(RTVEALaCartaIE): } -class RTVEInfantilIE(RTVEALaCartaIE): +class RTVEInfantilIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE IE_NAME = 'rtve.es:infantil' IE_DESC = 'RTVE infantil' _VALID_URL = r'https?://(?:www\.)?rtve\.es/infantil/serie/[^/]+/video/[^/]+/(?P<id>[0-9]+)/' @@ -284,7 +274,7 @@ class RTVEInfantilIE(RTVEALaCartaIE): }] -class RTVELiveIE(RTVEALaCartaIE): +class RTVELiveIE(RTVEALaCartaIE): # XXX: Do not subclass from concrete IE IE_NAME = 'rtve.es:live' IE_DESC = 'RTVE.es live streams' _VALID_URL = r'https?://(?:www\.)?rtve\.es/directo/(?P<id>[a-zA-Z0-9-]+)' diff --git a/hypervideo_dl/extractor/rtvnh.py b/hypervideo_dl/extractor/rtvnh.py index 6a00f70..7c61744 100644 --- a/hypervideo_dl/extractor/rtvnh.py +++ b/hypervideo_dl/extractor/rtvnh.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ExtractorError @@ -52,7 +49,6 @@ class RTVNHIE(InfoExtractor): formats.extend(self._extract_f4m_formats( http_base_url + '/manifest.f4m', video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/rtvs.py b/hypervideo_dl/extractor/rtvs.py index 3ea0f18..a84a78d 100644 --- a/hypervideo_dl/extractor/rtvs.py +++ b/hypervideo_dl/extractor/rtvs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -75,7 +72,6 @@ class RTVSIE(InfoExtractor): formats = [{'url': traverse_obj(data, ('playlist', 0, 'sources', 0, 'src'))}] else: formats = self._extract_m3u8_formats(traverse_obj(data, ('playlist', 0, 'sources', 0, 'src')), video_id) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/rtvslo.py b/hypervideo_dl/extractor/rtvslo.py new file mode 100644 index 0000000..05942b6 --- /dev/null +++ b/hypervideo_dl/extractor/rtvslo.py @@ -0,0 +1,150 @@ +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + parse_duration, + traverse_obj, + unified_timestamp, + url_or_none, +) + + +class RTVSLOIE(InfoExtractor): + IE_NAME = 'rtvslo.si' + _VALID_URL = r'''(?x) + https?://(?: + (?:365|4d)\.rtvslo.si/arhiv/[^/?#&;]+| + (?:www\.)?rtvslo\.si/rtv365/arhiv + )/(?P<id>\d+)''' + _GEO_COUNTRIES = ['SI'] + + _API_BASE = 'https://api.rtvslo.si/ava/{}/{}?client_id=82013fb3a531d5414f478747c1aca622' + SUB_LANGS_MAP = {'Slovenski': 'sl'} + + _TESTS = [ + { + 'url': 'https://www.rtvslo.si/rtv365/arhiv/174842550?s=tv', + 'info_dict': { + 'id': '174842550', + 'ext': 'flv', + 'release_timestamp': 1643140032, + 'upload_date': '20220125', + 'series': 'Dnevnik', + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/92/dnevnik_3_wide2.jpg', + 'description': 'md5:76a18692757aeb8f0f51221106277dd2', + 
'timestamp': 1643137046, + 'title': 'Dnevnik', + 'series_id': '92', + 'release_date': '20220125', + 'duration': 1789, + }, + }, { + 'url': 'https://365.rtvslo.si/arhiv/utrip/174843754', + 'info_dict': { + 'id': '174843754', + 'ext': 'mp4', + 'series_id': '94', + 'release_date': '20220129', + 'timestamp': 1643484455, + 'title': 'Utrip', + 'duration': 813, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/94/utrip_1_wide2.jpg', + 'description': 'md5:77f2892630c7b17bb7a5bb84319020c9', + 'release_timestamp': 1643485825, + 'upload_date': '20220129', + 'series': 'Utrip', + }, + }, { + 'url': 'https://365.rtvslo.si/arhiv/il-giornale-della-sera/174844609', + 'info_dict': { + 'id': '174844609', + 'ext': 'mp3', + 'series_id': '106615841', + 'title': 'Il giornale della sera', + 'duration': 1328, + 'series': 'Il giornale della sera', + 'timestamp': 1643743800, + 'release_timestamp': 1643745424, + 'thumbnail': 'https://img.rtvcdn.si/_up/ava/ava_misc/show_logos/il-giornale-della-sera_wide2.jpg', + 'upload_date': '20220201', + 'tbr': 128000, + 'release_date': '20220201', + }, + + }, { + 'url': 'https://4d.rtvslo.si/arhiv/dnevnik/174842550', + 'only_matching': True + } + ] + + def _real_extract(self, url): + v_id = self._match_id(url) + meta = self._download_json(self._API_BASE.format('getRecordingDrm', v_id), v_id)['response'] + + thumbs = [{'id': k, 'url': v, 'http_headers': {'Accept': 'image/jpeg'}} + for k, v in (meta.get('images') or {}).items()] + + subs = {} + for s in traverse_obj(meta, 'subs', 'subtitles', default=[]): + lang = self.SUB_LANGS_MAP.get(s.get('language'), s.get('language') or 'und') + subs.setdefault(lang, []).append({ + 'url': s.get('file'), + 'ext': traverse_obj(s, 'format', expected_type=str.lower), + }) + + jwt = meta.get('jwt') + if not jwt: + raise ExtractorError('Site did not provide an authentication token, cannot proceed.') + + media = self._download_json(self._API_BASE.format('getMedia', v_id), v_id, query={'jwt': jwt})['response'] + + formats = [] + adaptive_url = traverse_obj(media, ('addaptiveMedia', 'hls_sec'), expected_type=url_or_none) + if adaptive_url: + formats = self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=['smil']) + + adaptive_url = traverse_obj(media, ('addaptiveMedia_sl', 'hls_sec'), expected_type=url_or_none) + if adaptive_url: + for f in self._extract_wowza_formats(adaptive_url, v_id, skip_protocols=['smil']): + formats.append({ + **f, + 'format_id': 'sign-' + f['format_id'], + 'format_note': 'Sign language interpretation', 'preference': -10, + 'language': ( + 'slv' if f.get('language') == 'eng' and f.get('acodec') != 'none' + else f.get('language')) + }) + + formats.extend( + { + 'url': f['streams'][strm], + 'ext': traverse_obj(f, 'mediaType', expected_type=str.lower), + 'width': f.get('width'), + 'height': f.get('height'), + 'tbr': f.get('bitrate'), + 'filesize': f.get('filesize'), + } + for strm in ('http', 'https') + for f in media.get('mediaFiles') or [] + if traverse_obj(f, ('streams', strm)) + ) + + if any('intermission.mp4' in x['url'] for x in formats): + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + if any('dummy_720p.mp4' in x.get('manifest_url', '') for x in formats) and meta.get('stub') == 'error': + raise ExtractorError(f'{self.IE_NAME} said: Clip not available', expected=True) + + return { + 'id': v_id, + 'webpage_url': ''.join(traverse_obj(meta, ('canonical', ('domain', 'path')))), + 'title': meta.get('title'), + 'formats': formats, + 'subtitles': subs, + 'thumbnails': 
thumbs, + 'description': meta.get('description'), + 'timestamp': unified_timestamp(traverse_obj(meta, 'broadcastDate', ('broadcastDates', 0))), + 'release_timestamp': unified_timestamp(meta.get('recordingDate')), + 'duration': meta.get('duration') or parse_duration(meta.get('length')), + 'tags': meta.get('genre'), + 'series': meta.get('showName'), + 'series_id': meta.get('showId'), + } diff --git a/hypervideo_dl/extractor/ruhd.py b/hypervideo_dl/extractor/ruhd.py index 3c8053a..abaa3f9 100644 --- a/hypervideo_dl/extractor/ruhd.py +++ b/hypervideo_dl/extractor/ruhd.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/rule34video.py b/hypervideo_dl/extractor/rule34video.py index a602a9f..9d15f4d 100644 --- a/hypervideo_dl/extractor/rule34video.py +++ b/hypervideo_dl/extractor/rule34video.py @@ -1,5 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals import re from ..utils import parse_duration @@ -53,8 +51,6 @@ class Rule34VideoIE(InfoExtractor): thumbnail = self._html_search_regex(r'preview_url:\s+\'([^\']+)\'', webpage, 'thumbnail', default=None) duration = self._html_search_regex(r'"icon-clock"></i>\s+<span>((?:\d+:?)+)', webpage, 'duration', default=None) - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, diff --git a/hypervideo_dl/extractor/rumble.py b/hypervideo_dl/extractor/rumble.py index a0d5f88..102615c 100644 --- a/hypervideo_dl/extractor/rumble.py +++ b/hypervideo_dl/extractor/rumble.py @@ -1,16 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import re from .common import InfoExtractor -from ..compat import compat_str, compat_HTTPError +from ..compat import compat_HTTPError from ..utils import ( - determine_ext, int_or_none, parse_iso8601, - try_get, + traverse_obj, unescapeHTML, ExtractorError, ) @@ -18,6 +14,7 @@ from ..utils import ( class RumbleEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)' + _EMBED_REGEX = [fr'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'https://rumble.com/embed/v5pv5f', 'md5': '36a18a049856720189f30977ccbb2c34', @@ -27,6 +24,12 @@ class RumbleEmbedIE(InfoExtractor): 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm', 'timestamp': 1571611968, 'upload_date': '20191020', + 'channel_url': 'https://rumble.com/c/WMAR', + 'channel': 'WMAR', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/M/z/1/5Mz1a.OvCc-small-WMAR-2-News-Latest-Headline.jpg', + 'duration': 234, + 'uploader': 'WMAR', + 'live_status': 'not_live', } }, { 'url': 'https://rumble.com/embed/vslb7v', @@ -41,56 +44,194 @@ class RumbleEmbedIE(InfoExtractor): 'channel': 'CTNews', 'thumbnail': 'https://sp.rmbl.ws/s8/6/7/i/9/h/7i9hd.OvCc.jpg', 'duration': 901, + 'uploader': 'CTNews', + 'live_status': 'not_live', } }, { + 'url': 'https://rumble.com/embed/vunh1h', + 'info_dict': { + 'id': 'vunh1h', + 'ext': 'mp4', + 'title': '‘Gideon, op zoek naar de waarheid’ including ENG SUBS', + 'timestamp': 1647197663, + 'upload_date': '20220313', + 'channel_url': 'https://rumble.com/user/BLCKBX', + 'channel': 'BLCKBX', + 'thumbnail': r're:https://.+\.jpg', + 'duration': 5069, + 'uploader': 'BLCKBX', + 'live_status': 'not_live', + 'subtitles': { + 'en': [ + { + 'url': r're:https://.+\.vtt', + 'name': 'English', + 'ext': 'vtt' + } + ] + }, + }, + 'params': {'skip_download': True} + }, { + 'url': 
'https://rumble.com/embed/v1essrt', + 'info_dict': { + 'id': 'v1essrt', + 'ext': 'mp4', + 'title': 'startswith:lofi hip hop radio - beats to relax/study', + 'timestamp': 1661519399, + 'upload_date': '20220826', + 'channel_url': 'https://rumble.com/c/LofiGirl', + 'channel': 'Lofi Girl', + 'thumbnail': r're:https://.+\.jpg', + 'duration': None, + 'uploader': 'Lofi Girl', + 'live_status': 'is_live', + }, + 'params': {'skip_download': True} + }, { + 'url': 'https://rumble.com/embed/v1amumr', + 'info_dict': { + 'id': 'v1amumr', + 'ext': 'webm', + 'fps': 60, + 'title': 'Turning Point USA 2022 Student Action Summit DAY 1 - Rumble Exclusive Live', + 'timestamp': 1658518457, + 'upload_date': '20220722', + 'channel_url': 'https://rumble.com/c/RumbleEvents', + 'channel': 'Rumble Events', + 'thumbnail': r're:https://.+\.jpg', + 'duration': 16427, + 'uploader': 'Rumble Events', + 'live_status': 'was_live', + }, + 'params': {'skip_download': True} + }, { 'url': 'https://rumble.com/embed/ufe9n.v5pv5f', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'(?:<(?:script|iframe)[^>]+\bsrc=|["\']embedUrl["\']\s*:\s*)["\'](?P<url>%s)' % RumbleEmbedIE._VALID_URL, - webpage)] + _WEBPAGE_TESTS = [ + { + 'note': 'Rumble embed', + 'url': 'https://rumble.com/vdmum1-moose-the-dog-helps-girls-dig-a-snow-fort.html', + 'md5': '53af34098a7f92c4e51cf0bd1c33f009', + 'info_dict': { + 'id': 'vb0ofn', + 'ext': 'mp4', + 'timestamp': 1612662578, + 'uploader': 'LovingMontana', + 'channel': 'LovingMontana', + 'upload_date': '20210207', + 'title': 'Winter-loving dog helps girls dig a snow fort ', + 'channel_url': 'https://rumble.com/c/c-546523', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/5/f/x/x/5fxxb.OvCc.1-small-Moose-The-Dog-Helps-Girls-D.jpg', + 'duration': 103, + 'live_status': 'not_live', + } + }, + { + 'note': 'Rumble JS embed', + 'url': 'https://therightscoop.com/what-does-9-plus-1-plus-1-equal-listen-to-this-audio-of-attempted-kavanaugh-assassins-call-and-youll-get-it', + 'md5': '4701209ac99095592e73dbba21889690', + 'info_dict': { + 'id': 'v15eqxl', + 'ext': 'mp4', + 'channel': 'Mr Producer Media', + 'duration': 92, + 'title': '911 Audio From The Man Who Wanted To Kill Supreme Court Justice Kavanaugh', + 'channel_url': 'https://rumble.com/c/RichSementa', + 'thumbnail': 'https://sp.rmbl.ws/s8/1/P/j/f/A/PjfAe.OvCc-small-911-Audio-From-The-Man-Who-.jpg', + 'timestamp': 1654892716, + 'uploader': 'Mr Producer Media', + 'upload_date': '20220610', + 'live_status': 'not_live', + } + }, + ] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + embeds = tuple(super()._extract_embed_urls(url, webpage)) + if embeds: + return embeds + return [f'https://rumble.com/embed/{mobj.group("id")}' for mobj in re.finditer( + r'<script>\s*Rumble\(\s*"play"\s*,\s*{\s*[\'"]video[\'"]\s*:\s*[\'"](?P<id>[0-9a-z]+)[\'"]', webpage)] def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - 'https://rumble.com/embedJS/', video_id, - query={'request': 'video', 'v': video_id}) - title = unescapeHTML(video['title']) + 'https://rumble.com/embedJS/u3/', video_id, + query={'request': 'video', 'ver': 2, 'v': video_id}) + + sys_msg = traverse_obj(video, ('sys', 'msg')) + if sys_msg: + self.report_warning(sys_msg, video_id=video_id) + + if video.get('live') == 0: + live_status = 'not_live' if video.get('livestream_has_dvr') is None else 'was_live' + elif video.get('live') == 1: + live_status = 'is_upcoming' if 
video.get('livestream_has_dvr') else 'was_live' + elif video.get('live') == 2: + live_status = 'is_live' + else: + live_status = None formats = [] - for height, ua in (video.get('ua') or {}).items(): - for i in range(2): - f_url = try_get(ua, lambda x: x[i], compat_str) - if f_url: - ext = determine_ext(f_url) - f = { - 'ext': ext, - 'format_id': '%s-%sp' % (ext, height), - 'height': int_or_none(height), - 'url': f_url, - } - bitrate = try_get(ua, lambda x: x[i + 2]['bitrate']) - if bitrate: - f['tbr'] = int_or_none(bitrate) - formats.append(f) - self._sort_formats(formats) + for ext, ext_info in (video.get('ua') or {}).items(): + for height, video_info in (ext_info or {}).items(): + meta = video_info.get('meta') or {} + if not video_info.get('url'): + continue + if ext == 'hls': + if meta.get('live') is True and video.get('live') == 1: + live_status = 'post_live' + formats.extend(self._extract_m3u8_formats( + video_info['url'], video_id, + ext='mp4', m3u8_id='hls', fatal=False, live=live_status == 'is_live')) + continue + formats.append({ + 'ext': ext, + 'url': video_info['url'], + 'format_id': '%s-%sp' % (ext, height), + 'height': int_or_none(height), + 'fps': video.get('fps'), + **traverse_obj(meta, { + 'tbr': 'bitrate', + 'filesize': 'size', + 'width': 'w', + 'height': 'h', + }, default={}) + }) + + subtitles = { + lang: [{ + 'url': sub_info['path'], + 'name': sub_info.get('language') or '', + }] for lang, sub_info in (video.get('cc') or {}).items() if sub_info.get('path') + } author = video.get('author') or {} + thumbnails = traverse_obj(video, ('t', ..., {'url': 'i', 'width': 'w', 'height': 'h'})) + if not thumbnails and video.get('i'): + thumbnails = [{'url': video['i']}] + + if live_status in {'is_live', 'post_live'}: + duration = None + else: + duration = int_or_none(video.get('duration')) return { 'id': video_id, - 'title': title, + 'title': unescapeHTML(video.get('title')), 'formats': formats, - 'thumbnail': video.get('i'), + 'subtitles': subtitles, + 'thumbnails': thumbnails, 'timestamp': parse_iso8601(video.get('pubDate')), 'channel': author.get('name'), 'channel_url': author.get('url'), - 'duration': int_or_none(video.get('duration')), + 'duration': duration, + 'uploader': author.get('name'), + 'live_status': live_status, } @@ -105,7 +246,7 @@ class RumbleChannelIE(InfoExtractor): }, }, { 'url': 'https://rumble.com/user/goldenpoodleharleyeuna', - 'playlist_count': 4, + 'playlist_mincount': 4, 'info_dict': { 'id': 'goldenpoodleharleyeuna', }, diff --git a/hypervideo_dl/extractor/rutube.py b/hypervideo_dl/extractor/rutube.py index 2f753b4..5a4fd97 100644 --- a/hypervideo_dl/extractor/rutube.py +++ b/hypervideo_dl/extractor/rutube.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re import itertools from .common import InfoExtractor @@ -85,7 +81,6 @@ class RutubeBaseIE(InfoExtractor): 'url': format_url, 'format_id': format_id, }) - self._sort_formats(formats) return formats def _download_and_extract_formats(self, video_id, query=None): @@ -97,6 +92,7 @@ class RutubeIE(RutubeBaseIE): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/(?:play/)?embed/[\da-z]{32}.*?)\1'] _TESTS = [{ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', @@ -131,12 +127,6 @@ class RutubeIE(RutubeBaseIE): def suitable(cls, url): return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, 
cls).suitable(url) - @staticmethod - def _extract_urls(webpage): - return [mobj.group('url') for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) info = self._download_and_extract_info(video_id) @@ -249,7 +239,6 @@ class RutubeMovieIE(RutubePlaylistBaseIE): IE_NAME = 'rutube:movie' IE_DESC = 'Rutube movies' _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)' - _TESTS = [] _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json' _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json' diff --git a/hypervideo_dl/extractor/rutv.py b/hypervideo_dl/extractor/rutv.py index 0ea8253..d7f9a73 100644 --- a/hypervideo_dl/extractor/rutv.py +++ b/hypervideo_dl/extractor/rutv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -23,6 +20,10 @@ class RUTVIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_URLS = [ + r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', + r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', + ] _TESTS = [ { @@ -110,19 +111,6 @@ class RUTVIE(InfoExtractor): }, ] - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage) - if mobj: - return mobj.group('url') - - mobj = re.search( - r'<meta[^>]+?property=(["\'])og:video\1[^>]+?content=(["\'])(?P<url>https?://(?:test)?player\.(?:rutv\.ru|vgtrk\.com)/flash\d+v/container\.swf\?id=.+?\2)', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -153,7 +141,7 @@ class RUTVIE(InfoExtractor): if media['errors']: raise ExtractorError('%s said: %s' % (self.IE_NAME, media['errors']), expected=True) - view_count = playlist.get('count_views') + view_count = int_or_none(playlist.get('count_views')) priority_transport = playlist['priority_transport'] thumbnail = media['picture'] @@ -164,6 +152,7 @@ class RUTVIE(InfoExtractor): duration = int_or_none(media.get('duration')) formats = [] + subtitles = {} for transport, links in media['sources'].items(): for quality, url in links.items(): @@ -183,8 +172,10 @@ class RUTVIE(InfoExtractor): 'vbr': str_to_int(quality), } elif transport == 'm3u8': - formats.extend(self._extract_m3u8_formats( - url, video_id, 'mp4', quality=preference, m3u8_id='hls')) + fmt, subs = self._extract_m3u8_formats_and_subtitles( + url, video_id, 'mp4', quality=preference, m3u8_id='hls') + formats.extend(fmt) + self._merge_subtitles(subs, target=subtitles) continue else: fmt = { @@ -198,8 +189,6 @@ class RUTVIE(InfoExtractor): }) formats.append(fmt) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -208,5 +197,7 @@ class RUTVIE(InfoExtractor): 'view_count': view_count, 'duration': duration, 'formats': formats, + 'subtitles': subtitles, 'is_live': is_live, + '_format_sort_fields': ('source', ), } diff --git a/hypervideo_dl/extractor/ruutu.py b/hypervideo_dl/extractor/ruutu.py index 5a30e33..33f6652 100644 --- a/hypervideo_dl/extractor/ruutu.py +++ b/hypervideo_dl/extractor/ruutu.py @@ -1,6 +1,3 @@ -# coding: 
utf-8 -from __future__ import unicode_literals - import json import re @@ -41,6 +38,7 @@ class RuutuIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 114, 'age_limit': 0, + 'upload_date': '20150508', }, }, { @@ -54,6 +52,9 @@ class RuutuIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 40, 'age_limit': 0, + 'upload_date': '20150507', + 'series': 'Superpesis', + 'categories': ['Urheilu'], }, }, { @@ -66,6 +67,8 @@ class RuutuIE(InfoExtractor): 'description': 'md5:7d90f358c47542e3072ff65d7b1bcffe', 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 0, + 'upload_date': '20151012', + 'series': 'Läpivalaisu', }, }, # Episode where <SourceFile> is "NOT-USED", but has other @@ -85,6 +88,9 @@ class RuutuIE(InfoExtractor): 'description': 'md5:bbb6963df17dfd0ecd9eb9a61bf14b52', 'thumbnail': r're:^https?://.*\.jpg$', 'age_limit': 0, + 'upload_date': '20190320', + 'series': 'Mysteeritarinat', + 'duration': 1324, }, 'expected_warnings': [ 'HTTP Error 502: Bad Gateway', @@ -129,14 +135,30 @@ class RuutuIE(InfoExtractor): _API_BASE = 'https://gatling.nelonenmedia.fi' @classmethod - def _extract_url(cls, webpage): + def _extract_embed_urls(cls, url, webpage): + # nelonen.fi settings = try_call( lambda: json.loads(re.search( r'jQuery\.extend\(Drupal\.settings, ({.+?})\);', webpage).group(1), strict=False)) - video_id = traverse_obj(settings, ( - 'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value')) - if video_id: - return f'http://www.ruutu.fi/video/{video_id}' + if settings: + video_id = traverse_obj(settings, ( + 'mediaCrossbowSettings', 'file', 'field_crossbow_video_id', 'und', 0, 'value')) + if video_id: + return [f'http://www.ruutu.fi/video/{video_id}'] + # hs.fi and is.fi + settings = try_call( + lambda: json.loads(re.search( + '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', + webpage).group(1), strict=False)) + if settings: + video_ids = set(traverse_obj(settings, ( + 'props', 'pageProps', 'page', 'assetData', 'splitBody', ..., 'video', 'sourceId')) or []) + if video_ids: + return [f'http://www.ruutu.fi/video/{v}' for v in video_ids] + video_id = traverse_obj(settings, ( + 'props', 'pageProps', 'page', 'assetData', 'mainVideo', 'sourceId')) + if video_id: + return [f'http://www.ruutu.fi/video/{video_id}'] def _real_extract(self, url): video_id = self._match_id(url) @@ -209,10 +231,10 @@ class RuutuIE(InfoExtractor): extract_formats(video_xml.find('./Clip')) def pv(name): - node = find_xpath_attr( - video_xml, './Clip/PassthroughVariables/variable', 'name', name) - if node is not None: - return node.get('value') + value = try_call(lambda: find_xpath_attr( + video_xml, './Clip/PassthroughVariables/variable', 'name', name).get('value')) + if value != 'NA': + return value or None if not formats: if (not self.get_param('allow_unplayable_formats') @@ -222,8 +244,6 @@ class RuutuIE(InfoExtractor): if ns_st_cds != 'free': raise ExtractorError('This video is %s.' 
% ns_st_cds, expected=True) - self._sort_formats(formats) - themes = pv('themes') return { @@ -237,6 +257,6 @@ class RuutuIE(InfoExtractor): 'series': pv('series_name'), 'season_number': int_or_none(pv('season_number')), 'episode_number': int_or_none(pv('episode_number')), - 'categories': themes.split(',') if themes else [], + 'categories': themes.split(',') if themes else None, 'formats': formats, } diff --git a/hypervideo_dl/extractor/ruv.py b/hypervideo_dl/extractor/ruv.py index d806ed0..12499d6 100644 --- a/hypervideo_dl/extractor/ruv.py +++ b/hypervideo_dl/extractor/ruv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, diff --git a/hypervideo_dl/extractor/safari.py b/hypervideo_dl/extractor/safari.py index 7b4571d..450a661 100644 --- a/hypervideo_dl/extractor/safari.py +++ b/hypervideo_dl/extractor/safari.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re diff --git a/hypervideo_dl/extractor/saitosan.py b/hypervideo_dl/extractor/saitosan.py index 621335c..d2f60e9 100644 --- a/hypervideo_dl/extractor/saitosan.py +++ b/hypervideo_dl/extractor/saitosan.py @@ -1,7 +1,3 @@ -# coding: utf-8 - -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ExtractorError, try_get diff --git a/hypervideo_dl/extractor/samplefocus.py b/hypervideo_dl/extractor/samplefocus.py index 806c3c3..e9f5c22 100644 --- a/hypervideo_dl/extractor/samplefocus.py +++ b/hypervideo_dl/extractor/samplefocus.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/sapo.py b/hypervideo_dl/extractor/sapo.py index df202a3..beffaee 100644 --- a/hypervideo_dl/extractor/sapo.py +++ b/hypervideo_dl/extractor/sapo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -101,8 +98,6 @@ class SapoIE(InfoExtractor): 'height': 720, }) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/savefrom.py b/hypervideo_dl/extractor/savefrom.py index 98efdc2..9c9e74b 100644 --- a/hypervideo_dl/extractor/savefrom.py +++ b/hypervideo_dl/extractor/savefrom.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import os.path from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/sbs.py b/hypervideo_dl/extractor/sbs.py index 4090f63..4532033 100644 --- a/hypervideo_dl/extractor/sbs.py +++ b/hypervideo_dl/extractor/sbs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( smuggle_url, @@ -15,9 +12,16 @@ class SBSIE(InfoExtractor): ondemand(?: /video/(?:single/)?| /movie/[^/]+/| + /(?:tv|news)-series/(?:[^/]+/){3}| .*?\bplay=|/watch/ )|news/(?:embeds/)?video/ )(?P<id>[0-9]+)''' + _EMBED_REGEX = [r'''(?x)] + (?: + <meta\s+property="og:video"\s+content=| + <iframe[^>]+?src= + ) + (["\'])(?P<url>https?://(?:www\.)?sbs\.com\.au/ondemand/video/.+?)\1'''] _TESTS = [{ # Original URL is handled by the generic IE which finds the iframe: @@ -60,6 +64,12 @@ class SBSIE(InfoExtractor): 'note': 'Live stream', 'url': 'https://www.sbs.com.au/ondemand/video/1726824003663/sbs-24x7-live-stream-nsw', 'only_matching': True, + }, { + 'url': 
'https://www.sbs.com.au/ondemand/news-series/dateline/dateline-2022/dateline-s2022-ep26/2072245827515', + 'only_matching': True, + }, { + 'url': 'https://www.sbs.com.au/ondemand/tv-series/the-handmaids-tale/season-5/the-handmaids-tale-s5-ep1/2065631811776', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/screen9.py b/hypervideo_dl/extractor/screen9.py new file mode 100644 index 0000000..5ab0b6c --- /dev/null +++ b/hypervideo_dl/extractor/screen9.py @@ -0,0 +1,62 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class Screen9IE(InfoExtractor): + _VALID_URL = r'https?://(?:\w+\.screen9\.(?:tv|com)|play\.su\.se)/(?:embed|media)/(?P<id>[^?#/]+)' + _TESTS = [ + { + 'url': 'https://api.screen9.com/embed/8kTNEjvoXGM33dmWwF0uDA', + 'md5': 'd60d23f8980583b930724b01fa6ddb41', + 'info_dict': { + 'id': '8kTNEjvoXGM33dmWwF0uDA', + 'ext': 'mp4', + 'title': 'Östersjön i förändrat klimat', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, + { + 'url': 'https://folkhogskolekanalen.screen9.tv/media/gy35PKLHe-5K29RYHga2bw/ett-starkare-samhalle-en-snabbguide-om-sveriges-folkhogskolor', + 'md5': 'c9389806e78573ea34fc48b6f94465dc', + 'info_dict': { + 'id': 'gy35PKLHe-5K29RYHga2bw', + 'ext': 'mp4', + 'title': 'Ett starkare samhälle - en snabbguide om Sveriges folkhögskolor', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, + { + 'url': 'https://play.su.se/media/H1YA0EYNCxiesrSU1kaRBQ/baltic-breakfast', + 'md5': '2b817647c3058002526269deff4c0683', + 'info_dict': { + 'id': 'H1YA0EYNCxiesrSU1kaRBQ', + 'ext': 'mp4', + 'title': 'Baltic Breakfast', + 'thumbnail': r're:^https?://.+\.jpg', + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(f'https://api.screen9.com/embed/{video_id}', video_id) + config = self._search_json(r'var\s+config\s*=', webpage, 'config', video_id) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + traverse_obj(config, ('src', lambda _, v: v['type'] == 'application/x-mpegURL', 'src'), get_all=False), + video_id, ext='mp4') + formats.append({ + 'url': traverse_obj(config, ('src', lambda _, v: v['type'] == 'video/mp4', 'src'), get_all=False), + 'format': 'mp4', + }) + + return { + 'id': video_id, + 'title': traverse_obj( + config, + ('plugins', (('title', 'title'), ('googleAnalytics', 'title'), ('share', 'mediaTitle'))), + get_all=False), + 'description': traverse_obj(config, ('plugins', 'title', 'description')), + 'thumbnail': traverse_obj(config, ('poster')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/screencast.py b/hypervideo_dl/extractor/screencast.py index 69a0d01..df5e79b 100644 --- a/hypervideo_dl/extractor/screencast.py +++ b/hypervideo_dl/extractor/screencast.py @@ -1,14 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals +import urllib.request from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_request, -) -from ..utils import ( - ExtractorError, -) +from ..compat import compat_parse_qs +from ..utils import ExtractorError class ScreencastIE(InfoExtractor): @@ -78,7 +72,7 @@ class ScreencastIE(InfoExtractor): flash_vars_s = flash_vars_s.replace(',', '&') if flash_vars_s: flash_vars = compat_parse_qs(flash_vars_s) - video_url_raw = compat_urllib_request.quote( + video_url_raw = urllib.request.quote( flash_vars['content'][0]) video_url = video_url_raw.replace('http%3A', 'http:') diff --git a/hypervideo_dl/extractor/screencastify.py 
b/hypervideo_dl/extractor/screencastify.py new file mode 100644 index 0000000..136b847 --- /dev/null +++ b/hypervideo_dl/extractor/screencastify.py @@ -0,0 +1,52 @@ +import urllib.parse + +from .common import InfoExtractor +from ..utils import traverse_obj, update_url_query + + +class ScreencastifyIE(InfoExtractor): + _VALID_URL = r'https?://watch\.screencastify\.com/v/(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://watch.screencastify.com/v/sYVkZip3quLKhHw4Ybk8', + 'info_dict': { + 'id': 'sYVkZip3quLKhHw4Ybk8', + 'ext': 'mp4', + 'title': 'Inserting and Aligning the Case Top and Bottom', + 'description': '', + 'uploader': 'Paul Gunn', + 'extra_param_to_segment_url': str, + }, + 'params': { + 'skip_download': 'm3u8', + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._download_json( + f'https://umbrella.svc.screencastify.com/api/umbrellaService/watch/{video_id}', video_id) + + query_string = traverse_obj(info, ('manifest', 'auth', 'query')) + query = urllib.parse.parse_qs(query_string) + formats = [] + dash_manifest_url = traverse_obj(info, ('manifest', 'url')) + if dash_manifest_url: + formats.extend( + self._extract_mpd_formats( + dash_manifest_url, video_id, mpd_id='dash', query=query, fatal=False)) + hls_manifest_url = traverse_obj(info, ('manifest', 'hlsUrl')) + if hls_manifest_url: + formats.extend( + self._extract_m3u8_formats( + hls_manifest_url, video_id, ext='mp4', m3u8_id='hls', query=query, fatal=False)) + for f in formats: + f['url'] = update_url_query(f['url'], query) + + return { + 'id': video_id, + 'title': info.get('title'), + 'description': info.get('description'), + 'uploader': info.get('userName'), + 'formats': formats, + 'extra_param_to_segment_url': query_string, + } diff --git a/hypervideo_dl/extractor/screencastomatic.py b/hypervideo_dl/extractor/screencastomatic.py index 0afdc17..28e25e9 100644 --- a/hypervideo_dl/extractor/screencastomatic.py +++ b/hypervideo_dl/extractor/screencastomatic.py @@ -1,13 +1,12 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( + ExtractorError, get_element_by_class, int_or_none, remove_start, strip_or_none, unified_strdate, + urlencode_postdata, ) @@ -37,6 +36,28 @@ class ScreencastOMaticIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage( 'https://screencast-o-matic.com/player/' + video_id, video_id) + + if (self._html_extract_title(webpage) == 'Protected Content' + or 'This video is private and requires a password' in webpage): + password = self.get_param('videopassword') + + if not password: + raise ExtractorError('Password protected video, use --video-password <password>', expected=True) + + form = self._search_regex( + r'(?is)<form[^>]*>(?P<form>.+?)</form>', webpage, 'login form', group='form') + form_data = self._hidden_inputs(form) + form_data.update({ + 'scPassword': password, + }) + + webpage = self._download_webpage( + 'https://screencast-o-matic.com/player/password', video_id, 'Logging in', + data=urlencode_postdata(form_data)) + + if '<small class="text-danger">Invalid password</small>' in webpage: + raise ExtractorError('Unable to login: Invalid password', expected=True) + info = self._parse_html5_media_entries(url, webpage, video_id)[0] info.update({ 'id': video_id, diff --git a/hypervideo_dl/extractor/scrippsnetworks.py b/hypervideo_dl/extractor/scrippsnetworks.py index 84918b6..c3cee6e 100644 --- a/hypervideo_dl/extractor/scrippsnetworks.py +++ 
b/hypervideo_dl/extractor/scrippsnetworks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import hashlib diff --git a/hypervideo_dl/extractor/scrolller.py b/hypervideo_dl/extractor/scrolller.py new file mode 100644 index 0000000..4f9fa14 --- /dev/null +++ b/hypervideo_dl/extractor/scrolller.py @@ -0,0 +1,102 @@ +import json + +from .common import InfoExtractor +from ..utils import determine_ext, int_or_none + + +class ScrolllerIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?scrolller\.com/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://scrolller.com/a-helping-hand-1k9pxikxkw', + 'info_dict': { + 'id': 'a-helping-hand-1k9pxikxkw', + 'ext': 'mp4', + 'thumbnail': 'https://zepto.scrolller.com/a-helping-hand-3ty9q8x094-540x960.jpg', + 'title': 'A helping hand', + 'age_limit': 0, + } + }, { + 'url': 'https://scrolller.com/tigers-chasing-a-drone-c5d1f2so6j', + 'info_dict': { + 'id': 'tigers-chasing-a-drone-c5d1f2so6j', + 'ext': 'mp4', + 'thumbnail': 'https://zepto.scrolller.com/tigers-chasing-a-drone-az9pkpguwe-540x303.jpg', + 'title': 'Tigers chasing a drone', + 'age_limit': 0, + } + }, { + 'url': 'https://scrolller.com/baby-rhino-smells-something-9chhugsv9p', + 'info_dict': { + 'id': 'baby-rhino-smells-something-9chhugsv9p', + 'ext': 'mp4', + 'thumbnail': 'https://atto.scrolller.com/hmm-whats-that-smell-bh54mf2c52-300x224.jpg', + 'title': 'Baby rhino smells something', + 'age_limit': 0, + } + }, { + 'url': 'https://scrolller.com/its-all-fun-and-games-cco8jjmoh7', + 'info_dict': { + 'id': 'its-all-fun-and-games-cco8jjmoh7', + 'ext': 'mp4', + 'thumbnail': 'https://atto.scrolller.com/its-all-fun-and-games-3amk9vg7m3-540x649.jpg', + 'title': 'It\'s all fun and games...', + 'age_limit': 0, + } + }, { + 'url': 'https://scrolller.com/may-the-force-be-with-you-octokuro-yeytg1fs7a', + 'info_dict': { + 'id': 'may-the-force-be-with-you-octokuro-yeytg1fs7a', + 'ext': 'mp4', + 'thumbnail': 'https://thumbs2.redgifs.com/DarkStarchyNautilus-poster.jpg', + 'title': 'May the force be with you (Octokuro)', + 'age_limit': 18, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + query = { + 'query': '''{ + getSubredditPost(url:"/%s"){ + id + title + isNsfw + mediaSources{ + url + width + height + } + } + }''' % video_id + } + + video_data = self._download_json( + 'https://api.scrolller.com/api/v2/graphql', video_id, data=json.dumps(query).encode(), + headers={'Content-Type': 'application/json'})['data']['getSubredditPost'] + + formats, thumbnails = [], [] + for source in video_data['mediaSources']: + if determine_ext(source.get('url')) in ('jpg', 'png'): + thumbnails.append({ + 'url': source['url'], + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + elif source.get('url'): + formats.append({ + 'url': source['url'], + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + }) + + if not formats: + self.raise_no_formats('There is no video.', expected=True, video_id=video_id) + + return { + 'id': video_id, + 'title': video_data.get('title'), + 'thumbnails': thumbnails, + 'formats': formats, + 'age_limit': 18 if video_data.get('isNsfw') else 0 + } diff --git a/hypervideo_dl/extractor/scte.py b/hypervideo_dl/extractor/scte.py index 7215cf5..d839ffc 100644 --- a/hypervideo_dl/extractor/scte.py +++ b/hypervideo_dl/extractor/scte.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git 
a/hypervideo_dl/extractor/seeker.py b/hypervideo_dl/extractor/seeker.py index e5c18c7..65eb16a 100644 --- a/hypervideo_dl/extractor/seeker.py +++ b/hypervideo_dl/extractor/seeker.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/senategov.py b/hypervideo_dl/extractor/senategov.py index b295184..7ff0cf5 100644 --- a/hypervideo_dl/extractor/senategov.py +++ b/hypervideo_dl/extractor/senategov.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -52,6 +49,7 @@ _COMMITTEES = { class SenateISVPIE(InfoExtractor): _IE_NAME = 'senate.gov:isvp' _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' + _EMBED_REGEX = [r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]"] _TESTS = [{ 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', @@ -90,14 +88,6 @@ class SenateISVPIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _search_iframe_url(webpage): - mobj = re.search( - r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -141,8 +131,6 @@ class SenateISVPIE(InfoExtractor): entry['format_id'] += mobj.group('tag') formats.append(entry) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -197,7 +185,6 @@ class SenateGovIE(InfoExtractor): formats = self._extract_m3u8_formats( f'{stream_domain}/i/{filename}_1@{stream_num}/master.m3u8', display_id, ext='mp4') - self._sort_formats(formats) title = self._html_search_regex( (*self._og_regexes('title'), r'(?s)<title>([^<]*?)</title>'), webpage, 'video title') diff --git a/hypervideo_dl/extractor/senateisvp.py b/hypervideo_dl/extractor/senateisvp.py deleted file mode 100644 index 8794d47..0000000 --- a/hypervideo_dl/extractor/senateisvp.py +++ /dev/null @@ -1,153 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unsmuggle_url, -) -from ..compat import ( - compat_parse_qs, - compat_urlparse, -) - - -class SenateISVPIE(InfoExtractor): - _COMM_MAP = [ - ['ag', '76440', 'http://ag-f.akamaihd.net'], - ['aging', '76442', 'http://aging-f.akamaihd.net'], - ['approps', '76441', 'http://approps-f.akamaihd.net'], - ['armed', '76445', 'http://armed-f.akamaihd.net'], - ['banking', '76446', 'http://banking-f.akamaihd.net'], - ['budget', '76447', 'http://budget-f.akamaihd.net'], - ['cecc', '76486', 'http://srs-f.akamaihd.net'], - ['commerce', '80177', 'http://commerce1-f.akamaihd.net'], - ['csce', '75229', 'http://srs-f.akamaihd.net'], - ['dpc', '76590', 'http://dpc-f.akamaihd.net'], - ['energy', '76448', 'http://energy-f.akamaihd.net'], - ['epw', '76478', 'http://epw-f.akamaihd.net'], - ['ethics', '76449', 'http://ethics-f.akamaihd.net'], - ['finance', '76450', 'http://finance-f.akamaihd.net'], - ['foreign', '76451', 'http://foreign-f.akamaihd.net'], - ['govtaff', '76453', 'http://govtaff-f.akamaihd.net'], - ['help', '76452', 'http://help-f.akamaihd.net'], - ['indian', '76455', 'http://indian-f.akamaihd.net'], - ['intel', '76456', 'http://intel-f.akamaihd.net'], - ['intlnarc', '76457', 
'http://intlnarc-f.akamaihd.net'], - ['jccic', '85180', 'http://jccic-f.akamaihd.net'], - ['jec', '76458', 'http://jec-f.akamaihd.net'], - ['judiciary', '76459', 'http://judiciary-f.akamaihd.net'], - ['rpc', '76591', 'http://rpc-f.akamaihd.net'], - ['rules', '76460', 'http://rules-f.akamaihd.net'], - ['saa', '76489', 'http://srs-f.akamaihd.net'], - ['smbiz', '76461', 'http://smbiz-f.akamaihd.net'], - ['srs', '75229', 'http://srs-f.akamaihd.net'], - ['uscc', '76487', 'http://srs-f.akamaihd.net'], - ['vetaff', '76462', 'http://vetaff-f.akamaihd.net'], - ['arch', '', 'http://ussenate-f.akamaihd.net/'] - ] - _IE_NAME = 'senate.gov' - _VALID_URL = r'https?://(?:www\.)?senate\.gov/isvp/?\?(?P<qs>.+)' - _TESTS = [{ - 'url': 'http://www.senate.gov/isvp/?comm=judiciary&type=live&stt=&filename=judiciary031715&auto_play=false&wmode=transparent&poster=http%3A%2F%2Fwww.judiciary.senate.gov%2Fthemes%2Fjudiciary%2Fimages%2Fvideo-poster-flash-fit.png', - 'info_dict': { - 'id': 'judiciary031715', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player', - 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.senate.gov/isvp/?type=live&comm=commerce&filename=commerce011514.mp4&auto_play=false', - 'info_dict': { - 'id': 'commerce011514', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player' - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://www.senate.gov/isvp/?type=arch&comm=intel&filename=intel090613&hc_location=ufi', - # checksum differs each time - 'info_dict': { - 'id': 'intel090613', - 'ext': 'mp4', - 'title': 'Integrated Senate Video Player' - } - }, { - # From http://www.c-span.org/video/?96791-1 - 'url': 'http://www.senate.gov/isvp?type=live&comm=banking&filename=banking012715', - 'only_matching': True, - }] - - @staticmethod - def _search_iframe_url(webpage): - mobj = re.search( - r"<iframe[^>]+src=['\"](?P<url>https?://www\.senate\.gov/isvp/?\?[^'\"]+)['\"]", - webpage) - if mobj: - return mobj.group('url') - - def _get_info_for_comm(self, committee): - for entry in self._COMM_MAP: - if entry[0] == committee: - return entry[1:] - - def _real_extract(self, url): - url, smuggled_data = unsmuggle_url(url, {}) - - qs = compat_parse_qs(self._match_valid_url(url).group('qs')) - if not qs.get('filename') or not qs.get('type') or not qs.get('comm'): - raise ExtractorError('Invalid URL', expected=True) - - video_id = re.sub(r'.mp4$', '', qs['filename'][0]) - - webpage = self._download_webpage(url, video_id) - - if smuggled_data.get('force_title'): - title = smuggled_data['force_title'] - else: - title = self._html_search_regex(r'<title>([^<]+)</title>', webpage, video_id) - poster = qs.get('poster') - thumbnail = poster[0] if poster else None - - video_type = qs['type'][0] - committee = video_type if video_type == 'arch' else qs['comm'][0] - stream_num, domain = self._get_info_for_comm(committee) - - formats = [] - if video_type == 'arch': - filename = video_id if '.' in video_id else video_id + '.mp4' - formats = [{ - # All parameters in the query string are necessary to prevent a 403 error - 'url': compat_urlparse.urljoin(domain, filename) + '?v=3.1.0&fp=&r=&g=', - }] - else: - hdcore_sign = 'hdcore=3.1.0' - url_params = (domain, video_id, stream_num) - f4m_url = '%s/z/%s_1@%s/manifest.f4m?' 
% url_params + hdcore_sign - m3u8_url = '%s/i/%s_1@%s/master.m3u8' % url_params - for entry in self._extract_f4m_formats(f4m_url, video_id, f4m_id='f4m'): - # URLs without the extra param induce an 404 error - entry.update({'extra_param_to_segment_url': hdcore_sign}) - formats.append(entry) - for entry in self._extract_m3u8_formats(m3u8_url, video_id, ext='mp4', m3u8_id='m3u8'): - mobj = re.search(r'(?P<tag>(?:-p|-b)).m3u8', entry['url']) - if mobj: - entry['format_id'] += mobj.group('tag') - formats.append(entry) - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'thumbnail': thumbnail, - } diff --git a/hypervideo_dl/extractor/sendtonews.py b/hypervideo_dl/extractor/sendtonews.py index 858547b..3600e2e 100644 --- a/hypervideo_dl/extractor/sendtonews.py +++ b/hypervideo_dl/extractor/sendtonews.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -46,14 +43,14 @@ class SendtoNewsIE(InfoExtractor): _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' @classmethod - def _extract_url(cls, webpage): + def _extract_embed_urls(cls, url, webpage): mobj = re.search(r'''(?x)<script[^>]+src=([\'"]) (?:https?:)?//embed\.sendtonews\.com/player/responsiveembed\.php\? .*\bSC=(?P<SC>[0-9a-zA-Z-]+).* \1>''', webpage) if mobj: sc = mobj.group('SC') - return cls._URL_TEMPLATE % sc + yield cls._URL_TEMPLATE % sc def _real_extract(self, url): playlist_id = self._match_id(url) @@ -80,9 +77,6 @@ class SendtoNewsIE(InfoExtractor): 'format_id': '%s-%d' % (determine_protocol(f), tbr), 'tbr': tbr, }) - # 'tbr' was explicitly set to be preferred over 'height' originally, - # So this is being kept unless someone can confirm this is unnecessary - self._sort_formats(info_dict['formats'], ('tbr', 'res')) thumbnails = [] if video.get('thumbnailUrl'): @@ -101,6 +95,9 @@ class SendtoNewsIE(InfoExtractor): 'thumbnails': thumbnails, 'duration': float_or_none(video.get('SM_length')), 'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '), + # 'tbr' was explicitly set to be preferred over 'height' originally, + # So this is being kept unless someone can confirm this is unnecessary + '_format_sort_fields': ('tbr', 'res') }) entries.append(info_dict) diff --git a/hypervideo_dl/extractor/servus.py b/hypervideo_dl/extractor/servus.py index 1610ddc..490d562 100644 --- a/hypervideo_dl/extractor/servus.py +++ b/hypervideo_dl/extractor/servus.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -107,7 +104,6 @@ class ServusIE(InfoExtractor): 'width': int_or_none(resource.get('width')), 'height': int_or_none(resource.get('height')), }) - self._sort_formats(formats) attrs = {} for attribute in video['attributes']: diff --git a/hypervideo_dl/extractor/sevenplus.py b/hypervideo_dl/extractor/sevenplus.py index 9867961..222bf6c 100644 --- a/hypervideo_dl/extractor/sevenplus.py +++ b/hypervideo_dl/extractor/sevenplus.py @@ -1,10 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re -from .brightcove import BrightcoveNewIE +from .brightcove import BrightcoveNewBaseIE from ..compat import ( compat_HTTPError, compat_str, @@ -16,7 +13,7 @@ from ..utils import ( ) -class SevenPlusIE(BrightcoveNewIE): +class SevenPlusIE(BrightcoveNewBaseIE): IE_NAME = '7plus' _VALID_URL = r'https?://(?:www\.)?7plus\.com\.au/(?P<path>[^?]+\?.*?\bepisode-id=(?P<id>[^&#]+))' _TESTS 
= [{ diff --git a/hypervideo_dl/extractor/sexu.py b/hypervideo_dl/extractor/sexu.py index 3df5152..3117f81 100644 --- a/hypervideo_dl/extractor/sexu.py +++ b/hypervideo_dl/extractor/sexu.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor @@ -36,7 +34,6 @@ class SexuIE(InfoExtractor): r'^(\d+)[pP]', source.get('label', ''), 'height', default=None)), } for source in sources if source.get('file')] - self._sort_formats(formats) title = self._html_search_regex( r'<title>([^<]+)\s*-\s*Sexu\.Com</title>', webpage, 'title') diff --git a/hypervideo_dl/extractor/seznamzpravy.py b/hypervideo_dl/extractor/seznamzpravy.py index eef4975..79e8885 100644 --- a/hypervideo_dl/extractor/seznamzpravy.py +++ b/hypervideo_dl/extractor/seznamzpravy.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import ( compat_str, @@ -23,6 +18,7 @@ def _raw_id(src_url): class SeznamZpravyIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?seznamzpravy\.cz/iframe/player\?.*\bsrc=' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1'] _TESTS = [{ 'url': 'https://www.seznamzpravy.cz/iframe/player?duration=241&serviceSlug=zpravy&src=https%3A%2F%2Fv39-a.sdn.szn.cz%2Fv_39%2Fvmd%2F5999c902ea707c67d8e267a9%3Ffl%3Dmdk%2C432f65a0%7C&itemType=video&autoPlay=false&title=Sv%C4%9Bt%20bez%20obalu%3A%20%C4%8Ce%C5%A1t%C3%AD%20voj%C3%A1ci%20na%20mis%C3%ADch%20(kr%C3%A1tk%C3%A1%20verze)&series=Sv%C4%9Bt%20bez%20obalu&serviceName=Seznam%20Zpr%C3%A1vy&poster=%2F%2Fd39-a.sdn.szn.cz%2Fd_39%2Fc_img_F_I%2FR5puJ.jpeg%3Ffl%3Dcro%2C0%2C0%2C1920%2C1080%7Cres%2C1200%2C%2C1%7Cjpg%2C80%2C%2C1&width=1920&height=1080&cutFrom=0&cutTo=0&splVersion=VOD&contentId=170889&contextId=35990&showAdvert=true&collocation=&autoplayPossible=true&embed=&isVideoTooShortForPreroll=false&isVideoTooLongForPostroll=true&videoCommentOpKey=&videoCommentId=&version=4.0.76&dotService=zpravy&gemiusPrismIdentifier=bVc1ZIb_Qax4W2v5xOPGpMeCP31kFfrTzj0SqPTLh_b.Z7&zoneIdPreroll=seznam.pack.videospot&skipOffsetPreroll=5&sectionPrefixPreroll=%2Fzpravy', 'info_dict': { @@ -51,13 +47,6 @@ class SeznamZpravyIE(InfoExtractor): }, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:www\.)?seznamzpravy\.cz/iframe/player\?.*?)\1', - webpage)] - def _extract_sdn_formats(self, sdn_url, video_id): sdn_data = self._download_json(sdn_url, video_id) @@ -104,7 +93,6 @@ class SeznamZpravyIE(InfoExtractor): urljoin(sdn_url, hls_rel_url), video_id, ext='mp4', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) return formats def _real_extract(self, url): @@ -165,5 +153,5 @@ class SeznamZpravyArticleIE(InfoExtractor): return self.playlist_result([ self.url_result(entry_url, ie=SeznamZpravyIE.ie_key()) - for entry_url in SeznamZpravyIE._extract_urls(webpage)], + for entry_url in SeznamZpravyIE._extract_embed_urls(url, webpage)], article_id, title, description) diff --git a/hypervideo_dl/extractor/shahid.py b/hypervideo_dl/extractor/shahid.py index ab45d9c..26a0bff 100644 --- a/hypervideo_dl/extractor/shahid.py +++ b/hypervideo_dl/extractor/shahid.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import math import re @@ -121,7 +118,6 @@ class ShahidIE(ShahidBaseIE): # https://docs.aws.amazon.com/mediapackage/latest/ug/manifest-filtering.html 
r'aws\.manifestfilter=[\w:;,-]+&?', '', playout['url']), video_id, 'mp4') - self._sort_formats(formats) # video = self._call_api( # 'product/id', video_id, { diff --git a/hypervideo_dl/extractor/shared.py b/hypervideo_dl/extractor/shared.py index 93ab2a1..9a237b3 100644 --- a/hypervideo_dl/extractor/shared.py +++ b/hypervideo_dl/extractor/shared.py @@ -1,16 +1,13 @@ -from __future__ import unicode_literals +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_b64decode, - compat_urllib_parse_unquote_plus, -) +from ..compat import compat_b64decode from ..utils import ( - determine_ext, + KNOWN_EXTENSIONS, ExtractorError, + determine_ext, int_or_none, js_to_json, - KNOWN_EXTENSIONS, parse_filesize, rot47, url_or_none, @@ -132,7 +129,7 @@ class VivoIE(SharedBaseIE): return stream_url def decode_url(encoded_url): - return rot47(compat_urllib_parse_unquote_plus(encoded_url)) + return rot47(urllib.parse.unquote_plus(encoded_url)) return decode_url(self._parse_json( self._search_regex( diff --git a/hypervideo_dl/extractor/sharevideos.py b/hypervideo_dl/extractor/sharevideos.py new file mode 100644 index 0000000..3132c7a --- /dev/null +++ b/hypervideo_dl/extractor/sharevideos.py @@ -0,0 +1,6 @@ +from .common import InfoExtractor + + +class ShareVideosEmbedIE(InfoExtractor): + _VALID_URL = False + _EMBED_REGEX = [r'<iframe[^>]+?\bsrc\s*=\s*(["\'])(?P<url>(?:https?:)?//embed\.share-videos\.se/auto/embed/\d+\?.*?\buid=\d+.*?)\1'] diff --git a/hypervideo_dl/extractor/shemaroome.py b/hypervideo_dl/extractor/shemaroome.py index 45c1291..7a78c6e 100644 --- a/hypervideo_dl/extractor/shemaroome.py +++ b/hypervideo_dl/extractor/shemaroome.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..aes import aes_cbc_decrypt, unpad_pkcs7 from ..compat import ( @@ -77,7 +74,6 @@ class ShemarooMeIE(InfoExtractor): iv = [0] * 16 m3u8_url = unpad_pkcs7(intlist_to_bytes(aes_cbc_decrypt(url_data, key, iv))).decode('ascii') formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, fatal=False, headers={'stream_key': data_json['stream_key']}) - self._sort_formats(formats) release_date = self._html_search_regex( (r'itemprop="uploadDate">\s*([\d-]+)', r'id="release_date" value="([\d-]+)'), diff --git a/hypervideo_dl/extractor/showroomlive.py b/hypervideo_dl/extractor/showroomlive.py index 1aada69..ab18953 100644 --- a/hypervideo_dl/extractor/showroomlive.py +++ b/hypervideo_dl/extractor/showroomlive.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -69,7 +66,6 @@ class ShowRoomLiveIE(InfoExtractor): 'format_note': stream.get('label'), 'quality': int_or_none(stream.get('quality', 100)), }) - self._sort_formats(formats) return { 'id': compat_str(room.get('live_id') or broadcaster_id), diff --git a/hypervideo_dl/extractor/simplecast.py b/hypervideo_dl/extractor/simplecast.py index 857e941..ec349dd 100644 --- a/hypervideo_dl/extractor/simplecast.py +++ b/hypervideo_dl/extractor/simplecast.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( clean_podcast_url, @@ -71,6 +66,11 @@ class SimplecastBaseIE(InfoExtractor): class SimplecastIE(SimplecastBaseIE): IE_NAME = 'simplecast' _VALID_URL = r'https?://(?:api\.simplecast\.com/episodes|player\.simplecast\.com)/(?P<id>%s)' % 
SimplecastBaseIE._UUID_REGEX + _EMBED_REGEX = [rf'''(?x)<iframe[^>]+src=["\'] + (?P<url>https?://(?: + embed\.simplecast\.com/[0-9a-f]{{8}}| + player\.simplecast\.com/{SimplecastBaseIE._UUID_REGEX} + ))'''] _COMMON_TEST_INFO = { 'display_id': 'errant-signal-chris-franklin-new-wave-video-essays', 'id': 'b6dc49a2-9404-4853-9aa9-9cfc097be876', @@ -97,15 +97,6 @@ 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'''(?x)<iframe[^>]+src=["\'] - ( - https?://(?:embed\.simplecast\.com/[0-9a-f]{8}| - player\.simplecast\.com/%s - ))''' % SimplecastBaseIE._UUID_REGEX, webpage) - def _real_extract(self, url): episode_id = self._match_id(url) episode = self._call_api('episodes/%s', episode_id) diff --git a/hypervideo_dl/extractor/sina.py b/hypervideo_dl/extractor/sina.py index b62b0c3..aeba4e3 100644 --- a/hypervideo_dl/extractor/sina.py +++ b/hypervideo_dl/extractor/sina.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( HEADRequest, @@ -101,7 +97,6 @@ class SinaIE(InfoExtractor): 'quality': preference(quality_id), 'ext': 'mp4', }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/sixplay.py b/hypervideo_dl/extractor/sixplay.py index fd747f5..a6fb6c1 100644 --- a/hypervideo_dl/extractor/sixplay.py +++ b/hypervideo_dl/extractor/sixplay.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import ( compat_str, @@ -108,7 +104,6 @@ class SixPlayIE(InfoExtractor): 'quality': quality_key(quality), 'ext': ext, }) - self._sort_formats(formats) def get(getter): for src in (data, clip_data): diff --git a/hypervideo_dl/extractor/skeb.py b/hypervideo_dl/extractor/skeb.py index 81aecb3..e02f8ce 100644 --- a/hypervideo_dl/extractor/skeb.py +++ b/hypervideo_dl/extractor/skeb.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ExtractorError, determine_ext, parse_qs, traverse_obj diff --git a/hypervideo_dl/extractor/sky.py b/hypervideo_dl/extractor/sky.py index ad1e62d..0a8b6cc 100644 --- a/hypervideo_dl/extractor/sky.py +++ b/hypervideo_dl/extractor/sky.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/skyit.py b/hypervideo_dl/extractor/skyit.py index ddb43c0..42d30f7 100644 --- a/hypervideo_dl/extractor/skyit.py +++ b/hypervideo_dl/extractor/skyit.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_parse_qs, @@ -28,7 +25,6 @@ class SkyItPlayerIE(InfoExtractor): 'salesforce': 'C6D585FD1615272C98DE38235F38BD86', 'sitocommerciale': 'VJwfFuSGnLKnd9Phe9y96WkXgYDCguPMJ2dLhGMb2RE', 'sky': 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk', - 'skyacademy': 'A6LAn7EkO2Q26FRy0IAMBekX6jzDXYL3', 'skyarte': 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd', 'theupfront': 'PRSGmDMsg6QMGc04Obpoy7Vsbn7i2Whp', } @@ -45,12 +41,7 @@ class SkyItPlayerIE(InfoExtractor): if not hls_url and video.get('geoblock' if is_live else 'geob'): self.raise_geo_restricted(countries=['IT']) - if is_live: - formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4') - else: - formats = self._extract_akamai_formats( - hls_url, video_id, {'http': 'videoplatform.sky.it'}) - self._sort_formats(formats) + 
formats = self._extract_m3u8_formats(hls_url, video_id, 'mp4') return { 'id': video_id, @@ -78,19 +69,22 @@ class SkyItPlayerIE(InfoExtractor): return self._parse_video(video, video_id) -class SkyItVideoIE(SkyItPlayerIE): +class SkyItVideoIE(SkyItPlayerIE): # XXX: Do not subclass from concrete IE IE_NAME = 'video.sky.it' _VALID_URL = r'https?://(?:masterchef|video|xfactor)\.sky\.it(?:/[^/]+)*/video/[0-9a-z-]+-(?P<id>\d+)' _TESTS = [{ 'url': 'https://video.sky.it/news/mondo/video/uomo-ucciso-da-uno-squalo-in-australia-631227', - 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', + 'md5': '5b858a62d9ffe2ab77b397553024184a', 'info_dict': { 'id': '631227', 'ext': 'mp4', 'title': 'Uomo ucciso da uno squalo in Australia', 'timestamp': 1606036192, 'upload_date': '20201122', - } + 'duration': 26, + 'thumbnail': 'https://video.sky.it/captures/thumbs/631227/631227_thumb_880x494.jpg', + }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://xfactor.sky.it/video/x-factor-2020-replay-audizioni-1-615820', 'only_matching': True, @@ -104,7 +98,7 @@ class SkyItVideoIE(SkyItPlayerIE): return self._player_url_result(video_id) -class SkyItVideoLiveIE(SkyItPlayerIE): +class SkyItVideoLiveIE(SkyItPlayerIE): # XXX: Do not subclass from concrete IE IE_NAME = 'video.sky.it:live' _VALID_URL = r'https?://video\.sky\.it/diretta/(?P<id>[^/?&#]+)' _TEST = { @@ -113,7 +107,8 @@ class SkyItVideoLiveIE(SkyItPlayerIE): 'id': '1', 'ext': 'mp4', 'title': r're:Diretta TG24 \d{4}-\d{2}-\d{2} \d{2}:\d{2}', - 'description': 'Guarda la diretta streaming di SkyTg24, segui con Sky tutti gli appuntamenti e gli speciali di Tg24.', + 'description': r're:(?:Clicca play e )?[Gg]uarda la diretta streaming di SkyTg24, segui con Sky tutti gli appuntamenti e gli speciali di Tg24\.', + 'live_status': 'is_live', }, 'params': { # m3u8 download @@ -131,19 +126,21 @@ class SkyItVideoLiveIE(SkyItPlayerIE): return self._parse_video(livestream, asset_id) -class SkyItIE(SkyItPlayerIE): +class SkyItIE(SkyItPlayerIE): # XXX: Do not subclass from concrete IE IE_NAME = 'sky.it' _VALID_URL = r'https?://(?:sport|tg24)\.sky\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' _TESTS = [{ - 'url': 'https://sport.sky.it/calcio/serie-a/2020/11/21/juventus-cagliari-risultato-gol', + 'url': 'https://sport.sky.it/calcio/serie-a/2022/11/03/brozovic-inter-news', 'info_dict': { - 'id': '631201', + 'id': '789222', 'ext': 'mp4', - 'title': 'Un rosso alla violenza: in campo per i diritti delle donne', - 'upload_date': '20201121', - 'timestamp': 1605995753, + 'title': 'Brozovic con il gruppo: verso convocazione per Juve-Inter', + 'upload_date': '20221103', + 'timestamp': 1667484130, + 'duration': 22, + 'thumbnail': 'https://videoplatform.sky.it/still/2022/11/03/1667480526353_brozovic_videostill_1.jpg', }, - 'expected_warnings': ['Unable to download f4m manifest'], + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://tg24.sky.it/mondo/2020/11/22/australia-squalo-uccide-uomo', 'md5': 'fe5c91e59a84a3437eaa0bca6e134ccd', @@ -153,7 +150,10 @@ class SkyItIE(SkyItPlayerIE): 'title': 'Uomo ucciso da uno squalo in Australia', 'timestamp': 1606036192, 'upload_date': '20201122', + 'duration': 26, + 'thumbnail': 'https://video.sky.it/captures/thumbs/631227/631227_thumb_880x494.jpg', }, + 'params': {'skip_download': 'm3u8'}, }] _VIDEO_ID_REGEX = r'data-videoid="(\d+)"' @@ -165,43 +165,28 @@ class SkyItIE(SkyItPlayerIE): return self._player_url_result(video_id) -class SkyItAcademyIE(SkyItIE): - IE_NAME = 'skyacademy.it' - _VALID_URL = 
r'https?://(?:www\.)?skyacademy\.it(?:/[^/]+)*/\d{4}/\d{2}/\d{2}/(?P<id>[^/?&#]+)' - _TESTS = [{ - 'url': 'https://www.skyacademy.it/eventi-speciali/2019/07/05/a-lezione-di-cinema-con-sky-academy-/', - 'md5': 'ced5c26638b7863190cbc44dd6f6ba08', - 'info_dict': { - 'id': '523458', - 'ext': 'mp4', - 'title': 'Sky Academy "The Best CineCamp 2019"', - 'timestamp': 1562843784, - 'upload_date': '20190711', - } - }] - _DOMAIN = 'skyacademy' - _VIDEO_ID_REGEX = r'id="news-videoId_(\d+)"' - - -class SkyItArteIE(SkyItIE): +class SkyItArteIE(SkyItIE): # XXX: Do not subclass from concrete IE IE_NAME = 'arte.sky.it' _VALID_URL = r'https?://arte\.sky\.it/video/(?P<id>[^/?&#]+)' _TESTS = [{ - 'url': 'https://arte.sky.it/video/serie-musei-venezia-collezionismo-12-novembre/', + 'url': 'https://arte.sky.it/video/oliviero-toscani-torino-galleria-mazzoleni-788962', 'md5': '515aee97b87d7a018b6c80727d3e7e17', 'info_dict': { - 'id': '627926', + 'id': '788962', 'ext': 'mp4', - 'title': "Musei Galleria Franchetti alla Ca' d'Oro Palazzo Grimani", - 'upload_date': '20201106', - 'timestamp': 1604664493, - } + 'title': 'La fotografia di Oliviero Toscani conquista Torino', + 'upload_date': '20221102', + 'timestamp': 1667399996, + 'duration': 12, + 'thumbnail': 'https://videoplatform.sky.it/still/2022/11/02/1667396388552_oliviero-toscani-torino-galleria-mazzoleni_videostill_1.jpg', + }, + 'params': {'skip_download': 'm3u8'}, }] _DOMAIN = 'skyarte' - _VIDEO_ID_REGEX = r'(?s)<iframe[^>]+src="(?:https:)?//player\.sky\.it/player/external\.html\?[^"]*\bid=(\d+)' + _VIDEO_ID_REGEX = r'"embedUrl"\s*:\s*"(?:https:)?//player\.sky\.it/player/external\.html\?[^"]*\bid=(\d+)' -class CieloTVItIE(SkyItIE): +class CieloTVItIE(SkyItIE): # XXX: Do not subclass from concrete IE IE_NAME = 'cielotv.it' _VALID_URL = r'https?://(?:www\.)?cielotv\.it/video/(?P<id>[^.]+)\.html' _TESTS = [{ @@ -213,17 +198,20 @@ class CieloTVItIE(SkyItIE): 'title': 'Il lunedì è sempre un dramma', 'upload_date': '20190329', 'timestamp': 1553862178, - } + 'duration': 30, + 'thumbnail': 'https://videoplatform.sky.it/still/2019/03/29/1553858575610_lunedi_dramma_mant_videostill_1.jpg', + }, + 'params': {'skip_download': 'm3u8'}, }] _DOMAIN = 'cielo' _VIDEO_ID_REGEX = r'videoId\s*=\s*"(\d+)"' -class TV8ItIE(SkyItVideoIE): +class TV8ItIE(SkyItVideoIE): # XXX: Do not subclass from concrete IE IE_NAME = 'tv8.it' - _VALID_URL = r'https?://tv8\.it/showvideo/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv8\.it/(?:show)?video/[0-9a-z-]+-(?P<id>\d+)' _TESTS = [{ - 'url': 'https://tv8.it/showvideo/630529/ogni-mattina-ucciso-asino-di-andrea-lo-cicero/18-11-2020/', + 'url': 'https://www.tv8.it/video/ogni-mattina-ucciso-asino-di-andrea-lo-cicero-630529', 'md5': '9ab906a3f75ea342ed928442f9dabd21', 'info_dict': { 'id': '630529', @@ -231,6 +219,9 @@ class TV8ItIE(SkyItVideoIE): 'title': 'Ogni mattina - Ucciso asino di Andrea Lo Cicero', 'timestamp': 1605721374, 'upload_date': '20201118', - } + 'duration': 114, + 'thumbnail': 'https://videoplatform.sky.it/still/2020/11/18/1605717753954_ogni-mattina-ucciso-asino-di-andrea-lo-cicero_videostill_1.jpg', + }, + 'params': {'skip_download': 'm3u8'}, }] _DOMAIN = 'mtv8' diff --git a/hypervideo_dl/extractor/skylinewebcams.py b/hypervideo_dl/extractor/skylinewebcams.py index 47bbb76..4292bb2 100644 --- a/hypervideo_dl/extractor/skylinewebcams.py +++ b/hypervideo_dl/extractor/skylinewebcams.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git 
a/hypervideo_dl/extractor/skynewsarabia.py b/hypervideo_dl/extractor/skynewsarabia.py index fffc9aa..6264b04 100644 --- a/hypervideo_dl/extractor/skynewsarabia.py +++ b/hypervideo_dl/extractor/skynewsarabia.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/skynewsau.py b/hypervideo_dl/extractor/skynewsau.py index 8e079ee..43a9c82 100644 --- a/hypervideo_dl/extractor/skynewsau.py +++ b/hypervideo_dl/extractor/skynewsau.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( try_get, diff --git a/hypervideo_dl/extractor/slideshare.py b/hypervideo_dl/extractor/slideshare.py index 9b3ad0a..ab9dad0 100644 --- a/hypervideo_dl/extractor/slideshare.py +++ b/hypervideo_dl/extractor/slideshare.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/slideslive.py b/hypervideo_dl/extractor/slideslive.py index df60846..9a60a79 100644 --- a/hypervideo_dl/extractor/slideslive.py +++ b/hypervideo_dl/extractor/slideslive.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( bool_or_none, @@ -12,6 +9,7 @@ from ..utils import ( class SlidesLiveIE(InfoExtractor): _VALID_URL = r'https?://slideslive\.com/(?P<id>[0-9]+)' + _WORKING = False _TESTS = [{ # video_service_name = YOUTUBE 'url': 'https://slideslive.com/38902413/gcc-ia16-backend', @@ -87,7 +85,6 @@ class SlidesLiveIE(InfoExtractor): formats.extend(self._extract_mpd_formats( _MANIFEST_PATTERN % (service_id, 'mpd'), service_id, mpd_id='dash', fatal=False)) - self._sort_formats(formats) info.update({ 'id': service_id, 'formats': formats, diff --git a/hypervideo_dl/extractor/slutload.py b/hypervideo_dl/extractor/slutload.py index 661f9e5..8e6e89c 100644 --- a/hypervideo_dl/extractor/slutload.py +++ b/hypervideo_dl/extractor/slutload.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/smotrim.py b/hypervideo_dl/extractor/smotrim.py new file mode 100644 index 0000000..d3f1b69 --- /dev/null +++ b/hypervideo_dl/extractor/smotrim.py @@ -0,0 +1,65 @@ +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SmotrimIE(InfoExtractor): + _VALID_URL = r'https?://smotrim\.ru/(?P<type>brand|video|article|live)/(?P<id>[0-9]+)' + _TESTS = [{ # video + 'url': 'https://smotrim.ru/video/1539617', + 'md5': 'b1923a533c8cab09679789d720d0b1c5', + 'info_dict': { + 'id': '1539617', + 'ext': 'mp4', + 'title': 'Полиглот. Китайский с нуля за 16 часов! Урок №16', + 'description': '', + }, + 'add_ie': ['RUTV'], + }, { # article (geo-restricted? plays fine from the US and JP) + 'url': 'https://smotrim.ru/article/2813445', + 'md5': 'e0ac453952afbc6a2742e850b4dc8e77', + 'info_dict': { + 'id': '2431846', + 'ext': 'mp4', + 'title': 'Новости культуры. Съёмки первой программы "Большие и маленькие"', + 'description': 'md5:94a4a22472da4252bf5587a4ee441b99', + }, + 'add_ie': ['RUTV'], + }, { # brand, redirect + 'url': 'https://smotrim.ru/brand/64356', + 'md5': '740472999ccff81d7f6df79cecd91c18', + 'info_dict': { + 'id': '2354523', + 'ext': 'mp4', + 'title': 'Большие и маленькие. Лучшее. 
4-й выпуск',
+            'description': 'md5:84089e834429008371ea41ea3507b989',
+        },
+        'add_ie': ['RUTV'],
+    }, {  # live
+        'url': 'https://smotrim.ru/live/19201',
+        'info_dict': {
+            'id': '19201',
+            'ext': 'mp4',
+            # this looks like a TV channel name
+            'title': 'Россия Культура. Прямой эфир',
+            'description': '',
+        },
+        'add_ie': ['RUTV'],
+    }]
+
+    def _real_extract(self, url):
+        video_id, typ = self._match_valid_url(url).group('id', 'type')
+        rutv_type = 'video'
+        if typ not in ('video', 'live'):
+            webpage = self._download_webpage(url, video_id, f'Resolving {typ} link')
+            # there are two cases matching the regex:
+            # 1. "embedUrl" in JSON LD (/brand/)
+            # 2. "src" attribute from iframe (/article/)
+            video_id = self._search_regex(
+                r'"https://player.smotrim.ru/iframe/video/id/(?P<video_id>\d+)/',
+                webpage, 'video_id', default=None)
+            if not video_id:
+                raise ExtractorError('There is no video on this page.', expected=True)
+        elif typ == 'live':
+            rutv_type = 'live'
+
+        return self.url_result(f'https://player.vgtrk.com/iframe/{rutv_type}/id/{video_id}')
diff --git a/hypervideo_dl/extractor/snotr.py b/hypervideo_dl/extractor/snotr.py index 0bb5482..6889f19 100644
--- a/hypervideo_dl/extractor/snotr.py
+++ b/hypervideo_dl/extractor/snotr.py
@@ -1,7 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-
 from .common import InfoExtractor
 from ..utils import (
     parse_duration,
diff --git a/hypervideo_dl/extractor/sohu.py b/hypervideo_dl/extractor/sohu.py index 3bff5c5..a8f1e46 100644
--- a/hypervideo_dl/extractor/sohu.py
+++ b/hypervideo_dl/extractor/sohu.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 from .common import InfoExtractor
@@ -179,7 +176,6 @@ class SohuIE(InfoExtractor):
                 'height': int_or_none(data.get('height')),
                 'fps': int_or_none(data.get('fps')),
             })
-        self._sort_formats(formats)
         playlist.append({
             'id': '%s_part%d' % (video_id, i + 1),
diff --git a/hypervideo_dl/extractor/sonyliv.py b/hypervideo_dl/extractor/sonyliv.py index 5b6849f..aaad420 100644
--- a/hypervideo_dl/extractor/sonyliv.py
+++ b/hypervideo_dl/extractor/sonyliv.py
@@ -1,7 +1,5 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import datetime
+import json
 import math
 import random
 import time
@@ -85,21 +83,32 @@ class SonyLIVIE(InfoExtractor):
             raise ExtractorError(f'Invalid username/password; {self._LOGIN_HINT}')
         self.report_login()
-        data = '''{"mobileNumber":"%s","channelPartnerID":"MSMIND","country":"IN","timestamp":"%s",
-        "otpSize":6,"loginType":"REGISTERORSIGNIN","isMobileMandatory":true}
-        ''' % (username, datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%MZ"))
         otp_request_json = self._download_json(
             'https://apiv2.sonyliv.com/AGL/1.6/A/ENG/WEB/IN/HR/CREATEOTP-V2',
-            None, note='Sending OTP', data=data.encode(), headers=self._HEADERS)
+            None, note='Sending OTP', headers=self._HEADERS, data=json.dumps({
+                'mobileNumber': username,
+                'channelPartnerID': 'MSMIND',
+                'country': 'IN',
+                'timestamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'),
+                'otpSize': 6,
+                'loginType': 'REGISTERORSIGNIN',
+                'isMobileMandatory': True,
+            }).encode())
         if otp_request_json['resultCode'] == 'KO':
             raise ExtractorError(otp_request_json['message'], expected=True)
-        otp_code = self._get_tfa_info('OTP')
-        data = '''{"channelPartnerID":"MSMIND","mobileNumber":"%s","country":"IN","otp":"%s",
-        "dmaId":"IN","ageConfirmation":true,"timestamp":"%s","isMobileMandatory":true}
-        ''' % (username, otp_code, datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%MZ"))
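
The SonyLIV login hunk above swaps hand-assembled JSON request bodies (%-interpolation into a triple-quoted template) for json.dumps. A small self-contained illustration of the difference, using generic stand-in values rather than the extractor's real payload:

import json

mobile = '"+91 000"'  # any value containing a quote breaks naive interpolation

# Old style: the template's literal newlines end up in the request body, and
# unescaped values produce invalid JSON:
old_body = '{"mobileNumber":"%s","otpSize":6}' % mobile
# -> '{"mobileNumber":""+91 000"","otpSize":6}'  (unparseable)

# New style: the serializer escapes values and emits a compact body:
new_body = json.dumps({'mobileNumber': mobile, 'otpSize': 6}).encode()
# -> b'{"mobileNumber": "\\"+91 000\\"", "otpSize": 6}'

+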
otp_verify_json = self._download_json( 'https://apiv2.sonyliv.com/AGL/2.0/A/ENG/WEB/IN/HR/CONFIRMOTP-V2', - None, note='Verifying OTP', data=data.encode(), headers=self._HEADERS) + None, note='Verifying OTP', headers=self._HEADERS, data=json.dumps({ + 'channelPartnerID': 'MSMIND', + 'mobileNumber': username, + 'country': 'IN', + 'otp': self._get_tfa_info('OTP'), + 'dmaId': 'IN', + 'ageConfirmation': True, + 'timestamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'), + 'isMobileMandatory': True, + }).encode()) if otp_verify_json['resultCode'] == 'KO': raise ExtractorError(otp_request_json['message'], expected=True) self._HEADERS['authorization'] = otp_verify_json['resultObj']['accessToken'] @@ -141,7 +150,6 @@ class SonyLIVIE(InfoExtractor): video_id, 'mp4', m3u8_id='hls', headers=headers, fatal=False)) for f in formats: f.setdefault('http_headers', {}).update(headers) - self._sort_formats(formats) metadata = self._call_api( '1.6', 'IN/DETAIL/' + video_id, video_id)['containers'][0]['metadata'] diff --git a/hypervideo_dl/extractor/soundcloud.py b/hypervideo_dl/extractor/soundcloud.py index 92535f7..c2344dd 100644 --- a/hypervideo_dl/extractor/soundcloud.py +++ b/hypervideo_dl/extractor/soundcloud.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import re import json @@ -12,7 +9,6 @@ from .common import ( ) from ..compat import ( compat_HTTPError, - compat_kwargs, compat_str, ) from ..utils import ( @@ -23,7 +19,6 @@ from ..utils import ( int_or_none, KNOWN_EXTENSIONS, mimetype2ext, - remove_end, parse_qs, str_or_none, try_get, @@ -37,18 +32,13 @@ from ..utils import ( class SoundcloudEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1'] _TEST = { # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/ 'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey', 'only_matching': True, } - @staticmethod - def _extract_urls(webpage): - return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', - webpage)] - def _real_extract(self, url): query = parse_qs(url) api_url = query['url'][0] @@ -70,8 +60,23 @@ class SoundcloudBaseIE(InfoExtractor): _access_token = None _HEADERS = {} + _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' + + _ARTWORK_MAP = { + 'mini': 16, + 'tiny': 20, + 'small': 32, + 'badge': 47, + 't67x67': 67, + 'large': 100, + 't300x300': 300, + 'crop': 400, + 't500x500': 500, + 'original': 0, + } + def _store_client_id(self, client_id): - self._downloader.cache.store('soundcloud', 'client_id', client_id) + self.cache.store('soundcloud', 'client_id', client_id) def _update_client_id(self): webpage = self._download_webpage('https://soundcloud.com/', None) @@ -96,7 +101,7 @@ class SoundcloudBaseIE(InfoExtractor): query['client_id'] = self._CLIENT_ID kwargs['query'] = query try: - return super()._download_json(*args, **compat_kwargs(kwargs)) + return super()._download_json(*args, **kwargs) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 403): self._store_client_id(None) @@ -108,7 +113,7 @@ class SoundcloudBaseIE(InfoExtractor): raise def 
_initialize_pre_login(self): - self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf' + self._CLIENT_ID = self.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf' def _perform_login(self, username, password): if username != 'oauth': @@ -189,6 +194,157 @@ class SoundcloudBaseIE(InfoExtractor): return out + def _extract_info_dict(self, info, full_title=None, secret_token=None, extract_flat=False): + track_id = compat_str(info['id']) + title = info['title'] + + format_urls = set() + formats = [] + query = {'client_id': self._CLIENT_ID} + if secret_token: + query['secret_token'] = secret_token + + if not extract_flat and info.get('downloadable') and info.get('has_downloads_left'): + download_url = update_url_query( + self._API_V2_BASE + 'tracks/' + track_id + '/download', query) + redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') + if redirect_url: + urlh = self._request_webpage( + HEADRequest(redirect_url), track_id, fatal=False) + if urlh: + format_url = urlh.geturl() + format_urls.add(format_url) + formats.append({ + 'format_id': 'download', + 'ext': urlhandle_detect_ext(urlh) or 'mp3', + 'filesize': int_or_none(urlh.headers.get('Content-Length')), + 'url': format_url, + 'quality': 10, + }) + + def invalid_url(url): + return not url or url in format_urls + + def add_format(f, protocol, is_preview=False): + mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) + if mobj: + for k, v in mobj.groupdict().items(): + if not f.get(k): + f[k] = v + format_id_list = [] + if protocol: + format_id_list.append(protocol) + ext = f.get('ext') + if ext == 'aac': + f['abr'] = '256' + for k in ('ext', 'abr'): + v = f.get(k) + if v: + format_id_list.append(v) + preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) + if preview: + format_id_list.append('preview') + abr = f.get('abr') + if abr: + f['abr'] = int(abr) + if protocol == 'hls': + protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' + else: + protocol = 'http' + f.update({ + 'format_id': '_'.join(format_id_list), + 'protocol': protocol, + 'preference': -10 if preview else None, + }) + formats.append(f) + + # New API + transcodings = try_get( + info, lambda x: x['media']['transcodings'], list) or [] + for t in transcodings: + if not isinstance(t, dict): + continue + format_url = url_or_none(t.get('url')) + if not format_url: + continue + stream = None if extract_flat else self._download_json( + format_url, track_id, query=query, fatal=False, headers=self._HEADERS) + if not isinstance(stream, dict): + continue + stream_url = url_or_none(stream.get('url')) + if invalid_url(stream_url): + continue + format_urls.add(stream_url) + stream_format = t.get('format') or {} + protocol = stream_format.get('protocol') + if protocol != 'hls' and '/hls' in format_url: + protocol = 'hls' + ext = None + preset = str_or_none(t.get('preset')) + if preset: + ext = preset.split('_')[0] + if ext not in KNOWN_EXTENSIONS: + ext = mimetype2ext(stream_format.get('mime_type')) + add_format({ + 'url': stream_url, + 'ext': ext, + }, 'http' if protocol == 'progressive' else protocol, + t.get('snipped') or '/preview/' in format_url) + + for f in formats: + f['vcodec'] = 'none' + + if not formats and info.get('policy') == 'BLOCK': + self.raise_geo_restricted(metadata_available=True) + + user = info.get('user') or {} + + thumbnails = [] + artwork_url = info.get('artwork_url') + thumbnail = artwork_url or 
user.get('avatar_url') + if isinstance(thumbnail, compat_str): + if re.search(self._IMAGE_REPL_RE, thumbnail): + for image_id, size in self._ARTWORK_MAP.items(): + i = { + 'id': image_id, + 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail), + } + if image_id == 'tiny' and not artwork_url: + size = 18 + elif image_id == 'original': + i['preference'] = 10 + if size: + i.update({ + 'width': size, + 'height': size, + }) + thumbnails.append(i) + else: + thumbnails = [{'url': thumbnail}] + + def extract_count(key): + return int_or_none(info.get('%s_count' % key)) + + return { + 'id': track_id, + 'uploader': user.get('username'), + 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), + 'uploader_url': user.get('permalink_url'), + 'timestamp': unified_timestamp(info.get('created_at')), + 'title': title, + 'description': info.get('description'), + 'thumbnails': thumbnails, + 'duration': float_or_none(info.get('duration'), 1000), + 'webpage_url': info.get('permalink_url'), + 'license': info.get('license'), + 'view_count': extract_count('playback'), + 'like_count': extract_count('favoritings') or extract_count('likes'), + 'comment_count': extract_count('comment'), + 'repost_count': extract_count('reposts'), + 'genre': info.get('genre'), + 'formats': formats if not extract_flat else None + } + @classmethod def _resolv_url(cls, url): return cls._API_V2_BASE + 'resolve?url=' + url @@ -387,173 +543,6 @@ class SoundcloudIE(SoundcloudBaseIE): }, ] - _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' - - _ARTWORK_MAP = { - 'mini': 16, - 'tiny': 20, - 'small': 32, - 'badge': 47, - 't67x67': 67, - 'large': 100, - 't300x300': 300, - 'crop': 400, - 't500x500': 500, - 'original': 0, - } - - def _extract_info_dict(self, info, full_title=None, secret_token=None): - track_id = compat_str(info['id']) - title = info['title'] - - format_urls = set() - formats = [] - query = {'client_id': self._CLIENT_ID} - if secret_token: - query['secret_token'] = secret_token - - if info.get('downloadable') and info.get('has_downloads_left'): - download_url = update_url_query( - self._API_V2_BASE + 'tracks/' + track_id + '/download', query) - redirect_url = (self._download_json(download_url, track_id, fatal=False) or {}).get('redirectUri') - if redirect_url: - urlh = self._request_webpage( - HEADRequest(redirect_url), track_id, fatal=False) - if urlh: - format_url = urlh.geturl() - format_urls.add(format_url) - formats.append({ - 'format_id': 'download', - 'ext': urlhandle_detect_ext(urlh) or 'mp3', - 'filesize': int_or_none(urlh.headers.get('Content-Length')), - 'url': format_url, - 'quality': 10, - }) - - def invalid_url(url): - return not url or url in format_urls - - def add_format(f, protocol, is_preview=False): - mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url) - if mobj: - for k, v in mobj.groupdict().items(): - if not f.get(k): - f[k] = v - format_id_list = [] - if protocol: - format_id_list.append(protocol) - ext = f.get('ext') - if ext == 'aac': - f['abr'] = '256' - for k in ('ext', 'abr'): - v = f.get(k) - if v: - format_id_list.append(v) - preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) - if preview: - format_id_list.append('preview') - abr = f.get('abr') - if abr: - f['abr'] = int(abr) - if protocol == 'hls': - protocol = 'm3u8' if ext == 'aac' else 'm3u8_native' - else: - protocol = 'http' - f.update({ - 'format_id': '_'.join(format_id_list), - 'protocol': protocol, - 'preference': -10 if preview else None, - }) - formats.append(f) - - # 
New API - transcodings = try_get( - info, lambda x: x['media']['transcodings'], list) or [] - for t in transcodings: - if not isinstance(t, dict): - continue - format_url = url_or_none(t.get('url')) - if not format_url: - continue - stream = self._download_json( - format_url, track_id, query=query, fatal=False, headers=self._HEADERS) - if not isinstance(stream, dict): - continue - stream_url = url_or_none(stream.get('url')) - if invalid_url(stream_url): - continue - format_urls.add(stream_url) - stream_format = t.get('format') or {} - protocol = stream_format.get('protocol') - if protocol != 'hls' and '/hls' in format_url: - protocol = 'hls' - ext = None - preset = str_or_none(t.get('preset')) - if preset: - ext = preset.split('_')[0] - if ext not in KNOWN_EXTENSIONS: - ext = mimetype2ext(stream_format.get('mime_type')) - add_format({ - 'url': stream_url, - 'ext': ext, - }, 'http' if protocol == 'progressive' else protocol, - t.get('snipped') or '/preview/' in format_url) - - for f in formats: - f['vcodec'] = 'none' - - if not formats and info.get('policy') == 'BLOCK': - self.raise_geo_restricted(metadata_available=True) - self._sort_formats(formats) - - user = info.get('user') or {} - - thumbnails = [] - artwork_url = info.get('artwork_url') - thumbnail = artwork_url or user.get('avatar_url') - if isinstance(thumbnail, compat_str): - if re.search(self._IMAGE_REPL_RE, thumbnail): - for image_id, size in self._ARTWORK_MAP.items(): - i = { - 'id': image_id, - 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail), - } - if image_id == 'tiny' and not artwork_url: - size = 18 - elif image_id == 'original': - i['preference'] = 10 - if size: - i.update({ - 'width': size, - 'height': size, - }) - thumbnails.append(i) - else: - thumbnails = [{'url': thumbnail}] - - def extract_count(key): - return int_or_none(info.get('%s_count' % key)) - - return { - 'id': track_id, - 'uploader': user.get('username'), - 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'), - 'uploader_url': user.get('permalink_url'), - 'timestamp': unified_timestamp(info.get('created_at')), - 'title': title, - 'description': info.get('description'), - 'thumbnails': thumbnails, - 'duration': float_or_none(info.get('duration'), 1000), - 'webpage_url': info.get('permalink_url'), - 'license': info.get('license'), - 'view_count': extract_count('playback'), - 'like_count': extract_count('favoritings') or extract_count('likes'), - 'comment_count': extract_count('comment'), - 'repost_count': extract_count('reposts'), - 'genre': info.get('genre'), - 'formats': formats - } - def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -670,25 +659,20 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudBaseIE): 'offset': 0, } - retries = self.get_param('extractor_retries', 3) - for i in itertools.count(): - attempt, last_error = -1, None - while attempt < retries: - attempt += 1 - if last_error: - self.report_warning('%s. Retrying ...' 
% remove_end(last_error, '.'), playlist_id) + for retry in self.RetryManager(): try: response = self._download_json( url, playlist_id, query=query, headers=self._HEADERS, - note='Downloading track page %s%s' % (i + 1, f' (retry #{attempt})' if attempt else '')) + note=f'Downloading track page {i + 1}') break except ExtractorError as e: # Downloading page may result in intermittent 502 HTTP error # See https://github.com/hypervideo/hypervideo/issues/872 - if attempt >= retries or not isinstance(e.cause, compat_HTTPError) or e.cause.code != 502: + if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 502: raise - last_error = str(e.cause or e.msg) + retry.error = e + continue def resolve_entry(*candidates): for cand in candidates: @@ -906,6 +890,7 @@ class SoundcloudSearchIE(SoundcloudBaseIE, SearchInfoExtractor): _TESTS = [{ 'url': 'scsearch15:post-avant jazzcore', 'info_dict': { + 'id': 'post-avant jazzcore', 'title': 'post-avant jazzcore', }, 'playlist_count': 15, @@ -932,7 +917,8 @@ class SoundcloudSearchIE(SoundcloudBaseIE, SearchInfoExtractor): for item in response.get('collection') or []: if item: - yield self.url_result(item['uri'], SoundcloudIE.ie_key()) + yield self.url_result( + item['uri'], SoundcloudIE.ie_key(), **self._extract_info_dict(item, extract_flat=True)) next_url = response.get('next_href') if not next_url: diff --git a/hypervideo_dl/extractor/soundgasm.py b/hypervideo_dl/extractor/soundgasm.py index d608eb7..9e59c7c 100644 --- a/hypervideo_dl/extractor/soundgasm.py +++ b/hypervideo_dl/extractor/soundgasm.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/southpark.py b/hypervideo_dl/extractor/southpark.py index 942a52d..e23f192 100644 --- a/hypervideo_dl/extractor/southpark.py +++ b/hypervideo_dl/extractor/southpark.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .mtv import MTVServicesInfoExtractor @@ -37,7 +34,7 @@ class SouthParkIE(MTVServicesInfoExtractor): } -class SouthParkEsIE(SouthParkIE): +class SouthParkEsIE(SouthParkIE): # XXX: Do not subclass from concrete IE IE_NAME = 'southpark.cc.com:español' _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/es/episodios/(?P<id>.+?)(\?|#|$))' _LANG = 'es' @@ -53,7 +50,7 @@ class SouthParkEsIE(SouthParkIE): }] -class SouthParkDeIE(SouthParkIE): +class SouthParkDeIE(SouthParkIE): # XXX: Do not subclass from concrete IE IE_NAME = 'southpark.de' _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:(en/(videoclip|collections|episodes|video-clips))|(videoclip|collections|folgen))/(?P<id>(?P<unique_id>.+?)/.+?)(?:\?|#|$))' _TESTS = [{ @@ -112,7 +109,50 @@ class SouthParkDeIE(SouthParkIE): return -class SouthParkNlIE(SouthParkIE): +class SouthParkLatIE(SouthParkIE): # XXX: Do not subclass from concrete IE + IE_NAME = 'southpark.lat' + _VALID_URL = r'https?://(?:www\.)?southpark\.lat/(?:en/)?(?:video-?clips?|collections|episod(?:e|io)s)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.southpark.lat/en/video-clips/ct46op/south-park-tooth-fairy-cartman', + 'only_matching': True, + }, { + 'url': 'https://www.southpark.lat/episodios/9h0qbg/south-park-orgia-gatuna-temporada-3-ep-7', + 'only_matching': True, + }, { + 'url': 'https://www.southpark.lat/en/collections/29ve08/south-park-heating-up/lydbrc', + 'only_matching': True, + }, { + # clip + 'url': 'https://www.southpark.lat/en/video-clips/ct46op/south-park-tooth-fairy-cartman', + 'info_dict': { + 'id': 
'e99d45ea-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Tooth Fairy Cartman', + 'description': 'md5:db02e23818b4dc9cb5f0c5a7e8833a68', + }, + }, { + # episode + 'url': 'https://www.southpark.lat/episodios/9h0qbg/south-park-orgia-gatuna-temporada-3-ep-7', + 'info_dict': { + 'id': 'f5fbd823-04bc-11eb-9b1b-0e40cf2fc285', + 'ext': 'mp4', + 'title': 'South Park', + 'description': 'md5:ae0d875eff169dcbed16b21531857ac1', + }, + }] + + def _get_feed_url(self, uri, url=None): + video_id = self._id_from_uri(uri) + config = self._download_json( + f'http://media.mtvnservices.com/pmt/e1/access/index.html?uri={uri}&configtype=edge&ref={url}', + video_id) + return self._remove_template_parameter(config['feedWithQueryParams']) + + def _get_feed_query(self, uri): + return + + +class SouthParkNlIE(SouthParkIE): # XXX: Do not subclass from concrete IE IE_NAME = 'southpark.nl' _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.nl/(?:clips|(?:full-)?episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southpark.nl/feeds/video-player/mrss/' @@ -127,7 +167,7 @@ class SouthParkNlIE(SouthParkIE): }] -class SouthParkDkIE(SouthParkIE): +class SouthParkDkIE(SouthParkIE): # XXX: Do not subclass from concrete IE IE_NAME = 'southparkstudios.dk' _VALID_URL = r'https?://(?:www\.)?(?P<url>southparkstudios\.(?:dk|nu)/(?:clips|full-episodes|collections)/(?P<id>.+?)(\?|#|$))' _FEED_URL = 'http://www.southparkstudios.dk/feeds/video-player/mrss/' diff --git a/hypervideo_dl/extractor/sovietscloset.py b/hypervideo_dl/extractor/sovietscloset.py index 4bc2263..453016c 100644 --- a/hypervideo_dl/extractor/sovietscloset.py +++ b/hypervideo_dl/extractor/sovietscloset.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( try_get, @@ -47,7 +44,7 @@ class SovietsClosetIE(SovietsClosetBaseIE): _TESTS = [ { 'url': 'https://sovietscloset.com/video/1337', - 'md5': '11e58781c4ca5b283307aa54db5b3f93', + 'md5': 'bd012b04b261725510ca5383074cdd55', 'info_dict': { 'id': '1337', 'ext': 'mp4', @@ -72,11 +69,11 @@ class SovietsClosetIE(SovietsClosetBaseIE): }, { 'url': 'https://sovietscloset.com/video/1105', - 'md5': '578b1958a379e7110ba38697042e9efb', + 'md5': '89fa928f183893cb65a0b7be846d8a90', 'info_dict': { 'id': '1105', 'ext': 'mp4', - 'title': 'Arma 3 - Zeus Games #3', + 'title': 'Arma 3 - Zeus Games #5', 'uploader': 'SovietWomble', 'thumbnail': r're:^https?://.*\.b-cdn\.net/c0e5e76f-3a93-40b4-bf01-12343c2eec5d/thumbnail\.jpg$', 'uploader': 'SovietWomble', @@ -92,8 +89,8 @@ class SovietsClosetIE(SovietsClosetBaseIE): 'availability': 'public', 'series': 'Arma 3', 'season': 'Zeus Games', - 'episode_number': 3, - 'episode': 'Episode 3', + 'episode_number': 5, + 'episode': 'Episode 5', }, }, ] @@ -107,7 +104,6 @@ class SovietsClosetIE(SovietsClosetBaseIE): thumbnail_url = self._search_regex(r'(https?://.*?thumbnail\.jpg)', iframe, 'thumbnail url') m3u8_formats = self._extract_m3u8_formats(m3u8_url, video_id, headers=self.MEDIADELIVERY_REFERER) - self._sort_formats(m3u8_formats) if not m3u8_formats: duration = None @@ -125,7 +121,7 @@ class SovietsClosetIE(SovietsClosetBaseIE): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - static_assets_base = self._search_regex(r'staticAssetsBase:\"(.*?)\"', webpage, 'staticAssetsBase') + static_assets_base = self._search_regex(r'(/_nuxt/static/\d+)', webpage, 'staticAssetsBase') static_assets_base = f'https://sovietscloset.com{static_assets_base}' stream = 
self.parse_nuxt_jsonp(f'{static_assets_base}/video/{video_id}/payload.js', video_id, 'video')['stream'] @@ -184,7 +180,7 @@ class SovietsClosetPlaylistIE(SovietsClosetBaseIE): webpage = self._download_webpage(url, playlist_id) - static_assets_base = self._search_regex(r'staticAssetsBase:\"(.*?)\"', webpage, 'staticAssetsBase') + static_assets_base = self._search_regex(r'(/_nuxt/static/\d+)', webpage, 'staticAssetsBase') static_assets_base = f'https://sovietscloset.com{static_assets_base}' sovietscloset = self.parse_nuxt_jsonp(f'{static_assets_base}/payload.js', playlist_id, 'global')['games'] diff --git a/hypervideo_dl/extractor/spankbang.py b/hypervideo_dl/extractor/spankbang.py index dd849ae..f242d33 100644 --- a/hypervideo_dl/extractor/spankbang.py +++ b/hypervideo_dl/extractor/spankbang.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -130,8 +128,6 @@ class SpankBangIE(InfoExtractor): format_url = format_url[0] extract_format(format_id, format_url) - self._sort_formats(formats) - info = self._search_json_ld(webpage, video_id, default={}) title = self._html_search_regex( diff --git a/hypervideo_dl/extractor/spankwire.py b/hypervideo_dl/extractor/spankwire.py index e97c1d2..334b297 100644 --- a/hypervideo_dl/extractor/spankwire.py +++ b/hypervideo_dl/extractor/spankwire.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -23,6 +21,7 @@ class SpankwireIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)'] _TESTS = [{ # download URL pattern: */<height>P_<tbr>K_<video_id>.mp4 'url': 'http://www.spankwire.com/Buckcherry-s-X-Rated-Music-Video-Crazy-Bitch/video103545/', @@ -67,12 +66,6 @@ class SpankwireIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?spankwire\.com/EmbedPlayer\.aspx/?\?.*?\bArticleId=\d+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) @@ -108,7 +101,6 @@ class SpankwireIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) - self._sort_formats(formats) view_count = str_to_int(video.get('viewed')) diff --git a/hypervideo_dl/extractor/spiegel.py b/hypervideo_dl/extractor/spiegel.py index 58f2ed3..3701e29 100644 --- a/hypervideo_dl/extractor/spiegel.py +++ b/hypervideo_dl/extractor/spiegel.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .jwplatform import JWPlatformIE diff --git a/hypervideo_dl/extractor/spiegeltv.py b/hypervideo_dl/extractor/spiegeltv.py deleted file mode 100644 index 6ccf4c3..0000000 --- a/hypervideo_dl/extractor/spiegeltv.py +++ /dev/null @@ -1,17 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .nexx import NexxIE - - -class SpiegeltvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)' - _TEST = { - 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/', - 'only_matching': True, - } - - def _real_extract(self, url): - return self.url_result( - 'https://api.nexx.cloud/v3/748/videos/byid/%s' - % self._match_id(url), ie=NexxIE.ie_key()) diff --git a/hypervideo_dl/extractor/spike.py b/hypervideo_dl/extractor/spike.py index 5805f3d..5c1c78d 100644 --- 
a/hypervideo_dl/extractor/spike.py +++ b/hypervideo_dl/extractor/spike.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .mtv import MTVServicesInfoExtractor diff --git a/hypervideo_dl/extractor/sport5.py b/hypervideo_dl/extractor/sport5.py index 35c57d6..44b4067 100644 --- a/hypervideo_dl/extractor/sport5.py +++ b/hypervideo_dl/extractor/sport5.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ExtractorError @@ -78,7 +74,6 @@ class Sport5IE(InfoExtractor): 'width': int(fmt.get('width')), 'height': int(fmt.get('height')), } for fmt in metadata.findall('./PlaybackLinks/FileURL')] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/sportbox.py b/hypervideo_dl/extractor/sportbox.py index b9017fd..ccbb0e8 100644 --- a/hypervideo_dl/extractor/sportbox.py +++ b/hypervideo_dl/extractor/sportbox.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -14,6 +9,7 @@ from ..utils import ( class SportBoxIE(InfoExtractor): _VALID_URL = r'https?://(?:news\.sportbox|matchtv)\.ru/vdl/player(?:/[^/]+/|\?.*?\bn?id=)(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"'] _TESTS = [{ 'url': 'http://news.sportbox.ru/vdl/player/ci/211355', 'info_dict': { @@ -45,12 +41,6 @@ class SportBoxIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src="(https?://(?:news\.sportbox|matchtv)\.ru/vdl/player[^"]+)"', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) @@ -75,7 +65,6 @@ class SportBoxIE(InfoExtractor): formats.append({ 'url': src, }) - self._sort_formats(formats) player = self._parse_json( self._search_regex( diff --git a/hypervideo_dl/extractor/sportdeutschland.py b/hypervideo_dl/extractor/sportdeutschland.py index 15b488a..75074b3 100644 --- a/hypervideo_dl/extractor/sportdeutschland.py +++ b/hypervideo_dl/extractor/sportdeutschland.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/spotify.py b/hypervideo_dl/extractor/spotify.py index 826f98c..55ce36a 100644 --- a/hypervideo_dl/extractor/spotify.py +++ b/hypervideo_dl/extractor/spotify.py @@ -1,34 +1,36 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import functools import json import re from .common import InfoExtractor from ..utils import ( + OnDemandPagedList, clean_podcast_url, float_or_none, int_or_none, strip_or_none, + traverse_obj, try_get, unified_strdate, ) class SpotifyBaseIE(InfoExtractor): + _WORKING = False _ACCESS_TOKEN = None _OPERATION_HASHES = { 'Episode': '8276d4423d709ae9b68ec1b74cc047ba0f7479059a37820be730f125189ac2bf', 'MinimalShow': '13ee079672fad3f858ea45a55eb109553b4fb0969ed793185b2e34cbb6ee7cc0', 'ShowEpisodes': 'e0e5ce27bd7748d2c59b4d44ba245a8992a05be75d6fabc3b20753fc8857444d', } - _VALID_URL_TEMPL = r'https?://open\.spotify\.com/%s/(?P<id>[^/?&#]+)' + _VALID_URL_TEMPL = r'https?://open\.spotify\.com/(?:embed-podcast/|embed/|)%s/(?P<id>[^/?&#]+)' + _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://open\.spotify.com/embed/[^"]+)"'] def _real_initialize(self): self._ACCESS_TOKEN = self._download_json( 'https://open.spotify.com/get_access_token', None)['accessToken'] - def 
_call_api(self, operation, video_id, variables): + def _call_api(self, operation, video_id, variables, **kwargs): return self._download_json( 'https://api-partner.spotify.com/pathfinder/v1/query', video_id, query={ 'operationName': 'query' + operation, @@ -38,7 +40,8 @@ class SpotifyBaseIE(InfoExtractor): 'sha256Hash': self._OPERATION_HASHES[operation], }, }) - }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN})['data'] + }, headers={'authorization': 'Bearer ' + self._ACCESS_TOKEN}, + **kwargs)['data'] def _extract_episode(self, episode, series): episode_id = episode['id'] @@ -99,8 +102,9 @@ class SpotifyBaseIE(InfoExtractor): class SpotifyIE(SpotifyBaseIE): IE_NAME = 'spotify' + IE_DESC = 'Spotify episodes' _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'episode' - _TEST = { + _TESTS = [{ 'url': 'https://open.spotify.com/episode/4Z7GAJ50bgctf6uclHlWKo', 'md5': '74010a1e3fa4d9e1ab3aa7ad14e42d3b', 'info_dict': { @@ -112,7 +116,10 @@ class SpotifyIE(SpotifyBaseIE): 'release_date': '20201217', 'series': "The Guardian's Audio Long Reads", } - } + }, { + 'url': 'https://open.spotify.com/embed/episode/4TvCsKKs2thXmarHigWvXE?si=7eatS8AbQb6RxqO2raIuWA', + 'only_matching': True, + }] def _real_extract(self, url): episode_id = self._match_id(url) @@ -125,6 +132,7 @@ class SpotifyIE(SpotifyBaseIE): class SpotifyShowIE(SpotifyBaseIE): IE_NAME = 'spotify:show' + IE_DESC = 'Spotify shows' _VALID_URL = SpotifyBaseIE._VALID_URL_TEMPL % 'show' _TEST = { 'url': 'https://open.spotify.com/show/4PM9Ke6l66IRNpottHKV9M', @@ -135,22 +143,25 @@ class SpotifyShowIE(SpotifyBaseIE): }, 'playlist_mincount': 36, } + _PER_PAGE = 100 + + def _fetch_page(self, show_id, page=0): + return self._call_api('ShowEpisodes', show_id, { + 'limit': 100, + 'offset': page * self._PER_PAGE, + 'uri': f'spotify:show:{show_id}', + }, note=f'Downloading page {page + 1} JSON metadata')['podcast'] def _real_extract(self, url): show_id = self._match_id(url) - podcast = self._call_api('ShowEpisodes', show_id, { - 'limit': 1000000000, - 'offset': 0, - 'uri': 'spotify:show:' + show_id, - })['podcast'] - podcast_name = podcast.get('name') - - entries = [] - for item in (try_get(podcast, lambda x: x['episodes']['items']) or []): - episode = item.get('episode') - if not episode: - continue - entries.append(self._extract_episode(episode, podcast_name)) + first_page = self._fetch_page(show_id) + + def _entries(page): + podcast = self._fetch_page(show_id, page) if page else first_page + yield from map( + functools.partial(self._extract_episode, series=podcast.get('name')), + traverse_obj(podcast, ('episodes', 'items', ..., 'episode'))) return self.playlist_result( - entries, show_id, podcast_name, podcast.get('description')) + OnDemandPagedList(_entries, self._PER_PAGE), + show_id, first_page.get('name'), first_page.get('description')) diff --git a/hypervideo_dl/extractor/spreaker.py b/hypervideo_dl/extractor/spreaker.py index 6c7e40a..36a9bd2 100644 --- a/hypervideo_dl/extractor/spreaker.py +++ b/hypervideo_dl/extractor/spreaker.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/springboardplatform.py b/hypervideo_dl/extractor/springboardplatform.py index 49ac1f5..a98584a 100644 --- a/hypervideo_dl/extractor/springboardplatform.py +++ b/hypervideo_dl/extractor/springboardplatform.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -24,6 +21,7 @@ class 
SpringboardPlatformIE(InfoExtractor): xml_feeds_advanced/index/(?P<index_2>\d+)/rss3/(?P<id_2>\d+) ) ''' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1'] _TESTS = [{ 'url': 'http://cms.springboardplatform.com/previews/159/video/981017/0/0/1', 'md5': '5c3cb7b5c55740d482561099e920f192', @@ -48,14 +46,6 @@ class SpringboardPlatformIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cms\.springboardplatform\.com/embed_iframe/\d+/video/\d+.*?)\1', - webpage)] - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') or mobj.group('id_2') @@ -112,8 +102,6 @@ class SpringboardPlatformIE(InfoExtractor): }) formats.append(m3u8_format) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/sprout.py b/hypervideo_dl/extractor/sprout.py index e243732..444a6c2 100644 --- a/hypervideo_dl/extractor/sprout.py +++ b/hypervideo_dl/extractor/sprout.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .adobepass import AdobePassIE from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/srgssr.py b/hypervideo_dl/extractor/srgssr.py index f991981..145f25e 100644 --- a/hypervideo_dl/extractor/srgssr.py +++ b/hypervideo_dl/extractor/srgssr.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -132,7 +128,6 @@ class SRGSSRIE(InfoExtractor): 'url': podcast_url, 'quality': q(quality), }) - self._sort_formats(formats) if media_type == 'video': for sub in (media_data.get('subtitleList') or []): diff --git a/hypervideo_dl/extractor/srmediathek.py b/hypervideo_dl/extractor/srmediathek.py index 359dada..3cc3987 100644 --- a/hypervideo_dl/extractor/srmediathek.py +++ b/hypervideo_dl/extractor/srmediathek.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .ard import ARDMediathekBaseIE from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/stanfordoc.py b/hypervideo_dl/extractor/stanfordoc.py index 0003075..be0f4af 100644 --- a/hypervideo_dl/extractor/stanfordoc.py +++ b/hypervideo_dl/extractor/stanfordoc.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/startrek.py b/hypervideo_dl/extractor/startrek.py new file mode 100644 index 0000000..e92122f --- /dev/null +++ b/hypervideo_dl/extractor/startrek.py @@ -0,0 +1,75 @@ +from .common import InfoExtractor +from ..utils import int_or_none, urljoin + + +class StarTrekIE(InfoExtractor): + _VALID_URL = r'(?P<base>https?://(?:intl|www)\.startrek\.com)/videos/(?P<id>[^/]+)' + _TESTS = [{ + 'url': 'https://intl.startrek.com/videos/watch-welcoming-jess-bush-to-the-ready-room', + 'md5': '491df5035c9d4dc7f63c79caaf9c839e', + 'info_dict': { + 'id': 'watch-welcoming-jess-bush-to-the-ready-room', + 'ext': 'mp4', + 'title': 'WATCH: Welcoming Jess Bush to The Ready Room', + 'duration': 1888, + 'timestamp': 1655388000, + 'upload_date': '20220616', + 'description': 'md5:1ffee884e3920afbdd6dd04e926a1221', + 'thumbnail': r're:https://(?:intl|www)\.startrek\.com/sites/default/files/styles/video_1920x1080/public/images/2022-06/pp_14794_rr_thumb_107_yt_16x9\.jpg(?:\?.+)?', + 'subtitles': 
{'en-US': [{ + 'url': r're:https://(?:intl|www)\.startrek\.com/sites/default/files/video/captions/2022-06/TRR_SNW_107_v4\.vtt', + }, { + 'url': 'https://media.startrek.com/2022/06/16/2043801155561/1069981_hls/trr_snw_107_v4-c4bfc25d/stream_vtt.m3u8', + }]}, + } + }, { + 'url': 'https://www.startrek.com/videos/watch-ethan-peck-and-gia-sandhu-beam-down-to-the-ready-room', + 'md5': 'f5ad74fbb86e91e0882fc0a333178d1d', + 'info_dict': { + 'id': 'watch-ethan-peck-and-gia-sandhu-beam-down-to-the-ready-room', + 'ext': 'mp4', + 'title': 'WATCH: Ethan Peck and Gia Sandhu Beam Down to The Ready Room', + 'duration': 1986, + 'timestamp': 1654221600, + 'upload_date': '20220603', + 'description': 'md5:b3aa0edacfe119386567362dec8ed51b', + 'thumbnail': r're:https://www\.startrek\.com/sites/default/files/styles/video_1920x1080/public/images/2022-06/pp_14792_rr_thumb_105_yt_16x9_1.jpg(?:\?.+)?', + 'subtitles': {'en-US': [{ + 'url': r're:https://(?:intl|www)\.startrek\.com/sites/default/files/video/captions/2022-06/TRR_SNW_105_v5\.vtt', + }]}, + } + }] + + def _real_extract(self, url): + urlbase, video_id = self._match_valid_url(url).group('base', 'id') + webpage = self._download_webpage(url, video_id) + + player = self._search_regex( + r'(<\s*div\s+id\s*=\s*"cvp-player-[^<]+<\s*/div\s*>)', webpage, 'player') + + hls = self._html_search_regex(r'\bdata-hls\s*=\s*"([^"]+)"', player, 'HLS URL') + formats, subtitles = self._extract_m3u8_formats_and_subtitles(hls, video_id, 'mp4') + + captions = self._html_search_regex( + r'\bdata-captions-url\s*=\s*"([^"]+)"', player, 'captions URL', fatal=False) + if captions: + subtitles.setdefault('en-US', [])[:0] = [{'url': urljoin(urlbase, captions)}] + + # NB: Most of the data in the json_ld is undesirable + json_ld = self._search_json_ld(webpage, video_id, fatal=False) + + return { + 'id': video_id, + 'title': self._html_search_regex( + r'\bdata-title\s*=\s*"([^"]+)"', player, 'title', json_ld.get('title')), + 'description': self._html_search_regex( + r'(?s)<\s*div\s+class\s*=\s*"header-body"\s*>(.+?)<\s*/div\s*>', + webpage, 'description', fatal=False), + 'duration': int_or_none(self._html_search_regex( + r'\bdata-duration\s*=\s*"(\d+)"', player, 'duration', fatal=False)), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': urljoin(urlbase, self._html_search_regex( + r'\bdata-poster-url\s*=\s*"([^"]+)"', player, 'thumbnail', fatal=False)), + 'timestamp': json_ld.get('timestamp'), + } diff --git a/hypervideo_dl/extractor/startv.py b/hypervideo_dl/extractor/startv.py index 411320e..bb6e8f1 100644 --- a/hypervideo_dl/extractor/startv.py +++ b/hypervideo_dl/extractor/startv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, diff --git a/hypervideo_dl/extractor/steam.py b/hypervideo_dl/extractor/steam.py index 4ed0fb5..7daee2f 100644 --- a/hypervideo_dl/extractor/steam.py +++ b/hypervideo_dl/extractor/steam.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -111,7 +109,6 @@ class SteamIE(InfoExtractor): 'format_id': ext + quality, 'url': video_url, }) - self._sort_formats(formats) entry['formats'] = formats entries.append(entry) embedded_videos = re.findall(r'(<iframe[^>]+>)', webpage) @@ -129,3 +126,49 @@ class SteamIE(InfoExtractor): raise ExtractorError('Could not find any videos') return self.playlist_result(entries, playlist_id, playlist_title) + + +class SteamCommunityBroadcastIE(InfoExtractor): + _VALID_URL 
= r'https?://steamcommunity\.(?:com)/broadcast/watch/(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://steamcommunity.com/broadcast/watch/76561199073851486', + 'info_dict': { + 'id': '76561199073851486', + 'title': r're:Steam Community :: pepperm!nt :: Broadcast 2022-06-26 \d{2}:\d{2}', + 'ext': 'mp4', + 'uploader_id': 1113585758, + 'uploader': 'pepperm!nt', + 'live_status': 'is_live', + }, + 'skip': 'Stream has ended', + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_data = self._download_json( + 'https://steamcommunity.com/broadcast/getbroadcastmpd/', + video_id, query={'steamid': f'{video_id}'}) + + formats, subs = self._extract_m3u8_formats_and_subtitles(json_data['hls_url'], video_id) + + ''' # We cannot download live dash atm + mpd_formats, mpd_subs = self._extract_mpd_formats_and_subtitles(json_data['url'], video_id) + formats.extend(mpd_formats) + self._merge_subtitles(mpd_subs, target=subs) + ''' + + uploader_json = self._download_json( + 'https://steamcommunity.com/actions/ajaxresolveusers', + video_id, query={'steamids': video_id})[0] + + return { + 'id': video_id, + 'title': self._generic_title('', webpage), + 'formats': formats, + 'live_status': 'is_live', + 'view_count': json_data.get('num_view'), + 'uploader': uploader_json.get('persona_name'), + 'uploader_id': uploader_json.get('accountid'), + 'subtitles': subs, + } diff --git a/hypervideo_dl/extractor/stitcher.py b/hypervideo_dl/extractor/stitcher.py index 8227825..2fd200f 100644 --- a/hypervideo_dl/extractor/stitcher.py +++ b/hypervideo_dl/extractor/stitcher.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/storyfire.py b/hypervideo_dl/extractor/storyfire.py index e18a59a..035747c 100644 --- a/hypervideo_dl/extractor/storyfire.py +++ b/hypervideo_dl/extractor/storyfire.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools from .common import InfoExtractor @@ -47,7 +44,7 @@ class StoryFireBaseIE(InfoExtractor): 'timestamp': int_or_none(video.get('publishDate')), 'uploader': video.get('username'), 'uploader_id': uploader_id, - 'uploader_url': format_field(uploader_id, template='https://storyfire.com/user/%s/video'), + 'uploader_url': format_field(uploader_id, None, 'https://storyfire.com/user/%s/video'), 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')), } diff --git a/hypervideo_dl/extractor/streamable.py b/hypervideo_dl/extractor/streamable.py index 8081296..462861e 100644 --- a/hypervideo_dl/extractor/streamable.py +++ b/hypervideo_dl/extractor/streamable.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -15,6 +10,7 @@ from ..utils import ( class StreamableIE(InfoExtractor): _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(?P<q1>[\'"])(?P<url>(?:https?:)?//streamable\.com/.+?)(?P=q1)'] _TESTS = [ { 'url': 'https://streamable.com/dnd1', @@ -56,14 +52,6 @@ class StreamableIE(InfoExtractor): } ] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(?P<q1>[\'"])(?P<src>(?:https?:)?//streamable\.com/(?:(?!\1).+))(?P=q1)', - webpage) - if mobj: - return mobj.group('src') - def _real_extract(self, url): video_id = self._match_id(url) @@ 
-101,7 +89,6 @@ class StreamableIE(InfoExtractor): 'vcodec': parse_codecs(try_get(info, lambda x: x['input_metadata']['video_codec_name'])).get('vcodec'), 'acodec': parse_codecs(try_get(info, lambda x: x['input_metadata']['audio_codec_name'])).get('acodec'), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/streamanity.py b/hypervideo_dl/extractor/streamanity.py index 2e2d5ee..6eaee52 100644 --- a/hypervideo_dl/extractor/streamanity.py +++ b/hypervideo_dl/extractor/streamanity.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor @@ -38,7 +35,6 @@ class StreamanityIE(InfoExtractor): formats = self._extract_m3u8_formats( f'https://stream.mux.com/{video_info["play_id"]}.m3u8?token={video_info["token"]}', video_id, ext='mp4', m3u8_id='hls') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/streamcloud.py b/hypervideo_dl/extractor/streamcloud.py index b97bb43..7289809 100644 --- a/hypervideo_dl/extractor/streamcloud.py +++ b/hypervideo_dl/extractor/streamcloud.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/streamcz.py b/hypervideo_dl/extractor/streamcz.py index 4cb9923..c4537ba 100644 --- a/hypervideo_dl/extractor/streamcz.py +++ b/hypervideo_dl/extractor/streamcz.py @@ -1,4 +1,3 @@ -# coding: utf-8 import json from .common import InfoExtractor @@ -53,8 +52,8 @@ class StreamCZIE(InfoExtractor): def _extract_formats(self, spl_url, video): for ext, pref, streams in ( - ('ts', -1, traverse_obj(video, ('http_stream', 'qualities'))), - ('mp4', 1, video.get('mp4'))): + ('ts', -1, traverse_obj(video, ('http_stream', 'qualities')) or {}), + ('mp4', 1, video.get('mp4') or {})): for format_id, stream in streams.items(): if not stream.get('url'): continue @@ -110,7 +109,6 @@ class StreamCZIE(InfoExtractor): }) formats = list(self._extract_formats(spl_url, video)) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/streamff.py b/hypervideo_dl/extractor/streamff.py index 6b190bb..93c4294 100644 --- a/hypervideo_dl/extractor/streamff.py +++ b/hypervideo_dl/extractor/streamff.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor from ..utils import int_or_none, parse_iso8601 diff --git a/hypervideo_dl/extractor/streetvoice.py b/hypervideo_dl/extractor/streetvoice.py index f21681a..a32c8bc 100644 --- a/hypervideo_dl/extractor/streetvoice.py +++ b/hypervideo_dl/extractor/streetvoice.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/stretchinternet.py b/hypervideo_dl/extractor/stretchinternet.py index ec08eae..e438dee 100644 --- a/hypervideo_dl/extractor/stretchinternet.py +++ b/hypervideo_dl/extractor/stretchinternet.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/stripchat.py b/hypervideo_dl/extractor/stripchat.py index 0d4a0ce..4229a0b 100644 --- a/hypervideo_dl/extractor/stripchat.py +++ b/hypervideo_dl/extractor/stripchat.py @@ -1,37 +1,28 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor -from ..compat import ( - compat_str, -) -from ..utils import ( - ExtractorError, - lowercase_escape, - try_get, -) +from ..utils import 
ExtractorError, lowercase_escape, traverse_obj class StripchatIE(InfoExtractor): - _VALID_URL = r'https?://stripchat\.com/(?P<id>[0-9A-Za-z-_]+)' + _VALID_URL = r'https?://stripchat\.com/(?P<id>[^/?#]+)' _TESTS = [{ - 'url': 'https://stripchat.com/feel_me', + 'url': 'https://stripchat.com/Joselin_Flower', 'info_dict': { - 'id': 'feel_me', + 'id': 'Joselin_Flower', 'ext': 'mp4', - 'title': 're:^feel_me [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^Joselin_Flower [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': str, 'is_live': True, 'age_limit': 18, }, 'skip': 'Room is offline', + }, { + 'url': 'https://stripchat.com/Rakhijaan@xh', + 'only_matching': True }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://stripchat.com/%s/' % video_id, video_id, - headers=self.geo_verification_headers()) + webpage = self._download_webpage(url, video_id, headers=self.geo_verification_headers()) data = self._parse_json( self._search_regex( @@ -41,19 +32,24 @@ class StripchatIE(InfoExtractor): if not data: raise ExtractorError('Unable to find configuration for stream.') - if try_get(data, lambda x: x['viewCam']['show'], dict): + if traverse_obj(data, ('viewCam', 'show'), expected_type=dict): raise ExtractorError('Model is in private show', expected=True) - elif not try_get(data, lambda x: x['viewCam']['model']['isLive'], bool): + elif not traverse_obj(data, ('viewCam', 'model', 'isLive'), expected_type=bool): raise ExtractorError('Model is offline', expected=True) - server = try_get(data, lambda x: x['viewCam']['viewServers']['flashphoner-hls'], compat_str) - host = try_get(data, lambda x: x['config']['data']['hlsStreamHost'], compat_str) - model_id = try_get(data, lambda x: x['viewCam']['model']['id'], int) - - formats = self._extract_m3u8_formats( - 'https://b-%s.%s/hls/%d/%d.m3u8' % (server, host, model_id, model_id), - video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True) - self._sort_formats(formats) + server = traverse_obj(data, ('viewCam', 'viewServers', 'flashphoner-hls'), expected_type=str) + model_id = traverse_obj(data, ('viewCam', 'model', 'id'), expected_type=int) + + formats = [] + for host in traverse_obj(data, ( + 'config', 'data', (('featuresV2', 'hlsFallback', 'fallbackDomains', ...), 'hlsStreamHost'))): + formats = self._extract_m3u8_formats( + f'https://b-{server}.{host}/hls/{model_id}/{model_id}.m3u8', + video_id, ext='mp4', m3u8_id='hls', fatal=False, live=True) + if formats: + break + if not formats: + self.raise_no_formats('No active streams found', expected=True) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/stv.py b/hypervideo_dl/extractor/stv.py index ba5661d..c879fb5 100644 --- a/hypervideo_dl/extractor/stv.py +++ b/hypervideo_dl/extractor/stv.py @@ -1,10 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - compat_str, float_or_none, int_or_none, smuggle_url, diff --git a/hypervideo_dl/extractor/substack.py b/hypervideo_dl/extractor/substack.py new file mode 100644 index 0000000..fa38263 --- /dev/null +++ b/hypervideo_dl/extractor/substack.py @@ -0,0 +1,100 @@ +import re +import urllib.parse + +from .common import InfoExtractor +from ..utils import str_or_none, traverse_obj + + +class SubstackIE(InfoExtractor): + _VALID_URL = r'https?://(?P<username>[\w-]+)\.substack\.com/p/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 
'https://haleynahman.substack.com/p/i-made-a-vlog?s=r', + 'md5': 'f27e4fc6252001d48d479f45e65cdfd5', + 'info_dict': { + 'id': '47660949', + 'ext': 'mp4', + 'title': 'I MADE A VLOG', + 'description': 'md5:10c01ff93439a62e70ce963b2aa0b7f6', + 'thumbnail': 'md5:bec758a34d8ee9142d43bcebdf33af18', + 'uploader': 'Maybe Baby', + 'uploader_id': '33628', + } + }, { + 'url': 'https://haleynahman.substack.com/p/-dear-danny-i-found-my-boyfriends?s=r', + 'md5': '0a63eacec877a1171a62cfa69710fcea', + 'info_dict': { + 'id': '51045592', + 'ext': 'mpga', + 'title': "🎧 Dear Danny: I found my boyfriend's secret Twitter account", + 'description': 'md5:a57f2439319e56e0af92dd0c95d75797', + 'thumbnail': 'md5:daa40b6b79249417c14ff8103db29639', + 'uploader': 'Maybe Baby', + 'uploader_id': '33628', + } + }, { + 'url': 'https://andrewzimmern.substack.com/p/mussels-with-black-bean-sauce-recipe', + 'md5': 'fd3c07077b02444ff0130715b5f632bb', + 'info_dict': { + 'id': '47368578', + 'ext': 'mp4', + 'title': 'Mussels with Black Bean Sauce: Recipe of the Week #7', + 'description': 'md5:b96234a2906c7d854d5229818d889515', + 'thumbnail': 'md5:e30bfaa9da40e82aa62354263a9dd232', + 'uploader': "Andrew Zimmern's Spilled Milk ", + 'uploader_id': '577659', + } + }] + + @classmethod + def _extract_embed_urls(cls, url, webpage): + if not re.search(r'<script[^>]+src=["\']https://substackcdn.com/[^"\']+\.js', webpage): + return + + mobj = re.search(r'{[^}]*["\']subdomain["\']\s*:\s*["\'](?P<subdomain>[^"]+)', webpage) + if mobj: + parsed = urllib.parse.urlparse(url) + yield parsed._replace(netloc=f'{mobj.group("subdomain")}.substack.com').geturl() + raise cls.StopExtraction() + + def _extract_video_formats(self, video_id, username): + formats, subtitles = [], {} + for video_format in ('hls', 'mp4'): + video_url = f'https://{username}.substack.com/api/v1/video/upload/{video_id}/src?type={video_format}' + + if video_format == 'hls': + fmts, subs = self._extract_m3u8_formats_and_subtitles(video_url, video_id, 'mp4', fatal=False) + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': video_url, + 'ext': video_format, + }) + + return formats, subtitles + + def _real_extract(self, url): + display_id, username = self._match_valid_url(url).group('id', 'username') + webpage = self._download_webpage(url, display_id) + + webpage_info = self._search_json(r'<script[^>]*>\s*window\._preloads\s*=', webpage, 'preloads', display_id) + + post_type = webpage_info['post']['type'] + formats, subtitles = [], {} + if post_type == 'podcast': + formats, subtitles = [{'url': webpage_info['post']['podcast_url']}], {} + elif post_type == 'video': + formats, subtitles = self._extract_video_formats(webpage_info['post']['videoUpload']['id'], username) + else: + self.raise_no_formats(f'Page type "{post_type}" is not supported') + + return { + 'id': str(webpage_info['post']['id']), + 'formats': formats, + 'subtitles': subtitles, + 'title': traverse_obj(webpage_info, ('post', 'title')), + 'description': traverse_obj(webpage_info, ('post', 'description')), + 'thumbnail': traverse_obj(webpage_info, ('post', 'cover_image')), + 'uploader': traverse_obj(webpage_info, ('pub', 'name')), + 'uploader_id': str_or_none(traverse_obj(webpage_info, ('post', 'publication_id'))), + } diff --git a/hypervideo_dl/extractor/sunporno.py b/hypervideo_dl/extractor/sunporno.py index 59b77bf..708873a 100644 --- a/hypervideo_dl/extractor/sunporno.py +++ b/hypervideo_dl/extractor/sunporno.py @@ -1,5 +1,3 @@ -from __future__ import 
unicode_literals
-
 import re
 
 from .common import InfoExtractor
@@ -63,7 +61,6 @@ class SunPornoIE(InfoExtractor):
                 'format_id': video_ext,
                 'quality': quality(video_ext),
             })
-        self._sort_formats(formats)
 
         return {
             'id': video_id,
diff --git a/hypervideo_dl/extractor/sverigesradio.py b/hypervideo_dl/extractor/sverigesradio.py
index aa0691f..65da615 100644
--- a/hypervideo_dl/extractor/sverigesradio.py
+++ b/hypervideo_dl/extractor/sverigesradio.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 from .common import InfoExtractor
 from ..utils import (
     determine_ext,
@@ -61,7 +58,6 @@ class SverigesRadioBaseIE(InfoExtractor):
                 'vcodec': 'none',
                 'url': audio_url,
             })
-        self._sort_formats(formats)
 
         return {
             'id': audio_id,
diff --git a/hypervideo_dl/extractor/svt.py b/hypervideo_dl/extractor/svt.py
index 8ca62e3..31bf7f9 100644
--- a/hypervideo_dl/extractor/svt.py
+++ b/hypervideo_dl/extractor/svt.py
@@ -1,6 +1,3 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
 import re
 
 from .common import InfoExtractor
@@ -54,7 +51,6 @@ class SVTBaseIE(InfoExtractor):
                 self.raise_geo_restricted(
                     'This video is only available in Sweden',
                     countries=self._GEO_COUNTRIES, metadata_available=True)
-        self._sort_formats(formats)
 
         subtitle_references = dict_get(video_info, ('subtitles', 'subtitleReferences'))
         if isinstance(subtitle_references, list):
@@ -104,6 +100,7 @@ class SVTBaseIE(InfoExtractor):
 
 class SVTIE(SVTBaseIE):
     _VALID_URL = r'https?://(?:www\.)?svt\.se/wd\?(?:.*?&)?widgetId=(?P<widget_id>\d+)&.*?\barticleId=(?P<id>\d+)'
+    _EMBED_REGEX = [r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % _VALID_URL]
     _TEST = {
         'url': 'http://www.svt.se/wd?widgetId=23991&sectionId=541&articleId=2900353&type=embed&contextSectionId=123&autostart=false',
         'md5': '33e9a5d8f646523ce0868ecfb0eed77d',
@@ -116,13 +113,6 @@ class SVTIE(SVTBaseIE):
         },
     }
 
-    @staticmethod
-    def _extract_url(webpage):
-        mobj = re.search(
-            r'(?:<iframe src|href)="(?P<url>%s[^"]*)"' % SVTIE._VALID_URL, webpage)
-        if mobj:
-            return mobj.group('url')
-
     def _real_extract(self, url):
         mobj = self._match_valid_url(url)
         widget_id = mobj.group('widget_id')
diff --git a/hypervideo_dl/extractor/swearnet.py b/hypervideo_dl/extractor/swearnet.py
new file mode 100644
index 0000000..6e216a2
--- /dev/null
+++ b/hypervideo_dl/extractor/swearnet.py
@@ -0,0 +1,73 @@
+from .common import InfoExtractor
+from ..utils import int_or_none, traverse_obj
+
+
+class SwearnetEpisodeIE(InfoExtractor):
+    _VALID_URL = r'https?://www\.swearnet\.com/shows/(?P<id>[\w-]+)/seasons/(?P<season_num>\d+)/episodes/(?P<episode_num>\d+)'
+    _TESTS = [{
+        'url': 'https://www.swearnet.com/shows/gettin-learnt-with-ricky/seasons/1/episodes/1',
+        'info_dict': {
+            'id': '232819',
+            'ext': 'mp4',
+            'episode_number': 1,
+            'episode': 'Episode 1',
+            'duration': 719,
+            'description': 'md5:c48ef71440ce466284c07085cd7bd761',
+            'season': 'Season 1',
+            'title': 'Episode 1 - Grilled Cheese Sammich',
+            'season_number': 1,
+            'thumbnail': 'https://cdn.vidyard.com/thumbnails/232819/_RX04IKIq60a2V6rIRqq_Q_small.jpg',
+        }
+    }]
+
+    def _get_formats_and_subtitle(self, video_source, video_id):
+        video_source = video_source or {}
+        formats, subtitles = [], {}
+        for key, value in video_source.items():
+            if key == 'hls':
+                for video_hls in value:
+                    fmts, subs = self._extract_m3u8_formats_and_subtitles(video_hls.get('url'), video_id)
+                    formats.extend(fmts)
+                    self._merge_subtitles(subs, target=subtitles)
+            else:
+                formats.extend({
+                    'url': video_mp4.get('url'),
+                    'ext': 'mp4'
+                } for video_mp4 in value)
+
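+        # NOTE (editorial, not part of the patch): the shape assumed for
+        # 'video_source' here is the one the Vidyard player JSON appears to
+        # return, e.g. {'hls': [{'url': ...}], 'mp4': [{'url': ...}]}; every
+        # key other than 'hls' is treated as a list of direct MP4 sources.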
+ return formats, subtitles + + def _get_direct_subtitle(self, caption_json): + subs = {} + for caption in caption_json: + subs.setdefault(caption.get('language') or 'und', []).append({ + 'url': caption.get('vttUrl'), + 'name': caption.get('name') + }) + + return subs + + def _real_extract(self, url): + display_id, season_number, episode_number = self._match_valid_url(url).group('id', 'season_num', 'episode_num') + webpage = self._download_webpage(url, display_id) + + external_id = self._search_regex(r'externalid\s*=\s*"([^"]+)', webpage, 'externalid') + json_data = self._download_json( + f'https://play.vidyard.com/player/{external_id}.json', display_id)['payload']['chapters'][0] + + formats, subtitles = self._get_formats_and_subtitle(json_data['sources'], display_id) + self._merge_subtitles(self._get_direct_subtitle(json_data.get('captions')), target=subtitles) + + return { + 'id': str(json_data['videoId']), + 'title': json_data.get('name') or self._html_search_meta(['og:title', 'twitter:title'], webpage), + 'description': (json_data.get('description') + or self._html_search_meta(['og:description', 'twitter:description'], webpage)), + 'duration': int_or_none(json_data.get('seconds')), + 'formats': formats, + 'subtitles': subtitles, + 'season_number': int_or_none(season_number), + 'episode_number': int_or_none(episode_number), + 'thumbnails': [{'url': thumbnail_url} + for thumbnail_url in traverse_obj(json_data, ('thumbnailUrls', ...))] + } diff --git a/hypervideo_dl/extractor/swrmediathek.py b/hypervideo_dl/extractor/swrmediathek.py index 0f61597..38bdfce 100644 --- a/hypervideo_dl/extractor/swrmediathek.py +++ b/hypervideo_dl/extractor/swrmediathek.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_duration, @@ -95,7 +92,6 @@ class SWRMediathekIE(InfoExtractor): 'vcodec': codec if media_type == 'Video' else 'none', 'acodec': codec if media_type == 'Audio' else None, }) - self._sort_formats(formats) upload_date = None entry_pdatet = attr.get('entry_pdatet') diff --git a/hypervideo_dl/extractor/syfy.py b/hypervideo_dl/extractor/syfy.py index def7e5a..c79d27a 100644 --- a/hypervideo_dl/extractor/syfy.py +++ b/hypervideo_dl/extractor/syfy.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .adobepass import AdobePassIE from ..utils import ( update_url_query, diff --git a/hypervideo_dl/extractor/syvdk.py b/hypervideo_dl/extractor/syvdk.py new file mode 100644 index 0000000..287fb26 --- /dev/null +++ b/hypervideo_dl/extractor/syvdk.py @@ -0,0 +1,33 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class SYVDKIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?24syv\.dk/episode/(?P<id>[\w-]+)' + + _TESTS = [{ + 'url': 'https://24syv.dk/episode/isabella-arendt-stiller-op-for-de-konservative-2', + 'md5': '429ce5a423dd4b1e1d0bf3a569558089', + 'info_dict': { + 'id': '12215', + 'display_id': 'isabella-arendt-stiller-op-for-de-konservative-2', + 'ext': 'mp3', + 'title': 'Isabella Arendt stiller op for De Konservative', + 'description': 'md5:f5fa6a431813bf37284f3412ad7c6c06' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + info_data = self._search_nextjs_data(webpage, video_id)['props']['pageProps']['episodeDetails'][0] + + return { + 'id': str(info_data['id']), + 'vcodec': 'none', + 'ext': 'mp3', + 'url': info_data['details']['enclosure'], + 'display_id': video_id, + 'title': 
traverse_obj(info_data, ('title', 'rendered')), + 'description': traverse_obj(info_data, ('details', 'post_title')), + } diff --git a/hypervideo_dl/extractor/sztvhu.py b/hypervideo_dl/extractor/sztvhu.py index cfad331..1cbc2a3 100644 --- a/hypervideo_dl/extractor/sztvhu.py +++ b/hypervideo_dl/extractor/sztvhu.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/tagesschau.py b/hypervideo_dl/extractor/tagesschau.py index 6e03d0a..ea0532c 100644 --- a/hypervideo_dl/extractor/tagesschau.py +++ b/hypervideo_dl/extractor/tagesschau.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -142,8 +139,6 @@ class TagesschauIE(InfoExtractor): timestamp = video_info.get('timestamp') title = title or video_info.get('description') - self._sort_formats(formats) - return { 'id': display_id, 'title': title, diff --git a/hypervideo_dl/extractor/tass.py b/hypervideo_dl/extractor/tass.py index 6d336da..67e544a 100644 --- a/hypervideo_dl/extractor/tass.py +++ b/hypervideo_dl/extractor/tass.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -51,7 +48,6 @@ class TassIE(InfoExtractor): 'format_id': label, 'quality': quality(label), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/tastytrade.py b/hypervideo_dl/extractor/tastytrade.py deleted file mode 100644 index 7fe96bd..0000000 --- a/hypervideo_dl/extractor/tastytrade.py +++ /dev/null @@ -1,43 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor -from .ooyala import OoyalaIE - - -class TastyTradeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tastytrade\.com/tt/shows/[^/]+/episodes/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://www.tastytrade.com/tt/shows/market-measures/episodes/correlation-in-short-volatility-06-28-2017', - 'info_dict': { - 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM', - 'ext': 'mp4', - 'title': 'A History of Teaming', - 'description': 'md5:2a9033db8da81f2edffa4c99888140b3', - 'duration': 422.255, - }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['Ooyala'], - }, { - 'url': 'https://www.tastytrade.com/tt/shows/daily-dose/episodes/daily-dose-06-30-2017', - 'only_matching': True, - }] - - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - - ooyala_code = self._search_regex( - r'data-media-id=(["\'])(?P<code>(?:(?!\1).)+)\1', - webpage, 'ooyala code', group='code') - - info = self._search_json_ld(webpage, display_id, fatal=False) - info.update({ - '_type': 'url_transparent', - 'ie_key': OoyalaIE.ie_key(), - 'url': 'ooyala:%s' % ooyala_code, - 'display_id': display_id, - }) - return info diff --git a/hypervideo_dl/extractor/tbs.py b/hypervideo_dl/extractor/tbs.py index c7d62ff..808c6c7 100644 --- a/hypervideo_dl/extractor/tbs.py +++ b/hypervideo_dl/extractor/tbs.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .turner import TurnerBaseIE diff --git a/hypervideo_dl/extractor/tdslifeway.py b/hypervideo_dl/extractor/tdslifeway.py index 101c6ee..3623a68 100644 --- a/hypervideo_dl/extractor/tdslifeway.py +++ b/hypervideo_dl/extractor/tdslifeway.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/teachable.py 
b/hypervideo_dl/extractor/teachable.py index 232eaa5..c212a49 100644 --- a/hypervideo_dl/extractor/teachable.py +++ b/hypervideo_dl/extractor/teachable.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -142,12 +140,12 @@ class TeachableIE(TeachableBaseIE): r'<link[^>]+href=["\']https?://(?:process\.fs|assets)\.teachablecdn\.com', webpage) - @staticmethod - def _extract_url(webpage, source_url): - if not TeachableIE._is_teachable(webpage): - return - if re.match(r'https?://[^/]+/(?:courses|p)', source_url): - return '%s%s' % (TeachableBaseIE._URL_PREFIX, source_url) + @classmethod + def _extract_embed_urls(cls, url, webpage): + if cls._is_teachable(webpage): + if re.match(r'https?://[^/]+/(?:courses|p)', url): + yield f'{cls._URL_PREFIX}{url}' + raise cls.StopExtraction() def _real_extract(self, url): mobj = self._match_valid_url(url) @@ -162,7 +160,7 @@ class TeachableIE(TeachableBaseIE): webpage = self._download_webpage(url, video_id) - wistia_urls = WistiaIE._extract_urls(webpage) + wistia_urls = WistiaIE._extract_embed_urls(url, webpage) if not wistia_urls: if any(re.search(p, webpage) for p in ( r'class=["\']lecture-contents-locked', diff --git a/hypervideo_dl/extractor/teachertube.py b/hypervideo_dl/extractor/teachertube.py index e22f011..c3eec27 100644 --- a/hypervideo_dl/extractor/teachertube.py +++ b/hypervideo_dl/extractor/teachertube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -76,8 +73,6 @@ class TeacherTubeIE(InfoExtractor): } for media_url in set(media_urls) ] - self._sort_formats(formats) - thumbnail = self._og_search_thumbnail( webpage, default=None) or self._html_search_meta( 'thumbnail', webpage) diff --git a/hypervideo_dl/extractor/teachingchannel.py b/hypervideo_dl/extractor/teachingchannel.py index 624cdb3..275f6d1 100644 --- a/hypervideo_dl/extractor/teachingchannel.py +++ b/hypervideo_dl/extractor/teachingchannel.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/teamcoco.py b/hypervideo_dl/extractor/teamcoco.py index 5793b71..a822b67 100644 --- a/hypervideo_dl/extractor/teamcoco.py +++ b/hypervideo_dl/extractor/teamcoco.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .turner import TurnerBaseIE @@ -199,7 +196,6 @@ class TeamcocoIE(TurnerBaseIE): 'format_id': format_id, 'quality': get_quality(format_id), }) - self._sort_formats(formats) info['formats'] = formats return info diff --git a/hypervideo_dl/extractor/teamtreehouse.py b/hypervideo_dl/extractor/teamtreehouse.py index 64522ec..dd802db 100644 --- a/hypervideo_dl/extractor/teamtreehouse.py +++ b/hypervideo_dl/extractor/teamtreehouse.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/techtalks.py b/hypervideo_dl/extractor/techtalks.py index 78f0731..d37de36 100644 --- a/hypervideo_dl/extractor/techtalks.py +++ b/hypervideo_dl/extractor/techtalks.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/ted.py b/hypervideo_dl/extractor/ted.py index b5c7e35..c28a154 100644 --- a/hypervideo_dl/extractor/ted.py +++ b/hypervideo_dl/extractor/ted.py @@ -125,8 +125,6 @@ class TedTalkIE(TedBaseIE): ext_url = external.get('code') if service.lower() == 'youtube' else 
None return self.url_result(ext_url or external['uri']) - self._sort_formats(formats) - thumbnail = playerData.get('thumb') or self._og_search_property('image', webpage) if thumbnail: # trim thumbnail resize parameters @@ -215,6 +213,7 @@ class TedPlaylistIE(TedBaseIE): class TedEmbedIE(InfoExtractor): _VALID_URL = r'https?://embed(?:-ssl)?\.ted\.com/' + _EMBED_REGEX = [rf'<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL}.+?)\1'] _TESTS = [{ 'url': 'https://embed.ted.com/talks/janet_stovall_how_to_get_serious_about_diversity_and_inclusion_in_the_workplace', @@ -233,10 +232,5 @@ class TedEmbedIE(InfoExtractor): }, }] - @classmethod - def _extract_urls(cls, webpage): - return [mobj.group('url') for mobj in re.finditer( - fr'<iframe[^>]+?src=(["\'])(?P<url>{cls._VALID_URL}.+?)\1', webpage)] - def _real_extract(self, url): return self.url_result(re.sub(r'://embed(-ssl)?', '://www', url), TedTalkIE.ie_key()) diff --git a/hypervideo_dl/extractor/tele13.py b/hypervideo_dl/extractor/tele13.py index f8a2755..212af37 100644 --- a/hypervideo_dl/extractor/tele13.py +++ b/hypervideo_dl/extractor/tele13.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .youtube import YoutubeIE from ..utils import ( @@ -74,7 +71,6 @@ class Tele13IE(InfoExtractor): 'ext': ext, }) urls.append(format_url) - self._sort_formats(formats) return { 'id': display_id, diff --git a/hypervideo_dl/extractor/tele5.py b/hypervideo_dl/extractor/tele5.py index c7beee1..9260db2 100644 --- a/hypervideo_dl/extractor/tele5.py +++ b/hypervideo_dl/extractor/tele5.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .dplay import DPlayIE from ..compat import compat_urlparse from ..utils import ( @@ -9,7 +6,7 @@ from ..utils import ( ) -class Tele5IE(DPlayIE): +class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)' _GEO_COUNTRIES = ['DE'] _TESTS = [{ diff --git a/hypervideo_dl/extractor/telebruxelles.py b/hypervideo_dl/extractor/telebruxelles.py index 9e8c89b..2c50a67 100644 --- a/hypervideo_dl/extractor/telebruxelles.py +++ b/hypervideo_dl/extractor/telebruxelles.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -62,7 +59,6 @@ class TeleBruxellesIE(InfoExtractor): rtmp_url = re.sub(r'^rmtp', 'rtmp', rtmp_url) rtmp_url = re.sub(r'"\s*\+\s*"', '', rtmp_url) formats = self._extract_wowza_formats(rtmp_url, article_id or display_id) - self._sort_formats(formats) is_live = 'stream/live' in rtmp_url diff --git a/hypervideo_dl/extractor/telecinco.py b/hypervideo_dl/extractor/telecinco.py index eecd6a5..20bb824 100644 --- a/hypervideo_dl/extractor/telecinco.py +++ b/hypervideo_dl/extractor/telecinco.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -105,7 +102,6 @@ class TelecincoIE(InfoExtractor): }).encode(), headers=headers)['tokens']['1']['cdn'] formats = self._extract_m3u8_formats( stream + '?' 
+ cdn, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/telegraaf.py b/hypervideo_dl/extractor/telegraaf.py index 2dc0205..13e9515 100644 --- a/hypervideo_dl/extractor/telegraaf.py +++ b/hypervideo_dl/extractor/telegraaf.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -34,7 +31,9 @@ class TelegraafIE(InfoExtractor): article_id = self._match_id(url) video_id = self._download_json( - 'https://www.telegraaf.nl/graphql', article_id, query={ + 'https://app.telegraaf.nl/graphql', article_id, + headers={'User-Agent': 'De Telegraaf/6.8.11 (Android 11; en_US)'}, + query={ 'query': '''{ article(uid: %s) { videos { @@ -76,8 +75,6 @@ class TelegraafIE(InfoExtractor): 'format_id': 'http' + ('-%s' % label if label else ''), }) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/telegram.py b/hypervideo_dl/extractor/telegram.py index 2dfa261..5ec5485 100644 --- a/hypervideo_dl/extractor/telegram.py +++ b/hypervideo_dl/extractor/telegram.py @@ -1,37 +1,136 @@ +import re + from .common import InfoExtractor +from ..utils import ( + clean_html, + format_field, + get_element_by_class, + parse_duration, + parse_qs, + traverse_obj, + unified_timestamp, + update_url_query, + url_basename, +) class TelegramEmbedIE(InfoExtractor): IE_NAME = 'telegram:embed' - _VALID_URL = r'https?://t\.me/(?P<channel_name>[^/]+)/(?P<id>\d+)' + _VALID_URL = r'https?://t\.me/(?P<channel_id>[^/]+)/(?P<id>\d+)' _TESTS = [{ 'url': 'https://t.me/europa_press/613', + 'md5': 'dd707708aea958c11a590e8068825f22', 'info_dict': { 'id': '613', 'ext': 'mp4', - 'title': 'Europa Press', - 'description': '6ce2d7e8d56eda16d80607b23db7b252', - 'thumbnail': r're:^https?:\/\/cdn.*?telesco\.pe\/file\/\w+', + 'title': 'md5:6ce2d7e8d56eda16d80607b23db7b252', + 'description': 'md5:6ce2d7e8d56eda16d80607b23db7b252', + 'channel_id': 'europa_press', + 'channel': 'Europa Press ✔', + 'thumbnail': r're:^https?://.+', + 'timestamp': 1635631203, + 'upload_date': '20211030', + 'duration': 61, + }, + }, { + # 2-video post + 'url': 'https://t.me/vorposte/29342', + 'info_dict': { + 'id': 'vorposte-29342', + 'title': 'Форпост 29342', + 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc', + }, + 'playlist_count': 2, + 'params': { + 'skip_download': True, + }, + }, { + # 2-video post with --no-playlist + 'url': 'https://t.me/vorposte/29343', + 'md5': '1724e96053c18e788c8464038876e245', + 'info_dict': { + 'id': '29343', + 'ext': 'mp4', + 'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc', + 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc', + 'channel_id': 'vorposte', + 'channel': 'Форпост', + 'thumbnail': r're:^https?://.+', + 'timestamp': 1666384480, + 'upload_date': '20221021', + 'duration': 35, + }, + 'params': { + 'noplaylist': True, + } + }, { + # 2-video post with 'single' query param + 'url': 'https://t.me/vorposte/29342?single', + 'md5': 'd20b202f1e41400a9f43201428add18f', + 'info_dict': { + 'id': '29342', + 'ext': 'mp4', + 'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc', + 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc', + 'channel_id': 'vorposte', + 'channel': 'Форпост', + 'thumbnail': r're:^https?://.+', + 'timestamp': 1666384480, + 'upload_date': '20221021', + 'duration': 33, }, }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - 
webpage_embed = self._download_webpage(f'{url}?embed=1', video_id) + channel_id, msg_id = self._match_valid_url(url).group('channel_id', 'id') + embed = self._download_webpage( + url, msg_id, query={'embed': '1', 'single': []}, note='Downloading embed frame') - formats = [{ - 'url': self._proto_relative_url(self._search_regex( - '<video[^>]+src="([^"]+)"', webpage_embed, 'source')), - 'ext': 'mp4', - }] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True), - 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage, fatal=True), - 'thumbnail': self._search_regex(r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)', - webpage_embed, 'thumbnail'), - 'formats': formats, + def clean_text(html_class, html): + text = clean_html(get_element_by_class(html_class, html)) + return text.replace('\n', ' ') if text else None + + description = clean_text('tgme_widget_message_text', embed) + message = { + 'title': description or '', + 'description': description, + 'channel': clean_text('tgme_widget_message_author', embed), + 'channel_id': channel_id, + 'timestamp': unified_timestamp(self._search_regex( + r'<time[^>]*datetime="([^"]*)"', embed, 'timestamp', fatal=False)), } + + videos = [] + for video in re.findall(r'<a class="tgme_widget_message_video_player(?s:.+?)</time>', embed): + video_url = self._search_regex( + r'<video[^>]+src="([^"]+)"', video, 'video URL', fatal=False) + webpage_url = self._search_regex( + r'<a class="tgme_widget_message_video_player[^>]+href="([^"]+)"', + video, 'webpage URL', fatal=False) + if not video_url or not webpage_url: + continue + formats = [{ + 'url': video_url, + 'ext': 'mp4', + }] + videos.append({ + 'id': url_basename(webpage_url), + 'webpage_url': update_url_query(webpage_url, {'single': True}), + 'duration': parse_duration(self._search_regex( + r'<time[^>]+duration[^>]*>([\d:]+)</time>', video, 'duration', fatal=False)), + 'thumbnail': self._search_regex( + r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)', + video, 'thumbnail', fatal=False), + 'formats': formats, + **message, + }) + + playlist_id = None + if len(videos) > 1 and 'single' not in parse_qs(url, keep_blank_values=True): + playlist_id = f'{channel_id}-{msg_id}' + + if self._yes_playlist(playlist_id, msg_id): + return self.playlist_result( + videos, playlist_id, format_field(message, 'channel', f'%s {msg_id}'), description) + else: + return traverse_obj(videos, lambda _, x: x['id'] == msg_id, get_all=False) diff --git a/hypervideo_dl/extractor/telemb.py b/hypervideo_dl/extractor/telemb.py index ac2d603..3d29dac 100644 --- a/hypervideo_dl/extractor/telemb.py +++ b/hypervideo_dl/extractor/telemb.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -60,7 +57,6 @@ class TeleMBIE(InfoExtractor): 'preference': -10, }) formats.append(fmt) - self._sort_formats(formats) title = remove_start(self._og_search_title(webpage), 'TéléMB : ') description = self._html_search_regex( diff --git a/hypervideo_dl/extractor/telemundo.py b/hypervideo_dl/extractor/telemundo.py index ebcecf5..88f29cb 100644 --- a/hypervideo_dl/extractor/telemundo.py +++ b/hypervideo_dl/extractor/telemundo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( try_get, @@ -43,7 +40,6 @@ class TelemundoIE(InfoExtractor): 
redirect_url + '?format=redirect&manifest=m3u&format=redirect&Tracking=true&Embedded=true&formats=MPEG4'), video_id, 'Processing m3u8').geturl() formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4') - self._sort_formats(formats) date = unified_timestamp(try_get( metadata, lambda x: x['props']['initialState']['video']['associatedPlaylists'][0]['videos'][0]['datePublished'].split(' ', 1)[1])) return { diff --git a/hypervideo_dl/extractor/telequebec.py b/hypervideo_dl/extractor/telequebec.py index 4bef2fe..e891372 100644 --- a/hypervideo_dl/extractor/telequebec.py +++ b/hypervideo_dl/extractor/telequebec.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/teletask.py b/hypervideo_dl/extractor/teletask.py index b9e2ef8..a73dd68 100644 --- a/hypervideo_dl/extractor/teletask.py +++ b/hypervideo_dl/extractor/teletask.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/telewebion.py b/hypervideo_dl/extractor/telewebion.py index 1207b1a..550549f 100644 --- a/hypervideo_dl/extractor/telewebion.py +++ b/hypervideo_dl/extractor/telewebion.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/tempo.py b/hypervideo_dl/extractor/tempo.py new file mode 100644 index 0000000..1cfb956 --- /dev/null +++ b/hypervideo_dl/extractor/tempo.py @@ -0,0 +1,53 @@ +from .common import InfoExtractor +from ..utils import int_or_none, parse_iso8601, str_or_none, traverse_obj + + +class TempoIE(InfoExtractor): + _VALID_URL = r'https?://video\.tempo\.co/\w+/\d+/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://video.tempo.co/read/30058/anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki', + 'info_dict': { + 'id': '2144438', + 'ext': 'mp4', + 'title': 'Anies Baswedan Ajukan Banding Putusan PTUN Batalkan UMP DKI', + 'display_id': 'anies-baswedan-ajukan-banding-putusan-ptun-batalkan-ump-dki', + 'duration': 84, + 'description': 'md5:a6822b7c4c874fa7e5bd63e96a387b66', + 'thumbnail': 'https://statik.tempo.co/data/2022/07/27/id_1128287/1128287_720.jpg', + 'timestamp': 1658911277, + 'upload_date': '20220727', + 'tags': ['Anies Baswedan', ' PTUN', ' PTUN | Pengadilan Tata Usaha Negara', ' PTUN Batalkan UMP DKI', ' UMP DKI'], + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + player_key, widget_id = self._search_regex( + r'<ivs-player\s*[^>]+data-ivs-key\s*=\s*"(?P<player_key>[\w]+)[^>]+\bdata-ivs-wid="(?P<widget_id>[\w-]+)', + webpage, 'player_key, widget_id', group=('player_key', 'widget_id')) + + json_ld_data = self._search_json_ld(webpage, display_id) + + json_data = self._download_json( + f'https://ivxplayer.ivideosmart.com/prod/widget/{widget_id}', + display_id, query={'key': player_key}) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + json_data['player']['video_url'], display_id, ext='mp4') + + return { + 'id': str(json_data['ivx']['id']), + 'display_id': display_id, + 'formats': formats, + 'subtitles': subtitles, + 'title': (self._html_search_meta('twitter:title', webpage) or self._og_search_title(webpage) + or traverse_obj(json_data, ('ivx', 'name'))), + 'duration': int_or_none(traverse_obj(json_data, ('ivx', 'duration'))), + 'thumbnail': 
(self._html_search_meta('twitter:image:src', webpage) or self._og_search_thumbnail(webpage) + or traverse_obj(json_data, ('ivx', 'thumbnail_url'))), + 'description': (json_ld_data.get('description') or self._html_search_meta(['description', 'twitter:description'], webpage) + or self._og_search_description(webpage)), + 'timestamp': parse_iso8601(traverse_obj(json_data, ('ivx', 'created_at'))), + 'tags': str_or_none(self._html_search_meta('keywords', webpage), '').split(','), + } diff --git a/hypervideo_dl/extractor/tencent.py b/hypervideo_dl/extractor/tencent.py new file mode 100644 index 0000000..ff8bf99 --- /dev/null +++ b/hypervideo_dl/extractor/tencent.py @@ -0,0 +1,452 @@ +import functools +import random +import re +import string +import time + +from .common import InfoExtractor +from ..aes import aes_cbc_encrypt_bytes +from ..utils import ( + ExtractorError, + determine_ext, + int_or_none, + js_to_json, + traverse_obj, + urljoin, +) + + +class TencentBaseIE(InfoExtractor): + """Subclasses must set _API_URL, _APP_VERSION, _PLATFORM, _HOST, _REFERER""" + + def _get_ckey(self, video_id, url, guid): + ua = self.get_param('http_headers')['User-Agent'] + + payload = (f'{video_id}|{int(time.time())}|mg3c3b04ba|{self._APP_VERSION}|{guid}|' + f'{self._PLATFORM}|{url[:48]}|{ua.lower()[:48]}||Mozilla|Netscape|Windows x86_64|00|') + + return aes_cbc_encrypt_bytes( + bytes(f'|{sum(map(ord, payload))}|{payload}', 'utf-8'), + b'Ok\xda\xa3\x9e/\x8c\xb0\x7f^r-\x9e\xde\xf3\x14', + b'\x01PJ\xf3V\xe6\x19\xcf.B\xbb\xa6\x8c?p\xf9', + padding_mode='whitespace').hex().upper() + + def _get_video_api_response(self, video_url, video_id, series_id, subtitle_format, video_format, video_quality): + guid = ''.join([random.choice(string.digits + string.ascii_lowercase) for _ in range(16)]) + ckey = self._get_ckey(video_id, video_url, guid) + query = { + 'vid': video_id, + 'cid': series_id, + 'cKey': ckey, + 'encryptVer': '8.1', + 'spcaptiontype': '1' if subtitle_format == 'vtt' else '0', + 'sphls': '2' if video_format == 'hls' else '0', + 'dtype': '3' if video_format == 'hls' else '0', + 'defn': video_quality, + 'spsrt': '2', # Enable subtitles + 'sphttps': '1', # Enable HTTPS + 'otype': 'json', + 'spwm': '1', + # For SHD + 'host': self._HOST, + 'referer': self._REFERER, + 'ehost': video_url, + 'appVer': self._APP_VERSION, + 'platform': self._PLATFORM, + # For VQQ + 'guid': guid, + 'flowid': ''.join(random.choice(string.digits + string.ascii_lowercase) for _ in range(32)), + } + + return self._search_json(r'QZOutputJson=', self._download_webpage( + self._API_URL, video_id, query=query), 'api_response', video_id) + + def _extract_video_formats_and_subtitles(self, api_response, video_id): + video_response = api_response['vl']['vi'][0] + video_width, video_height = video_response.get('vw'), video_response.get('vh') + + formats, subtitles = [], {} + for video_format in video_response['ul']['ui']: + if video_format.get('hls') or determine_ext(video_format['url']) == 'm3u8': + fmts, subs = self._extract_m3u8_formats_and_subtitles( + video_format['url'] + traverse_obj(video_format, ('hls', 'pt'), default=''), + video_id, 'mp4', fatal=False) + for f in fmts: + f.update({'width': video_width, 'height': video_height}) + + formats.extend(fmts) + self._merge_subtitles(subs, target=subtitles) + else: + formats.append({ + 'url': f'{video_format["url"]}{video_response["fn"]}?vkey={video_response["fvkey"]}', + 'width': video_width, + 'height': video_height, + 'ext': 'mp4', + }) + + return formats, subtitles + + def 
_extract_video_native_subtitles(self, api_response, subtitles_format): + subtitles = {} + for subtitle in traverse_obj(api_response, ('sfl', 'fi')) or (): + subtitles.setdefault(subtitle['lang'].lower(), []).append({ + 'url': subtitle['url'], + 'ext': subtitles_format, + 'protocol': 'm3u8_native' if determine_ext(subtitle['url']) == 'm3u8' else 'http', + }) + + return subtitles + + def _extract_all_video_formats_and_subtitles(self, url, video_id, series_id): + formats, subtitles = [], {} + for video_format, subtitle_format, video_quality in ( + # '': 480p, 'shd': 720p, 'fhd': 1080p + ('mp4', 'srt', ''), ('hls', 'vtt', 'shd'), ('hls', 'vtt', 'fhd')): + api_response = self._get_video_api_response( + url, video_id, series_id, subtitle_format, video_format, video_quality) + + if api_response.get('em') != 0 and api_response.get('exem') != 0: + if '您所在区域暂无此内容版权' in api_response.get('msg'): + self.raise_geo_restricted() + raise ExtractorError(f'Tencent said: {api_response.get("msg")}') + + fmts, subs = self._extract_video_formats_and_subtitles(api_response, video_id) + native_subtitles = self._extract_video_native_subtitles(api_response, subtitle_format) + + formats.extend(fmts) + self._merge_subtitles(subs, native_subtitles, target=subtitles) + + return formats, subtitles + + def _get_clean_title(self, title): + return re.sub( + r'\s*[_\-]\s*(?:Watch online|腾讯视频|(?:高清)?1080P在线观看平台).*?$', + '', title or '').strip() or None + + +class VQQBaseIE(TencentBaseIE): + _VALID_URL_BASE = r'https?://v\.qq\.com' + + _API_URL = 'https://h5vv6.video.qq.com/getvinfo' + _APP_VERSION = '3.5.57' + _PLATFORM = '10901' + _HOST = 'v.qq.com' + _REFERER = 'v.qq.com' + + def _get_webpage_metadata(self, webpage, video_id): + return self._parse_json( + self._search_regex( + r'(?s)<script[^>]*>[^<]*window\.__pinia\s*=\s*([^<]+)</script>', + webpage, 'pinia data', fatal=False), + video_id, transform_source=js_to_json, fatal=False) + + +class VQQVideoIE(VQQBaseIE): + IE_NAME = 'vqq:video' + _VALID_URL = VQQBaseIE._VALID_URL_BASE + r'/x/(?:page|cover/(?P<series_id>\w+))/(?P<id>\w+)' + + _TESTS = [{ + 'url': 'https://v.qq.com/x/page/q326831cny0.html', + 'md5': '826ef93682df09e3deac4a6e6e8cdb6e', + 'info_dict': { + 'id': 'q326831cny0', + 'ext': 'mp4', + 'title': '我是选手:雷霆裂阵,终极时刻', + 'description': 'md5:e7ed70be89244017dac2a835a10aeb1e', + 'thumbnail': r're:^https?://[^?#]+q326831cny0', + }, + }, { + 'url': 'https://v.qq.com/x/page/o3013za7cse.html', + 'md5': 'b91cbbeada22ef8cc4b06df53e36fa21', + 'info_dict': { + 'id': 'o3013za7cse', + 'ext': 'mp4', + 'title': '欧阳娜娜VLOG', + 'description': 'md5:29fe847497a98e04a8c3826e499edd2e', + 'thumbnail': r're:^https?://[^?#]+o3013za7cse', + }, + }, { + 'url': 'https://v.qq.com/x/cover/7ce5noezvafma27/a00269ix3l8.html', + 'md5': '71459c5375c617c265a22f083facce67', + 'info_dict': { + 'id': 'a00269ix3l8', + 'ext': 'mp4', + 'title': '鸡毛飞上天 第01集', + 'description': 'md5:8cae3534327315b3872fbef5e51b5c5b', + 'thumbnail': r're:^https?://[^?#]+7ce5noezvafma27', + 'series': '鸡毛飞上天', + }, + }, { + 'url': 'https://v.qq.com/x/cover/mzc00200p29k31e/s0043cwsgj0.html', + 'md5': '96b9fd4a189fdd4078c111f21d7ac1bc', + 'info_dict': { + 'id': 's0043cwsgj0', + 'ext': 'mp4', + 'title': '第1集:如何快乐吃糖?', + 'description': 'md5:1d8c3a0b8729ae3827fa5b2d3ebd5213', + 'thumbnail': r're:^https?://[^?#]+s0043cwsgj0', + 'series': '青年理工工作者生活研究所', + }, + }, { + # Geo-restricted to China + 'url': 'https://v.qq.com/x/cover/mcv8hkc8zk8lnov/x0036x5qqsr.html', + 'only_matching': True, + }] + + def _real_extract(self, url): + 
video_id, series_id = self._match_valid_url(url).group('id', 'series_id') + webpage = self._download_webpage(url, video_id) + webpage_metadata = self._get_webpage_metadata(webpage, video_id) + + formats, subtitles = self._extract_all_video_formats_and_subtitles(url, video_id, series_id) + return { + 'id': video_id, + 'title': self._get_clean_title(self._og_search_title(webpage) + or traverse_obj(webpage_metadata, ('global', 'videoInfo', 'title'))), + 'description': (self._og_search_description(webpage) + or traverse_obj(webpage_metadata, ('global', 'videoInfo', 'desc'))), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': (self._og_search_thumbnail(webpage) + or traverse_obj(webpage_metadata, ('global', 'videoInfo', 'pic160x90'))), + 'series': traverse_obj(webpage_metadata, ('global', 'coverInfo', 'title')), + } + + +class VQQSeriesIE(VQQBaseIE): + IE_NAME = 'vqq:series' + _VALID_URL = VQQBaseIE._VALID_URL_BASE + r'/x/cover/(?P<id>\w+)\.html/?(?:[?#]|$)' + + _TESTS = [{ + 'url': 'https://v.qq.com/x/cover/7ce5noezvafma27.html', + 'info_dict': { + 'id': '7ce5noezvafma27', + 'title': '鸡毛飞上天', + 'description': 'md5:8cae3534327315b3872fbef5e51b5c5b', + }, + 'playlist_count': 55, + }, { + 'url': 'https://v.qq.com/x/cover/oshd7r0vy9sfq8e.html', + 'info_dict': { + 'id': 'oshd7r0vy9sfq8e', + 'title': '恋爱细胞2', + 'description': 'md5:9d8a2245679f71ca828534b0f95d2a03', + }, + 'playlist_count': 12, + }] + + def _real_extract(self, url): + series_id = self._match_id(url) + webpage = self._download_webpage(url, series_id) + webpage_metadata = self._get_webpage_metadata(webpage, series_id) + + episode_paths = [f'/x/cover/{series_id}/{video_id}.html' for video_id in re.findall( + r'<div[^>]+data-vid="(?P<video_id>[^"]+)"[^>]+class="[^"]+episode-item-rect--number', + webpage)] + + return self.playlist_from_matches( + episode_paths, series_id, ie=VQQVideoIE, getter=functools.partial(urljoin, url), + title=self._get_clean_title(traverse_obj(webpage_metadata, ('coverInfo', 'title')) + or self._og_search_title(webpage)), + description=(traverse_obj(webpage_metadata, ('coverInfo', 'description')) + or self._og_search_description(webpage))) + + +class WeTvBaseIE(TencentBaseIE): + _VALID_URL_BASE = r'https?://(?:www\.)?wetv\.vip/(?:[^?#]+/)?play' + + _API_URL = 'https://play.wetv.vip/getvinfo' + _APP_VERSION = '3.5.57' + _PLATFORM = '4830201' + _HOST = 'wetv.vip' + _REFERER = 'wetv.vip' + + def _get_webpage_metadata(self, webpage, video_id): + return self._parse_json( + traverse_obj(self._search_nextjs_data(webpage, video_id), ('props', 'pageProps', 'data')), + video_id, fatal=False) + + def _extract_episode(self, url): + video_id, series_id = self._match_valid_url(url).group('id', 'series_id') + webpage = self._download_webpage(url, video_id) + webpage_metadata = self._get_webpage_metadata(webpage, video_id) + + formats, subtitles = self._extract_all_video_formats_and_subtitles(url, video_id, series_id) + return { + 'id': video_id, + 'title': self._get_clean_title(self._og_search_title(webpage) + or traverse_obj(webpage_metadata, ('coverInfo', 'title'))), + 'description': (traverse_obj(webpage_metadata, ('coverInfo', 'description')) + or self._og_search_description(webpage)), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': int_or_none(traverse_obj(webpage_metadata, ('videoInfo', 'duration'))), + 'series': traverse_obj(webpage_metadata, ('coverInfo', 'title')), + 'episode_number': int_or_none(traverse_obj(webpage_metadata, ('videoInfo', 
'episode'))), + } + + def _extract_series(self, url, ie): + series_id = self._match_id(url) + webpage = self._download_webpage(url, series_id) + webpage_metadata = self._get_webpage_metadata(webpage, series_id) + + episode_paths = ([f'/play/{series_id}/{episode["vid"]}' for episode in webpage_metadata.get('videoList')] + or re.findall(r'<a[^>]+class="play-video__link"[^>]+href="(?P<path>[^"]+)', webpage)) + + return self.playlist_from_matches( + episode_paths, series_id, ie=ie, getter=functools.partial(urljoin, url), + title=self._get_clean_title(traverse_obj(webpage_metadata, ('coverInfo', 'title')) + or self._og_search_title(webpage)), + description=(traverse_obj(webpage_metadata, ('coverInfo', 'description')) + or self._og_search_description(webpage))) + + +class WeTvEpisodeIE(WeTvBaseIE): + IE_NAME = 'wetv:episode' + _VALID_URL = WeTvBaseIE._VALID_URL_BASE + r'/(?P<series_id>\w+)(?:-[^?#]+)?/(?P<id>\w+)(?:-[^?#]+)?' + + _TESTS = [{ + 'url': 'https://wetv.vip/en/play/air11ooo2rdsdi3-Cute-Programmer/v0040pr89t9-EP1-Cute-Programmer', + 'md5': '0c70fdfaa5011ab022eebc598e64bbbe', + 'info_dict': { + 'id': 'v0040pr89t9', + 'ext': 'mp4', + 'title': 'EP1: Cute Programmer', + 'description': 'md5:e87beab3bf9f392d6b9e541a63286343', + 'thumbnail': r're:^https?://[^?#]+air11ooo2rdsdi3', + 'series': 'Cute Programmer', + 'episode': 'Episode 1', + 'episode_number': 1, + 'duration': 2835, + }, + }, { + 'url': 'https://wetv.vip/en/play/u37kgfnfzs73kiu/p0039b9nvik', + 'md5': '3b3c15ca4b9a158d8d28d5aa9d7c0a49', + 'info_dict': { + 'id': 'p0039b9nvik', + 'ext': 'mp4', + 'title': 'EP1: You Are My Glory', + 'description': 'md5:831363a4c3b4d7615e1f3854be3a123b', + 'thumbnail': r're:^https?://[^?#]+u37kgfnfzs73kiu', + 'series': 'You Are My Glory', + 'episode': 'Episode 1', + 'episode_number': 1, + 'duration': 2454, + }, + }, { + 'url': 'https://wetv.vip/en/play/lcxgwod5hapghvw-WeTV-PICK-A-BOO/i0042y00lxp-Zhao-Lusi-Describes-The-First-Experiences-She-Had-In-Who-Rules-The-World-%7C-WeTV-PICK-A-BOO', + 'md5': '71133f5c2d5d6cad3427e1b010488280', + 'info_dict': { + 'id': 'i0042y00lxp', + 'ext': 'mp4', + 'title': 'md5:f7a0857dbe5fbbe2e7ad630b92b54e6a', + 'description': 'md5:76260cb9cdc0ef76826d7ca9d92fadfa', + 'thumbnail': r're:^https?://[^?#]+lcxgwod5hapghvw', + 'series': 'WeTV PICK-A-BOO', + 'episode': 'Episode 0', + 'episode_number': 0, + 'duration': 442, + }, + }] + + def _real_extract(self, url): + return self._extract_episode(url) + + +class WeTvSeriesIE(WeTvBaseIE): + _VALID_URL = WeTvBaseIE._VALID_URL_BASE + r'/(?P<id>\w+)(?:-[^/?#]+)?/?(?:[?#]|$)' + + _TESTS = [{ + 'url': 'https://wetv.vip/play/air11ooo2rdsdi3-Cute-Programmer', + 'info_dict': { + 'id': 'air11ooo2rdsdi3', + 'title': 'Cute Programmer', + 'description': 'md5:e87beab3bf9f392d6b9e541a63286343', + }, + 'playlist_count': 30, + }, { + 'url': 'https://wetv.vip/en/play/u37kgfnfzs73kiu-You-Are-My-Glory', + 'info_dict': { + 'id': 'u37kgfnfzs73kiu', + 'title': 'You Are My Glory', + 'description': 'md5:831363a4c3b4d7615e1f3854be3a123b', + }, + 'playlist_count': 32, + }] + + def _real_extract(self, url): + return self._extract_series(url, WeTvEpisodeIE) + + +class IflixBaseIE(WeTvBaseIE): + _VALID_URL_BASE = r'https?://(?:www\.)?iflix\.com/(?:[^?#]+/)?play' + + _API_URL = 'https://vplay.iflix.com/getvinfo' + _APP_VERSION = '3.5.57' + _PLATFORM = '330201' + _HOST = 'www.iflix.com' + _REFERER = 'www.iflix.com' + + +class IflixEpisodeIE(IflixBaseIE): + IE_NAME = 'iflix:episode' + _VALID_URL = IflixBaseIE._VALID_URL_BASE + 
r'/(?P<series_id>\w+)(?:-[^?#]+)?/(?P<id>\w+)(?:-[^?#]+)?' + + _TESTS = [{ + 'url': 'https://www.iflix.com/en/play/daijrxu03yypu0s/a0040kvgaza', + 'md5': '9740f9338c3a2105290d16b68fb3262f', + 'info_dict': { + 'id': 'a0040kvgaza', + 'ext': 'mp4', + 'title': 'EP1: Put Your Head On My Shoulder 2021', + 'description': 'md5:c095a742d3b7da6dfedd0c8170727a42', + 'thumbnail': r're:^https?://[^?#]+daijrxu03yypu0s', + 'series': 'Put Your Head On My Shoulder 2021', + 'episode': 'Episode 1', + 'episode_number': 1, + 'duration': 2639, + }, + }, { + 'url': 'https://www.iflix.com/en/play/fvvrcc3ra9lbtt1-Take-My-Brother-Away/i0029sd3gm1-EP1%EF%BC%9ATake-My-Brother-Away', + 'md5': '375c9b8478fdedca062274b2c2f53681', + 'info_dict': { + 'id': 'i0029sd3gm1', + 'ext': 'mp4', + 'title': 'EP1:Take My Brother Away', + 'description': 'md5:f0f7be1606af51cd94d5627de96b0c76', + 'thumbnail': r're:^https?://[^?#]+fvvrcc3ra9lbtt1', + 'series': 'Take My Brother Away', + 'episode': 'Episode 1', + 'episode_number': 1, + 'duration': 228, + }, + }] + + def _real_extract(self, url): + return self._extract_episode(url) + + +class IflixSeriesIE(IflixBaseIE): + _VALID_URL = IflixBaseIE._VALID_URL_BASE + r'/(?P<id>\w+)(?:-[^/?#]+)?/?(?:[?#]|$)' + + _TESTS = [{ + 'url': 'https://www.iflix.com/en/play/g21a6qk4u1s9x22-You-Are-My-Hero', + 'info_dict': { + 'id': 'g21a6qk4u1s9x22', + 'title': 'You Are My Hero', + 'description': 'md5:9c4d844bc0799cd3d2b5aed758a2050a', + }, + 'playlist_count': 40, + }, { + 'url': 'https://www.iflix.com/play/0s682hc45t0ohll', + 'info_dict': { + 'id': '0s682hc45t0ohll', + 'title': 'Miss Gu Who Is Silent', + 'description': 'md5:a9651d0236f25af06435e845fa2f8c78', + }, + 'playlist_count': 20, + }] + + def _real_extract(self, url): + return self._extract_series(url, IflixEpisodeIE) diff --git a/hypervideo_dl/extractor/tennistv.py b/hypervideo_dl/extractor/tennistv.py index 58fdece..bc64226 100644 --- a/hypervideo_dl/extractor/tennistv.py +++ b/hypervideo_dl/extractor/tennistv.py @@ -1,19 +1,17 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import json +import urllib.parse from .common import InfoExtractor - from ..utils import ( ExtractorError, + random_uuidv4, unified_timestamp, + urlencode_postdata, ) class TennisTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tennistv\.com/videos/(?P<id>[-a-z0-9]+)' - _TEST = { + _TESTS = [{ 'url': 'https://www.tennistv.com/videos/indian-wells-2018-verdasco-fritz', 'info_dict': { 'id': 'indian-wells-2018-verdasco-fritz', @@ -28,86 +26,130 @@ class TennisTVIE(InfoExtractor): 'skip_download': True, }, 'skip': 'Requires email and password of a subscribed account', - } + }, { + 'url': 'https://www.tennistv.com/videos/2650480/best-matches-of-2022-part-5', + 'info_dict': { + 'id': '2650480', + 'ext': 'mp4', + 'title': 'Best Matches of 2022 - Part 5', + 'description': 'md5:36dec3bfae7ed74bd79e48045b17264c', + 'thumbnail': 'https://open.http.mp.streamamg.com/p/3001482/sp/300148200/thumbnail/entry_id/0_myef18pd/version/100001/height/1920', + }, + 'params': {'skip_download': 'm3u8'}, + 'skip': 'Requires email and password of a subscribed account', + }] _NETRC_MACHINE = 'tennistv' - _session_token = None - - def _perform_login(self, username, password): - - login_form = { - 'Email': username, - 'Password': password, - } - login_json = json.dumps(login_form).encode('utf-8') - headers = { - 'content-type': 'application/json', - 'Referer': 'https://www.tennistv.com/login', - 'Origin': 'https://www.tennistv.com', - } - login_result = self._download_json( - 
'https://www.tennistv.com/api/users/v1/login', None, - note='Logging in', - errnote='Login failed (wrong password?)', - headers=headers, - data=login_json) + access_token, refresh_token = None, None + _PARTNER_ID = 3001482 + _FORMAT_URL = 'https://open.http.mp.streamamg.com/p/{partner}/sp/{partner}00/playManifest/entryId/{entry}/format/applehttp/protocol/https/a.m3u8?ks={session}' + _AUTH_BASE_URL = 'https://sso.tennistv.com/auth/realms/TennisTV/protocol/openid-connect' + _HEADERS = { + 'origin': 'https://www.tennistv.com', + 'referer': 'https://www.tennistv.com/', + 'content-Type': 'application/x-www-form-urlencoded' + } - if login_result['error']['errorCode']: - raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, login_result['error']['errorMessage'])) + def _perform_login(self, username, password): + login_page = self._download_webpage( + f'{self._AUTH_BASE_URL}/auth', None, 'Downloading login page', + query={ + 'client_id': 'tennis-tv-web', + 'redirect_uri': 'https://tennistv.com', + 'response_mode': 'fragment', + 'response_type': 'code', + 'scope': 'openid' + }) + + post_url = self._html_search_regex(r'action=["\']([^"\']+?)["\']\s+method=["\']post["\']', login_page, 'login POST url') + temp_page = self._download_webpage( + post_url, None, 'Sending login data', 'Unable to send login data', + headers=self._HEADERS, data=urlencode_postdata({ + 'username': username, + 'password': password, + 'submitAction': 'Log In' + })) + if 'Your username or password was incorrect' in temp_page: + raise ExtractorError('Your username or password was incorrect', expected=True) + + handle = self._request_webpage( + f'{self._AUTH_BASE_URL}/auth', None, 'Logging in', headers=self._HEADERS, + query={ + 'client_id': 'tennis-tv-web', + 'redirect_uri': 'https://www.tennistv.com/resources/v1.1.10/html/silent-check-sso.html', + 'state': random_uuidv4(), + 'response_mode': 'fragment', + 'response_type': 'code', + 'scope': 'openid', + 'nonce': random_uuidv4(), + 'prompt': 'none' + }) + + self.get_token(None, { + 'code': urllib.parse.parse_qs(handle.geturl())['code'][-1], + 'grant_type': 'authorization_code', + 'client_id': 'tennis-tv-web', + 'redirect_uri': 'https://www.tennistv.com/resources/v1.1.10/html/silent-check-sso.html' + }) + + def get_token(self, video_id, payload): + res = self._download_json( + f'{self._AUTH_BASE_URL}/token', video_id, 'Fetching tokens', + 'Unable to fetch tokens', headers=self._HEADERS, data=urlencode_postdata(payload)) + + self.access_token = res.get('access_token') or self.access_token + self.refresh_token = res.get('refresh_token') or self.refresh_token - if login_result['entitlement'] != 'SUBSCRIBED': - self.report_warning('%s may not be subscribed to %s.' 
% (username, self.IE_NAME)) + def _real_initialize(self): + if self.access_token and self.refresh_token: + return - self._session_token = login_result['sessionToken'] + cookies = self._get_cookies('https://www.tennistv.com/') + if not cookies.get('access_token') or not cookies.get('refresh_token'): + self.raise_login_required() + self.access_token, self.refresh_token = cookies['access_token'].value, cookies['refresh_token'].value - def _real_initialize(self): - if not self._session_token: - raise self.raise_login_required('Login info is needed for this website', method='password') + def _download_session_json(self, video_id, entryid,): + return self._download_json( + f'https://atppayments.streamamg.com/api/v1/session/ksession/?lang=en&apijwttoken={self.access_token}&entryId={entryid}', + video_id, 'Downloading ksession token', 'Failed to download ksession token', headers=self._HEADERS) def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - internal_id = self._search_regex(r'video=([\w-]+)', webpage, 'internal video id') + entryid = self._search_regex(r'data-entry-id=["\']([^"\']+)', webpage, 'entryID') + session_json = self._download_session_json(video_id, entryid) - headers = { - 'Origin': 'https://www.tennistv.com', - 'authorization': 'ATP %s' % self._session_token, - 'content-type': 'application/json', - 'Referer': url, - } - check_data = { - 'videoID': internal_id, - 'VideoUrlType': 'HLS', - } - check_json = json.dumps(check_data).encode('utf-8') - check_result = self._download_json( - 'https://www.tennistv.com/api/users/v1/entitlementchecknondiva', - video_id, note='Checking video authorization', headers=headers, data=check_json) - formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4') - self._sort_formats(formats) - - vdata = self._download_json( - 'https://www.tennistv.com/api/en/v2/none/common/video/%s' % video_id, - video_id, headers=headers) - - timestamp = unified_timestamp(vdata['timestamp']) - thumbnail = vdata['video']['thumbnailUrl'] - description = vdata['displayText']['description'] - title = vdata['video']['title'] - - series = vdata['tour'] - venue = vdata['displayText']['venue'] - round_str = vdata['seo']['round'] + k_session = session_json.get('KSession') + if k_session is None: + self.get_token(video_id, { + 'grant_type': 'refresh_token', + 'refresh_token': self.refresh_token, + 'client_id': 'tennis-tv-web' + }) + k_session = self._download_session_json(video_id, entryid).get('KSession') + if k_session is None: + raise ExtractorError('Failed to get KSession, possibly a premium video', expected=True) + + if session_json.get('ErrorMessage'): + self.report_warning(session_json['ErrorMessage']) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + self._FORMAT_URL.format(partner=self._PARTNER_ID, entry=entryid, session=k_session), video_id) return { 'id': video_id, - 'title': title, - 'description': description, + 'title': self._generic_title('', webpage), + 'description': self._html_search_regex( + (r'<span itemprop="description" content=["\']([^"\']+)["\']>', *self._og_regexes('description')), + webpage, 'description', fatal=False), + 'thumbnail': f'https://open.http.mp.streamamg.com/p/{self._PARTNER_ID}/sp/{self._PARTNER_ID}00/thumbnail/entry_id/{entryid}/version/100001/height/1920', + 'timestamp': unified_timestamp(self._html_search_regex( + r'<span itemprop="uploadDate" content=["\']([^"\']+)["\']>', webpage, 'upload time', fatal=False)), + 'series': 
self._html_search_regex(r'data-series\s*?=\s*?"(.*?)"', webpage, 'series', fatal=False) or None, + 'season': self._html_search_regex(r'data-tournament-city\s*?=\s*?"(.*?)"', webpage, 'season', fatal=False) or None, + 'episode': self._html_search_regex(r'data-round\s*?=\s*?"(.*?)"', webpage, 'round', fatal=False) or None, 'formats': formats, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'series': series, - 'season': venue, - 'episode': round_str, + 'subtitles': subtitles, } diff --git a/hypervideo_dl/extractor/tenplay.py b/hypervideo_dl/extractor/tenplay.py index 5c7b545..633032e 100644 --- a/hypervideo_dl/extractor/tenplay.py +++ b/hypervideo_dl/extractor/tenplay.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from datetime import datetime import base64 @@ -101,7 +98,6 @@ class TenPlayIE(InfoExtractor): if '10play-not-in-oz' in m3u8_url: self.raise_geo_restricted(countries=['AU']) formats = self._extract_m3u8_formats(m3u8_url, content_id, 'mp4') - self._sort_formats(formats) return { 'formats': formats, diff --git a/hypervideo_dl/extractor/testurl.py b/hypervideo_dl/extractor/testurl.py index 8bc512a..dccca10 100644 --- a/hypervideo_dl/extractor/testurl.py +++ b/hypervideo_dl/extractor/testurl.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -10,55 +8,38 @@ class TestURLIE(InfoExtractor): """ Allows addressing of the test cases as test:yout.*be_1 """ IE_DESC = False # Do not list - _VALID_URL = r'test(?:url)?:(?P<id>(?P<extractor>.+?)(?:_(?P<num>[0-9]+))?)$' + _VALID_URL = r'test(?:url)?:(?P<extractor>.*?)(?:_(?P<num>[0-9]+))?$' def _real_extract(self, url): - from ..extractor import gen_extractors + from . import gen_extractor_classes - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - extractor_id = mobj.group('extractor') - all_extractors = gen_extractors() + extractor_id, num = self._match_valid_url(url).group('extractor', 'num') + if not extractor_id: + return {'id': ':test', 'title': '', 'url': url} rex = re.compile(extractor_id, flags=re.IGNORECASE) - matching_extractors = [ - e for e in all_extractors if rex.search(e.IE_NAME)] + matching_extractors = [e for e in gen_extractor_classes() if rex.search(e.IE_NAME)] if len(matching_extractors) == 0: - raise ExtractorError( - 'No extractors matching %r found' % extractor_id, - expected=True) + raise ExtractorError(f'No extractors matching {extractor_id!r} found', expected=True) elif len(matching_extractors) > 1: - # Is it obvious which one to pick? 
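[Editor's aside] For context, TestURLIE implements a tiny pseudo-URL scheme: 'test:<IE_NAME regex>[_<test number>]' selects an extractor by matching its IE_NAME and then resolves to that extractor's numbered test case. Below is a minimal, self-contained sketch of that resolution logic; the extractor table, names and URLs are stand-ins for the real gen_extractor_classes() registry, not actual data.

import re

# Stand-in for gen_extractor_classes(); real classes carry _TESTS with 'url' entries.
_FAKE_EXTRACTORS = {
    'TedTalk': ['https://www.ted.com/talks/example_one', 'https://www.ted.com/talks/example_two'],
    'TedEmbed': ['https://embed.ted.com/talks/example'],
}

def resolve_test_url(url):
    extractor_id, num = re.match(
        r'test(?:url)?:(?P<extractor>.*?)(?:_(?P<num>[0-9]+))?$', url).group('extractor', 'num')
    rex = re.compile(extractor_id, flags=re.IGNORECASE)
    matching = [name for name in _FAKE_EXTRACTORS if rex.search(name)]
    if not matching:
        raise LookupError(f'No extractors matching {extractor_id!r} found')
    # Prefer an exact (case-insensitive) IE_NAME match when the regex is ambiguous
    exact = next((n for n in matching if n.lower() == extractor_id.lower()), None)
    return _FAKE_EXTRACTORS[exact or matching[0]][int(num or 0)]

assert resolve_test_url('test:tedtalk_1') == 'https://www.ted.com/talks/example_two'

(End of aside; the diff continues below.)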
- try: + try: # Check for exact match extractor = next( ie for ie in matching_extractors if ie.IE_NAME.lower() == extractor_id.lower()) except StopIteration: raise ExtractorError( - ('Found multiple matching extractors: %s' % - ' '.join(ie.IE_NAME for ie in matching_extractors)), + 'Found multiple matching extractors: %s' % ' '.join(ie.IE_NAME for ie in matching_extractors), expected=True) else: extractor = matching_extractors[0] - num_str = mobj.group('num') - num = int(num_str) if num_str else 0 - - testcases = [] - t = getattr(extractor, '_TEST', None) - if t: - testcases.append(t) - testcases.extend(getattr(extractor, '_TESTS', [])) - + testcases = tuple(extractor.get_testcases(True)) try: - tc = testcases[num] + tc = testcases[int(num or 0)] except IndexError: raise ExtractorError( - ('Test case %d not found, got only %d tests' % - (num, len(testcases))), - expected=True) - - self.to_screen('Test URL: %s' % tc['url']) + f'Test case {num or 0} not found, got only {len(testcases)} tests', expected=True) - return self.url_result(tc['url'], video_id=video_id) + self.to_screen(f'Test URL: {tc["url"]}') + return self.url_result(tc['url']) diff --git a/hypervideo_dl/extractor/tf1.py b/hypervideo_dl/extractor/tf1.py index 44785bc..4cf0322 100644 --- a/hypervideo_dl/extractor/tf1.py +++ b/hypervideo_dl/extractor/tf1.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/tfo.py b/hypervideo_dl/extractor/tfo.py index 0631cb7..a24789c 100644 --- a/hypervideo_dl/extractor/tfo.py +++ b/hypervideo_dl/extractor/tfo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/theholetv.py b/hypervideo_dl/extractor/theholetv.py new file mode 100644 index 0000000..a13f83b --- /dev/null +++ b/hypervideo_dl/extractor/theholetv.py @@ -0,0 +1,35 @@ +from .common import InfoExtractor +from ..utils import extract_attributes, remove_end + + +class TheHoleTvIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?the-hole\.tv/episodes/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://the-hole.tv/episodes/gromkii-vopros-sergey-orlov', + 'md5': 'fea6682f47786f3ae5a6cbd635ec4bf9', + 'info_dict': { + 'id': 'gromkii-vopros-sergey-orlov', + 'ext': 'mp4', + 'title': 'Сергей Орлов — Громкий вопрос', + 'thumbnail': 'https://assets-cdn.the-hole.tv/images/t8gan4n6zn627e7wni11b2uemqts', + 'description': 'md5:45741a9202331f995d9fb76996759379' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + player_attrs = extract_attributes(self._search_regex( + r'(<div[^>]*\bdata-controller="player"[^>]*>)', webpage, 'video player')) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + player_attrs['data-player-source-value'], video_id, 'mp4') + + return { + 'id': video_id, + 'title': remove_end(self._html_extract_title(webpage), ' — The Hole'), + 'description': self._og_search_description(webpage), + 'thumbnail': player_attrs.get('data-player-poster-value'), + 'formats': formats, + 'subtitles': subtitles + } diff --git a/hypervideo_dl/extractor/theintercept.py b/hypervideo_dl/extractor/theintercept.py index f23b587..a991a4d 100644 --- a/hypervideo_dl/extractor/theintercept.py +++ b/hypervideo_dl/extractor/theintercept.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat 
import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/theplatform.py b/hypervideo_dl/extractor/theplatform.py index c2729f1..e659b8e 100644 --- a/hypervideo_dl/extractor/theplatform.py +++ b/hypervideo_dl/extractor/theplatform.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import time import hmac @@ -126,6 +123,13 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)?|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? |theplatform:)(?P<id>[^/\?&]+)''' + _EMBED_REGEX = [ + r'''(?x) + <meta\s+ + property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+ + content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2''', + r'(?s)<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//player\.theplatform\.com/p/.+?)\1' + ] _TESTS = [{ # from http://www.metacafe.com/watch/cb-e9I_cZgTgIPd/blackberrys_big_bold_z30/ @@ -195,22 +199,11 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): }] @classmethod - def _extract_urls(cls, webpage): - m = re.search( - r'''(?x) - <meta\s+ - property=(["'])(?:og:video(?::(?:secure_)?url)?|twitter:player)\1\s+ - content=(["'])(?P<url>https?://player\.theplatform\.com/p/.+?)\2 - ''', webpage) - if m: - return [m.group('url')] - + def _extract_embed_urls(cls, url, webpage): # Are whitespaces ignored in URLs? # https://github.com/ytdl-org/youtube-dl/issues/12044 - matches = re.findall( - r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage) - if matches: - return [re.sub(r'\s', '', list(zip(*matches))[1][0])] + for embed_url in super()._extract_embed_urls(url, webpage): + yield re.sub(r'\s', '', embed_url) @staticmethod def _sign_url(url, sig_key, sig_secret, life=600, include_qs=False): @@ -303,7 +296,6 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): smil_url = self._sign_url(smil_url, sig['key'], sig['secret']) formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) - self._sort_formats(formats) ret = self._extract_theplatform_metadata(path, video_id) combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) @@ -373,8 +365,6 @@ class ThePlatformFeedIE(ThePlatformBaseIE): formats.extend(cur_formats) subtitles = self._merge_subtitles(subtitles, cur_subtitles) - self._sort_formats(formats) - thumbnails = [{ 'url': thumbnail['plfile$url'], 'width': int_or_none(thumbnail.get('plfile$width')), diff --git a/hypervideo_dl/extractor/thescene.py b/hypervideo_dl/extractor/thescene.py deleted file mode 100644 index cd64235..0000000 --- a/hypervideo_dl/extractor/thescene.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - -from ..compat import compat_urlparse - - -class TheSceneIE(InfoExtractor): - _VALID_URL = r'https?://thescene\.com/watch/[^/]+/(?P<id>[^/#?]+)' - - _TEST = { - 'url': 'https://thescene.com/watch/vogue/narciso-rodriguez-spring-2013-ready-to-wear', - 'info_dict': { - 'id': '520e8faac2b4c00e3c6e5f43', - 'ext': 'mp4', - 'title': 'Narciso Rodriguez: Spring 2013 Ready-to-Wear', - 'display_id': 'narciso-rodriguez-spring-2013-ready-to-wear', - 'duration': 127, - 'series': 'Style.com Fashion Shows', - 'season': 'Ready To Wear Spring 2013', - 'tags': list, - 'categories': list, - 'upload_date': '20120913', - 'timestamp': 1347512400, - 'uploader': 'vogue', - }, - } - - def _real_extract(self, url): - 
display_id = self._match_id(url) - - webpage = self._download_webpage(url, display_id) - - player_url = compat_urlparse.urljoin( - url, - self._html_search_regex( - r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) - - return { - '_type': 'url_transparent', - 'display_id': display_id, - 'url': player_url, - 'ie_key': 'CondeNast', - } diff --git a/hypervideo_dl/extractor/thestar.py b/hypervideo_dl/extractor/thestar.py index c3f1188..293c34c 100644 --- a/hypervideo_dl/extractor/thestar.py +++ b/hypervideo_dl/extractor/thestar.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/thesun.py b/hypervideo_dl/extractor/thesun.py index 15d4a69..ba58482 100644 --- a/hypervideo_dl/extractor/thesun.py +++ b/hypervideo_dl/extractor/thesun.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/theta.py b/hypervideo_dl/extractor/theta.py index 8b6d70a..ecf0ea0 100644 --- a/hypervideo_dl/extractor/theta.py +++ b/hypervideo_dl/extractor/theta.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import try_get @@ -44,7 +41,6 @@ class ThetaStreamIE(InfoExtractor): if data.get('type') != 'embed' and data.get('resolution') in ('master', 'source')) formats = self._extract_m3u8_formats(m3u8_playlist, channel_id, 'mp4', m3u8_id='hls', live=True) - self._sort_formats(formats) channel = try_get(info, lambda x: x['user']['username']) # using this field instead of channel_id due to capitalization @@ -81,7 +77,6 @@ class ThetaVideoIE(InfoExtractor): m3u8_playlist = try_get(info, lambda x: x['video_urls'][0]['url']) formats = self._extract_m3u8_formats(m3u8_playlist, video_id, 'mp4', m3u8_id='hls') - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/theweatherchannel.py b/hypervideo_dl/extractor/theweatherchannel.py index 9e506c9..682e433 100644 --- a/hypervideo_dl/extractor/theweatherchannel.py +++ b/hypervideo_dl/extractor/theweatherchannel.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .theplatform import ThePlatformIE @@ -11,7 +8,7 @@ from ..utils import ( ) -class TheWeatherChannelIE(ThePlatformIE): +class TheWeatherChannelIE(ThePlatformIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?weather\.com(?P<asset_name>(?:/(?P<locale>[a-z]{2}-[A-Z]{2}))?/(?:[^/]+/)*video/(?P<id>[^/?#]+))' _TESTS = [{ 'url': 'https://weather.com/series/great-outdoors/video/ice-climber-is-in-for-a-shock', @@ -82,7 +79,6 @@ class TheWeatherChannelIE(ThePlatformIE): 'url': variant_url, 'format_id': variant_id, }) - self._sort_formats(formats) cc_url = video_data.get('cc_url') diff --git a/hypervideo_dl/extractor/thisamericanlife.py b/hypervideo_dl/extractor/thisamericanlife.py index 91e45f2..9a3d798 100644 --- a/hypervideo_dl/extractor/thisamericanlife.py +++ b/hypervideo_dl/extractor/thisamericanlife.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/thisav.py b/hypervideo_dl/extractor/thisav.py index 6bb00b3..b1cd57d 100644 --- a/hypervideo_dl/extractor/thisav.py +++ b/hypervideo_dl/extractor/thisav.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import remove_end diff --git 
a/hypervideo_dl/extractor/thisoldhouse.py b/hypervideo_dl/extractor/thisoldhouse.py index 8a1d173..55b6413 100644 --- a/hypervideo_dl/extractor/thisoldhouse.py +++ b/hypervideo_dl/extractor/thisoldhouse.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import HEADRequest diff --git a/hypervideo_dl/extractor/threeqsdn.py b/hypervideo_dl/extractor/threeqsdn.py index 00a51dc..b104190 100644 --- a/hypervideo_dl/extractor/threeqsdn.py +++ b/hypervideo_dl/extractor/threeqsdn.py @@ -1,7 +1,3 @@ -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( @@ -18,6 +14,7 @@ class ThreeQSDNIE(InfoExtractor): IE_NAME = '3qsdn' IE_DESC = '3Q SDN' _VALID_URL = r'https?://playout\.3qsdn\.com/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_REGEX = [r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % _VALID_URL] _TESTS = [{ # https://player.3qsdn.com/demo.html 'url': 'https://playout.3qsdn.com/7201c779-6b3c-11e7-a40e-002590c750be', @@ -78,12 +75,13 @@ class ThreeQSDNIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+\b(?:data-)?src=(["\'])(?P<url>%s.*?)\1' % ThreeQSDNIE._VALID_URL, webpage) - if mobj: - return mobj.group('url') + def _extract_from_webpage(self, url, webpage): + for res in super()._extract_from_webpage(url, webpage): + yield { + **res, + '_type': 'url_transparent', + 'uploader': self._search_regex(r'^(?:https?://)?([^/]*)/.*', url, 'video uploader'), + } def _real_extract(self, url): video_id = self._match_id(url) @@ -130,10 +128,6 @@ class ThreeQSDNIE(InfoExtractor): 'vcodec': 'none' if height == 0 else None, 'width': int(height * aspect) if height and aspect else None, }) - # It seems like this would be correctly handled by default - # However, unless someone can confirm this, the old - # behaviour is being kept as-is - self._sort_formats(formats, ('res', 'source_preference')) for subtitle in (config.get('subtitles') or []): src = subtitle.get('src') @@ -155,4 +149,8 @@ class ThreeQSDNIE(InfoExtractor): 'is_live': live, 'formats': formats, 'subtitles': subtitles, + # It seems like this would be correctly handled by default + # However, unless someone can confirm this, the old + # behaviour is being kept as-is + '_format_sort_fields': ('res', 'source_preference') } diff --git a/hypervideo_dl/extractor/threespeak.py b/hypervideo_dl/extractor/threespeak.py index fe6a955..dbd5090 100644 --- a/hypervideo_dl/extractor/threespeak.py +++ b/hypervideo_dl/extractor/threespeak.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -60,7 +57,6 @@ class ThreeSpeakIE(InfoExtractor): 'quality': 11, 'format_note': 'Original file', }) - self._sort_formats(formats) return { 'id': id, 'title': data_json.get('title') or data_json.get('root_title'), diff --git a/hypervideo_dl/extractor/tiktok.py b/hypervideo_dl/extractor/tiktok.py index c1d6c54..1bbf884 100644 --- a/hypervideo_dl/extractor/tiktok.py +++ b/hypervideo_dl/extractor/tiktok.py @@ -1,35 +1,32 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools +import json import random import string import time -import json from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse -) +from ..compat import compat_urllib_parse_unquote, 
compat_urllib_parse_urlparse from ..utils import ( ExtractorError, HEADRequest, + LazyList, + UnsupportedError, + get_element_by_id, get_first, int_or_none, join_nonempty, - LazyList, + qualities, + remove_start, srt_subtitles_timecode, str_or_none, traverse_obj, try_get, url_or_none, - qualities, ) class TikTokBaseIE(InfoExtractor): - _APP_VERSIONS = [('20.9.3', '293'), ('20.4.3', '243'), ('20.2.1', '221'), ('20.1.2', '212'), ('20.0.4', '204')] + _APP_VERSIONS = [('26.1.3', '260103'), ('26.1.2', '260102'), ('26.1.1', '260101'), ('25.6.2', '250602')] _WORKING_APP_VERSION = None _APP_NAME = 'trill' _AID = 1180 @@ -38,6 +35,14 @@ class TikTokBaseIE(InfoExtractor): _WEBPAGE_HOST = 'https://www.tiktok.com/' QUALITIES = ('360p', '540p', '720p', '1080p') + @staticmethod + def _create_url(user_id, video_id): + return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}' + + def _get_sigi_state(self, webpage, display_id): + return self._parse_json(get_element_by_id( + 'SIGI_STATE|sigi-persisted-data', webpage, escape_value=False), display_id) + def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choice('0123456789abcdef') for _ in range(160))) @@ -47,7 +52,7 @@ class TikTokBaseIE(InfoExtractor): return self._download_json( 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ - 'User-Agent': f'com.ss.android.ugc.trill/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)', + 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 10; en_US; Pixel 4; Build/QQ3A.200805.001; Cronet/58.0.2991.0)', 'Accept': 'application/json', }, query=query) @@ -122,11 +127,21 @@ class TikTokBaseIE(InfoExtractor): continue raise e + def _extract_aweme_app(self, aweme_id): + feed_list = self._call_api( + 'feed', {'aweme_id': aweme_id}, aweme_id, note='Downloading video feed', + errnote='Unable to download video feed').get('aweme_list') or [] + aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) + if not aweme_detail: + raise ExtractorError('Unable to find video in feed', video_id=aweme_id) + return self._parse_aweme_video_app(aweme_detail) + def _get_subtitles(self, aweme_detail, aweme_id): # TODO: Extract text positioning info subtitles = {} + # aweme/detail endpoint subs captions_info = traverse_obj( - aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict, default=[]) + aweme_detail, ('interaction_stickers', ..., 'auto_video_caption_info', 'auto_captions', ...), expected_type=dict) for caption in captions_info: caption_url = traverse_obj(caption, ('url', 'url_list', ...), expected_type=url_or_none, get_all=False) if not caption_url: @@ -141,6 +156,24 @@ class TikTokBaseIE(InfoExtractor): f'{i + 1}\n{srt_subtitles_timecode(line["start_time"] / 1000)} --> {srt_subtitles_timecode(line["end_time"] / 1000)}\n{line["text"]}' for i, line in enumerate(caption_json['utterances']) if line.get('text')) }) + # feed endpoint subs + if not subtitles: + for caption in traverse_obj(aweme_detail, ('video', 'cla_info', 'caption_infos', ...), expected_type=dict): + if not caption.get('url'): + continue + subtitles.setdefault(caption.get('lang') or 'en', []).append({ + 'ext': 
remove_start(caption.get('caption_format'), 'web'), + 'url': caption['url'], + }) + # webpage subs + if not subtitles: + for caption in traverse_obj(aweme_detail, ('video', 'subtitleInfos', ...), expected_type=dict): + if not caption.get('Url'): + continue + subtitles.setdefault(caption.get('LanguageCodeName') or 'en', []).append({ + 'ext': remove_start(caption.get('Format'), 'web'), + 'url': caption['Url'], + }) return subtitles def _parse_aweme_video_app(self, aweme_detail): @@ -229,7 +262,6 @@ class TikTokBaseIE(InfoExtractor): if auth_cookie: for f in formats: self._set_cookie(compat_urllib_parse_urlparse(f['url']).hostname, 'sid_tt', auth_cookie.value) - self._sort_formats(formats, ('quality', 'codec', 'size', 'br')) thumbnails = [] for cover_id in ('cover', 'ai_dynamic_cover', 'animated_cover', 'ai_dynamic_cover_bak', @@ -263,6 +295,9 @@ class TikTokBaseIE(InfoExtractor): return { 'id': aweme_id, + 'extractor_key': TikTokIE.ie_key(), + 'extractor': TikTokIE.IE_NAME, + 'webpage_url': self._create_url(author_info.get('uid'), aweme_id), 'title': aweme_detail.get('desc'), 'description': aweme_detail.get('desc'), 'view_count': int_or_none(stats_info.get('play_count')), @@ -275,7 +310,7 @@ class TikTokBaseIE(InfoExtractor): 'uploader_url': user_url, 'track': music_track, 'album': str_or_none(music_info.get('album')) or None, - 'artist': music_author, + 'artist': music_author or None, 'timestamp': int_or_none(aweme_detail.get('create_time')), 'formats': formats, 'subtitles': self.extract_subtitles(aweme_detail, aweme_id), @@ -284,7 +319,8 @@ class TikTokBaseIE(InfoExtractor): 'availability': self._availability( is_private='Private' in labels, needs_subscription='Friends only' in labels, - is_unlisted='Followers only' in labels) + is_unlisted='Followers only' in labels), + '_format_sort_fields': ('quality', 'codec', 'size', 'br'), } def _parse_aweme_video_web(self, aweme_detail, webpage_url): @@ -326,7 +362,6 @@ class TikTokBaseIE(InfoExtractor): 'height': height, }) self._remove_duplicate_formats(formats) - self._sort_formats(formats) thumbnails = [] for thumbnail_name in ('thumbnail', 'cover', 'dynamicCover', 'originCover'): @@ -348,7 +383,7 @@ class TikTokBaseIE(InfoExtractor): 'timestamp': int_or_none(aweme_detail.get('createTime')), 'creator': str_or_none(author_info.get('nickname')), 'uploader': str_or_none(author_info.get('uniqueId') or aweme_detail.get('author')), - 'uploader_id': str_or_none(author_info.get('id') or aweme_detail.get('authorId')), + 'uploader_id': str_or_none(traverse_obj(author_info, 'id', 'uid', 'authorId')), 'uploader_url': user_url, 'track': str_or_none(music_info.get('title')), 'album': str_or_none(music_info.get('album')) or None, @@ -363,7 +398,8 @@ class TikTokBaseIE(InfoExtractor): class TikTokIE(TikTokBaseIE): - _VALID_URL = r'https?://www\.tiktok\.com/@[\w\.-]+/video/(?P<id>\d+)' + _VALID_URL = r'https?://www\.tiktok\.com/(?:embed|@(?P<user_id>[\w\.-]+)/video)/(?P<id>\d+)' + _EMBED_REGEX = [rf'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>{_VALID_URL})'] _TESTS = [{ 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', @@ -461,14 +497,14 @@ class TikTokIE(TikTokBaseIE): 'repost_count': int, 'comment_count': int, }, - 'expected_warnings': ['Video not available'] + 'expected_warnings': ['trying with webpage', 'Unable to find video in feed'] }, { # Video without title and description 'url': 'https://www.tiktok.com/@pokemonlife22/video/7059698374567611694', 'info_dict': { 'id': '7059698374567611694', 'ext': 'mp4', - 'title': 'tiktok video 
#7059698374567611694', + 'title': 'TikTok video #7059698374567611694', 'description': '', 'uploader': 'pokemonlife22', 'creator': 'Pokemon', @@ -485,49 +521,50 @@ class TikTokIE(TikTokBaseIE): 'repost_count': int, 'comment_count': int, }, - 'expected_warnings': ['Video not available', 'Creating a generic title'] + }, { + # hydration JSON is sent in a <script> element + 'url': 'https://www.tiktok.com/@denidil6/video/7065799023130643713', + 'info_dict': { + 'id': '7065799023130643713', + 'ext': 'mp4', + 'title': '#denidil#денидил', + 'description': '#denidil#денидил', + 'uploader': 'denidil6', + 'uploader_id': '7046664115636405250', + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAsvMSzFdQ4ikl3uR2TEJwMBbB2yZh2Zxwhx-WCo3rbDpAharE3GQCrFuJArI3C8QJ', + 'artist': 'Holocron Music', + 'album': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night', + 'track': 'Wolf Sounds (1 Hour) Enjoy the Company of the Animal That Is the Majestic King of the Night', + 'timestamp': 1645134536, + 'duration': 26, + 'upload_date': '20220217', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + 'skip': 'This video is unavailable', }, { # Auto-captions available 'url': 'https://www.tiktok.com/@hankgreen1/video/7047596209028074758', 'only_matching': True }] - def _extract_aweme_app(self, aweme_id): - try: - aweme_detail = self._call_api('aweme/detail', {'aweme_id': aweme_id}, aweme_id, - note='Downloading video details', errnote='Unable to download video details').get('aweme_detail') - if not aweme_detail: - raise ExtractorError('Video not available', video_id=aweme_id) - except ExtractorError as e: - self.report_warning(f'{e}; Retrying with feed workaround') - feed_list = self._call_api('feed', {'aweme_id': aweme_id}, aweme_id, - note='Downloading video feed', errnote='Unable to download video feed').get('aweme_list') or [] - aweme_detail = next((aweme for aweme in feed_list if str(aweme.get('aweme_id')) == aweme_id), None) - if not aweme_detail: - raise ExtractorError('Unable to find video in feed', video_id=aweme_id) - return self._parse_aweme_video_app(aweme_detail) - def _real_extract(self, url): - video_id = self._match_id(url) - + video_id, user_id = self._match_valid_url(url).group('id', 'user_id') try: return self._extract_aweme_app(video_id) except ExtractorError as e: - self.report_warning(f'{e}; Retrying with webpage') + self.report_warning(f'{e}; trying with webpage') - # If we only call once, we get a 403 when downlaoding the video. 
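
The net effect of the TikTok refactor in this file: the app-API path now lives in _extract_aweme_app() on the base class (feed endpoint only, since the aweme/detail call was dropped), and _real_extract() falls back to the webpage on failure, reading hydration data from the SIGI_STATE or sigi-persisted-data <script> element via _get_sigi_state() instead of regexing a window['SIGI_STATE'] assignment. A rough sketch of that fallback shape, with fetch_from_app_api and download_webpage as hypothetical callables standing in for the extractor's helpers:

    import json
    import re

    def extract_sigi_state(html, video_id):
        # Simplified take on get_element_by_id('SIGI_STATE|sigi-persisted-data', ...)
        m = re.search(
            r'<script[^>]+\bid=["\'](?:SIGI_STATE|sigi-persisted-data)["\'][^>]*>(.+?)</script>',
            html, re.DOTALL)
        if not m:
            raise ValueError(f'No hydration JSON found for {video_id}')
        return json.loads(m.group(1))

    def extract_video(video_id, user_id, fetch_from_app_api, download_webpage):
        # App API first; webpage hydration JSON only as a fallback
        try:
            return fetch_from_app_api(video_id)
        except Exception as e:
            print(f'{e}; trying with webpage')
        url = f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}'  # as in _create_url()
        sigi = extract_sigi_state(download_webpage(url), video_id)
        return sigi['ItemModule'][video_id]
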
- self._download_webpage(url, video_id) - webpage = self._download_webpage(url, video_id, note='Downloading video webpage') + url = self._create_url(user_id, video_id) + webpage = self._download_webpage(url, video_id, headers={'User-Agent': 'User-Agent:Mozilla/5.0'}) next_data = self._search_nextjs_data(webpage, video_id, default='{}') - if next_data: status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode'), expected_type=int) or 0 video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct'), expected_type=dict) else: - sigi_json = self._search_regex( - r'>\s*window\[[\'"]SIGI_STATE[\'"]\]\s*=\s*(?P<sigi_state>{.+});', - webpage, 'sigi data', group='sigi_state') - sigi_data = self._parse_json(sigi_json, video_id) + sigi_data = self._get_sigi_state(webpage, video_id) status = traverse_obj(sigi_data, ('VideoPage', 'statusCode'), expected_type=int) or 0 video_data = traverse_obj(sigi_data, ('ItemModule', video_id), expected_type=dict) @@ -541,6 +578,7 @@ class TikTokIE(TikTokBaseIE): class TikTokUserIE(TikTokBaseIE): IE_NAME = 'tiktok:user' _VALID_URL = r'https?://(?:www\.)?tiktok\.com/@(?P<id>[\w\.-]+)/?(?:$|[#?])' + _WORKING = False _TESTS = [{ 'url': 'https://tiktok.com/@corgibobaa?lang=en', 'playlist_mincount': 45, @@ -599,19 +637,17 @@ class TikTokUserIE(TikTokBaseIE): 'device_id': ''.join(random.choice(string.digits) for _ in range(19)), # Some endpoints don't like randomized device_id, so it isn't directly set in _call_api. } - max_retries = self.get_param('extractor_retries', 3) for page in itertools.count(1): - for retries in itertools.count(): + for retry in self.RetryManager(): try: - post_list = self._call_api('aweme/post', query, username, - note='Downloading user video list page %d%s' % (page, f' (attempt {retries})' if retries != 0 else ''), - errnote='Unable to download user video list') + post_list = self._call_api( + 'aweme/post', query, username, note=f'Downloading user video list page {page}', + errnote='Unable to download user video list') except ExtractorError as e: - if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0 and retries != max_retries: - self.report_warning('%s. Retrying...' 
% str(e.cause or e.msg)) + if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: + retry.error = e continue raise - break yield from post_list.get('aweme_list', []) if not post_list.get('has_more'): break @@ -639,7 +675,7 @@ class TikTokUserIE(TikTokBaseIE): return self.playlist_result(self._entries_api(user_id, videos), user_id, user_name, thumbnail=thumbnail) -class TikTokBaseListIE(TikTokBaseIE): +class TikTokBaseListIE(TikTokBaseIE): # XXX: Conventionally, base classes should end with BaseIE/InfoExtractor def _entries(self, list_id, display_id): query = { self._QUERY_NAME: list_id, @@ -649,19 +685,17 @@ class TikTokBaseListIE(TikTokBaseIE): 'device_id': ''.join(random.choice(string.digits) for i in range(19)) } - max_retries = self.get_param('extractor_retries', 3) for page in itertools.count(1): - for retries in itertools.count(): + for retry in self.RetryManager(): try: - post_list = self._call_api(self._API_ENDPOINT, query, display_id, - note='Downloading video list page %d%s' % (page, f' (attempt {retries})' if retries != 0 else ''), - errnote='Unable to download video list') + post_list = self._call_api( + self._API_ENDPOINT, query, display_id, note=f'Downloading video list page {page}', + errnote='Unable to download video list') except ExtractorError as e: - if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0 and retries != max_retries: - self.report_warning('%s. Retrying...' % str(e.cause or e.msg)) + if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: + retry.error = e continue raise - break for video in post_list.get('aweme_list', []): yield { **self._parse_aweme_video_app(video), @@ -681,6 +715,7 @@ class TikTokBaseListIE(TikTokBaseIE): class TikTokSoundIE(TikTokBaseListIE): IE_NAME = 'tiktok:sound' _VALID_URL = r'https?://(?:www\.)?tiktok\.com/music/[\w\.-]+-(?P<id>[\d]+)[/?#&]?' + _WORKING = False _QUERY_NAME = 'music_id' _API_ENDPOINT = 'music/aweme' _TESTS = [{ @@ -704,6 +739,7 @@ class TikTokSoundIE(TikTokBaseListIE): class TikTokEffectIE(TikTokBaseListIE): IE_NAME = 'tiktok:effect' _VALID_URL = r'https?://(?:www\.)?tiktok\.com/sticker/[\w\.-]+-(?P<id>[\d]+)[/?#&]?' 
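
Both pagination loops in these hunks swap the hand-rolled itertools.count() retry counters for self.RetryManager(): the loop body marks a failed attempt by setting retry.error, the manager re-enters the body until attempts run out, then re-raises. A simplified stand-in (not yt-dlp's actual implementation, which also honours --extractor-retries and reports warnings between attempts) to illustrate the control flow:

    import time

    class RetryManager:
        # Minimal sketch: iterate, set .error to request another attempt,
        # raise the last error once attempts are exhausted
        def __init__(self, retries=3, sleep=1.0):
            self.retries, self.sleep, self.error = retries, sleep, None

        def __iter__(self):
            for attempt in range(self.retries + 1):
                self.error = None
                yield self
                if self.error is None:  # body finished cleanly
                    return
                if attempt < self.retries:
                    time.sleep(self.sleep)
            raise self.error

    def fetch_page(download_page, page):
        for retry in RetryManager():
            try:
                return download_page(page)
            except ValueError as e:  # e.g. truncated JSON from the API
                retry.error = e
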
+ _WORKING = False _QUERY_NAME = 'sticker_id' _API_ENDPOINT = 'sticker/aweme' _TESTS = [{ @@ -723,6 +759,7 @@ class TikTokEffectIE(TikTokBaseListIE): class TikTokTagIE(TikTokBaseListIE): IE_NAME = 'tiktok:tag' _VALID_URL = r'https?://(?:www\.)?tiktok\.com/tag/(?P<id>[^/?#&]+)' + _WORKING = False _QUERY_NAME = 'ch_id' _API_ENDPOINT = 'challenge/aweme' _TESTS = [{ @@ -747,56 +784,68 @@ class TikTokTagIE(TikTokBaseListIE): return self.playlist_result(self._entries(tag_id, display_id), tag_id, display_id) -class DouyinIE(TikTokIE): +class DouyinIE(TikTokBaseIE): _VALID_URL = r'https?://(?:www\.)?douyin\.com/video/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'https://www.douyin.com/video/6961737553342991651', - 'md5': '10523312c8b8100f353620ac9dc8f067', + 'md5': 'a97db7e3e67eb57bf40735c022ffa228', 'info_dict': { 'id': '6961737553342991651', 'ext': 'mp4', 'title': '#杨超越 小小水手带你去远航❤️', - 'uploader': '杨超越', - 'upload_date': '20210513', - 'timestamp': 1620905839, + 'description': '#杨超越 小小水手带你去远航❤️', 'uploader_id': '110403406559', + 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'creator': '杨超越', + 'duration': 19782, + 'timestamp': 1620905839, + 'upload_date': '20210513', + 'track': '@杨超越创作的原声', 'view_count': int, 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, }, { 'url': 'https://www.douyin.com/video/6982497745948921092', - 'md5': 'd78408c984b9b5102904cf6b6bc2d712', + 'md5': '34a87ebff3833357733da3fe17e37c0e', 'info_dict': { 'id': '6982497745948921092', 'ext': 'mp4', 'title': '这个夏日和小羊@杨超越 一起遇见白色幻想', - 'uploader': '杨超越工作室', - 'upload_date': '20210708', - 'timestamp': 1625739481, + 'description': '这个夏日和小羊@杨超越 一起遇见白色幻想', 'uploader_id': '408654318141572', + 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAZJpnglcjW2f_CMVcnqA_6oVBXKWMpH0F8LIHuUu8-lA', + 'creator': '杨超越工作室', + 'duration': 42608, + 'timestamp': 1625739481, + 'upload_date': '20210708', + 'track': '@杨超越工作室创作的原声', 'view_count': int, 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, }, { 'url': 'https://www.douyin.com/video/6953975910773099811', - 'md5': '72e882e24f75064c218b76c8b713c185', + 'md5': 'dde3302460f19db59c47060ff013b902', 'info_dict': { 'id': '6953975910773099811', 'ext': 'mp4', 'title': '#一起看海 出现在你的夏日里', - 'uploader': '杨超越', - 'upload_date': '20210422', - 'timestamp': 1619098692, + 'description': '#一起看海 出现在你的夏日里', 'uploader_id': '110403406559', + 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'creator': '杨超越', + 'duration': 17228, + 'timestamp': 1619098692, + 'upload_date': '20210422', + 'track': '@杨超越创作的原声', 'view_count': int, 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, }, { 'url': 'https://www.douyin.com/video/6950251282489675042', 'md5': 'b4db86aec367ef810ddd38b1737d2fed', @@ -812,25 +861,30 @@ class DouyinIE(TikTokIE): 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, + 'skip': 'No longer available', }, { 'url': 'https://www.douyin.com/video/6963263655114722595', - 'md5': '1abe1c477d05ee62efb40bf2329957cf', + 'md5': 'cf9f11f0ec45d131445ec2f06766e122', 'info_dict': { 'id': '6963263655114722595', 'ext': 'mp4', 'title': '#哪个爱豆的105度最甜 换个角度看看我哈哈', - 'uploader': '杨超越', - 'upload_date': '20210517', - 'timestamp': 1621261163, + 'description': '#哪个爱豆的105度最甜 换个角度看看我哈哈', 'uploader_id': '110403406559', + 'uploader_url': 'https://www.douyin.com/user/MS4wLjABAAAAEKnfa654JAJ_N5lgZDQluwsxmY0lhfmEYNQBBkwGG98', + 'creator': '杨超越', + 
'duration': 15115, + 'timestamp': 1621261163, + 'upload_date': '20210517', + 'track': '@杨超越创作的原声', 'view_count': int, 'like_count': int, 'repost_count': int, 'comment_count': int, - } + }, }] - _APP_VERSIONS = [('9.6.0', '960')] + _APP_VERSIONS = [('23.3.0', '230300')] _APP_NAME = 'aweme' _AID = 1128 _API_HOSTNAME = 'aweme.snssdk.com' @@ -843,7 +897,8 @@ class DouyinIE(TikTokIE): try: return self._extract_aweme_app(video_id) except ExtractorError as e: - self.report_warning(f'{e}; Retrying with webpage') + e.expected = True + self.to_screen(f'{e}; trying with webpage') webpage = self._download_webpage(url, video_id) render_data_json = self._search_regex( @@ -851,7 +906,10 @@ class DouyinIE(TikTokIE): webpage, 'render data', default=None) if not render_data_json: # TODO: Run verification challenge code to generate signature cookies - raise ExtractorError('Fresh cookies (not necessarily logged in) are needed') + cookies = self._get_cookies(self._WEBPAGE_HOST) + expected = not cookies.get('s_v_web_id') or not cookies.get('ttwid') + raise ExtractorError( + 'Fresh cookies (not necessarily logged in) are needed', expected=expected) render_data = self._parse_json( render_data_json, video_id, transform_source=compat_urllib_parse_unquote) @@ -859,36 +917,43 @@ class DouyinIE(TikTokIE): class TikTokVMIE(InfoExtractor): - _VALID_URL = r'https?://(?:vm|vt)\.tiktok\.com/(?P<id>\w+)' + _VALID_URL = r'https?://(?:(?:vm|vt)\.tiktok\.com|(?:www\.)tiktok\.com/t)/(?P<id>\w+)' IE_NAME = 'vm.tiktok' _TESTS = [{ - 'url': 'https://vm.tiktok.com/ZSe4FqkKd', + 'url': 'https://www.tiktok.com/t/ZTRC5xgJp', 'info_dict': { - 'id': '7023491746608712966', + 'id': '7170520270497680683', 'ext': 'mp4', - 'title': 'md5:5607564db90271abbbf8294cca77eddd', - 'description': 'md5:5607564db90271abbbf8294cca77eddd', - 'duration': 11, - 'upload_date': '20211026', - 'uploader_id': '7007385080558846981', - 'creator': 'Memes', - 'artist': 'Memes', - 'track': 'original sound', - 'uploader': 'susmandem', - 'timestamp': 1635284105, - 'thumbnail': r're:https://.+\.webp.*', - 'like_count': int, + 'title': 'md5:c64f6152330c2efe98093ccc8597871c', + 'uploader_id': '6687535061741700102', + 'upload_date': '20221127', 'view_count': int, + 'like_count': int, 'comment_count': int, + 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAObqu3WCTXxmw2xwZ3iLEHnEecEIw7ks6rxWqOqOhaPja9BI7gqUQnjw8_5FSoDXX', + 'album': 'Wave of Mutilation: Best of Pixies', + 'thumbnail': r're:https://.+\.webp.*', + 'duration': 5, + 'timestamp': 1669516858, 'repost_count': int, - 'uploader_url': 'https://www.tiktok.com/@MS4wLjABAAAAXcNoOEOxVyBzuII_E--T0MeCrLP0ay1Sm6x_n3dluiWEoWZD0VlQOytwad4W0i0n', - } + 'artist': 'Pixies', + 'track': 'Where Is My Mind?', + 'description': 'md5:c64f6152330c2efe98093ccc8597871c', + 'uploader': 'sigmachaddeus', + 'creator': 'SigmaChad', + }, + }, { + 'url': 'https://vm.tiktok.com/ZSe4FqkKd', + 'only_matching': True, }, { 'url': 'https://vt.tiktok.com/ZSe4FqkKd', 'only_matching': True, }] def _real_extract(self, url): - return self.url_result(self._request_webpage( - HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).geturl(), TikTokIE) + new_url = self._request_webpage( + HEADRequest(url), self._match_id(url), headers={'User-Agent': 'facebookexternalhit/1.1'}).geturl() + if self.suitable(new_url): # Prevent infinite loop in case redirect fails + raise UnsupportedError(new_url) + return self.url_result(new_url) diff --git a/hypervideo_dl/extractor/tinypic.py b/hypervideo_dl/extractor/tinypic.py index 
39056e5..216208c 100644 --- a/hypervideo_dl/extractor/tinypic.py +++ b/hypervideo_dl/extractor/tinypic.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/tmz.py b/hypervideo_dl/extractor/tmz.py index aee2273..ffb30c6 100644 --- a/hypervideo_dl/extractor/tmz.py +++ b/hypervideo_dl/extractor/tmz.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -21,8 +18,10 @@ class TMZIE(InfoExtractor): "title": "No Charges Against Hillary Clinton? Harvey Says It Ain't Over Yet", "description": "Harvey talks about Director Comey’s decision not to prosecute Hillary Clinton.", "timestamp": 1467831837, - "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "uploader": "TMZ Staff", "upload_date": "20160706", + "thumbnail": "https://imagez.tmz.com/image/5e/4by3/2016/07/06/5eea7dc01baa5c2e83eb06930c170e46_xl.jpg", + "duration": 772.0, }, }, { @@ -33,8 +32,10 @@ class TMZIE(InfoExtractor): "title": "Angry Bagel Shop Guy Says He Doesn't Trust Women", "description": "The enraged man who went viral for ranting about women on dating sites before getting ragdolled in a bagel shop is defending his misogyny ... he says it's women's fault in the first place.", "timestamp": 1562889485, - "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "uploader": "TMZ Staff", "upload_date": "20190711", + "thumbnail": "https://imagez.tmz.com/image/a8/4by3/2019/07/12/a85480d27b2f50a7bfea2322151d67a5_xl.jpg", + "duration": 123.0, }, }, { @@ -46,8 +47,10 @@ class TMZIE(InfoExtractor): "title": "Bobby Brown Tells Crowd ... Bobbi Kristina is Awake", "description": 'Bobby Brown stunned his audience during a concert Saturday night, when he told the crowd, "Bobbi is awake. She\'s watching me."', "timestamp": 1429467813, - "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "uploader": "TMZ Staff", "upload_date": "20150419", + "duration": 29.0, + "thumbnail": "https://imagez.tmz.com/image/15/4by3/2015/04/20/1539c7ae136359fc979236fa6a9449dd_xl.jpg", }, }, { @@ -59,8 +62,10 @@ class TMZIE(InfoExtractor): "description": "Patti LaBelle made it known loud and clear last night ... NO " "ONE gets on her stage and strips down.", "timestamp": 1442683746, - "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "uploader": "TMZ Staff", "upload_date": "20150919", + "duration": 104.0, + "thumbnail": "https://imagez.tmz.com/image/5e/4by3/2015/09/20/5e57d7575062528082994e18ac3f0f48_xl.jpg", }, }, { @@ -71,8 +76,10 @@ class TMZIE(InfoExtractor): "title": "NBA's Adam Silver -- Blake Griffin's a Great Guy ... He'll Learn from This", "description": "Two pretty parts of this video with NBA Commish Adam Silver.", "timestamp": 1454010989, - "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "uploader": "TMZ Staff", "upload_date": "20160128", + "duration": 59.0, + "thumbnail": "https://imagez.tmz.com/image/38/4by3/2016/01/29/3856e83e0beb57059ec412122b842fb1_xl.jpg", }, }, { @@ -83,8 +90,10 @@ class TMZIE(InfoExtractor): "title": "Trump Star Vandal -- I'm Not Afraid of Donald or the Cops!", "description": "James Otis is the the guy who took a pickaxe to Donald Trump's star on the Walk of Fame, and he tells TMZ .. 
he's ready and willing to go to jail for the crime.", "timestamp": 1477500095, - "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "uploader": "TMZ Staff", "upload_date": "20161026", + "thumbnail": "https://imagez.tmz.com/image/0d/4by3/2016/10/27/0d904814d4a75dcf9cc3b8cfd1edc1a3_xl.jpg", + "duration": 128.0, }, }, { @@ -99,8 +108,10 @@ class TMZIE(InfoExtractor): "swinging their billy clubs at both Anti-Fascist and Pro-Trump " "demonstrators.", "timestamp": 1604182772, - "uploader": "{'@type': 'Person', 'name': 'TMZ Staff'}", + "uploader": "TMZ Staff", "upload_date": "20201031", + "duration": 96.0, + "thumbnail": "https://imagez.tmz.com/image/f3/4by3/2020/10/31/f37bd5a8aef84497866f425130c58be3_xl.jpg", }, }, { @@ -111,8 +122,23 @@ class TMZIE(InfoExtractor): "title": "SICK LAMBO GERVONTA DAVIS IN HIS NEW RIDE RIGHT AFTER KO AFTER LEO EsNews Boxing", "uploader": "ESNEWS", "description": "md5:49675bc58883ccf80474b8aa701e1064", - "upload_date": "20201101", + "upload_date": "20201102", "uploader_id": "ESNEWS", + "uploader_url": "http://www.youtube.com/user/ESNEWS", + "like_count": int, + "channel_id": "UCI-Oq7oFGakzSzHFlTtsUsQ", + "channel": "ESNEWS", + "view_count": int, + "duration": 225, + "live_status": "not_live", + "thumbnail": "https://i.ytimg.com/vi_webp/Dddb6IGe-ws/maxresdefault.webp", + "channel_url": "https://www.youtube.com/channel/UCI-Oq7oFGakzSzHFlTtsUsQ", + "channel_follower_count": int, + "playable_in_embed": True, + "categories": ["Sports"], + "age_limit": 0, + "tags": "count:10", + "availability": "public", }, }, { @@ -120,12 +146,20 @@ class TMZIE(InfoExtractor): "info_dict": { "id": "1329450007125225473", "ext": "mp4", - "title": "TheMacLife - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.", - "uploader": "TheMacLife", + "title": "The Mac Life - BREAKING: Conor McGregor (@thenotoriousmma) has signed his bout agreement for his rematch with Dustin Poirier for January 23.", + "uploader": "The Mac Life", "description": "md5:56e6009bbc3d12498e10d08a8e1f1c69", "upload_date": "20201119", - "uploader_id": "Maclifeofficial", + "uploader_id": "TheMacLife", "timestamp": 1605800556, + "thumbnail": "https://pbs.twimg.com/media/EnMmfT8XYAExgxJ.jpg?name=small", + "like_count": int, + "duration": 11.812, + "uploader_url": "https://twitter.com/TheMacLife", + "age_limit": 0, + "repost_count": int, + "tags": [], + "comment_count": int, }, }, ] diff --git a/hypervideo_dl/extractor/tnaflix.py b/hypervideo_dl/extractor/tnaflix.py index d7617f7..4482c84 100644 --- a/hypervideo_dl/extractor/tnaflix.py +++ b/hypervideo_dl/extractor/tnaflix.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -11,6 +9,7 @@ from ..utils import ( parse_duration, str_to_int, unescapeHTML, + url_basename, xpath_text, ) @@ -22,8 +21,6 @@ class TNAFlixNetworkBaseIE(InfoExtractor): r'<input[^>]+name="config\d?" 
value="(?P<url>[^"]+)"', r'config\s*=\s*(["\'])(?P<url>(?:https?:)?//(?:(?!\1).)+)\1', ] - _HOST = 'tna' - _VKEY_SUFFIX = '' _TITLE_REGEX = r'<input[^>]+name="title" value="([^"]+)"' _DESCRIPTION_REGEX = r'<input[^>]+name="description" value="([^"]+)"' _UPLOADER_REGEX = r'<input[^>]+name="username" value="([^"]+)"' @@ -74,7 +71,7 @@ class TNAFlixNetworkBaseIE(InfoExtractor): def _real_extract(self, url): mobj = self._match_valid_url(url) - video_id = mobj.group('id') + video_id, host = mobj.group('id', 'host') for display_id_key in ('display_id', 'display_id_2'): if display_id_key in mobj.groupdict(): display_id = mobj.group(display_id_key) @@ -85,98 +82,109 @@ class TNAFlixNetworkBaseIE(InfoExtractor): webpage = self._download_webpage(url, display_id) + # check for MovieFap-style config cfg_url = self._proto_relative_url(self._html_search_regex( self._CONFIG_REGEX, webpage, 'flashvars.config', default=None, group='url'), 'http:') + query = {} + # check for TNAFlix-style config if not cfg_url: inputs = self._hidden_inputs(webpage) - cfg_url = ('https://cdn-fck.%sflix.com/%sflix/%s%s.fid?key=%s&VID=%s&premium=1&vip=1&alpha' - % (self._HOST, self._HOST, inputs['vkey'], self._VKEY_SUFFIX, inputs['nkey'], video_id)) - - cfg_xml = self._download_xml( - cfg_url, display_id, 'Downloading metadata', - transform_source=fix_xml_ampersands, headers={'Referer': url}) - - formats = [] - - def extract_video_url(vl): - # Any URL modification now results in HTTP Error 403: Forbidden - return unescapeHTML(vl.text) - - video_link = cfg_xml.find('./videoLink') - if video_link is not None: - formats.append({ - 'url': extract_video_url(video_link), - 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), - }) - - for item in cfg_xml.findall('./quality/item'): - video_link = item.find('./videoLink') - if video_link is None: - continue - res = item.find('res') - format_id = None if res is None else res.text - height = int_or_none(self._search_regex( - r'^(\d+)[pP]', format_id, 'height', default=None)) - formats.append({ - 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'), - 'format_id': format_id, - 'height': height, + if inputs.get('vkey') and inputs.get('nkey'): + cfg_url = f'https://www.{host}.com/cdn/cdn.php' + query.update({ + 'file': inputs['vkey'], + 'key': inputs['nkey'], + 'VID': video_id, + 'premium': '1', + 'vip': '1', + 'alpha': '', + }) + + formats, json_ld = [], {} + + # TNAFlix and MovieFap extraction + if cfg_url: + cfg_xml = self._download_xml( + cfg_url, display_id, 'Downloading metadata', + transform_source=fix_xml_ampersands, headers={'Referer': url}, query=query) + + def extract_video_url(vl): + # Any URL modification now results in HTTP Error 403: Forbidden + return unescapeHTML(vl.text) + + video_link = cfg_xml.find('./videoLink') + if video_link is not None: + formats.append({ + 'url': extract_video_url(video_link), + 'ext': xpath_text(cfg_xml, './videoConfig/type', 'type', default='flv'), + }) + + for item in cfg_xml.findall('./quality/item'): + video_link = item.find('./videoLink') + if video_link is None: + continue + res = item.find('res') + format_id = None if res is None else res.text + height = int_or_none(self._search_regex( + r'^(\d+)[pP]', format_id, 'height', default=None)) + formats.append({ + 'url': self._proto_relative_url(extract_video_url(video_link), 'http:'), + 'format_id': format_id, + 'height': height, + }) + + thumbnails = self._extract_thumbnails(cfg_xml) or [] + thumbnails.append({ + 'url': 
self._proto_relative_url(xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') }) - self._sort_formats(formats) - - thumbnail = self._proto_relative_url( - xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') - thumbnails = self._extract_thumbnails(cfg_xml) - - title = None - if self._TITLE_REGEX: - title = self._html_search_regex( - self._TITLE_REGEX, webpage, 'title', default=None) - if not title: - title = self._og_search_title(webpage) - - age_limit = self._rta_search(webpage) or 18 - - duration = parse_duration(self._html_search_meta( - 'duration', webpage, 'duration', default=None)) + # check for EMPFlix-style JSON and extract + else: + player = self._download_json( + f'http://www.{host}.com/ajax/video-player/{video_id}', video_id, + headers={'Referer': url}).get('html', '') + for mobj in re.finditer(r'<source src="(?P<src>[^"]+)"', player): + video_url = mobj.group('src') + height = self._search_regex(r'-(\d+)p\.', url_basename(video_url), 'height', default=None) + formats.append({ + 'url': self._proto_relative_url(video_url, 'http:'), + 'ext': url_basename(video_url).split('.')[-1], + 'height': int_or_none(height), + 'format_id': f'{height}p' if height else url_basename(video_url).split('.')[0], + }) + thumbnail = self._proto_relative_url(self._search_regex( + r'data-poster="([^"]+)"', player, 'thumbnail', default=None), 'http:') + thumbnails = [{'url': thumbnail}] if thumbnail else None + json_ld = self._search_json_ld(webpage, display_id, default={}) def extract_field(pattern, name): return self._html_search_regex(pattern, webpage, name, default=None) if pattern else None - description = extract_field(self._DESCRIPTION_REGEX, 'description') - uploader = extract_field(self._UPLOADER_REGEX, 'uploader') - view_count = str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')) - comment_count = str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')) - average_rating = float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')) - - categories_str = extract_field(self._CATEGORIES_REGEX, 'categories') - categories = [c.strip() for c in categories_str.split(',')] if categories_str is not None else [] - return { 'id': video_id, 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, + 'title': (extract_field(self._TITLE_REGEX, 'title') + or self._og_search_title(webpage, default=None) + or json_ld.get('title')), + 'description': extract_field(self._DESCRIPTION_REGEX, 'description') or json_ld.get('description'), 'thumbnails': thumbnails, - 'duration': duration, - 'age_limit': age_limit, - 'uploader': uploader, - 'view_count': view_count, - 'comment_count': comment_count, - 'average_rating': average_rating, - 'categories': categories, + 'duration': parse_duration( + self._html_search_meta('duration', webpage, 'duration', default=None)) or json_ld.get('duration'), + 'age_limit': self._rta_search(webpage) or 18, + 'uploader': extract_field(self._UPLOADER_REGEX, 'uploader') or json_ld.get('uploader'), + 'view_count': str_to_int(extract_field(self._VIEW_COUNT_REGEX, 'view count')), + 'comment_count': str_to_int(extract_field(self._COMMENT_COUNT_REGEX, 'comment count')), + 'average_rating': float_or_none(extract_field(self._AVERAGE_RATING_REGEX, 'average rating')), + 'categories': list(map(str.strip, (extract_field(self._CATEGORIES_REGEX, 'categories') or '').split(','))), 'formats': formats, } class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): - _VALID_URL = 
r'https?://player\.(?:tna|emp)flix\.com/video/(?P<id>\d+)' - - _TITLE_REGEX = r'<title>([^<]+)</title>' + _VALID_URL = r'https?://player\.(?P<host>tnaflix|empflix)\.com/video/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1'] _TESTS = [{ 'url': 'https://player.tnaflix.com/video/6538', @@ -184,23 +192,26 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): 'id': '6538', 'display_id': '6538', 'ext': 'mp4', - 'title': 'Educational xxx video', + 'title': 'Educational xxx video (G Spot)', + 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', 'thumbnail': r're:https?://.*\.jpg$', 'age_limit': 18, + 'duration': 164, + 'uploader': 'bobwhite39', + 'categories': list, }, 'params': { 'skip_download': True, }, }, { - 'url': 'https://player.empflix.com/video/33051', + 'url': 'http://player.empflix.com/video/33051', 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.(?:tna|emp)flix\.com/video/\d+)\1', - webpage)] + def _real_extract(self, url): + mobj = self._match_valid_url(url) + video_id, host = mobj.group('id', 'host') + return self.url_result(f'http://www.{host}.com/category/{video_id}/video{video_id}') class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE): @@ -210,7 +221,7 @@ class TNAEMPFlixBaseIE(TNAFlixNetworkBaseIE): class TNAFlixIE(TNAEMPFlixBaseIE): - _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?(?P<host>tnaflix)\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' _TITLE_REGEX = r'<title>(.+?) - (?:TNAFlix Porn Videos|TNAFlix\.com)</title>' @@ -226,17 +237,17 @@ class TNAFlixIE(TNAEMPFlixBaseIE): 'thumbnail': r're:https?://.*\.jpg$', 'duration': 91, 'age_limit': 18, - 'categories': ['Porn Stars'], + 'categories': list, } }, { # non-anonymous uploader, categories 'url': 'https://www.tnaflix.com/teen-porn/Educational-xxx-video/video6538', - 'md5': '0f5d4d490dbfd117b8607054248a07c0', + 'md5': 'add5a9fa7f4da53d3e9d0845ac58f20c', 'info_dict': { 'id': '6538', 'display_id': 'Educational-xxx-video', 'ext': 'mp4', - 'title': 'Educational xxx video', + 'title': 'Educational xxx video (G Spot)', 'description': 'md5:b4fab8f88a8621c8fabd361a173fe5b8', 'thumbnail': r're:https?://.*\.jpg$', 'duration': 164, @@ -251,14 +262,11 @@ class TNAFlixIE(TNAEMPFlixBaseIE): class EMPFlixIE(TNAEMPFlixBaseIE): - _VALID_URL = r'https?://(?:www\.)?empflix\.com/(?:videos/(?P<display_id>.+?)-|[^/]+/(?P<display_id_2>[^/]+)/video)(?P<id>[0-9]+)' - - _HOST = 'emp' - _VKEY_SUFFIX = '-1' + _VALID_URL = r'https?://(?:www\.)?(?P<host>empflix)\.com/(?:videos/(?P<display_id>.+?)-|[^/]+/(?P<display_id_2>[^/]+)/video)(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', - 'md5': 'bc30d48b91a7179448a0bda465114676', + 'url': 'http://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051', + 'md5': 'd761c7b26601bd14476cd9512f2654fc', 'info_dict': { 'id': '33051', 'display_id': 'Amateur-Finger-Fuck', @@ -268,20 +276,20 @@ class EMPFlixIE(TNAEMPFlixBaseIE): 'thumbnail': r're:https?://.*\.jpg$', 'duration': 83, 'age_limit': 18, - 'uploader': 'cwbike', - 'categories': ['Amateur', 'Anal', 'Fisting', 'Home made', 'Solo'], + 'uploader': None, + 'categories': list, } }, { 'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html', 'only_matching': True, }, { - 'url': 
'https://www.empflix.com/amateur-porn/Amateur-Finger-Fuck/video33051', + 'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html', 'only_matching': True, }] class MovieFapIE(TNAFlixNetworkBaseIE): - _VALID_URL = r'https?://(?:www\.)?moviefap\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?(?P<host>moviefap)\.com/videos/(?P<id>[0-9a-f]+)/(?P<display_id>[^/]+)\.html' _VIEW_COUNT_REGEX = r'<br>Views\s*<strong>([\d,.]+)</strong>' _COMMENT_COUNT_REGEX = r'<span[^>]+id="comCount"[^>]*>([\d,.]+)</span>' @@ -323,5 +331,6 @@ class MovieFapIE(TNAFlixNetworkBaseIE): 'comment_count': int, 'average_rating': float, 'categories': ['Amateur', 'Teen'], - } + }, + 'skip': 'This video does not exist', }] diff --git a/hypervideo_dl/extractor/toggle.py b/hypervideo_dl/extractor/toggle.py index eb87349..7073733 100644 --- a/hypervideo_dl/extractor/toggle.py +++ b/hypervideo_dl/extractor/toggle.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -157,7 +154,6 @@ class ToggleIE(InfoExtractor): and meta.get('Key') == 'Encryption' and meta.get('Value') == '1'): self.report_drm(video_id) # Most likely because geo-blocked if no formats and no DRM - self._sort_formats(formats) thumbnails = [] for picture in info.get('Pictures', []): diff --git a/hypervideo_dl/extractor/toggo.py b/hypervideo_dl/extractor/toggo.py index da5f0c4..1ddec49 100644 --- a/hypervideo_dl/extractor/toggo.py +++ b/hypervideo_dl/extractor/toggo.py @@ -4,7 +4,7 @@ from ..utils import int_or_none, parse_qs class ToggoIE(InfoExtractor): IE_NAME = 'toggo' - _VALID_URL = r'https?://(?:www\.)?toggo\.de/[\w-]+/folge/(?P<id>[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?toggo\.de/(?:toggolino/)?[^/?#]+/(?:folge|video)/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.toggo.de/weihnachtsmann--co-kg/folge/ein-geschenk-fuer-zwei', 'info_dict': { @@ -27,6 +27,15 @@ class ToggoIE(InfoExtractor): 'upload_date': '20200217', }, 'params': {'skip_download': True}, + }, { + 'url': 'https://www.toggo.de/grizzy--die-lemminge/folge/ab-durch-die-wand-vogelfrei-rock\'n\'lemming', + 'only_matching': True, + }, { + 'url': 'https://www.toggo.de/toggolino/paw-patrol/folge/der-wetter-zeppelin-der-chili-kochwettbewerb', + 'only_matching': True, + }, { + 'url': 'https://www.toggo.de/toggolino/paw-patrol/video/paw-patrol-rettung-im-anflug', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/hypervideo_dl/extractor/tokentube.py b/hypervideo_dl/extractor/tokentube.py index 579623f..d022e27 100644 --- a/hypervideo_dl/extractor/tokentube.py +++ b/hypervideo_dl/extractor/tokentube.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import re @@ -98,8 +95,6 @@ class TokentubeIE(InfoExtractor): description = remove_end(description, 'Category') - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, diff --git a/hypervideo_dl/extractor/tonline.py b/hypervideo_dl/extractor/tonline.py index 9b6a40d..7202826 100644 --- a/hypervideo_dl/extractor/tonline.py +++ b/hypervideo_dl/extractor/tonline.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import int_or_none, join_nonempty diff --git a/hypervideo_dl/extractor/toongoggles.py b/hypervideo_dl/extractor/toongoggles.py index df13d64..1b8fc3a 100644 --- a/hypervideo_dl/extractor/toongoggles.py +++ b/hypervideo_dl/extractor/toongoggles.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from 
__future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/toutv.py b/hypervideo_dl/extractor/toutv.py index 1d5da10..f60c199 100644 --- a/hypervideo_dl/extractor/toutv.py +++ b/hypervideo_dl/extractor/toutv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .radiocanada import RadioCanadaIE @@ -12,7 +9,7 @@ from ..utils import ( ) -class TouTvIE(RadioCanadaIE): +class TouTvIE(RadioCanadaIE): # XXX: Do not subclass from concrete IE _NETRC_MACHINE = 'toutv' IE_NAME = 'tou.tv' _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/S[0-9]+[EC][0-9]+)?)' diff --git a/hypervideo_dl/extractor/toypics.py b/hypervideo_dl/extractor/toypics.py index f705a06..bc73361 100644 --- a/hypervideo_dl/extractor/toypics.py +++ b/hypervideo_dl/extractor/toypics.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor import re diff --git a/hypervideo_dl/extractor/traileraddict.py b/hypervideo_dl/extractor/traileraddict.py index 514f479..5c4a138 100644 --- a/hypervideo_dl/extractor/traileraddict.py +++ b/hypervideo_dl/extractor/traileraddict.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/triller.py b/hypervideo_dl/extractor/triller.py new file mode 100644 index 0000000..acd9e68 --- /dev/null +++ b/hypervideo_dl/extractor/triller.py @@ -0,0 +1,294 @@ +import itertools +import json + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + int_or_none, + str_or_none, + traverse_obj, + unified_strdate, + unified_timestamp, + url_basename, +) + + +class TrillerBaseIE(InfoExtractor): + _NETRC_MACHINE = 'triller' + _API_BASE_URL = 'https://social.triller.co/v1.5' + _API_HEADERS = {'Origin': 'https://triller.co'} + + def _perform_login(self, username, password): + if self._API_HEADERS.get('Authorization'): + return + + user_check = self._download_json( + f'{self._API_BASE_URL}/api/user/is-valid-username', None, note='Checking username', + fatal=False, expected_status=400, headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://triller.co', + }, data=json.dumps({'username': username}, separators=(',', ':')).encode('utf-8')) + if user_check.get('status'): # endpoint returns "status":false if username exists + raise ExtractorError('Unable to login: Invalid username', expected=True) + + credentials = { + 'username': username, + 'password': password, + } + login = self._download_json( + f'{self._API_BASE_URL}/user/auth', None, note='Logging in', + fatal=False, expected_status=400, headers={ + 'Content-Type': 'application/json', + 'Origin': 'https://triller.co', + }, data=json.dumps(credentials, separators=(',', ':')).encode('utf-8')) + if not login.get('auth_token'): + if login.get('error') == 1008: + raise ExtractorError('Unable to login: Incorrect password', expected=True) + raise ExtractorError('Unable to login') + + self._API_HEADERS['Authorization'] = f'Bearer {login["auth_token"]}' + + def _get_comments(self, video_id, limit=15): + comment_info = self._download_json( + f'{self._API_BASE_URL}/api/videos/{video_id}/comments_v2', + video_id, fatal=False, note='Downloading comments API JSON', + headers=self._API_HEADERS, query={'limit': limit}) or {} + if not comment_info.get('comments'): + return + for comment_dict in comment_info['comments']: + yield { + 'author': traverse_obj(comment_dict, ('author', 
'username')), + 'author_id': traverse_obj(comment_dict, ('author', 'user_id')), + 'id': comment_dict.get('id'), + 'text': comment_dict.get('body'), + 'timestamp': unified_timestamp(comment_dict.get('timestamp')), + } + + def _check_user_info(self, user_info): + if not user_info: + self.report_warning('Unable to extract user info') + elif user_info.get('private') and not user_info.get('followed_by_me'): + raise ExtractorError('This video is private', expected=True) + elif traverse_obj(user_info, 'blocked_by_user', 'blocking_user'): + raise ExtractorError('The author of the video is blocked', expected=True) + return user_info + + def _parse_video_info(self, video_info, username, user_info=None): + video_uuid = video_info.get('video_uuid') + video_id = video_info.get('id') + + formats = [] + video_url = traverse_obj(video_info, 'video_url', 'stream_url') + if video_url: + formats.append({ + 'url': video_url, + 'ext': 'mp4', + 'vcodec': 'h264', + 'width': video_info.get('width'), + 'height': video_info.get('height'), + 'format_id': url_basename(video_url).split('.')[0], + 'filesize': video_info.get('filesize'), + }) + video_set = video_info.get('video_set') or [] + for video in video_set: + resolution = video.get('resolution') or '' + formats.append({ + 'url': video['url'], + 'ext': 'mp4', + 'vcodec': video.get('codec'), + 'vbr': int_or_none(video.get('bitrate'), 1000), + 'width': int_or_none(resolution.split('x')[0]), + 'height': int_or_none(resolution.split('x')[1]), + 'format_id': url_basename(video['url']).split('.')[0], + }) + audio_url = video_info.get('audio_url') + if audio_url: + formats.append({ + 'url': audio_url, + 'ext': 'm4a', + 'format_id': url_basename(audio_url).split('.')[0], + }) + + manifest_url = video_info.get('transcoded_url') + if manifest_url: + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + + comment_count = int_or_none(video_info.get('comment_count')) + + user_info = user_info or traverse_obj(video_info, 'user', default={}) + + return { + 'id': str_or_none(video_id) or video_uuid, + 'title': video_info.get('description') or f'Video by {username}', + 'thumbnail': video_info.get('thumbnail_url'), + 'description': video_info.get('description'), + 'uploader': str_or_none(username), + 'uploader_id': str_or_none(user_info.get('user_id')), + 'creator': str_or_none(user_info.get('name')), + 'timestamp': unified_timestamp(video_info.get('timestamp')), + 'upload_date': unified_strdate(video_info.get('timestamp')), + 'duration': int_or_none(video_info.get('duration')), + 'view_count': int_or_none(video_info.get('play_count')), + 'like_count': int_or_none(video_info.get('likes_count')), + 'artist': str_or_none(video_info.get('song_artist')), + 'track': str_or_none(video_info.get('song_title')), + 'webpage_url': f'https://triller.co/@{username}/video/{video_uuid}', + 'uploader_url': f'https://triller.co/@{username}', + 'extractor_key': TrillerIE.ie_key(), + 'extractor': TrillerIE.IE_NAME, + 'formats': formats, + 'comment_count': comment_count, + '__post_extractor': self.extract_comments(video_id, comment_count), + } + + +class TrillerIE(TrillerBaseIE): + _VALID_URL = r'''(?x) + https?://(?:www\.)?triller\.co/ + @(?P<username>[\w\._]+)/video/ + (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}) + ''' + _TESTS = [{ + 'url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf', + 'md5': '228662d783923b60d78395fedddc0a20', + 'info_dict': { + 'id': 
'71595734', + 'ext': 'mp4', + 'title': 'md5:9a2bf9435c5c4292678996a464669416', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + 'description': 'md5:9a2bf9435c5c4292678996a464669416', + 'uploader': 'theestallion', + 'uploader_id': '18992236', + 'creator': 'Megan Thee Stallion', + 'timestamp': 1660598222, + 'upload_date': '20220815', + 'duration': 47, + 'height': 3840, + 'width': 2160, + 'view_count': int, + 'like_count': int, + 'artist': 'Megan Thee Stallion', + 'track': 'Her', + 'webpage_url': 'https://triller.co/@theestallion/video/2358fcd7-3df2-4c77-84c8-1d091610a6cf', + 'uploader_url': 'https://triller.co/@theestallion', + 'comment_count': int, + } + }, { + 'url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc', + 'md5': '874055f462af5b0699b9dbb527a505a0', + 'info_dict': { + 'id': '71621339', + 'ext': 'mp4', + 'title': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + 'description': 'md5:4c91ea82760fe0fffb71b8c3aa7295fc', + 'uploader': 'charlidamelio', + 'uploader_id': '1875551', + 'creator': 'charli damelio', + 'timestamp': 1660773354, + 'upload_date': '20220817', + 'duration': 16, + 'height': 1920, + 'width': 1080, + 'view_count': int, + 'like_count': int, + 'artist': 'Dixie', + 'track': 'Someone to Blame', + 'webpage_url': 'https://triller.co/@charlidamelio/video/46c6fcfa-aa9e-4503-a50c-68444f44cddc', + 'uploader_url': 'https://triller.co/@charlidamelio', + 'comment_count': int, + } + }] + + def _real_extract(self, url): + username, video_uuid = self._match_valid_url(url).group('username', 'id') + + video_info = traverse_obj(self._download_json( + f'{self._API_BASE_URL}/api/videos/{video_uuid}', + video_uuid, note='Downloading video info API JSON', + errnote='Unable to download video info API JSON', + headers=self._API_HEADERS), ('videos', 0)) + if not video_info: + raise ExtractorError('No video info found in API response') + + user_info = self._check_user_info(video_info.get('user') or {}) + return self._parse_video_info(video_info, username, user_info) + + +class TrillerUserIE(TrillerBaseIE): + _VALID_URL = r'https?://(?:www\.)?triller\.co/@(?P<id>[\w\._]+)/?(?:$|[#?])' + _TESTS = [{ + # first videos request only returns 2 videos + 'url': 'https://triller.co/@theestallion', + 'playlist_mincount': 9, + 'info_dict': { + 'id': '18992236', + 'title': 'theestallion', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + } + }, { + 'url': 'https://triller.co/@charlidamelio', + 'playlist_mincount': 25, + 'info_dict': { + 'id': '1875551', + 'title': 'charlidamelio', + 'thumbnail': r're:^https://uploads\.cdn\.triller\.co/.+\.jpg$', + } + }] + + def _real_initialize(self): + if not self._API_HEADERS.get('Authorization'): + guest = self._download_json( + f'{self._API_BASE_URL}/user/create_guest', + None, note='Creating guest session', data=b'', headers=self._API_HEADERS, query={ + 'platform': 'Web', + 'app_version': '', + }) + if not guest.get('auth_token'): + raise ExtractorError('Unable to fetch required auth token for user extraction') + + self._API_HEADERS['Authorization'] = f'Bearer {guest["auth_token"]}' + + def _extract_video_list(self, username, user_id, limit=6): + query = { + 'limit': limit, + } + for page in itertools.count(1): + for retry in self.RetryManager(): + try: + video_list = self._download_json( + f'{self._API_BASE_URL}/api/users/{user_id}/videos', + username, note=f'Downloading user video list page {page}', + errnote='Unable to download user video 
list', headers=self._API_HEADERS, + query=query) + except ExtractorError as e: + if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: + retry.error = e + continue + raise + if not video_list.get('videos'): + break + yield from video_list['videos'] + query['before_time'] = traverse_obj(video_list, ('videos', -1, 'timestamp')) + if not query['before_time']: + break + + def _entries(self, videos, username, user_info): + for video in videos: + yield self._parse_video_info(video, username, user_info) + + def _real_extract(self, url): + username = self._match_id(url) + user_info = self._check_user_info(self._download_json( + f'{self._API_BASE_URL}/api/users/by_username/{username}', + username, note='Downloading user info', + errnote='Failed to download user info', headers=self._API_HEADERS).get('user', {})) + + user_id = str_or_none(user_info.get('user_id')) + videos = self._extract_video_list(username, user_id) + thumbnail = user_info.get('avatar_url') + + return self.playlist_result( + self._entries(videos, username, user_info), user_id, username, thumbnail=thumbnail) diff --git a/hypervideo_dl/extractor/trilulilu.py b/hypervideo_dl/extractor/trilulilu.py index a800449..fb97be7 100644 --- a/hypervideo_dl/extractor/trilulilu.py +++ b/hypervideo_dl/extractor/trilulilu.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/trovo.py b/hypervideo_dl/extractor/trovo.py index 65ea13d..545a672 100644 --- a/hypervideo_dl/extractor/trovo.py +++ b/hypervideo_dl/extractor/trovo.py @@ -1,8 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import json +import random +import string from .common import InfoExtractor from ..utils import ( @@ -10,6 +9,7 @@ from ..utils import ( format_field, int_or_none, str_or_none, + traverse_obj, try_get, ) @@ -18,10 +18,20 @@ class TrovoBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?trovo\.live/' _HEADERS = {'Origin': 'https://trovo.live'} - def _call_api(self, video_id, query=None, data=None): - return self._download_json( - 'https://gql.trovo.live/', video_id, query=query, data=data, - headers={'Accept': 'application/json'}) + def _call_api(self, video_id, data): + if 'persistedQuery' in data.get('extensions', {}): + url = 'https://gql.trovo.live' + else: + url = 'https://api-web.trovo.live/graphql' + + resp = self._download_json( + url, video_id, data=json.dumps([data]).encode(), headers={'Accept': 'application/json'}, + query={ + 'qid': ''.join(random.choices(string.ascii_uppercase + string.digits, k=16)), + })[0] + if 'errors' in resp: + raise ExtractorError(f'Trovo said: {resp["errors"][0]["message"]}') + return resp['data'][data['operationName']] def _extract_streamer_info(self, data): streamer_info = data.get('streamerInfo') or {} @@ -29,36 +39,43 @@ class TrovoBaseIE(InfoExtractor): return { 'uploader': streamer_info.get('nickName'), 'uploader_id': str_or_none(streamer_info.get('uid')), - 'uploader_url': format_field(username, template='https://trovo.live/%s'), + 'uploader_url': format_field(username, None, 'https://trovo.live/%s'), } class TrovoIE(TrovoBaseIE): - _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?!(?:clip|video)/)(?P<id>[^/?&#]+)' + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:s/)?(?!(?:clip|video)/)(?P<id>(?!s/)[^/?&#]+(?![^#]+[?&]vid=))' + _TESTS = [{ + 'url': 'https://trovo.live/Exsl', + 'only_matching': True, + }, { + 'url': 
'https://trovo.live/s/SkenonSLive/549759191497', + 'only_matching': True, + }, { + 'url': 'https://trovo.live/s/zijo987/208251706', + 'info_dict': { + 'id': '104125853_104125853_1656439572', + 'ext': 'flv', + 'uploader_url': 'https://trovo.live/zijo987', + 'uploader_id': '104125853', + 'thumbnail': 'https://livecover.trovo.live/screenshot/73846_104125853_104125853-2022-06-29-04-00-22-852x480.jpg', + 'uploader': 'zijo987', + 'title': '💥IGRAMO IGRICE UPADAJTE💥2500/5000 2022-06-28 22:01', + 'live_status': 'is_live', + }, + 'skip': 'May not be live' + }] def _real_extract(self, url): username = self._match_id(url) - live_info = self._call_api(username, query={ - 'query': '''{ - getLiveInfo(params: {userName: "%s"}) { - isLive - programInfo { - coverUrl - id - streamInfo { - desc - playUrl - } - title - } - streamerInfo { - nickName - uid - userName - } - } -}''' % username, - })['data']['getLiveInfo'] + live_info = self._call_api(username, data={ + 'operationName': 'live_LiveReaderService_GetLiveInfo', + 'variables': { + 'params': { + 'userName': username, + }, + }, + }) if live_info.get('isLive') == 0: raise ExtractorError('%s is offline' % username, expected=True) program_info = live_info['programInfo'] @@ -75,9 +92,9 @@ class TrovoIE(TrovoBaseIE): 'format_id': format_id, 'height': int_or_none(format_id[:-1]) if format_id else None, 'url': play_url, + 'tbr': stream_info.get('bitrate'), 'http_headers': self._HEADERS, }) - self._sort_formats(formats) info = { 'id': program_id, @@ -91,57 +108,100 @@ class TrovoIE(TrovoBaseIE): class TrovoVodIE(TrovoBaseIE): - _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video)/(?P<id>[^/?&#]+)' + _VALID_URL = TrovoBaseIE._VALID_URL_BASE + r'(?:clip|video|s)/(?:[^/]+/\d+[^#]*[?&]vid=)?(?P<id>(?<!/s/)[^/?&#]+)' _TESTS = [{ - 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', + 'url': 'https://trovo.live/clip/lc-5285890818705062210?ltab=videos', + 'params': {'getcomments': True}, 'info_dict': { - 'id': 'ltv-100095501_100095501_1609596043', + 'id': 'lc-5285890818705062210', 'ext': 'mp4', - 'title': 'Spontaner 12 Stunden Stream! 
- Ok Boomer!', - 'uploader': 'Exsl', - 'timestamp': 1609640305, - 'upload_date': '20210103', - 'uploader_id': '100095501', - 'duration': 43977, + 'title': 'fatal moaning for a super good🤣🤣', + 'uploader': 'OneTappedYou', + 'timestamp': 1621628019, + 'upload_date': '20210521', + 'uploader_id': '100719456', + 'duration': 31, 'view_count': int, 'like_count': int, 'comment_count': int, - 'comments': 'mincount:8', - 'categories': ['Grand Theft Auto V'], + 'comments': 'mincount:1', + 'categories': ['Call of Duty: Mobile'], + 'uploader_url': 'https://trovo.live/OneTappedYou', + 'thumbnail': r're:^https?://.*\.jpg', }, - 'skip': '404' }, { - 'url': 'https://trovo.live/clip/lc-5285890810184026005', + 'url': 'https://trovo.live/s/SkenonSLive/549759191497?vid=ltv-100829718_100829718_387702301737980280', + 'info_dict': { + 'id': 'ltv-100829718_100829718_387702301737980280', + 'ext': 'mp4', + 'timestamp': 1654909624, + 'thumbnail': 'http://vod.trovo.live/1f09baf0vodtransger1301120758/ef9ea3f0387702301737980280/coverBySnapshot/coverBySnapshot_10_0.jpg', + 'uploader_id': '100829718', + 'uploader': 'SkenonSLive', + 'title': 'Trovo u secanju, uz par modova i muzike :)', + 'uploader_url': 'https://trovo.live/SkenonSLive', + 'duration': 10830, + 'view_count': int, + 'like_count': int, + 'upload_date': '20220611', + 'comment_count': int, + 'categories': ['Minecraft'], + }, + 'skip': 'Not available', + }, { + 'url': 'https://trovo.live/s/Trovo/549756886599?vid=ltv-100264059_100264059_387702304241698583', + 'info_dict': { + 'id': 'ltv-100264059_100264059_387702304241698583', + 'ext': 'mp4', + 'timestamp': 1661479563, + 'thumbnail': 'http://vod.trovo.live/be5ae591vodtransusw1301120758/cccb9915387702304241698583/coverBySnapshot/coverBySnapshot_10_0.jpg', + 'uploader_id': '100264059', + 'uploader': 'Trovo', + 'title': 'Dev Corner 8/25', + 'uploader_url': 'https://trovo.live/Trovo', + 'duration': 3753, + 'view_count': int, + 'like_count': int, + 'upload_date': '20220826', + 'comment_count': int, + 'categories': ['Talk Shows'], + }, + }, { + 'url': 'https://trovo.live/video/ltv-100095501_100095501_1609596043', + 'only_matching': True, + }, { + 'url': 'https://trovo.live/s/SkenonSLive/549759191497?foo=bar&vid=ltv-100829718_100829718_387702301737980280', 'only_matching': True, }] def _real_extract(self, url): vid = self._match_id(url) - resp = self._call_api(vid, data=json.dumps([{ - 'query': '''{ - batchGetVodDetailInfo(params: {vids: ["%s"]}) { - VodDetailInfos - } -}''' % vid, - }, { - 'query': '''{ - getCommentList(params: {appInfo: {postID: "%s"}, pageSize: 1000000000, preview: {}}) { - commentList { - author { - nickName - uid - } - commentID - content - createdAt - parentID - } - } -}''' % vid, - }]).encode()) - vod_detail_info = resp[0]['data']['batchGetVodDetailInfo']['VodDetailInfos'][vid] - vod_info = vod_detail_info['vodInfo'] - title = vod_info['title'] + + # NOTE: It is also possible to extract this info from the Nuxt data on the website, + # however that seems unreliable - sometimes it randomly doesn't return the data, + # at least when using a non-residential IP. 
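Editor's note (illustrative sketch, not part of the upstream patch): the call that follows goes through the reworked TrovoBaseIE._call_api above. Since it sends no 'persistedQuery' extension, the operation is POSTed to api-web.trovo.live as a one-element JSON array with a random 16-character qid, and the payload of interest is unwrapped by operation name. A stdlib-only approximation of that round trip; the function name and error type are the editor's own:

    import json
    import random
    import string
    import urllib.request

    def call_trovo_api(operation_name, variables, extensions=None):
        # Non-persisted queries use the api-web endpoint, as in _call_api
        qid = ''.join(random.choices(string.ascii_uppercase + string.digits, k=16))
        request = urllib.request.Request(
            f'https://api-web.trovo.live/graphql?qid={qid}',
            data=json.dumps([{
                'operationName': operation_name,
                'variables': variables,
                'extensions': extensions or {},
            }]).encode(),
            headers={'Accept': 'application/json'})
        resp = json.load(urllib.request.urlopen(request))[0]
        if 'errors' in resp:
            raise RuntimeError(f'Trovo said: {resp["errors"][0]["message"]}')
        # The response nests the result under the operation name, e.g.
        # resp['data']['vod_VodReaderService_BatchGetVodDetailInfo']
        return resp['data'][operation_name]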
+ resp = self._call_api(vid, data={ + 'operationName': 'vod_VodReaderService_BatchGetVodDetailInfo', + 'variables': { + 'params': { + 'vids': [vid], + }, + }, + 'extensions': {}, + }) + + vod_detail_info = traverse_obj(resp, ('VodDetailInfos', vid), expected_type=dict) + if not vod_detail_info: + raise ExtractorError('This video was not found or is no longer available', expected=True) + vod_info = vod_detail_info.get('vodInfo') + title = vod_info.get('title') + + if try_get(vod_info, lambda x: x['playbackRights']['playbackRights'] != 'Normal'): + playback_rights_setting = vod_info['playbackRights']['playbackRightsSetting'] + if playback_rights_setting == 'SubscriberOnly': + raise ExtractorError('This video is only available for subscribers', expected=True) + else: + raise ExtractorError(f'This video is not available ({playback_rights_setting})', expected=True) language = vod_info.get('languageName') formats = [] @@ -161,28 +221,10 @@ class TrovoVodIE(TrovoBaseIE): 'url': play_url, 'http_headers': self._HEADERS, }) - self._sort_formats(formats) category = vod_info.get('categoryName') get_count = lambda x: int_or_none(vod_info.get(x + 'Num')) - comment_list = try_get(resp, lambda x: x[1]['data']['getCommentList']['commentList'], list) or [] - comments = [] - for comment in comment_list: - content = comment.get('content') - if not content: - continue - author = comment.get('author') or {} - parent = comment.get('parentID') - comments.append({ - 'author': author.get('nickName'), - 'author_id': str_or_none(author.get('uid')), - 'id': str_or_none(comment.get('commentID')), - 'text': content, - 'timestamp': int_or_none(comment.get('createdAt')), - 'parent': 'root' if parent == 0 else str_or_none(parent), - }) - info = { 'id': vid, 'title': title, @@ -193,35 +235,81 @@ class TrovoVodIE(TrovoBaseIE): 'view_count': get_count('watch'), 'like_count': get_count('like'), 'comment_count': get_count('comment'), - 'comments': comments, 'categories': [category] if category else None, + '__post_extractor': self.extract_comments(vid), } info.update(self._extract_streamer_info(vod_detail_info)) return info + def _get_comments(self, vid): + for page in itertools.count(1): + comments_json = self._call_api(vid, data={ + 'operationName': 'public_CommentProxyService_GetCommentList', + 'variables': { + 'params': { + 'appInfo': { + 'postID': vid, + }, + 'preview': {}, + 'pageSize': 99, + 'page': page, + }, + }, + 'extensions': { + 'singleReq': 
'true', + }, + }) vods = vod_json.get('vodInfos', []) for vod in vods: + vid = vod.get('vid') + room = traverse_obj(vod, ('spaceInfo', 'roomID')) yield self.url_result( - 'https://trovo.live/%s/%s' % (self._TYPE, vod.get('vid')), + f'https://trovo.live/s/{spacename}/{room}?vid={vid}', ie=TrovoVodIE.ie_key()) - has_more = vod_json['hasMore'] + has_more = vod_json.get('hasMore') if not has_more: break def _real_extract(self, url): - id = self._match_id(url) - uid = str(self._call_api(id, query={ - 'query': '{getLiveInfo(params:{userName:"%s"}){streamerInfo{uid}}}' % id - })['data']['getLiveInfo']['streamerInfo']['uid']) - return self.playlist_result(self._entries(uid), playlist_id=uid) + spacename = self._match_id(url) + return self.playlist_result(self._entries(spacename), playlist_id=spacename) class TrovoChannelVodIE(TrovoChannelBaseIE): @@ -232,17 +320,11 @@ class TrovoChannelVodIE(TrovoChannelBaseIE): 'url': 'trovovod:OneTappedYou', 'playlist_mincount': 24, 'info_dict': { - 'id': '100719456', + 'id': 'OneTappedYou', }, }] - _QUERY = '{getChannelLtvVideoInfos(params:{pageSize:99,currPage:%d,channelID:%s}){hasMore,vodInfos{vid}}}' - _TYPE = 'video' - - def _get_vod_json(self, page, uid): - return self._call_api(uid, query={ - 'query': self._QUERY % (page, uid) - })['data']['getChannelLtvVideoInfos'] + _OPERATION = 'vod_VodReaderService_GetChannelLtvVideoInfos' class TrovoChannelClipIE(TrovoChannelBaseIE): @@ -253,14 +335,8 @@ class TrovoChannelClipIE(TrovoChannelBaseIE): 'url': 'trovoclip:OneTappedYou', 'playlist_mincount': 29, 'info_dict': { - 'id': '100719456', + 'id': 'OneTappedYou', }, }] - _QUERY = '{getChannelClipVideoInfos(params:{pageSize:99,currPage:%d,channelID:%s,albumType:VOD_CLIP_ALBUM_TYPE_LATEST}){hasMore,vodInfos{vid}}}' - _TYPE = 'clip' - - def _get_vod_json(self, page, uid): - return self._call_api(uid, query={ - 'query': self._QUERY % (page, uid) - })['data']['getChannelClipVideoInfos'] + _OPERATION = 'vod_VodReaderService_GetChannelClipVideoInfos' diff --git a/hypervideo_dl/extractor/trueid.py b/hypervideo_dl/extractor/trueid.py index fc98303..6963436 100644 --- a/hypervideo_dl/extractor/trueid.py +++ b/hypervideo_dl/extractor/trueid.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( diff --git a/hypervideo_dl/extractor/trunews.py b/hypervideo_dl/extractor/trunews.py index cca5b5c..d5ce86e 100644 --- a/hypervideo_dl/extractor/trunews.py +++ b/hypervideo_dl/extractor/trunews.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/truth.py b/hypervideo_dl/extractor/truth.py new file mode 100644 index 0000000..1c6409c --- /dev/null +++ b/hypervideo_dl/extractor/truth.py @@ -0,0 +1,69 @@ +from .common import InfoExtractor +from ..utils import ( + clean_html, + format_field, + int_or_none, + strip_or_none, + traverse_obj, + unified_timestamp, +) + + +class TruthIE(InfoExtractor): + _VALID_URL = r'https?://truthsocial\.com/@[^/]+/posts/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://truthsocial.com/@realDonaldTrump/posts/108779000807761862', + 'md5': '4a5fb1470c192e493d9efd6f19e514d3', + 'info_dict': { + 'id': '108779000807761862', + 'ext': 'qt', + 'title': 'Truth video #108779000807761862', + 'description': None, + 'timestamp': 1659835827, + 'upload_date': '20220807', + 'uploader': 'Donald J. 
Trump', + 'uploader_id': 'realDonaldTrump', + 'uploader_url': 'https://truthsocial.com/@realDonaldTrump', + 'repost_count': int, + 'comment_count': int, + 'like_count': int, + }, + }, + { + 'url': 'https://truthsocial.com/@ProjectVeritasAction/posts/108618228543962049', + 'md5': 'fd47ba68933f9dce27accc52275be9c3', + 'info_dict': { + 'id': '108618228543962049', + 'ext': 'mp4', + 'title': 'md5:debde7186cf83f60ff7b44dbb9444e35', + 'description': 'md5:de2fc49045bf92bb8dc97e56503b150f', + 'timestamp': 1657382637, + 'upload_date': '20220709', + 'uploader': 'Project Veritas Action', + 'uploader_id': 'ProjectVeritasAction', + 'uploader_url': 'https://truthsocial.com/@ProjectVeritasAction', + 'repost_count': int, + 'comment_count': int, + 'like_count': int, + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + status = self._download_json(f'https://truthsocial.com/api/v1/statuses/{video_id}', video_id) + uploader_id = strip_or_none(traverse_obj(status, ('account', 'username'))) + return { + 'id': video_id, + 'url': status['media_attachments'][0]['url'], + 'title': '', + 'description': strip_or_none(clean_html(status.get('content'))) or None, + 'timestamp': unified_timestamp(status.get('created_at')), + 'uploader': strip_or_none(traverse_obj(status, ('account', 'display_name'))), + 'uploader_id': uploader_id, + 'uploader_url': format_field(uploader_id, None, 'https://truthsocial.com/@%s'), + 'repost_count': int_or_none(status.get('reblogs_count')), + 'like_count': int_or_none(status.get('favourites_count')), + 'comment_count': int_or_none(status.get('replies_count')), + } diff --git a/hypervideo_dl/extractor/trutv.py b/hypervideo_dl/extractor/trutv.py index c09ff89..ea0f2f4 100644 --- a/hypervideo_dl/extractor/trutv.py +++ b/hypervideo_dl/extractor/trutv.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .turner import TurnerBaseIE from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/tube8.py b/hypervideo_dl/extractor/tube8.py index db93b01..77ed05f 100644 --- a/hypervideo_dl/extractor/tube8.py +++ b/hypervideo_dl/extractor/tube8.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from ..utils import ( @@ -9,8 +7,9 @@ from ..utils import ( from .keezmovies import KeezMoviesIE -class Tube8IE(KeezMoviesIE): +class Tube8IE(KeezMoviesIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)'] _TESTS = [{ 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', 'md5': '65e20c48e6abff62ed0c3965fff13a39', @@ -31,12 +30,6 @@ class Tube8IE(KeezMoviesIE): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?tube8\.com/embed/(?:[^/]+/)+\d+)', - webpage) - def _real_extract(self, url): webpage, info = self._extract_info(url) diff --git a/hypervideo_dl/extractor/tubetugraz.py b/hypervideo_dl/extractor/tubetugraz.py new file mode 100644 index 0000000..ebabedc --- /dev/null +++ b/hypervideo_dl/extractor/tubetugraz.py @@ -0,0 +1,233 @@ +from .common import InfoExtractor +from ..utils import ( + float_or_none, + parse_resolution, + traverse_obj, + urlencode_postdata, + variadic, +) + + +class TubeTuGrazBaseIE(InfoExtractor): + _NETRC_MACHINE = 'tubetugraz' + + _API_EPISODE = 'https://tube.tugraz.at/search/episode.json' + _FORMAT_TYPES = 
('presentation', 'presenter') + + def _perform_login(self, username, password): + urlh = self._request_webpage( + 'https://tube.tugraz.at/Shibboleth.sso/Login?target=/paella/ui/index.html', + None, fatal=False, note='downloading login page', errnote='unable to fetch login page') + if not urlh: + return + + urlh = self._request_webpage( + urlh.geturl(), None, fatal=False, headers={'referer': urlh.geturl()}, + note='logging in', errnote='unable to log in', data=urlencode_postdata({ + 'lang': 'de', + '_eventId_proceed': '', + 'j_username': username, + 'j_password': password + })) + + if urlh and urlh.geturl() != 'https://tube.tugraz.at/paella/ui/index.html': + self.report_warning('unable to login: incorrect password') + + def _extract_episode(self, episode_info): + id = episode_info.get('id') + formats = list(self._extract_formats( + traverse_obj(episode_info, ('mediapackage', 'media', 'track')), id)) + + title = traverse_obj(episode_info, ('mediapackage', 'title'), 'dcTitle') + series_title = traverse_obj(episode_info, ('mediapackage', 'seriestitle')) + creator = ', '.join(variadic(traverse_obj( + episode_info, ('mediapackage', 'creators', 'creator'), 'dcCreator', default=''))) + return { + 'id': id, + 'title': title, + 'creator': creator or None, + 'duration': traverse_obj(episode_info, ('mediapackage', 'duration'), 'dcExtent'), + 'series': series_title, + 'series_id': traverse_obj(episode_info, ('mediapackage', 'series'), 'dcIsPartOf'), + 'episode': series_title and title, + 'formats': formats + } + + def _set_format_type(self, formats, type): + for f in formats: + f['format_note'] = type + if not type.startswith(self._FORMAT_TYPES[0]): + f['preference'] = -2 + return formats + + def _extract_formats(self, format_list, id): + has_hls, has_dash = False, False + + for format_info in format_list or []: + url = traverse_obj(format_info, ('tags', 'url'), 'url') + if url is None: + continue + + type = format_info.get('type') or 'unknown' + transport = (format_info.get('transport') or 'https').lower() + + if transport == 'https': + formats = [{ + 'url': url, + 'abr': float_or_none(traverse_obj(format_info, ('audio', 'bitrate')), 1000), + 'vbr': float_or_none(traverse_obj(format_info, ('video', 'bitrate')), 1000), + 'fps': traverse_obj(format_info, ('video', 'framerate')), + **parse_resolution(traverse_obj(format_info, ('video', 'resolution'))), + }] + elif transport == 'hls': + has_hls, formats = True, self._extract_m3u8_formats( + url, id, 'mp4', fatal=False, note=f'downloading {type} HLS manifest') + elif transport == 'dash': + has_dash, formats = True, self._extract_mpd_formats( + url, id, fatal=False, note=f'downloading {type} DASH manifest') + else: + # RTMP, HDS, SMOOTH, and unknown formats + # - RTMP url fails on every tested entry until now + # - HDS url 404's on every tested entry until now + # - SMOOTH url 404's on every tested entry until now + continue + + yield from self._set_format_type(formats, type) + + # TODO: Add test for these + for type in self._FORMAT_TYPES: + if not has_hls: + hls_formats = self._extract_m3u8_formats( + f'https://wowza.tugraz.at/matterhorn_engage/smil:engage-player_{id}_{type}.smil/playlist.m3u8', + id, 'mp4', fatal=False, note=f'Downloading {type} HLS manifest', errnote=False) or [] + yield from self._set_format_type(hls_formats, type) + + if not has_dash: + dash_formats = self._extract_mpd_formats( + f'https://wowza.tugraz.at/matterhorn_engage/smil:engage-player_{id}_{type}.smil/manifest_mpm4sav_mvlist.mpd', + id, fatal=False, note=f'Downloading {type} 
DASH manifest', errnote=False) + yield from self._set_format_type(dash_formats, type) + + +class TubeTuGrazIE(TubeTuGrazBaseIE): + IE_DESC = 'tube.tugraz.at' + + _VALID_URL = r'''(?x) + https?://tube\.tugraz\.at/paella/ui/watch.html\?id= + (?P<id>[0-9a-fA-F]{8}-(?:[0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12}) + ''' + _TESTS = [ + { + 'url': 'https://tube.tugraz.at/paella/ui/watch.html?id=f2634392-e40e-4ac7-9ddc-47764aa23d40', + 'md5': 'a23a3d5c9aaca2b84932fdba66e17145', + 'info_dict': { + 'id': 'f2634392-e40e-4ac7-9ddc-47764aa23d40', + 'ext': 'mp4', + 'title': '#6 (23.11.2017)', + 'episode': '#6 (23.11.2017)', + 'series': '[INB03001UF] Einführung in die strukturierte Programmierung', + 'creator': 'Safran C', + 'duration': 3295818, + 'series_id': 'b1192fff-2aa7-4bf0-a5cf-7b15c3bd3b34', + } + }, { + 'url': 'https://tube.tugraz.at/paella/ui/watch.html?id=2df6d787-e56a-428d-8ef4-d57f07eef238', + 'md5': 'de0d854a56bf7318d2b693fe1adb89a5', + 'info_dict': { + 'id': '2df6d787-e56a-428d-8ef4-d57f07eef238', + 'title': 'TubeTuGraz video #2df6d787-e56a-428d-8ef4-d57f07eef238', + 'ext': 'mp4', + }, + 'expected_warnings': ['Extractor failed to obtain "title"'], + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + episode_data = self._download_json( + self._API_EPISODE, video_id, query={'id': video_id, 'limit': 1}, note='Downloading episode metadata') + + episode_info = traverse_obj(episode_data, ('search-results', 'result'), default={'id': video_id}) + return self._extract_episode(episode_info) + + +class TubeTuGrazSeriesIE(TubeTuGrazBaseIE): + _VALID_URL = r'''(?x) + https?://tube\.tugraz\.at/paella/ui/browse\.html\?series= + (?P<id>[0-9a-fA-F]{8}-(?:[0-9a-fA-F]{4}-){3}[0-9a-fA-F]{12}) + ''' + _TESTS = [{ + 'url': 'https://tube.tugraz.at/paella/ui/browse.html?series=0e6351b7-c372-491e-8a49-2c9b7e21c5a6', + 'id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6', + 'info_dict': { + 'id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6', + 'title': '[209351] Strassenwesen', + }, + 'playlist': [ + { + 'info_dict': { + 'id': 'ee17ce5d-34e2-48b7-a76a-fed148614e11', + 'series_id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6', + 'ext': 'mp4', + 'title': '#4 Detailprojekt', + 'episode': '#4 Detailprojekt', + 'series': '[209351] Strassenwesen', + 'creator': 'Neuhold R', + 'duration': 6127024, + } + }, + { + 'info_dict': { + 'id': '87350498-799a-44d3-863f-d1518a98b114', + 'series_id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6', + 'ext': 'mp4', + 'title': '#3 Generelles Projekt', + 'episode': '#3 Generelles Projekt', + 'series': '[209351] Strassenwesen', + 'creator': 'Neuhold R', + 'duration': 5374422, + } + }, + { + 'info_dict': { + 'id': '778599ea-489e-4189-9e05-3b4888e19bcd', + 'series_id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6', + 'ext': 'mp4', + 'title': '#2 Vorprojekt', + 'episode': '#2 Vorprojekt', + 'series': '[209351] Strassenwesen', + 'creator': 'Neuhold R', + 'duration': 5566404, + } + }, + { + 'info_dict': { + 'id': '75e4c71c-d99d-4e56-b0e6-4f2bcdf11f29', + 'series_id': '0e6351b7-c372-491e-8a49-2c9b7e21c5a6', + 'ext': 'mp4', + 'title': '#1 Variantenstudium', + 'episode': '#1 Variantenstudium', + 'series': '[209351] Strassenwesen', + 'creator': 'Neuhold R', + 'duration': 5420200, + } + } + ], + 'min_playlist_count': 4 + }] + + def _real_extract(self, url): + id = self._match_id(url) + episodes_data = self._download_json(self._API_EPISODE, id, query={'sid': id}, note='Downloading episode list') + series_data = self._download_json( + 'https://tube.tugraz.at/series/series.json', id, fatal=False, + note='downloading series 
metadata', errnote='failed to download series metadata', + query={ + 'seriesId': id, + 'count': 1, + 'sort': 'TITLE' + }) + + return self.playlist_result( + map(self._extract_episode, episodes_data['search-results']['result']), id, + traverse_obj(series_data, ('catalogs', 0, 'http://purl.org/dc/terms/', 'title', 0, 'value'))) diff --git a/hypervideo_dl/extractor/tubitv.py b/hypervideo_dl/extractor/tubitv.py index 31feb9a..de8b5da 100644 --- a/hypervideo_dl/extractor/tubitv.py +++ b/hypervideo_dl/extractor/tubitv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -10,6 +7,7 @@ from ..utils import ( js_to_json, sanitized_Request, urlencode_postdata, + traverse_obj, ) @@ -24,6 +22,19 @@ class TubiTvIE(InfoExtractor): _NETRC_MACHINE = 'tubitv' _GEO_COUNTRIES = ['US'] _TESTS = [{ + 'url': 'https://tubitv.com/movies/383676/tracker', + 'md5': '566fa0f76870302d11af0de89511d3f0', + 'info_dict': { + 'id': '383676', + 'ext': 'mp4', + 'title': 'Tracker', + 'description': 'md5:ff320baf43d0ad2655e538c1d5cd9706', + 'uploader_id': 'f866e2677ea2f0dff719788e4f7f9195', + 'release_year': 2010, + 'thumbnail': r're:^https?://.+\.(jpe?g|png)$', + 'duration': 6122, + }, + }, { 'url': 'http://tubitv.com/video/283829/the_comedian_at_the_friday', 'md5': '43ac06be9326f41912dc64ccf7a80320', 'info_dict': { @@ -33,13 +44,11 @@ class TubiTvIE(InfoExtractor): 'description': 'A stand up comedian is forced to look at the decisions in his life while on a one week trip to the west coast.', 'uploader_id': 'bc168bee0d18dd1cb3b86c68706ab434', }, + 'skip': 'Content Unavailable' }, { 'url': 'http://tubitv.com/tv-shows/321886/s01_e01_on_nom_stories', 'only_matching': True, }, { - 'url': 'http://tubitv.com/movies/383676/tracker', - 'only_matching': True, - }, { 'url': 'https://tubitv.com/movies/560057/penitentiary?start=true', 'info_dict': { 'id': '560057', @@ -49,11 +58,13 @@ class TubiTvIE(InfoExtractor): 'uploader_id': 'd8fed30d4f24fcb22ec294421b9defc2', 'release_year': 1979, }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Content Unavailable' }] + # DRM formats are included only to raise appropriate error + _UNPLAYABLE_FORMATS = ('hlsv6_widevine', 'hlsv6_widevine_nonclearlead', 'hlsv6_playready_psshv0', + 'hlsv6_fairplay', 'dash_widevine', 'dash_widevine_nonclearlead') + def _perform_login(self, username, password): self.report_login() form_data = { @@ -71,18 +82,26 @@ class TubiTvIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://tubitv.com/oz/videos/%s/content' % video_id, video_id) + video_data = self._download_json(f'https://tubitv.com/oz/videos/{video_id}/content', video_id, query={ + 'video_resources': ['dash', 'hlsv3', 'hlsv6', *self._UNPLAYABLE_FORMATS], + }) title = video_data['title'] formats = [] - url = video_data['url'] - # URL can be sometimes empty. Does this only happen when there is DRM? 
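Editor's note (illustrative sketch, not part of the upstream patch): the rewrite below answers the question posed in the removed comment. Instead of inferring DRM from an empty URL, the new code explicitly requests every resource type, including the DRM-only ones in _UNPLAYABLE_FORMATS, and reports DRM only when no playable manifest came back. A minimal standalone sketch of that triage; the helper name is the editor's own:

    UNPLAYABLE_FORMATS = ('hlsv6_widevine', 'hlsv6_widevine_nonclearlead',
                          'hlsv6_playready_psshv0', 'hlsv6_fairplay',
                          'dash_widevine', 'dash_widevine_nonclearlead')

    def classify_resources(video_resources):
        # Split the API's resource list into playable manifest URLs and a flag
        # recording whether any DRM-only resource type was offered
        manifests, drm_seen = [], False
        for resource in video_resources:
            if resource['type'] in ('dash', 'hlsv3', 'hlsv6'):
                manifests.append(resource['manifest']['url'])
            elif resource['type'] in UNPLAYABLE_FORMATS:
                drm_seen = True
        return manifests, drm_seen

If manifests is empty and drm_seen is true, the extractor calls self.report_drm(video_id); if it is empty and 'policy_match' is false, the content was removed.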
- if url: - formats = self._extract_m3u8_formats( - self._proto_relative_url(url), - video_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) + drm_formats = False + + for resource in video_data['video_resources']: + if resource['type'] in ('dash', ): + formats += self._extract_mpd_formats(resource['manifest']['url'], video_id, mpd_id=resource['type'], fatal=False) + elif resource['type'] in ('hlsv3', 'hlsv6'): + formats += self._extract_m3u8_formats(resource['manifest']['url'], video_id, 'mp4', m3u8_id=resource['type'], fatal=False) + elif resource['type'] in self._UNPLAYABLE_FORMATS: + drm_formats = True + + if not formats and drm_formats: + self.report_drm(video_id) + elif not formats and not video_data.get('policy_match'): # policy_match is False if content was removed + raise ExtractorError('This content is currently unavailable', expected=True) thumbnails = [] for thumbnail_url in video_data.get('thumbnails', []): @@ -138,6 +157,8 @@ class TubiTvShowIE(InfoExtractor): show_webpage, 'data'), show_name, transform_source=js_to_json)['video'] for episode_id in show_json['fullContentById'].keys(): + if traverse_obj(show_json, ('byId', episode_id, 'type')) == 's': + continue yield self.url_result( 'tubitv:%s' % episode_id, ie=TubiTvIE.ie_key(), video_id=episode_id) diff --git a/hypervideo_dl/extractor/tudou.py b/hypervideo_dl/extractor/tudou.py deleted file mode 100644 index 7421378..0000000 --- a/hypervideo_dl/extractor/tudou.py +++ /dev/null @@ -1,49 +0,0 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class TudouPlaylistIE(InfoExtractor): - IE_NAME = 'tudou:playlist' - _VALID_URL = r'https?://(?:www\.)?tudou\.com/listplay/(?P<id>[\w-]{11})\.html' - _TESTS = [{ - 'url': 'http://www.tudou.com/listplay/zzdE77v6Mmo.html', - 'info_dict': { - 'id': 'zzdE77v6Mmo', - }, - 'playlist_mincount': 209, - }] - - def _real_extract(self, url): - playlist_id = self._match_id(url) - playlist_data = self._download_json( - 'http://www.tudou.com/tvp/plist.action?lcode=%s' % playlist_id, playlist_id) - entries = [self.url_result( - 'http://www.tudou.com/programs/view/%s' % item['icode'], - 'Tudou', item['icode'], - item['kw']) for item in playlist_data['items']] - return self.playlist_result(entries, playlist_id) - - -class TudouAlbumIE(InfoExtractor): - IE_NAME = 'tudou:album' - _VALID_URL = r'https?://(?:www\.)?tudou\.com/album(?:cover|play)/(?P<id>[\w-]{11})' - _TESTS = [{ - 'url': 'http://www.tudou.com/albumplay/v5qckFJvNJg.html', - 'info_dict': { - 'id': 'v5qckFJvNJg', - }, - 'playlist_mincount': 45, - }] - - def _real_extract(self, url): - album_id = self._match_id(url) - album_data = self._download_json( - 'http://www.tudou.com/tvp/alist.action?acode=%s' % album_id, album_id) - entries = [self.url_result( - 'http://www.tudou.com/programs/view/%s' % item['icode'], - 'Tudou', item['icode'], - item['kw']) for item in album_data['items']] - return self.playlist_result(entries, album_id) diff --git a/hypervideo_dl/extractor/tumblr.py b/hypervideo_dl/extractor/tumblr.py index 8086f61..88d4ae3 100644 --- a/hypervideo_dl/extractor/tumblr.py +++ b/hypervideo_dl/extractor/tumblr.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -362,7 +358,6 @@ class TumblrIE(InfoExtractor): 'height': int_or_none( media_json.get('height') or self._og_search_property('video:height', webpage, default=None)), }] - self._sort_formats(formats) # the url we're 
extracting from might be an original post or it might be a reblog. # if it's a reblog, og:description will be the reblogger's comment, not the uploader's. diff --git a/hypervideo_dl/extractor/tunein.py b/hypervideo_dl/extractor/tunein.py index 7e51de8..43b4f67 100644 --- a/hypervideo_dl/extractor/tunein.py +++ b/hypervideo_dl/extractor/tunein.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -11,12 +8,6 @@ from ..compat import compat_urlparse class TuneInBaseIE(InfoExtractor): _API_BASE_URL = 'http://tunein.com/tuner/tune/' - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/[pst]\d+)', - webpage) - def _real_extract(self, url): content_id = self._match_id(url) @@ -58,7 +49,6 @@ class TuneInBaseIE(InfoExtractor): 'source_preference': reliability, 'format_note': format_note, }) - self._sort_formats(formats) return { 'id': content_id, @@ -89,6 +79,7 @@ class TuneInClipIE(TuneInBaseIE): class TuneInStationIE(TuneInBaseIE): IE_NAME = 'tunein:station' _VALID_URL = r'https?://(?:www\.)?tunein\.com/(?:radio/.*?-s|station/.*?StationId=|embed/player/s)(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?://)?tunein\.com/embed/player/[pst]\d+)'] _API_URL_QUERY = '?tuneType=Station&stationId=%s' @classmethod diff --git a/hypervideo_dl/extractor/tunepk.py b/hypervideo_dl/extractor/tunepk.py index 9d42651..e4e507b 100644 --- a/hypervideo_dl/extractor/tunepk.py +++ b/hypervideo_dl/extractor/tunepk.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -59,7 +57,6 @@ class TunePkIE(InfoExtractor): formats = self._parse_jwplayer_formats( details['player']['sources'], video_id) - self._sort_formats(formats) description = self._og_search_description( webpage, default=None) or self._html_search_meta( diff --git a/hypervideo_dl/extractor/turbo.py b/hypervideo_dl/extractor/turbo.py index f6bbf25..cdb7dcf 100644 --- a/hypervideo_dl/extractor/turbo.py +++ b/hypervideo_dl/extractor/turbo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -56,7 +53,6 @@ class TurboIE(InfoExtractor): 'url': child.text, 'quality': get_quality(quality), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/turner.py b/hypervideo_dl/extractor/turner.py index 519dc32..630d84b 100644 --- a/hypervideo_dl/extractor/turner.py +++ b/hypervideo_dl/extractor/turner.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .adobepass import AdobePassIE @@ -144,7 +141,7 @@ class TurnerBaseIE(AdobePassIE): m3u8_id=format_id or 'hls', fatal=False) if '/secure/' in video_url and '?hdnea=' in video_url: for f in m3u8_formats: - f['_ffmpeg_args'] = ['-seekable', '0'] + f['downloader_options'] = {'ffmpeg_args': ['-seekable', '0']} formats.extend(m3u8_formats) elif ext == 'f4m': formats.extend(self._extract_f4m_formats( @@ -177,7 +174,6 @@ class TurnerBaseIE(AdobePassIE): else: f['tbr'] = int(mobj.group(1)) formats.append(f) - self._sort_formats(formats) for source in video_data.findall('closedCaptions/source'): for track in source.findall('track'): @@ -252,7 +248,6 @@ class TurnerBaseIE(AdobePassIE): 'start_time': start_time, 'end_time': start_time + chapter_duration, }) - self._sort_formats(formats) return { 'formats': 
formats, diff --git a/hypervideo_dl/extractor/tv2.py b/hypervideo_dl/extractor/tv2.py index 977da30..c51e633 100644 --- a/hypervideo_dl/extractor/tv2.py +++ b/hypervideo_dl/extractor/tv2.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -19,23 +16,27 @@ from ..utils import ( class TV2IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/v\d*/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/v(?:ideo)?\d*/(?:[^?#]+/)*(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.tv2.no/v/916509/', + 'url': 'http://www.tv2.no/v/1791207/', 'info_dict': { - 'id': '916509', + 'id': '1791207', 'ext': 'mp4', - 'title': 'Se Frode Gryttens hyllest av Steven Gerrard', - 'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.', - 'timestamp': 1431715610, - 'upload_date': '20150515', - 'duration': 157, + 'title': 'Her kolliderer romsonden med asteroiden ', + 'description': 'En romsonde har krasjet inn i en asteroide i verdensrommet. Kollisjonen skjedde klokken 01:14 natt til tirsdag 27. september norsk tid. \n\nNasa kaller det sitt første forsøk på planetforsvar.', + 'timestamp': 1664238190, + 'upload_date': '20220927', + 'duration': 146, + 'thumbnail': r're:^https://.*$', 'view_count': int, 'categories': list, }, }, { 'url': 'http://www.tv2.no/v2/916509', 'only_matching': True, + }, { + 'url': 'https://www.tv2.no/video/nyhetene/her-kolliderer-romsonden-med-asteroiden/1791207/', + 'only_matching': True, }] _PROTOCOLS = ('HLS', 'DASH') _GEO_COUNTRIES = ['NO'] @@ -94,7 +95,6 @@ class TV2IE(InfoExtractor): }) if not formats and data.get('drmProtected'): self.report_drm(video_id) - self._sort_formats(formats) thumbnails = [{ 'id': type, @@ -117,13 +117,13 @@ class TV2IE(InfoExtractor): class TV2ArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?:a|\d{4}/\d{2}/\d{2}(/[^/]+)+)/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?tv2\.no/(?!v(?:ideo)?\d*/)[^?#]+/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', + 'url': 'https://www.tv2.no/underholdning/forraeder/katarina-flatland-angrer-etter-forraeder-exit/15095188/', 'info_dict': { - 'id': '6930542', - 'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret', - 'description': 'De fire siktede nekter fortsatt for å ha stjålet pingvinbabyene, men innrømmer å ha åpnet luken til de små kyllingene.', + 'id': '15095188', + 'title': 'Katarina Flatland angrer etter Forræder-exit', + 'description': 'SANDEFJORD (TV 2): Katarina Flatland (33) måtte følge i sine fars fotspor, da hun ble forvist fra Forræder.', }, 'playlist_count': 2, }, { @@ -141,7 +141,7 @@ class TV2ArticleIE(InfoExtractor): if not assets: # New embed pattern - for v in re.findall(r'(?s)TV2ContentboxVideo\(({.+?})\)', webpage): + for v in re.findall(r'(?s)(?:TV2ContentboxVideo|TV2\.TV2Video)\(({.+?})\)', webpage): video = self._parse_json( v, playlist_id, transform_source=js_to_json, fatal=False) if not video: @@ -257,7 +257,6 @@ class KatsomoIE(InfoExtractor): }) if not formats and data.get('drmProtected'): self.report_drm(video_id) - self._sort_formats(formats) thumbnails = [{ 'id': thumbnail.get('@type'), diff --git a/hypervideo_dl/extractor/tv24ua.py b/hypervideo_dl/extractor/tv24ua.py new file mode 100644 index 0000000..89905ac --- /dev/null +++ b/hypervideo_dl/extractor/tv24ua.py @@ -0,0 +1,78 @@ +import re + +from .common import InfoExtractor +from ..utils import determine_ext, js_to_json, 
mimetype2ext, traverse_obj + + +class TV24UAVideoIE(InfoExtractor): + _VALID_URL = r'https?://24tv\.ua/news/showPlayer\.do.*?(?:\?|&)objectId=(?P<id>\d+)' + _EMBED_REGEX = [rf'<iframe[^>]+?src=["\']?(?P<url>{_VALID_URL})["\']?'] + IE_NAME = '24tv.ua' + _TESTS = [{ + 'url': 'https://24tv.ua/news/showPlayer.do?objectId=2074790&videoUrl=2022/07/2074790&w=640&h=360', + 'info_dict': { + 'id': '2074790', + 'ext': 'mp4', + 'title': 'У Харкові ворожа ракета прилетіла в будинок, де слухали пісні про "офіцерів-росіян"', + 'thumbnail': r're:^https?://.*\.jpe?g', + } + }, { + 'url': 'https://24tv.ua/news/showPlayer.do?videoUrl=2022/07/2074790&objectId=2074790&w=640&h=360', + 'only_matching': True, + }] + + _WEBPAGE_TESTS = [ + { + # iframe embed created from share menu. + 'url': 'data:text/html,%3Ciframe%20src=%22https://24tv.ua/news/showPlayer.do?objectId=1886193&videoUrl' + '=2022/03/1886193&w=640&h=360%22%20width=%22640%22%20height=%22360%22%20frameborder=%220%22' + '%20scrolling=%22no%22%3E%3C/iframe%3E', + 'info_dict': { + 'id': '1886193', + 'ext': 'mp4', + 'title': 'Росіяни руйнують Бородянку на Київщині та стріляють з літаків по мешканцях: шокуючі фото', + 'thumbnail': r're:^https?://.*\.jpe?g', + } + }, + { + 'url': 'https://24tv.ua/vipalyuyut-nashi-mista-sela-dsns-pokazali-motoroshni-naslidki_n1883966', + 'info_dict': { + 'id': '1883966', + 'ext': 'mp4', + 'title': 'Випалюють наші міста та села, – моторошні наслідки обстрілів на Чернігівщині', + 'thumbnail': r're:^https?://.*\.jpe?g', + }, + 'params': {'allowed_extractors': ['Generic', '24tv.ua']}, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + formats = [] + subtitles = {} + for j in re.findall(r'vPlayConfig\.sources\s*=\s*(?P<json>\[{\s*(?s:.+?)\s*}])', webpage): + sources = self._parse_json(j, video_id, fatal=False, ignore_extra=True, transform_source=js_to_json, errnote='') or [] + for source in sources: + if mimetype2ext(traverse_obj(source, 'type')) == 'm3u8': + f, s = self._extract_m3u8_formats_and_subtitles(source['src'], video_id) + formats.extend(f) + self._merge_subtitles(subtitles, s) + else: + formats.append({ + 'url': source['src'], + 'ext': determine_ext(source['src']), + }) + thumbnail = traverse_obj( + self._search_json( + r'var\s*vPlayConfig\s*=\s*', webpage, 'thumbnail', + video_id, default=None, transform_source=js_to_json), 'poster') + return { + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': thumbnail or self._og_search_thumbnail(webpage), + 'title': self._generic_title('', webpage), + 'description': self._og_search_description(webpage, default=None), + } diff --git a/hypervideo_dl/extractor/tv2dk.py b/hypervideo_dl/extractor/tv2dk.py index ec5cbdf..35e92f1 100644 --- a/hypervideo_dl/extractor/tv2dk.py +++ b/hypervideo_dl/extractor/tv2dk.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json import re @@ -167,7 +164,6 @@ class TV2DKBornholmPlayIE(InfoExtractor): formats.append({ 'url': src, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/tv2hu.py b/hypervideo_dl/extractor/tv2hu.py index f210435..d4c21c0 100644 --- a/hypervideo_dl/extractor/tv2hu.py +++ b/hypervideo_dl/extractor/tv2hu.py @@ -1,6 +1,4 @@ # encoding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( traverse_obj, @@ -68,7 +66,6 @@ class TV2HuIE(InfoExtractor): video_json = self._download_json(video_json_url, 
video_id) m3u8_url = self._proto_relative_url(traverse_obj(video_json, ('bitrates', 'hls'))) formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/tv4.py b/hypervideo_dl/extractor/tv4.py index 4043e63..1378a6f 100644 --- a/hypervideo_dl/extractor/tv4.py +++ b/hypervideo_dl/extractor/tv4.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -122,8 +119,6 @@ class TV4IE(InfoExtractor): if not formats and info.get('is_geo_restricted'): self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/tv5mondeplus.py b/hypervideo_dl/extractor/tv5mondeplus.py index a0832d2..bd0be78 100644 --- a/hypervideo_dl/extractor/tv5mondeplus.py +++ b/hypervideo_dl/extractor/tv5mondeplus.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -80,7 +77,6 @@ class TV5MondePlusIE(InfoExtractor): 'url': v_url, 'format_id': video_format, }) - self._sort_formats(formats) metadata = self._parse_json( vpl_data['data-metadata'], display_id) diff --git a/hypervideo_dl/extractor/tv5unis.py b/hypervideo_dl/extractor/tv5unis.py index 398b85d..978255b 100644 --- a/hypervideo_dl/extractor/tv5unis.py +++ b/hypervideo_dl/extractor/tv5unis.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/tva.py b/hypervideo_dl/extractor/tva.py index 52a4ddf..9afe233 100644 --- a/hypervideo_dl/extractor/tva.py +++ b/hypervideo_dl/extractor/tva.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( float_or_none, diff --git a/hypervideo_dl/extractor/tvanouvelles.py b/hypervideo_dl/extractor/tvanouvelles.py index 1086176..b9f5e11 100644 --- a/hypervideo_dl/extractor/tvanouvelles.py +++ b/hypervideo_dl/extractor/tvanouvelles.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/tvc.py b/hypervideo_dl/extractor/tvc.py index 008f64c..caa76ab 100644 --- a/hypervideo_dl/extractor/tvc.py +++ b/hypervideo_dl/extractor/tvc.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( clean_html, @@ -12,6 +7,7 @@ from ..utils import ( class TVCIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tvc\.ru/video/iframe/id/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1'] _TEST = { 'url': 'http://www.tvc.ru/video/iframe/id/74622/isPlay/false/id_stat/channel/?acc_video_id=/channel/brand/id/17/show/episodes/episode_id/39702', 'md5': 'bbc5ff531d1e90e856f60fc4b3afd708', @@ -24,13 +20,6 @@ class TVCIE(InfoExtractor): }, } - @classmethod - def _extract_url(cls, webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:http:)?//(?:www\.)?tvc\.ru/video/iframe/id/[^"]+)\1', webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): video_id = self._match_id(url) @@ -52,7 +41,6 @@ class TVCIE(InfoExtractor): 'height': int_or_none(info.get('height')), 
'tbr': int_or_none(info.get('bitrate')), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/tver.py b/hypervideo_dl/extractor/tver.py index 9ff3136..cebd027 100644 --- a/hypervideo_dl/extractor/tver.py +++ b/hypervideo_dl/extractor/tver.py @@ -1,77 +1,105 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( ExtractorError, - int_or_none, - remove_start, + join_nonempty, smuggle_url, + str_or_none, + strip_or_none, traverse_obj, ) class TVerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?P<path>corner|episode|feature|lp|tokyo2020/video)/(?P<id>[fc]?\d+)' - # videos are only available for 7 days + _VALID_URL = r'https?://(?:www\.)?tver\.jp/(?:(?P<type>lp|corner|series|episodes?|feature|tokyo2020/video)/)+(?P<id>[a-zA-Z0-9]+)' _TESTS = [{ - 'url': 'https://tver.jp/corner/f0062178', - 'only_matching': True, - }, { - 'url': 'https://tver.jp/feature/f0062413', - 'only_matching': True, - }, { - 'url': 'https://tver.jp/episode/79622438', - 'only_matching': True, + 'skip': 'videos are only available for 7 days', + 'url': 'https://tver.jp/episodes/ep83nf3w4p', + 'info_dict': { + 'title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', + 'description': 'md5:dc2c06b6acc23f1e7c730c513737719b', + 'series': '家事ヤロウ!!!', + 'episode': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', + 'alt_title': '売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着!', + 'channel': 'テレビ朝日', + 'onair_label': '5月3日(火)放送分', + 'ext_title': '家事ヤロウ!!! 売り場席巻のチーズSP&財前直見×森泉親子の脱東京暮らし密着! テレビ朝日 5月3日(火)放送分', + }, + 'add_ie': ['BrightcoveNew'], }, { - # subtitle = ' ' - 'url': 'https://tver.jp/corner/f0068870', + 'url': 'https://tver.jp/corner/f0103888', 'only_matching': True, }, { - 'url': 'https://tver.jp/lp/f0009694', - 'only_matching': True, - }, { - 'url': 'https://tver.jp/lp/c0000239', - 'only_matching': True, - }, { - 'url': 'https://tver.jp/tokyo2020/video/6264525510001', + 'url': 'https://tver.jp/lp/f0033031', 'only_matching': True, }] - _TOKEN = None BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' + _PLATFORM_UID = None + _PLATFORM_TOKEN = None def _real_initialize(self): - self._TOKEN = self._download_json( - 'https://tver.jp/api/access_token.php', None)['token'] + create_response = self._download_json( + 'https://platform-api.tver.jp/v2/api/platform_users/browser/create', None, + note='Creating session', data=b'device_type=pc', headers={ + 'Origin': 'https://s.tver.jp', + 'Referer': 'https://s.tver.jp/', + 'Content-Type': 'application/x-www-form-urlencoded', + }) + self._PLATFORM_UID = traverse_obj(create_response, ('result', 'platform_uid')) + self._PLATFORM_TOKEN = traverse_obj(create_response, ('result', 'platform_token')) def _real_extract(self, url): - path, video_id = self._match_valid_url(url).groups() - if path == 'lp': - webpage = self._download_webpage(url, video_id) - redirect_path = self._search_regex(r'to_href="([^"]+)', webpage, 'redirect path') - path, video_id = self._match_valid_url(f'https://tver.jp{redirect_path}').groups() - api_response = self._download_json(f'https://api.tver.jp/v4/{path}/{video_id}', video_id, query={'token': self._TOKEN}) - p_id = traverse_obj(api_response, ('main', 'publisher_id')) - if not p_id: - error_msg, expected = traverse_obj(api_response, ('episode', 0, 'textbar', 0, ('text', 'longer')), get_all=False), True - if not error_msg: - error_msg, expected = 'Failed to extract publisher ID', False - raise 
ExtractorError(error_msg, expected=expected) - service = remove_start(traverse_obj(api_response, ('main', 'service')), 'ts_') + video_id, video_type = self._match_valid_url(url).group('id', 'type') + if video_type not in {'series', 'episodes'}: + webpage = self._download_webpage(url, video_id, note='Resolving to new URL') + video_id = self._match_id(self._search_regex( + (r'canonical"\s*href="(https?://tver\.jp/[^"]+)"', r'&link=(https?://tver\.jp/[^?&]+)[?&]'), + webpage, 'url regex')) + + episode_info = self._download_json( + f'https://platform-api.tver.jp/service/api/v1/callEpisode/{video_id}?require_data=mylist,later[epefy106ur],good[epefy106ur],resume[epefy106ur]', + video_id, fatal=False, + query={ + 'platform_uid': self._PLATFORM_UID, + 'platform_token': self._PLATFORM_TOKEN, + }, headers={ + 'x-tver-platform-type': 'web' + }) + episode_content = traverse_obj( + episode_info, ('result', 'episode', 'content')) or {} + + video_info = self._download_json( + f'https://statics.tver.jp/content/episode/{video_id}.json', video_id, + query={ + 'v': str_or_none(episode_content.get('version')) or '5', + }, headers={ + 'Origin': 'https://tver.jp', + 'Referer': 'https://tver.jp/', + }) + p_id = video_info['video']['accountID'] + r_id = traverse_obj(video_info, ('video', ('videoRefID', 'videoID')), get_all=False) + if not r_id: + raise ExtractorError('Failed to extract reference ID for Brightcove') + if not r_id.isdigit(): + r_id = f'ref:{r_id}' - r_id = traverse_obj(api_response, ('main', 'reference_id')) - if service not in ('tx', 'russia2018', 'sebare2018live', 'gorin'): - r_id = 'ref:' + r_id - bc_url = smuggle_url( - self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), - {'geo_countries': ['JP']}) + episode = strip_or_none(episode_content.get('title')) + series = str_or_none(episode_content.get('seriesTitle')) + title = ( + join_nonempty(series, episode, delim=' ') + or str_or_none(video_info.get('title'))) + provider = str_or_none(episode_content.get('productionProviderName')) + onair_label = str_or_none(episode_content.get('broadcastDateLabel')) return { '_type': 'url_transparent', - 'description': traverse_obj(api_response, ('main', 'note', 0, 'text'), expected_type=compat_str), - 'episode_number': int_or_none(traverse_obj(api_response, ('main', 'ext', 'episode_number'), expected_type=compat_str)), - 'url': bc_url, + 'title': title, + 'series': series, + 'episode': episode, + # another title, which is considered the "full title" by some viewers + 'alt_title': join_nonempty(title, provider, onair_label, delim=' '), + 'channel': provider, + 'description': str_or_none(video_info.get('description')), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % (p_id, r_id), {'geo_countries': ['JP']}), 'ie_key': 'BrightcoveNew', } diff --git a/hypervideo_dl/extractor/tvigle.py b/hypervideo_dl/extractor/tvigle.py index aa25ba0..6c98219 100644 --- a/hypervideo_dl/extractor/tvigle.py +++ b/hypervideo_dl/extractor/tvigle.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -17,6 +13,7 @@ class TvigleIE(InfoExtractor): IE_NAME = 'tvigle' IE_DESC = 'Интернет-телевидение Tvigle.ru' _VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P<display_id>[^/]+)/$|cloud\.tvigle\.ru/video/(?P<id>\d+))' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//cloud\.tvigle\.ru/video/.+?)\1'] _GEO_BYPASS = False _GEO_COUNTRIES = ['RU'] @@ -123,7 +120,6 @@ class TvigleIE(InfoExtractor): 'height':
int_or_none(height), 'filesize': filesize, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/tviplayer.py b/hypervideo_dl/extractor/tviplayer.py new file mode 100644 index 0000000..7e9b04d --- /dev/null +++ b/hypervideo_dl/extractor/tviplayer.py @@ -0,0 +1,78 @@ +from .common import InfoExtractor +from ..utils import traverse_obj + + +class TVIPlayerIE(InfoExtractor): + _VALID_URL = r'https?://tviplayer\.iol\.pt(/programa/[\w-]+/[a-f0-9]+)?/\w+/(?P<id>\w+)' + _TESTS = [{ + 'url': 'https://tviplayer.iol.pt/programa/jornal-das-8/53c6b3903004dc006243d0cf/video/61c8e8b90cf2c7ea0f0f71a9', + 'info_dict': { + 'id': '61c8e8b90cf2c7ea0f0f71a9', + 'ext': 'mp4', + 'duration': 4167, + 'title': 'Jornal das 8 - 26 de dezembro de 2021', + 'thumbnail': 'https://www.iol.pt/multimedia/oratvi/multimedia/imagem/id/61c8ee630cf2cc58e7d98d9f/', + 'season_number': 8, + 'season': 'Season 8', + } + }, { + 'url': 'https://tviplayer.iol.pt/programa/isabel/62b471090cf26256cd2a8594/video/62be445f0cf2ea4f0a5218e5', + 'info_dict': { + 'id': '62be445f0cf2ea4f0a5218e5', + 'ext': 'mp4', + 'duration': 3255, + 'season': 'Season 1', + 'title': 'Isabel - Episódio 1', + 'thumbnail': 'https://www.iol.pt/multimedia/oratvi/multimedia/imagem/id/62beac200cf2f9a86eab856b/', + 'season_number': 1, + } + }, { + # no /programa/ + 'url': 'https://tviplayer.iol.pt/video/62c4131c0cf2f9a86eac06bb', + 'info_dict': { + 'id': '62c4131c0cf2f9a86eac06bb', + 'ext': 'mp4', + 'title': 'David e Mickael Carreira respondem: «Qual é o próximo a ser pai?»', + 'thumbnail': 'https://www.iol.pt/multimedia/oratvi/multimedia/imagem/id/62c416490cf2ea367d4433fd/', + 'season': 'Season 2', + 'duration': 148, + 'season_number': 2, + } + }, { + # episodio url + 'url': 'https://tviplayer.iol.pt/programa/para-sempre/61716c360cf2365a5ed894c4/episodio/t1e187', + 'info_dict': { + 'id': 't1e187', + 'ext': 'mp4', + 'season': 'Season 1', + 'title': 'Quem denunciou Pedro?', + 'thumbnail': 'https://www.iol.pt/multimedia/oratvi/multimedia/imagem/id/62eda30b0cf2ea367d48973b/', + 'duration': 1250, + 'season_number': 1, + } + }] + + def _real_initialize(self): + self.wms_auth_sign_token = self._download_webpage( + 'https://services.iol.pt/matrix?userId=', 'wmsAuthSign', + note='Trying to get wmsAuthSign token') + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + json_data = self._search_json( + r'<script>\s*jsonData\s*=', webpage, 'json_data', video_id) + + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'{json_data["videoUrl"]}?wmsAuthSign={self.wms_auth_sign_token}', + video_id, ext='mp4') + return { + 'id': video_id, + 'title': json_data.get('title') or self._og_search_title(webpage), + 'thumbnail': json_data.get('cover') or self._og_search_thumbnail(webpage), + 'duration': json_data.get('duration'), + 'formats': formats, + 'subtitles': subtitles, + 'season_number': traverse_obj(json_data, ('program', 'seasonNum')), + } diff --git a/hypervideo_dl/extractor/tvland.py b/hypervideo_dl/extractor/tvland.py index 9ebf57f..481d5eb 100644 --- a/hypervideo_dl/extractor/tvland.py +++ b/hypervideo_dl/extractor/tvland.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .mtv import MTVServicesInfoExtractor # TODO: Remove - Reason not used anymore - Service moved to youtube diff --git a/hypervideo_dl/extractor/tvn24.py b/hypervideo_dl/extractor/tvn24.py index de0fb50..9c777c1 100644 --- a/hypervideo_dl/extractor/tvn24.py +++ 
b/hypervideo_dl/extractor/tvn24.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -73,7 +70,6 @@ class TVN24IE(InfoExtractor): 'format_id': format_id, 'height': int_or_none(format_id.rstrip('p')), }) - self._sort_formats(formats) description = self._og_search_description(webpage, default=None) thumbnail = self._og_search_thumbnail( diff --git a/hypervideo_dl/extractor/tvnet.py b/hypervideo_dl/extractor/tvnet.py index aa1e9d9..77426f7 100644 --- a/hypervideo_dl/extractor/tvnet.py +++ b/hypervideo_dl/extractor/tvnet.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -112,7 +109,6 @@ class TVNetIE(InfoExtractor): stream_urls.add(stream_url) formats.extend(self._extract_m3u8_formats( stream_url, video_id, 'mp4', live=is_live, m3u8_id='hls', fatal=False)) - self._sort_formats(formats) # better support for radio streams if title.startswith('VOV'): diff --git a/hypervideo_dl/extractor/tvnoe.py b/hypervideo_dl/extractor/tvnoe.py index 26a5aea..712fbb2 100644 --- a/hypervideo_dl/extractor/tvnoe.py +++ b/hypervideo_dl/extractor/tvnoe.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/tvnow.py b/hypervideo_dl/extractor/tvnow.py index b318184..0acc306 100644 --- a/hypervideo_dl/extractor/tvnow.py +++ b/hypervideo_dl/extractor/tvnow.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -77,7 +74,6 @@ class TVNowBaseIE(InfoExtractor): if not info.get('free', True): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) - self._sort_formats(formats) description = info.get('articleLong') or info.get('articleShort') timestamp = parse_iso8601(info.get('broadcastStartDate'), ' ') @@ -395,7 +391,6 @@ class TVNowIE(TVNowNewBaseIE): if not info.get('free', True): raise ExtractorError( 'Video %s is not available for free' % video_id, expected=True) - self._sort_formats(formats) description = source.get('description') thumbnail = url_or_none(source.get('poster')) @@ -429,7 +424,7 @@ class TVNowIE(TVNowNewBaseIE): return self._extract_video(info, video_id, display_id) -class TVNowFilmIE(TVNowIE): +class TVNowFilmIE(TVNowIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'''(?x) (?P<base_url>https?:// (?:www\.)?tvnow\.(?:de|at|ch)/ diff --git a/hypervideo_dl/extractor/tvopengr.py b/hypervideo_dl/extractor/tvopengr.py index a11cdc6..e208e57 100644 --- a/hypervideo_dl/extractor/tvopengr.py +++ b/hypervideo_dl/extractor/tvopengr.py @@ -1,14 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, get_elements_text_and_html_by_attribute, scale_thumbnails_to_max_format_width, - unescapeHTML, ) @@ -75,7 +69,6 @@ class TVOpenGrWatchIE(TVOpenGrBaseIE): continue formats.extend(formats_) self._merge_subtitles(subs_, target=subs) - self._sort_formats(formats) return formats, subs def _real_extract(self, url): @@ -101,7 +94,7 @@ class TVOpenGrEmbedIE(TVOpenGrBaseIE): IE_NAME = 'tvopengr:embed' IE_DESC = 'tvopen.gr embedded videos' _VALID_URL = r'(?:https?:)?//(?:www\.|cdn\.|)(?:tvopen|ethnos).gr/embed/(?P<id>\d+)' - _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''') + _EMBED_REGEX 
= [rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)'''] _TESTS = [{ 'url': 'https://cdn.ethnos.gr/embed/100963', @@ -118,11 +111,6 @@ class TVOpenGrEmbedIE(TVOpenGrBaseIE): }, }] - @classmethod - def _extract_urls(cls, webpage): - for mobj in cls._EMBED_RE.finditer(webpage): - yield unescapeHTML(mobj.group('url')) - def _real_extract(self, url): video_id = self._match_id(url) return self._return_canonical_url(url, video_id) diff --git a/hypervideo_dl/extractor/tvp.py b/hypervideo_dl/extractor/tvp.py index 48e2c6e..8483564 100644 --- a/hypervideo_dl/extractor/tvp.py +++ b/hypervideo_dl/extractor/tvp.py @@ -1,46 +1,54 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import random import re from .common import InfoExtractor from ..utils import ( + clean_html, determine_ext, dict_get, ExtractorError, int_or_none, js_to_json, - orderedSet, str_or_none, + strip_or_none, + traverse_obj, try_get, + url_or_none, ) class TVPIE(InfoExtractor): IE_NAME = 'tvp' IE_DESC = 'Telewizja Polska' - _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|polandin\.com)/(?:video/(?:[^,\s]*,)*|(?:(?!\d+/)[^/]+/)*)(?P<id>\d+)' + _VALID_URL = r'https?://(?:[^/]+\.)?(?:tvp(?:parlament)?\.(?:pl|info)|tvpworld\.com|swipeto\.pl)/(?:(?!\d+/)[^/]+/)*(?P<id>\d+)' _TESTS = [{ # TVPlayer 2 in js wrapper - 'url': 'https://vod.tvp.pl/video/czas-honoru,i-seria-odc-13,194536', + 'url': 'https://swipeto.pl/64095316/uliczny-foxtrot-wypozyczalnia-kaset-kto-pamieta-dvdvideo', 'info_dict': { - 'id': '194536', + 'id': '64095316', 'ext': 'mp4', - 'title': 'Czas honoru, odc. 13 – Władek', - 'description': 'md5:437f48b93558370b031740546b696e24', - 'age_limit': 12, + 'title': 'Uliczny Foxtrot — Wypożyczalnia kaset. Kto pamięta DVD-Video?', + 'age_limit': 0, + 'duration': 374, + 'thumbnail': r're:https://.+', }, + 'expected_warnings': [ + 'Failed to download ISM manifest: HTTP Error 404: Not Found', + 'Failed to download m3u8 information: HTTP Error 404: Not Found', + ], }, { # TVPlayer legacy - 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', + 'url': 'https://www.tvp.pl/polska-press-video-uploader/wideo/62042351', 'info_dict': { - 'id': '17916176', + 'id': '62042351', 'ext': 'mp4', - 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', - 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + 'title': 'Wideo', + 'description': 'Wideo Kamera', + 'duration': 24, + 'age_limit': 0, + 'thumbnail': r're:https://.+', }, }, { # TVPlayer 2 in iframe @@ -51,6 +59,8 @@ class TVPIE(InfoExtractor): 'title': 'Dzieci na sprzedaż dla homoseksualistów', 'description': 'md5:7d318eef04e55ddd9f87a8488ac7d590', 'age_limit': 12, + 'duration': 259, + 'thumbnail': r're:https://.+', }, }, { # TVPlayer 2 in client-side rendered website (regional; window.__newsData) @@ -61,7 +71,11 @@ class TVPIE(InfoExtractor): 'title': 'Studio Yayo', 'upload_date': '20160616', 'timestamp': 1466075700, - } + 'age_limit': 0, + 'duration': 20, + 'thumbnail': r're:https://.+', + }, + 'skip': 'Geo-blocked outside PL', }, { # TVPlayer 2 in client-side rendered website (tvp.info; window.__videoData) 'url': 'https://www.tvp.info/52880236/09042021-0800', @@ -69,7 +83,10 @@ class TVPIE(InfoExtractor): 'id': '52880236', 'ext': 'mp4', 'title': '09.04.2021, 08:00', + 'age_limit': 0, + 'thumbnail': r're:https://.+', }, + 'skip': 'Geo-blocked outside PL', }, { # client-side rendered (regional) program (playlist) page 'url': 
'https://opole.tvp.pl/9660819/rozmowa-dnia', @@ -125,7 +142,7 @@ class TVPIE(InfoExtractor): 'url': 'https://www.tvpparlament.pl/retransmisje-vod/inne/wizyta-premiera-mateusza-morawieckiego-w-firmie-berotu-sp-z-oo/48857277', 'only_matching': True, }, { - 'url': 'https://polandin.com/47942651/pln-10-billion-in-subsidies-transferred-to-companies-pm', + 'url': 'https://tvpworld.com/48583640/tescos-polish-business-bought-by-danish-chain-netto', 'only_matching': True, }] @@ -154,16 +171,13 @@ class TVPIE(InfoExtractor): is_website = video_data.get('type') == 'website' if is_website: url = video_data['url'] - fucked_up_url_parts = re.match(r'https?://vod\.tvp\.pl/(\d+)/([^/?#]+)', url) - if fucked_up_url_parts: - url = f'https://vod.tvp.pl/website/{fucked_up_url_parts.group(2)},{fucked_up_url_parts.group(1)}' else: url = 'tvp:' + str_or_none(video_data.get('_id') or page_id) return { '_type': 'url_transparent', 'id': str_or_none(video_data.get('_id') or page_id), 'url': url, - 'ie_key': 'TVPEmbed' if not is_website else 'TVPWebsite', + 'ie_key': (TVPIE if is_website else TVPEmbedIE).ie_key(), 'title': str_or_none(video_data.get('title')), 'description': str_or_none(video_data.get('lead')), 'timestamp': int_or_none(video_data.get('release_date_long')), @@ -220,8 +234,9 @@ class TVPIE(InfoExtractor): # The URL may redirect to a VOD # example: https://vod.tvp.pl/48463890/wadowickie-spotkania-z-janem-pawlem-ii - if TVPWebsiteIE.suitable(urlh.url): - return self.url_result(urlh.url, ie=TVPWebsiteIE.ie_key(), video_id=page_id) + for ie_cls in (TVPVODSeriesIE, TVPVODVideoIE): + if ie_cls.suitable(urlh.url): + return self.url_result(urlh.url, ie=ie_cls.ie_key(), video_id=page_id) if re.search( r'window\.__(?:video|news|website|directory)Data\s*=', @@ -300,12 +315,13 @@ class TVPStreamIE(InfoExtractor): class TVPEmbedIE(InfoExtractor): IE_NAME = 'tvp:embed' IE_DESC = 'Telewizja Polska' + _GEO_BYPASS = False _VALID_URL = r'''(?x) (?: tvp: |https?:// (?:[^/]+\.)? - (?:tvp(?:parlament)?\.pl|tvp\.info|polandin\.com)/ + (?:tvp(?:parlament)?\.pl|tvp\.info|tvpworld\.com|swipeto\.pl)/ (?:sess/ (?:tvplayer\.php\?.*?object_id |TVPlayer2/(?:embed|api)\.php\?.*[Ii][Dd]) @@ -313,6 +329,7 @@ class TVPEmbedIE(InfoExtractor): =) (?P<id>\d+) ''' + _EMBED_REGEX = [rf'(?x)<iframe[^>]+?src=(["\'])(?P<url>{_VALID_URL[4:]})'] _TESTS = [{ 'url': 'tvp:194536', @@ -322,6 +339,12 @@ class TVPEmbedIE(InfoExtractor): 'title': 'Czas honoru, odc. 
13 – Władek', 'description': 'md5:76649d2014f65c99477be17f23a4dead', 'age_limit': 12, + 'duration': 2652, + 'series': 'Czas honoru', + 'episode': 'Episode 13', + 'episode_number': 13, + 'season': 'sezon 1', + 'thumbnail': r're:https://.+', }, }, { 'url': 'https://www.tvp.pl/sess/tvplayer.php?object_id=51247504&autoplay=false', @@ -329,6 +352,9 @@ class TVPEmbedIE(InfoExtractor): 'id': '51247504', 'ext': 'mp4', 'title': 'Razmova 091220', + 'duration': 876, + 'age_limit': 0, + 'thumbnail': r're:https://.+', }, }, { # TVPlayer2 embed URL @@ -343,12 +369,6 @@ class TVPEmbedIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage, **kw): - return [m.group('embed') for m in re.finditer( - r'(?x)<iframe[^>]+?src=(["\'])(?P<embed>%s)' % TVPEmbedIE._VALID_URL[4:], - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -369,44 +389,50 @@ class TVPEmbedIE(InfoExtractor): # stripping JSONP padding datastr = webpage[15 + len(callback):-3] if datastr.startswith('null,'): - error = self._parse_json(datastr[5:], video_id) - raise ExtractorError(error[0]['desc']) + error = self._parse_json(datastr[5:], video_id, fatal=False) + error_desc = traverse_obj(error, (0, 'desc')) + + if error_desc == 'Obiekt wymaga płatności': + raise ExtractorError('Video requires payment and log-in, but log-in is not implemented') + + raise ExtractorError(error_desc or 'unexpected JSON error') content = self._parse_json(datastr, video_id)['content'] info = content['info'] is_live = try_get(info, lambda x: x['isLive'], bool) + if info.get('isGeoBlocked'): + # actual country list is not provided, we just assume it's always available in PL + self.raise_geo_restricted(countries=['PL']) + formats = [] for file in content['files']: - video_url = file.get('url') + video_url = url_or_none(file.get('url')) if not video_url: continue - if video_url.endswith('.m3u8'): + ext = determine_ext(video_url, None) + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats(video_url, video_id, m3u8_id='hls', fatal=False, live=is_live)) - elif video_url.endswith('.mpd'): + elif ext == 'mpd': if is_live: # doesn't work with either ffmpeg or native downloader continue formats.extend(self._extract_mpd_formats(video_url, video_id, mpd_id='dash', fatal=False)) - elif video_url.endswith('.f4m'): + elif ext == 'f4m': formats.extend(self._extract_f4m_formats(video_url, video_id, f4m_id='hds', fatal=False)) elif video_url.endswith('.ism/manifest'): formats.extend(self._extract_ism_formats(video_url, video_id, ism_id='mss', fatal=False)) else: - # mp4, wmv or something - quality = file.get('quality', {}) formats.append({ 'format_id': 'direct', 'url': video_url, - 'ext': determine_ext(video_url, file['type']), - 'fps': int_or_none(quality.get('fps')), - 'tbr': int_or_none(quality.get('bitrate')), - 'width': int_or_none(quality.get('width')), - 'height': int_or_none(quality.get('height')), + 'ext': ext or file.get('type'), + 'fps': int_or_none(traverse_obj(file, ('quality', 'fps'))), + 'tbr': int_or_none(traverse_obj(file, ('quality', 'bitrate')), scale=1000), + 'width': int_or_none(traverse_obj(file, ('quality', 'width'))), + 'height': int_or_none(traverse_obj(file, ('quality', 'height'))), }) - self._sort_formats(formats) - title = dict_get(info, ('subtitle', 'title', 'seoTitle')) description = dict_get(info, ('description', 'seoDescription')) thumbnails = [] @@ -457,57 +483,105 @@ class TVPEmbedIE(InfoExtractor): return info_dict -class TVPWebsiteIE(InfoExtractor): - IE_NAME = 'tvp:series' - 
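# TVPEmbedIE above recovers the JSONP payload by slicing off the callback
# wrapper (datastr = webpage[15 + len(callback):-3]). The same idea as a
# standalone sketch using a regex instead of fixed offsets; the helper name
# and the sample payload are invented for illustration:
import json
import re

def strip_jsonp(text):
    # Capture everything between 'callback(' and the trailing ');'.
    mobj = re.match(r'\s*[\w.$]+\s*\(\s*(.*)\s*\)\s*;?\s*$', text, re.DOTALL)
    if not mobj:
        raise ValueError('not a JSONP response')
    return json.loads(mobj.group(1))

assert strip_jsonp('cb({"content": {"info": {}}});') == {'content': {'info': {}}}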
_VALID_URL = r'https?://vod\.tvp\.pl/website/(?P<display_id>[^,]+),(?P<id>\d+)' +class TVPVODBaseIE(InfoExtractor): + _API_BASE_URL = 'https://vod.tvp.pl/api/products' + + def _call_api(self, resource, video_id, **kwargs): + return self._download_json( + f'{self._API_BASE_URL}/{resource}', video_id, + query={'lang': 'pl', 'platform': 'BROWSER'}, **kwargs) + + def _parse_video(self, video): + return { + '_type': 'url', + 'url': 'tvp:' + video['externalUid'], + 'ie_key': TVPEmbedIE.ie_key(), + 'title': video.get('title'), + 'description': traverse_obj(video, ('lead', 'description')), + 'age_limit': int_or_none(video.get('rating')), + 'duration': int_or_none(video.get('duration')), + } + + +class TVPVODVideoIE(TVPVODBaseIE): + IE_NAME = 'tvp:vod' + _VALID_URL = r'https?://vod\.tvp\.pl/[a-z\d-]+,\d+/[a-z\d-]+(?<!-odcinki)(?:-odcinki,\d+/odcinek-\d+,S\d+E\d+)?,(?P<id>\d+)(?:\?[^#]+)?(?:#.+)?$' _TESTS = [{ - # series - 'url': 'https://vod.tvp.pl/website/wspaniale-stulecie,17069012/video', + 'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338/odcinek-24,S01E24,311357', 'info_dict': { - 'id': '17069012', + 'id': '60468609', + 'ext': 'mp4', + 'title': 'Laboratorium alchemika, Tusze termiczne. Jak zobaczyć niewidoczne. Odcinek 24', + 'description': 'md5:1d4098d3e537092ccbac1abf49b7cd4c', + 'duration': 300, + 'episode_number': 24, + 'episode': 'Episode 24', + 'age_limit': 0, + 'series': 'Laboratorium alchemika', + 'thumbnail': 're:https://.+', }, - 'playlist_count': 312, }, { - # film - 'url': 'https://vod.tvp.pl/website/krzysztof-krawczyk-cale-moje-zycie,51374466', + 'url': 'https://vod.tvp.pl/filmy-dokumentalne,163/ukrainski-sluga-narodu,339667', 'info_dict': { - 'id': '51374509', + 'id': '51640077', 'ext': 'mp4', - 'title': 'Krzysztof Krawczyk – całe moje życie, Krzysztof Krawczyk – całe moje życie', - 'description': 'md5:2e80823f00f5fc263555482f76f8fa42', + 'title': 'Ukraiński sługa narodu, Ukraiński sługa narodu', + 'series': 'Ukraiński sługa narodu', + 'description': 'md5:b7940c0a8e439b0c81653a986f544ef3', 'age_limit': 12, + 'episode': 'Episode 0', + 'episode_number': 0, + 'duration': 3051, + 'thumbnail': 're:https://.+', }, - 'params': { - 'skip_download': True, - }, - 'add_ie': ['TVPEmbed'], - }, { - 'url': 'https://vod.tvp.pl/website/lzy-cennet,38678312', - 'only_matching': True, }] - def _entries(self, display_id, playlist_id): - url = 'https://vod.tvp.pl/website/%s,%s/video' % (display_id, playlist_id) - for page_num in itertools.count(1): - page = self._download_webpage( - url, display_id, 'Downloading page %d' % page_num, - query={'page': page_num}) + def _real_extract(self, url): + video_id = self._match_id(url) + + return self._parse_video(self._call_api(f'vods/{video_id}', video_id)) + - video_ids = orderedSet(re.findall( - r'<a[^>]+\bhref=["\']/video/%s,[^,]+,(\d+)' % display_id, - page)) +class TVPVODSeriesIE(TVPVODBaseIE): + IE_NAME = 'tvp:vod:series' + _VALID_URL = r'https?://vod\.tvp\.pl/[a-z\d-]+,\d+/[a-z\d-]+-odcinki,(?P<id>\d+)(?:\?[^#]+)?(?:#.+)?$' - if not video_ids: - break + _TESTS = [{ + 'url': 'https://vod.tvp.pl/seriale,18/ranczo-odcinki,316445', + 'info_dict': { + 'id': '316445', + 'title': 'Ranczo', + 'age_limit': 12, + 'categories': ['seriale'], + }, + 'playlist_count': 129, + }, { + 'url': 'https://vod.tvp.pl/programy,88/rolnik-szuka-zony-odcinki,284514', + 'only_matching': True, + }, { + 'url': 'https://vod.tvp.pl/dla-dzieci,24/laboratorium-alchemika-odcinki,309338', + 'only_matching': True, + }] - for video_id in video_ids: - yield 
self.url_result( - 'tvp:%s' % video_id, ie=TVPEmbedIE.ie_key(), - video_id=video_id) + def _entries(self, seasons, playlist_id): + for season in seasons: + episodes = self._call_api( + f'vods/serials/{playlist_id}/seasons/{season["id"]}/episodes', playlist_id, + note=f'Downloading episode list for {season["title"]}') + yield from map(self._parse_video, episodes) def _real_extract(self, url): - mobj = self._match_valid_url(url) - display_id, playlist_id = mobj.group('display_id', 'id') + playlist_id = self._match_id(url) + metadata = self._call_api( + f'vods/serials/{playlist_id}', playlist_id, + note='Downloading serial metadata') + seasons = self._call_api( + f'vods/serials/{playlist_id}/seasons', playlist_id, + note='Downloading season list') return self.playlist_result( - self._entries(display_id, playlist_id), playlist_id) + self._entries(seasons, playlist_id), playlist_id, strip_or_none(metadata.get('title')), + clean_html(traverse_obj(metadata, ('description', 'lead'), expected_type=strip_or_none)), + categories=[traverse_obj(metadata, ('mainCategory', 'name'))], + age_limit=int_or_none(metadata.get('rating')), + ) diff --git a/hypervideo_dl/extractor/tvplay.py b/hypervideo_dl/extractor/tvplay.py index b5dbc55..9ef4f96 100644 --- a/hypervideo_dl/extractor/tvplay.py +++ b/hypervideo_dl/extractor/tvplay.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -297,8 +294,6 @@ class TVPlayIE(InfoExtractor): 'This content might not be available in your country due to copyright reasons', metadata_available=True) - self._sort_formats(formats) - # TODO: webvtt in m3u8 subtitles = {} sami_path = video.get('sami_path') @@ -413,7 +408,6 @@ class ViafreeIE(InfoExtractor): raise formats, subtitles = self._extract_m3u8_formats_and_subtitles(stream_href, guid, 'mp4') - self._sort_formats(formats) episode = program.get('episode') or {} return { 'id': guid, @@ -498,7 +492,6 @@ class TVPlayHomeIE(InfoExtractor): urljoin(url, f'/api/products/{stream_id}/videos/playlist?videoType={video_type}&platform=BROWSER'), video_id) formats, subtitles = self._extract_m3u8_formats_and_subtitles( stream['sources']['HLS'][0]['src'], video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - self._sort_formats(formats) thumbnails = set(traverse_obj( data, (('galary', 'images', 'artworks'), ..., ..., ('miniUrl', 'mainUrl')), expected_type=url_or_none)) diff --git a/hypervideo_dl/extractor/tvplayer.py b/hypervideo_dl/extractor/tvplayer.py index 5970596..b05355f 100644 --- a/hypervideo_dl/extractor/tvplayer.py +++ b/hypervideo_dl/extractor/tvplayer.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_HTTPError, @@ -75,7 +72,6 @@ class TVPlayerIE(InfoExtractor): raise formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4') - self._sort_formats(formats) return { 'id': resource_id, diff --git a/hypervideo_dl/extractor/tweakers.py b/hypervideo_dl/extractor/tweakers.py index 2b10d9b..e8e1fc6 100644 --- a/hypervideo_dl/extractor/tweakers.py +++ b/hypervideo_dl/extractor/tweakers.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -49,7 +47,6 @@ class TweakersIE(InfoExtractor): 'height': height, 'ext': ext, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/twentyfourvideo.py b/hypervideo_dl/extractor/twentyfourvideo.py index 
ae19e11..baeb85d 100644 --- a/hypervideo_dl/extractor/twentyfourvideo.py +++ b/hypervideo_dl/extractor/twentyfourvideo.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( parse_iso8601, diff --git a/hypervideo_dl/extractor/twentymin.py b/hypervideo_dl/extractor/twentymin.py index a42977f..74f90b0 100644 --- a/hypervideo_dl/extractor/twentymin.py +++ b/hypervideo_dl/extractor/twentymin.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -21,6 +16,7 @@ class TwentyMinutenIE(InfoExtractor): ) (?P<id>\d+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1'] _TESTS = [{ 'url': 'http://www.20min.ch/videotv/?vid=469148&cid=2', 'md5': 'e7264320db31eed8c38364150c12496e', @@ -47,12 +43,6 @@ class TwentyMinutenIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [m.group('url') for m in re.finditer( - r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -67,7 +57,6 @@ class TwentyMinutenIE(InfoExtractor): 'url': 'http://podcast.20min-tv.ch/podcast/20min/%s%s.mp4' % (video_id, p), 'quality': quality, } for quality, (format_id, p) in enumerate([('sd', ''), ('hd', 'h')])] - self._sort_formats(formats) description = video.get('lead') thumbnail = video.get('thumbnail') diff --git a/hypervideo_dl/extractor/twentythreevideo.py b/hypervideo_dl/extractor/twentythreevideo.py index e8cf5a1..290c376 100644 --- a/hypervideo_dl/extractor/twentythreevideo.py +++ b/hypervideo_dl/extractor/twentythreevideo.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import int_or_none diff --git a/hypervideo_dl/extractor/twitcasting.py b/hypervideo_dl/extractor/twitcasting.py index 5c4d26c..30bc987 100644 --- a/hypervideo_dl/extractor/twitcasting.py +++ b/hypervideo_dl/extractor/twitcasting.py @@ -1,11 +1,9 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import base64 import itertools import re from .common import InfoExtractor -from ..downloader.websocket import has_websockets +from ..dependencies import websockets from ..utils import ( clean_html, ExtractorError, @@ -77,6 +75,16 @@ class TwitCastingIE(InfoExtractor): 'playlist_mincount': 2, }] + def _parse_data_movie_playlist(self, dmp, video_id): + # attempt 1: parse as JSON directly + try: + return self._parse_json(dmp, video_id) + except ExtractorError: + pass + # attempt 2: decode reversed base64 + decoded = base64.b64decode(dmp[::-1]) + return self._parse_json(decoded, video_id) + def _real_extract(self, url): uploader_id, video_id = self._match_valid_url(url).groups() @@ -103,7 +111,7 @@ class TwitCastingIE(InfoExtractor): video_js_data = try_get( webpage, - lambda x: self._parse_json(self._search_regex( + lambda x: self._parse_data_movie_playlist(self._search_regex( r'data-movie-playlist=\'([^\']+?)\'', x, 'movie playlist', default=None), video_id)['2'], list) @@ -164,7 +172,7 @@ class TwitCastingIE(InfoExtractor): note='Downloading source quality m3u8', headers=self._M3U8_HEADERS, fatal=False)) - if has_websockets: + if websockets: qq = qualities(['base', 'mobilesource', 'main']) streams = 
traverse_obj(stream_server_data, ('llfmp4', 'streams')) or {} for mode, ws_url in streams.items(): @@ -178,10 +186,17 @@ class TwitCastingIE(InfoExtractor): 'protocol': 'websocket_frag', }) - self._sort_formats(formats, ('source',)) - infodict = { - 'formats': formats + 'formats': formats, + '_format_sort_fields': ('source', ), + } + elif len(m3u8_urls) == 1: + formats = self._extract_m3u8_formats( + m3u8_urls[0], video_id, 'mp4', headers=self._M3U8_HEADERS) + infodict = { + # No problem here since there's only one manifest + 'formats': formats, + 'http_headers': self._M3U8_HEADERS, } else: infodict = { diff --git a/hypervideo_dl/extractor/twitch.py b/hypervideo_dl/extractor/twitch.py index 10de74c..c59d1cf 100644 --- a/hypervideo_dl/extractor/twitch.py +++ b/hypervideo_dl/extractor/twitch.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import collections import itertools import json @@ -15,11 +12,14 @@ from ..compat import ( compat_urllib_parse_urlparse, ) from ..utils import ( + ExtractorError, + UserNotLive, + base_url, clean_html, dict_get, - ExtractorError, float_or_none, int_or_none, + make_archive_id, parse_duration, parse_iso8601, parse_qs, @@ -55,6 +55,7 @@ class TwitchBaseIE(InfoExtractor): 'VideoPreviewOverlay': '3006e77e51b128d838fa4e835723ca4dc9a05c5efd4466c1085215c6e437e65c', 'VideoMetadata': '226edb3e692509f727fd56821f5653c05740242c82b0388883e0c0e75dcbf687', 'VideoPlayer_ChapterSelectButtonVideo': '8d2793384aac3773beab5e59bd5d6f585aedb923d292800119e03d40cd0f9b41', + 'VideoPlayer_VODSeekbarPreviewVideo': '07e99e4d56c5a7c67117a154777b0baf85a5ffefa393b213f4bc712ccaf85dd6', } def _perform_login(self, username, password): @@ -132,7 +133,6 @@ class TwitchBaseIE(InfoExtractor): 'quality': 10, 'format_note': 'Source', }) - self._sort_formats(formats) def _download_base_gql(self, video_id, ops, note, fatal=True): headers = { @@ -205,6 +205,14 @@ class TwitchVodIE(TwitchBaseIE): 'uploader_id': 'riotgames', 'view_count': int, 'start_time': 310, + 'chapters': [ + { + 'start_time': 0, + 'end_time': 17208, + 'title': 'League of Legends' + } + ], + 'live_status': 'was_live', }, 'params': { # m3u8 download @@ -273,10 +281,80 @@ class TwitchVodIE(TwitchBaseIE): 'title': 'Art' } ], + 'live_status': 'was_live', + 'thumbnail': r're:^https?://.*\.jpg$', + 'view_count': int, }, 'params': { 'skip_download': True + }, + }, { + 'note': 'Storyboards', + 'url': 'https://www.twitch.tv/videos/635475444', + 'info_dict': { + 'id': 'v635475444', + 'format_id': 'sb0', + 'ext': 'mhtml', + 'title': 'Riot Games', + 'duration': 11643, + 'uploader': 'Riot Games', + 'uploader_id': 'riotgames', + 'timestamp': 1590770569, + 'upload_date': '20200529', + 'chapters': [ + { + 'start_time': 0, + 'end_time': 573, + 'title': 'League of Legends' + }, + { + 'start_time': 573, + 'end_time': 3922, + 'title': 'Legends of Runeterra' + }, + { + 'start_time': 3922, + 'end_time': 11643, + 'title': 'Art' + } + ], + 'live_status': 'was_live', + 'thumbnail': r're:^https?://.*\.jpg$', + 'view_count': int, + 'columns': int, + 'rows': int, + }, + 'params': { + 'format': 'mhtml', + 'skip_download': True } + }, { + 'note': 'VOD with single chapter', + 'url': 'https://www.twitch.tv/videos/1536751224', + 'info_dict': { + 'id': 'v1536751224', + 'ext': 'mp4', + 'title': 'Porter Robinson Star Guardian Stream Tour with LilyPichu', + 'duration': 8353, + 'uploader': 'Riot Games', + 'uploader_id': 'riotgames', + 'timestamp': 1658267731, + 'upload_date': '20220719', + 'chapters': [ + { + 'start_time': 0, + 'end_time': 
8353, + 'title': 'League of Legends' + } + ], + 'live_status': 'was_live', + 'thumbnail': r're:^https?://.*\.jpg$', + 'view_count': int, + }, + 'params': { + 'skip_download': True + }, + 'expected_warnings': ['Unable to download JSON metadata: HTTP Error 403: Forbidden'] }] def _download_info(self, item_id): @@ -293,16 +371,23 @@ class TwitchVodIE(TwitchBaseIE): 'includePrivate': False, 'videoID': item_id, }, + }, { + 'operationName': 'VideoPlayer_VODSeekbarPreviewVideo', + 'variables': { + 'includePrivate': False, + 'videoID': item_id, + }, }], 'Downloading stream metadata GraphQL') video = traverse_obj(data, (0, 'data', 'video')) video['moments'] = traverse_obj(data, (1, 'data', 'video', 'moments', 'edges', ..., 'node')) + video['storyboard'] = traverse_obj(data, (2, 'data', 'video', 'seekPreviewsURL'), expected_type=url_or_none) if video is None: raise ExtractorError( 'Video %s does not exist' % item_id, expected=True) - return self._extract_info_gql(video, item_id) + return video def _extract_info(self, info): status = info.get('status') @@ -341,8 +426,14 @@ class TwitchVodIE(TwitchBaseIE): 'was_live': True, } - def _extract_moments(self, info, item_id): - for moment in info.get('moments') or []: + def _extract_chapters(self, info, item_id): + if not info.get('moments'): + game = traverse_obj(info, ('game', 'displayName')) + if game: + yield {'title': game} + return + + for moment in info['moments']: start_time = int_or_none(moment.get('positionMilliseconds'), 1000) duration = int_or_none(moment.get('durationMilliseconds'), 1000) name = str_or_none(moment.get('description')) @@ -381,15 +472,49 @@ class TwitchVodIE(TwitchBaseIE): 'uploader_id': try_get(info, lambda x: x['owner']['login'], compat_str), 'timestamp': unified_timestamp(info.get('publishedAt')), 'view_count': int_or_none(info.get('viewCount')), - 'chapters': list(self._extract_moments(info, item_id)), + 'chapters': list(self._extract_chapters(info, item_id)), 'is_live': is_live, 'was_live': True, } + def _extract_storyboard(self, item_id, storyboard_json_url, duration): + if not duration or not storyboard_json_url: + return + spec = self._download_json(storyboard_json_url, item_id, 'Downloading storyboard metadata JSON', fatal=False) or [] + # sort from highest quality to lowest + # This makes sb0 the highest-quality format, sb1 - lower, etc which is consistent with youtube sb ordering + spec.sort(key=lambda x: int_or_none(x.get('width')) or 0, reverse=True) + base = base_url(storyboard_json_url) + for i, s in enumerate(spec): + count = int_or_none(s.get('count')) + images = s.get('images') + if not (images and count): + continue + fragment_duration = duration / len(images) + yield { + 'format_id': f'sb{i}', + 'format_note': 'storyboard', + 'ext': 'mhtml', + 'protocol': 'mhtml', + 'acodec': 'none', + 'vcodec': 'none', + 'url': urljoin(base, images[0]), + 'width': int_or_none(s.get('width')), + 'height': int_or_none(s.get('height')), + 'fps': count / duration, + 'rows': int_or_none(s.get('rows')), + 'columns': int_or_none(s.get('cols')), + 'fragments': [{ + 'url': urljoin(base, path), + 'duration': fragment_duration, + } for path in images], + } + def _real_extract(self, url): vod_id = self._match_id(url) - info = self._download_info(vod_id) + video = self._download_info(vod_id) + info = self._extract_info_gql(video, vod_id) access_token = self._download_access_token(vod_id, 'video', 'id') formats = self._extract_m3u8_formats( @@ -406,6 +531,8 @@ class TwitchVodIE(TwitchBaseIE): })), vod_id, 'mp4', 
entry_protocol='m3u8_native') + formats.extend(self._extract_storyboard(vod_id, video.get('storyboard'), info.get('duration'))) + self._prefer_source(formats) info['formats'] = formats @@ -853,7 +980,7 @@ class TwitchStreamIE(TwitchBaseIE): stream = user['stream'] if not stream: - raise ExtractorError('%s is offline' % channel_name, expected=True) + raise UserNotLive(video_id=channel_name) access_token = self._download_access_token( channel_name, 'stream', 'channelName') @@ -1016,7 +1143,6 @@ class TwitchClipsIE(TwitchBaseIE): 'height': int_or_none(option.get('quality')), 'fps': int_or_none(option.get('frameRate')), }) - self._sort_formats(formats) thumbnails = [] for thumbnail_id in ('tiny', 'small', 'medium'): @@ -1035,10 +1161,13 @@ class TwitchClipsIE(TwitchBaseIE): }) thumbnails.append(thumb) + old_id = self._search_regex(r'%7C(\d+)(?:-\d+)?.mp4', formats[-1]['url'], 'old id', default=None) + return { 'id': clip.get('id') or video_id, + '_old_archive_ids': [make_archive_id(self, old_id)] if old_id else None, 'display_id': video_id, - 'title': clip.get('title') or video_id, + 'title': clip.get('title'), 'formats': formats, 'duration': int_or_none(clip.get('durationSeconds')), 'view_count': int_or_none(clip.get('viewCount')), diff --git a/hypervideo_dl/extractor/twitter.py b/hypervideo_dl/extractor/twitter.py index 8ccc38e..18ebb36 100644 --- a/hypervideo_dl/extractor/twitter.py +++ b/hypervideo_dl/extractor/twitter.py @@ -1,40 +1,42 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import json import re +import urllib.error from .common import InfoExtractor +from .periscope import PeriscopeBaseIE, PeriscopeIE +from ..compat import functools # isort: split from ..compat import ( - compat_HTTPError, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_urlparse, ) from ..utils import ( - dict_get, ExtractorError, - format_field, + dict_get, float_or_none, + format_field, int_or_none, + make_archive_id, + str_or_none, + strip_or_none, traverse_obj, + try_call, try_get, - strip_or_none, unified_timestamp, update_url_query, url_or_none, xpath_text, ) -from .periscope import ( - PeriscopeBaseIE, - PeriscopeIE, -) - class TwitterBaseIE(InfoExtractor): _API_BASE = 'https://api.twitter.com/1.1/' - _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/' - _GUEST_TOKEN = None + _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/' + _TOKENS = { + 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA': None, + 'AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw': None, + } + _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/' def _extract_variant_formats(self, variant, video_id): variant_url = variant.get('url') @@ -86,28 +88,81 @@ class TwitterBaseIE(InfoExtractor): 'height': int(m.group('height')), }) - def _call_api(self, path, video_id, query={}): - headers = { - 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw', - } - token = self._get_cookies(self._API_BASE).get('ct0') - if token: - headers['x-csrf-token'] = token.value - if not self._GUEST_TOKEN: - self._GUEST_TOKEN = self._download_json( - self._API_BASE + 'guest/activate.json', video_id, - 'Downloading guest token', data=b'', - headers=headers)['guest_token'] - headers['x-guest-token'] = self._GUEST_TOKEN - try: - 
return self._download_json( - self._API_BASE + path, video_id, headers=headers, query=query) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: - raise ExtractorError(self._parse_json( - e.cause.read().decode(), - video_id)['errors'][0]['message'], expected=True) - raise + @functools.cached_property + def is_logged_in(self): + return bool(self._get_cookies(self._API_BASE).get('auth_token')) + + def _call_api(self, path, video_id, query={}, graphql=False): + cookies = self._get_cookies(self._API_BASE) + headers = {} + + csrf_cookie = cookies.get('ct0') + if csrf_cookie: + headers['x-csrf-token'] = csrf_cookie.value + + if self.is_logged_in: + headers.update({ + 'x-twitter-auth-type': 'OAuth2Session', + 'x-twitter-client-language': 'en', + 'x-twitter-active-user': 'yes', + }) + + last_error = None + for bearer_token in self._TOKENS: + for first_attempt in (True, False): + headers['Authorization'] = f'Bearer {bearer_token}' + + if not self.is_logged_in: + if not self._TOKENS[bearer_token]: + headers.pop('x-guest-token', None) + guest_token_response = self._download_json( + self._API_BASE + 'guest/activate.json', video_id, + 'Downloading guest token', data=b'', headers=headers) + + self._TOKENS[bearer_token] = guest_token_response.get('guest_token') + if not self._TOKENS[bearer_token]: + raise ExtractorError('Could not retrieve guest token') + + headers['x-guest-token'] = self._TOKENS[bearer_token] + + try: + allowed_status = {400, 403, 404} if graphql else {403} + result = self._download_json( + (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, + video_id, headers=headers, query=query, expected_status=allowed_status) + + except ExtractorError as e: + if last_error: + raise last_error + + if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code != 404: + raise + + last_error = e + self.report_warning( + 'Twitter API gave 404 response, retrying with deprecated auth token. ' + 'Only one media item can be extracted') + break # continue outer loop with next bearer_token + + if result.get('errors'): + errors = traverse_obj(result, ('errors', ..., 'message'), expected_type=str) + if first_attempt and any('bad guest token' in error.lower() for error in errors): + self.to_screen('Guest token has expired. Refreshing guest token') + self._TOKENS[bearer_token] = None + continue + + error_message = ', '.join(set(errors)) or 'Unknown error' + raise ExtractorError(f'Error(s) while querying API: {error_message}', expected=True) + + return result + + def _build_graphql_query(self, media_id): + raise NotImplementedError('Method must be implemented to support GraphQL') + + def _call_graphql_api(self, endpoint, media_id): + data = self._build_graphql_query(media_id) + query = {key: json.dumps(value, separators=(',', ':')) for key, value in data.items()} + return traverse_obj(self._call_api(endpoint, media_id, query=query, graphql=True), 'data') class TwitterCardIE(InfoExtractor): @@ -118,7 +173,7 @@ class TwitterCardIE(InfoExtractor): 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', # MD5 checksums are different in different places 'info_dict': { - 'id': '560070183650213889', + 'id': '560070131976392705', 'ext': 'mp4', 'title': "Twitter - You can now shoot, edit and share video on Twitter. 
Capture life's most moving moments from your perspective.", 'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96', @@ -128,6 +183,13 @@ class TwitterCardIE(InfoExtractor): 'duration': 30.033, 'timestamp': 1422366112, 'upload_date': '20150127', + 'age_limit': 0, + 'comment_count': int, + 'tags': [], + 'repost_count': int, + 'like_count': int, + 'display_id': '560070183650213889', + 'uploader_url': 'https://twitter.com/Twitter', }, }, { @@ -142,7 +204,14 @@ class TwitterCardIE(InfoExtractor): 'uploader_id': 'NASA', 'timestamp': 1437408129, 'upload_date': '20150720', + 'uploader_url': 'https://twitter.com/NASA', + 'age_limit': 0, + 'comment_count': int, + 'like_count': int, + 'repost_count': int, + 'tags': ['PlutoFlyby'], }, + 'params': {'format': '[protocol=https]'} }, { 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', @@ -155,12 +224,27 @@ class TwitterCardIE(InfoExtractor): 'upload_date': '20111013', 'uploader': 'OMG! UBUNTU!', 'uploader_id': 'omgubuntu', + 'channel_url': 'https://www.youtube.com/channel/UCIiSwcm9xiFb3Y4wjzR41eQ', + 'channel_id': 'UCIiSwcm9xiFb3Y4wjzR41eQ', + 'channel_follower_count': int, + 'chapters': 'count:8', + 'uploader_url': 'http://www.youtube.com/user/omgubuntu', + 'duration': 138, + 'categories': ['Film & Animation'], + 'age_limit': 0, + 'comment_count': int, + 'availability': 'public', + 'like_count': int, + 'thumbnail': 'https://i.ytimg.com/vi/dq4Oj5quskI/maxresdefault.jpg', + 'view_count': int, + 'tags': 'count:12', + 'channel': 'OMG! UBUNTU!', + 'playable_in_embed': True, }, 'add_ie': ['Youtube'], }, { 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', - 'md5': '6dabeaca9e68cbb71c99c322a4b42a11', 'info_dict': { 'id': 'iBb2x00UVlv', 'ext': 'mp4', @@ -169,9 +253,17 @@ class TwitterCardIE(InfoExtractor): 'uploader': 'ArsenalTerje', 'title': 'Vine by ArsenalTerje', 'timestamp': 1447451307, + 'alt_title': 'Vine by ArsenalTerje', + 'comment_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://[^?#]+\.jpg', + 'view_count': int, + 'repost_count': int, }, 'add_ie': ['Vine'], - }, { + 'params': {'skip_download': 'm3u8'}, + }, + { 'url': 'https://twitter.com/i/videos/tweet/705235433198714880', 'md5': '884812a2adc8aaf6fe52b15ccbfa3b88', 'info_dict': { @@ -185,7 +277,8 @@ class TwitterCardIE(InfoExtractor): 'upload_date': '20160303', }, 'skip': 'This content is no longer available.', - }, { + }, + { 'url': 'https://twitter.com/i/videos/752274308186120192', 'only_matching': True, }, @@ -205,7 +298,8 @@ class TwitterIE(TwitterBaseIE): _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', 'info_dict': { - 'id': '643211948184596480', + 'id': '643211870443208704', + 'display_id': '643211948184596480', 'ext': 'mp4', 'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!', 'thumbnail': r're:^https?://.*\.jpg', @@ -215,6 +309,11 @@ class TwitterIE(TwitterBaseIE): 'duration': 12.922, 'timestamp': 1442188653, 'upload_date': '20150913', + 'uploader_url': 'https://twitter.com/freethenipple', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], 'age_limit': 18, }, }, { @@ -235,13 +334,20 @@ class TwitterIE(TwitterBaseIE): 'url': 'https://twitter.com/starwars/status/665052190608723968', 'info_dict': { 'id': '665052190608723968', + 'display_id': '665052190608723968', 'ext': 'mp4', - 'title': 'Star Wars - A new beginning is coming December 18. 
Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', + 'title': 'md5:55fef1d5b811944f1550e91b44abb82e', 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ', 'uploader_id': 'starwars', - 'uploader': 'Star Wars', + 'uploader': r're:Star Wars.*', 'timestamp': 1447395772, 'upload_date': '20151113', + 'uploader_url': 'https://twitter.com/starwars', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': ['TV', 'StarWars', 'TheForceAwakens'], + 'age_limit': 0, }, }, { 'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880', @@ -254,25 +360,39 @@ class TwitterIE(TwitterBaseIE): 'uploader': 'Brent Yarina', 'timestamp': 1456976204, 'upload_date': '20160303', + 'uploader_url': 'https://twitter.com/BTNBrentYarina', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, }, 'params': { # The same video as https://twitter.com/i/videos/tweet/705235433198714880 # Test case of TwitterCardIE 'skip_download': True, }, + 'skip': 'Dead external link', }, { 'url': 'https://twitter.com/jaydingeer/status/700207533655363584', 'info_dict': { - 'id': '700207533655363584', + 'id': '700207414000242688', + 'display_id': '700207533655363584', 'ext': 'mp4', - 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel', + 'title': 'jaydin donte geer - BEAT PROD: @suhmeduh #Damndaniel', 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ', 'thumbnail': r're:^https?://.*\.jpg', - 'uploader': 'simon vertugo', - 'uploader_id': 'simonvertugo', + 'uploader': 'jaydin donte geer', + 'uploader_id': 'jaydingeer', 'duration': 30.0, 'timestamp': 1455777459, 'upload_date': '20160218', + 'uploader_url': 'https://twitter.com/jaydingeer', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': ['Damndaniel'], + 'age_limit': 0, }, }, { 'url': 'https://twitter.com/Filmdrunk/status/713801302971588609', @@ -285,12 +405,19 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': '1004126642786242560', 'timestamp': 1402826626, 'upload_date': '20140615', + 'thumbnail': r're:^https?://.*\.jpg', + 'alt_title': 'Vine by TAKUMA', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'view_count': int, }, 'add_ie': ['Vine'], }, { 'url': 'https://twitter.com/captainamerica/status/719944021058060289', 'info_dict': { - 'id': '719944021058060289', + 'id': '717462543795523584', + 'display_id': '719944021058060289', 'ext': 'mp4', 'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.', 'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. 
https://t.co/GpgYi9xMJI', @@ -299,6 +426,13 @@ class TwitterIE(TwitterBaseIE): 'duration': 3.17, 'timestamp': 1460483005, 'upload_date': '20160412', + 'uploader_url': 'https://twitter.com/CaptainAmerica', + 'thumbnail': r're:^https?://.*\.jpg', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, }, }, { 'url': 'https://twitter.com/OPP_HSD/status/779210622571536384', @@ -310,6 +444,7 @@ class TwitterIE(TwitterBaseIE): 'uploader_id': '1PmKqpJdOJQoY', 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police', 'timestamp': 1474613214, + 'thumbnail': r're:^https?://.*\.jpg', }, 'add_ie': ['Periscope'], }, { @@ -330,7 +465,8 @@ class TwitterIE(TwitterBaseIE): }, { 'url': 'https://twitter.com/i/web/status/910031516746514432', 'info_dict': { - 'id': '910031516746514432', + 'id': '910030238373089285', + 'display_id': '910031516746514432', 'ext': 'mp4', 'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.', 'thumbnail': r're:^https?://.*\.jpg', @@ -340,6 +476,12 @@ class TwitterIE(TwitterBaseIE): 'duration': 47.48, 'timestamp': 1505803395, 'upload_date': '20170919', + 'uploader_url': 'https://twitter.com/Prefet971', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': ['Maria'], + 'age_limit': 0, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -348,7 +490,8 @@ class TwitterIE(TwitterBaseIE): # card via api.twitter.com/1.1/videos/tweet/config 'url': 'https://twitter.com/LisPower1/status/1001551623938805763', 'info_dict': { - 'id': '1001551623938805763', + 'id': '1001551417340022785', + 'display_id': '1001551623938805763', 'ext': 'mp4', 'title': 're:.*?Shep is on a roll today.*?', 'thumbnail': r're:^https?://.*\.jpg', @@ -358,6 +501,12 @@ class TwitterIE(TwitterBaseIE): 'duration': 111.278, 'timestamp': 1527623489, 'upload_date': '20180529', + 'uploader_url': 'https://twitter.com/LisPower1', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, }, 'params': { 'skip_download': True, # requires ffmpeg @@ -365,7 +514,8 @@ class TwitterIE(TwitterBaseIE): }, { 'url': 'https://twitter.com/foobar/status/1087791357756956680', 'info_dict': { - 'id': '1087791357756956680', + 'id': '1087791272830607360', + 'display_id': '1087791357756956680', 'ext': 'mp4', 'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. 
Let us know your thoughts!', 'thumbnail': r're:^https?://.*\.jpg', @@ -375,6 +525,12 @@ class TwitterIE(TwitterBaseIE): 'duration': 61.567, 'timestamp': 1548184644, 'upload_date': '20190122', + 'uploader_url': 'https://twitter.com/Twitter', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, }, }, { # not available in Periscope @@ -385,13 +541,17 @@ class TwitterIE(TwitterBaseIE): 'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019', 'uploader': 'Vivi', 'uploader_id': '1eVjYOLGkGrQL', + 'thumbnail': r're:^https?://.*\.jpg', + 'tags': ['EduTECH2019'], + 'view_count': int, }, 'add_ie': ['TwitterBroadcast'], }, { # unified card 'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', 'info_dict': { - 'id': '1349794411333394432', + 'id': '1349774757969989634', + 'display_id': '1349794411333394432', 'ext': 'mp4', 'title': 'md5:d1c4941658e4caaa6cb579260d85dcba', 'thumbnail': r're:^https?://.*\.jpg', @@ -401,11 +561,177 @@ class TwitterIE(TwitterBaseIE): 'duration': 324.484, 'timestamp': 1610651040, 'upload_date': '20210114', + 'uploader_url': 'https://twitter.com/BrooklynNets', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, }, 'params': { 'skip_download': True, }, }, { + 'url': 'https://twitter.com/oshtru/status/1577855540407197696', + 'info_dict': { + 'id': '1577855447914409984', + 'display_id': '1577855540407197696', + 'ext': 'mp4', + 'title': 'md5:9d198efb93557b8f8d5b78c480407214', + 'description': 'md5:b9c3699335447391d11753ab21c70a74', + 'upload_date': '20221006', + 'uploader': 'oshtru', + 'uploader_id': 'oshtru', + 'uploader_url': 'https://twitter.com/oshtru', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 30.03, + 'timestamp': 1665025050, + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, + }, + 'params': {'skip_download': True}, + }, { + 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464', + 'info_dict': { + 'id': '1577719286659006464', + 'title': 'Ultima | #\u0432\u029f\u043c - Test', + 'description': 'Test https://t.co/Y3KEZD7Dad', + 'uploader': 'Ultima | #\u0432\u029f\u043c', + 'uploader_id': 'UltimaShadowX', + 'uploader_url': 'https://twitter.com/UltimaShadowX', + 'upload_date': '20221005', + 'timestamp': 1664992565, + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': [], + 'age_limit': 0, + }, + 'playlist_count': 4, + 'params': {'skip_download': True}, + }, { + 'url': 'https://twitter.com/MesoMax919/status/1575560063510810624', + 'info_dict': { + 'id': '1575559336759263233', + 'display_id': '1575560063510810624', + 'ext': 'mp4', + 'title': 'md5:eec26382babd0f7c18f041db8ae1c9c9', + 'thumbnail': r're:^https?://.*\.jpg', + 'description': 'md5:95aea692fda36a12081b9629b02daa92', + 'uploader': 'Max Olson', + 'uploader_id': 'MesoMax919', + 'uploader_url': 'https://twitter.com/MesoMax919', + 'duration': 21.321, + 'timestamp': 1664477766, + 'upload_date': '20220929', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': ['HurricaneIan'], + 'age_limit': 0, + }, + }, { + # Adult content, uses old token + # Fails if not logged in (GraphQL) + 'url': 'https://twitter.com/Rizdraws/status/1575199173472927762', + 'info_dict': { + 'id': '1575199163847000068', + 'display_id': '1575199173472927762', + 'ext': 'mp4', + 'title': str, + 'description': str, + 'uploader': str, + 'uploader_id': 'Rizdraws', 
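# The TwitterBaseIE._call_api rewrite above lazily activates a guest token for
# each bearer token when not logged in, and refreshes it once the API reports
# 'bad guest token'. The activation request in isolation, as a hedged sketch
# with urllib (the endpoint, empty POST body and Authorization header are
# taken from the diff; the helper itself is invented):
import json
import urllib.request

def fetch_guest_token(bearer_token):
    req = urllib.request.Request(
        'https://api.twitter.com/1.1/guest/activate.json', data=b'',
        headers={'Authorization': f'Bearer {bearer_token}'})
    with urllib.request.urlopen(req) as resp:
        return json.load(resp).get('guest_token')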
+ 'uploader_url': 'https://twitter.com/Rizdraws', + 'upload_date': '20220928', + 'timestamp': 1664391723, + 'thumbnail': 're:^https?://.*\\.jpg', + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'age_limit': 18, + 'tags': [] + }, + 'expected_warnings': ['404'], + }, { + # Description is missing one https://t.co url (GraphQL) + 'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '1395079556562706435', + 'title': str, + 'tags': [], + 'uploader': str, + 'like_count': int, + 'upload_date': '20210519', + 'age_limit': 0, + 'repost_count': int, + 'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw https://t.co/kbXZrozlY7', + 'uploader_id': 'Srirachachau', + 'comment_count': int, + 'uploader_url': 'https://twitter.com/Srirachachau', + 'timestamp': 1621447860, + }, + }, { + # Description is missing one https://t.co url (GraphQL) + 'url': 'https://twitter.com/DavidToons_/status/1578353380363501568', + 'playlist_mincount': 2, + 'info_dict': { + 'id': '1578353380363501568', + 'title': str, + 'uploader_id': 'DavidToons_', + 'repost_count': int, + 'like_count': int, + 'uploader': str, + 'timestamp': 1665143744, + 'uploader_url': 'https://twitter.com/DavidToons_', + 'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/glfQdgfFXH https://t.co/WgJauwIW1w', + 'tags': [], + 'comment_count': int, + 'upload_date': '20221007', + 'age_limit': 0, + }, + }, { + 'url': 'https://twitter.com/primevideouk/status/1578401165338976258', + 'playlist_count': 2, + 'info_dict': { + 'id': '1578401165338976258', + 'title': str, + 'description': 'md5:659a6b517a034b4cee5d795381a2dc41', + 'uploader': str, + 'uploader_id': 'primevideouk', + 'timestamp': 1665155137, + 'upload_date': '20221007', + 'age_limit': 0, + 'uploader_url': 'https://twitter.com/primevideouk', + 'comment_count': int, + 'repost_count': int, + 'like_count': int, + 'tags': ['TheRingsOfPower'], + }, + }, { + # Twitter Spaces + 'url': 'https://twitter.com/MoniqueCamarra/status/1550101959377551360', + 'info_dict': { + 'id': '1lPJqmBeeNAJb', + 'ext': 'm4a', + 'title': 'EuroFile@6 Ukraine Up-date-Draghi Defenestration-the West', + 'uploader': r're:Monique Camarra.+?', + 'uploader_id': 'MoniqueCamarra', + 'live_status': 'was_live', + 'description': 'md5:acce559345fd49f129c20dbcda3f1201', + 'timestamp': 1658407771464, + }, + 'add_ie': ['TwitterSpaces'], + 'params': {'skip_download': 'm3u8'}, + }, { + # onion route + 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', + 'only_matching': True, + }, { # Twitch Clip Embed 'url': 'https://twitter.com/GunB1g/status/1163218564784017422', 'only_matching': True, @@ -439,10 +765,77 @@ class TwitterIE(TwitterBaseIE): 'only_matching': True, }] + def _graphql_to_legacy(self, data, twid): + result = traverse_obj(data, ( + 'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries', + lambda _, v: v['entryId'] == f'tweet-{twid}', 'content', 'itemContent', + 'tweet_results', 'result' + ), expected_type=dict, default={}, get_all=False) + + if 'tombstone' in result: + cause = traverse_obj(result, ('tombstone', 'text', 'text'), expected_type=str) + raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True) + + status = 
result.get('legacy', {}) + status.update(traverse_obj(result, { + 'user': ('core', 'user_results', 'result', 'legacy'), + 'card': ('card', 'legacy'), + 'quoted_status': ('quoted_status_result', 'result', 'legacy'), + }, expected_type=dict, default={})) + + # extra transformation is needed since result does not match legacy format + binding_values = { + binding_value.get('key'): binding_value.get('value') + for binding_value in traverse_obj(status, ('card', 'binding_values', ...), expected_type=dict) + } + if binding_values: + status['card']['binding_values'] = binding_values + + return status + + def _build_graphql_query(self, media_id): + return { + 'variables': { + 'focalTweetId': media_id, + 'includePromotedContent': True, + 'with_rux_injections': False, + 'withBirdwatchNotes': True, + 'withCommunity': True, + 'withDownvotePerspective': False, + 'withQuickPromoteEligibilityTweetFields': True, + 'withReactionsMetadata': False, + 'withReactionsPerspective': False, + 'withSuperFollowsTweetFields': True, + 'withSuperFollowsUserFields': True, + 'withV2Timeline': True, + 'withVoice': True, + }, + 'features': { + 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': False, + 'interactive_text_enabled': True, + 'responsive_web_edit_tweet_api_enabled': True, + 'responsive_web_enhance_cards_enabled': True, + 'responsive_web_graphql_timeline_navigation_enabled': False, + 'responsive_web_text_conversations_enabled': False, + 'responsive_web_uc_gql_enabled': True, + 'standardized_nudges_misinfo': True, + 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, + 'tweetypie_unmention_optimization_enabled': True, + 'unified_cards_ad_metadata_container_dynamic_card_content_query_enabled': True, + 'verified_phone_label_enabled': False, + 'vibe_api_enabled': True, + }, + } + def _real_extract(self, url): twid = self._match_id(url) - status = self._call_api( - 'statuses/show/%s.json' % twid, twid, { + if self.is_logged_in or self._configuration_arg('force_graphql'): + self.write_debug(f'Using GraphQL API (Auth = {self.is_logged_in})') + result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid) + status = self._graphql_to_legacy(result, twid) + + else: + status = self._call_api(f'statuses/show/{twid}.json', twid, { 'cards_platform': 'Web-12', 'include_cards': 1, 'include_reply_count': 1, @@ -456,7 +849,7 @@ class TwitterIE(TwitterBaseIE): user = status.get('user') or {} uploader = user.get('name') if uploader: - title = '%s - %s' % (uploader, title) + title = f'{uploader} - {title}' uploader_id = user.get('screen_name') tags = [] @@ -473,7 +866,7 @@ class TwitterIE(TwitterBaseIE): 'uploader': uploader, 'timestamp': unified_timestamp(status.get('created_at')), 'uploader_id': uploader_id, - 'uploader_url': format_field(uploader_id, template='https://twitter.com/%s'), + 'uploader_url': format_field(uploader_id, None, 'https://twitter.com/%s'), 'like_count': int_or_none(status.get('favorite_count')), 'repost_count': int_or_none(status.get('retweet_count')), 'comment_count': int_or_none(status.get('reply_count')), @@ -482,6 +875,8 @@ class TwitterIE(TwitterBaseIE): } def extract_from_video_info(media): + media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) + self.write_debug(f'Extracting from video info: {media_id}') video_info = media.get('video_info') or {} formats = [] @@ -490,7 +885,6 @@ class TwitterIE(TwitterBaseIE): fmts, subs = self._extract_variant_formats(variant, twid) subtitles = self._merge_subtitles(subtitles, subs) 
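
# _graphql_to_legacy above flattens the GraphQL tree with traverse_obj,
# passing a dict of paths to pull several nested values in one call. A toy
# version of the idea (the real utility also supports lambdas, slices and
# `...` in paths):
def traverse(obj, path):
    for key in path:
        if not isinstance(obj, dict):
            return None
        obj = obj.get(key)
    return obj

result = {'core': {'user_results': {'result': {'legacy': {'name': 'user'}}}}}
status = {}
status.update({
    field: value for field, path in {
        'user': ('core', 'user_results', 'result', 'legacy'),
        'card': ('card', 'legacy'),
    }.items() if (value := traverse(result, path)) is not None
})
print(status)  # {'user': {'name': 'user'}}
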
formats.extend(fmts) - self._sort_formats(formats, ('res', 'br', 'size', 'proto')) # The codec of http formats are unknown thumbnails = [] media_url = media.get('media_url_https') or media.get('media_url') @@ -506,90 +900,111 @@ class TwitterIE(TwitterBaseIE): add_thumbnail(name, size) add_thumbnail('orig', media.get('original_info') or {}) - info.update({ + return { + 'id': media_id, 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, 'duration': float_or_none(video_info.get('duration_millis'), 1000), - }) + # The codec of http formats are unknown + '_format_sort_fields': ('res', 'br', 'size', 'proto'), + } - media = traverse_obj(status, ((None, 'quoted_status'), 'extended_entities', 'media', 0), get_all=False) - if media and media.get('type') != 'photo': - extract_from_video_info(media) - else: - card = status.get('card') - if card: - binding_values = card['binding_values'] - - def get_binding_value(k): - o = binding_values.get(k) or {} - return try_get(o, lambda x: x[x['type'].lower() + '_value']) - - card_name = card['name'].split(':')[-1] - if card_name == 'player': - info.update({ - '_type': 'url', - 'url': get_binding_value('player_url'), - }) - elif card_name == 'periscope_broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('url') or get_binding_value('player_url'), - 'ie_key': PeriscopeIE.ie_key(), - }) - elif card_name == 'broadcast': - info.update({ - '_type': 'url', - 'url': get_binding_value('broadcast_url'), - 'ie_key': TwitterBroadcastIE.ie_key(), - }) - elif card_name == 'summary': - info.update({ - '_type': 'url', - 'url': get_binding_value('card_url'), - }) - elif card_name == 'unified_card': - media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities'] - extract_from_video_info(next(iter(media_entities.values()))) - # amplify, promo_video_website, promo_video_convo, appplayer, - # video_direct_message, poll2choice_video, poll3choice_video, - # poll4choice_video, ... 
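
# The card's binding_values map each key to a typed wrapper such as
# {'type': 'STRING', 'string_value': ...}; get_binding_value reads the
# field named '<type>_value'. Standalone illustration with made-up data:
binding_values = {
    'player_url': {'type': 'STRING', 'string_value': 'https://example.com/v.mp4'},
    'content_duration_seconds': {'type': 'STRING', 'string_value': '61'},
}

def get_binding_value(key):
    entry = binding_values.get(key) or {}
    return entry.get(entry.get('type', '').lower() + '_value')

print(get_binding_value('player_url'))  # https://example.com/v.mp4
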
- else: - is_amplify = card_name == 'amplify' - vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') - content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) - formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) - self._sort_formats(formats) - - thumbnails = [] - for suffix in ('_small', '', '_large', '_x_large', '_original'): - image = get_binding_value('player_image' + suffix) or {} - image_url = image.get('url') - if not image_url or '/player-placeholder' in image_url: - continue - thumbnails.append({ - 'id': suffix[1:] if suffix else 'medium', - 'url': image_url, - 'width': int_or_none(image.get('width')), - 'height': int_or_none(image.get('height')), - }) - - info.update({ - 'formats': formats, - 'subtitles': subtitles, - 'thumbnails': thumbnails, - 'duration': int_or_none(get_binding_value( - 'content_duration_seconds')), - }) - else: - expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url']) - if not expanded_url: - raise ExtractorError("There's no video in this tweet.") - info.update({ + def extract_from_card_info(card): + if not card: + return + + self.write_debug(f'Extracting from card info: {card.get("url")}') + binding_values = card['binding_values'] + + def get_binding_value(k): + o = binding_values.get(k) or {} + return try_get(o, lambda x: x[x['type'].lower() + '_value']) + + card_name = card['name'].split(':')[-1] + if card_name == 'player': + yield { '_type': 'url', - 'url': expanded_url, - }) - return info + 'url': get_binding_value('player_url'), + } + elif card_name == 'periscope_broadcast': + yield { + '_type': 'url', + 'url': get_binding_value('url') or get_binding_value('player_url'), + 'ie_key': PeriscopeIE.ie_key(), + } + elif card_name == 'broadcast': + yield { + '_type': 'url', + 'url': get_binding_value('broadcast_url'), + 'ie_key': TwitterBroadcastIE.ie_key(), + } + elif card_name == 'audiospace': + yield { + '_type': 'url', + 'url': f'https://twitter.com/i/spaces/{get_binding_value("id")}', + 'ie_key': TwitterSpacesIE.ie_key(), + } + elif card_name == 'summary': + yield { + '_type': 'url', + 'url': get_binding_value('card_url'), + } + elif card_name == 'unified_card': + unified_card = self._parse_json(get_binding_value('unified_card'), twid) + yield from map(extract_from_video_info, traverse_obj( + unified_card, ('media_entities', ...), expected_type=dict)) + # amplify, promo_video_website, promo_video_convo, appplayer, + # video_direct_message, poll2choice_video, poll3choice_video, + # poll4choice_video, ... 
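
# In the refactor below, extract_from_card_info becomes a generator, so one
# tweet can contribute several entries (e.g. a unified card with multiple
# media). A runnable toy version of the single-entry-vs-playlist decision
# that follows it (names and shapes are illustrative, not the exact code):
def collect_entries(videos, cards, twid):
    entries = [{**data, 'display_id': twid} for data in (*videos, *cards)]
    if len(entries) == 1:
        return entries[0]
    for index, entry in enumerate(entries, 1):
        entry['title'] = f"{entry.get('title') or twid} #{index}"
    return {'_type': 'playlist', 'entries': entries}

print(collect_entries([{'title': 'clip'}], [{'title': 'card'}], '123'))
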
+ else: + is_amplify = card_name == 'amplify' + vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url') + content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player')) + formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid) + + thumbnails = [] + for suffix in ('_small', '', '_large', '_x_large', '_original'): + image = get_binding_value('player_image' + suffix) or {} + image_url = image.get('url') + if not image_url or '/player-placeholder' in image_url: + continue + thumbnails.append({ + 'id': suffix[1:] if suffix else 'medium', + 'url': image_url, + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + }) + + yield { + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'duration': int_or_none(get_binding_value( + 'content_duration_seconds')), + } + + media_path = ((None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo') + videos = map(extract_from_video_info, traverse_obj(status, media_path, expected_type=dict)) + cards = extract_from_card_info(status.get('card')) + entries = [{**info, **data, 'display_id': twid} for data in (*videos, *cards)] + + if not entries: + expanded_url = traverse_obj(status, ('entities', 'urls', 0, 'expanded_url'), expected_type=url_or_none) + if not expanded_url or expanded_url == url: + raise ExtractorError('No video could be found in this tweet', expected=True) + + return self.url_result(expanded_url, display_id=twid, **info) + + entries[0]['_old_archive_ids'] = [make_archive_id(self, twid)] + + if len(entries) == 1: + return entries[0] + + for index, entry in enumerate(entries, 1): + entry['title'] += f' #{index}' + + return self.playlist_result(entries, **info) class TwitterAmplifyIE(TwitterBaseIE): @@ -598,13 +1013,14 @@ class TwitterAmplifyIE(TwitterBaseIE): _TEST = { 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', - 'md5': '7df102d0b9fd7066b86f3159f8e81bf6', + 'md5': 'fec25801d18a4557c5c9f33d2c379ffa', 'info_dict': { 'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', 'ext': 'mp4', 'title': 'Twitter Video', 'thumbnail': 're:^https?://.*', }, + 'params': {'format': '[protocol=https]'}, } def _real_extract(self, url): @@ -613,7 +1029,7 @@ class TwitterAmplifyIE(TwitterBaseIE): vmap_url = self._html_search_meta( 'twitter:amplify:vmap', webpage, 'vmap url') - formats = self._extract_formats_from_vmap_url(vmap_url, video_id) + formats, _ = self._extract_formats_from_vmap_url(vmap_url, video_id) thumbnails = [] thumbnail = self._html_search_meta( @@ -661,6 +1077,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): 'title': 'Andrea May Sahouri - Periscope Broadcast', 'uploader': 'Andrea May Sahouri', 'uploader_id': '1PXEdBZWpGwKe', + 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=', + 'view_count': int, }, } @@ -672,7 +1090,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): info = self._parse_broadcast_data(broadcast, broadcast_id) media_key = broadcast['media_key'] source = self._call_api( - 'live_video_stream/status/' + media_key, media_key)['source'] + f'live_video_stream/status/{media_key}', media_key)['source'] m3u8_url = source.get('noRedirectPlaybackUrl') or source['location'] if '/live_video_stream/geoblocked/' in m3u8_url: self.raise_geo_restricted() @@ -684,6 +1102,100 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE): return info +class TwitterSpacesIE(TwitterBaseIE): + IE_NAME = 'twitter:spaces' + 
_VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/spaces/(?P<id>[0-9a-zA-Z]{13})' + + _TESTS = [{ + 'url': 'https://twitter.com/i/spaces/1RDxlgyvNXzJL', + 'info_dict': { + 'id': '1RDxlgyvNXzJL', + 'ext': 'm4a', + 'title': 'King Carlo e la mossa Kansas City per fare il Grande Centro', + 'description': 'Twitter Space participated by annarita digiorgio, Signor Ernesto, Raffaello Colosimo, Simone M. Sepe', + 'uploader': r're:Lucio Di Gaetano.*?', + 'uploader_id': 'luciodigaetano', + 'live_status': 'was_live', + 'timestamp': 1659877956397, + }, + 'params': {'skip_download': 'm3u8'}, + }] + + SPACE_STATUS = { + 'notstarted': 'is_upcoming', + 'ended': 'was_live', + 'running': 'is_live', + 'timedout': 'post_live', + } + + def _build_graphql_query(self, space_id): + return { + 'variables': { + 'id': space_id, + 'isMetatagsQuery': True, + 'withDownvotePerspective': False, + 'withReactionsMetadata': False, + 'withReactionsPerspective': False, + 'withReplays': True, + 'withSuperFollowsUserFields': True, + 'withSuperFollowsTweetFields': True, + }, + 'features': { + 'dont_mention_me_view_api_enabled': True, + 'interactive_text_enabled': True, + 'responsive_web_edit_tweet_api_enabled': True, + 'responsive_web_enhance_cards_enabled': True, + 'responsive_web_uc_gql_enabled': True, + 'spaces_2022_h2_clipping': True, + 'spaces_2022_h2_spaces_communities': False, + 'standardized_nudges_misinfo': True, + 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False, + 'vibe_api_enabled': True, + }, + } + + def _real_extract(self, url): + space_id = self._match_id(url) + space_data = self._call_graphql_api('HPEisOmj1epUNLCWTYhUWw/AudioSpaceById', space_id)['audioSpace'] + if not space_data: + raise ExtractorError('Twitter Space not found', expected=True) + + metadata = space_data['metadata'] + live_status = try_call(lambda: self.SPACE_STATUS[metadata['state'].lower()]) + + formats = [] + if live_status == 'is_upcoming': + self.raise_no_formats('Twitter Space not started yet', expected=True) + elif live_status == 'post_live': + self.raise_no_formats('Twitter Space ended but not downloadable yet', expected=True) + else: + source = self._call_api( + f'live_video_stream/status/{metadata["media_key"]}', metadata['media_key'])['source'] + + # XXX: Native downloader does not work + formats = self._extract_m3u8_formats( + traverse_obj(source, 'noRedirectPlaybackUrl', 'location'), + metadata['media_key'], 'm4a', 'm3u8', live=live_status == 'is_live', + headers={'Referer': 'https://twitter.com/'}) + for fmt in formats: + fmt.update({'vcodec': 'none', 'acodec': 'aac'}) + + participants = ', '.join(traverse_obj( + space_data, ('participants', 'speakers', ..., 'display_name'))) or 'nobody yet' + return { + 'id': space_id, + 'title': metadata.get('title'), + 'description': f'Twitter Space participated by {participants}', + 'uploader': traverse_obj( + metadata, ('creator_results', 'result', 'legacy', 'name')), + 'uploader_id': traverse_obj( + metadata, ('creator_results', 'result', 'legacy', 'screen_name')), + 'live_status': live_status, + 'timestamp': metadata.get('created_at'), + 'formats': formats, + } + + class TwitterShortenerIE(TwitterBaseIE): IE_NAME = 'twitter:shortener' _VALID_URL = r'https?://t.co/(?P<id>[^?]+)|tco:(?P<eid>[^?]+)' diff --git a/hypervideo_dl/extractor/udemy.py b/hypervideo_dl/extractor/udemy.py index 88b2310..4faad58 100644 --- a/hypervideo_dl/extractor/udemy.py +++ b/hypervideo_dl/extractor/udemy.py @@ -1,19 +1,12 @@ -from __future__ import unicode_literals - import re +import 
urllib.request from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_kwargs, - compat_str, - compat_urllib_request, - compat_urlparse, -) +from ..compat import compat_HTTPError, compat_str, compat_urlparse from ..utils import ( + ExtractorError, determine_ext, extract_attributes, - ExtractorError, float_or_none, int_or_none, js_to_json, @@ -132,7 +125,7 @@ class UdemyIE(InfoExtractor): headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36' kwargs['headers'] = headers ret = super(UdemyIE, self)._download_webpage_handle( - *args, **compat_kwargs(kwargs)) + *args, **kwargs) if not ret: return ret webpage, _ = ret @@ -151,14 +144,14 @@ class UdemyIE(InfoExtractor): 'X-Udemy-Snail-Case': 'true', 'X-Requested-With': 'XMLHttpRequest', } - for cookie in self._downloader.cookiejar: + for cookie in self.cookiejar: if cookie.name == 'client_id': headers['X-Udemy-Client-Id'] = cookie.value elif cookie.name == 'access_token': headers['X-Udemy-Bearer-Token'] = cookie.value headers['X-Udemy-Authorization'] = 'Bearer %s' % cookie.value - if isinstance(url_or_request, compat_urllib_request.Request): + if isinstance(url_or_request, urllib.request.Request): for header, value in headers.items(): url_or_request.add_header(header, value) else: @@ -398,8 +391,6 @@ class UdemyIE(InfoExtractor): if f.get('url'): formats.append(f) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, @@ -412,7 +403,7 @@ class UdemyIE(InfoExtractor): } -class UdemyCourseIE(UdemyIE): +class UdemyCourseIE(UdemyIE): # XXX: Do not subclass from concrete IE IE_NAME = 'udemy:course' _VALID_URL = r'https?://(?:[^/]+\.)?udemy\.com/(?P<id>[^/?#&]+)' _TESTS = [{ diff --git a/hypervideo_dl/extractor/udn.py b/hypervideo_dl/extractor/udn.py index 2c8e5c7..10668ac 100644 --- a/hypervideo_dl/extractor/udn.py +++ b/hypervideo_dl/extractor/udn.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -16,6 +13,7 @@ class UDNEmbedIE(InfoExtractor): IE_DESC = '聯合影音' _PROTOCOL_RELATIVE_VALID_URL = r'//video\.udn\.com/(?:embed|play)/news/(?P<id>\d+)' _VALID_URL = r'https?:' + _PROTOCOL_RELATIVE_VALID_URL + _EMBED_REGEX = [r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % _PROTOCOL_RELATIVE_VALID_URL] _TESTS = [{ 'url': 'http://video.udn.com/embed/news/300040', 'info_dict': { @@ -92,8 +90,6 @@ class UDNEmbedIE(InfoExtractor): }) formats.append(a_format) - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, diff --git a/hypervideo_dl/extractor/ufctv.py b/hypervideo_dl/extractor/ufctv.py index 3d74ba0..2c1c5e0 100644 --- a/hypervideo_dl/extractor/ufctv.py +++ b/hypervideo_dl/extractor/ufctv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .imggaming import ImgGamingBaseIE diff --git a/hypervideo_dl/extractor/ukcolumn.py b/hypervideo_dl/extractor/ukcolumn.py index d2626f0..aade79f 100644 --- a/hypervideo_dl/extractor/ukcolumn.py +++ b/hypervideo_dl/extractor/ukcolumn.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from ..utils import ( unescapeHTML, urljoin, diff --git a/hypervideo_dl/extractor/uktvplay.py b/hypervideo_dl/extractor/uktvplay.py index f28fd51..ab22a8e 100644 --- a/hypervideo_dl/extractor/uktvplay.py +++ b/hypervideo_dl/extractor/uktvplay.py @@ -1,11 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor class 
UKTVPlayIE(InfoExtractor): - _VALID_URL = r'https?://uktvplay\.uktv\.co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*watch-online/)(?P<id>\d+)' + _VALID_URL = r'https?://uktvplay\.(?:uktv\.)?co\.uk/(?:.+?\?.*?\bvideo=|([^/]+/)*)(?P<id>\d+)' _TESTS = [{ 'url': 'https://uktvplay.uktv.co.uk/shows/world-at-war/c/200/watch-online/?video=2117008346001', 'info_dict': { @@ -25,6 +22,9 @@ class UKTVPlayIE(InfoExtractor): }, { 'url': 'https://uktvplay.uktv.co.uk/shows/africa/watch-online/5983349675001', 'only_matching': True, + }, { + 'url': 'https://uktvplay.co.uk/shows/hornby-a-model-world/series-1/episode-1/6276739790001?autoplaying=true', + 'only_matching': True, }] # BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/1242911124001/OrCyvJ2gyL_default/index.html?videoId=%s' BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911124001/H1xnMOqP_default/index.html?videoId=%s' diff --git a/hypervideo_dl/extractor/umg.py b/hypervideo_dl/extractor/umg.py index c1b65d1..3ffcb73 100644 --- a/hypervideo_dl/extractor/umg.py +++ b/hypervideo_dl/extractor/umg.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -89,7 +86,6 @@ class UMGDeIE(InfoExtractor): if not formats: for format_id in (867, 836, 940): add_m3u8_format(format_id) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/unistra.py b/hypervideo_dl/extractor/unistra.py index 685d74f..6e872cd 100644 --- a/hypervideo_dl/extractor/unistra.py +++ b/hypervideo_dl/extractor/unistra.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -49,7 +47,6 @@ class UnistraIE(InfoExtractor): 'format_id': format_id, 'quality': quality(format_id) }) - self._sort_formats(formats) title = self._html_search_regex( r'<title>UTV - (.*?)</', webpage, 'title') diff --git a/hypervideo_dl/extractor/unity.py b/hypervideo_dl/extractor/unity.py index 73daacf..d1b0ecb 100644 --- a/hypervideo_dl/extractor/unity.py +++ b/hypervideo_dl/extractor/unity.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from .youtube import YoutubeIE diff --git a/hypervideo_dl/extractor/unscripted.py b/hypervideo_dl/extractor/unscripted.py new file mode 100644 index 0000000..6643a71 --- /dev/null +++ b/hypervideo_dl/extractor/unscripted.py @@ -0,0 +1,53 @@ +from .common import InfoExtractor +from ..utils import parse_duration, traverse_obj + + +class UnscriptedNewsVideoIE(InfoExtractor): + _VALID_URL = r'https?://www\.unscripted\.news/videos/(?P<id>[\w-]+)' + _TESTS = [{ + 'url': 'https://www.unscripted.news/videos/a-day-at-the-farmers-protest', + 'info_dict': { + 'id': '60c0a55cd1e99b1079918a57', + 'display_id': 'a-day-at-the-farmers-protest', + 'ext': 'mp4', + 'title': 'A Day at the Farmers\' Protest', + 'description': 'md5:4b3df22747a03e8f14f746dd72190384', + 'thumbnail': 'https://s3.unscripted.news/anj2/60c0a55cd1e99b1079918a57/5f199a65-c803-4a5c-8fce-2077359c3b72.jpg', + 'duration': 2251.0, + 'series': 'Ground Reports', + } + }, { + 'url': 'https://www.unscripted.news/videos/you-get-the-politicians-you-deserve-ft-shashi-tharoor', + 'info_dict': { + 'id': '5fb3afbf18ac817d341a74d8', + 'display_id': 'you-get-the-politicians-you-deserve-ft-shashi-tharoor', + 'ext': 'mp4', + 'cast': ['Avalok Langer', 'Ashwin Mehta'], + 'thumbnail': 'https://s3.unscripted.news/anj2/5fb3afbf18ac817d341a74d8/82bd7942-4f20-4cd8-98ae-83f9e814f998.jpg', + 'description': 
'md5:1e91b069238a705ca3a40f87e6f1182c', + 'duration': 1046.0, + 'series': 'Dumb Questions Only', + 'title': 'You Get The Politicians You Deserve! ft. Shashi Tharoor', + } + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + nextjs_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['dataLocal'] + + # TODO: get subtitle from srt key + formats, subtitles = self._extract_m3u8_formats_and_subtitles(nextjs_data['alt_content'], display_id) + + return { + 'id': nextjs_data['_id'], + 'display_id': display_id, + 'title': nextjs_data.get('title') or self._og_search_title(webpage), + 'description': nextjs_data.get('sh_heading') or self._og_search_description(webpage), + 'formats': formats, + 'subtitles': subtitles, + 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': parse_duration(nextjs_data.get('duration')), + 'series': traverse_obj(nextjs_data, ('show', 'topic')), + 'cast': traverse_obj(nextjs_data, ('cast_crew', ..., 'displayname')), + } diff --git a/hypervideo_dl/extractor/unsupported.py b/hypervideo_dl/extractor/unsupported.py new file mode 100644 index 0000000..620c025 --- /dev/null +++ b/hypervideo_dl/extractor/unsupported.py @@ -0,0 +1,143 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, classproperty, remove_start + + +class UnsupportedInfoExtractor(InfoExtractor): + IE_DESC = False + URLS = () # Redefine in subclasses + + @classproperty + def IE_NAME(cls): + return remove_start(super().IE_NAME, 'Known') + + @classproperty + def _VALID_URL(cls): + return rf'https?://(?:www\.)?(?:{"|".join(cls.URLS)})' + + +LF = '\n ' + + +class KnownDRMIE(UnsupportedInfoExtractor): + """Sites that are known to use DRM for all their videos + + Add to this list only if: + * You are reasonably certain that the site uses DRM for ALL their videos + * Multiple users have asked about this site on github/reddit/discord + """ + + URLS = ( + r'play\.hbomax\.com', + r'channel(?:4|5)\.com', + r'peacocktv\.com', + r'(?:[\w\.]+\.)?disneyplus\.com', + r'open\.spotify\.com/(?:track|playlist|album|artist)', + r'tvnz\.co\.nz', + r'oneplus\.ch', + r'artstation\.com/learning/courses', + r'philo\.com', + r'(?:[\w\.]+\.)?mech-plus\.com', + r'aha\.video', + r'mubi\.com', + r'vootkids\.com', + r'nowtv\.it/watch', + r'tv\.apple\.com', + ) + + _TESTS = [{ + # https://github.com/hypervideo/hypervideo/issues/4309 + 'url': 'https://peacocktv.com/watch/playback/vod/GMO_00000000073159_01/f9d03003-eb04-3c7f-a7b6-a83ab7eb55bc', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/1719, + 'url': 'https://www.channel4.com/programmes/gurren-lagann/on-demand/69960-001', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/1548 + 'url': 'https://www.channel5.com/show/uk-s-strongest-man-2021/season-2021/episode-1', + 'only_matching': True, + }, { + 'url': r'https://hsesn.apps.disneyplus.com', + 'only_matching': True, + }, { + 'url': r'https://www.disneyplus.com', + 'only_matching': True, + }, { + 'url': 'https://open.spotify.com/artist/', + 'only_matching': True, + }, { + 'url': 'https://open.spotify.com/track/', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/4122 + 'url': 'https://www.tvnz.co.nz/shows/ice-airport-alaska/episodes/s1-e1', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/1922 + 'url': 'https://www.oneplus.ch/play/1008188', + 'only_matching': True, + }, { + # 
https://github.com/hypervideo/hypervideo/issues/1140 + 'url': 'https://www.artstation.com/learning/courses/dqQ/character-design-masterclass-with-serge-birault/chapters/Rxn3/introduction', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/3544 + 'url': 'https://www.philo.com/player/player/vod/Vk9EOjYwODU0ODg5OTY0ODY0OTQ5NA', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/3533 + 'url': 'https://www.mech-plus.com/player/24892/stream?assetType=episodes&playlist_id=6', + 'only_matching': True, + }, { + 'url': 'https://watch.mech-plus.com/details/25240?playlist_id=6', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/2934 + 'url': 'https://www.aha.video/player/movie/lucky-man', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/2743 + 'url': 'https://mubi.com/films/the-night-doctor', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/3287 + 'url': 'https://www.vootkids.com/movies/chhota-bheem-the-rise-of-kirmada/764459', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/2744 + 'url': 'https://www.nowtv.it/watch/home/asset/and-just-like-that/skyserie_f8fe979772e8437d8a61ab83b6d293e9/seasons/1/episodes/8/R_126182_HD', + 'only_matching': True, + }, { + # https://github.com/hypervideo/hypervideo/issues/5557 + 'url': 'https://tv.apple.com/it/show/loot---una-fortuna/umc.cmc.5erbujil1mpazuerhr1udnk45?ctx_brand=tvs.sbd.4000', + 'only_matching': True, + }] + + def _real_extract(self, url): + raise ExtractorError( + f'The requested site is known to use DRM protection. ' + f'It will {self._downloader._format_err("NOT", self._downloader.Styles.EMPHASIS)} be supported.{LF}' + f'Please {self._downloader._format_err("DO NOT", self._downloader.Styles.ERROR)} open an issue, ' + 'unless you have evidence that the video is not DRM protected', expected=True) + + +class KnownPiracyIE(UnsupportedInfoExtractor): + """Sites that have been deemed to be piracy + + In order for this to not end up being a catalog of piracy sites, + only sites that were once supported should be added to this list + """ + + URLS = ( + r'dood\.(?:to|watch|so|pm|wf|re)', + ) + + _TESTS = [{ + 'url': 'http://dood.to/e/5s1wmbdacezb', + 'only_matching': True, + }] + + def _real_extract(self, url): + raise ExtractorError( + f'This website is no longer supported since it has been determined to be primarily used for piracy.{LF}' + f'{self._downloader._format_err("DO NOT", self._downloader.Styles.ERROR)} open issues for it', expected=True) diff --git a/hypervideo_dl/extractor/uol.py b/hypervideo_dl/extractor/uol.py index 1baee0b..068c2b8 100644 --- a/hypervideo_dl/extractor/uol.py +++ b/hypervideo_dl/extractor/uol.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import ( compat_str, @@ -110,7 +107,6 @@ class UOLIE(InfoExtractor): 'url': f_url, 'quality': quality(format_id), }) - self._sort_formats(formats) tags = [] for tag in video_data.get('tags', []): diff --git a/hypervideo_dl/extractor/uplynk.py b/hypervideo_dl/extractor/uplynk.py index 9adb969..87c427f 100644 --- a/hypervideo_dl/extractor/uplynk.py +++ b/hypervideo_dl/extractor/uplynk.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -36,7 +33,6 @@ class UplynkIE(InfoExtractor): if session_id: for f in formats: f['extra_param_to_segment_url'] 
= 'pbs=' + session_id - self._sort_formats(formats) asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id) if asset.get('error') == 1: raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True) @@ -55,10 +51,9 @@ class UplynkIE(InfoExtractor): return self._extract_uplynk_info(url) -class UplynkPreplayIE(UplynkIE): +class UplynkPreplayIE(UplynkIE): # XXX: Do not subclass from concrete IE IE_NAME = 'uplynk:preplay' _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json' - _TEST = None def _real_extract(self, url): path, external_id, video_id = self._match_valid_url(url).groups() diff --git a/hypervideo_dl/extractor/urort.py b/hypervideo_dl/extractor/urort.py index 020425f..debd2ba 100644 --- a/hypervideo_dl/extractor/urort.py +++ b/hypervideo_dl/extractor/urort.py @@ -1,13 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, -) -from ..utils import ( - unified_strdate, -) +from ..utils import unified_strdate class UrortIE(InfoExtractor): @@ -34,7 +28,7 @@ class UrortIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - fstr = compat_urllib_parse.quote("InternalBandUrl eq '%s'" % playlist_id) + fstr = urllib.parse.quote("InternalBandUrl eq '%s'" % playlist_id) json_url = 'http://urort.p3.no/breeze/urort/TrackDTOViews?$filter=%s&$orderby=Released%%20desc&$expand=Tags%%2CFiles' % fstr songs = self._download_json(json_url, playlist_id) entries = [] @@ -46,7 +40,6 @@ class UrortIE(InfoExtractor): 'url': 'http://p3urort.blob.core.windows.net/tracks/%s' % f['FileRef'], 'quality': 3 if f['FileType'] == 'mp3' else 2, } for f in s['Files']] - self._sort_formats(formats) e = { 'id': '%d-%s' % (s['BandId'], s['$id']), 'title': s['Title'], diff --git a/hypervideo_dl/extractor/urplay.py b/hypervideo_dl/extractor/urplay.py index eb2ab26..0f0d659 100644 --- a/hypervideo_dl/extractor/urplay.py +++ b/hypervideo_dl/extractor/urplay.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( dict_get, @@ -79,7 +76,6 @@ class URPlayIE(InfoExtractor): formats.extend(self._extract_wowza_formats( 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['f4m', 'rtmp', 'rtsp'])) - self._sort_formats(formats) subtitles = {} diff --git a/hypervideo_dl/extractor/usanetwork.py b/hypervideo_dl/extractor/usanetwork.py index d953e46..4a06a9a 100644 --- a/hypervideo_dl/extractor/usanetwork.py +++ b/hypervideo_dl/extractor/usanetwork.py @@ -1,10 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .nbc import NBCIE -class USANetworkIE(NBCIE): +class USANetworkIE(NBCIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/(?:[^/]+/videos?|movies?)/(?:[^/]+/)?(?P<id>\d+))' _TESTS = [{ 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302', diff --git a/hypervideo_dl/extractor/usatoday.py b/hypervideo_dl/extractor/usatoday.py index b210344..3243f3e 100644 --- a/hypervideo_dl/extractor/usatoday.py +++ b/hypervideo_dl/extractor/usatoday.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/ustream.py 
b/hypervideo_dl/extractor/ustream.py index 4a7a8f8..5df2416 100644 --- a/hypervideo_dl/extractor/ustream.py +++ b/hypervideo_dl/extractor/ustream.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import random import re @@ -22,6 +20,7 @@ from ..utils import ( class UstreamIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)' IE_NAME = 'ustream' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1'] _TESTS = [{ 'url': 'http://www.ustream.tv/recorded/20274954', 'md5': '088f151799e8f572f84eb62f17d73e5c', @@ -73,13 +72,6 @@ class UstreamIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage) - if mobj is not None: - return mobj.group('url') - def _get_stream_info(self, url, video_id, app_id_ver, extra_note=None): def num_to_hex(n): return hex(n)[2:] @@ -218,8 +210,6 @@ class UstreamIE(InfoExtractor): formats.extend(self._parse_segmented_mp4(dash_streams)) ''' - self._sort_formats(formats) - description = video.get('description') timestamp = int_or_none(video.get('created_at')) duration = float_or_none(video.get('length')) diff --git a/hypervideo_dl/extractor/ustudio.py b/hypervideo_dl/extractor/ustudio.py index 92509d1..c3aeeb9 100644 --- a/hypervideo_dl/extractor/ustudio.py +++ b/hypervideo_dl/extractor/ustudio.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -42,7 +39,6 @@ class UstudioIE(InfoExtractor): } for item in config.findall('./qualities/quality/%s' % kind) if item.get('url')] formats = extract('video') - self._sort_formats(formats) webpage = self._download_webpage(url, display_id) @@ -101,7 +97,6 @@ class UstudioEmbedIE(InfoExtractor): 'width': int_or_none(quality.get('width')), 'height': height, }) - self._sort_formats(formats) thumbnails = [] for image in video_data.get('images', []): diff --git a/hypervideo_dl/extractor/utreon.py b/hypervideo_dl/extractor/utreon.py index 4986635..90c10c0 100644 --- a/hypervideo_dl/extractor/utreon.py +++ b/hypervideo_dl/extractor/utreon.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( dict_get, @@ -71,7 +68,6 @@ class UtreonIE(InfoExtractor): 'format_id': format_key.split('_')[1], 'height': int(format_key.split('_')[1][:-1]), } for format_key, format_url in videos_json.items() if url_or_none(format_url)] - self._sort_formats(formats) thumbnail = url_or_none(dict_get(json_data, ('cover_image_url', 'preview_image_url'))) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/varzesh3.py b/hypervideo_dl/extractor/varzesh3.py index 32655b9..2c13cbd 100644 --- a/hypervideo_dl/extractor/varzesh3.py +++ b/hypervideo_dl/extractor/varzesh3.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( clean_html, diff --git a/hypervideo_dl/extractor/vbox7.py b/hypervideo_dl/extractor/vbox7.py index 8152ace..be35dad 100644 --- a/hypervideo_dl/extractor/vbox7.py +++ b/hypervideo_dl/extractor/vbox7.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ExtractorError @@ -20,6 +15,7 @@ class 
Vbox7IE(InfoExtractor): ) (?P<id>[\da-fA-F]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)'] _GEO_COUNTRIES = ['BG'] _TESTS = [{ 'url': 'http://vbox7.com/play:0946fff23c', @@ -54,14 +50,6 @@ class Vbox7IE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)', - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): video_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/veehd.py b/hypervideo_dl/extractor/veehd.py index a6dc3c8..5ecd887 100644 --- a/hypervideo_dl/extractor/veehd.py +++ b/hypervideo_dl/extractor/veehd.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re import json diff --git a/hypervideo_dl/extractor/veo.py b/hypervideo_dl/extractor/veo.py index d87bb5b..ef44d42 100644 --- a/hypervideo_dl/extractor/veo.py +++ b/hypervideo_dl/extractor/veo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( @@ -68,8 +65,6 @@ class VeoIE(InfoExtractor): 'vbr': int_or_none(fmt.get('bit_rate'), scale=1000), }) - self._sort_formats(formats) - return { 'id': video_id, 'title': str_or_none(metadata.get('title')), diff --git a/hypervideo_dl/extractor/veoh.py b/hypervideo_dl/extractor/veoh.py index d9afb56..92ff865 100644 --- a/hypervideo_dl/extractor/veoh.py +++ b/hypervideo_dl/extractor/veoh.py @@ -1,11 +1,14 @@ -from __future__ import unicode_literals +import functools +import json from .common import InfoExtractor from ..utils import ( + ExtractorError, + OnDemandPagedList, int_or_none, parse_duration, qualities, - try_get + try_get, ) @@ -102,7 +105,6 @@ class VeohIE(InfoExtractor): 'quality': q(f_id), 'url': f_url, }) - self._sort_formats(formats) categories = metadata.get('categoryPath') if not categories: @@ -125,3 +127,62 @@ class VeohIE(InfoExtractor): 'categories': categories, 'tags': tags.split(', ') if tags else None, } + + +class VeohUserIE(VeohIE): # XXX: Do not subclass from concrete IE + _VALID_URL = r'https?://(?:www\.)?veoh\.com/users/(?P<id>[\w-]+)' + IE_NAME = 'veoh:user' + + _TESTS = [ + { + 'url': 'https://www.veoh.com/users/valentinazoe', + 'info_dict': { + 'id': 'valentinazoe', + 'title': 'valentinazoe (Uploads)' + }, + 'playlist_mincount': 75 + }, + { + 'url': 'https://www.veoh.com/users/PiensaLibre', + 'info_dict': { + 'id': 'PiensaLibre', + 'title': 'PiensaLibre (Uploads)' + }, + 'playlist_mincount': 2 + }] + + _PAGE_SIZE = 16 + + def _fetch_page(self, uploader, page): + response = self._download_json( + 'https://www.veoh.com/users/published/videos', uploader, + note=f'Downloading videos page {page + 1}', + headers={ + 'x-csrf-token': self._TOKEN, + 'content-type': 'application/json;charset=UTF-8' + }, + data=json.dumps({ + 'username': uploader, + 'maxResults': self._PAGE_SIZE, + 'page': page + 1, + 'requestName': 'userPage' + }).encode('utf-8')) + if not response.get('success'): + raise ExtractorError(response['message']) + + for video in response['videos']: + yield self.url_result(f'https://www.veoh.com/watch/{video["permalinkId"]}', VeohIE, + video['permalinkId'], video.get('title')) + + def _real_initialize(self): + webpage = self._download_webpage( + 'https://www.veoh.com', None, note='Downloading authorization token') + self._TOKEN = self._search_regex( + r'csrfToken:\s*(["\'])(?P<token>[0-9a-zA-Z]{40})\1', webpage, + 
'request token', group='token') + + def _real_extract(self, url): + uploader = self._match_id(url) + return self.playlist_result(OnDemandPagedList( + functools.partial(self._fetch_page, uploader), + self._PAGE_SIZE), uploader, f'{uploader} (Uploads)') diff --git a/hypervideo_dl/extractor/vesti.py b/hypervideo_dl/extractor/vesti.py index 002047d..e9731a9 100644 --- a/hypervideo_dl/extractor/vesti.py +++ b/hypervideo_dl/extractor/vesti.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/vevo.py b/hypervideo_dl/extractor/vevo.py index 8a0f292..da4ce49 100644 --- a/hypervideo_dl/extractor/vevo.py +++ b/hypervideo_dl/extractor/vevo.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re import json @@ -35,10 +33,125 @@ class VevoIE(VevoBaseIE): https?://cache\.vevo\.com/m/html/embed\.html\?video=| https?://videoplayer\.vevo\.com/embed/embedded\?videoId=| https?://embed\.vevo\.com/.*?[?&]isrc=| + https?://tv\.vevo\.com/watch/artist/(?:[^/]+)/| vevo:) (?P<id>[^&?#]+)''' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:cache\.)?vevo\.com/.+?)\1'] - _TESTS = [] + _TESTS = [{ + 'url': 'http://www.vevo.com/watch/hurts/somebody-to-die-for/GB1101300280', + 'md5': '95ee28ee45e70130e3ab02b0f579ae23', + 'info_dict': { + 'id': 'GB1101300280', + 'ext': 'mp4', + 'title': 'Hurts - Somebody to Die For', + 'timestamp': 1372057200, + 'upload_date': '20130624', + 'uploader': 'Hurts', + 'track': 'Somebody to Die For', + 'artist': 'Hurts', + 'genre': 'Pop', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'v3 SMIL format', + 'url': 'http://www.vevo.com/watch/cassadee-pope/i-wish-i-could-break-your-heart/USUV71302923', + 'md5': 'f6ab09b034f8c22969020b042e5ac7fc', + 'info_dict': { + 'id': 'USUV71302923', + 'ext': 'mp4', + 'title': 'Cassadee Pope - I Wish I Could Break Your Heart', + 'timestamp': 1392796919, + 'upload_date': '20140219', + 'uploader': 'Cassadee Pope', + 'track': 'I Wish I Could Break Your Heart', + 'artist': 'Cassadee Pope', + 'genre': 'Country', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'Age-limited video', + 'url': 'https://www.vevo.com/watch/justin-timberlake/tunnel-vision-explicit/USRV81300282', + 'info_dict': { + 'id': 'USRV81300282', + 'ext': 'mp4', + 'title': 'Justin Timberlake - Tunnel Vision (Explicit)', + 'age_limit': 18, + 'timestamp': 1372888800, + 'upload_date': '20130703', + 'uploader': 'Justin Timberlake', + 'track': 'Tunnel Vision (Explicit)', + 'artist': 'Justin Timberlake', + 'genre': 'Pop', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'No video_info', + 'url': 'http://www.vevo.com/watch/k-camp-1/Till-I-Die/USUV71503000', + 'md5': '8b83cc492d72fc9cf74a02acee7dc1b0', + 'info_dict': { + 'id': 'USUV71503000', + 'ext': 'mp4', + 'title': 'K Camp ft. T.I. - Till I Die', + 'age_limit': 18, + 'timestamp': 1449468000, + 'upload_date': '20151207', + 'uploader': 'K Camp', + 'track': 'Till I Die', + 'artist': 'K Camp', + 'genre': 'Hip-Hop', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'Featured test', + 'url': 'https://www.vevo.com/watch/lemaitre/Wait/USUV71402190', + 'md5': 'd28675e5e8805035d949dc5cf161071d', + 'info_dict': { + 'id': 'USUV71402190', + 'ext': 'mp4', + 'title': 'Lemaitre ft. 
LoLo - Wait', + 'age_limit': 0, + 'timestamp': 1413432000, + 'upload_date': '20141016', + 'uploader': 'Lemaitre', + 'track': 'Wait', + 'artist': 'Lemaitre', + 'genre': 'Electronic', + }, + 'expected_warnings': ['Unable to download SMIL file', 'Unable to download info'], + }, { + 'note': 'Only available via webpage', + 'url': 'http://www.vevo.com/watch/GBUV71600656', + 'md5': '67e79210613865b66a47c33baa5e37fe', + 'info_dict': { + 'id': 'GBUV71600656', + 'ext': 'mp4', + 'title': 'ABC - Viva Love', + 'age_limit': 0, + 'timestamp': 1461830400, + 'upload_date': '20160428', + 'uploader': 'ABC', + 'track': 'Viva Love', + 'artist': 'ABC', + 'genre': 'Pop', + }, + 'expected_warnings': ['Failed to download video versions info'], + }, { + # no genres available + 'url': 'http://www.vevo.com/watch/INS171400764', + 'only_matching': True, + }, { + # Another case available only via the webpage; using streams/streamsV3 formats + # Geo-restricted to Netherlands/Germany + 'url': 'http://www.vevo.com/watch/boostee/pop-corn-clip-officiel/FR1A91600909', + 'only_matching': True, + }, { + 'url': 'https://embed.vevo.com/?isrc=USH5V1923499&partnerId=4d61b777-8023-4191-9ede-497ed6c24647&partnerAdCode=', + 'only_matching': True, + }, { + 'url': 'https://tv.vevo.com/watch/artist/janet-jackson/US0450100550', + 'only_matching': True, + }] _VERSIONS = { 0: 'youtube', # only in AuthenticateVideo videoVersions 1: 'level3', @@ -140,6 +253,7 @@ class VevoIE(VevoBaseIE): fatal=False)) else: m = re.search(r'''(?xi) + _(?P<quality>[a-z0-9]+) _(?P<width>[0-9]+)x(?P<height>[0-9]+) _(?P<vcodec>[a-z0-9]+) _(?P<vbr>[0-9]+) @@ -151,7 +265,7 @@ class VevoIE(VevoBaseIE): formats.append({ 'url': version_url, - 'format_id': 'http-%s-%s' % (version, video_version['quality']), + 'format_id': f'http-{version}-{video_version.get("quality") or m.group("quality")}', 'vcodec': m.group('vcodec'), 'acodec': m.group('acodec'), 'vbr': int(m.group('vbr')), @@ -160,7 +274,6 @@ class VevoIE(VevoBaseIE): 'width': int(m.group('width')), 'height': int(m.group('height')), }) - self._sort_formats(formats) track = video_info['title'] if featured_artist: diff --git a/hypervideo_dl/extractor/vgtv.py b/hypervideo_dl/extractor/vgtv.py index 9d6090b..db338fa 100644 --- a/hypervideo_dl/extractor/vgtv.py +++ b/hypervideo_dl/extractor/vgtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -12,11 +9,12 @@ from ..utils import ( ) -class VGTVIE(XstreamIE): +class VGTVIE(XstreamIE): # XXX: Do not subclass from concrete IE IE_DESC = 'VGTV, BTTV, FTV, Aftenposten and Aftonbladet' _GEO_BYPASS = False _HOST_TO_APPNAME = { + 'tv.vg.no': 'vgtv', 'vgtv.no': 'vgtv', 'bt.no/tv': 'bttv', 'aftenbladet.no/tv': 'satv', @@ -130,6 +128,10 @@ class VGTVIE(XstreamIE): }, }, { + 'url': 'https://tv.vg.no/video/241779/politiets-ekstremkjoering', + 'only_matching': True, + }, + { 'url': 'http://www.bt.no/tv/#!/video/100250/norling-dette-er-forskjellen-paa-1-divisjon-og-eliteserien', 'only_matching': True, }, @@ -236,8 +238,6 @@ class VGTVIE(XstreamIE): raise self.raise_geo_restricted( countries=[host.rpartition('.')[-1].partition('/')[0].upper()]) - self._sort_formats(info['formats']) - info.update({ 'id': video_id, 'title': data['title'], diff --git a/hypervideo_dl/extractor/vh1.py b/hypervideo_dl/extractor/vh1.py index 862c5c7..41b8a46 100644 --- a/hypervideo_dl/extractor/vh1.py +++ b/hypervideo_dl/extractor/vh1.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .mtv import 
MTVServicesInfoExtractor # TODO Remove - Reason: Outdated Site diff --git a/hypervideo_dl/extractor/vice.py b/hypervideo_dl/extractor/vice.py index c8c3055..d1a3b48 100644 --- a/hypervideo_dl/extractor/vice.py +++ b/hypervideo_dl/extractor/vice.py @@ -1,11 +1,7 @@ -# coding: utf-8 -from __future__ import unicode_literals - import functools import hashlib import json import random -import re import time from .adobepass import AdobePassIE @@ -41,6 +37,7 @@ class ViceBaseIE(InfoExtractor): class ViceIE(ViceBaseIE, AdobePassIE): IE_NAME = 'vice' _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?vice(?:land|tv))\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]{24})' + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})'] _TESTS = [{ 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7', 'info_dict': { @@ -106,17 +103,6 @@ class ViceIE(ViceBaseIE, AdobePassIE): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]{24})', - webpage) - - @staticmethod - def _extract_url(webpage): - urls = ViceIE._extract_urls(webpage) - return urls[0] if urls else None - def _real_extract(self, url): locale, video_id = self._match_valid_url(url).groups() @@ -164,7 +150,6 @@ class ViceIE(ViceBaseIE, AdobePassIE): video_data = preplay['video'] formats = self._extract_m3u8_formats( preplay['playURL'], video_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) episode = video_data.get('episode') or {} channel = video_data.get('channel') or {} season = video_data.get('season') or {} diff --git a/hypervideo_dl/extractor/vidbit.py b/hypervideo_dl/extractor/vidbit.py index 91f45b7..2813032 100644 --- a/hypervideo_dl/extractor/vidbit.py +++ b/hypervideo_dl/extractor/vidbit.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urlparse from ..utils import ( diff --git a/hypervideo_dl/extractor/viddler.py b/hypervideo_dl/extractor/viddler.py index ecc4824..4091477 100644 --- a/hypervideo_dl/extractor/viddler.py +++ b/hypervideo_dl/extractor/viddler.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( float_or_none, @@ -10,6 +7,8 @@ from ..utils import ( class ViddlerIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?viddler\.com/(?:v|embed|player)/(?P<id>[a-z0-9]+)(?:.+?\bsecret=(\d+))?' 
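
# A recurring change in this update (vice, ustream, vbox7, videa, viddler):
# per-extractor _extract_url(s) helpers are replaced by a declarative
# _EMBED_REGEX list whose named 'url' group the framework scans webpages
# for. Simplified, self-contained illustration (the example.com pattern and
# HTML are invented):
import re

EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>https?://video\.example\.com/embed/[\da-f]{24})']

def extract_embed_urls(webpage):
    for pattern in EMBED_REGEX:
        for mobj in re.finditer(pattern, webpage):
            yield mobj.group('url')

html = '<iframe src="https://video.example.com/embed/58c69e38a55424f1227dc3f7">'
print(list(extract_embed_urls(html)))
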
+ _EMBED_REGEX = [r'<(?:iframe[^>]+?src|param[^>]+?value)=(["\'])(?P<url>(?:https?:)?//(?:www\.)?viddler\.com/(?:embed|player)/.+?)\1'] + _TESTS = [{ 'url': 'http://www.viddler.com/v/43903784', 'md5': '9eee21161d2c7f5b39690c3e325fab2f', @@ -117,7 +116,6 @@ class ViddlerIE(InfoExtractor): f['format_id'] = format_id + '-html5' f['source_preference'] = 0 formats.append(f) - self._sort_formats(formats) categories = [ t.get('text') for t in data.get('tags', []) if 'text' in t] diff --git a/hypervideo_dl/extractor/videa.py b/hypervideo_dl/extractor/videa.py index 90d7050..52fa8fc 100644 --- a/hypervideo_dl/extractor/videa.py +++ b/hypervideo_dl/extractor/videa.py @@ -1,11 +1,9 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random -import re import string +import struct from .common import InfoExtractor +from ..compat import compat_b64decode, compat_ord from ..utils import ( ExtractorError, int_or_none, @@ -17,11 +15,6 @@ from ..utils import ( xpath_element, xpath_text, ) -from ..compat import ( - compat_b64decode, - compat_ord, - compat_struct_pack, -) class VideaIE(InfoExtractor): @@ -35,6 +28,7 @@ class VideaIE(InfoExtractor): ) (?P<id>[^?#&]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1'] _TESTS = [{ 'url': 'http://videa.hu/videok/allatok/az-orult-kigyasz-285-kigyot-kigyo-8YfIAjxwWGwT8HVQ', 'md5': '97a7af41faeaffd9f1fc864a7c7e7603', @@ -81,12 +75,6 @@ class VideaIE(InfoExtractor): _STATIC_SECRET = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', - webpage)] - - @staticmethod def rc4(cipher_text, key): res = b'' @@ -105,7 +93,7 @@ class VideaIE(InfoExtractor): j = (j + S[i]) % 256 S[i], S[j] = S[j], S[i] k = S[(S[i] + S[j]) % 256] - res += compat_struct_pack('B', k ^ compat_ord(cipher_text[m])) + res += struct.pack('B', k ^ compat_ord(cipher_text[m])) return res.decode() @@ -179,7 +167,6 @@ class VideaIE(InfoExtractor): 'height': int_or_none(source.get('height')), }) formats.append(f) - self._sort_formats(formats) thumbnail = self._proto_relative_url(xpath_text(video, './poster_src')) diff --git a/hypervideo_dl/extractor/videocampus_sachsen.py b/hypervideo_dl/extractor/videocampus_sachsen.py index 96e9857..982ab3d 100644 --- a/hypervideo_dl/extractor/videocampus_sachsen.py +++ b/hypervideo_dl/extractor/videocampus_sachsen.py @@ -1,12 +1,80 @@ -# coding: utf-8 +import functools +import re + from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ExtractorError, OnDemandPagedList, urlencode_postdata class VideocampusSachsenIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://videocampus\.sachsen\.de/(?: + IE_NAME = 'ViMP' + _INSTANCES = ( + 'bergauf.tv', + 'campus.demo.vimp.com', + 'corporate.demo.vimp.com', + 'dancehalldatabase.com', + 'drehzahl.tv', + 'educhannel.hs-gesundheit.de', + 'emedia.ls.haw-hamburg.de', + 'globale-evolution.net', + 'hohu.tv', + 'htvideos.hightechhigh.org', + 'k210039.vimp.mivitec.net', + 'media.cmslegal.com', + 'media.hs-furtwangen.de', + 'media.hwr-berlin.de', + 'mediathek.dkfz.de', + 'mediathek.htw-berlin.de', + 'mediathek.polizei-bw.de', + 'medien.hs-merseburg.de', + 'mportal.europa-uni.de', + 'pacific.demo.vimp.com', + 'slctv.com', + 'streaming.prairiesouth.ca', + 'tube.isbonline.cn', + 'univideo.uni-kassel.de', + 'ursula2.genetics.emory.edu', + 
'ursulablicklevideoarchiv.com', + 'v.agrarumweltpaedagogik.at', + 'video.eplay-tv.de', + 'video.fh-dortmund.de', + 'video.hs-offenburg.de', + 'video.hs-pforzheim.de', + 'video.hspv.nrw.de', + 'video.irtshdf.fr', + 'video.pareygo.de', + 'video.tu-freiberg.de', + 'videocampus.sachsen.de', + 'videoportal.uni-freiburg.de', + 'videoportal.vm.uni-freiburg.de', + 'videos.duoc.cl', + 'videos.uni-paderborn.de', + 'vimp-bemus.udk-berlin.de', + 'vimp.aekwl.de', + 'vimp.hs-mittweida.de', + 'vimp.oth-regensburg.de', + 'vimp.ph-heidelberg.de', + 'vimp.sma-events.com', + 'vimp.weka-fachmedien.de', + 'webtv.univ-montp3.fr', + 'www.b-tu.de/media', + 'www.bergauf.tv', + 'www.bigcitytv.de', + 'www.cad-videos.de', + 'www.drehzahl.tv', + 'www.fh-bielefeld.de/medienportal', + 'www.hohu.tv', + 'www.orvovideo.com', + 'www.rwe.tv', + 'www.salzi.tv', + 'www.wenglor-media.com', + 'www2.univ-sba.dz', + ) + _VALID_URL = r'''(?x)https?://(?P<host>%s)/(?: m/(?P<tmp_id>[0-9a-f]+)| - (?:category/)?video/(?P<display_id>[\w-]+)/(?P<id>[0-9a-f]{32}) - )''' + (?:category/)?video/(?P<display_id>[\w-]+)/(?P<id>[0-9a-f]{32})| + media/embed.*(?:\?|&)key=(?P<embed_id>[0-9a-f]{32}&?) + )''' % ('|'.join(map(re.escape, _INSTANCES))) _TESTS = [ { @@ -14,6 +82,8 @@ class VideocampusSachsenIE(InfoExtractor): 'info_dict': { 'id': 'e6b9349905c1628631f175712250f2a1', 'title': 'Konstruktiver Entwicklungsprozess Vorlesung 7', + 'description': 'Konstruktiver Entwicklungsprozess Vorlesung 7', + 'thumbnail': 'https://videocampus.sachsen.de/cache/1a985379ad3aecba8097a6902c7daa4e.jpg', 'ext': 'mp4', }, }, @@ -22,6 +92,8 @@ class VideocampusSachsenIE(InfoExtractor): 'info_dict': { 'id': 'fc99c527e4205b121cb7c74433469262', 'title': 'Was ist selbstgesteuertes Lernen?', + 'description': 'md5:196aa3b0509a526db62f84679522a2f5', + 'thumbnail': 'https://videocampus.sachsen.de/cache/6f4a85096ba24cb398e6ce54446b57ae.jpg', 'display_id': 'Was-ist-selbstgesteuertes-Lernen', 'ext': 'mp4', }, @@ -31,66 +103,151 @@ class VideocampusSachsenIE(InfoExtractor): 'info_dict': { 'id': '09d4ed029002eb1bdda610f1103dd54c', 'title': 'Tutorial zur Nutzung von Adobe Connect aus Veranstalter-Sicht', + 'description': 'md5:3d379ca3cc17b9da6784d7f58cca4d58', + 'thumbnail': 'https://videocampus.sachsen.de/cache/2452498fe8c2d5a7dc79a05d30f407b6.jpg', 'display_id': 'Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht', 'ext': 'mp4', }, }, + { + 'url': 'https://www2.univ-sba.dz/video/Presentation-de-la-Faculte-de-droit-et-des-sciences-politiques-Journee-portes-ouvertes-202122/0183356e41af7bfb83d7667b20d9b6a3', + 'info_dict': { + 'url': 'https://www2.univ-sba.dz/getMedium/0183356e41af7bfb83d7667b20d9b6a3.mp4', + 'id': '0183356e41af7bfb83d7667b20d9b6a3', + 'title': 'Présentation de la Faculté de droit et des sciences politiques - Journée portes ouvertes 2021/22', + 'description': 'md5:508958bd93e0ca002ac731d94182a54f', + 'thumbnail': 'https://www2.univ-sba.dz/cache/4d5d4a0b4189271a8cc6cb5328e14769.jpg', + 'display_id': 'Presentation-de-la-Faculte-de-droit-et-des-sciences-politiques-Journee-portes-ouvertes-202122', + 'ext': 'mp4', + } + }, + { + 'url': 'https://vimp.weka-fachmedien.de/video/Preisverleihung-Produkte-des-Jahres-2022/c8816f1cc942c12b6cce57c835cffd7c', + 'info_dict': { + 'id': 'c8816f1cc942c12b6cce57c835cffd7c', + 'title': 'Preisverleihung »Produkte des Jahres 2022«', + 'description': 'md5:60c347568ca89aa25b772c4ea564ebd3', + 'thumbnail': 'https://vimp.weka-fachmedien.de/cache/da9f3090e9227b25beacf67ccf94de14.png', + 'display_id': 
'Preisverleihung-Produkte-des-Jahres-2022', + 'ext': 'mp4', + }, + }, + { + 'url': 'https://videocampus.sachsen.de/media/embed?key=fc99c527e4205b121cb7c74433469262', + 'info_dict': { + 'id': 'fc99c527e4205b121cb7c74433469262', + 'title': 'Was ist selbstgesteuertes Lernen?', + 'ext': 'mp4', + }, + }, ] def _real_extract(self, url): - video_id, tmp_id, display_id = self._match_valid_url(url).group('id', 'tmp_id', 'display_id') + host, video_id, tmp_id, display_id, embed_id = self._match_valid_url(url).group( + 'host', 'id', 'tmp_id', 'display_id', 'embed_id') webpage = self._download_webpage(url, video_id or tmp_id, fatal=False) or '' - if not tmp_id: - video_id = self._html_search_regex( - r'src="https?://videocampus\.sachsen\.de/media/embed\?key=([0-9a-f]+)&', + if not video_id: + video_id = embed_id or self._html_search_regex( + rf'src="https?://{host}/media/embed.*(?:\?|&)key=([0-9a-f]+)&?', webpage, 'video_id') - title = self._html_search_regex( - (r'<h1>(?P<content>[^<]+)</h1>', *self._meta_regex('title')), - webpage, 'title', group='content', fatal=False) + if not (display_id or tmp_id): + # Title, description from embedded page's meta wouldn't be correct + title = self._html_search_regex(r'<video-js[^>]* data-piwik-title="([^"<]+)"', webpage, 'title', fatal=False) + description = None + thumbnail = None + else: + title = self._html_search_meta(('og:title', 'twitter:title', 'title'), webpage, fatal=False) + description = self._html_search_meta( + ('og:description', 'twitter:description', 'description'), webpage, fatal=False) + thumbnail = self._html_search_meta(('og:image', 'twitter:image'), webpage, fatal=False) + + formats, subtitles = [], {} + try: + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + f'https://{host}/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8', + video_id, 'mp4', m3u8_id='hls', fatal=True) + except ExtractorError as e: + if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (404, 500): + raise - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8', - video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - self._sort_formats(formats) + formats.append({'url': f'https://{host}/getMedium/{video_id}.mp4'}) return { 'id': video_id, 'title': title, + 'description': description, + 'thumbnail': thumbnail, 'display_id': display_id, 'formats': formats, - 'subtitles': subtitles + 'subtitles': subtitles, } -class VideocampusSachsenEmbedIE(InfoExtractor): - _VALID_URL = r'https?://videocampus.sachsen.de/media/embed\?key=(?P<id>[0-9a-f]+)' +class ViMPPlaylistIE(InfoExtractor): + IE_NAME = 'ViMP:Playlist' + _VALID_URL = r'''(?x)(?P<host>https?://(?:%s))/(?: + album/view/aid/(?P<album_id>[0-9]+)| + (?P<mode>category|channel)/(?P<name>[\w-]+)/(?P<id>[0-9]+) + )''' % '|'.join(map(re.escape, VideocampusSachsenIE._INSTANCES)) - _TESTS = [ - { - 'url': 'https://videocampus.sachsen.de/media/embed?key=fc99c527e4205b121cb7c74433469262', - 'info_dict': { - 'id': 'fc99c527e4205b121cb7c74433469262', - 'title': 'Was ist selbstgesteuertes Lernen?', - 'ext': 'mp4', - }, - } - ] + _TESTS = [{ + 'url': 'https://vimp.oth-regensburg.de/channel/Designtheorie-1-SoSe-2020/3', + 'info_dict': { + 'id': 'channel-3', + 'title': 'Designtheorie 1 SoSe 2020 :: Channels :: ViMP OTH Regensburg', + }, + 'playlist_mincount': 9, + }, { + 'url': 'https://www.fh-bielefeld.de/medienportal/album/view/aid/208', + 'info_dict': { + 'id': 'album-208', + 
'title': 'KG Praktikum ABT/MEC :: Playlists :: FH-Medienportal', + }, + 'playlist_mincount': 4, + }, { + 'url': 'https://videocampus.sachsen.de/category/online-tutorials-onyx/91', + 'info_dict': { + 'id': 'category-91', + 'title': 'Online-Seminare ONYX - BPS - Bildungseinrichtungen - VCS', + }, + 'playlist_mincount': 7, + }] + _PAGE_SIZE = 10 + + def _fetch_page(self, host, url_part, id, data, page): + webpage = self._download_webpage( + f'{host}/media/ajax/component/boxList/{url_part}', id, + query={'page': page, 'page_only': 1}, data=urlencode_postdata(data)) + urls = re.findall(r'"([^"]+/video/[^"]+)"', webpage) + + for url in urls: + yield self.url_result(host + url, VideocampusSachsenIE) def _real_extract(self, url): - video_id = self._match_id(url) + host, album_id, mode, name, id = self._match_valid_url(url).group( + 'host', 'album_id', 'mode', 'name', 'id') - webpage = self._download_webpage(url, video_id) - title = self._html_search_regex(r'<img[^>]*title="([^"<]+)"', webpage, 'title', fatal=False) - formats, subtitles = self._extract_m3u8_formats_and_subtitles( - f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8', - video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - self._sort_formats(formats) + webpage = self._download_webpage(url, album_id or id, fatal=False) or '' + title = (self._html_search_meta('title', webpage, fatal=False) + or self._html_extract_title(webpage)) - return { - 'id': video_id, - 'title': title, - 'formats': formats, - 'subtitles': subtitles, + url_part = (f'aid/{album_id}' if album_id + else f'category/{name}/category_id/{id}' if mode == 'category' + else f'title/{name}/channel/{id}') + + mode = mode or 'album' + data = { + 'vars[mode]': mode, + f'vars[{mode}]': album_id or id, + 'vars[context]': '4' if album_id else '1' if mode == 'category' else '3', + 'vars[context_id]': album_id or id, + 'vars[layout]': 'thumb', + 'vars[per_page][thumb]': str(self._PAGE_SIZE), } + + return self.playlist_result( + OnDemandPagedList(functools.partial( + self._fetch_page, host, url_part, album_id or id, data), self._PAGE_SIZE), + playlist_title=title, id=f'{mode}-{album_id or id}') diff --git a/hypervideo_dl/extractor/videodetective.py b/hypervideo_dl/extractor/videodetective.py index fe70db7..7928a41 100644 --- a/hypervideo_dl/extractor/videodetective.py +++ b/hypervideo_dl/extractor/videodetective.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from .internetvideoarchive import InternetVideoArchiveIE diff --git a/hypervideo_dl/extractor/videofyme.py b/hypervideo_dl/extractor/videofyme.py index cd3f50a..1d1c8f7 100644 --- a/hypervideo_dl/extractor/videofyme.py +++ b/hypervideo_dl/extractor/videofyme.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/videomore.py b/hypervideo_dl/extractor/videomore.py index 17ef3b1..ddc33f7 100644 --- a/hypervideo_dl/extractor/videomore.py +++ b/hypervideo_dl/extractor/videomore.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import ( compat_str, @@ -50,6 +45,12 @@ class VideomoreIE(InfoExtractor): (?P<id>\d+) (?:[/?#&]|\.(?:xml|json)|$) ''' + _EMBED_REGEX = [r'''(?x) + (?: + <iframe[^>]+src=([\'"])| + <object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config= + )(?P<url>https?://videomore\.ru/[^?#"']+/\d+(?:\.xml)?) 
+ '''] _TESTS = [{ 'url': 'http://videomore.ru/kino_v_detalayah/5_sezon/367617', 'md5': '44455a346edc0d509ac5b5a5b531dc35', @@ -129,19 +130,6 @@ class VideomoreIE(InfoExtractor): }] _GEO_BYPASS = False - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<object[^>]+data=(["\'])https?://videomore\.ru/player\.swf\?.*config=(?P<url>https?://videomore\.ru/(?:[^/]+/)+\d+\.xml).*\1', - webpage) - if not mobj: - mobj = re.search( - r'<iframe[^>]+src=([\'"])(?P<url>https?://videomore\.ru/embed/\d+)', - webpage) - - if mobj: - return mobj.group('url') - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('sid') or mobj.group('id') @@ -193,7 +181,6 @@ class VideomoreIE(InfoExtractor): if error in ('Данное видео недоступно для просмотра на территории этой страны', 'Данное видео доступно для просмотра только на территории России'): self.raise_geo_restricted(countries=['RU'], metadata_available=True) self.raise_no_formats(error, expected=True) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/videopress.py b/hypervideo_dl/extractor/videopress.py index 6376ff0..0734aee 100644 --- a/hypervideo_dl/extractor/videopress.py +++ b/hypervideo_dl/extractor/videopress.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -20,6 +15,7 @@ class VideoPressIE(InfoExtractor): _ID_REGEX = r'[\da-zA-Z]{8}' _PATH_REGEX = r'video(?:\.word)?press\.com/embed/' _VALID_URL = r'https?://%s(?P<id>%s)' % (_PATH_REGEX, _ID_REGEX) + _EMBED_REGEX = [rf'<iframe[^>]+src=["\'](?P<url>(?:https?://)?{_PATH_REGEX}{_ID_REGEX})'] _TESTS = [{ 'url': 'https://videopress.com/embed/kUJmAcSf', 'md5': '706956a6c875873d51010921310e4bc6', @@ -42,12 +38,6 @@ class VideoPressIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\']((?:https?://)?%s%s)' % (VideoPressIE._PATH_REGEX, VideoPressIE._ID_REGEX), - webpage) - def _real_extract(self, url): video_id = self._match_id(url) @@ -86,7 +76,6 @@ class VideoPressIE(InfoExtractor): 'width': int_or_none(video.get('width')), 'height': int_or_none(video.get('height')), }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/vidio.py b/hypervideo_dl/extractor/vidio.py index 6bfb8d4..770aa28 100644 --- a/hypervideo_dl/extractor/vidio.py +++ b/hypervideo_dl/extractor/vidio.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( clean_html, @@ -71,10 +67,10 @@ class VidioBaseIE(InfoExtractor): class VidioIE(VidioBaseIE): - _VALID_URL = r'https?://(?:www\.)?vidio\.com/watch/(?P<id>\d+)-(?P<display_id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?vidio\.com/(watch|embed)/(?P<id>\d+)-(?P<display_id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015', - 'md5': 'cd2801394afc164e9775db6a140b91fe', + 'md5': 'abac81b1a205a8d94c609a473b5ea62a', 'info_dict': { 'id': '165683', 'display_id': 'dj_ambred-booyah-live-2015', @@ -93,7 +89,8 @@ class VidioIE(VidioBaseIE): 'view_count': int, 'dislike_count': int, 'comment_count': int, - 'tags': 'count:4', + 'tags': 'count:3', + 'uploader_url': 'https://www.vidio.com/@twelvepictures', }, }, { 'url': 'https://www.vidio.com/watch/77949-south-korea-test-fires-missile-that-can-strike-all-of-the-north', @@ -102,6 +99,30 @@ class 
VidioIE(VidioBaseIE): # Premier-exclusive video 'url': 'https://www.vidio.com/watch/1550718-stand-by-me-doraemon', 'only_matching': True + }, { + # embed url from https://enamplus.liputan6.com/read/5033648/video-fakta-temuan-suspek-cacar-monyet-di-jawa-tengah + 'url': 'https://www.vidio.com/embed/7115874-fakta-temuan-suspek-cacar-monyet-di-jawa-tengah', + 'info_dict': { + 'id': '7115874', + 'ext': 'mp4', + 'channel_id': '40172876', + 'comment_count': int, + 'uploader_id': 'liputan6', + 'view_count': int, + 'dislike_count': int, + 'upload_date': '20220804', + 'uploader': 'Liputan6.com', + 'display_id': 'fakta-temuan-suspek-cacar-monyet-di-jawa-tengah', + 'channel': 'ENAM PLUS 165', + 'timestamp': 1659605520, + 'title': 'Fakta Temuan Suspek Cacar Monyet di Jawa Tengah', + 'duration': 59, + 'like_count': int, + 'tags': ['monkeypox indonesia', 'cacar monyet menyebar', 'suspek cacar monyet di indonesia', 'fakta', 'hoax atau bukan?', 'jawa tengah'], + 'thumbnail': 'https://thumbor.prod.vidiocdn.com/83PN-_BKm5sS7emLtRxl506MLqQ=/640x360/filters:quality(70)/vidio-web-prod-video/uploads/video/image/7115874/fakta-suspek-cacar-monyet-di-jawa-tengah-24555a.jpg', + 'uploader_url': 'https://www.vidio.com/@liputan6', + 'description': 'md5:6d595a18d3b19ee378e335a6f288d5ac', + }, }] def _real_extract(self, url): @@ -135,8 +156,6 @@ class VidioIE(VidioBaseIE): formats, subs = self._extract_m3u8_formats_and_subtitles( hls_url, display_id, 'mp4', 'm3u8_native') - self._sort_formats(formats) - get_first = lambda x: try_get(data, lambda y: y[x + 's'][0], dict) or {} channel = get_first('channel') user = get_first('user') @@ -156,7 +175,7 @@ class VidioIE(VidioBaseIE): 'uploader': user.get('name'), 'timestamp': parse_iso8601(video.get('created_at')), 'uploader_id': username, - 'uploader_url': format_field(username, template='https://www.vidio.com/@%s'), + 'uploader_url': format_field(username, None, 'https://www.vidio.com/@%s'), 'channel': channel.get('name'), 'channel_id': str_or_none(channel.get('id')), 'view_count': get_count('view_count'), @@ -272,7 +291,6 @@ class VidioLiveIE(VidioBaseIE): if stream_meta.get('stream_url'): formats.extend(self._extract_m3u8_formats( stream_meta['stream_url'], display_id, 'mp4', 'm3u8_native')) - self._sort_formats(formats) return { 'id': video_id, @@ -287,5 +305,5 @@ class VidioLiveIE(VidioBaseIE): 'uploader': user.get('name'), 'timestamp': parse_iso8601(stream_meta.get('start_time')), 'uploader_id': username, - 'uploader_url': format_field(username, template='https://www.vidio.com/@%s'), + 'uploader_url': format_field(username, None, 'https://www.vidio.com/@%s'), } diff --git a/hypervideo_dl/extractor/vidlii.py b/hypervideo_dl/extractor/vidlii.py index a63919f..5933783 100644 --- a/hypervideo_dl/extractor/vidlii.py +++ b/hypervideo_dl/extractor/vidlii.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -80,7 +77,6 @@ class VidLiiIE(InfoExtractor): 'format_id': f'{height}p', 'height': height, }) - self._sort_formats(formats) title = self._search_regex( (r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage, @@ -103,7 +99,7 @@ class VidLiiIE(InfoExtractor): uploader = self._search_regex( r'<div[^>]+class=["\']wt_person[^>]+>\s*<a[^>]+\bhref=["\']/user/[^>]+>([^<]+)', webpage, 'uploader', fatal=False) - uploader_url = format_field(uploader, template='https://www.vidlii.com/user/%s') + uploader_url = format_field(uploader, None, 'https://www.vidlii.com/user/%s') upload_date = 
unified_strdate(self._html_search_meta( 'datePublished', webpage, default=None) or self._search_regex( diff --git a/hypervideo_dl/extractor/vidme.py b/hypervideo_dl/extractor/vidme.py deleted file mode 100644 index 174e69c..0000000 --- a/hypervideo_dl/extractor/vidme.py +++ /dev/null @@ -1,295 +0,0 @@ -from __future__ import unicode_literals - -import itertools - -from .common import InfoExtractor -from ..compat import compat_HTTPError -from ..utils import ( - ExtractorError, - int_or_none, - float_or_none, - parse_iso8601, - url_or_none, -) - - -class VidmeIE(InfoExtractor): - IE_NAME = 'vidme' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{,5})(?:[^\da-zA-Z]|$)' - _TESTS = [{ - 'url': 'https://vid.me/QNB', - 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', - 'info_dict': { - 'id': 'QNB', - 'ext': 'mp4', - 'title': 'Fishing for piranha - the easy way', - 'description': 'source: https://www.facebook.com/photo.php?v=312276045600871', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1406313244, - 'upload_date': '20140725', - 'age_limit': 0, - 'duration': 119.92, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - }, { - 'url': 'https://vid.me/Gc6M', - 'md5': 'f42d05e7149aeaec5c037b17e5d3dc82', - 'info_dict': { - 'id': 'Gc6M', - 'ext': 'mp4', - 'title': 'O Mere Dil ke chain - Arnav and Khushi VM', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1441211642, - 'upload_date': '20150902', - 'uploader': 'SunshineM', - 'uploader_id': '3552827', - 'age_limit': 0, - 'duration': 223.72, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - # tests uploader field - 'url': 'https://vid.me/4Iib', - 'info_dict': { - 'id': '4Iib', - 'ext': 'mp4', - 'title': 'The Carver', - 'description': 'md5:e9c24870018ae8113be936645b93ba3c', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1433203629, - 'upload_date': '20150602', - 'uploader': 'Thomas', - 'uploader_id': '109747', - 'age_limit': 0, - 'duration': 97.859999999999999, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - # nsfw test from http://naked-yogi.tumblr.com/post/118312946248/naked-smoking-stretching - 'url': 'https://vid.me/e/Wmur', - 'info_dict': { - 'id': 'Wmur', - 'ext': 'mp4', - 'title': 'naked smoking & stretching', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1430931613, - 'upload_date': '20150506', - 'uploader': 'naked-yogi', - 'uploader_id': '1638622', - 'age_limit': 18, - 'duration': 653.26999999999998, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - 'params': { - 'skip_download': True, - }, - }, { - # nsfw, user-disabled - 'url': 'https://vid.me/dzGJ', - 'only_matching': True, - }, { - # suspended - 'url': 'https://vid.me/Ox3G', - 'only_matching': True, - }, { - # deleted - 'url': 'https://vid.me/KTPm', - 'only_matching': True, - }, { - # no formats in the API response - 'url': 'https://vid.me/e5g', - 'info_dict': { - 'id': 'e5g', - 'ext': 'mp4', - 'title': 'Video upload (e5g)', - 'thumbnail': r're:^https?://.*\.jpg', - 'timestamp': 1401480195, - 'upload_date': '20140530', - 'uploader': None, - 'uploader_id': None, - 'age_limit': 0, - 'duration': 483, - 'view_count': int, - 'like_count': int, - 'comment_count': int, - }, - 'params': { - 'skip_download': True, - }, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - try: - response = self._download_json( - 'https://api.vid.me/videoByUrl/%s' % 
video_id, video_id) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - response = self._parse_json(e.cause.read(), video_id) - else: - raise - - error = response.get('error') - if error: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error), expected=True) - - video = response['video'] - - if video.get('state') == 'deleted': - raise ExtractorError( - 'Vidme said: Sorry, this video has been deleted.', - expected=True) - - if video.get('state') in ('user-disabled', 'suspended'): - raise ExtractorError( - 'Vidme said: This video has been suspended either due to a copyright claim, ' - 'or for violating the terms of use.', - expected=True) - - formats = [] - for f in video.get('formats', []): - format_url = url_or_none(f.get('uri')) - if not format_url: - continue - format_type = f.get('type') - if format_type == 'dash': - formats.extend(self._extract_mpd_formats( - format_url, video_id, mpd_id='dash', fatal=False)) - elif format_type == 'hls': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - else: - formats.append({ - 'format_id': f.get('type'), - 'url': format_url, - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'preference': 0 if f.get('type', '').endswith( - 'clip') else 1, - }) - - if not formats and video.get('complete_url'): - formats.append({ - 'url': video.get('complete_url'), - 'width': int_or_none(video.get('width')), - 'height': int_or_none(video.get('height')), - }) - - self._sort_formats(formats) - - title = video['title'] - description = video.get('description') - thumbnail = video.get('thumbnail_url') - timestamp = parse_iso8601(video.get('date_created'), ' ') - uploader = video.get('user', {}).get('username') - uploader_id = video.get('user', {}).get('user_id') - age_limit = 18 if video.get('nsfw') is True else 0 - duration = float_or_none(video.get('duration')) - view_count = int_or_none(video.get('view_count')) - like_count = int_or_none(video.get('likes_count')) - comment_count = int_or_none(video.get('comment_count')) - - return { - 'id': video_id, - 'title': title or 'Video upload (%s)' % video_id, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'age_limit': age_limit, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'comment_count': comment_count, - 'formats': formats, - } - - -class VidmeListBaseIE(InfoExtractor): - # Max possible limit according to https://docs.vid.me/#api-Videos-List - _LIMIT = 100 - - def _entries(self, user_id, user_name): - for page_num in itertools.count(1): - page = self._download_json( - 'https://api.vid.me/videos/%s?user=%s&limit=%d&offset=%d' - % (self._API_ITEM, user_id, self._LIMIT, (page_num - 1) * self._LIMIT), - user_name, 'Downloading user %s page %d' % (self._API_ITEM, page_num)) - - videos = page.get('videos', []) - if not videos: - break - - for video in videos: - video_url = video.get('full_url') or video.get('embed_url') - if video_url: - yield self.url_result(video_url, VidmeIE.ie_key()) - - total = int_or_none(page.get('page', {}).get('total')) - if total and self._LIMIT * page_num >= total: - break - - def _real_extract(self, url): - user_name = self._match_id(url) - - user_id = self._download_json( - 'https://api.vid.me/userByUsername?username=%s' % user_name, - user_name)['user']['user_id'] - - return 
self.playlist_result( - self._entries(user_id, user_name), user_id, - '%s - %s' % (user_name, self._TITLE)) - - -class VidmeUserIE(VidmeListBaseIE): - IE_NAME = 'vidme:user' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})(?!/likes)(?:[^\da-zA-Z_-]|$)' - _API_ITEM = 'list' - _TITLE = 'Videos' - _TESTS = [{ - 'url': 'https://vid.me/MasakoX', - 'info_dict': { - 'id': '16112341', - 'title': 'MasakoX - %s' % _TITLE, - }, - 'playlist_mincount': 191, - }, { - 'url': 'https://vid.me/unsQuare_netWork', - 'only_matching': True, - }] - - -class VidmeUserLikesIE(VidmeListBaseIE): - IE_NAME = 'vidme:user:likes' - _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})/likes' - _API_ITEM = 'likes' - _TITLE = 'Likes' - _TESTS = [{ - 'url': 'https://vid.me/ErinAlexis/likes', - 'info_dict': { - 'id': '6483530', - 'title': 'ErinAlexis - %s' % _TITLE, - }, - 'playlist_mincount': 415, - }, { - 'url': 'https://vid.me/Kaleidoscope-Ish/likes', - 'only_matching': True, - }] diff --git a/hypervideo_dl/extractor/vidzi.py b/hypervideo_dl/extractor/vidzi.py deleted file mode 100644 index 42ea495..0000000 --- a/hypervideo_dl/extractor/vidzi.py +++ /dev/null @@ -1,68 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - decode_packed_codes, - js_to_json, - NO_DEFAULT, - PACKED_CODES_RE, -) - - -class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc|si|nu)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' - _TESTS = [{ - 'url': 'http://vidzi.tv/cghql9yq6emu.html', - 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', - 'info_dict': { - 'id': 'cghql9yq6emu', - 'ext': 'mp4', - 'title': 'youtube-dl test video 1\\\\2\'3/4<5\\\\6ä7↭', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - }, { - 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.cc/cghql9yq6emu.html', - 'only_matching': True, - }, { - 'url': 'https://vidzi.si/rph9gztxj1et.html', - 'only_matching': True, - }, { - 'url': 'http://vidzi.nu/cghql9yq6emu.html', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://vidzi.tv/%s' % video_id, video_id) - title = self._html_search_regex( - r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - - codes = [webpage] - codes.extend([ - decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') - for mobj in re.finditer(PACKED_CODES_RE, webpage)]) - for num, code in enumerate(codes, 1): - jwplayer_data = self._parse_json( - self._search_regex( - r'setup\(([^)]+)\)', code, 'jwplayer data', - default=NO_DEFAULT if num == len(codes) else '{}'), - video_id, transform_source=lambda s: js_to_json( - re.sub(r'\s*\+\s*window\[.+?\]', '', s))) - if jwplayer_data: - break - - info_dict = self._parse_jwplayer_data(jwplayer_data, video_id, require_title=False) - info_dict['title'] = title - - return info_dict diff --git a/hypervideo_dl/extractor/vier.py b/hypervideo_dl/extractor/vier.py deleted file mode 100644 index 94aa350..0000000 --- a/hypervideo_dl/extractor/vier.py +++ /dev/null @@ -1,264 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re -import itertools - -from .common import InfoExtractor -from ..utils import ( - urlencode_postdata, - int_or_none, - unified_strdate, -) - - -class VierIE(InfoExtractor): - IE_NAME = 'vier' - IE_DESC = 'vier.be and vijf.be' - _VALID_URL = r'''(?x) - https?:// - 
(?:www\.)?(?P<site>vier|vijf)\.be/ - (?: - (?: - [^/]+/videos| - video(?:/[^/]+)* - )/ - (?P<display_id>[^/]+)(?:/(?P<id>\d+))?| - (?: - video/v3/embed| - embed/video/public - )/(?P<embed_id>\d+) - ) - ''' - _NETRC_MACHINE = 'vier' - _TESTS = [{ - 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', - 'md5': 'e4ae2054a6b040ef1e289e20d111b46e', - 'info_dict': { - 'id': '16129', - 'display_id': 'het-wordt-warm-de-moestuin', - 'ext': 'mp4', - 'title': 'Het wordt warm in De Moestuin', - 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', - 'upload_date': '20121025', - 'series': 'Plan B', - 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'], - }, - }, { - 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', - 'info_dict': { - 'id': '2561614', - 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', - 'ext': 'mp4', - 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', - 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', - 'upload_date': '20170228', - 'series': 'Temptation Island', - 'tags': list, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', - 'info_dict': { - 'id': '2674839', - 'display_id': 'jani-gaat-naar-tokio-aflevering-4', - 'ext': 'mp4', - 'title': 'Jani gaat naar Tokio - Aflevering 4', - 'description': 'md5:aa8d611541db6ae9e863125704511f88', - 'upload_date': '20170501', - 'series': 'Jani gaat', - 'episode_number': 4, - 'tags': ['Jani Gaat', 'Volledige Aflevering'], - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'Requires account credentials', - }, { - # Requires account credentials but bypassed extraction via v3/embed page - # without metadata - 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', - 'info_dict': { - 'id': '2674839', - 'display_id': 'jani-gaat-naar-tokio-aflevering-4', - 'ext': 'mp4', - 'title': 'jani-gaat-naar-tokio-aflevering-4', - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Log in to extract metadata'], - }, { - # Without video id in URL - 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b', - 'only_matching': True, - }, { - 'url': 'http://www.vier.be/video/v3/embed/16129', - 'only_matching': True, - }, { - 'url': 'https://www.vijf.be/embed/video/public/4093', - 'only_matching': True, - }, { - 'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics', - 'only_matching': True, - }, { - 'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6', - 'only_matching': True, - }] - - def _real_initialize(self): - self._logged_in = False - - def _login(self, site): - username, password = self._get_login_info() - if username is None or password is None: - return - - login_page = self._download_webpage( - 'http://www.%s.be/user/login' % site, - None, note='Logging in', errnote='Unable to log in', - data=urlencode_postdata({ - 'form_id': 'user_login', - 'name': username, - 'pass': password, - }), - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - login_error = self._html_search_regex( - r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<', - login_page, 'login error', default=None) - if login_error: - self.report_warning('Unable to log in: %s' % login_error) - else: - self._logged_in = True - - def _real_extract(self, url): 
- mobj = self._match_valid_url(url) - embed_id = mobj.group('embed_id') - display_id = mobj.group('display_id') or embed_id - video_id = mobj.group('id') or embed_id - site = mobj.group('site') - - if not self._logged_in: - self._login(site) - - webpage = self._download_webpage(url, display_id) - - if r'id="user-login"' in webpage: - self.report_warning( - 'Log in to extract metadata', video_id=display_id) - webpage = self._download_webpage( - 'http://www.%s.be/video/v3/embed/%s' % (site, video_id), - display_id) - - video_id = self._search_regex( - [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], - webpage, 'video id', default=video_id or display_id) - - playlist_url = self._search_regex( - r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1', - webpage, 'm3u8 url', default=None, group='url') - - if not playlist_url: - application = self._search_regex( - [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], - webpage, 'application', default=site + '_vod') - filename = self._search_regex( - [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], - webpage, 'filename') - playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) - - formats = self._extract_wowza_formats( - playlist_url, display_id, skip_protocols=['dash']) - self._sort_formats(formats) - - title = self._og_search_title(webpage, default=display_id) - description = self._html_search_regex( - r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>', - webpage, 'description', default=None, group='value') - thumbnail = self._og_search_thumbnail(webpage, default=None) - upload_date = unified_strdate(self._html_search_regex( - r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})', - webpage, 'upload date', default=None, group='value')) - - series = self._search_regex( - r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, - 'series', default=None, group='value') - episode_number = int_or_none(self._search_regex( - r'(?i)aflevering (\d+)', title, 'episode number', default=None)) - tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage) - - return { - 'id': video_id, - 'display_id': display_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, - 'series': series, - 'episode_number': episode_number, - 'tags': tags, - 'formats': formats, - } - - -class VierVideosIE(InfoExtractor): - IE_NAME = 'vier:videos' - _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?P<program>[^/]+)/videos(?:\?.*\bpage=(?P<page>\d+)|$)' - _TESTS = [{ - 'url': 'http://www.vier.be/demoestuin/videos', - 'info_dict': { - 'id': 'demoestuin', - }, - 'playlist_mincount': 153, - }, { - 'url': 'http://www.vijf.be/temptationisland/videos', - 'info_dict': { - 'id': 'temptationisland', - }, - 'playlist_mincount': 159, - }, { - 'url': 'http://www.vier.be/demoestuin/videos?page=6', - 'info_dict': { - 'id': 'demoestuin-page6', - }, - 'playlist_mincount': 20, - }, { - 'url': 'http://www.vier.be/demoestuin/videos?page=7', - 'info_dict': { - 'id': 'demoestuin-page7', - }, - 'playlist_mincount': 13, - }] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - program = mobj.group('program') - site = mobj.group('site') - - page_id = mobj.group('page') - if page_id: - page_id = int(page_id) - start_page = page_id - playlist_id = '%s-page%d' % (program, page_id) - else: - start_page = 0 - playlist_id 
= program - - entries = [] - for current_page_id in itertools.count(start_page): - current_page = self._download_webpage( - 'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id), - program, - 'Downloading page %d' % (current_page_id + 1)) - page_entries = [ - self.url_result('http://www.' + site + '.be' + video_url, 'Vier') - for video_url in re.findall( - r'<h[23]><a href="(/[^/]+/videos/[^/]+(?:/\d+)?)">', current_page)] - entries.extend(page_entries) - if page_id or '>Meer<' not in current_page: - break - - return self.playlist_result(entries, playlist_id) diff --git a/hypervideo_dl/extractor/viewlift.py b/hypervideo_dl/extractor/viewlift.py index 4627f66..3812601 100644 --- a/hypervideo_dl/extractor/viewlift.py +++ b/hypervideo_dl/extractor/viewlift.py @@ -1,7 +1,4 @@ -from __future__ import unicode_literals - import json -import re from .common import InfoExtractor from ..compat import compat_HTTPError @@ -65,6 +62,7 @@ class ViewLiftBaseIE(InfoExtractor): class ViewLiftEmbedIE(ViewLiftBaseIE): IE_NAME = 'viewlift:embed' _VALID_URL = r'https?://(?:(?:www|embed)\.)?(?P<domain>%s)/embed/player\?.*\bfilmId=(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' % ViewLiftBaseIE._DOMAINS_REGEX + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX] _TESTS = [{ 'url': 'http://embed.snagfilms.com/embed/player?filmId=74849a00-85a9-11e1-9660-123139220831&w=500', 'md5': '2924e9215c6eff7a55ed35b72276bd93', @@ -91,14 +89,6 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:embed\.)?(?:%s)/embed/player.+?)\1' % ViewLiftBaseIE._DOMAINS_REGEX, - webpage) - if mobj: - return mobj.group('url') - def _real_extract(self, url): domain, film_id = self._match_valid_url(url).groups() site = domain.split('.')[-2] @@ -144,7 +134,6 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): 'url': sub_url, }) - self._sort_formats(formats) return { 'id': film_id, 'title': title, diff --git a/hypervideo_dl/extractor/viidea.py b/hypervideo_dl/extractor/viidea.py index 0da0681..4cdf267 100644 --- a/hypervideo_dl/extractor/viidea.py +++ b/hypervideo_dl/extractor/viidea.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -160,7 +158,6 @@ class ViideaIE(InfoExtractor): smil_url = '%s/%s/video/%s/smil.xml' % (base_url, lecture_slug, part_id) smil = self._download_smil(smil_url, lecture_id) info = self._parse_smil(smil, smil_url, lecture_id) - self._sort_formats(info['formats']) info['id'] = lecture_id if not multipart else '%s_part%s' % (lecture_id, part_id) info['display_id'] = lecture_slug if not multipart else '%s_part%s' % (lecture_slug, part_id) if multipart: diff --git a/hypervideo_dl/extractor/viki.py b/hypervideo_dl/extractor/viki.py index 8a93079..3246dab 100644 --- a/hypervideo_dl/extractor/viki.py +++ b/hypervideo_dl/extractor/viki.py @@ -1,5 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals import hashlib import hmac import json @@ -265,7 +263,6 @@ class VikiIE(VikiBaseIE): # Modify the URL to get 1080p mpd_url = mpd_url.replace('mpdhd', 'mpdhd_high') formats = self._extract_mpd_formats(mpd_url, video_id) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/vimeo.py b/hypervideo_dl/extractor/vimeo.py index 4f025a5..516b76d 100644 --- a/hypervideo_dl/extractor/vimeo.py +++ 
b/hypervideo_dl/extractor/vimeo.py @@ -1,14 +1,11 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 import functools import re import itertools +import urllib.error from .common import InfoExtractor from ..compat import ( - compat_kwargs, compat_HTTPError, compat_str, compat_urlparse, @@ -34,7 +31,6 @@ from ..utils import ( unsmuggle_url, urlencode_postdata, urljoin, - unescapeHTML, urlhandle_detect_ext, ) @@ -44,6 +40,18 @@ class VimeoBaseInfoExtractor(InfoExtractor): _LOGIN_REQUIRED = False _LOGIN_URL = 'https://vimeo.com/log_in' + @staticmethod + def _smuggle_referrer(url, referrer_url): + return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) + + def _unsmuggle_headers(self, url): + """@returns (url, smuggled_data, headers)""" + url, data = unsmuggle_url(url, {}) + headers = self.get_param('http_headers').copy() + if 'http_headers' in data: + headers.update(data['http_headers']) + return url, data, headers + def _perform_login(self, username, password): webpage = self._download_webpage( self._LOGIN_URL, None, 'Downloading login page') @@ -109,21 +117,16 @@ class VimeoBaseInfoExtractor(InfoExtractor): def _extract_vimeo_config(self, webpage, video_id, *args, **kwargs): vimeo_config = self._search_regex( r'vimeo\.config\s*=\s*(?:({.+?})|_extend\([^,]+,\s+({.+?})\));', - webpage, 'vimeo config', *args, **compat_kwargs(kwargs)) + webpage, 'vimeo config', *args, **kwargs) if vimeo_config: return self._parse_json(vimeo_config, video_id) def _set_vimeo_cookie(self, name, value): self._set_cookie('vimeo.com', name, value) - def _vimeo_sort_formats(self, formats): - # Note: Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps - # at the same time without actual units specified. - self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source')) - def _parse_config(self, config, video_id): video_data = config['video'] - video_title = video_data['title'] + video_title = video_data.get('title') live_event = video_data.get('live_event') or {} is_live = live_event.get('status') == 'started' request = config.get('request') or {} @@ -235,6 +238,9 @@ class VimeoBaseInfoExtractor(InfoExtractor): 'formats': formats, 'subtitles': subtitles, 'is_live': is_live, + # Note: Bitrates are completely broken. Single m3u8 may contain entries in kbps and bps + # at the same time without actual units specified. + '_format_sort_fields': ('quality', 'res', 'fps', 'hdr:12', 'source'), } def _extract_original_format(self, url, video_id, unlisted_hash=None): @@ -306,7 +312,7 @@ class VimeoIE(VimeoBaseInfoExtractor): ) \. )? - vimeo(?:pro)?\.com/ + vimeo\.com/ (?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/) (?:[^/]+/)*? 
(?: @@ -320,6 +326,14 @@ class VimeoIE(VimeoBaseInfoExtractor): /?(?:[?&].*)?(?:[#].*)?$ ''' IE_NAME = 'vimeo' + _EMBED_REGEX = [ + # iframe + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1', + # Embedded (swf embed) Vimeo player + r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1', + # Non-standard embedded Vimeo player + r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1', + ] _TESTS = [ { 'url': 'http://vimeo.com/56015672#at=0', @@ -343,31 +357,6 @@ class VimeoIE(VimeoBaseInfoExtractor): 'skip': 'No longer available' }, { - 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', - 'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82', - 'note': 'Vimeo Pro video (#1197)', - 'info_dict': { - 'id': '68093876', - 'ext': 'mp4', - 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/openstreetmapus', - 'uploader_id': 'openstreetmapus', - 'uploader': 'OpenStreetMap US', - 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', - 'description': 'md5:2c362968038d4499f4d79f88458590c1', - 'duration': 1595, - 'upload_date': '20130610', - 'timestamp': 1370893156, - 'license': 'by', - 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960', - 'view_count': int, - 'comment_count': int, - 'like_count': int, - }, - 'params': { - 'format': 'best[protocol=https]', - }, - }, - { 'url': 'http://player.vimeo.com/video/54469442', 'md5': 'b3e7f4d2cbb53bd7dc3bb6ff4ed5cfbd', 'note': 'Videos that embed the url in the player page', @@ -721,33 +710,14 @@ class VimeoIE(VimeoBaseInfoExtractor): # vimeo embed with check-password page protected by Referer header ] - @staticmethod - def _smuggle_referrer(url, referrer_url): - return smuggle_url(url, {'http_headers': {'Referer': referrer_url}}) - - @staticmethod - def _extract_urls(url, webpage): - urls = [] - # Look for embedded (iframe) Vimeo player - for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1', - webpage): - urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url)) - PLAIN_EMBED_RE = ( - # Look for embedded (swf embed) Vimeo player - r'<embed[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)\1', - # Look more for non-standard embedded Vimeo player - r'<video[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)\1', - ) - for embed_re in PLAIN_EMBED_RE: - for mobj in re.finditer(embed_re, webpage): - urls.append(mobj.group('url')) - return urls + @classmethod + def _extract_embed_urls(cls, url, webpage): + for embed_url in super()._extract_embed_urls(url, webpage): + yield cls._smuggle_referrer(embed_url, url) - @staticmethod - def _extract_url(url, webpage): - urls = VimeoIE._extract_urls(url, webpage) - return urls[0] if urls else None + @classmethod + def _extract_url(cls, url, webpage): + return next(cls._extract_embed_urls(url, webpage), None) def _verify_player_video_password(self, url, video_id, headers): password = self._get_video_password() @@ -758,8 +728,8 @@ class VimeoIE(VimeoBaseInfoExtractor): 'Content-Type': 'application/x-www-form-urlencoded', }) checked = self._download_json( - url + '/check-password', video_id, - 'Verifying the password', data=data, headers=headers) + f'{compat_urlparse.urlsplit(url)._replace(query=None).geturl()}/check-password', + video_id, 'Verifying the password', data=data, headers=headers) if checked is 
False: raise ExtractorError('Wrong video password', expected=True) return checked @@ -780,7 +750,6 @@ class VimeoIE(VimeoBaseInfoExtractor): }) info = self._parse_config(self._download_json( video['config_url'], video_id), video_id) - self._vimeo_sort_formats(info['formats']) get_timestamp = lambda x: parse_iso8601(video.get(x + '_time')) info.update({ 'description': video.get('description'), @@ -834,10 +803,7 @@ class VimeoIE(VimeoBaseInfoExtractor): raise def _real_extract(self, url): - url, data = unsmuggle_url(url, {}) - headers = self.get_param('http_headers').copy() - if 'http_headers' in data: - headers.update(data['http_headers']) + url, data, headers = self._unsmuggle_headers(url) if 'Referer' not in headers: headers['Referer'] = url @@ -847,15 +813,7 @@ class VimeoIE(VimeoBaseInfoExtractor): if unlisted_hash: return self._extract_from_api(video_id, unlisted_hash) - orig_url = url - is_pro = 'vimeopro.com/' in url - if is_pro: - # some videos require portfolio_id to be present in player url - # https://github.com/ytdl-org/youtube-dl/issues/20070 - url = self._extract_url(url, self._download_webpage(url, video_id)) - if not url: - url = 'https://vimeo.com/' + video_id - elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): + if any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')): url = 'https://vimeo.com/' + video_id self._try_album_password(url) @@ -877,13 +835,11 @@ class VimeoIE(VimeoBaseInfoExtractor): if '://player.vimeo.com/video/' in url: config = self._parse_json(self._search_regex( - r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) + r'\b(?:playerC|c)onfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id) if config.get('view') == 4: config = self._verify_player_video_password( redirect_url, video_id, headers) - info = self._parse_config(config, video_id) - self._vimeo_sort_formats(info['formats']) - return info + return self._parse_config(config, video_id) if re.search(r'<form[^>]+?id="pw_form"', webpage): video_password = self._get_video_password() @@ -959,14 +915,6 @@ class VimeoIE(VimeoBaseInfoExtractor): video_description = self._html_search_meta( ['description', 'og:description', 'twitter:description'], webpage, default=None) - if not video_description and is_pro: - orig_webpage = self._download_webpage( - orig_url, video_id, - note='Downloading webpage for description', - fatal=False) - if orig_webpage: - video_description = self._html_search_meta( - 'description', orig_webpage, default=None) if not video_description: self.report_warning('Cannot find video description') @@ -988,7 +936,7 @@ class VimeoIE(VimeoBaseInfoExtractor): info_dict_config = self._parse_config(config, video_id) formats.extend(info_dict_config['formats']) - self._vimeo_sort_formats(formats) + info_dict['_format_sort_fields'] = info_dict_config['_format_sort_fields'] json_ld = self._search_json_ld(webpage, video_id, default={}) @@ -1011,7 +959,7 @@ class VimeoIE(VimeoBaseInfoExtractor): return merge_dicts(info_dict, info_dict_config, json_ld) -class VimeoOndemandIE(VimeoIE): +class VimeoOndemandIE(VimeoIE): # XXX: Do not subclass from concrete IE IE_NAME = 'vimeo:ondemand' _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P<id>[^/?#&]+)' _TESTS = [{ @@ -1136,9 +1084,9 @@ class VimeoChannelIE(VimeoBaseInfoExtractor): return self._extract_videos(channel_id, self._BASE_URL_TEMPL % channel_id) -class VimeoUserIE(VimeoChannelIE): +class VimeoUserIE(VimeoChannelIE): # XXX: Do not subclass from concrete IE IE_NAME = 'vimeo:user' - 
_VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<id>[^/]+)(?:/videos|[#?]|$)' + _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<id>[^/]+)(?:/videos)?/?(?:$|[?#])' _TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>' _TESTS = [{ 'url': 'https://vimeo.com/nkistudio/videos', @@ -1147,6 +1095,9 @@ class VimeoUserIE(VimeoChannelIE): 'id': 'nkistudio', }, 'playlist_mincount': 66, + }, { + 'url': 'https://vimeo.com/nkistudio/', + 'only_matching': True, }] _BASE_URL_TEMPL = 'https://vimeo.com/%s' @@ -1243,7 +1194,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor): entries, album_id, album.get('name'), album.get('description')) -class VimeoGroupsIE(VimeoChannelIE): +class VimeoGroupsIE(VimeoChannelIE): # XXX: Do not subclass from concrete IE IE_NAME = 'vimeo:group' _VALID_URL = r'https://vimeo\.com/groups/(?P<id>[^/]+)(?:/(?!videos?/\d+)|$)' _TESTS = [{ @@ -1330,14 +1281,13 @@ class VimeoReviewIE(VimeoBaseInfoExtractor): page_url + '/action', video_id) if source_format: info_dict['formats'].append(source_format) - self._vimeo_sort_formats(info_dict['formats']) info_dict['description'] = clean_html(clip_data.get('description')) return info_dict -class VimeoWatchLaterIE(VimeoChannelIE): +class VimeoWatchLaterIE(VimeoChannelIE): # XXX: Do not subclass from concrete IE IE_NAME = 'vimeo:watchlater' - IE_DESC = 'Vimeo watch later list, "vimeowatchlater" keyword (requires authentication)' + IE_DESC = 'Vimeo watch later list, ":vimeowatchlater" keyword (requires authentication)' _VALID_URL = r'https://vimeo\.com/(?:home/)?watchlater|:vimeowatchlater' _TITLE = 'Watch Later' _LOGIN_REQUIRED = True @@ -1358,7 +1308,7 @@ class VimeoWatchLaterIE(VimeoChannelIE): return self._extract_videos('watchlater', 'https://vimeo.com/watchlater') -class VimeoLikesIE(VimeoChannelIE): +class VimeoLikesIE(VimeoChannelIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'https://(?:www\.)?vimeo\.com/(?P<id>[^/]+)/likes/?(?:$|[?#]|sort:)' IE_NAME = 'vimeo:likes' IE_DESC = 'Vimeo user likes' @@ -1385,21 +1335,107 @@ class VimeoLikesIE(VimeoChannelIE): class VHXEmbedIE(VimeoBaseInfoExtractor): IE_NAME = 'vhx:embed' _VALID_URL = r'https?://embed\.vhx\.tv/videos/(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://embed\.vhx\.tv/videos/\d+[^"]*)"'] - @staticmethod - def _extract_url(webpage): - mobj = re.search( - r'<iframe[^>]+src="(https?://embed\.vhx\.tv/videos/\d+[^"]*)"', webpage) - return unescapeHTML(mobj.group(1)) if mobj else None + @classmethod + def _extract_embed_urls(cls, url, webpage): + for embed_url in super()._extract_embed_urls(url, webpage): + yield cls._smuggle_referrer(embed_url, url) def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + url, _, headers = self._unsmuggle_headers(url) + webpage = self._download_webpage(url, video_id, headers=headers) config_url = self._parse_json(self._search_regex( r'window\.OTTData\s*=\s*({.+})', webpage, 'ott data'), video_id, js_to_json)['config_url'] config = self._download_json(config_url, video_id) info = self._parse_config(config, video_id) info['id'] = video_id - self._vimeo_sort_formats(info['formats']) return info + + +class VimeoProIE(VimeoBaseInfoExtractor): + IE_NAME = 'vimeo:pro' + _VALID_URL = r'https?://(?:www\.)?vimeopro\.com/[^/?#]+/(?P<slug>[^/?#]+)(?:(?:/videos?/(?P<id>[0-9]+)))?' 
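For reference — this aside is not part of the commit — a minimal sketch of what the new VimeoPro pattern captures, using the two test URLs listed just below (the 'id' group is optional, so it may be None):

    import re

    # Same pattern as the VimeoProIE._VALID_URL added above
    VIMEO_PRO_RE = r'https?://(?:www\.)?vimeopro\.com/[^/?#]+/(?P<slug>[^/?#]+)(?:(?:/videos?/(?P<id>[0-9]+)))?'

    # Portfolio URL carrying an explicit video ID: both groups populate
    m = re.match(VIMEO_PRO_RE, 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876')
    assert m.group('slug', 'id') == ('state-of-the-map-us-2013', '68093876')

    # Page URL without a video ID: 'id' stays None, so _real_extract must
    # fall back to locating the embedded Vimeo player in the page
    m = re.match(VIMEO_PRO_RE, 'https://vimeopro.com/cadfem/simulation-conference-mechanische-systeme-in-perfektion')
    assert m.group('slug', 'id') == ('simulation-conference-mechanische-systeme-in-perfektion', None)
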
+ _TESTS = [{ + # Vimeo URL derived from video_id + 'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876', + 'md5': '3b5ca6aa22b60dfeeadf50b72e44ed82', + 'note': 'Vimeo Pro video (#1197)', + 'info_dict': { + 'id': '68093876', + 'ext': 'mp4', + 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/openstreetmapus', + 'uploader_id': 'openstreetmapus', + 'uploader': 'OpenStreetMap US', + 'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography', + 'description': 'md5:2c362968038d4499f4d79f88458590c1', + 'duration': 1595, + 'upload_date': '20130610', + 'timestamp': 1370893156, + 'license': 'by', + 'thumbnail': 'https://i.vimeocdn.com/video/440260469-19b0d92fca3bd84066623b53f1eb8aaa3980c6c809e2d67b6b39ab7b4a77a344-d_960', + 'view_count': int, + 'comment_count': int, + 'like_count': int, + 'tags': 'count:1', + }, + 'params': { + 'format': 'best[protocol=https]', + }, + }, { + # password-protected VimeoPro page with Vimeo player embed + 'url': 'https://vimeopro.com/cadfem/simulation-conference-mechanische-systeme-in-perfektion', + 'info_dict': { + 'id': '764543723', + 'ext': 'mp4', + 'title': 'Mechanische Systeme in Perfektion: Realität erfassen, Innovation treiben', + 'thumbnail': 'https://i.vimeocdn.com/video/1543784598-a1a750494a485e601110136b9fe11e28c2131942452b3a5d30391cb3800ca8fd-d_1280', + 'description': 'md5:2a9d195cd1b0f6f79827107dc88c2420', + 'uploader': 'CADFEM', + 'uploader_id': 'cadfem', + 'uploader_url': 'https://vimeo.com/cadfem', + 'duration': 12505, + 'chapters': 'count:10', + }, + 'params': { + 'videopassword': 'Conference2022', + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + display_id, video_id = self._match_valid_url(url).group('slug', 'id') + if video_id: + display_id = video_id + webpage = self._download_webpage(url, display_id) + + password_form = self._search_regex( + r'(?is)<form[^>]+?method=["\']post["\'][^>]*>(.+?password.+?)</form>', + webpage, 'password form', default=None) + if password_form: + try: + webpage = self._download_webpage(url, display_id, data=urlencode_postdata({ + 'password': self._get_video_password(), + **self._hidden_inputs(password_form), + }), note='Logging in with video password') + except ExtractorError as e: + if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 418: + raise ExtractorError('Wrong video password', expected=True) + raise + + description = None + # even if we have video_id, some videos require player URL with portfolio_id query param + # https://github.com/ytdl-org/youtube-dl/issues/20070 + vimeo_url = VimeoIE._extract_url(url, webpage) + if vimeo_url: + description = self._html_search_meta('description', webpage, default=None) + elif video_id: + vimeo_url = f'https://vimeo.com/{video_id}' + else: + raise ExtractorError( + 'No Vimeo embed or video ID could be found in VimeoPro page', expected=True) + + return self.url_result(vimeo_url, VimeoIE, video_id, url_transparent=True, + description=description) diff --git a/hypervideo_dl/extractor/vimm.py b/hypervideo_dl/extractor/vimm.py index 060b92b..7097149 100644 --- a/hypervideo_dl/extractor/vimm.py +++ b/hypervideo_dl/extractor/vimm.py @@ -1,4 +1,3 @@ -# coding: utf-8 from .common import InfoExtractor @@ -24,7 +23,6 @@ class VimmIE(InfoExtractor): formats, subs = self._extract_m3u8_formats_and_subtitles( f'https://www.vimm.tv/hls/{channel_id}.m3u8', channel_id, 'mp4', m3u8_id='hls', live=True) - self._sort_formats(formats) return { 'id': channel_id, @@ -57,7 +55,6 @@ class VimmRecordingIE(InfoExtractor): 
formats, subs = self._extract_m3u8_formats_and_subtitles( f'https://d211qfrkztakg3.cloudfront.net/{channel_id}/{video_id}/index.m3u8', video_id, 'mp4', m3u8_id='hls', live=False) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/vimple.py b/hypervideo_dl/extractor/vimple.py index c74b437..fdccf46 100644 --- a/hypervideo_dl/extractor/vimple.py +++ b/hypervideo_dl/extractor/vimple.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import int_or_none @@ -15,7 +13,6 @@ class SprutoBaseIE(InfoExtractor): formats = [{ 'url': f['url'], } for f in playlist['video']] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/vine.py b/hypervideo_dl/extractor/vine.py index e59b103..1909980 100644 --- a/hypervideo_dl/extractor/vine.py +++ b/hypervideo_dl/extractor/vine.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -14,6 +10,7 @@ from ..utils import ( class VineIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vine\.co/(?:v|oembed)/(?P<id>\w+)' + _EMBED_REGEX = [r'<iframe[^>]+src=[\'"](?P<url>(?:https?:)?//(?:www\.)?vine\.co/v/[^/]+/embed/(?:simple|postcard))'] _TESTS = [{ 'url': 'https://vine.co/v/b9KOOWX7HUx', 'md5': '2f36fed6235b16da96ce9b4dc890940d', @@ -89,11 +86,10 @@ class VineIE(InfoExtractor): 'quality': quality, }) self._check_formats(formats, video_id) - self._sort_formats(formats) username = data.get('username') - alt_title = format_field(username, template='Vine by %s') + alt_title = format_field(username, None, 'Vine by %s') return { 'id': video_id, diff --git a/hypervideo_dl/extractor/viqeo.py b/hypervideo_dl/extractor/viqeo.py index be7dfa8..79b9f29 100644 --- a/hypervideo_dl/extractor/viqeo.py +++ b/hypervideo_dl/extractor/viqeo.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -20,6 +15,7 @@ class ViqeoIE(InfoExtractor): ) (?P<id>[\da-f]+) ''' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1'] _TESTS = [{ 'url': 'https://cdn.viqeo.tv/embed/?vid=cde96f09d25f39bee837', 'md5': 'a169dd1a6426b350dca4296226f21e76', @@ -38,14 +34,6 @@ class ViqeoIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//cdn\.viqeo\.tv/embed/*\?.*?\bvid=[\da-f]+.*?)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -86,7 +74,6 @@ class ViqeoIE(InfoExtractor): 'vcodec': 'none' if is_audio else None, }) formats.append(f) - self._sort_formats(formats) duration = int_or_none(data.get('duration')) diff --git a/hypervideo_dl/extractor/viu.py b/hypervideo_dl/extractor/viu.py index 3cfca89..b183c88 100644 --- a/hypervideo_dl/extractor/viu.py +++ b/hypervideo_dl/extractor/viu.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import json import uuid @@ -89,7 +86,6 @@ class ViuIE(ViuBaseIE): # r'\1whe\2', video_data['href']) m3u8_url = video_data['href'] formats, subtitles = self._extract_m3u8_formats_and_subtitles(m3u8_url, video_id, 'mp4') - self._sort_formats(formats) for key, value in video_data.items(): mobj = re.match(r'^subtitle_(?P<lang>[^_]+)_(?P<ext>(vtt|srt))', key) 
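Among the ViuOTT changes in the hunks that follow, the subtitle loop is reworked to emit two tracks per language: spoken text from 'url' and on-screen text from 'second_subtitle_url', the latter filed under a separate '<lang>_ost' key so both stay selectable. A standalone sketch of that mapping, not part of the commit itself — the sample payload is hypothetical, merely shaped like the video_data['subtitle'] list the extractor consumes:

    def map_subtitles(subtitle_list):
        # Mirrors the logic added below: one entry per track, with
        # on-screen text kept under its own pseudo-language key.
        subtitles = {}
        for sub in subtitle_list or []:
            lang = sub.get('name') or 'und'
            if sub.get('url'):
                subtitles.setdefault(lang, []).append({
                    'url': sub['url'],
                    'ext': 'srt',
                    'name': f'Spoken text for {lang}',
                })
            if sub.get('second_subtitle_url'):
                subtitles.setdefault(f'{lang}_ost', []).append({
                    'url': sub['second_subtitle_url'],
                    'ext': 'srt',
                    'name': f'On-screen text for {lang}',
                })
        return subtitles

    # Hypothetical payload shaped like video_data['subtitle']:
    tracks = map_subtitles([{
        'name': 'zh-hk',
        'url': 'https://example.com/spoken.srt',
        'second_subtitle_url': 'https://example.com/onscreen.srt',
    }])
    assert set(tracks) == {'zh-hk', 'zh-hk_ost'}
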
@@ -167,12 +163,17 @@ class ViuOTTIE(InfoExtractor): }, 'skip': 'Geo-restricted to Singapore', }, { - 'url': 'http://www.viu.com/ott/hk/zh-hk/vod/7123/%E5%A4%A7%E4%BA%BA%E5%A5%B3%E5%AD%90', + 'url': 'https://www.viu.com/ott/hk/zh-hk/vod/430078/%E7%AC%AC%E5%85%AD%E6%84%9F-3', 'info_dict': { - 'id': '7123', + 'id': '430078', 'ext': 'mp4', - 'title': '這就是我的生活之道', - 'description': 'md5:4eb0d8b08cf04fcdc6bbbeb16043434f', + 'title': '大韓民國的1%', + 'description': 'md5:74d6db47ddd9ddb9c89a05739103ccdb', + 'episode_number': 1, + 'duration': 6614, + 'episode': '大韓民國的1%', + 'series': '第六感 3', + 'thumbnail': 'https://d2anahhhmp1ffz.cloudfront.net/1313295781/d2b14f48d008ef2f3a9200c98d8e9b63967b9cc2', }, 'params': { 'skip_download': 'm3u8 download', @@ -180,11 +181,12 @@ class ViuOTTIE(InfoExtractor): }, 'skip': 'Geo-restricted to Hong Kong', }, { - 'url': 'https://www.viu.com/ott/hk/zh-hk/vod/68776/%E6%99%82%E5%B0%9A%E5%AA%BD%E5%92%AA', - 'playlist_count': 12, + 'url': 'https://www.viu.com/ott/hk/zh-hk/vod/444666/%E6%88%91%E7%9A%84%E5%AE%A4%E5%8F%8B%E6%98%AF%E4%B9%9D%E5%B0%BE%E7%8B%90', + 'playlist_count': 16, 'info_dict': { - 'id': '3916', - 'title': '時尚媽咪', + 'id': '23807', + 'title': '我的室友是九尾狐', + 'description': 'md5:b42c95f2b4a316cdd6ae14ca695f33b9', }, 'params': { 'skip_download': 'm3u8 download', @@ -362,17 +364,22 @@ class ViuOTTIE(InfoExtractor): 'ext': 'mp4', 'filesize': try_get(stream_data, lambda x: x['size'][vid_format], int) }) - self._sort_formats(formats) subtitles = {} for sub in video_data.get('subtitle') or []: - sub_url = sub.get('url') - if not sub_url: - continue - subtitles.setdefault(sub.get('name'), []).append({ - 'url': sub_url, - 'ext': 'srt', - }) + lang = sub.get('name') or 'und' + if sub.get('url'): + subtitles.setdefault(lang, []).append({ + 'url': sub['url'], + 'ext': 'srt', + 'name': f'Spoken text for {lang}', + }) + if sub.get('second_subtitle_url'): + subtitles.setdefault(f'{lang}_ost', []).append({ + 'url': sub['second_subtitle_url'], + 'ext': 'srt', + 'name': f'On-screen text for {lang}', + }) title = strip_or_none(video_data.get('synopsis')) return { diff --git a/hypervideo_dl/extractor/vk.py b/hypervideo_dl/extractor/vk.py index cbc3159..347aa38 100644 --- a/hypervideo_dl/extractor/vk.py +++ b/hypervideo_dl/extractor/vk.py @@ -1,14 +1,17 @@ -# coding: utf-8 -from __future__ import unicode_literals - import collections +import hashlib import re from .common import InfoExtractor +from .dailymotion import DailymotionIE +from .odnoklassniki import OdnoklassnikiIE +from .pladform import PladformIE +from .vimeo import VimeoIE +from .youtube import YoutubeIE from ..compat import compat_urlparse from ..utils import ( - clean_html, ExtractorError, + clean_html, get_element_by_class, int_or_none, orderedSet, @@ -16,19 +19,29 @@ from ..utils import ( str_to_int, unescapeHTML, unified_timestamp, + update_url_query, url_or_none, urlencode_postdata, ) -from .dailymotion import DailymotionIE -from .odnoklassniki import OdnoklassnikiIE -from .pladform import PladformIE -from .vimeo import VimeoIE -from .youtube import YoutubeIE class VKBaseIE(InfoExtractor): _NETRC_MACHINE = 'vk' + def _download_webpage_handle(self, url_or_request, video_id, *args, fatal=True, **kwargs): + response = super()._download_webpage_handle(url_or_request, video_id, *args, fatal=fatal, **kwargs) + challenge_url, cookie = response[1].geturl() if response else '', None + if challenge_url.startswith('https://vk.com/429.html?'): + cookie = self._get_cookies(challenge_url).get('hash429') + if not cookie: + 
return response + + hash429 = hashlib.md5(cookie.value.encode('ascii')).hexdigest() + self._request_webpage( + update_url_query(challenge_url, {'key': hash429}), video_id, fatal=fatal, + note='Resolving WAF challenge', errnote='Failed to bypass WAF challenge') + return super()._download_webpage_handle(url_or_request, video_id, *args, fatal=True, **kwargs) + def _perform_login(self, username, password): login_page, url_handle = self._download_webpage_handle( 'https://vk.com', None, 'Downloading login page') @@ -54,11 +67,14 @@ class VKBaseIE(InfoExtractor): 'Unable to login, incorrect username and/or password', expected=True) def _download_payload(self, path, video_id, data, fatal=True): + endpoint = f'https://vk.com/{path}.php' data['al'] = 1 code, payload = self._download_json( - 'https://vk.com/%s.php' % path, video_id, - data=urlencode_postdata(data), fatal=fatal, - headers={'X-Requested-With': 'XMLHttpRequest'})['payload'] + endpoint, video_id, data=urlencode_postdata(data), fatal=fatal, + headers={ + 'Referer': endpoint, + 'X-Requested-With': 'XMLHttpRequest', + })['payload'] if code == '3': self.raise_login_required() elif code == '8': @@ -69,6 +85,7 @@ class VKBaseIE(InfoExtractor): class VKIE(VKBaseIE): IE_NAME = 'vk' IE_DESC = 'VK' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1'] _VALID_URL = r'''(?x) https?:// (?: @@ -84,20 +101,25 @@ class VKIE(VKBaseIE): (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>([\da-f]+)|(ln-[\da-zA-Z]+)))? ) ''' + # https://help.sibnet.ru/?sibnet_video_embed + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1'] _TESTS = [ { 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', - 'md5': '7babad3b85ea2e91948005b1b8b0cb84', 'info_dict': { 'id': '-77521_162222515', 'ext': 'mp4', 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', - 'uploader_id': '-77521', + 'uploader_id': '39545378', 'duration': 195, 'timestamp': 1329049880, 'upload_date': '20120212', + 'comment_count': int, + 'like_count': int, + 'thumbnail': r're:https?://.+\.jpg$', }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'http://vk.com/video205387401_165548505', @@ -110,12 +132,14 @@ class VKIE(VKBaseIE): 'duration': 9, 'timestamp': 1374364108, 'upload_date': '20130720', + 'comment_count': int, + 'like_count': int, + 'thumbnail': r're:https?://.+\.jpg$', } }, { 'note': 'Embedded video', 'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa', - 'md5': '7babad3b85ea2e91948005b1b8b0cb84', 'info_dict': { 'id': '-77521_162222515', 'ext': 'mp4', @@ -124,8 +148,10 @@ class VKIE(VKBaseIE): 'duration': 195, 'upload_date': '20120212', 'timestamp': 1329049880, - 'uploader_id': '-77521', + 'uploader_id': '39545378', + 'thumbnail': r're:https?://.+\.jpg$', }, + 'params': {'skip_download': 'm3u8'}, }, { # VIDEO NOW REMOVED @@ -179,8 +205,13 @@ class VKIE(VKBaseIE): 'ext': 'mp4', 'title': '8 серия (озвучка)', 'duration': 8383, + 'comment_count': int, + 'uploader': 'Dizi2021', + 'like_count': int, + 'timestamp': 1640162189, 'upload_date': '20211222', - 'view_count': int, + 'uploader_id': '-93049196', + 'thumbnail': r're:https?://.+\.jpg$', }, }, { @@ -207,10 +238,23 @@ class VKIE(VKBaseIE): 'title': "DSWD Awards 'Children's Joy Foundation, Inc.' 
Certificate of Registration and License to Operate", 'description': 'md5:bf9c26cfa4acdfb146362682edd3827a', 'duration': 178, - 'upload_date': '20130116', + 'upload_date': '20130117', 'uploader': "Children's Joy Foundation Inc.", 'uploader_id': 'thecjf', 'view_count': int, + 'channel_id': 'UCgzCNQ11TmR9V97ECnhi3gw', + 'availability': 'public', + 'like_count': int, + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel': 'Children\'s Joy Foundation Inc.', + 'uploader_url': 'http://www.youtube.com/user/thecjf', + 'thumbnail': r're:https?://.+\.jpg$', + 'tags': 'count:27', + 'start_time': 0.0, + 'categories': ['Nonprofits & Activism'], + 'channel_url': 'https://www.youtube.com/channel/UCgzCNQ11TmR9V97ECnhi3gw', + 'age_limit': 0, }, }, { @@ -226,9 +270,7 @@ class VKIE(VKBaseIE): 'uploader_id': 'x1p5vl5', 'timestamp': 1473877246, }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Removed' }, { # video key is extra_data not url\d+ @@ -243,9 +285,7 @@ class VKIE(VKBaseIE): 'timestamp': 1454859345, 'upload_date': '20160207', }, - 'params': { - 'skip_download': True, - }, + 'skip': 'Removed', }, { # finished live stream, postlive_mp4 @@ -256,11 +296,12 @@ class VKIE(VKBaseIE): 'title': 'ИгроМир 2016 День 1 — Игромания Утром', 'uploader': 'Игромания', 'duration': 5239, - # TODO: use act=show to extract view_count - # 'view_count': int, 'upload_date': '20160929', 'uploader_id': '-387766', 'timestamp': 1475137527, + 'thumbnail': r're:https?://.+\.jpg$', + 'comment_count': int, + 'like_count': int, }, 'params': { 'skip_download': True, @@ -306,13 +347,6 @@ class VKIE(VKBaseIE): 'only_matching': True, }] - @staticmethod - def _extract_sibnet_urls(webpage): - # https://help.sibnet.ru/?sibnet_video_embed - return [unescapeHTML(mobj.group('url')) for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//video\.sibnet\.ru/shell\.php\?.*?\bvideoid=\d+.*?)\1', - webpage)] - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('videoid') @@ -320,7 +354,7 @@ class VKIE(VKBaseIE): mv_data = {} if video_id: data = { - 'act': 'show_inline', + 'act': 'show', 'video': video_id, } # Some videos (removed?) 
can only be downloaded with list id specified @@ -413,17 +447,17 @@ class VKIE(VKBaseIE): m_rutube.group(1).replace('\\', '')) return self.url_result(rutube_url) - dailymotion_urls = DailymotionIE._extract_urls(info_page) - if dailymotion_urls: - return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key()) + dailymotion_url = next(DailymotionIE._extract_embed_urls(url, info_page), None) + if dailymotion_url: + return self.url_result(dailymotion_url, DailymotionIE.ie_key()) odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page) if odnoklassniki_url: return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key()) - sibnet_urls = self._extract_sibnet_urls(info_page) - if sibnet_urls: - return self.url_result(sibnet_urls[0]) + sibnet_url = next(self._extract_embed_urls(url, info_page), None) + if sibnet_url: + return self.url_result(sibnet_url) m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page) if m_opts: @@ -473,7 +507,6 @@ class VKIE(VKBaseIE): 'url': format_url, 'ext': 'flv', }) - self._sort_formats(formats) subtitles = {} for sub in data.get('subs') or {}: @@ -502,7 +535,7 @@ class VKIE(VKBaseIE): class VKUserVideosIE(VKBaseIE): IE_NAME = 'vk:uservideos' IE_DESC = "VK - User's Videos" - _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/video/@(?P<id>[^?$#/&]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/video/(?:playlist/)?(?P<id>[^?$#/&]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)' _TEMPLATE_URL = 'https://vk.com/videos' _TESTS = [{ 'url': 'https://vk.com/video/@mobidevices', @@ -516,6 +549,13 @@ class VKUserVideosIE(VKBaseIE): 'id': '-17892518_uploaded', }, 'playlist_mincount': 182, + }, { + 'url': 'https://vk.com/video/playlist/-174476437_2', + 'info_dict': { + 'id': '-174476437_2', + 'title': 'Анонсы' + }, + 'playlist_mincount': 108, }] _VIDEO = collections.namedtuple('Video', ['owner_id', 'id']) @@ -550,11 +590,19 @@ class VKUserVideosIE(VKBaseIE): def _real_extract(self, url): u_id, section = self._match_valid_url(url).groups() webpage = self._download_webpage(url, u_id) - page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id') + + if u_id.startswith('@'): + page_id = self._search_regex(r'data-owner-id\s?=\s?"([^"]+)"', webpage, 'page_id') + elif '_' in u_id: + page_id, section = u_id.split('_', 1) + else: + raise ExtractorError('Invalid URL', expected=True) + if not section: section = 'all' - return self.playlist_result(self._entries(page_id, section), '%s_%s' % (page_id, section)) + playlist_title = clean_html(get_element_by_class('VideoInfoPanel__title', webpage)) + return self.playlist_result(self._entries(page_id, section), '%s_%s' % (page_id, section), playlist_title) class VKWallPostIE(VKBaseIE): @@ -593,7 +641,6 @@ class VKWallPostIE(VKBaseIE): }], 'params': { 'skip_download': True, - 'usenetrc': True, }, 'skip': 'Requires vk account credentials', }, { @@ -604,9 +651,6 @@ class VKWallPostIE(VKBaseIE): 'title': 'Сергей Горбунов - Wall post 85155021_6319', }, 'playlist_count': 1, - 'params': { - 'usenetrc': True, - }, 'skip': 'Requires vk account credentials', }, { # wall page URL diff --git a/hypervideo_dl/extractor/vlive.py b/hypervideo_dl/extractor/vlive.py index ae35c97..e2fd393 100644 --- a/hypervideo_dl/extractor/vlive.py +++ b/hypervideo_dl/extractor/vlive.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools import json @@ -16,6 +13,7 @@ from ..utils import ( merge_dicts, str_or_none, 
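The VKBaseIE hunk above deserves a gloss: vk.com's rate-limiting WAF answers some requests with a redirect to vk.com/429.html plus a `hash429` cookie, and it lets the client through only after the MD5 hex digest of that cookie value is echoed back in a `key` query parameter, at which point the overridden `_download_webpage_handle()` retries the original request. The same handshake in isolation (an illustrative sketch using the `requests` library rather than the extractor plumbing; the URL is only an example):

    import hashlib

    import requests

    ORIG_URL = 'https://vk.com/video_ext.php?oid=-77521&id=162222515'
    session = requests.Session()
    resp = session.get(ORIG_URL)
    if resp.url.startswith('https://vk.com/429.html?'):
        # Echo md5(hash429) back as ?key=... to clear the challenge,
        # then repeat the request that was originally blocked.
        key = hashlib.md5(session.cookies['hash429'].encode('ascii')).hexdigest()
        session.get(resp.url, params={'key': key})
        resp = session.get(ORIG_URL)
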
strip_or_none, + traverse_obj, try_get, urlencode_postdata, url_or_none, @@ -84,6 +82,13 @@ class VLiveIE(VLiveBaseIE): 'upload_date': '20150817', 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', 'timestamp': 1439816449, + 'like_count': int, + 'channel': 'Girl\'s Day', + 'channel_id': 'FDF27', + 'comment_count': int, + 'release_timestamp': 1439818140, + 'release_date': '20150817', + 'duration': 1014, }, 'params': { 'skip_download': True, @@ -101,6 +106,13 @@ class VLiveIE(VLiveBaseIE): 'upload_date': '20161112', 'thumbnail': r're:^https?://.*\.(?:jpg|png)$', 'timestamp': 1478923074, + 'like_count': int, + 'channel': 'EXO', + 'channel_id': 'F94BD', + 'comment_count': int, + 'release_timestamp': 1478924280, + 'release_date': '20161112', + 'duration': 906, }, 'params': { 'skip_download': True, @@ -172,6 +184,7 @@ class VLiveIE(VLiveBaseIE): 'like_count': int_or_none(video.get('likeCount')), 'comment_count': int_or_none(video.get('commentCount')), 'timestamp': int_or_none(video.get('createdAt'), scale=1000), + 'release_timestamp': int_or_none(traverse_obj(video, 'onAirStartAt', 'willStartAt'), scale=1000), 'thumbnail': video.get('thumb'), } @@ -195,7 +208,6 @@ class VLiveIE(VLiveBaseIE): 'old/v3/live/%s/playInfo', video_id)['result']['adaptiveStreamUrl'] formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4') - self._sort_formats(formats) info = get_common_fields() info.update({ 'title': video['title'], @@ -273,7 +285,6 @@ class VLivePostIE(VLiveBaseIE): 'url': f_url, 'height': int_or_none(f_id[:-1]), }) - self._sort_formats(formats) entry = { 'formats': formats, 'id': video_id, diff --git a/hypervideo_dl/extractor/vodlocker.py b/hypervideo_dl/extractor/vodlocker.py index 02c9617..1c7236e 100644 --- a/hypervideo_dl/extractor/vodlocker.py +++ b/hypervideo_dl/extractor/vodlocker.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, diff --git a/hypervideo_dl/extractor/vodpl.py b/hypervideo_dl/extractor/vodpl.py index 9e91970..8af1572 100644 --- a/hypervideo_dl/extractor/vodpl.py +++ b/hypervideo_dl/extractor/vodpl.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .onet import OnetBaseIE diff --git a/hypervideo_dl/extractor/vodplatform.py b/hypervideo_dl/extractor/vodplatform.py index 74d2257..5ff0500 100644 --- a/hypervideo_dl/extractor/vodplatform.py +++ b/hypervideo_dl/extractor/vodplatform.py @@ -1,12 +1,10 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import unescapeHTML class VODPlatformIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/(?P<id>[^/?#]+)' + _EMBED_REGEX = [r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:(?:www\.)?vod-platform\.net|embed\.kwikmotion\.com)/[eE]mbed/.+?)\1'] _TESTS = [{ # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar 'url': 'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw', @@ -30,7 +28,6 @@ class VODPlatformIE(InfoExtractor): formats = self._extract_wowza_formats( hidden_inputs.get('HiddenmyhHlsLink') or hidden_inputs['HiddenmyDashLink'], video_id, skip_protocols=['f4m', 'smil']) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/voicerepublic.py b/hypervideo_dl/extractor/voicerepublic.py index a52e40a..47502af 100644 --- 
a/hypervideo_dl/extractor/voicerepublic.py +++ b/hypervideo_dl/extractor/voicerepublic.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -48,7 +46,6 @@ class VoiceRepublicIE(InfoExtractor): 'ext': determine_ext(talk_url) or format_id, 'vcodec': 'none', } for format_id, talk_url in talk['media_links'].items()] - self._sort_formats(formats) return { 'id': compat_str(talk.get('id') or display_id), diff --git a/hypervideo_dl/extractor/voicy.py b/hypervideo_dl/extractor/voicy.py index 37c7d56..7438b49 100644 --- a/hypervideo_dl/extractor/voicy.py +++ b/hypervideo_dl/extractor/voicy.py @@ -1,5 +1,4 @@ -# coding: utf-8 -from __future__ import unicode_literals +import itertools from .common import InfoExtractor from ..compat import compat_str @@ -12,8 +11,6 @@ from ..utils import ( unsmuggle_url, ) -import itertools - class VoicyBaseIE(InfoExtractor): def _extract_from_playlist_data(self, value): @@ -47,7 +44,6 @@ class VoicyBaseIE(InfoExtractor): 'acodec': 'mp3', 'vcodec': 'none', }] - self._sort_formats(formats) return { 'id': compat_str(entry.get('ArticleId')), 'title': entry.get('ArticleTitle'), @@ -108,7 +104,7 @@ class VoicyChannelIE(VoicyBaseIE): @classmethod def suitable(cls, url): - return not VoicyIE.suitable(url) and super(VoicyChannelIE, cls).suitable(url) + return not VoicyIE.suitable(url) and super().suitable(url) def _entries(self, channel_id): pager = '' diff --git a/hypervideo_dl/extractor/voot.py b/hypervideo_dl/extractor/voot.py index a9b66b9..b709b74 100644 --- a/hypervideo_dl/extractor/voot.py +++ b/hypervideo_dl/extractor/voot.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -17,7 +14,7 @@ class VootIE(InfoExtractor): voot:| https?://(?:www\.)?voot\.com/? 
(?: - movies/[^/]+/| + movies?/[^/]+/| (?:shows|kids)/(?:[^/]+/){4} ) ) @@ -50,6 +47,9 @@ class VootIE(InfoExtractor): }, { 'url': 'https://www.voot.com/movies/pandavas-5/424627', 'only_matching': True, + }, { + 'url': 'https://www.voot.com/movie/fight-club/621842', + 'only_matching': True, }] def _real_extract(self, url): @@ -73,7 +73,6 @@ class VootIE(InfoExtractor): formats = self._extract_m3u8_formats( 'https://cdnapisec.kaltura.com/p/1982551/playManifest/pt/https/f/applehttp/t/web/e/' + entry_id, video_id, 'mp4', m3u8_id='hls') - self._sort_formats(formats) description, series, season_number, episode, episode_number = [None] * 5 diff --git a/hypervideo_dl/extractor/voxmedia.py b/hypervideo_dl/extractor/voxmedia.py index 6612081..f936200 100644 --- a/hypervideo_dl/extractor/voxmedia.py +++ b/hypervideo_dl/extractor/voxmedia.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from .once import OnceIE from ..compat import compat_urllib_parse_unquote @@ -50,7 +47,6 @@ class VoxMediaVolumeIE(OnceIE): 'tbr': int_or_none(tbr), }) if formats: - self._sort_formats(formats) info['formats'] = formats info['duration'] = int_or_none(asset.get('duration')) return info @@ -61,7 +57,6 @@ class VoxMediaVolumeIE(OnceIE): continue if provider_video_type == 'brightcove': info['formats'] = self._extract_once_formats(provider_video_id) - self._sort_formats(info['formats']) else: info.update({ '_type': 'url_transparent', @@ -74,6 +69,7 @@ class VoxMediaVolumeIE(OnceIE): class VoxMediaIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:(?:theverge|vox|sbnation|eater|polygon|curbed|racked|funnyordie)\.com|recode\.net)/(?:[^/]+/)*(?P<id>[^/?]+)' + _EMBED_REGEX = [r'<iframe[^>]+?src="(?P<url>https?://(?:www\.)?funnyordie\.com/embed/[^"]+)"'] _TESTS = [{ # Volume embed, Youtube 'url': 'http://www.theverge.com/2014/6/27/5849272/material-world-how-google-discovered-what-software-is-made-of', diff --git a/hypervideo_dl/extractor/vrak.py b/hypervideo_dl/extractor/vrak.py index daa247c..198c0a2 100644 --- a/hypervideo_dl/extractor/vrak.py +++ b/hypervideo_dl/extractor/vrak.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/vrt.py b/hypervideo_dl/extractor/vrt.py index 10dc94a..26f48bf 100644 --- a/hypervideo_dl/extractor/vrt.py +++ b/hypervideo_dl/extractor/vrt.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( extract_attributes, diff --git a/hypervideo_dl/extractor/vrv.py b/hypervideo_dl/extractor/vrv.py index 00e1006..89fa7af 100644 --- a/hypervideo_dl/extractor/vrv.py +++ b/hypervideo_dl/extractor/vrv.py @@ -1,20 +1,14 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 -import json import hashlib import hmac +import json import random import string import time +import urllib.parse from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse_urlencode, - compat_urllib_parse, -) +from ..compat import compat_HTTPError, compat_urllib_parse_urlencode from ..utils import ( ExtractorError, float_or_none, @@ -49,12 +43,12 @@ class VRVBaseIE(InfoExtractor): headers['Content-Type'] = 'application/json' base_string = '&'.join([ 'POST' if data else 'GET', - compat_urllib_parse.quote(base_url, ''), - compat_urllib_parse.quote(encoded_query, '')]) + urllib.parse.quote(base_url, ''), + 
urllib.parse.quote(encoded_query, '')]) oauth_signature = base64.b64encode(hmac.new( (self._API_PARAMS['oAuthSecret'] + '&' + self._TOKEN_SECRET).encode('ascii'), base_string.encode(), hashlib.sha1).digest()).decode() - encoded_query += '&oauth_signature=' + compat_urllib_parse.quote(oauth_signature, '') + encoded_query += '&oauth_signature=' + urllib.parse.quote(oauth_signature, '') try: return self._download_json( '?'.join([base_url, encoded_query]), video_id, @@ -198,7 +192,6 @@ class VRVIE(VRVBaseIE): formats.extend(self._extract_vrv_formats( stream.get('url'), video_id, stream_type.split('_')[1], audio_locale, stream.get('hardsub_locale'))) - self._sort_formats(formats) subtitles = {} for k in ('captions', 'subtitles'): diff --git a/hypervideo_dl/extractor/vshare.py b/hypervideo_dl/extractor/vshare.py index b4874ac..1bc7ae4 100644 --- a/hypervideo_dl/extractor/vshare.py +++ b/hypervideo_dl/extractor/vshare.py @@ -1,18 +1,10 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor -from ..compat import compat_chr -from ..utils import ( - decode_packed_codes, - ExtractorError, -) +from ..utils import ExtractorError, decode_packed_codes class VShareIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)' + _EMBED_REGEX = [r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)'] _TESTS = [{ 'url': 'https://vshare.io/d/0f64ce6', 'md5': '17b39f55b5497ae8b59f5fbce8e35886', @@ -26,12 +18,6 @@ class VShareIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+?src=["\'](?P<url>(?:https?:)?//(?:www\.)?vshare\.io/v/[^/?#&]+)', - webpage) - def _extract_packed(self, webpage): packed = self._search_regex( r'(eval\(function.+)', webpage, 'packed code') @@ -40,7 +26,7 @@ class VShareIE(InfoExtractor): digits = [int(digit) for digit in digits.split(',')] key_digit = self._search_regex( r'fromCharCode\(.+?(\d+)\)}', unpacked, 'key digit') - chars = [compat_chr(d - int(key_digit)) for d in digits] + chars = [chr(d - int(key_digit)) for d in digits] return ''.join(chars) def _real_extract(self, url): @@ -63,8 +49,6 @@ class VShareIE(InfoExtractor): url, '<video>%s</video>' % self._extract_packed(webpage), video_id)[0] - self._sort_formats(info['formats']) - info.update({ 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/vtm.py b/hypervideo_dl/extractor/vtm.py index 093f1aa..6381fd3 100644 --- a/hypervideo_dl/extractor/vtm.py +++ b/hypervideo_dl/extractor/vtm.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, diff --git a/hypervideo_dl/extractor/vube.py b/hypervideo_dl/extractor/vube.py deleted file mode 100644 index 1c8f80a..0000000 --- a/hypervideo_dl/extractor/vube.py +++ /dev/null @@ -1,170 +0,0 @@ -from __future__ import unicode_literals - - -from .common import InfoExtractor -from ..compat import ( - compat_str, -) -from ..utils import ( - int_or_none, -) - - -class VubeIE(InfoExtractor): - IE_NAME = 'vube' - IE_DESC = 'Vube.com' - _VALID_URL = r'https?://vube\.com/(?:[^/]+/)+(?P<id>[\da-zA-Z]{10})\b' - - _TESTS = [ - { - 'url': 'http://vube.com/trending/William+Wei/Y8NUZ69Tf7?t=s', - 'md5': 'e7aabe1f8f1aa826b9e4735e1f9cee42', - 'info_dict': { - 'id': 'Y8NUZ69Tf7', - 'ext': 'mp4', - 'title': 'Best Drummer Ever [HD]', - 'description': 'md5:2d63c4b277b85c2277761c2cf7337d71', - 'thumbnail': 
r're:^https?://.*\.jpg', - 'uploader': 'William', - 'timestamp': 1406876915, - 'upload_date': '20140801', - 'duration': 258.051, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['amazing', 'hd', 'best drummer ever', 'william wei', 'bucket drumming', 'street drummer', 'epic street drumming'], - }, - 'skip': 'Not accessible from Travis CI server', - }, { - 'url': 'http://vube.com/Chiara+Grispo+Video+Channel/YL2qNPkqon', - 'md5': 'db7aba89d4603dadd627e9d1973946fe', - 'info_dict': { - 'id': 'YL2qNPkqon', - 'ext': 'mp4', - 'title': 'Chiara Grispo - Price Tag by Jessie J', - 'description': 'md5:8ea652a1f36818352428cb5134933313', - 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102e7e63057-5ebc-4f5c-4065-6ce4ebde131f\.jpg$', - 'uploader': 'Chiara.Grispo', - 'timestamp': 1388743358, - 'upload_date': '20140103', - 'duration': 170.56, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['pop', 'music', 'cover', 'singing', 'jessie j', 'price tag', 'chiara grispo'], - }, - 'skip': 'Removed due to DMCA', - }, - { - 'url': 'http://vube.com/SerainaMusic/my-7-year-old-sister-and-i-singing-alive-by-krewella/UeBhTudbfS?t=s&n=1', - 'md5': '5d4a52492d76f72712117ce6b0d98d08', - 'info_dict': { - 'id': 'UeBhTudbfS', - 'ext': 'mp4', - 'title': 'My 7 year old Sister and I singing "Alive" by Krewella', - 'description': 'md5:40bcacb97796339f1690642c21d56f4a', - 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/102265d5a9f-0f17-4f6b-5753-adf08484ee1e\.jpg$', - 'uploader': 'Seraina', - 'timestamp': 1396492438, - 'upload_date': '20140403', - 'duration': 240.107, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['seraina', 'jessica', 'krewella', 'alive'], - }, - 'skip': 'Removed due to DMCA', - }, { - 'url': 'http://vube.com/vote/Siren+Gene/0nmsMY5vEq?n=2&t=s', - 'md5': '0584fc13b50f887127d9d1007589d27f', - 'info_dict': { - 'id': '0nmsMY5vEq', - 'ext': 'mp4', - 'title': 'Frozen - Let It Go Cover by Siren Gene', - 'description': 'My rendition of "Let It Go" originally sung by Idina Menzel.', - 'thumbnail': r're:^http://frame\.thestaticvube\.com/snap/[0-9x]+/10283ab622a-86c9-4681-51f2-30d1f65774af\.jpg$', - 'uploader': 'Siren', - 'timestamp': 1395448018, - 'upload_date': '20140322', - 'duration': 221.788, - 'like_count': int, - 'dislike_count': int, - 'comment_count': int, - 'categories': ['let it go', 'cover', 'idina menzel', 'frozen', 'singing', 'disney', 'siren gene'], - }, - 'skip': 'Removed due to DMCA', - } - ] - - def _real_extract(self, url): - mobj = self._match_valid_url(url) - video_id = mobj.group('id') - - video = self._download_json( - 'http://vube.com/t-api/v1/video/%s' % video_id, video_id, 'Downloading video JSON') - - public_id = video['public_id'] - - formats = [] - - for media in video['media'].get('video', []) + video['media'].get('audio', []): - if media['transcoding_status'] != 'processed': - continue - fmt = { - 'url': 'http://video.thestaticvube.com/video/%s/%s.mp4' % (media['media_resolution_id'], public_id), - 'abr': int(media['audio_bitrate']), - 'format_id': compat_str(media['media_resolution_id']), - } - vbr = int(media['video_bitrate']) - if vbr: - fmt.update({ - 'vbr': vbr, - 'height': int(media['height']), - }) - formats.append(fmt) - - if not formats and video.get('vst') == 'dmca': - self.raise_no_formats( - 'This video has been removed in response to a complaint received under the US Digital Millennium Copyright Act.', - expected=True) - - 
self._sort_formats(formats) - - title = video['title'] - description = video.get('description') - thumbnail = self._proto_relative_url(video.get('thumbnail_src'), scheme='http:') - uploader = video.get('user_alias') or video.get('channel') - timestamp = int_or_none(video.get('upload_time')) - duration = video['duration'] - view_count = video.get('raw_view_count') - like_count = video.get('total_likes') - dislike_count = video.get('total_hates') - - comments = video.get('comments') - comment_count = None - if comments is None: - comment_data = self._download_json( - 'http://vube.com/api/video/%s/comment' % video_id, - video_id, 'Downloading video comment JSON', fatal=False) - if comment_data is not None: - comment_count = int_or_none(comment_data.get('total')) - else: - comment_count = len(comments) - - categories = [tag['text'] for tag in video['tags']] - - return { - 'id': video_id, - 'formats': formats, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'timestamp': timestamp, - 'duration': duration, - 'view_count': view_count, - 'like_count': like_count, - 'dislike_count': dislike_count, - 'comment_count': comment_count, - 'categories': categories, - } diff --git a/hypervideo_dl/extractor/vuclip.py b/hypervideo_dl/extractor/vuclip.py index 55e087b..0e56298 100644 --- a/hypervideo_dl/extractor/vuclip.py +++ b/hypervideo_dl/extractor/vuclip.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/vupload.py b/hypervideo_dl/extractor/vupload.py index b561f63..23ea70c 100644 --- a/hypervideo_dl/extractor/vupload.py +++ b/hypervideo_dl/extractor/vupload.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( parse_duration, diff --git a/hypervideo_dl/extractor/vvvvid.py b/hypervideo_dl/extractor/vvvvid.py index 3faa90f..ed725a5 100644 --- a/hypervideo_dl/extractor/vvvvid.py +++ b/hypervideo_dl/extractor/vvvvid.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -65,6 +62,18 @@ class VVVVIDIE(InfoExtractor): 'skip_download': True, }, }, { + # video_type == 'video/dash' + 'url': 'https://www.vvvvid.it/show/683/made-in-abyss/1542/693786/nanachi', + 'info_dict': { + 'id': '693786', + 'ext': 'mp4', + 'title': 'Nanachi', + }, + 'params': { + 'skip_download': True, + 'format': 'mp4', + }, + }, { 'url': 'https://www.vvvvid.it/show/434/perche-dovrei-guardarlo-di-dario-moccia/437/489048', 'only_matching': True }] @@ -205,13 +214,15 @@ class VVVVIDIE(InfoExtractor): }) is_youtube = True break + elif video_type == 'video/dash': + formats.extend(self._extract_m3u8_formats( + embed_code, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.extend(self._extract_wowza_formats( 'http://sb.top-ix.org/videomg/_definst_/mp4:%s/playlist.m3u8' % embed_code, video_id)) metadata_from_url(embed_code) if not is_youtube: - self._sort_formats(formats) info['formats'] = formats metadata_from_url(video_data.get('thumbnail')) @@ -230,7 +241,7 @@ class VVVVIDIE(InfoExtractor): return info -class VVVVIDShowIE(VVVVIDIE): +class VVVVIDShowIE(VVVVIDIE): # XXX: Do not subclass from concrete IE _VALID_URL = r'(?P<base_url>%s(?P<id>\d+)(?:/(?P<show_title>[^/?&#]+))?)/?(?:[?#&]|$)' % VVVVIDIE._VALID_URL_BASE _TESTS = [{ 'url': 'https://www.vvvvid.it/show/156/psyco-pass', diff --git a/hypervideo_dl/extractor/vyborymos.py 
b/hypervideo_dl/extractor/vyborymos.py index 4d93666..3865187 100644 --- a/hypervideo_dl/extractor/vyborymos.py +++ b/hypervideo_dl/extractor/vyborymos.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str diff --git a/hypervideo_dl/extractor/vzaar.py b/hypervideo_dl/extractor/vzaar.py index 54f88bb..6b9817c 100644 --- a/hypervideo_dl/extractor/vzaar.py +++ b/hypervideo_dl/extractor/vzaar.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -15,6 +10,7 @@ from ..utils import ( class VzaarIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www|view)\.)?vzaar\.com/(?:videos/)?(?P<id>\d+)' + _EMBED_REGEX = [r'<iframe[^>]+src=["\'](?P<url>(?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)'] _TESTS = [{ # HTTP and HLS 'url': 'https://vzaar.com/videos/1152805', @@ -50,12 +46,6 @@ class VzaarIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)', - webpage) - def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( @@ -100,8 +90,6 @@ class VzaarIE(InfoExtractor): f['_decryption_key_url'] = url_templ % ('goose', '') + qs formats.extend(m3u8_formats) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/wakanim.py b/hypervideo_dl/extractor/wakanim.py index a70a719..155008f 100644 --- a/hypervideo_dl/extractor/wakanim.py +++ b/hypervideo_dl/extractor/wakanim.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from urllib.parse import unquote from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/walla.py b/hypervideo_dl/extractor/walla.py index 00f081b..a1a9c17 100644 --- a/hypervideo_dl/extractor/walla.py +++ b/hypervideo_dl/extractor/walla.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -72,7 +69,6 @@ class WallaIE(InfoExtractor): if m: fmt['height'] = int(m.group('height')) formats.append(fmt) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/wasdtv.py b/hypervideo_dl/extractor/wasdtv.py index 38c10dc..f57c619 100644 --- a/hypervideo_dl/extractor/wasdtv.py +++ b/hypervideo_dl/extractor/wasdtv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -40,7 +37,6 @@ class WASDTVBaseIE(InfoExtractor): media_url, is_live = self._get_media_url(media_meta) video_id = media.get('media_id') or container.get('media_container_id') formats, subtitles = self._extract_m3u8_formats_and_subtitles(media_url, video_id, 'mp4') - self._sort_formats(formats) return { 'id': str(video_id), 'title': container.get('media_container_name') or self._og_search_title(self._download_webpage(url, video_id)), @@ -98,7 +94,7 @@ class WASDTVStreamIE(WASDTVBaseIE): class WASDTVRecordIE(WASDTVBaseIE): IE_NAME = 'wasdtv:record' - _VALID_URL = r'https?://wasd\.tv/[^/#?]+/videos\?record=(?P<id>\d+)$' + _VALID_URL = r'https?://wasd\.tv/[^/#?]+(?:/videos)?\?record=(?P<id>\d+)$' _TESTS = [{ 'url': 'https://wasd.tv/spacemita/videos?record=907755', 'md5': 'c9899dd85be4cc997816ff9f9ca516ce', @@ -113,6 +109,9 @@ class WASDTVRecordIE(WASDTVBaseIE): 'is_live': False, 
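A quick check of the relaxed WASDTVRecordIE pattern above: wrapping the path segment as `(?:/videos)?` keeps the old `/<channel>/videos?record=<id>` links working while also accepting record links served straight off a channel page, which is what the new `only_matching` test just below covers. A standalone sanity check (illustrative, not part of the diff):

    import re

    pattern = r'https?://wasd\.tv/[^/#?]+(?:/videos)?\?record=(?P<id>\d+)$'
    for link in ('https://wasd.tv/spacemita/videos?record=907755',
                 'https://wasd.tv/spacemita?record=907755'):
        assert re.match(pattern, link).group('id') == '907755'
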
'view_count': int, }, + }, { + 'url': 'https://wasd.tv/spacemita?record=907755', + 'only_matching': True, }] def _get_container(self, url): @@ -149,7 +148,6 @@ class WASDTVClipIE(WASDTVBaseIE): clip = self._fetch(f'v2/clips/{clip_id}', video_id=clip_id, description='clip') clip_data = clip.get('clip_data') formats, subtitles = self._extract_m3u8_formats_and_subtitles(clip_data.get('url'), video_id=clip_id, ext='mp4') - self._sort_formats(formats) return { 'id': clip_id, 'title': clip.get('clip_title') or self._og_search_title(self._download_webpage(url, clip_id, fatal=False)), diff --git a/hypervideo_dl/extractor/washingtonpost.py b/hypervideo_dl/extractor/washingtonpost.py index 9d6ae28..74501b1 100644 --- a/hypervideo_dl/extractor/washingtonpost.py +++ b/hypervideo_dl/extractor/washingtonpost.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -11,7 +8,7 @@ from ..utils import traverse_obj class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/(?:video|posttv)/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' - _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'] _TESTS = [{ 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', @@ -31,11 +28,6 @@ class WashingtonPostIE(InfoExtractor): 'only_matching': True, }] - @classmethod - def _extract_urls(cls, webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage) - def _real_extract(self, url): video_id = self._match_id(url) return self.url_result( diff --git a/hypervideo_dl/extractor/wat.py b/hypervideo_dl/extractor/wat.py index 9ff4523..7c62d28 100644 --- a/hypervideo_dl/extractor/wat.py +++ b/hypervideo_dl/extractor/wat.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -57,7 +54,7 @@ class WatIE(InfoExtractor): # 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) video_data = self._download_json( 'https://mediainfo.tf1.fr/mediainfocombo/' + video_id, - video_id, query={'context': 'MYTF1'}) + video_id, query={'context': 'MYTF1', 'pver': '4020003'}) video_info = video_data['media'] error_desc = video_info.get('error_desc') @@ -98,8 +95,6 @@ class WatIE(InfoExtractor): if manifest_urls: extract_formats(manifest_urls) - self._sort_formats(formats) - return { 'id': video_id, 'title': title, diff --git a/hypervideo_dl/extractor/watchbox.py b/hypervideo_dl/extractor/watchbox.py index d19d801..c973ca9 100644 --- a/hypervideo_dl/extractor/watchbox.py +++ b/hypervideo_dl/extractor/watchbox.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -113,7 +109,6 @@ class WatchBoxIE(InfoExtractor): 'height': int_or_none(item.get('height')), 'tbr': int_or_none(item.get('bitrate')), }) - self._sort_formats(formats) description = strip_or_none(item.get('descr')) thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail') diff --git 
a/hypervideo_dl/extractor/watchindianporn.py b/hypervideo_dl/extractor/watchindianporn.py index a868191..3ded2d1 100644 --- a/hypervideo_dl/extractor/watchindianporn.py +++ b/hypervideo_dl/extractor/watchindianporn.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/wdr.py b/hypervideo_dl/extractor/wdr.py index ef58a66..de5dc26 100644 --- a/hypervideo_dl/extractor/wdr.py +++ b/hypervideo_dl/extractor/wdr.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -106,8 +103,6 @@ class WDRIE(InfoExtractor): a_format['ext'] = ext formats.append(a_format) - self._sort_formats(formats) - caption_url = media_resource.get('captionURL') if caption_url: subtitles['de'] = [{ @@ -136,7 +131,7 @@ class WDRIE(InfoExtractor): } -class WDRPageIE(WDRIE): +class WDRPageIE(WDRIE): # XXX: Do not subclass from concrete IE _MAUS_REGEX = r'https?://(?:www\.)wdrmaus.de/(?:[^/]+/)*?(?P<maus_id>[^/?#.]+)(?:/?|/index\.php5|\.php5)$' _PAGE_REGEX = r'/(?:mediathek/)?(?:[^/]+/)*(?P<display_id>[^/]+)\.html' _VALID_URL = r'https?://(?:www\d?\.)?(?:(?:kinder\.)?wdr\d?|sportschau)\.de' + _PAGE_REGEX + '|' + _MAUS_REGEX diff --git a/hypervideo_dl/extractor/webcaster.py b/hypervideo_dl/extractor/webcaster.py index a858e99..43eeca0 100644 --- a/hypervideo_dl/extractor/webcaster.py +++ b/hypervideo_dl/extractor/webcaster.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -53,7 +50,6 @@ class WebcasterIE(InfoExtractor): 'format_note': track.get('title'), }) formats.extend(m3u8_formats) - self._sort_formats(formats) thumbnail = xpath_text(video, './/image', 'thumbnail') @@ -67,27 +63,23 @@ class WebcasterIE(InfoExtractor): class WebcasterFeedIE(InfoExtractor): _VALID_URL = r'https?://bl\.webcaster\.pro/feed/start/free_(?P<id>[^/]+)' + _EMBED_REGEX = [r'<(?:object|a[^>]+class=["\']webcaster-player["\'])[^>]+data(?:-config)?=(["\']).*?config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_.*?)(?:[?&]|\1)'] _TEST = { 'url': 'http://bl.webcaster.pro/feed/start/free_c8cefd240aa593681c8d068cff59f407_hd/q393859/eb173f99dd5f558674dae55f4ba6806d/1480289104', 'only_matching': True, } - @staticmethod - def _extract_url(ie, webpage): - mobj = re.search( - r'<(?:object|a[^>]+class=["\']webcaster-player["\'])[^>]+data(?:-config)?=(["\']).*?config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_.*?)(?:[?&]|\1)', - webpage) - if mobj: - return mobj.group('url') + def _extract_from_webpage(self, url, webpage): + yield from super()._extract_from_webpage(url, webpage) + for secure in (True, False): - video_url = ie._og_search_video_url( - webpage, secure=secure, default=None) + video_url = self._og_search_video_url(webpage, secure=secure, default=None) if video_url: mobj = re.search( r'config=(?P<url>https?://bl\.webcaster\.pro/feed/start/free_[^?&=]+)', video_url) if mobj: - return mobj.group('url') + yield self.url_result(mobj.group('url'), self) def _real_extract(self, url): video_id = self._match_id(url) diff --git a/hypervideo_dl/extractor/webofstories.py b/hypervideo_dl/extractor/webofstories.py index f2b8d19..65f48f3 100644 --- a/hypervideo_dl/extractor/webofstories.py +++ b/hypervideo_dl/extractor/webofstories.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -107,8 +104,6 @@ class 
WebOfStoriesIE(InfoExtractor): 'play_path': play_path, }] - self._sort_formats(formats) - return { 'id': story_id, 'title': title, diff --git a/hypervideo_dl/extractor/weibo.py b/hypervideo_dl/extractor/weibo.py index dafa2af..81a23b9 100644 --- a/hypervideo_dl/extractor/weibo.py +++ b/hypervideo_dl/extractor/weibo.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor import json @@ -91,8 +88,6 @@ class WeiboIE(InfoExtractor): 'height': res, }) - self._sort_formats(formats) - uploader = self._og_search_property( 'nick-name', webpage, 'uploader', default=None) diff --git a/hypervideo_dl/extractor/weiqitv.py b/hypervideo_dl/extractor/weiqitv.py index 7e0befd..c9ff641 100644 --- a/hypervideo_dl/extractor/weiqitv.py +++ b/hypervideo_dl/extractor/weiqitv.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/whowatch.py b/hypervideo_dl/extractor/whowatch.py index e4b610d..f2808cd 100644 --- a/hypervideo_dl/extractor/whowatch.py +++ b/hypervideo_dl/extractor/whowatch.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -73,7 +70,6 @@ class WhoWatchIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( hls_url, video_id, ext='mp4', m3u8_id='hls')) self._remove_duplicate_formats(formats) - self._sort_formats(formats) uploader_url = try_get(metadata, lambda x: x['live']['user']['user_path'], compat_str) if uploader_url: diff --git a/hypervideo_dl/extractor/wikimedia.py b/hypervideo_dl/extractor/wikimedia.py new file mode 100644 index 0000000..11c801f --- /dev/null +++ b/hypervideo_dl/extractor/wikimedia.py @@ -0,0 +1,55 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + clean_html, + get_element_by_class, + parse_qs, + remove_start, + unescapeHTML, + urljoin, +) + + +class WikimediaIE(InfoExtractor): + IE_NAME = 'wikimedia.org' + _VALID_URL = r'https?://commons\.wikimedia\.org/wiki/File:(?P<id>[^/#?]+)\.\w+' + _TESTS = [{ + 'url': 'https://commons.wikimedia.org/wiki/File:Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS.webm', + 'info_dict': { + 'url': 're:https?://upload.wikimedia.org/wikipedia', + 'ext': 'webm', + 'id': 'Die_Temperaturkurve_der_Erde_(ZDF,_Terra_X)_720p_HD_50FPS', + 'title': 'Die Temperaturkurve der Erde (ZDF, Terra X) 720p HD 50FPS.webm - Wikimedia Commons', + 'description': 'md5:7cd84f76e7081f1be033d0b155b4a460', + 'license': 'Creative Commons Attribution 4.0 International', + 'uploader': 'ZDF/Terra X/Gruppe 5/Luise Wagner, Jonas Sichert, Andreas Hougardy', + 'subtitles': 'count:4' + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + subtitles = {} + for sub in set(re.findall(r'\bsrc\s*=\s*["\'](/w/api[^"]+)["\']', webpage)): + sub = urljoin('https://commons.wikimedia.org', unescapeHTML(sub)) + qs = parse_qs(sub) + lang = qs.get('lang', [None])[-1] + sub_ext = qs.get('trackformat', [None])[-1] + if lang and sub_ext: + subtitles.setdefault(lang, []).append({'ext': sub_ext, 'url': sub}) + + return { + 'id': video_id, + 'url': self._html_search_regex(r'<source\s[^>]*\bsrc="([^"]+)"', webpage, 'video URL'), + 'description': clean_html(get_element_by_class('description', webpage)), + 'title': remove_start(self._og_search_title(webpage), 'File:'), + 'license': self._html_search_regex( + r'licensed under(?: the)? (.+?) 
license', + get_element_by_class('licensetpl', webpage), 'license', default=None), + 'uploader': self._html_search_regex( + r'>\s*Author\s*</td>\s*<td\b[^>]*>\s*([^<]+)\s*</td>', webpage, 'video author', default=None), + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/willow.py b/hypervideo_dl/extractor/willow.py index 4d3d62f..0ec9c9d 100644 --- a/hypervideo_dl/extractor/willow.py +++ b/hypervideo_dl/extractor/willow.py @@ -1,4 +1,3 @@ -# coding: utf-8 from ..utils import ExtractorError from .common import InfoExtractor @@ -42,7 +41,6 @@ class WillowIE(InfoExtractor): raise ExtractorError('No videos found') formats = self._extract_m3u8_formats(video['secureurl'], video_id, 'mp4') - self._sort_formats(formats) return { 'id': str(video.get('content_id')), diff --git a/hypervideo_dl/extractor/wimtv.py b/hypervideo_dl/extractor/wimtv.py index ea953bf..5711123 100644 --- a/hypervideo_dl/extractor/wimtv.py +++ b/hypervideo_dl/extractor/wimtv.py @@ -1,8 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( determine_ext, @@ -15,14 +10,15 @@ from ..utils import ( class WimTVIE(InfoExtractor): _player = None _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' - _VALID_URL = r'''(?x) + _VALID_URL = r'''(?x: https?://platform.wim.tv/ (?: (?:embed/)?\? |\#/webtv/.+?/ ) (?P<type>vod|live|cast)[=/] - (?P<id>%s).*?''' % _UUID_RE + (?P<id>%s).*?)''' % _UUID_RE + _EMBED_REGEX = [rf'<iframe[^>]+src=["\'](?P<url>{_VALID_URL})'] _TESTS = [{ # vod stream 'url': 'https://platform.wim.tv/embed/?vod=db29fb32-bade-47b6-a3a6-cb69fe80267a', @@ -57,14 +53,6 @@ class WimTVIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe[^>]+src=["\'](?P<url>%s)' % WimTVIE._VALID_URL, - webpage)] - def _real_initialize(self): if not self._player: self._get_player_data() @@ -151,7 +139,6 @@ class WimTVIE(InfoExtractor): }) json = json.get('resource') thumb = self._generate_thumbnail(json.get('thumbnailId')) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/wistia.py b/hypervideo_dl/extractor/wistia.py index a170966..38dcc2f 100644 --- a/hypervideo_dl/extractor/wistia.py +++ b/hypervideo_dl/extractor/wistia.py @@ -1,32 +1,36 @@ -from __future__ import unicode_literals - import re +import urllib.error +import urllib.parse +from base64 import b64decode from .common import InfoExtractor from ..utils import ( ExtractorError, float_or_none, int_or_none, + parse_qs, + traverse_obj, try_get, - unescapeHTML, + update_url_query, ) class WistiaBaseIE(InfoExtractor): _VALID_ID_REGEX = r'(?P<id>[a-z0-9]{10})' - _VALID_URL_BASE = r'https?://(?:fast\.)?wistia\.(?:net|com)/embed/' - _EMBED_BASE_URL = 'http://fast.wistia.com/embed/' + _VALID_URL_BASE = r'https?://(?:\w+\.)?wistia\.(?:net|com)/(?:embed/)?' + _EMBED_BASE_URL = 'http://fast.wistia.net/embed/' def _download_embed_config(self, config_type, config_id, referer): - base_url = self._EMBED_BASE_URL + '%ss/%s' % (config_type, config_id) + base_url = self._EMBED_BASE_URL + '%s/%s' % (config_type, config_id) embed_config = self._download_json( base_url + '.json', config_id, headers={ 'Referer': referer if referer.startswith('http') else base_url, # Some videos require this. 
}) - if isinstance(embed_config, dict) and embed_config.get('error'): + error = traverse_obj(embed_config, 'error') + if error: raise ExtractorError( - 'Error while getting the playlist', expected=True) + f'Error while getting the playlist: {error}', expected=True) return embed_config @@ -94,8 +98,6 @@ class WistiaBaseIE(InfoExtractor): }) formats.append(f) - self._sort_formats(formats) - subtitles = {} for caption in data.get('captions', []): language = caption.get('language') @@ -116,10 +118,38 @@ class WistiaBaseIE(InfoExtractor): 'subtitles': subtitles, } + @classmethod + def _extract_from_webpage(cls, url, webpage): + from .teachable import TeachableIE + + if list(TeachableIE._extract_embed_urls(url, webpage)): + return + + yield from super()._extract_from_webpage(url, webpage) + + @classmethod + def _extract_wistia_async_embed(cls, webpage): + # https://wistia.com/support/embed-and-share/video-on-your-website + # https://wistia.com/support/embed-and-share/channel-embeds + yield from re.finditer( + r'''(?sx) + <(?:div|section)[^>]+class=([\"'])(?:(?!\1).)*?(?P<type>wistia[a-z_0-9]+)\s*\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1 + ''', webpage) + + @classmethod + def _extract_url_media_id(cls, url): + mobj = re.search(r'(?:wmediaid|wvideo(?:id)?)]?=(?P<id>[a-z0-9]{10})', urllib.parse.unquote_plus(url)) + if mobj: + return mobj.group('id') + class WistiaIE(WistiaBaseIE): _VALID_URL = r'(?:wistia:|%s(?:iframe|medias)/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX) - + _EMBED_REGEX = [ + r'''(?x) + <(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'] + (?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10}) + '''] _TESTS = [{ # with hls video 'url': 'wistia:807fafadvk', @@ -133,6 +163,33 @@ class WistiaIE(WistiaBaseIE): 'timestamp': 1463607249, 'duration': 4987.11, }, + 'skip': 'video unavailable', + }, { + 'url': 'wistia:a6ndpko1wg', + 'md5': '10c1ce9c4dde638202513ed17a3767bd', + 'info_dict': { + 'id': 'a6ndpko1wg', + 'ext': 'bin', + 'title': 'Episode 2: Boxed Water\'s retention is thirsty', + 'upload_date': '20210324', + 'description': 'md5:da5994c2c2d254833b412469d9666b7a', + 'duration': 966.0, + 'timestamp': 1616614369, + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/53dc60239348dc9b9fba3755173ea4c2.bin', + } + }, { + 'url': 'wistia:5vd7p4bct5', + 'md5': 'b9676d24bf30945d97060638fbfe77f0', + 'info_dict': { + 'id': '5vd7p4bct5', + 'ext': 'bin', + 'title': 'md5:eaa9f64c4efd7b5f098b9b6118597679', + 'description': 'md5:a9bea0315f0616aa5df2dc413ddcdd0f', + 'upload_date': '20220915', + 'timestamp': 1663258727, + 'duration': 623.019, + 'thumbnail': r're:https?://embed(?:-ssl)?.wistia.com/.+\.(?:jpg|bin)$', + }, }, { 'url': 'wistia:sh7fpupwlt', 'only_matching': True, @@ -147,35 +204,56 @@ class WistiaIE(WistiaBaseIE): 'only_matching': True, }] - # https://wistia.com/support/embed-and-share/video-on-your-website - @staticmethod - def _extract_url(webpage): - urls = WistiaIE._extract_urls(webpage) - return urls[0] if urls else None - - @staticmethod - def _extract_urls(webpage): - urls = [] - for match in re.finditer( - r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage): - urls.append(unescapeHTML(match.group('url'))) - for match in re.finditer( - r'''(?sx) - <div[^>]+class=(["'])(?:(?!\1).)*?\bwistia_async_(?P<id>[a-z0-9]{10})\b(?:(?!\1).)*?\1 - ''', webpage): - urls.append('wistia:%s' % match.group('id')) - for 
match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage): - urls.append('wistia:%s' % match.group('id')) - return urls + _WEBPAGE_TESTS = [{ + 'url': 'https://www.weidert.com/blog/wistia-channels-video-marketing-tool', + 'info_dict': { + 'id': 'cqwukac3z1', + 'ext': 'bin', + 'title': 'How Wistia Channels Can Help Capture Inbound Value From Your Video Content', + 'duration': 158.125, + 'timestamp': 1618974400, + 'description': 'md5:27abc99a758573560be72600ef95cece', + 'upload_date': '20210421', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/6c551820ae950cdee2306d6cbe9ef742.bin', + } + }, { + 'url': 'https://study.com/academy/lesson/north-american-exploration-failed-colonies-of-spain-france-england.html#lesson', + 'md5': 'b9676d24bf30945d97060638fbfe77f0', + 'info_dict': { + 'id': '5vd7p4bct5', + 'ext': 'bin', + 'title': 'paywall_north-american-exploration-failed-colonies-of-spain-france-england', + 'upload_date': '20220915', + 'timestamp': 1663258727, + 'duration': 623.019, + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/83e6ec693e2c05a0ce65809cbaead86a.bin', + 'description': 'a Paywall Videos video', + }, + }] def _real_extract(self, url): video_id = self._match_id(url) - embed_config = self._download_embed_config('media', video_id, url) + embed_config = self._download_embed_config('medias', video_id, url) return self._extract_media(embed_config) + @classmethod + def _extract_embed_urls(cls, url, webpage): + urls = list(super()._extract_embed_urls(url, webpage)) + for match in cls._extract_wistia_async_embed(webpage): + if match.group('type') != 'wistia_channel': + urls.append('wistia:%s' % match.group('id')) + for match in re.finditer(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', + webpage): + urls.append('wistia:%s' % match.group('id')) + if not WistiaChannelIE._extract_embed_urls(url, webpage): # Fallback + media_id = cls._extract_url_media_id(url) + if media_id: + urls.append('wistia:%s' % match.group('id')) + return urls + class WistiaPlaylistIE(WistiaBaseIE): - _VALID_URL = r'%splaylists/%s' % (WistiaIE._VALID_URL_BASE, WistiaIE._VALID_ID_REGEX) + _VALID_URL = r'%splaylists/%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX) _TEST = { 'url': 'https://fast.wistia.net/embed/playlists/aodt9etokc', @@ -187,7 +265,7 @@ class WistiaPlaylistIE(WistiaBaseIE): def _real_extract(self, url): playlist_id = self._match_id(url) - playlist = self._download_embed_config('playlist', playlist_id, url) + playlist = self._download_embed_config('playlists', playlist_id, url) entries = [] for media in (try_get(playlist, lambda x: x[0]['medias']) or []): @@ -197,3 +275,107 @@ class WistiaPlaylistIE(WistiaBaseIE): entries.append(self._extract_media(embed_config)) return self.playlist_result(entries, playlist_id) + + +class WistiaChannelIE(WistiaBaseIE): + _VALID_URL = r'(?:wistiachannel:|%schannel/)%s' % (WistiaBaseIE._VALID_URL_BASE, WistiaBaseIE._VALID_ID_REGEX) + + _TESTS = [{ + # JSON Embed API returns 403, should fall back to webpage + 'url': 'https://fast.wistia.net/embed/channel/yvyvu7wjbg?wchannelid=yvyvu7wjbg', + 'info_dict': { + 'id': 'yvyvu7wjbg', + 'title': 'Copysmith Tutorials and Education!', + 'description': 'Learn all things Copysmith via short and informative videos!' 
+ }, + 'playlist_mincount': 7, + 'expected_warnings': ['falling back to webpage'], + }, { + 'url': 'https://fast.wistia.net/embed/channel/3802iirk0l', + 'info_dict': { + 'id': '3802iirk0l', + 'title': 'The Roof', + }, + 'playlist_mincount': 20, + }, { + # link to popup video, follow --no-playlist + 'url': 'https://fast.wistia.net/embed/channel/3802iirk0l?wchannelid=3802iirk0l&wmediaid=sp5dqjzw3n', + 'info_dict': { + 'id': 'sp5dqjzw3n', + 'ext': 'bin', + 'title': 'The Roof S2: The Modern CRO', + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/dadfa9233eaa505d5e0c85c23ff70741.bin', + 'duration': 86.487, + 'description': 'A sales leader on The Roof? Man, they really must be letting anyone up here this season.\n', + 'timestamp': 1619790290, + 'upload_date': '20210430', + }, + 'params': {'noplaylist': True, 'skip_download': True}, + }] + _WEBPAGE_TESTS = [{ + 'url': 'https://www.profitwell.com/recur/boxed-out', + 'info_dict': { + 'id': '6jyvmqz6zs', + 'title': 'Boxed Out', + 'description': 'md5:14a8a93a1dbe236718e6a59f8c8c7bae', + }, + 'playlist_mincount': 30, + }, { + # section instead of div + 'url': 'https://360learning.com/studio/onboarding-joei/', + 'info_dict': { + 'id': 'z874k93n2o', + 'title': 'Onboarding Joei.', + 'description': 'Coming to you weekly starting Feb 19th.', + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://amplitude.com/amplify-sessions?amp%5Bwmediaid%5D=pz0m0l0if3&%5Bwvideo%5D=pz0m0l0if3&wchannelid=emyjmwjf79&wmediaid=i8um783bdt', + 'info_dict': { + 'id': 'pz0m0l0if3', + 'title': 'A Framework for Improving Product Team Performance', + 'ext': 'bin', + 'timestamp': 1653935275, + 'upload_date': '20220530', + 'description': 'Learn how to help your company improve and achieve your product related goals.', + 'duration': 1854.39, + 'thumbnail': 'https://embed-ssl.wistia.com/deliveries/12fd19e56413d9d6f04e2185c16a6f8854e25226.bin', + }, + 'params': {'noplaylist': True, 'skip_download': True}, + }] + + def _real_extract(self, url): + channel_id = self._match_id(url) + media_id = self._extract_url_media_id(url) + if not self._yes_playlist(channel_id, media_id, playlist_label='channel'): + return self.url_result(f'wistia:{media_id}', 'Wistia') + + try: + data = self._download_embed_config('channel', channel_id, url) + except (ExtractorError, urllib.error.HTTPError): + # Some channels give a 403 from the JSON API + self.report_warning('Failed to download channel data from API, falling back to webpage.') + webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id) + data = self._parse_json( + self._search_regex(r'wchanneljsonp-%s\'\]\s*=[^\"]*\"([A-Za-z0-9=/]*)' % channel_id, webpage, 'jsonp', channel_id), + channel_id, transform_source=lambda x: urllib.parse.unquote_plus(b64decode(x).decode('utf-8'))) + + # XXX: can there be more than one series? 
+ series = traverse_obj(data, ('series', 0), default={}) + + entries = [ + self.url_result(f'wistia:{video["hashedId"]}', WistiaIE, title=video.get('name')) + for video in traverse_obj(series, ('sections', ..., 'videos', ...)) or [] + if video.get('hashedId') + ] + + return self.playlist_result( + entries, channel_id, playlist_title=series.get('title'), playlist_description=series.get('description')) + + @classmethod + def _extract_embed_urls(cls, url, webpage): + yield from super()._extract_embed_urls(url, webpage) + for match in cls._extract_wistia_async_embed(webpage): + if match.group('type') == 'wistia_channel': + # original url may contain wmediaid query param + yield update_url_query(f'wistiachannel:{match.group("id")}', parse_qs(url)) diff --git a/hypervideo_dl/extractor/wordpress.py b/hypervideo_dl/extractor/wordpress.py new file mode 100644 index 0000000..53820b5 --- /dev/null +++ b/hypervideo_dl/extractor/wordpress.py @@ -0,0 +1,154 @@ +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + get_elements_by_class, + get_elements_text_and_html_by_attribute, + int_or_none, + parse_duration, + traverse_obj, +) + + +# https://codex.wordpress.org/Playlist_Shortcode +class WordpressPlaylistEmbedIE(InfoExtractor): + _VALID_URL = False + IE_NAME = 'wordpress:playlist' + _WEBPAGE_TESTS = [{ + # 5 WordPress playlists. This is using wpse-playlist, which is similar. + # See: https://github.com/birgire/wpse-playlist + 'url': 'https://xlino.com/wordpress-playlist-shortcode-with-external-audio-or-video-files/', + 'info_dict': { + 'id': 'wordpress-playlist-shortcode-with-external-audio-or-video-files', + 'title': 'WordPress: Playlist shortcode with external audio or video files – Birgir Erlendsson (birgire)', + 'age_limit': 0, + }, + 'playlist_count': 5, + }, { + 'url': 'https://pianoadventures.com/products/piano-adventures-level-1-lesson-book-enhanced-cd/', + 'info_dict': { + 'id': 'piano-adventures-level-1-lesson-book-enhanced-cd-wp-playlist-1', + 'title': 'Wordpress Playlist', + 'thumbnail': 'https://pianoadventures.com/wp-content/uploads/sites/13/2022/01/CD1002cover.jpg', + 'age_limit': 0, + }, + 'playlist': [{ + 'info_dict': { + 'id': 'CD1002-21', + 'ext': 'mp3', + 'title': '21 Half-Time Show', + 'thumbnail': 'https://pianoadventures.com/wp-content/plugins/media-library-assistant/images/crystal/audio.png', + 'album': 'Piano Adventures Level 1 Lesson Book (2nd Edition)', + 'genre': 'Classical', + 'duration': 49.0, + 'artist': 'Nancy and Randall Faber', + 'description': 'md5:a9f8e9aeabbd2912bc13cc0fab1a4ce8', + } + }], + 'playlist_count': 6, + 'params': {'skip_download': True} + }] + + def _extract_from_webpage(self, url, webpage): + # class should always be "wp-playlist-script" + # See: https://core.trac.wordpress.org/browser/trunk/src/wp-includes/media.php#L2930 + for i, j in enumerate(get_elements_by_class('wp-playlist-script', webpage)): + playlist_json = self._parse_json(j, self._generic_id(url), fatal=False, ignore_extra=True, errnote='') or {} + if not playlist_json: + continue + entries = [{ + 'id': self._generic_id(track['src']), + 'title': track.get('title'), + 'url': track.get('src'), + 'thumbnail': traverse_obj(track, ('thumb', 'src')), + 'album': traverse_obj(track, ('meta', 'album')), + 'artist': traverse_obj(track, ('meta', 'artist')), + 'genre': traverse_obj(track, ('meta', 'genre')), + 'duration': parse_duration(traverse_obj(track, ('meta', 'length_formatted'))), + 'description': track.get('description'), + 'height': 
int_or_none(traverse_obj(track, ('dimensions', 'original', 'height'))), + 'width': int_or_none(traverse_obj(track, ('dimensions', 'original', 'width'))), + } for track in traverse_obj(playlist_json, ('tracks', ...), expected_type=dict)] + yield self.playlist_result(entries, self._generic_id(url) + f'-wp-playlist-{i+1}', 'Wordpress Playlist') + + +class WordpressMiniAudioPlayerEmbedIE(InfoExtractor): + # WordPress MB Mini Player Plugin + # https://wordpress.org/plugins/wp-miniaudioplayer/ + # Note: This is for the WordPress plugin version only. + _VALID_URL = False + IE_NAME = 'wordpress:mb.miniAudioPlayer' + _WEBPAGE_TESTS = [{ + # Version 1.8.10: https://plugins.trac.wordpress.org/browser/wp-miniaudioplayer/tags/1.8.10 + 'url': 'https://news.samsung.com/global/over-the-horizon-the-evolution-of-the-samsung-galaxy-brand-sound', + 'info_dict': { + 'id': 'over-the-horizon-the-evolution-of-the-samsung-galaxy-brand-sound', + 'title': 'Over the Horizon: The Evolution of the Samsung Galaxy Brand Sound', + 'age_limit': 0, + 'thumbnail': 'https://img.global.news.samsung.com/global/wp-content/uploads/2015/04/OTH_Main_Title-e1429612467870.jpg', + 'description': 'md5:bc3dd738d1f11d9232e94e6629983bf7', + }, + 'playlist': [{ + 'info_dict': { + 'id': 'over_the_horizon_2013', + 'ext': 'mp3', + 'title': 'Over the Horizon 2013', + 'url': 'http://news.samsung.com/global/wp-content/uploads/ringtones/over_the_horizon_2013.mp3' + } + }], + 'playlist_count': 6, + 'params': {'skip_download': True} + }, { + # Version 1.9.3: https://plugins.trac.wordpress.org/browser/wp-miniaudioplayer/tags/1.9.3 + 'url': 'https://www.booksontape.com/collections/audiobooks-with-teacher-guides/', + 'info_dict': { + 'id': 'audiobooks-with-teacher-guides', + 'title': 'Audiobooks with Teacher Guides | Books on Tape', + 'age_limit': 0, + 'thumbnail': 'https://www.booksontape.com/wp-content/uploads/2016/09/bot-logo-1200x630.jpg', + }, + 'playlist_mincount': 12 + }, { + # Version 1.9.7: https://plugins.trac.wordpress.org/browser/wp-miniaudioplayer/tags/1.9.7 + # But has spaces around href filter + 'url': 'https://www.estudiords.com.br/temas/', + 'info_dict': { + 'id': 'temas', + 'title': 'Temas Variados', + 'age_limit': 0, + 'timestamp': float, + 'upload_date': str, + 'thumbnail': 'https://www.estudiords.com.br/wp-content/uploads/2021/03/LOGO-TEMAS.png', + 'description': 'md5:ab24d6a7ed0312ad2d466e721679f5a0', + }, + 'playlist_mincount': 30 + }] + + def _extract_from_webpage(self, url, webpage): + # Common function for the WordPress plugin version only. 
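+ # Rough approach: locate the initializeMiniAudioPlayer() bootstrap, read the
+ # jQuery href filters out of it to learn which file extensions the player
+ # binds to, then collect every matching <a> tag (minus any .not() exclusions)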
+ mb_player_params = self._search_regex( + r'function\s*initializeMiniAudioPlayer\(\){[^}]+jQuery([^;]+)\.mb_miniPlayer', + webpage, 'mb player params', default=None) + if not mb_player_params: + return + # v1.55 - 1.9.3 has "a[href*='.mp3'] ,a[href*='.m4a']" + # v1.9.4+ has "a[href*='.mp3']" only + file_exts = re.findall(r'a\[href\s*\*=\s*\'\.([a-zA-Z\d]+)\'', mb_player_params) + if not file_exts: + return + + candidates = get_elements_text_and_html_by_attribute( + 'href', rf'(?:[^\"\']+\.(?:{"|".join(file_exts)}))', webpage, escape_value=False, tag='a') + + for title, html in candidates: + attrs = extract_attributes(html) + # XXX: not tested - have not found any example of it being used + if any(c in (attrs.get('class') or '') for c in re.findall(r'\.not\("\.([^"]+)', mb_player_params)): + continue + href = attrs['href'] + yield { + 'id': self._generic_id(href), + 'title': title or self._generic_title(href), + 'url': href, + } diff --git a/hypervideo_dl/extractor/worldstarhiphop.py b/hypervideo_dl/extractor/worldstarhiphop.py index 82587b4..c6948a1 100644 --- a/hypervideo_dl/extractor/worldstarhiphop.py +++ b/hypervideo_dl/extractor/worldstarhiphop.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/wppilot.py b/hypervideo_dl/extractor/wppilot.py index 3003a0f..5e590e2 100644 --- a/hypervideo_dl/extractor/wppilot.py +++ b/hypervideo_dl/extractor/wppilot.py @@ -1,5 +1,3 @@ -# coding: utf-8 - from .common import InfoExtractor from ..utils import ( try_get, @@ -22,7 +20,7 @@ class WPPilotBaseIE(InfoExtractor): def _get_channel_list(self, cache=True): if cache is True: - cache_res = self._downloader.cache.load('wppilot', 'channel-list') + cache_res = self.cache.load('wppilot', 'channel-list') if cache_res: return cache_res, True webpage = self._download_webpage('https://pilot.wp.pl/tv/', None, 'Downloading webpage') @@ -37,7 +35,7 @@ class WPPilotBaseIE(InfoExtractor): channel_list = try_get(qhash_content, lambda x: x['data']['allChannels']['nodes']) if channel_list is None: continue - self._downloader.cache.store('wppilot', 'channel-list', channel_list) + self.cache.store('wppilot', 'channel-list', channel_list) return channel_list, False raise ExtractorError('Unable to find the channel list') @@ -103,7 +101,7 @@ class WPPilotIE(WPPilotBaseIE): channel = self._get_channel(video_id) video_id = str(channel['id']) - is_authorized = next((c for c in self._downloader.cookiejar if c.name == 'netviapisessid'), None) + is_authorized = next((c for c in self.cookiejar if c.name == 'netviapisessid'), None) # cookies starting with "g:" are assigned to guests is_authorized = True if is_authorized is not None and not is_authorized.value.startswith('g:') else False @@ -140,8 +138,6 @@ class WPPilotIE(WPPilotBaseIE): random.choice(fmt['url']), video_id, live=True)) - self._sort_formats(formats) - channel['formats'] = formats return channel diff --git a/hypervideo_dl/extractor/wsj.py b/hypervideo_dl/extractor/wsj.py index 67236f3..86e2646 100644 --- a/hypervideo_dl/extractor/wsj.py +++ b/hypervideo_dl/extractor/wsj.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -85,7 +82,6 @@ class WSJIE(InfoExtractor): 'height': int_or_none(v.get('height')), 'fps': float_or_none(v.get('fps')), }) - self._sort_formats(formats) return { 'id': video_id, @@ -119,5 +115,6 @@ class WSJArticleIE(InfoExtractor): article_id = self._match_id(url) 
webpage = self._download_webpage(url, article_id) video_id = self._search_regex( - r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id') + r'(?:id=["\']video|video-|iframe\.html\?guid=|data-src=["\'])([a-fA-F0-9-]{36})', + webpage, 'video id') return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id) diff --git a/hypervideo_dl/extractor/wwe.py b/hypervideo_dl/extractor/wwe.py index bebc77b..9bbd477 100644 --- a/hypervideo_dl/extractor/wwe.py +++ b/hypervideo_dl/extractor/wwe.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/xbef.py b/hypervideo_dl/extractor/xbef.py index 4c41e98..ac69528 100644 --- a/hypervideo_dl/extractor/xbef.py +++ b/hypervideo_dl/extractor/xbef.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote diff --git a/hypervideo_dl/extractor/xboxclips.py b/hypervideo_dl/extractor/xboxclips.py index 9bac982..235b567 100644 --- a/hypervideo_dl/extractor/xboxclips.py +++ b/hypervideo_dl/extractor/xboxclips.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor diff --git a/hypervideo_dl/extractor/xfileshare.py b/hypervideo_dl/extractor/xfileshare.py index cd97c77..08c6d6c 100644 --- a/hypervideo_dl/extractor/xfileshare.py +++ b/hypervideo_dl/extractor/xfileshare.py @@ -1,14 +1,10 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor -from ..compat import compat_chr from ..utils import ( + ExtractorError, decode_packed_codes, determine_ext, - ExtractorError, int_or_none, js_to_json, urlencode_postdata, @@ -35,11 +31,11 @@ def aa_decode(aa_code): aa_char = aa_char.replace('+ ', '') m = re.match(r'^\d+', aa_char) if m: - ret += compat_chr(int(m.group(0), 8)) + ret += chr(int(m.group(0), 8)) else: m = re.match(r'^u([\da-f]+)', aa_char) if m: - ret += compat_chr(int(m.group(1), 16)) + ret += chr(int(m.group(1), 16)) return ret @@ -65,6 +61,7 @@ class XFileShareIE(InfoExtractor): IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) _VALID_URL = (r'https?://(?:www\.)?(?P<host>%s)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' % '|'.join(site for site in list(zip(*_SITES))[0])) + _EMBED_REGEX = [r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' % '|'.join(site for site in list(zip(*_SITES))[0])] _FILE_NOT_FOUND_REGEXES = ( r'>(?:404 - )?File Not Found<', @@ -72,6 +69,15 @@ class XFileShareIE(InfoExtractor): ) _TESTS = [{ + 'url': 'https://uqload.com/dltx1wztngdz', + 'md5': '3cfbb65e4c90e93d7b37bcb65a595557', + 'info_dict': { + 'id': 'dltx1wztngdz', + 'ext': 'mp4', + 'title': 'Rick Astley Never Gonna Give You mp4', + 'thumbnail': r're:https://.*\.jpg' + } + }, { 'url': 'http://xvideosharing.com/fq65f94nd2ve', 'md5': '4181f63957e8fe90ac836fa58dc3c8a6', 'info_dict': { @@ -88,15 +94,6 @@ class XFileShareIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//(?:%s)/embed-[0-9a-zA-Z]+.*?)\1' - % '|'.join(site for site in list(zip(*XFileShareIE._SITES))[0]), - webpage)] - def _real_extract(self, url): host, video_id = self._match_valid_url(url).groups() @@ -185,7 +182,6 @@ class XFileShareIE(InfoExtractor): 'url': video_url, 'format_id': 'sd', }) - self._sort_formats(formats) thumbnail = 
self._search_regex( [ @@ -198,4 +194,5 @@ class XFileShareIE(InfoExtractor): 'title': title, 'thumbnail': thumbnail, 'formats': formats, + 'http_headers': {'Referer': url} } diff --git a/hypervideo_dl/extractor/xhamster.py b/hypervideo_dl/extractor/xhamster.py index 9d4ed47..59eecec 100644 --- a/hypervideo_dl/extractor/xhamster.py +++ b/hypervideo_dl/extractor/xhamster.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import itertools import re @@ -23,7 +21,7 @@ from ..utils import ( class XHamsterIE(InfoExtractor): - _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com)' + _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com|xhday\.com)' _VALID_URL = r'''(?x) https?:// (?:.+?\.)?%s/ @@ -34,7 +32,7 @@ class XHamsterIE(InfoExtractor): ''' % _DOMAINS _TESTS = [{ 'url': 'https://xhamster.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', - 'md5': '98b4687efb1ffd331c4197854dc09e8f', + 'md5': '34e1ab926db5dc2750fed9e1f34304bb', 'info_dict': { 'id': '1509445', 'display_id': 'femaleagent-shy-beauty-takes-the-bait', @@ -43,6 +41,7 @@ class XHamsterIE(InfoExtractor): 'timestamp': 1350194821, 'upload_date': '20121014', 'uploader': 'Ruseful2011', + 'uploader_id': 'ruseful2011', 'duration': 893, 'age_limit': 18, }, @@ -72,6 +71,7 @@ class XHamsterIE(InfoExtractor): 'timestamp': 1454948101, 'upload_date': '20160208', 'uploader': 'parejafree', + 'uploader_id': 'parejafree', 'duration': 72, 'age_limit': 18, }, @@ -117,6 +117,9 @@ class XHamsterIE(InfoExtractor): }, { 'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx', 'only_matching': True, + }, { + 'url': 'https://xhday.com/videos/strapless-threesome-xhh7yVf', + 'only_matching': True, }] def _real_extract(self, url): @@ -231,7 +234,6 @@ class XHamsterIE(InfoExtractor): 'Referer': standard_url, }, }) - self._sort_formats(formats) categories_list = video.get('categories') if isinstance(categories_list, list): @@ -246,7 +248,6 @@ class XHamsterIE(InfoExtractor): categories = None uploader_url = url_or_none(try_get(video, lambda x: x['author']['pageURL'])) - return { 'id': video_id, 'display_id': display_id, @@ -265,7 +266,7 @@ class XHamsterIE(InfoExtractor): 'dislike_count': int_or_none(try_get( video, lambda x: x['rating']['dislikes'], int)), 'comment_count': int_or_none(video.get('views')), - 'age_limit': age_limit, + 'age_limit': age_limit if age_limit is not None else 18, 'categories': categories, 'formats': formats, } @@ -309,8 +310,6 @@ class XHamsterIE(InfoExtractor): 'url': video_url, }) - self._sort_formats(formats) - # Only a few videos have an description mobj = re.search(r'<span>Description: </span>([^<]+)', webpage) description = mobj.group(1) if mobj else None @@ -371,6 +370,7 @@ class XHamsterIE(InfoExtractor): class XHamsterEmbedIE(InfoExtractor): _VALID_URL = r'https?://(?:.+?\.)?%s/xembed\.php\?video=(?P<id>\d+)' % XHamsterIE._DOMAINS + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1'] _TEST = { 'url': 'http://xhamster.com/xembed.php?video=3328539', 'info_dict': { @@ -385,12 +385,6 @@ class XHamsterEmbedIE(InfoExtractor): } } - @staticmethod - def _extract_urls(webpage): - return [url for _, url in re.findall( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?xhamster\.com/xembed\.php\?video=\d+)\1', - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -425,6 +419,9 @@ class XHamsterUserIE(InfoExtractor): 'id': 'firatkaan', }, 
'playlist_mincount': 1, + }, { + 'url': 'https://xhday.com/users/mobhunter', + 'only_matching': True, }] def _entries(self, user_id): diff --git a/hypervideo_dl/extractor/xiami.py b/hypervideo_dl/extractor/xiami.py index 769aab3..71b2956 100644 --- a/hypervideo_dl/extractor/xiami.py +++ b/hypervideo_dl/extractor/xiami.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import int_or_none diff --git a/hypervideo_dl/extractor/ximalaya.py b/hypervideo_dl/extractor/ximalaya.py index 802d1bb..b25be77 100644 --- a/hypervideo_dl/extractor/ximalaya.py +++ b/hypervideo_dl/extractor/ximalaya.py @@ -1,11 +1,7 @@ -# coding: utf-8 - -from __future__ import unicode_literals - -import itertools -import re +import math from .common import InfoExtractor +from ..utils import traverse_obj, try_call, InAdvancePagedList class XimalayaBaseIE(InfoExtractor): @@ -15,11 +11,10 @@ class XimalayaBaseIE(InfoExtractor): class XimalayaIE(XimalayaBaseIE): IE_NAME = 'ximalaya' IE_DESC = '喜马拉雅FM' - _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/sound/(?P<id>[0-9]+)' - _USER_URL_FORMAT = '%s://www.ximalaya.com/zhubo/%i/' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(:?(?P<uid>\d+)/)?sound/(?P<id>[0-9]+)' _TESTS = [ { - 'url': 'http://www.ximalaya.com/61425525/sound/47740352/', + 'url': 'http://www.ximalaya.com/sound/47740352/', 'info_dict': { 'id': '47740352', 'ext': 'm4a', @@ -28,19 +23,20 @@ class XimalayaIE(XimalayaBaseIE): 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnail': r're:^https?://.*\.jpg', 'thumbnails': [ { 'name': 'cover_url', - 'url': r're:^https?://.*\.jpg$', + 'url': r're:^https?://.*\.jpg', }, { 'name': 'cover_url_142', - 'url': r're:^https?://.*\.jpg$', + 'url': r're:^https?://.*\.jpg', 'width': 180, 'height': 180 } ], - 'categories': ['renwen', '人文'], + 'categories': ['人文'], 'duration': 93, 'view_count': int, 'like_count': int, @@ -56,77 +52,42 @@ class XimalayaIE(XimalayaBaseIE): 'uploader_url': 'http://www.ximalaya.com/zhubo/61425525/', 'title': '261.唐诗三百首.卷八.送孟浩然之广陵.李白', 'description': "contains:《送孟浩然之广陵》\n作者:李白\n故人西辞黄鹤楼,烟花三月下扬州。\n孤帆远影碧空尽,惟见长江天际流。", + 'thumbnail': r're:^https?://.*\.jpg', 'thumbnails': [ { 'name': 'cover_url', - 'url': r're:^https?://.*\.jpg$', + 'url': r're:^https?://.*\.jpg', }, { 'name': 'cover_url_142', - 'url': r're:^https?://.*\.jpg$', + 'url': r're:^https?://.*\.jpg', 'width': 180, 'height': 180 } ], - 'categories': ['renwen', '人文'], + 'categories': ['人文'], 'duration': 93, 'view_count': int, 'like_count': int, } - }, - { - 'url': 'https://www.ximalaya.com/11045267/sound/15705996/', - 'info_dict': { - 'id': '15705996', - 'ext': 'm4a', - 'uploader': '李延隆老师', - 'uploader_id': 11045267, - 'uploader_url': 'https://www.ximalaya.com/zhubo/11045267/', - 'title': 'Lesson 1 Excuse me!', - 'description': "contains:Listen to the tape then answer\xa0this question. 
Whose handbag is it?\n" - "听录音,然后回答问题,这是谁的手袋?", - 'thumbnails': [ - { - 'name': 'cover_url', - 'url': r're:^https?://.*\.jpg$', - }, - { - 'name': 'cover_url_142', - 'url': r're:^https?://.*\.jpg$', - 'width': 180, - 'height': 180 - } - ], - 'categories': ['train', '外语'], - 'duration': 40, - 'view_count': int, - 'like_count': int, - } - }, + } ] def _real_extract(self, url): - - is_m = 'm.ximalaya' in url scheme = 'https' if url.startswith('https') else 'http' audio_id = self._match_id(url) - webpage = self._download_webpage(url, audio_id, - note='Download sound page for %s' % audio_id, - errnote='Unable to get sound page') - audio_info_file = '%s://m.ximalaya.com/tracks/%s.json' % (scheme, audio_id) audio_info = self._download_json(audio_info_file, audio_id, 'Downloading info json %s' % audio_info_file, 'Unable to download info file') - formats = [] - for bps, k in (('24k', 'play_path_32'), ('64k', 'play_path_64')): - if audio_info.get(k): - formats.append({ - 'format_id': bps, - 'url': audio_info[k], - }) + formats = [{ + 'format_id': f'{bps}k', + 'url': audio_info[k], + 'abr': bps, + 'vcodec': 'none' + } for bps, k in ((24, 'play_path_32'), (64, 'play_path_64')) if audio_info.get(k)] thumbnails = [] for k in audio_info.keys(): @@ -140,30 +101,18 @@ class XimalayaIE(XimalayaBaseIE): audio_uploader_id = audio_info.get('uid') - if is_m: - audio_description = self._html_search_regex(r'(?s)<section\s+class=["\']content[^>]+>(.+?)</section>', - webpage, 'audio_description', fatal=False) - else: - audio_description = self._html_search_regex(r'(?s)<div\s+class=["\']rich_intro[^>]*>(.+?</article>)', - webpage, 'audio_description', fatal=False) - - if not audio_description: - audio_description_file = '%s://www.ximalaya.com/sounds/%s/rich_intro' % (scheme, audio_id) - audio_description = self._download_webpage(audio_description_file, audio_id, - note='Downloading description file %s' % audio_description_file, - errnote='Unable to download descrip file', - fatal=False) - audio_description = audio_description.strip() if audio_description else None + audio_description = try_call( + lambda: audio_info['intro'].replace('\r\n\r\n\r\n ', '\n').replace('\r\n', '\n')) return { 'id': audio_id, 'uploader': audio_info.get('nickname'), 'uploader_id': audio_uploader_id, - 'uploader_url': self._USER_URL_FORMAT % (scheme, audio_uploader_id) if audio_uploader_id else None, + 'uploader_url': f'{scheme}://www.ximalaya.com/zhubo/{audio_uploader_id}/' if audio_uploader_id else None, 'title': audio_info['title'], 'thumbnails': thumbnails, 'description': audio_description, - 'categories': list(filter(None, (audio_info.get('category_name'), audio_info.get('category_title')))), + 'categories': list(filter(None, [audio_info.get('category_name')])), 'duration': audio_info.get('duration'), 'view_count': audio_info.get('play_count'), 'like_count': audio_info.get('favorites_count'), @@ -174,60 +123,38 @@ class XimalayaIE(XimalayaBaseIE): class XimalayaAlbumIE(XimalayaBaseIE): IE_NAME = 'ximalaya:album' IE_DESC = '喜马拉雅FM 专辑' - _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/(?P<uid>[0-9]+)/album/(?P<id>[0-9]+)' - _TEMPLATE_URL = '%s://www.ximalaya.com/%s/album/%s/' - _BASE_URL_TEMPL = '%s://www.ximalaya.com%s' - _LIST_VIDEO_RE = r'<a[^>]+?href="(?P<url>/%s/sound/(?P<id>\d+)/?)"[^>]+?title="(?P<title>[^>]+)">' + _VALID_URL = r'https?://(?:www\.|m\.)?ximalaya\.com/\d+/album/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.ximalaya.com/61425525/album/5534601/', 'info_dict': { 'title': '唐诗三百首(含赏析)', 'id': '5534601', }, - 
'playlist_count': 312, - }, { - 'url': 'http://m.ximalaya.com/61425525/album/5534601', - 'info_dict': { - 'title': '唐诗三百首(含赏析)', - 'id': '5534601', - }, - 'playlist_count': 312, - }, - ] + 'playlist_mincount': 323, + }] def _real_extract(self, url): - self.scheme = scheme = 'https' if url.startswith('https') else 'http' - - mobj = self._match_valid_url(url) - uid, playlist_id = mobj.group('uid'), mobj.group('id') - - webpage = self._download_webpage(self._TEMPLATE_URL % (scheme, uid, playlist_id), playlist_id, - note='Download album page for %s' % playlist_id, - errnote='Unable to get album info') + playlist_id = self._match_id(url) - title = self._html_search_regex(r'detailContent_title[^>]*><h1(?:[^>]+)?>([^<]+)</h1>', - webpage, 'title', fatal=False) + first_page = self._fetch_page(playlist_id, 1) + page_count = math.ceil(first_page['trackTotalCount'] / first_page['pageSize']) - return self.playlist_result(self._entries(webpage, playlist_id, uid), playlist_id, title) + entries = InAdvancePagedList( + lambda idx: self._get_entries(self._fetch_page(playlist_id, idx + 1) if idx else first_page), + page_count, first_page['pageSize']) - def _entries(self, page, playlist_id, uid): - html = page - for page_num in itertools.count(1): - for entry in self._process_page(html, uid): - yield entry + title = traverse_obj(first_page, ('tracks', 0, 'albumTitle'), expected_type=str) - next_url = self._search_regex(r'<a\s+href=(["\'])(?P<more>[\S]+)\1[^>]+rel=(["\'])next\3', - html, 'list_next_url', default=None, group='more') - if not next_url: - break + return self.playlist_result(entries, playlist_id, title) - next_full_url = self._BASE_URL_TEMPL % (self.scheme, next_url) - html = self._download_webpage(next_full_url, playlist_id) + def _fetch_page(self, playlist_id, page_idx): + return self._download_json( + 'https://www.ximalaya.com/revision/album/v1/getTracksList', + playlist_id, note=f'Downloading tracks list page {page_idx}', + query={'albumId': playlist_id, 'pageNum': page_idx, 'sort': 1})['data'] - def _process_page(self, html, uid): - find_from = html.index('album_soundlist') - for mobj in re.finditer(self._LIST_VIDEO_RE % uid, html[find_from:]): - yield self.url_result(self._BASE_URL_TEMPL % (self.scheme, mobj.group('url')), - XimalayaIE.ie_key(), - mobj.group('id'), - mobj.group('title')) + def _get_entries(self, page_data): + for e in page_data['tracks']: + yield self.url_result( + self._proto_relative_url(f'//www.ximalaya.com{e["url"]}'), + XimalayaIE, e.get('trackId'), e.get('title')) diff --git a/hypervideo_dl/extractor/xinpianchang.py b/hypervideo_dl/extractor/xinpianchang.py index 9832d23..ddc1d0b 100644 --- a/hypervideo_dl/extractor/xinpianchang.py +++ b/hypervideo_dl/extractor/xinpianchang.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -75,8 +72,6 @@ class XinpianchangIE(InfoExtractor): 'ext': 'mp4', } for prog in v if prog.get('url') or []]) - self._sort_formats(formats) - return { 'id': video_id, 'title': data.get('title'), diff --git a/hypervideo_dl/extractor/xminus.py b/hypervideo_dl/extractor/xminus.py index 36e5ead..5f11381 100644 --- a/hypervideo_dl/extractor/xminus.py +++ b/hypervideo_dl/extractor/xminus.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re import time diff --git a/hypervideo_dl/extractor/xnxx.py b/hypervideo_dl/extractor/xnxx.py index 27f9916..1452aae 100644 --- a/hypervideo_dl/extractor/xnxx.py +++ 
b/hypervideo_dl/extractor/xnxx.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -67,7 +64,6 @@ class XNXXIE(InfoExtractor): 'format_id': format_id, 'quality': -1 if format_id == 'low' else 0, }) - self._sort_formats(formats) thumbnail = self._og_search_thumbnail(webpage, default=None) or get( 'ThumbUrl', fatal=False) or get('ThumbUrl169', fatal=False) diff --git a/hypervideo_dl/extractor/xstream.py b/hypervideo_dl/extractor/xstream.py index 792843d..8dd1cd9 100644 --- a/hypervideo_dl/extractor/xstream.py +++ b/hypervideo_dl/extractor/xstream.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -85,7 +82,6 @@ class XstreamIE(InfoExtractor): 'url': media_url, 'tbr': tbr, }) - self._sort_formats(formats) link = find_xpath_attr( entry, xpath_with_ns('./atom:link', NS_MAP), 'rel', 'original') diff --git a/hypervideo_dl/extractor/xtube.py b/hypervideo_dl/extractor/xtube.py index abd3191..ce4480c 100644 --- a/hypervideo_dl/extractor/xtube.py +++ b/hypervideo_dl/extractor/xtube.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import itertools import re @@ -131,7 +129,6 @@ class XTubeIE(InfoExtractor): }) self._remove_duplicate_formats(formats) - self._sort_formats(formats) if not title: title = self._search_regex( diff --git a/hypervideo_dl/extractor/xuite.py b/hypervideo_dl/extractor/xuite.py index 0276c0d..71ddadd 100644 --- a/hypervideo_dl/extractor/xuite.py +++ b/hypervideo_dl/extractor/xuite.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -119,7 +116,6 @@ class XuiteIE(InfoExtractor): 'format_id': format_id, 'height': int(format_id) if format_id.isnumeric() else None, }) - self._sort_formats(formats) timestamp = media_info.get('PUBLISH_DATETIME') if timestamp: diff --git a/hypervideo_dl/extractor/xvideos.py b/hypervideo_dl/extractor/xvideos.py index d5261b6..5c505c8 100644 --- a/hypervideo_dl/extractor/xvideos.py +++ b/hypervideo_dl/extractor/xvideos.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -151,8 +149,6 @@ class XVideosIE(InfoExtractor): 'quality': -2 if format_id.endswith('low') else None, }) - self._sort_formats(formats) - return { 'id': video_id, 'formats': formats, diff --git a/hypervideo_dl/extractor/xxxymovies.py b/hypervideo_dl/extractor/xxxymovies.py index 0d53601..e3e3a9f 100644 --- a/hypervideo_dl/extractor/xxxymovies.py +++ b/hypervideo_dl/extractor/xxxymovies.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( parse_duration, diff --git a/hypervideo_dl/extractor/yahoo.py b/hypervideo_dl/extractor/yahoo.py index 20504de..a69715b 100644 --- a/hypervideo_dl/extractor/yahoo.py +++ b/hypervideo_dl/extractor/yahoo.py @@ -1,33 +1,28 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import itertools -import re +import urllib.parse +from .brightcove import BrightcoveNewIE from .common import InfoExtractor, SearchInfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse, -) +from .youtube import YoutubeIE from ..utils import ( - clean_html, ExtractorError, + clean_html, int_or_none, mimetype2ext, parse_iso8601, smuggle_url, + traverse_obj, try_get, url_or_none, ) -from .brightcove import BrightcoveNewIE -from .youtube import YoutubeIE - class 
YahooIE(InfoExtractor): IE_DESC = 'Yahoo screen and movies' _VALID_URL = r'(?P<url>https?://(?:(?P<country>[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P<id>[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)' + _EMBED_REGEX = [r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:screen|movies)\.yahoo\.com/.+?\.html\?format=embed)\1'] + _TESTS = [{ 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html', 'info_dict': { @@ -246,8 +241,6 @@ class YahooIE(InfoExtractor): if not formats and msg == 'geo restricted': self.raise_geo_restricted(metadata_available=True) - self._sort_formats(formats) - thumbnails = [] for thumb in video.get('thumbnails', []): thumb_url = thumb.get('url') @@ -317,7 +310,7 @@ class YahooIE(InfoExtractor): if items.get('markup'): entries.extend( - self.url_result(yt_url) for yt_url in YoutubeIE._extract_urls(items['markup'])) + self.url_result(yt_url) for yt_url in YoutubeIE._extract_embed_urls(url, items['markup'])) return self.playlist_result( entries, item.get('uuid'), @@ -336,7 +329,7 @@ class YahooSearchIE(SearchInfoExtractor): def _search_results(self, query): for pagenum in itertools.count(0): - result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30) + result_url = 'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (urllib.parse.quote_plus(query), pagenum * 30) info = self._download_json(result_url, query, note='Downloading results page ' + str(pagenum + 1)) yield from (self.url_result(result['rurl']) for result in info['results']) @@ -437,7 +430,7 @@ class YahooGyaOIE(InfoExtractor): page = 1 while True: playlist = self._download_json( - f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}', program_id, + f'https://gyao.yahoo.co.jp/api/programs/{program_id}/videos?page={page}&serviceId=gy', program_id, note=f'Downloading JSON metadata page {page}') if not playlist: break @@ -462,34 +455,21 @@ class YahooGyaOIE(InfoExtractor): class YahooJapanNewsIE(InfoExtractor): IE_NAME = 'yahoo:japannews' IE_DESC = 'Yahoo! Japan News' - _VALID_URL = r'https?://(?P<host>(?:news|headlines)\.yahoo\.co\.jp)[^\d]*(?P<id>\d[\d-]*\d)?' 
+ _VALID_URL = r'https?://news\.yahoo\.co\.jp/(?:articles|feature)/(?P<id>[a-zA-Z0-9]+)' _GEO_COUNTRIES = ['JP'] _TESTS = [{ - 'url': 'https://headlines.yahoo.co.jp/videonews/ann?a=20190716-00000071-ann-int', + 'url': 'https://news.yahoo.co.jp/articles/a70fe3a064f1cfec937e2252c7fc6c1ba3201c0e', 'info_dict': { - 'id': '1736242', + 'id': 'a70fe3a064f1cfec937e2252c7fc6c1ba3201c0e', 'ext': 'mp4', - 'title': 'ムン大統領が対日批判を強化“現金化”効果は?(テレビ朝日系(ANN)) - Yahoo!ニュース', - 'description': '韓国の元徴用工らを巡る裁判の原告が弁護士が差し押さえた三菱重工業の資産を売却して - Yahoo!ニュース(テレビ朝日系(ANN))', - 'thumbnail': r're:^https?://.*\.[a-zA-Z\d]{3,4}$', + 'title': '【独自】安倍元総理「国葬」中止求め“脅迫メール”…「子ども誘拐」“送信者”を追跡', + 'description': 'md5:1c06974575f930f692d8696fbcfdc546', + 'thumbnail': r're:https://.+', }, 'params': { 'skip_download': True, }, }, { - # geo restricted - 'url': 'https://headlines.yahoo.co.jp/hl?a=20190721-00000001-oxv-l04', - 'only_matching': True, - }, { - 'url': 'https://headlines.yahoo.co.jp/videonews/', - 'only_matching': True, - }, { - 'url': 'https://news.yahoo.co.jp', - 'only_matching': True, - }, { - 'url': 'https://news.yahoo.co.jp/byline/hashimotojunji/20190628-00131977/', - 'only_matching': True, - }, { 'url': 'https://news.yahoo.co.jp/feature/1356', 'only_matching': True }] @@ -497,11 +477,7 @@ class YahooJapanNewsIE(InfoExtractor): def _extract_formats(self, json_data, content_id): formats = [] - video_data = try_get( - json_data, - lambda x: x['ResultSet']['Result'][0]['VideoUrlSet']['VideoUrl'], - list) - for vid in video_data or []: + for vid in traverse_obj(json_data, ('ResultSet', 'Result', ..., 'VideoUrlSet', 'VideoUrl', ...)) or []: delivery = vid.get('delivery') url = url_or_none(vid.get('Url')) if not delivery or not url: @@ -514,73 +490,58 @@ class YahooJapanNewsIE(InfoExtractor): else: formats.append({ 'url': url, - 'format_id': 'http-%s' % compat_str(vid.get('bitrate', '')), + 'format_id': f'http-{vid.get("bitrate")}', 'height': int_or_none(vid.get('height')), 'width': int_or_none(vid.get('width')), 'tbr': int_or_none(vid.get('bitrate')), }) self._remove_duplicate_formats(formats) - self._sort_formats(formats) return formats def _real_extract(self, url): - mobj = self._match_valid_url(url) - host = mobj.group('host') - display_id = mobj.group('id') or host - - webpage = self._download_webpage(url, display_id) - - title = self._html_search_meta( - ['og:title', 'twitter:title'], webpage, 'title', default=None - ) or self._html_extract_title(webpage) - - if display_id == host: - # Headline page (w/ multiple BC playlists) ('news.yahoo.co.jp', 'headlines.yahoo.co.jp/videonews/', ...) 
- stream_plists = re.findall(r'plist=(\d+)', webpage) or re.findall(r'plist["\']:\s*["\']([^"\']+)', webpage) - entries = [ - self.url_result( - smuggle_url( - 'http://players.brightcove.net/5690807595001/HyZNerRl7_default/index.html?playlistId=%s' % plist_id, - {'geo_countries': ['JP']}), - ie='BrightcoveNew', video_id=plist_id) - for plist_id in stream_plists] - return self.playlist_result(entries, playlist_title=title) - - # Article page - description = self._html_search_meta( - ['og:description', 'description', 'twitter:description'], - webpage, 'description', default=None) - thumbnail = self._og_search_thumbnail( - webpage, default=None) or self._html_search_meta( - 'twitter:image', webpage, 'thumbnail', default=None) - space_id = self._search_regex([ - r'<script[^>]+class=["\']yvpub-player["\'][^>]+spaceid=([^&"\']+)', - r'YAHOO\.JP\.srch\.\w+link\.onLoad[^;]+spaceID["\' ]*:["\' ]+([^"\']+)', - r'<!--\s+SpaceID=(\d+)' - ], webpage, 'spaceid') - - content_id = self._search_regex( - r'<script[^>]+class=["\']yvpub-player["\'][^>]+contentid=(?P<contentid>[^&"\']+)', - webpage, 'contentid', group='contentid') - + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + preloaded_state = self._search_json(r'__PRELOADED_STATE__\s*=', webpage, 'preloaded state', video_id) + + content_id = traverse_obj( + preloaded_state, ('articleDetail', 'paragraphs', ..., 'objectItems', ..., 'video', 'vid'), + get_all=False, expected_type=int) + if content_id is None: + raise ExtractorError('This article does not contain a video', expected=True) + + HOST = 'news.yahoo.co.jp' + space_id = traverse_obj(preloaded_state, ('pageData', 'spaceId'), expected_type=str) json_data = self._download_json( - 'https://feapi-yvpub.yahooapis.jp/v1/content/%s' % content_id, - content_id, - query={ + f'https://feapi-yvpub.yahooapis.jp/v1/content/{content_id}', + video_id, query={ 'appid': 'dj0zaiZpPVZMTVFJR0FwZWpiMyZzPWNvbnN1bWVyc2VjcmV0Jng9YjU-', 'output': 'json', - 'space_id': space_id, - 'domain': host, - 'ak': hashlib.md5('_'.join((space_id, host)).encode()).hexdigest(), + 'domain': HOST, + 'ak': hashlib.md5('_'.join((space_id, HOST)).encode()).hexdigest() if space_id else '', 'device_type': '1100', }) - formats = self._extract_formats(json_data, content_id) + + title = ( + traverse_obj(preloaded_state, + ('articleDetail', 'headline'), ('pageData', 'pageParam', 'title'), + expected_type=str) + or self._html_search_meta(('og:title', 'twitter:title'), webpage, 'title', default=None) + or self._html_extract_title(webpage)) + description = ( + traverse_obj(preloaded_state, ('pageData', 'description'), expected_type=str) + or self._html_search_meta( + ('og:description', 'description', 'twitter:description'), + webpage, 'description', default=None)) + thumbnail = ( + traverse_obj(preloaded_state, ('pageData', 'ogpImage'), expected_type=str) + or self._og_search_thumbnail(webpage, default=None) + or self._html_search_meta('twitter:image', webpage, 'thumbnail', default=None)) return { - 'id': content_id, + 'id': video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, - 'formats': formats, + 'formats': self._extract_formats(json_data, video_id), } diff --git a/hypervideo_dl/extractor/yandexdisk.py b/hypervideo_dl/extractor/yandexdisk.py index c15f3a4..d5eecbd 100644 --- a/hypervideo_dl/extractor/yandexdisk.py +++ b/hypervideo_dl/extractor/yandexdisk.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json from .common import InfoExtractor @@ -130,7 
+127,6 @@ class YandexDiskIE(InfoExtractor): 'url': format_url, 'width': int_or_none(size.get('width')), }) - self._sort_formats(formats) uid = resource.get('uid') display_name = try_get(store, lambda x: x['users'][uid]['displayName']) diff --git a/hypervideo_dl/extractor/yandexmusic.py b/hypervideo_dl/extractor/yandexmusic.py index 8e94f1f..1869091 100644 --- a/hypervideo_dl/extractor/yandexmusic.py +++ b/hypervideo_dl/extractor/yandexmusic.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import hashlib import itertools @@ -118,8 +115,7 @@ class YandexMusicTrackIE(YandexMusicBaseIE): download_data = self._download_json( 'https://music.yandex.ru/api/v2.1/handlers/track/%s:%s/web-album_track-track-track-main/download/m' % (track_id, album_id), - track_id, 'Downloading track location url JSON', - headers={'X-Retpath-Y': url}) + track_id, 'Downloading track location url JSON', query={'hq': 1}, headers={'X-Retpath-Y': url}) fd_data = self._download_json( download_data['src'], track_id, diff --git a/hypervideo_dl/extractor/yandexvideo.py b/hypervideo_dl/extractor/yandexvideo.py index 7d3966b..535b61f 100644 --- a/hypervideo_dl/extractor/yandexvideo.py +++ b/hypervideo_dl/extractor/yandexvideo.py @@ -1,17 +1,15 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools -import re from .common import InfoExtractor from ..utils import ( determine_ext, extract_attributes, int_or_none, + lowercase_escape, + parse_qs, + traverse_obj, try_get, url_or_none, - lowercase_escape, ) @@ -26,7 +24,6 @@ class YandexVideoIE(InfoExtractor): ''' _TESTS = [{ 'url': 'https://yandex.ru/portal/video?stream_id=4dbb36ec4e0526d58f9f2dc8f0ecf374', - 'md5': 'e02a05bfaf0d9615ef07ae3a10f4faf4', 'info_dict': { 'id': '4dbb36ec4e0526d58f9f2dc8f0ecf374', 'ext': 'mp4', @@ -41,6 +38,7 @@ class YandexVideoIE(InfoExtractor): 'like_count': int, 'dislike_count': int, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://yandex.ru/portal/efir?stream_id=4dbb262b4fe5cf15a215de4f34eee34d&from=morda', 'only_matching': True, @@ -123,8 +121,6 @@ class YandexVideoIE(InfoExtractor): else: formats.append({'url': content_url}) - self._sort_formats(formats) - timestamp = (int_or_none(content.get('release_date')) or int_or_none(content.get('release_date_ut')) or int_or_none(content.get('start_time'))) @@ -150,7 +146,7 @@ class YandexVideoIE(InfoExtractor): class YandexVideoPreviewIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?yandex\.ru/video/preview(?:/?\?.*?filmId=|/)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?yandex\.\w{2,3}(?:\.(?:am|ge|il|tr))?/video/preview(?:/?\?.*?filmId=|/)(?P<id>\d+)' _TESTS = [{ # Odnoklassniki 'url': 'https://yandex.ru/video/preview/?filmId=10682852472978372885&text=summer', 'info_dict': { @@ -177,6 +173,9 @@ class YandexVideoPreviewIE(InfoExtractor): }, { # Odnoklassniki 'url': 'https://yandex.ru/video/preview/?text=Francis%20Lai%20-%20Le%20Bon%20Et%20Les%20MC)chants&path=wizard&parent-reqid=1643208087979310-1481782809207673478-sas3-0931-2f9-sas-l7-balancer-8080-BAL-9380&wiz_type=vital&filmId=12508152936505397283', 'only_matching': True, + }, { # Odnoklassniki + 'url': 'https://yandex.com/video/preview/?text=dossier%2051%20film%201978&path=yandex_search&parent-reqid=1664361087754492-8727541069609384458-sas2-0340-sas-l7-balancer-8080-BAL-8045&noreask=1&from_type=vast&filmId=5794987234584444632', + 'only_matching': True, }] def _real_extract(self, url): @@ -188,34 +187,35 @@ class YandexVideoPreviewIE(InfoExtractor): class 
ZenYandexIE(InfoExtractor): - _VALID_URL = r'https?://zen\.yandex\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P<id>[a-z0-9-]+)' + _VALID_URL = r'https?://(zen\.yandex|dzen)\.ru(?:/video)?/(media|watch)/(?:(?:id/[^/]+/|[^/]+/)(?:[a-z0-9-]+)-)?(?P<id>[a-z0-9-]+)' _TESTS = [{ - 'url': 'https://zen.yandex.ru/media/popmech/izverjenie-vulkana-iz-spichek-zreliscnyi-opyt-6002240ff8b1af50bb2da5e3', + 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7', 'info_dict': { - 'id': '6002240ff8b1af50bb2da5e3', + 'id': '60c7c443da18892ebfe85ed7', 'ext': 'mp4', - 'title': 'Извержение вулкана из спичек: зрелищный опыт', - 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633', - 'thumbnail': 're:^https://avatars.mds.yandex.net/', - 'uploader': 'Популярная механика', + 'title': 'ВОТ ЭТО Focus. Деды Морозы на гидроциклах', + 'description': 'md5:f3db3d995763b9bbb7b56d4ccdedea89', + 'thumbnail': 're:^https://avatars.dzeninfra.ru/', + 'uploader': 'AcademeG DailyStream' }, 'params': { 'skip_download': 'm3u8', + 'format': 'bestvideo', }, + 'skip': 'The page does not exist', }, { - 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7', + 'url': 'https://dzen.ru/media/id/606fd806cc13cb3c58c05cf5/vot-eto-focus-dedy-morozy-na-gidrociklah-60c7c443da18892ebfe85ed7', 'info_dict': { 'id': '60c7c443da18892ebfe85ed7', 'ext': 'mp4', 'title': 'ВОТ ЭТО Focus. Деды Морозы на гидроциклах', 'description': 'md5:f3db3d995763b9bbb7b56d4ccdedea89', - 'thumbnail': 're:^https://avatars.mds.yandex.net/', - 'uploader': 'AcademeG DailyStream' - }, - 'params': { - 'skip_download': 'm3u8', - 'format': 'bestvideo', + 'thumbnail': r're:^https://avatars\.dzeninfra\.ru/', + 'uploader': 'AcademeG DailyStream', + 'upload_date': '20191111', + 'timestamp': 1573465585, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://zen.yandex.ru/video/watch/6002240ff8b1af50bb2da5e3', 'info_dict': { @@ -223,21 +223,42 @@ class ZenYandexIE(InfoExtractor): 'ext': 'mp4', 'title': 'Извержение вулкана из спичек: зрелищный опыт', 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633', - 'uploader': 'Популярная механика', + 'thumbnail': r're:^https://avatars\.dzeninfra\.ru/', + 'uploader': 'TechInsider', + 'timestamp': 1611378221, + 'upload_date': '20210123', }, - 'params': { - 'skip_download': 'm3u8', + 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 'https://dzen.ru/video/watch/6002240ff8b1af50bb2da5e3', + 'info_dict': { + 'id': '6002240ff8b1af50bb2da5e3', + 'ext': 'mp4', + 'title': 'Извержение вулкана из спичек: зрелищный опыт', + 'description': 'md5:053ad3c61b5596d510c9a199dc8ee633', + 'thumbnail': 're:^https://avatars.dzeninfra.ru/', + 'uploader': 'TechInsider', + 'upload_date': '20210123', + 'timestamp': 1611378221, }, + 'params': {'skip_download': 'm3u8'}, }, { 'url': 'https://zen.yandex.ru/media/id/606fd806cc13cb3c58c05cf5/novyi-samsung-fold-3-moskvich-barahlit-612f93b7f8d48e7e945792a2?from=channel&rid=2286618386.482.1630817595976.42360', 'only_matching': True, + }, { + 'url': 'https://dzen.ru/media/id/606fd806cc13cb3c58c05cf5/novyi-samsung-fold-3-moskvich-barahlit-612f93b7f8d48e7e945792a2?from=channel&rid=2286618386.482.1630817595976.42360', + 'only_matching': True, }] def _real_extract(self, url): - id = self._match_id(url) - webpage = self._download_webpage(url, id) - data_json = self._parse_json( - self._search_regex(r'data\s*=\s*({["\']_*serverState_*video.+?});', 
webpage, 'metadata'), id) + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + redirect = self._search_json(r'var it\s*=', webpage, 'redirect', id, default={}).get('retpath') + if redirect: + video_id = self._match_id(redirect) + webpage = self._download_webpage(redirect, video_id, note='Redirecting') + data_json = self._search_json( + r'data\s*=', webpage, 'metadata', video_id, contains_pattern=r'{["\']_*serverState_*video.+}') serverstate = self._search_regex(r'(_+serverState_+video-site_[^_]+_+)', webpage, 'server state').replace('State', 'Settings') uploader = self._search_regex(r'(<a\s*class=["\']card-channel-link[^"\']+["\'][^>]+>)', @@ -252,13 +273,13 @@ class ZenYandexIE(InfoExtractor): formats.extend(self._extract_mpd_formats(s_url, id, mpd_id='dash')) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats(s_url, id, 'mp4')) - self._sort_formats(formats) return { - 'id': id, + 'id': video_id, 'title': video_json.get('title') or self._og_search_title(webpage), 'formats': formats, 'duration': int_or_none(video_json.get('duration')), 'view_count': int_or_none(video_json.get('views')), + 'timestamp': int_or_none(video_json.get('publicationDate')), 'uploader': uploader_name or data_json.get('authorName') or try_get(data_json, lambda x: x['publisher']['name']), 'description': self._og_search_description(webpage) or try_get(data_json, lambda x: x['og']['description']), 'thumbnail': self._og_search_thumbnail(webpage) or try_get(data_json, lambda x: x['og']['imageUrl']), @@ -266,40 +287,99 @@ class ZenYandexIE(InfoExtractor): class ZenYandexChannelIE(InfoExtractor): - _VALID_URL = r'https?://zen\.yandex\.ru/(?!media|video)(?:id/)?(?P<id>[a-z0-9-_]+)' + _VALID_URL = r'https?://(zen\.yandex|dzen)\.ru/(?!media|video)(?:id/)?(?P<id>[a-z0-9-_]+)' _TESTS = [{ 'url': 'https://zen.yandex.ru/tok_media', 'info_dict': { 'id': 'tok_media', + 'title': 'СПЕКТР', + 'description': 'md5:a9e5b3c247b7fe29fd21371a428bcf56', + }, + 'playlist_mincount': 169, + }, { + 'url': 'https://dzen.ru/tok_media', + 'info_dict': { + 'id': 'tok_media', + 'title': 'СПЕКТР', + 'description': 'md5:a9e5b3c247b7fe29fd21371a428bcf56', }, 'playlist_mincount': 169, }, { 'url': 'https://zen.yandex.ru/id/606fd806cc13cb3c58c05cf5', 'info_dict': { 'id': '606fd806cc13cb3c58c05cf5', + 'description': 'md5:517b7c97d8ca92e940f5af65448fd928', + 'title': 'AcademeG DailyStream', + }, + 'playlist_mincount': 657, + }, { + # Test that the playlist extractor finishes extracting when the + # channel has less than one page + 'url': 'https://zen.yandex.ru/jony_me', + 'info_dict': { + 'id': 'jony_me', + 'description': 'md5:a2c62b4ef5cf3e3efb13d25f61f739e1', + 'title': 'JONY ', + }, + 'playlist_count': 20, + }, { + # Test that the playlist extractor finishes extracting when the + # channel has more than one page of entries + 'url': 'https://zen.yandex.ru/tatyanareva', + 'info_dict': { + 'id': 'tatyanareva', + 'description': 'md5:296b588d60841c3756c9105f237b70c6', + 'title': 'Татьяна Рева', + 'entries': 'maxcount:200', + }, + 'playlist_count': 46, + }, { + 'url': 'https://dzen.ru/id/606fd806cc13cb3c58c05cf5', + 'info_dict': { + 'id': '606fd806cc13cb3c58c05cf5', + 'title': 'AcademeG DailyStream', + 'description': 'md5:517b7c97d8ca92e940f5af65448fd928', }, 'playlist_mincount': 657, }] - def _entries(self, id, url): - webpage = self._download_webpage(url, id) - data_json = self._parse_json(re.findall(r'var\s?data\s?=\s?({.+?})\s?;', webpage)[-1], id) - for key in data_json.keys(): - if 
key.startswith('__serverState__'): - data_json = data_json[key] - items = list(try_get(data_json, lambda x: x['feed']['items'], dict).values()) - more = try_get(data_json, lambda x: x['links']['more']) or None + def _entries(self, item_id, server_state_json, server_settings_json): + items = (traverse_obj(server_state_json, ('feed', 'items', ...)) + or traverse_obj(server_settings_json, ('exportData', 'items', ...))) + + more = (traverse_obj(server_state_json, ('links', 'more')) + or traverse_obj(server_settings_json, ('exportData', 'more', 'link'))) + + next_page_id = None for page in itertools.count(1): - for item in items: - video_id = item.get('publication_id') or item.get('publicationId') - video_url = item.get('link') - yield self.url_result(video_url, ie=ZenYandexIE.ie_key(), video_id=video_id.split(':')[-1]) - if not more: + for item in items or []: + if item.get('type') != 'gif': + continue + video_id = traverse_obj(item, 'publication_id', 'publicationId') or '' + yield self.url_result(item['link'], ZenYandexIE, video_id.split(':')[-1]) + + current_page_id = next_page_id + next_page_id = traverse_obj(parse_qs(more), ('next_page_id', -1)) + if not all((more, items, next_page_id, next_page_id != current_page_id)): break - data_json = self._download_json(more, id, note='Downloading Page %d' % page) - items = data_json.get('items', []) - more = try_get(data_json, lambda x: x['more']['link']) or None + + data = self._download_json(more, item_id, note=f'Downloading Page {page}') + items, more = data.get('items'), traverse_obj(data, ('more', 'link')) def _real_extract(self, url): - id = self._match_id(url) - return self.playlist_result(self._entries(id, url), playlist_id=id) + item_id = self._match_id(url) + webpage = self._download_webpage(url, item_id) + redirect = self._search_json( + r'var it\s*=', webpage, 'redirect', item_id, default={}).get('retpath') + if redirect: + item_id = self._match_id(redirect) + webpage = self._download_webpage(redirect, item_id, note='Redirecting') + data = self._search_json( + r'var\s+data\s*=', webpage, 'channel data', item_id, contains_pattern=r'{\"__serverState__.+}') + server_state_json = traverse_obj(data, lambda k, _: k.startswith('__serverState__'), get_all=False) + server_settings_json = traverse_obj(data, lambda k, _: k.startswith('__serverSettings__'), get_all=False) + + return self.playlist_result( + self._entries(item_id, server_state_json, server_settings_json), + item_id, traverse_obj(server_state_json, ('channel', 'source', 'title')), + traverse_obj(server_state_json, ('channel', 'source', 'description'))) diff --git a/hypervideo_dl/extractor/yapfiles.py b/hypervideo_dl/extractor/yapfiles.py index cfb368d..19812ba 100644 --- a/hypervideo_dl/extractor/yapfiles.py +++ b/hypervideo_dl/extractor/yapfiles.py @@ -1,14 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, qualities, - unescapeHTML, url_or_none, ) @@ -16,6 +10,7 @@ from ..utils import ( class YapFilesIE(InfoExtractor): _YAPFILES_URL = r'//(?:(?:www|api)\.)?yapfiles\.ru/get_player/*\?.*?\bv=(?P<id>\w+)' _VALID_URL = r'https?:%s' % _YAPFILES_URL + _EMBED_REGEX = [rf'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?{_YAPFILES_URL}.*?)\1'] _TESTS = [{ # with hd 'url': 'http://www.yapfiles.ru/get_player/?v=vMDE1NjcyNDUt0413', @@ -33,12 +28,6 @@ class YapFilesIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def _extract_urls(webpage): - return 
[unescapeHTML(mobj.group('url')) for mobj in re.finditer( - r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.*?)\1' - % YapFilesIE._YAPFILES_URL, webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -90,7 +79,6 @@ class YapFilesIE(InfoExtractor): 'quality': quality_key(format_id), 'height': hd_height if is_hd else None, }) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/yesjapan.py b/hypervideo_dl/extractor/yesjapan.py index 681338c..b45fa8f 100644 --- a/hypervideo_dl/extractor/yesjapan.py +++ b/hypervideo_dl/extractor/yesjapan.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ( HEADRequest, diff --git a/hypervideo_dl/extractor/yinyuetai.py b/hypervideo_dl/extractor/yinyuetai.py index 1fd8d35..b2e3172 100644 --- a/hypervideo_dl/extractor/yinyuetai.py +++ b/hypervideo_dl/extractor/yinyuetai.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import ExtractorError @@ -44,7 +41,6 @@ class YinYueTaiIE(InfoExtractor): 'ext': 'mp4', 'tbr': format_info.get('bitrate'), } for format_info in info['videoUrlModels']] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/yle_areena.py b/hypervideo_dl/extractor/yle_areena.py new file mode 100644 index 0000000..118dc12 --- /dev/null +++ b/hypervideo_dl/extractor/yle_areena.py @@ -0,0 +1,71 @@ +from .common import InfoExtractor +from .kaltura import KalturaIE +from ..utils import int_or_none, traverse_obj, url_or_none + + +class YleAreenaIE(InfoExtractor): + _VALID_URL = r'https?://areena\.yle\.fi/(?P<id>[\d-]+)' + _TESTS = [{ + 'url': 'https://areena.yle.fi/1-4371942', + 'md5': '932edda0ecf5dfd6423804182d32f8ac', + 'info_dict': { + 'id': '0_a3tjk92c', + 'ext': 'mp4', + 'title': 'Pouchit', + 'description': 'md5:d487309c3abbe5650265bbd1742d2f82', + 'series': 'Modernit miehet', + 'season': 'Season 1', + 'season_number': 1, + 'episode': 'Episode 2', + 'episode_number': 2, + 'thumbnail': 'http://cfvod.kaltura.com/p/1955031/sp/195503100/thumbnail/entry_id/0_a3tjk92c/version/100061', + 'uploader_id': 'ovp@yle.fi', + 'duration': 1435, + 'view_count': int, + 'upload_date': '20181204', + 'timestamp': 1543916210, + 'subtitles': {'fin': [{'url': r're:^https?://', 'ext': 'srt'}]}, + 'age_limit': 7, + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + info = self._search_json_ld(self._download_webpage(url, video_id), video_id, default={}) + video_data = self._download_json( + f'https://player.api.yle.fi/v1/preview/{video_id}.json?app_id=player_static_prod&app_key=8930d72170e48303cf5f3867780d549b', + video_id) + + # Example title: 'K1, J2: Pouchit | Modernit miehet' + series, season_number, episode_number, episode = self._search_regex( + r'K(?P<season_no>[\d]+),\s*J(?P<episode_no>[\d]+):?\s*\b(?P<episode>[^|]+)\s*|\s*(?P<series>.+)', + info.get('title') or '', 'episode metadata', group=('season_no', 'episode_no', 'episode', 'series'), + default=(None, None, None, None)) + description = traverse_obj(video_data, ('data', 'ongoing_ondemand', 'description', 'fin'), expected_type=str) + + subtitles = {} + for sub in traverse_obj(video_data, ('data', 'ongoing_ondemand', 'subtitles', ...)): + if url_or_none(sub.get('uri')): + subtitles.setdefault(sub.get('language') or 'und', []).append({ + 'url': sub['uri'], + 'ext': 'srt', + 'name': sub.get('kind'), + }) + + return { + '_type': 
'url_transparent', + 'url': 'kaltura:1955031:%s' % traverse_obj(video_data, ('data', 'ongoing_ondemand', 'kaltura', 'id')), + 'ie_key': KalturaIE.ie_key(), + 'title': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'title', 'fin'), expected_type=str) + or episode or info.get('title')), + 'description': description, + 'series': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'series', 'title', 'fin'), expected_type=str) + or series), + 'season_number': (int_or_none(self._search_regex(r'Kausi (\d+)', description, 'season number', default=None)) + or int(season_number)), + 'episode_number': (traverse_obj(video_data, ('data', 'ongoing_ondemand', 'episode_number'), expected_type=int_or_none) + or int(episode_number)), + 'thumbnails': traverse_obj(info, ('thumbnails', ..., {'url': 'url'})), + 'age_limit': traverse_obj(video_data, ('data', 'ongoing_ondemand', 'content_rating', 'age_restriction'), expected_type=int_or_none), + 'subtitles': subtitles, + } diff --git a/hypervideo_dl/extractor/ynet.py b/hypervideo_dl/extractor/ynet.py index c4ae4d8..a7d7371 100644 --- a/hypervideo_dl/extractor/ynet.py +++ b/hypervideo_dl/extractor/ynet.py @@ -1,11 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re import json +import re +import urllib.parse from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote_plus class YnetIE(InfoExtractor): @@ -34,7 +31,7 @@ class YnetIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - content = compat_urllib_parse_unquote_plus(self._og_search_video_url(webpage)) + content = urllib.parse.unquote_plus(self._og_search_video_url(webpage)) config = json.loads(self._search_regex(r'config=({.+?})$', content, 'video config')) f4m_url = config['clip']['url'] title = self._og_search_title(webpage) @@ -42,7 +39,6 @@ class YnetIE(InfoExtractor): if m: title = m.group('title') formats = self._extract_f4m_formats(f4m_url, video_id) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/youjizz.py b/hypervideo_dl/extractor/youjizz.py index 111623f..cd12be5 100644 --- a/hypervideo_dl/extractor/youjizz.py +++ b/hypervideo_dl/extractor/youjizz.py @@ -1,6 +1,3 @@ -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( determine_ext, diff --git a/hypervideo_dl/extractor/youku.py b/hypervideo_dl/extractor/youku.py index b505799..624975b 100644 --- a/hypervideo_dl/extractor/youku.py +++ b/hypervideo_dl/extractor/youku.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import random import re import string @@ -201,7 +198,6 @@ class YoukuIE(InfoExtractor): 'width': stream.get('width'), 'height': stream.get('height'), } for stream in data['stream'] if stream.get('channel_type') != 'tail'] - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/younow.py b/hypervideo_dl/extractor/younow.py index 583aea3..18112ba 100644 --- a/hypervideo_dl/extractor/younow.py +++ b/hypervideo_dl/extractor/younow.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import itertools from .common import InfoExtractor @@ -94,7 +91,7 @@ def _extract_moment(item, fatal=True): uploader = try_get(item, lambda x: x['owner']['name'], compat_str) uploader_id = try_get(item, lambda x: x['owner']['userId']) - uploader_url = format_field(uploader, template='https://www.younow.com/%s') + uploader_url = format_field(uploader, None, 
'https://www.younow.com/%s') entry = { 'extractor_key': 'YouNowMoment', diff --git a/hypervideo_dl/extractor/youporn.py b/hypervideo_dl/extractor/youporn.py index 5feb568..8f1b991 100644 --- a/hypervideo_dl/extractor/youporn.py +++ b/hypervideo_dl/extractor/youporn.py @@ -1,11 +1,10 @@ -from __future__ import unicode_literals - import re from .common import InfoExtractor from ..utils import ( extract_attributes, int_or_none, + merge_dicts, str_to_int, unified_strdate, url_or_none, @@ -14,6 +13,7 @@ from ..utils import ( class YouPornIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?youporn\.com/(?:watch|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?' + _EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)'] _TESTS = [{ 'url': 'http://www.youporn.com/watch/505835/sex-ed-is-it-safe-to-masturbate-daily/', 'md5': '3744d24c50438cf5b6f6d59feb5055c2', @@ -65,14 +65,26 @@ class YouPornIE(InfoExtractor): }, { 'url': 'https://www.youporn.com/watch/13922959/femdom-principal/', 'only_matching': True, + }, { + 'url': 'https://www.youporn.com/watch/16290308/tinderspecial-trailer1/', + 'info_dict': { + 'id': '16290308', + 'age_limit': 18, + 'categories': [], + 'description': 'md5:00ea70f642f431c379763c17c2f396bc', + 'display_id': 'tinderspecial-trailer1', + 'duration': 298.0, + 'ext': 'mp4', + 'upload_date': '20201123', + 'uploader': 'Ersties', + 'tags': [], + 'thumbnail': 'https://fi1.ypncdn.com/202011/23/16290308/original/8/tinderspecial-trailer1-8(m=eaAaaEPbaaaa).jpg', + 'timestamp': 1606089600, + 'title': 'Tinder In Real Life', + 'view_count': int, + } }] - @staticmethod - def _extract_urls(webpage): - return re.findall( - r'<iframe[^>]+\bsrc=["\']((?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)', - webpage) - def _real_extract(self, url): mobj = self._match_valid_url(url) video_id = mobj.group('id') @@ -110,7 +122,6 @@ class YouPornIE(InfoExtractor): }) f['height'] = height formats.append(f) - self._sort_formats(formats) webpage = self._download_webpage( 'http://www.youporn.com/watch/%s' % video_id, display_id, @@ -137,9 +148,10 @@ class YouPornIE(InfoExtractor): r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>', webpage, 'uploader', fatal=False) upload_date = unified_strdate(self._html_search_regex( - [r'UPLOADED:\s*<span>([^<]+)', + (r'UPLOADED:\s*<span>([^<]+)', r'Date\s+[Aa]dded:\s*<span>([^<]+)', - r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'], + r'''(?s)<div[^>]+class=["']videoInfo(?:Date|Time)\b[^>]*>(.+?)</div>''', + r'(?s)<label\b[^>]*>Uploaded[^<]*</label>\s*<span\b[^>]*>(.+?)</span>'), webpage, 'upload date', fatal=False)) age_limit = self._rta_search(webpage) @@ -166,7 +178,8 @@ class YouPornIE(InfoExtractor): r'(?s)Tags:.*?</div>\s*<div[^>]+class=["\']tagBoxContent["\'][^>]*>(.+?)</div>', 'tags') - return { + data = self._search_json_ld(webpage, video_id, expected_type='VideoObject', fatal=False) + return merge_dicts(data, { 'id': video_id, 'display_id': display_id, 'title': title, @@ -181,4 +194,4 @@ class YouPornIE(InfoExtractor): 'tags': tags, 'age_limit': age_limit, 'formats': formats, - } + }) diff --git a/hypervideo_dl/extractor/yourporn.py b/hypervideo_dl/extractor/yourporn.py index 9834749..38f42a9 100644 --- a/hypervideo_dl/extractor/yourporn.py +++ b/hypervideo_dl/extractor/yourporn.py @@ -1,5 +1,3 @@ -from __future__ import unicode_literals - from .common import InfoExtractor from ..compat import compat_str from ..utils import ( diff --git a/hypervideo_dl/extractor/yourupload.py 
b/hypervideo_dl/extractor/yourupload.py index 9fa7728..def6329 100644 --- a/hypervideo_dl/extractor/yourupload.py +++ b/hypervideo_dl/extractor/yourupload.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import urljoin diff --git a/hypervideo_dl/extractor/youtube.py b/hypervideo_dl/extractor/youtube.py index dec3b14..f7e3c75 100644 --- a/hypervideo_dl/extractor/youtube.py +++ b/hypervideo_dl/extractor/youtube.py @@ -1,11 +1,9 @@ -# coding: utf-8 - -from __future__ import unicode_literals - +import base64 import calendar +import collections import copy import datetime -import functools +import enum import hashlib import itertools import json @@ -14,29 +12,27 @@ import os.path import random import re import sys +import threading import time import traceback -import threading +import urllib.error +import urllib.parse from .common import InfoExtractor, SearchInfoExtractor -from ..compat import ( - compat_chr, - compat_HTTPError, - compat_parse_qs, - compat_str, - compat_urllib_parse_unquote_plus, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, -) +from .openload import PhantomJSwrapper +from ..compat import functools from ..jsinterp import JSInterpreter from ..utils import ( + NO_DEFAULT, + ExtractorError, + LazyList, + UserNotLive, bug_reports_message, + classproperty, clean_html, datetime_from_str, dict_get, - error_to_compat_str, - ExtractorError, + filter_dict, float_or_none, format_field, get_first, @@ -46,7 +42,6 @@ from ..utils import ( js_to_json, mimetype2ext, network_exceptions, - NO_DEFAULT, orderedSet, parse_codecs, parse_count, @@ -54,7 +49,6 @@ from ..utils import ( parse_iso8601, parse_qs, qualities, - remove_end, remove_start, smuggle_url, str_or_none, @@ -72,15 +66,14 @@ from ..utils import ( variadic, ) - -# any clients starting with _ cannot be explicity requested by the user +# any clients starting with _ cannot be explicitly requested by the user INNERTUBE_CLIENTS = { 'web': { 'INNERTUBE_API_KEY': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8', 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB', - 'clientVersion': '2.20211221.00.00', + 'clientVersion': '2.20220801.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 1 @@ -90,7 +83,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_EMBEDDED_PLAYER', - 'clientVersion': '1.20211215.00.01', + 'clientVersion': '1.20220731.00.00', }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 56 @@ -101,7 +94,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_REMIX', - 'clientVersion': '1.20211213.00.00', + 'clientVersion': '1.20220727.01.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 67, @@ -111,7 +104,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'WEB_CREATOR', - 'clientVersion': '1.20211220.02.00', + 'clientVersion': '1.20220726.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 62, @@ -121,7 +114,9 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID', - 'clientVersion': '16.49', + 'clientVersion': '17.31.35', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 3, @@ -132,7 +127,9 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_EMBEDDED_PLAYER', - 'clientVersion': '16.49', + 'clientVersion': '17.31.35', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.youtube/17.31.35 (Linux; U; Android 11) gzip' 
}, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 55, @@ -143,7 +140,9 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_MUSIC', - 'clientVersion': '4.57', + 'clientVersion': '5.16.51', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.apps.youtube.music/5.16.51 (Linux; U; Android 11) gzip' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 21, @@ -154,7 +153,9 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'ANDROID_CREATOR', - 'clientVersion': '21.47', + 'clientVersion': '22.30.100', + 'androidSdkVersion': 30, + 'userAgent': 'com.google.android.apps.youtube.creator/22.30.100 (Linux; U; Android 11) gzip' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 14, @@ -167,8 +168,9 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS', - 'clientVersion': '16.46', + 'clientVersion': '17.33.2', 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 5, @@ -178,8 +180,9 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MESSAGES_EXTENSION', - 'clientVersion': '16.46', + 'clientVersion': '17.33.2', 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.youtube/17.33.2 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 66, @@ -190,7 +193,9 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_MUSIC', - 'clientVersion': '4.57', + 'clientVersion': '5.21', + 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.youtubemusic/5.21 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 26, @@ -200,7 +205,9 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'IOS_CREATOR', - 'clientVersion': '21.47', + 'clientVersion': '22.33.101', + 'deviceModel': 'iPhone14,3', + 'userAgent': 'com.google.ios.ytcreator/22.33.101 (iPhone14,3; U; CPU iOS 15_6 like Mac OS X)' }, }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 15, @@ -213,7 +220,7 @@ INNERTUBE_CLIENTS = { 'INNERTUBE_CONTEXT': { 'client': { 'clientName': 'MWEB', - 'clientVersion': '2.20211221.01.00', + 'clientVersion': '2.20220801.00.00', } }, 'INNERTUBE_CONTEXT_CLIENT_NAME': 2 @@ -272,14 +279,23 @@ def build_innertube_clients(): build_innertube_clients() +class BadgeType(enum.Enum): + AVAILABILITY_UNLISTED = enum.auto() + AVAILABILITY_PRIVATE = enum.auto() + AVAILABILITY_PUBLIC = enum.auto() + AVAILABILITY_PREMIUM = enum.auto() + AVAILABILITY_SUBSCRIPTION = enum.auto() + LIVE_NOW = enum.auto() + + class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _RESERVED_NAMES = ( r'channel|c|user|playlist|watch|w|v|embed|e|watch_popup|clip|' r'shorts|movies|results|search|shared|hashtag|trending|explore|feed|feeds|' - r'browse|oembed|get_video_info|iframe_api|s/player|' - r'storefront|oops|index|account|reporthistory|t/terms|about|upload|signin|logout') + r'browse|oembed|get_video_info|iframe_api|s/player|source|' + r'storefront|oops|index|account|t/terms|about|upload|signin|logout') _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)' @@ -292,7 +308,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # invidious-redirect websites r'(?:www\.)?redirect\.invidious\.io', r'(?:(?:www|dev)\.)?invidio\.us', - # Invidious instances taken from https://github.com/iv-org/documentation/blob/master/Invidious-Instances.md + # Invidious instances taken from 
https://github.com/iv-org/documentation/blob/master/docs/instances.md r'(?:www\.)?invidious\.pussthecat\.org', r'(?:www\.)?invidious\.zee\.li', r'(?:www\.)?invidious\.ethibox\.fr', @@ -352,8 +368,62 @@ class YoutubeBaseInfoExtractor(InfoExtractor): r'(?:www\.)?kbjggqkzv65ivcqj6bumvp337z6264huv5kpkwuv6gu5yjiskvan7fad\.onion', r'(?:www\.)?grwp24hodrefzvjjuccrkw3mjq4tzhaaq32amf33dzpmuxe7ilepcmad\.onion', r'(?:www\.)?hpniueoejy4opn7bc4ftgazyqjoeqwlvh2uiku2xqku6zpoa4bf5ruid\.onion', + # piped instances from https://github.com/TeamPiped/Piped/wiki/Instances + r'(?:www\.)?piped\.kavin\.rocks', + r'(?:www\.)?piped\.tokhmi\.xyz', + r'(?:www\.)?piped\.syncpundit\.io', + r'(?:www\.)?piped\.mha\.fi', + r'(?:www\.)?watch\.whatever\.social', + r'(?:www\.)?piped\.garudalinux\.org', + r'(?:www\.)?piped\.rivo\.lol', + r'(?:www\.)?piped-libre\.kavin\.rocks', + r'(?:www\.)?yt\.jae\.fi', + r'(?:www\.)?piped\.mint\.lgbt', + r'(?:www\.)?il\.ax', + r'(?:www\.)?piped\.esmailelbob\.xyz', + r'(?:www\.)?piped\.projectsegfau\.lt', + r'(?:www\.)?piped\.privacydev\.net', + r'(?:www\.)?piped\.palveluntarjoaja\.eu', + r'(?:www\.)?piped\.smnz\.de', + r'(?:www\.)?piped\.adminforge\.de', + r'(?:www\.)?watch\.whatevertinfoil\.de', + r'(?:www\.)?piped\.qdi\.fi', + r'(?:www\.)?piped\.video', + r'(?:www\.)?piped\.aeong\.one', ) + # extracted from account/account_menu ep + # XXX: These are the supported YouTube UI and API languages, + # which is slightly different from languages supported for translation in YouTube studio + _SUPPORTED_LANG_CODES = [ + 'af', 'az', 'id', 'ms', 'bs', 'ca', 'cs', 'da', 'de', 'et', 'en-IN', 'en-GB', 'en', 'es', + 'es-419', 'es-US', 'eu', 'fil', 'fr', 'fr-CA', 'gl', 'hr', 'zu', 'is', 'it', 'sw', 'lv', + 'lt', 'hu', 'nl', 'no', 'uz', 'pl', 'pt-PT', 'pt', 'ro', 'sq', 'sk', 'sl', 'sr-Latn', 'fi', + 'sv', 'vi', 'tr', 'be', 'bg', 'ky', 'kk', 'mk', 'mn', 'ru', 'sr', 'uk', 'el', 'hy', 'iw', + 'ur', 'ar', 'fa', 'ne', 'mr', 'hi', 'as', 'bn', 'pa', 'gu', 'or', 'ta', 'te', 'kn', 'ml', + 'si', 'th', 'lo', 'my', 'ka', 'am', 'km', 'zh-CN', 'zh-TW', 'zh-HK', 'ja', 'ko' + ] + + _IGNORED_WARNINGS = {'Unavailable videos will be hidden during playback'} + + @functools.cached_property + def _preferred_lang(self): + """ + Returns a language code supported by YouTube for the user preferred language. + Returns None if no preferred language set. + """ + preferred_lang = self._configuration_arg('lang', ie_key='Youtube', casesense=True, default=[''])[0] + if not preferred_lang: + return + if preferred_lang not in self._SUPPORTED_LANG_CODES: + raise ExtractorError( + f'Unsupported language code: {preferred_lang}. Supported language codes (case-sensitive): {join_nonempty(*self._SUPPORTED_LANG_CODES, delim=", ")}.', + expected=True) + elif preferred_lang != 'en': + self.report_warning( + f'Preferring "{preferred_lang}" translated fields. 
Note that some metadata extraction may fail or be incorrect.') + return preferred_lang + def _initialize_consent(self): cookies = self._get_cookies('https://www.youtube.com/') if cookies.get('__Secure-3PSID'): @@ -375,23 +445,23 @@ class YoutubeBaseInfoExtractor(InfoExtractor): pref = {} if pref_cookie: try: - pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + pref = dict(urllib.parse.parse_qsl(pref_cookie.value)) except ValueError: self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) - pref.update({'hl': 'en', 'tz': 'UTC'}) - self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + pref.update({'hl': self._preferred_lang or 'en', 'tz': 'UTC'}) + self._set_cookie('.youtube.com', name='PREF', value=urllib.parse.urlencode(pref)) def _real_initialize(self): self._initialize_pref() self._initialize_consent() - if (self._LOGIN_REQUIRED - and self.get_param('cookiefile') is None - and self.get_param('cookiesfrombrowser') is None): + self._check_login_required() + + def _check_login_required(self): + if self._LOGIN_REQUIRED and not self._cookies_passed: self.raise_login_required('Login details are needed to download this content', method='cookies') - _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;' - _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;' - _YT_INITIAL_BOUNDARY_RE = r'(?:var\s+meta|</script|\n)' + _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=' + _YT_INITIAL_PLAYER_RESPONSE_RE = r'ytInitialPlayerResponse\s*=' def _get_default_ytcfg(self, client='web'): return copy.deepcopy(INNERTUBE_CLIENTS[client]) @@ -407,22 +477,26 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _extract_client_name(self, ytcfg, default_client='web'): return self._ytcfg_get_safe( ytcfg, (lambda x: x['INNERTUBE_CLIENT_NAME'], - lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), compat_str, default_client) + lambda x: x['INNERTUBE_CONTEXT']['client']['clientName']), str, default_client) def _extract_client_version(self, ytcfg, default_client='web'): return self._ytcfg_get_safe( ytcfg, (lambda x: x['INNERTUBE_CLIENT_VERSION'], - lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), compat_str, default_client) + lambda x: x['INNERTUBE_CONTEXT']['client']['clientVersion']), str, default_client) + + def _select_api_hostname(self, req_api_hostname, default_client=None): + return (self._configuration_arg('innertube_host', [''], ie_key=YoutubeIE.ie_key())[0] + or req_api_hostname or self._get_innertube_host(default_client or 'web')) def _extract_api_key(self, ytcfg=None, default_client='web'): - return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) + return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], str, default_client) def _extract_context(self, ytcfg=None, default_client='web'): context = get_first( (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) # Enforce language and tz for extraction client_context = traverse_obj(context, 'client', expected_type=dict, default={}) - client_context.update({'hl': 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0}) + client_context.update({'hl': self._preferred_lang or 'en', 'timeZone': 'UTC', 'utcOffsetMinutes': 0}) return context _SAPISID = None @@ -449,7 +523,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return None # SAPISIDHASH algorithm from 
https://stackoverflow.com/a/32065323 sapisidhash = hashlib.sha1( - f'{time_now} {self._SAPISID} {origin}'.encode('utf-8')).hexdigest() + f'{time_now} {self._SAPISID} {origin}'.encode()).hexdigest() return f'SAPISIDHASH {time_now}_{sapisidhash}' def _call_api(self, ep, query, video_id, fatal=True, headers=None, @@ -462,18 +536,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor): real_headers.update({'content-type': 'application/json'}) if headers: real_headers.update(headers) + api_key = (self._configuration_arg('innertube_key', [''], ie_key=YoutubeIE.ie_key(), casesense=True)[0] + or api_key or self._extract_api_key(default_client=default_client)) return self._download_json( - 'https://%s/youtubei/v1/%s' % (api_hostname or self._get_innertube_host(default_client), ep), + f'https://{self._select_api_hostname(api_hostname, default_client)}/youtubei/v1/{ep}', video_id=video_id, fatal=fatal, note=note, errnote=errnote, data=json.dumps(data).encode('utf8'), headers=real_headers, - query={'key': api_key or self._extract_api_key(), 'prettyPrint': 'false'}) + query={'key': api_key, 'prettyPrint': 'false'}) def extract_yt_initial_data(self, item_id, webpage, fatal=True): - data = self._search_regex( - (r'%s\s*%s' % (self._YT_INITIAL_DATA_RE, self._YT_INITIAL_BOUNDARY_RE), - self._YT_INITIAL_DATA_RE), webpage, 'yt initial data', fatal=fatal) - if data: - return self._parse_json(data, item_id, fatal=fatal) + return self._search_json(self._YT_INITIAL_DATA_RE, webpage, 'yt initial data', item_id, fatal=fatal) @staticmethod def _extract_session_index(*data): @@ -489,7 +561,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # Deprecated? def _extract_identity_token(self, ytcfg=None, webpage=None): if ytcfg: - token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str) + token = try_get(ytcfg, lambda x: x['ID_TOKEN'], str) if token: return token if webpage: @@ -505,12 +577,12 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """ for data in args: # ytcfg includes channel_syncid if on secondary channel - delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], compat_str) + delegated_sid = try_get(data, lambda x: x['DELEGATED_SESSION_ID'], str) if delegated_sid: return delegated_sid sync_ids = (try_get( data, (lambda x: x['responseContext']['mainAppWebResponseContext']['datasyncId'], - lambda x: x['DATASYNC_ID']), compat_str) or '').split('||') + lambda x: x['DATASYNC_ID']), str) or '').split('||') if len(sync_ids) >= 2 and sync_ids[1]: # datasyncid is of the form "channel_syncid||user_syncid" for secondary channel # and just "user_syncid||" for primary channel. 
We only want the channel_syncid @@ -526,7 +598,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): args, [('VISITOR_DATA', ('INNERTUBE_CONTEXT', 'client', 'visitorData'), ('responseContext', 'visitorData'))], expected_type=str) - @property + @functools.cached_property def is_authenticated(self): return bool(self._generate_sapisidhash_header()) @@ -542,15 +614,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor): self, *, ytcfg=None, account_syncid=None, session_index=None, visitor_data=None, identity_token=None, api_hostname=None, default_client='web'): - origin = 'https://' + (api_hostname if api_hostname else self._get_innertube_host(default_client)) + origin = 'https://' + (self._select_api_hostname(api_hostname, default_client)) headers = { - 'X-YouTube-Client-Name': compat_str( + 'X-YouTube-Client-Name': str( self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT_CLIENT_NAME'], default_client=default_client)), 'X-YouTube-Client-Version': self._extract_client_version(ytcfg, default_client), 'Origin': origin, 'X-Youtube-Identity-Token': identity_token or self._extract_identity_token(ytcfg), 'X-Goog-PageId': account_syncid or self._extract_account_syncid(ytcfg), - 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg) + 'X-Goog-Visitor-Id': visitor_data or self._extract_visitor_data(ytcfg), + 'User-Agent': self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_CONTEXT']['client']['userAgent'], default_client=default_client) } if session_index is None: session_index = self._extract_session_index(ytcfg) @@ -561,7 +634,19 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if auth is not None: headers['Authorization'] = auth headers['X-Origin'] = origin - return {h: v for h, v in headers.items() if v is not None} + return filter_dict(headers) + + def _download_ytcfg(self, client, video_id): + url = { + 'web': 'https://www.youtube.com', + 'web_music': 'https://music.youtube.com', + 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1' + }.get(client) + if not url: + return {} + webpage = self._download_webpage( + url, video_id, fatal=False, note=f'Downloading {client.replace("_", " ").strip()} client config') + return self.extract_ytcfg(video_id, webpage) or {} @staticmethod def _build_api_continuation_query(continuation, ctp=None): @@ -592,7 +677,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _extract_continuation_ep_data(cls, continuation_ep: dict): if isinstance(continuation_ep, dict): continuation = try_get( - continuation_ep, lambda x: x['continuationCommand']['token'], compat_str) + continuation_ep, lambda x: x['continuationCommand']['token'], str) if not continuation: return ctp = continuation_ep.get('clickTrackingParams') @@ -604,20 +689,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if next_continuation: return next_continuation - contents = [] - for key in ('contents', 'items'): - contents.extend(try_get(renderer, lambda x: x[key], list) or []) - - for content in contents: - if not isinstance(content, dict): - continue - continuation_ep = try_get( - content, (lambda x: x['continuationItemRenderer']['continuationEndpoint'], - lambda x: x['continuationItemRenderer']['button']['buttonRenderer']['command']), - dict) - continuation = cls._extract_continuation_ep_data(continuation_ep) - if continuation: - return continuation + return traverse_obj(renderer, ( + ('contents', 'items', 'rows'), ..., 'continuationItemRenderer', + ('continuationEndpoint', ('button', 'buttonRenderer', 'command')) + ), get_all=False, 
expected_type=cls._extract_continuation_ep_data) @classmethod def _extract_alerts(cls, data): @@ -633,16 +708,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): yield alert_type, message def _report_alerts(self, alerts, expected=True, fatal=True, only_once=False): - errors = [] - warnings = [] + errors, warnings = [], [] for alert_type, alert_message in alerts: if alert_type.lower() == 'error' and fatal: errors.append([alert_type, alert_message]) - else: + elif alert_message not in self._IGNORED_WARNINGS: warnings.append([alert_type, alert_message]) for alert_type, alert_message in (warnings + errors[:-1]): - self.report_warning('YouTube said: %s - %s' % (alert_type, alert_message), only_once=only_once) + self.report_warning(f'YouTube said: {alert_type} - {alert_message}', only_once=only_once) if errors: raise ExtractorError('YouTube said: %s' % errors[-1][1], expected=expected) @@ -650,14 +724,50 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._report_alerts(self._extract_alerts(data), *args, **kwargs) def _extract_badges(self, renderer: dict): - badges = set() - for badge in try_get(renderer, lambda x: x['badges'], list) or []: - label = try_get(badge, lambda x: x['metadataBadgeRenderer']['label'], compat_str) - if label: - badges.add(label.lower()) + privacy_icon_map = { + 'PRIVACY_UNLISTED': BadgeType.AVAILABILITY_UNLISTED, + 'PRIVACY_PRIVATE': BadgeType.AVAILABILITY_PRIVATE, + 'PRIVACY_PUBLIC': BadgeType.AVAILABILITY_PUBLIC + } + + badge_style_map = { + 'BADGE_STYLE_TYPE_MEMBERS_ONLY': BadgeType.AVAILABILITY_SUBSCRIPTION, + 'BADGE_STYLE_TYPE_PREMIUM': BadgeType.AVAILABILITY_PREMIUM, + 'BADGE_STYLE_TYPE_LIVE_NOW': BadgeType.LIVE_NOW + } + + label_map = { + 'unlisted': BadgeType.AVAILABILITY_UNLISTED, + 'private': BadgeType.AVAILABILITY_PRIVATE, + 'members only': BadgeType.AVAILABILITY_SUBSCRIPTION, + 'live': BadgeType.LIVE_NOW, + 'premium': BadgeType.AVAILABILITY_PREMIUM + } + + badges = [] + for badge in traverse_obj(renderer, ('badges', ..., 'metadataBadgeRenderer'), default=[]): + badge_type = ( + privacy_icon_map.get(traverse_obj(badge, ('icon', 'iconType'), expected_type=str)) + or badge_style_map.get(traverse_obj(badge, 'style')) + ) + if badge_type: + badges.append({'type': badge_type}) + continue + + # fallback, won't work in some languages + label = traverse_obj(badge, 'label', expected_type=str, default='') + for match, label_badge_type in label_map.items(): + if match in label.lower(): + badges.append({'type': badge_type}) + continue + return badges @staticmethod + def _has_badge(badges, badge_type): + return bool(traverse_obj(badges, lambda _, v: v['type'] == badge_type)) + + @staticmethod def _get_text(data, *path_list, max_runs=None): for path in path_list or [None]: if path is None: @@ -667,7 +777,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if not any(key is ... 
or isinstance(key, (list, tuple)) for key in variadic(path)): obj = [obj] for item in obj: - text = try_get(item, lambda x: x['simpleText'], compat_str) + text = try_get(item, lambda x: x['simpleText'], str) if text: return text runs = try_get(item, lambda x: x['runs'], list) or [] @@ -727,8 +837,9 @@ class YoutubeBaseInfoExtractor(InfoExtractor): except ValueError: return None - def _extract_time_text(self, renderer, *path_list): - text = self._get_text(renderer, *path_list) or '' + def _parse_time_text(self, text): + if not text: + return dt = self.extract_relative_time(text) timestamp = None if isinstance(dt, datetime.datetime): @@ -741,81 +852,62 @@ class YoutubeBaseInfoExtractor(InfoExtractor): (r'([a-z]+\s*\d{1,2},?\s*20\d{2})', r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*(?:on|for))?\s*(.+\d)'), text.lower(), 'time text', default=None))) - if text and timestamp is None: - self.report_warning(f"Cannot parse localized time text '{text}'" + bug_reports_message(), only_once=True) - return timestamp, text + if text and timestamp is None and self._preferred_lang in (None, 'en'): + self.report_warning( + f'Cannot parse localized time text "{text}"', only_once=True) + return timestamp def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, default_client='web'): - response = None - last_error = None - count = -1 - retries = self.get_param('extractor_retries', 3) - if check_get_keys is None: - check_get_keys = [] - while count < retries: - count += 1 - if last_error: - self.report_warning('%s. Retrying ...' % remove_end(last_error, '.')) + for retry in self.RetryManager(): try: response = self._call_api( ep=ep, fatal=True, headers=headers, - video_id=item_id, query=query, + video_id=item_id, query=query, note=note, context=self._extract_context(ytcfg, default_client), api_key=self._extract_api_key(ytcfg, default_client), - api_hostname=api_hostname, default_client=default_client, - note='%s%s' % (note, ' (retry #%d)' % count if count else '')) + api_hostname=api_hostname, default_client=default_client) except ExtractorError as e: - if isinstance(e.cause, network_exceptions): - if isinstance(e.cause, compat_HTTPError): - first_bytes = e.cause.read(512) - if not is_html(first_bytes): - yt_error = try_get( - self._parse_json( - self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), - lambda x: x['error']['message'], compat_str) - if yt_error: - self._report_alerts([('ERROR', yt_error)], fatal=False) - # Downloading page may result in intermittent 5xx HTTP error - # Sometimes a 404 is also recieved. 
See: https://github.com/ytdl-org/youtube-dl/issues/28289 - # We also want to catch all other network exceptions since errors in later pages can be troublesome - # See https://github.com/hypervideo/hypervideo/issues/507#issuecomment-880188210 - if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429): - last_error = error_to_compat_str(e.cause or e.msg) - if count < retries: - continue - if fatal: - raise - else: - self.report_warning(error_to_compat_str(e)) - return + if not isinstance(e.cause, network_exceptions): + return self._error_or_warning(e, fatal=fatal) + elif not isinstance(e.cause, urllib.error.HTTPError): + retry.error = e + continue - else: - try: - self._extract_and_report_alerts(response, only_once=True) - except ExtractorError as e: - # YouTube servers may return errors we want to retry on in a 200 OK response - # See: https://github.com/hypervideo/hypervideo/issues/839 - if 'unknown error' in e.msg.lower(): - last_error = e.msg - continue - if fatal: - raise - self.report_warning(error_to_compat_str(e)) - return - if not check_get_keys or dict_get(response, check_get_keys): - break - # Youtube sometimes sends incomplete data - # See: https://github.com/ytdl-org/youtube-dl/issues/28194 - last_error = 'Incomplete data received' - if count >= retries: - if fatal: - raise ExtractorError(last_error) - else: - self.report_warning(last_error) - return - return response + first_bytes = e.cause.read(512) + if not is_html(first_bytes): + yt_error = try_get( + self._parse_json( + self._webpage_read_content(e.cause, None, item_id, prefix=first_bytes) or '{}', item_id, fatal=False), + lambda x: x['error']['message'], str) + if yt_error: + self._report_alerts([('ERROR', yt_error)], fatal=False) + # Downloading page may result in intermittent 5xx HTTP error + # Sometimes a 404 is also recieved. 
See: https://github.com/ytdl-org/youtube-dl/issues/28289 + # We also want to catch all other network exceptions since errors in later pages can be troublesome + # See https://github.com/hypervideo/hypervideo/issues/507#issuecomment-880188210 + if e.cause.code not in (403, 429): + retry.error = e + continue + return self._error_or_warning(e, fatal=fatal) + + try: + self._extract_and_report_alerts(response, only_once=True) + except ExtractorError as e: + # YouTube servers may return errors we want to retry on in a 200 OK response + # See: https://github.com/hypervideo/hypervideo/issues/839 + if 'unknown error' in e.msg.lower(): + retry.error = e + continue + return self._error_or_warning(e, fatal=fatal) + # Youtube sometimes sends incomplete data + # See: https://github.com/ytdl-org/youtube-dl/issues/28194 + if not traverse_obj(response, *variadic(check_get_keys)): + retry.error = ExtractorError('Incomplete data received', expected=True) + continue + + return response @staticmethod def is_music_url(url): @@ -823,29 +915,36 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _extract_video(self, renderer): video_id = renderer.get('videoId') - title = self._get_text(renderer, 'title') + + reel_header_renderer = traverse_obj(renderer, ( + 'navigationEndpoint', 'reelWatchEndpoint', 'overlay', 'reelPlayerOverlayRenderer', + 'reelPlayerHeaderSupportedRenderers', 'reelPlayerHeaderRenderer')) + + title = self._get_text(renderer, 'title', 'headline') or self._get_text(reel_header_renderer, 'reelTitleText') description = self._get_text(renderer, 'descriptionSnippet') - duration = parse_duration(self._get_text( - renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) + + duration = int_or_none(renderer.get('lengthSeconds')) + if duration is None: + duration = parse_duration(self._get_text( + renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) if duration is None: + # XXX: should write a parser to be more general to support more cases (e.g. 
shorts in shorts tab) duration = parse_duration(self._search_regex( r'(?i)(ago)(?!.*\1)\s+(?P<duration>[a-z0-9 ,]+?)(?:\s+[\d,]+\s+views)?(?:\s+-\s+play\s+short)?$', traverse_obj(renderer, ('title', 'accessibility', 'accessibilityData', 'label'), default='', expected_type=str), video_id, default=None, group='duration')) - view_count = self._get_count(renderer, 'viewCountText') - - uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') channel_id = traverse_obj( renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False) - timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText') - scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) + if not channel_id: + channel_id = traverse_obj(reel_header_renderer, ('channelNavigationEndpoint', 'browseEndpoint', 'browseId')) + overlay_style = traverse_obj( renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) badges = self._extract_badges(renderer) - thumbnails = self._extract_thumbnails(renderer, 'thumbnail') + navigation_url = urljoin('https://www.youtube.com/', traverse_obj( renderer, ('navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url'), expected_type=str)) or '' @@ -853,6 +952,22 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if overlay_style == 'SHORTS' or '/shorts/' in navigation_url: url = f'https://www.youtube.com/shorts/{video_id}' + time_text = (self._get_text(renderer, 'publishedTimeText', 'videoInfo') + or self._get_text(reel_header_renderer, 'timestampText') or '') + scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) + + live_status = ( + 'is_upcoming' if scheduled_timestamp is not None + else 'was_live' if 'streamed' in time_text.lower() + else 'is_live' if overlay_style == 'LIVE' or self._has_badge(badges, BadgeType.LIVE_NOW) + else None) + + # videoInfo is a string like '50K views • 10 years ago'. 
+ view_count_text = self._get_text(renderer, 'viewCountText', 'shortViewCountText', 'videoInfo') or '' + view_count = (0 if 'no views' in view_count_text.lower() + else self._get_count({'simpleText': view_count_text})) + view_count_field = 'concurrent_view_count' if live_status in ('is_live', 'is_upcoming') else 'view_count' + return { '_type': 'url', 'ie_key': YoutubeIE.ie_key(), @@ -861,19 +976,24 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'title': title, 'description': description, 'duration': duration, - 'view_count': view_count, - 'uploader': uploader, 'channel_id': channel_id, - 'thumbnails': thumbnails, - 'upload_date': (strftime_or_none(timestamp, '%Y%m%d') - if self._configuration_arg('approximate_date', ie_key='youtubetab') - else None), - 'live_status': ('is_upcoming' if scheduled_timestamp is not None - else 'was_live' if 'streamed' in time_text.lower() - else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges - else None), + 'channel': (self._get_text(renderer, 'ownerText', 'shortBylineText') + or self._get_text(reel_header_renderer, 'channelTitleText')), + 'channel_url': f'https://www.youtube.com/channel/{channel_id}' if channel_id else None, + 'thumbnails': self._extract_thumbnails(renderer, 'thumbnail'), + 'timestamp': (self._parse_time_text(time_text) + if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) + else None), 'release_timestamp': scheduled_timestamp, - 'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges) + 'availability': + 'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + else self._availability( + is_private=self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) or None, + needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, + needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, + is_unlisted=self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) or None), + view_count_field: view_count, + 'live_status': live_status } @@ -914,6 +1034,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (?:\#|$)""" % { 'invidious': '|'.join(YoutubeBaseInfoExtractor._INVIDIOUS_SITES), } + _EMBED_REGEX = [ + r'''(?x) + (?: + <(?:[0-9A-Za-z-]+?)?iframe[^>]+?src=| + data-video-url=| + <embed[^>]+?src=| + embedSWF\(?:\s*| + <object[^>]+data=| + new\s+SWFObject\( + ) + (["\']) + (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) + \1''', + # https://wordpress.org/plugins/lazy-load-for-videos/ + r'''(?xs) + <a\s[^>]*\bhref="(?P<url>https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})" + \s[^>]*\bclass="[^"]*\blazy-load-youtube''', + ] + _RETURN_TYPE = 'video' # XXX: How to handle multifeed? 
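# --- Editor's illustrative sketch (not part of this diff) ---------------------
# The hunk above adds a declarative _EMBED_REGEX list to YoutubeIE, the same
# migration the YouPornIE hunk earlier in this diff performs when it deletes
# its _extract_urls() helper. A minimal sketch of the idea, reusing the
# YouPorn pattern added above; extract_embed_urls is a hypothetical stand-in
# for the framework hook that scans each extractor's _EMBED_REGEX, not
# hypervideo's actual API.
import re

_EMBED_REGEX = [r'<iframe[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?youporn\.com/embed/\d+)']

def extract_embed_urls(webpage):
    # Yield the URL captured by the named group of every matching pattern
    for pattern in _EMBED_REGEX:
        for mobj in re.finditer(pattern, webpage):
            yield mobj.group('url')

# >>> list(extract_embed_urls('<iframe src="https://www.youporn.com/embed/16290308"></iframe>'))
# ['https://www.youporn.com/embed/16290308']
# ------------------------------------------------------------------------------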
+ _PLAYER_INFO_RE = ( r'/s/player/(?P<id>[a-zA-Z0-9_-]{8,})/player', r'/(?P<id>[a-zA-Z0-9_-]{8,})/player(?:_ias\.vflset(?:/[a-zA-Z]{2,3}_[a-zA-Z]{2,3})?|-plasma-ias-(?:phone|tablet)-[a-z]{2}_[A-Z]{2}\.vflset)/base\.js$', @@ -1060,6 +1201,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'age_limit': 0, 'start_time': 1, 'end_time': 9, + 'comment_count': int, 'channel_follower_count': int } }, @@ -1104,6 +1246,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': 'https://i.ytimg.com/vi/BaW_jenozKc/maxresdefault.jpg', 'live_status': 'not_live', 'age_limit': 0, + 'comment_count': int, 'channel_follower_count': int }, 'params': { @@ -1246,6 +1389,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'categories': ['Entertainment'], 'duration': 106, 'channel_url': 'https://www.youtube.com/channel/UC1yoRdFoFJaCY-AGfD9W0wQ', + 'comment_count': int, 'channel_follower_count': int }, }, @@ -1333,7 +1477,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'upload_date': '20150827', 'uploader_id': 'olympic', 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/olympic', - 'description': 'HO09 - Women - GER-AUS - Hockey - 31 July 2012 - London 2012 Olympic Games', + 'description': 'md5:04bbbf3ccceb6795947572ca36f45904', 'uploader': 'Olympics', 'title': 'Hockey - Women - GER-AUS - London 2012 Olympic Games', 'like_count': int, @@ -1382,6 +1526,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'live_status': 'not_live', 'availability': 'unlisted', + 'comment_count': int, 'channel_follower_count': int }, }, @@ -1439,66 +1584,99 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'skip': 'This live event has ended.', }, { - # Multifeed videos (multiple cameras), URL is for Main Camera - 'url': 'https://www.youtube.com/watch?v=jvGDaLqkpTg', + # Multifeed videos (multiple cameras), URL can be of any Camera + 'url': 'https://www.youtube.com/watch?v=zaPI8MvL8pg', 'info_dict': { - 'id': 'jvGDaLqkpTg', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever', - 'description': 'md5:e03b909557865076822aa169218d6a5d', + 'id': 'zaPI8MvL8pg', + 'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04', + 'description': 'md5:563ccbc698b39298481ca3c571169519', }, 'playlist': [{ 'info_dict': { - 'id': 'jvGDaLqkpTg', - 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Main Camera)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10643, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', - }, - }, { - 'info_dict': { - 'id': '3AKt1R1aDnw', + 'id': 'j5yGuxZ8lLU', 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 2)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10991, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', + 'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04 (Chris)', + 'uploader': 'WiiLikeToPlay', + 'description': 'md5:563ccbc698b39298481ca3c571169519', + 'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray', + 'duration': 10120, + 'channel_follower_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg', + 'availability': 'public', + 'playable_in_embed': True, + 'upload_date': '20131105', + 'uploader_id': 'WiiRikeToPray', + 'categories': ['Gaming'], + 'live_status': 'was_live', + 'tags': 
'count:24', + 'release_timestamp': 1383701910, + 'thumbnail': 'https://i.ytimg.com/vi/j5yGuxZ8lLU/maxresdefault.jpg', + 'comment_count': int, + 'age_limit': 0, + 'like_count': int, + 'channel_id': 'UCN2XePorRokPB9TEgRZpddg', + 'channel': 'WiiLikeToPlay', + 'view_count': int, + 'release_date': '20131106', }, }, { 'info_dict': { - 'id': 'RtAMM00gpVc', + 'id': 'zaPI8MvL8pg', 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 3)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10995, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', + 'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04 (Tyson)', + 'uploader_id': 'WiiRikeToPray', + 'availability': 'public', + 'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg', + 'channel': 'WiiLikeToPlay', + 'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray', + 'channel_follower_count': int, + 'description': 'md5:563ccbc698b39298481ca3c571169519', + 'duration': 10108, + 'age_limit': 0, + 'like_count': int, + 'tags': 'count:24', + 'channel_id': 'UCN2XePorRokPB9TEgRZpddg', + 'uploader': 'WiiLikeToPlay', + 'release_timestamp': 1383701915, + 'comment_count': int, + 'upload_date': '20131105', + 'thumbnail': 'https://i.ytimg.com/vi/zaPI8MvL8pg/maxresdefault.jpg', + 'release_date': '20131106', + 'playable_in_embed': True, + 'live_status': 'was_live', + 'categories': ['Gaming'], + 'view_count': int, }, }, { 'info_dict': { - 'id': '6N2fdlP3C5U', + 'id': 'R7r3vfO7Hao', 'ext': 'mp4', - 'title': 'Tom Clancy Free Weekend Rainbow Whatever (Camera 4)', - 'description': 'md5:e03b909557865076822aa169218d6a5d', - 'duration': 10990, - 'upload_date': '20161111', - 'uploader': 'Team PGP', - 'uploader_id': 'UChORY56LMMETTuGjXaJXvLg', - 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UChORY56LMMETTuGjXaJXvLg', + 'title': 'Terraria 1.2 Live Stream | Let\'s Play - Part 04 (Spencer)', + 'thumbnail': 'https://i.ytimg.com/vi/R7r3vfO7Hao/maxresdefault.jpg', + 'channel_id': 'UCN2XePorRokPB9TEgRZpddg', + 'like_count': int, + 'availability': 'public', + 'playable_in_embed': True, + 'upload_date': '20131105', + 'description': 'md5:563ccbc698b39298481ca3c571169519', + 'uploader_id': 'WiiRikeToPray', + 'uploader_url': 'http://www.youtube.com/user/WiiRikeToPray', + 'channel_follower_count': int, + 'tags': 'count:24', + 'release_date': '20131106', + 'uploader': 'WiiLikeToPlay', + 'comment_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCN2XePorRokPB9TEgRZpddg', + 'channel': 'WiiLikeToPlay', + 'categories': ['Gaming'], + 'release_timestamp': 1383701914, + 'live_status': 'was_live', + 'age_limit': 0, + 'duration': 10128, + 'view_count': int, }, }], - 'params': { - 'skip_download': True, - }, - 'skip': 'Not multifeed anymore', + 'params': {'skip_download': True}, }, { # Multifeed video with comma in title (see https://github.com/ytdl-org/youtube-dl/issues/8536) @@ -1610,7 +1788,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'thumbnail': 'https://i.ytimg.com/vi_webp/M4gD1WSo5mA/maxresdefault.webp', 'live_status': 'not_live', 'playable_in_embed': True, - 'channel_follower_count': int + 'comment_count': int, + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'skip_download': True, @@ -1642,7 +1822,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'view_count': int, 'live_status': 'not_live', 'channel_url': 
'https://www.youtube.com/channel/UCH1dpzjCEiGAt8CXkryhkZg', - 'channel_follower_count': int + 'comment_count': int, + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'skip_download': True, @@ -1906,7 +2088,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'view_count': int, 'duration': 522, 'channel': 'kudvenkat', - 'channel_follower_count': int + 'comment_count': int, + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'skip_download': True, @@ -2056,7 +2240,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': int, 'live_status': 'not_live', 'playable_in_embed': True, - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': list, }, 'params': { 'format': '17', # 3gp format available on android @@ -2100,7 +2285,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': 248, 'categories': ['Education'], 'age_limit': 0, - 'channel_follower_count': int + 'channel_follower_count': int, + 'chapters': list, }, 'params': {'format': 'mhtml', 'skip_download': True} }, { # Ensure video upload_date is in UTC timezone (video was uploaded 1641170939) @@ -2127,9 +2313,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'availability': 'public', 'channel': 'Leon Nguyen', 'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp', + 'comment_count': int, 'channel_follower_count': int } }, { + # Same video as above, but with --compat-opt no-youtube-prefer-utc-upload-date + 'url': 'https://www.youtube.com/watch?v=2NUZ8W2llS4', + 'info_dict': { + 'id': '2NUZ8W2llS4', + 'ext': 'mp4', + 'title': 'The NP that test your phone performance 🙂', + 'description': 'md5:144494b24d4f9dfacb97c1bbef5de84d', + 'uploader': 'Leon Nguyen', + 'uploader_id': 'VNSXIII', + 'uploader_url': 'http://www.youtube.com/user/VNSXIII', + 'channel_id': 'UCRqNBSOHgilHfAczlUmlWHA', + 'channel_url': 'https://www.youtube.com/channel/UCRqNBSOHgilHfAczlUmlWHA', + 'duration': 21, + 'view_count': int, + 'age_limit': 0, + 'categories': ['Gaming'], + 'tags': 'count:23', + 'playable_in_embed': True, + 'live_status': 'not_live', + 'upload_date': '20220102', + 'like_count': int, + 'availability': 'public', + 'channel': 'Leon Nguyen', + 'thumbnail': 'https://i.ytimg.com/vi_webp/2NUZ8W2llS4/maxresdefault.webp', + 'comment_count': int, + 'channel_follower_count': int + }, + 'params': {'compat_opts': ['no-youtube-prefer-utc-upload-date']} + }, { # date text is premiered video, ensure upload date in UTC (published 1641172509) 'url': 'https://www.youtube.com/watch?v=mzZzzBU6lrM', 'info_dict': { @@ -2186,8 +2402,184 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'view_count': int, 'playable_in_embed': True, 'description': 'md5:2ef1d002cad520f65825346e2084e49d', + 'concurrent_view_count': int, }, 'params': {'skip_download': True} + }, { + # Story. Requires specific player params to work. 
+ 'url': 'https://www.youtube.com/watch?v=vv8qTUWmulI', + 'info_dict': { + 'id': 'vv8qTUWmulI', + 'ext': 'mp4', + 'availability': 'unlisted', + 'view_count': int, + 'channel_id': 'UCzIZ8HrzDgc-pNQDUG6avBA', + 'upload_date': '20220526', + 'categories': ['Education'], + 'title': 'Story', + 'channel': 'IT\'S HISTORY', + 'description': '', + 'uploader_id': 'BlastfromthePast', + 'duration': 12, + 'uploader': 'IT\'S HISTORY', + 'playable_in_embed': True, + 'age_limit': 0, + 'live_status': 'not_live', + 'tags': [], + 'thumbnail': 'https://i.ytimg.com/vi_webp/vv8qTUWmulI/maxresdefault.webp', + 'uploader_url': 'http://www.youtube.com/user/BlastfromthePast', + 'channel_url': 'https://www.youtube.com/channel/UCzIZ8HrzDgc-pNQDUG6avBA', + }, + 'skip': 'stories get removed after some period of time', + }, { + 'url': 'https://www.youtube.com/watch?v=tjjjtzRLHvA', + 'info_dict': { + 'id': 'tjjjtzRLHvA', + 'ext': 'mp4', + 'title': 'ハッシュタグ無し };if window.ytcsi', + 'upload_date': '20220323', + 'like_count': int, + 'availability': 'unlisted', + 'channel': 'nao20010128nao', + 'thumbnail': 'https://i.ytimg.com/vi_webp/tjjjtzRLHvA/maxresdefault.webp', + 'age_limit': 0, + 'uploader': 'nao20010128nao', + 'uploader_id': 'nao20010128nao', + 'categories': ['Music'], + 'view_count': int, + 'description': '', + 'channel_url': 'https://www.youtube.com/channel/UCdqltm_7iv1Vs6kp6Syke5A', + 'channel_id': 'UCdqltm_7iv1Vs6kp6Syke5A', + 'live_status': 'not_live', + 'playable_in_embed': True, + 'channel_follower_count': int, + 'duration': 6, + 'tags': [], + 'uploader_url': 'http://www.youtube.com/user/nao20010128nao', + } + }, { + # Prefer primary title+description language metadata by default + # Do not prefer translated description if primary is empty + 'url': 'https://www.youtube.com/watch?v=el3E4MbxRqQ', + 'info_dict': { + 'id': 'el3E4MbxRqQ', + 'ext': 'mp4', + 'title': 'dlp test video 2 - primary sv no desc', + 'description': '', + 'channel': 'cole-dlp-test-acc', + 'tags': [], + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'like_count': int, + 'playable_in_embed': True, + 'availability': 'unlisted', + 'thumbnail': 'https://i.ytimg.com/vi_webp/el3E4MbxRqQ/maxresdefault.webp', + 'age_limit': 0, + 'duration': 5, + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'live_status': 'not_live', + 'upload_date': '20220908', + 'categories': ['People & Blogs'], + 'uploader': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + }, + 'params': {'skip_download': True} + }, { + # Extractor argument: prefer translated title+description + 'url': 'https://www.youtube.com/watch?v=gHKT4uU8Zng', + 'info_dict': { + 'id': 'gHKT4uU8Zng', + 'ext': 'mp4', + 'channel': 'cole-dlp-test-acc', + 'tags': [], + 'duration': 5, + 'live_status': 'not_live', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'upload_date': '20220728', + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'view_count': int, + 'categories': ['People & Blogs'], + 'thumbnail': 'https://i.ytimg.com/vi_webp/gHKT4uU8Zng/maxresdefault.webp', + 'title': 'dlp test video title translated (fr)', + 'availability': 'public', + 'uploader': 'cole-dlp-test-acc', + 'age_limit': 0, + 'description': 'dlp test video description translated (fr)', + 'playable_in_embed': True, + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader_url': 'http://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + }, + 'params': {'skip_download': True, 
'extractor_args': {'youtube': {'lang': ['fr']}}}, + 'expected_warnings': [r'Preferring "fr" translated fields'], + }, { + 'note': '6 channel audio', + 'url': 'https://www.youtube.com/watch?v=zgdo7-RRjgo', + 'only_matching': True, + }, { + 'note': 'Multiple HLS formats with same itag', + 'url': 'https://www.youtube.com/watch?v=kX3nB4PpJko', + 'info_dict': { + 'id': 'kX3nB4PpJko', + 'ext': 'mp4', + 'categories': ['Entertainment'], + 'description': 'md5:e8031ff6e426cdb6a77670c9b81f6fa6', + 'uploader_url': 'http://www.youtube.com/user/MrBeast6000', + 'live_status': 'not_live', + 'duration': 937, + 'channel_follower_count': int, + 'thumbnail': 'https://i.ytimg.com/vi_webp/kX3nB4PpJko/maxresdefault.webp', + 'title': 'Last To Take Hand Off Jet, Keeps It!', + 'channel': 'MrBeast', + 'playable_in_embed': True, + 'view_count': int, + 'upload_date': '20221112', + 'uploader': 'MrBeast', + 'uploader_id': 'MrBeast6000', + 'channel_url': 'https://www.youtube.com/channel/UCX6OQ3DkcsbYNE6H8uQQuVA', + 'age_limit': 0, + 'availability': 'public', + 'channel_id': 'UCX6OQ3DkcsbYNE6H8uQQuVA', + 'like_count': int, + 'tags': [], + }, + 'params': {'extractor_args': {'youtube': {'player_client': ['ios']}}, 'format': '233-1'}, + } + ] + + _WEBPAGE_TESTS = [ + # YouTube <object> embed + { + 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/', + 'md5': '873c81d308b979f0e23ee7e620b312a3', + 'info_dict': { + 'id': 'msN87y-iEx0', + 'ext': 'mp4', + 'title': 'Feynman: Mirrors FUN TO IMAGINE 6', + 'upload_date': '20080526', + 'description': 'md5:873c81d308b979f0e23ee7e620b312a3', + 'uploader': 'Christopher Sykes', + 'uploader_id': 'ChristopherJSykes', + 'age_limit': 0, + 'tags': ['feynman', 'mirror', 'science', 'physics', 'imagination', 'fun', 'cool', 'puzzle'], + 'channel_id': 'UCCeo--lls1vna5YJABWAcVA', + 'playable_in_embed': True, + 'thumbnail': 'https://i.ytimg.com/vi/msN87y-iEx0/hqdefault.jpg', + 'like_count': int, + 'comment_count': int, + 'channel': 'Christopher Sykes', + 'live_status': 'not_live', + 'channel_url': 'https://www.youtube.com/channel/UCCeo--lls1vna5YJABWAcVA', + 'availability': 'public', + 'duration': 195, + 'view_count': int, + 'categories': ['Science & Technology'], + 'channel_follower_count': int, + 'uploader_url': 'http://www.youtube.com/user/ChristopherJSykes', + }, + 'params': { + 'skip_download': True, + } }, ] @@ -2198,17 +2590,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): qs = parse_qs(url) if qs.get('list', [None])[0]: return False - return super(YoutubeIE, cls).suitable(url) + return super().suitable(url) def __init__(self, *args, **kwargs): - super(YoutubeIE, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) self._code_cache = {} self._player_cache = {} - def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data): + def _prepare_live_from_start_formats(self, formats, video_id, live_start_time, url, webpage_url, smuggled_data, is_live): lock = threading.Lock() - - is_live = True start_time = time.time() formats = [f for f in formats if f.get('is_from_start')] @@ -2223,7 +2613,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): microformats = traverse_obj( prs, (..., 'microformat', 'playerMicroformatRenderer'), expected_type=dict, default=[]) - _, is_live, _, formats = self._list_formats(video_id, microformats, video_details, prs, player_url) + _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url) + is_live = 
live_status == 'is_live' start_time = time.time() def mpd_feed(format_id, delay): @@ -2244,12 +2635,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return f['manifest_url'], f['manifest_stream_number'], is_live for f in formats: - f['is_live'] = True - f['protocol'] = 'http_dash_segments_generator' - f['fragments'] = functools.partial( - self._live_dash_fragments, f['format_id'], live_start_time, mpd_feed) + f['is_live'] = is_live + gen = functools.partial(self._live_dash_fragments, video_id, f['format_id'], + live_start_time, mpd_feed, not is_live and f.copy()) + if is_live: + f['fragments'] = gen + f['protocol'] = 'http_dash_segments_generator' + else: + f['fragments'] = LazyList(gen({})) + del f['is_from_start'] - def _live_dash_fragments(self, format_id, live_start_time, mpd_feed, ctx): + def _live_dash_fragments(self, video_id, format_id, live_start_time, mpd_feed, manifestless_orig_fmt, ctx): FETCH_SPAN, MAX_DURATION = 5, 432000 mpd_url, stream_number, is_live = None, None, True @@ -2272,7 +2668,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Obtain from MPD's maximum seq value old_mpd_url = mpd_url last_error = ctx.pop('last_error', None) - expire_fast = immediate or last_error and isinstance(last_error, compat_HTTPError) and last_error.code == 403 + expire_fast = immediate or last_error and isinstance(last_error, urllib.error.HTTPError) and last_error.code == 403 mpd_url, stream_number, is_live = (mpd_feed(format_id, 5 if expire_fast else 18000) or (mpd_url, stream_number, False)) if not refresh_sequence: @@ -2280,15 +2676,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return False, last_seq elif old_mpd_url == mpd_url: return True, last_seq - try: - fmts, _ = self._extract_mpd_formats_and_subtitles( - mpd_url, None, note=False, errnote=False, fatal=False) - except ExtractorError: - fmts = None - if not fmts: - no_fragment_score += 2 - return False, last_seq - fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number) + if manifestless_orig_fmt: + fmt_info = manifestless_orig_fmt + else: + try: + fmts, _ = self._extract_mpd_formats_and_subtitles( + mpd_url, None, note=False, errnote=False, fatal=False) + except ExtractorError: + fmts = None + if not fmts: + no_fragment_score += 2 + return False, last_seq + fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number) fragments = fmt_info['fragments'] fragment_base_url = fmt_info['fragment_base_url'] assert fragment_base_url @@ -2296,6 +2695,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1)) return True, _last_seq + self.write_debug(f'[{video_id}] Generating fragments for format {format_id}') while is_live: fetch_time = time.time() if no_fragment_score > 30: @@ -2339,6 +2739,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx) yield { 'url': last_segment_url, + 'fragment_count': last_seq, } if known_idx == last_seq: no_fragment_score += 5 @@ -2348,12 +2749,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): except ExtractorError: continue + if manifestless_orig_fmt: + # Stop at the first iteration if running for post-live manifestless; + # fragment count no longer increase since it starts + break + time.sleep(max(0, FETCH_SPAN + fetch_time - time.time())) def _extract_player_url(self, *ytcfgs, webpage=None): player_url = traverse_obj( ytcfgs, (..., 'PLAYER_JS_URL'), (..., 'WEB_PLAYER_CONTEXT_CONFIGS', ..., 'jsUrl'), - get_all=False, expected_type=compat_str) + 
get_all=False, expected_type=str) if not player_url: return return urljoin('https://www.youtube.com', player_url) @@ -2370,7 +2776,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _signature_cache_id(self, example_sig): """ Return a string representation of a signature """ - return '.'.join(compat_str(len(part)) for part in example_sig.split('.')) + return '.'.join(str(len(part)) for part in example_sig.split('.')) @classmethod def _extract_player_info(cls, player_url): @@ -2397,24 +2803,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_id = self._extract_player_info(player_url) # Read from filesystem cache - func_id = 'js_%s_%s' % ( - player_id, self._signature_cache_id(example_sig)) + func_id = f'js_{player_id}_{self._signature_cache_id(example_sig)}' assert os.path.basename(func_id) == func_id - cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id) - if cache_spec is not None: - return lambda s: ''.join(s[i] for i in cache_spec) + self.write_debug(f'Extracting signature function {func_id}') + cache_spec, code = self.cache.load('youtube-sigfuncs', func_id), None - code = self._load_player(video_id, player_url) + if not cache_spec: + code = self._load_player(video_id, player_url) if code: res = self._parse_sig_js(code) + test_string = ''.join(map(chr, range(len(example_sig)))) + cache_spec = [ord(c) for c in res(test_string)] + self.cache.store('youtube-sigfuncs', func_id, cache_spec) - test_string = ''.join(map(compat_chr, range(len(example_sig)))) - cache_res = res(test_string) - cache_spec = [ord(c) for c in cache_res] - - self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec) - return res + return lambda s: ''.join(s[i] for i in cache_spec) def _print_sig_code(self, func, example_sig): if not self.get_param('youtube_print_sig_code'): @@ -2425,7 +2828,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): starts = '' if start == 0 else str(start) ends = (':%d' % (end + step)) if end + step >= 0 else ':' steps = '' if step == 1 else (':%d' % step) - return 's[%s%s%s]' % (starts, ends, steps) + return f's[{starts}{ends}{steps}]' step = None # Quelch pyflakes warnings - start will be set when step is set @@ -2448,12 +2851,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: yield _genslice(start, i, step) - test_string = ''.join(map(compat_chr, range(len(example_sig)))) + test_string = ''.join(map(chr, range(len(example_sig)))) cache_res = func(test_string) cache_spec = [ord(c) for c in cache_res] expr_code = ' + '.join(gen_sig_code(cache_spec)) signature_id_tuple = '(%s)' % ( - ', '.join(compat_str(len(p)) for p in example_sig.split('.'))) + ', '.join(str(len(p)) for p in example_sig.split('.'))) code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n' ' return %s\n') % (signature_id_tuple, expr_code) self.to_screen('Extracted signature function:\n' + code) @@ -2482,24 +2885,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): initial_function = jsi.extract_function(funcname) return lambda s: initial_function([s]) - def _decrypt_signature(self, s, video_id, player_url): - """Turn the encrypted s field into a working signature""" + def _cached(self, func, *cache_id): + def inner(*args, **kwargs): + if cache_id not in self._player_cache: + try: + self._player_cache[cache_id] = func(*args, **kwargs) + except ExtractorError as e: + self._player_cache[cache_id] = e + except Exception as e: + self._player_cache[cache_id] = ExtractorError(traceback.format_exc(), cause=e) - if player_url is None: - raise ExtractorError('Cannot decrypt signature without 
player_url') + ret = self._player_cache[cache_id] + if isinstance(ret, Exception): + raise ret + return ret + return inner - try: - player_id = (player_url, self._signature_cache_id(s)) - if player_id not in self._player_cache: - func = self._extract_signature_function( - video_id, player_url, s - ) - self._player_cache[player_id] = func - func = self._player_cache[player_id] - self._print_sig_code(func, s) - return func(s) - except Exception as e: - raise ExtractorError('Signature extraction failed: ' + traceback.format_exc(), cause=e) + def _decrypt_signature(self, s, video_id, player_url): + """Turn the encrypted s field into a working signature""" + extract_sig = self._cached( + self._extract_signature_function, 'sig', player_url, self._signature_cache_id(s)) + func = extract_sig(video_id, player_url, s) + self._print_sig_code(func, s) + return func(s) def _decrypt_nsig(self, s, video_id, player_url): """Turn the encrypted n field into a working signature""" @@ -2507,48 +2915,87 @@ class YoutubeIE(YoutubeBaseInfoExtractor): raise ExtractorError('Cannot decrypt nsig without player_url') player_url = urljoin('https://www.youtube.com', player_url) - sig_id = ('nsig_value', s) - if sig_id in self._player_cache: - return self._player_cache[sig_id] + try: + jsi, player_id, func_code = self._extract_n_function_code(video_id, player_url) + except ExtractorError as e: + raise ExtractorError('Unable to extract nsig function code', cause=e) + if self.get_param('youtube_print_sig_code'): + self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') try: - player_id = ('nsig', player_url) - if player_id not in self._player_cache: - self._player_cache[player_id] = self._extract_n_function(video_id, player_url) - func = self._player_cache[player_id] - self._player_cache[sig_id] = func(s) - self.write_debug(f'Decrypted nsig {s} => {self._player_cache[sig_id]}') - return self._player_cache[sig_id] - except Exception as e: - raise ExtractorError(traceback.format_exc(), cause=e, video_id=video_id) + extract_nsig = self._cached(self._extract_n_function_from_code, 'nsig func', player_url) + ret = extract_nsig(jsi, func_code)(s) + except JSInterpreter.Exception as e: + try: + jsi = PhantomJSwrapper(self, timeout=5000) + except ExtractorError: + raise e + self.report_warning( + f'Native nsig extraction failed: Trying with PhantomJS\n' + f' n = {s} ; player = {player_url}', video_id) + self.write_debug(e, only_once=True) + + args, func_body = func_code + ret = jsi.execute( + f'console.log(function({", ".join(args)}) {{ {func_body} }}({s!r}));', + video_id=video_id, note='Executing signature code').strip() + + self.write_debug(f'Decrypted nsig {s} => {ret}') + return ret def _extract_n_function_name(self, jscode): - nfunc, idx = self._search_regex( + funcname, idx = self._search_regex( r'\.get\("n"\)\)&&\(b=(?P<nfunc>[a-zA-Z0-9$]+)(?:\[(?P<idx>\d+)\])?\([a-zA-Z0-9]\)', jscode, 'Initial JS player n function name', group=('nfunc', 'idx')) if not idx: - return nfunc + return funcname + return json.loads(js_to_json(self._search_regex( - rf'var {re.escape(nfunc)}\s*=\s*(\[.+?\]);', jscode, - f'Initial JS player n function list ({nfunc}.{idx})')))[int(idx)] + rf'var {re.escape(funcname)}\s*=\s*(\[.+?\]);', jscode, + f'Initial JS player n function list ({funcname}.{idx})')))[int(idx)] - def _extract_n_function(self, video_id, player_url): + def _extract_n_function_code(self, video_id, player_url): player_id = self._extract_player_info(player_url) - func_code = 
self._downloader.cache.load('youtube-nsig', player_id) + func_code = self.cache.load('youtube-nsig', player_id, min_ver='2022.09.1') + jscode = func_code or self._load_player(video_id, player_url) + jsi = JSInterpreter(jscode) + + if func_code: + return jsi, player_id, func_code + + func_name = self._extract_n_function_name(jscode) + # For redundancy + func_code = self._search_regex( + r'''(?xs)%s\s*=\s*function\s*\((?P<var>[\w$]+)\)\s* + # NB: The end of the regex is intentionally kept strict + {(?P<code>.+?}\s*return\ [\w$]+.join\(""\))};''' % func_name, + jscode, 'nsig function', group=('var', 'code'), default=None) if func_code: - jsi = JSInterpreter(func_code) + func_code = ([func_code[0]], func_code[1]) else: - jscode = self._load_player(video_id, player_url) - funcname = self._extract_n_function_name(jscode) - jsi = JSInterpreter(jscode) - func_code = jsi.extract_function_code(funcname) - self._downloader.cache.store('youtube-nsig', player_id, func_code) + self.write_debug('Extracting nsig function with jsinterp') + func_code = jsi.extract_function_code(func_name) - if self.get_param('youtube_print_sig_code'): - self.to_screen(f'Extracted nsig function from {player_id}:\n{func_code[1]}\n') + self.cache.store('youtube-nsig', player_id, func_code) + return jsi, player_id, func_code + + def _extract_n_function_from_code(self, jsi, func_code): + func = jsi.extract_function_from_code(*func_code) + + def extract_nsig(s): + try: + ret = func([s]) + except JSInterpreter.Exception: + raise + except Exception as e: + raise JSInterpreter.Exception(traceback.format_exc(), cause=e) - return lambda s: jsi.extract_function_from_code(*func_code)([s]) + if ret.startswith('enhanced_except_'): + raise JSInterpreter.Exception('Signature function returned an exception') + return ret + + return extract_nsig def _extract_signature_timestamp(self, video_id, player_url, ytcfg=None, fatal=False): """ @@ -2575,74 +3022,76 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return sts def _mark_watched(self, video_id, player_responses): - playback_url = get_first( - player_responses, ('playbackTracking', 'videostatsPlaybackUrl', 'baseUrl'), - expected_type=url_or_none) - if not playback_url: - self.report_warning('Unable to mark watched') - return - parsed_playback_url = compat_urlparse.urlparse(playback_url) - qs = compat_urlparse.parse_qs(parsed_playback_url.query) + for is_full, key in enumerate(('videostatsPlaybackUrl', 'videostatsWatchtimeUrl')): + label = 'fully ' if is_full else '' + url = get_first(player_responses, ('playbackTracking', key, 'baseUrl'), + expected_type=url_or_none) + if not url: + self.report_warning(f'Unable to mark {label}watched') + return + parsed_url = urllib.parse.urlparse(url) + qs = urllib.parse.parse_qs(parsed_url.query) + + # cpn generation algorithm is reverse engineered from base.js. + # In fact it works even with dummy cpn. + CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' + cpn = ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16)) + + # # more consistent results setting it to right before the end + video_length = [str(float((qs.get('len') or ['1.5'])[0]) - 1)] + + qs.update({ + 'ver': ['2'], + 'cpn': [cpn], + 'cmt': video_length, + 'el': 'detailpage', # otherwise defaults to "shorts" + }) - # cpn generation algorithm is reverse engineered from base.js. - # In fact it works even with dummy cpn. 
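A self-contained sketch of the cpn generation in this hunk, assuming only the CPN_ALPHABET and bit-masking shown here (generate_cpn is an illustrative name, not an extractor method):

import random

# 64-character base64url-style alphabet, matching CPN_ALPHABET above
CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_'

def generate_cpn(length=16):
    # randint(0, 256) & 63 keeps only the low 6 bits of each draw, i.e. an
    # index 0..63 into the alphabet; as the comment above notes, the server
    # accepts even a dummy nonce, so perfect uniformity is not required
    return ''.join(CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(length))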
- CPN_ALPHABET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_' - cpn = ''.join((CPN_ALPHABET[random.randint(0, 256) & 63] for _ in range(0, 16))) + if is_full: + # these seem to mark watchtime "history" in the real world + # they're required, so send in a single value + qs.update({ + 'st': 0, + 'et': video_length, + }) - qs.update({ - 'ver': ['2'], - 'cpn': [cpn], - }) - playback_url = compat_urlparse.urlunparse( - parsed_playback_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + url = urllib.parse.urlunparse( + parsed_url._replace(query=urllib.parse.urlencode(qs, True))) - self._download_webpage( - playback_url, video_id, 'Marking watched', - 'Unable to mark watched', fatal=False) + self._download_webpage( + url, video_id, f'Marking {label}watched', + 'Unable to mark watched', fatal=False) - @staticmethod - def _extract_urls(webpage): - # Embedded YouTube player - entries = [ - unescapeHTML(mobj.group('url')) - for mobj in re.finditer(r'''(?x) - (?: - <iframe[^>]+?src=| - data-video-url=| - <embed[^>]+?src=| - embedSWF\(?:\s*| - <object[^>]+data=| - new\s+SWFObject\( - ) - (["\']) - (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) - \1''', webpage)] + @classmethod + def _extract_from_webpage(cls, url, webpage): + # Invidious Instances + # https://github.com/hypervideo/hypervideo/issues/195 + # https://github.com/iv-org/invidious/pull/1730 + mobj = re.search( + r'<link rel="alternate" href="(?P<url>https://www\.youtube\.com/watch\?v=[0-9A-Za-z_-]{11})"', + webpage) + if mobj: + yield cls.url_result(mobj.group('url'), cls) + raise cls.StopExtraction() + + yield from super()._extract_from_webpage(url, webpage) # lazyYT YouTube embed - entries.extend(list(map( - unescapeHTML, - re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)))) + for id_ in re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage): + yield cls.url_result(unescapeHTML(id_), cls, id_) # Wordpress "YouTube Video Importer" plugin - matches = re.findall(r'''(?x)<div[^>]+ - class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ - data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) - entries.extend(m[-1] for m in matches) - - return entries - - @staticmethod - def _extract_url(webpage): - urls = YoutubeIE._extract_urls(webpage) - return urls[0] if urls else None + for m in re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage): + yield cls.url_result(m[-1], cls, m[-1]) @classmethod def extract_id(cls, url): - mobj = re.match(cls._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - return mobj.group('id') + video_id = cls.get_temp_id(url) + if not video_id: + raise ExtractorError(f'Invalid URL: {url}') + return video_id def _extract_chapters_from_json(self, data, duration): chapter_list = traverse_obj( @@ -2667,39 +3116,42 @@ class YoutubeIE(YoutubeBaseInfoExtractor): chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription')) chapter_title = lambda chapter: self._get_text(chapter, 'title') - return next(( - filter(None, ( - self._extract_chapters( - traverse_obj(contents, (..., 'macroMarkersListItemRenderer')), - chapter_time, chapter_title, duration) - for contents in content_list - ))), []) - - def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration): - chapters = [] - last_chapter = {'start_time': 
0} - for idx, chapter in enumerate(chapter_list or []): - title = chapter_title(chapter) - start_time = chapter_time(chapter) - if start_time is None: - continue - last_chapter['end_time'] = start_time - if start_time < last_chapter['start_time']: - if idx == 1: - chapters.pop() - self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title']) - else: - self.report_warning(f'Invalid start time for chapter "{title}"') - continue - last_chapter = {'start_time': start_time, 'title': title} - chapters.append(last_chapter) - last_chapter['end_time'] = duration - return chapters + return next(filter(None, ( + self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')), + chapter_time, chapter_title, duration) + for contents in content_list)), []) - def _extract_yt_initial_variable(self, webpage, regex, video_id, name): - return self._parse_json(self._search_regex( - (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), - regex), webpage, name, default='{}'), video_id, fatal=False) + def _extract_chapters_from_description(self, description, duration): + duration_re = r'(?:\d+:)?\d{1,2}:\d{2}' + sep_re = r'(?m)^\s*(%s)\b\W*\s(%s)\s*$' + return self._extract_chapters( + re.findall(sep_re % (duration_re, r'.+?'), description or ''), + chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1], + duration=duration, strict=False) or self._extract_chapters( + re.findall(sep_re % (r'.+?', duration_re), description or ''), + chapter_time=lambda x: parse_duration(x[1]), chapter_title=lambda x: x[0], + duration=duration, strict=False) + + def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True): + if not duration: + return + chapter_list = [{ + 'start_time': chapter_time(chapter), + 'title': chapter_title(chapter), + } for chapter in chapter_list or []] + if not strict: + chapter_list.sort(key=lambda c: c['start_time'] or 0) + + chapters = [{'start_time': 0}] + for idx, chapter in enumerate(chapter_list): + if chapter['start_time'] is None: + self.report_warning(f'Incomplete chapter {idx}') + elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: + chapters.append(chapter) + elif chapter not in chapters: + self.report_warning( + f'Invalid start time ({chapter["start_time"]} < {chapters[-1]["start_time"]}) for chapter "{chapter["title"]}"') + return chapters[1:] def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') @@ -2708,16 +3160,18 @@ class YoutubeIE(YoutubeBaseInfoExtractor): text = self._get_text(comment_renderer, 'contentText') - # note: timestamp is an estimate calculated from the current time and time_text - timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText') + # Timestamp is an estimate calculated from the current time and time_text + time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' + timestamp = self._parse_time_text(time_text) + author = self._get_text(comment_renderer, 'authorText') author_id = try_get(comment_renderer, - lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) + lambda x: x['authorEndpoint']['browseEndpoint']['browseId'], str) votes = parse_count(try_get(comment_renderer, (lambda x: x['voteCount']['simpleText'], - lambda x: x['likeCount']), compat_str)) or 0 + lambda x: x['likeCount']), str)) or 0 author_thumbnail = try_get(comment_renderer, - lambda x: x['authorThumbnail']['thumbnails'][-1]['url'], compat_str) + lambda x: 
x['authorThumbnail']['thumbnails'][-1]['url'], str) author_is_uploader = try_get(comment_renderer, lambda x: x['authorIsChannelOwner'], bool) is_favorited = 'creatorHeart' in (try_get( @@ -2796,8 +3250,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): comment_entries_iter = self._comment_entries( comment_replies_renderer, ytcfg, video_id, parent=comment.get('id'), tracker=tracker) - for reply_comment in itertools.islice(comment_entries_iter, min(max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments']))): - yield reply_comment + yield from itertools.islice(comment_entries_iter, min( + max_replies_per_thread, max(0, max_replies - tracker['total_reply_comments']))) # Keeps track of counts across recursive calls if not tracker: @@ -2812,8 +3266,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # YouTube comments have a max depth of 2 max_depth = int_or_none(get_single_config_arg('max_comment_depth')) if max_depth: - self._downloader.deprecation_warning( - '[youtube] max_comment_depth extractor argument is deprecated. Set max replies in the max-comments extractor argument instead.') + self._downloader.deprecated_feature('[youtube] max_comment_depth extractor argument is deprecated. ' + 'Set max replies in the max-comments extractor argument instead') if max_depth == 1 and parent: return @@ -2821,12 +3275,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): lambda p: int_or_none(p, default=sys.maxsize), self._configuration_arg('max_comments', ) + [''] * 4) continuation = self._extract_continuation(root_continuation_data) - message = self._get_text(root_continuation_data, ('contents', ..., 'messageRenderer', 'text'), max_runs=1) - if message and not parent: - self.report_warning(message, video_id=video_id) response = None + is_forced_continuation = False is_first_continuation = parent is None + if is_first_continuation and not continuation: + # Sometimes you can get comments by generating the continuation yourself, + # even if YouTube initially reports them being disabled - e.g. stories comments. + # Note: if the comment section is actually disabled, YouTube may return a response with + # required check_get_keys missing. So we will disable that check initially in this case. + continuation = self._build_api_continuation_query(self._generate_comment_continuation(video_id)) + is_forced_continuation = True for page_num in itertools.count(0): if not continuation: @@ -2843,12 +3302,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): note_prefix = '%sDownloading comment%s API JSON page %d %s' % ( ' ' if parent else '', ' replies' if parent else '', page_num, comment_prog_str) - - response = self._extract_response( - item_id=None, query=continuation, - ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix, - check_get_keys='onResponseReceivedEndpoints') - + try: + response = self._extract_response( + item_id=None, query=continuation, + ep='next', ytcfg=ytcfg, headers=headers, note=note_prefix, + check_get_keys='onResponseReceivedEndpoints' if not is_forced_continuation else None) + except ExtractorError as e: + # Ignore incomplete data error for replies if retries didn't work. + # This is to allow any other parent comments and comment threads to be downloaded. + # See: https://github.com/hypervideo/hypervideo/issues/4669 + if 'incomplete data' in str(e).lower() and parent and self.get_param('ignoreerrors') is True: + self.report_warning( + 'Received incomplete data for a comment reply thread and retrying did not help. 
' + 'Ignoring to let other comments be downloaded.') + else: + raise + is_forced_continuation = False continuation_contents = traverse_obj( response, 'onResponseReceivedEndpoints', expected_type=list, default=[]) @@ -2873,6 +3342,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if continuation: break + message = self._get_text(root_continuation_data, ('contents', ..., 'messageRenderer', 'text'), max_runs=1) + if message and not parent and tracker['running_total'] == 0: + self.report_warning(f'Youtube said: {message}', video_id=video_id, only_once=True) + raise self.CommentsDisabled + + @staticmethod + def _generate_comment_continuation(video_id): + """ + Generates initial comment section continuation token from given video id + """ + token = f'\x12\r\x12\x0b{video_id}\x18\x062\'"\x11"\x0b{video_id}0\x00x\x020\x00B\x10comments-section' + return base64.b64encode(token.encode()).decode() + def _get_comments(self, ytcfg, video_id, contents, webpage): """Entry for comment extraction""" def _real_comment_extract(contents): @@ -2918,7 +3400,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _is_unplayable(player_response): return traverse_obj(player_response, ('playabilityStatus', 'status')) == 'UNPLAYABLE' - def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr): + _STORY_PLAYER_PARAMS = '8AEB' + + def _extract_player_response(self, client, video_id, master_ytcfg, player_ytcfg, player_url, initial_pr, smuggled_data): session_index = self._extract_session_index(player_ytcfg, master_ytcfg) syncid = self._extract_account_syncid(player_ytcfg, master_ytcfg, initial_pr) @@ -2926,7 +3410,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): headers = self.generate_api_headers( ytcfg=player_ytcfg, account_syncid=syncid, session_index=session_index, default_client=client) - yt_query = {'videoId': video_id} + yt_query = { + 'videoId': video_id, + } + if smuggled_data.get('is_story') or _split_innertube_client(client)[0] == 'android': + yt_query['params'] = self._STORY_PLAYER_PARAMS + yt_query.update(self._generate_player_context(sts)) return self._extract_response( item_id=video_id, ep='player', query=yt_query, @@ -2939,7 +3428,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): requested_clients = [] default = ['android', 'web'] allowed_clients = sorted( - [client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'], + (client for client in INNERTUBE_CLIENTS.keys() if client[:1] != '_'), key=lambda client: INNERTUBE_CLIENTS[client]['priority'], reverse=True) for client in self._configuration_arg('player_client'): if client in allowed_clients: @@ -2959,22 +3448,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return orderedSet(requested_clients) - def _extract_player_ytcfg(self, client, video_id): - url = { - 'web_music': 'https://music.youtube.com', - 'web_embedded': f'https://www.youtube.com/embed/{video_id}?html5=1' - }.get(client) - if not url: - return {} - webpage = self._download_webpage(url, video_id, fatal=False, note='Downloading %s config' % client.replace('_', ' ').strip()) - return self.extract_ytcfg(video_id, webpage) or {} - - def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg): + def _extract_player_responses(self, clients, video_id, webpage, master_ytcfg, smuggled_data): initial_pr = None if webpage: - initial_pr = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_PLAYER_RESPONSE_RE, - video_id, 'initial player response') + initial_pr = self._search_json( + self._YT_INITIAL_PLAYER_RESPONSE_RE, 
webpage, 'initial player response', video_id, fatal=False) all_clients = set(clients) clients = clients[::-1] @@ -3005,8 +3483,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): while clients: client, base_client, variant = _split_innertube_client(clients.pop()) player_ytcfg = master_ytcfg if client == 'web' else {} - if 'configs' not in self._configuration_arg('player_skip'): - player_ytcfg = self._extract_player_ytcfg(client, video_id) or player_ytcfg + if 'configs' not in self._configuration_arg('player_skip') and client != 'web': + player_ytcfg = self._download_ytcfg(client, video_id) or player_ytcfg player_url = player_url or self._extract_player_url(master_ytcfg, player_ytcfg, webpage=webpage) require_js_player = self._get_default_ytcfg(client).get('REQUIRE_JS_PLAYER') @@ -3020,7 +3498,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): try: pr = initial_pr if client == 'web' and initial_pr else self._extract_player_response( - client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr) + client, video_id, player_ytcfg or master_ytcfg, player_ytcfg, player_url if require_js_player else None, initial_pr, smuggled_data) except ExtractorError as e: if last_error: self.report_warning(last_error) @@ -3028,7 +3506,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): continue if pr: - prs.append(pr) + # YouTube may return a different video player response than expected. + # See: https://github.com/TeamNewPipe/NewPipe/issues/8713 + pr_video_id = traverse_obj(pr, ('videoDetails', 'videoId')) + if pr_video_id and pr_video_id != video_id: + self.report_warning( + f'Skipping player response from {client} client (got player response for video "{pr_video_id}" instead of "{video_id}")' + bug_reports_message()) + else: + prs.append(pr) # creator clients can bypass AGE_VERIFICATION_REQUIRED if logged in if variant == 'embedded' and self._is_unplayable(pr) and self.is_authenticated: @@ -3045,9 +3530,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): self.report_warning(last_error) return prs, player_url - def _extract_formats(self, streaming_data, video_id, player_url, is_live, duration): - itags, stream_ids = {}, [] - itag_qualities, res_qualities = {}, {} + def _needs_live_processing(self, live_status, duration): + if (live_status == 'is_live' and self.get_param('live_from_start') + or live_status == 'post_live' and (duration or 0) > 4 * 3600): + return live_status + + def _extract_formats_and_subtitles(self, streaming_data, video_id, player_url, live_status, duration): + itags, stream_ids = collections.defaultdict(set), [] + itag_qualities, res_qualities = {}, {0: None} q = qualities([ # Normally tiny is the smallest video-only formats. 
But # audio-only formats with unknown quality may get tagged as tiny @@ -3088,45 +3578,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor): fmt_url = fmt.get('url') if not fmt_url: - sc = compat_parse_qs(fmt.get('signatureCipher')) + sc = urllib.parse.parse_qs(fmt.get('signatureCipher')) fmt_url = url_or_none(try_get(sc, lambda x: x['url'][0])) encrypted_sig = try_get(sc, lambda x: x['s'][0]) - if not (sc and fmt_url and encrypted_sig): + if not all((sc, fmt_url, player_url, encrypted_sig)): continue - if not player_url: + try: + fmt_url += '&%s=%s' % ( + traverse_obj(sc, ('sp', -1)) or 'signature', + self._decrypt_signature(encrypted_sig, video_id, player_url) + ) + except ExtractorError as e: + self.report_warning('Signature extraction failed: Some formats may be missing', + video_id=video_id, only_once=True) + self.write_debug(e, only_once=True) continue - signature = self._decrypt_signature(sc['s'][0], video_id, player_url) - sp = try_get(sc, lambda x: x['sp'][0]) or 'signature' - fmt_url += '&' + sp + '=' + signature query = parse_qs(fmt_url) throttled = False if query.get('n'): try: + decrypt_nsig = self._cached(self._decrypt_nsig, 'nsig', query['n'][0]) fmt_url = update_url_query(fmt_url, { - 'n': self._decrypt_nsig(query['n'][0], video_id, player_url)}) + 'n': decrypt_nsig(query['n'][0], video_id, player_url) + }) except ExtractorError as e: - self.report_warning( - f'nsig extraction failed: You may experience throttling for some formats\n' - f'n = {query["n"][0]} ; player = {player_url}\n{e}', only_once=True) + phantomjs_hint = '' + if isinstance(e, JSInterpreter.Exception): + phantomjs_hint = (f' Install {self._downloader._format_err("PhantomJS", self._downloader.Styles.EMPHASIS)} ' + f'to workaround the issue. {PhantomJSwrapper.INSTALL_HINT}\n') + if player_url: + self.report_warning( + f'nsig extraction failed: You may experience throttling for some formats\n{phantomjs_hint}' + f' n = {query["n"][0]} ; player = {player_url}', video_id=video_id, only_once=True) + self.write_debug(e, only_once=True) + else: + self.report_warning( + 'Cannot decrypt nsig without player_url: You may experience throttling for some formats', + video_id=video_id, only_once=True) throttled = True - if itag: - itags[itag] = 'https' - stream_ids.append(stream_id) - tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) language_preference = ( 10 if audio_track.get('audioIsDefault') and 10 else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 else -1) # Some formats may have much smaller duration than others (possibly damaged during encoding) - # Eg: 2-nOtRESiUc Ref: https://github.com/hypervideo/hypervideo/issues/2823 + # E.g. 2-nOtRESiUc Ref: https://github.com/hypervideo/hypervideo/issues/2823 # Make sure to avoid false positives with small duration differences. - # Eg: __2ABJjxzNo, ySuUZEjARPY + # E.g. __2ABJjxzNo, ySuUZEjARPY is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500) if is_damaged: - self.report_warning(f'{video_id}: Some formats are possibly damaged. They will be deprioritized', only_once=True) + self.report_warning( + f'{video_id}: Some formats are possibly damaged. 
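One way to read the is_damaged heuristic above, as a hedged sketch (is_damaged_format and its guards are illustrative, not upstream names): approxDurationMs is in milliseconds while duration is in seconds, so a ratio below 500 flags formats shorter than roughly half the video.

def is_damaged_format(fmt, duration):
    # ms / s < 500 is equivalent to format_seconds < duration / 2
    approx_ms = fmt.get('approxDurationMs')
    if approx_ms is None or not duration:
        return False
    return float(approx_ms) / duration < 500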
They will be deprioritized', only_once=True) dct = { 'asr': int_or_none(fmt.get('audioSampleRate')), 'filesize': int_or_none(fmt.get('contentLength')), @@ -3135,9 +3639,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '%s%s' % (audio_track.get('displayName') or '', ' (default)' if language_preference > 0 else ''), fmt.get('qualityLabel') or quality.replace('audio_quality_', ''), + try_get(fmt, lambda x: x['projectionType'].replace('RECTANGULAR', '').lower()), + try_get(fmt, lambda x: x['spatialAudioType'].replace('SPATIAL_AUDIO_TYPE_', '').lower()), throttled and 'THROTTLED', is_damaged and 'DAMAGED', delim=', '), - 'source_preference': -10 if throttled else -1, + # Format 22 is likely to be damaged. See https://github.com/hypervideo/hypervideo/issues/3372 + 'source_preference': -10 if throttled else -5 if itag == '22' else -1, 'fps': int_or_none(fmt.get('fps')) or None, + 'audio_channels': fmt.get('audioChannels'), 'height': height, 'quality': q(quality), 'has_drm': bool(fmt.get('drmFamilies')), @@ -3168,49 +3676,70 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } if dct.get('ext'): dct['container'] = dct['ext'] + '_dash' + + if itag: + itags[itag].add(('https', dct.get('language'))) + stream_ids.append(stream_id) yield dct - live_from_start = is_live and self.get_param('live_from_start') - skip_manifests = self._configuration_arg('skip') - if not self.get_param('youtube_include_hls_manifest', True): - skip_manifests.append('hls') - get_dash = 'dash' not in skip_manifests and ( - not is_live or live_from_start or self._configuration_arg('include_live_dash')) - get_hls = not live_from_start and 'hls' not in skip_manifests + needs_live_processing = self._needs_live_processing(live_status, duration) + skip_bad_formats = not self._configuration_arg('include_incomplete_formats') + + skip_manifests = set(self._configuration_arg('skip')) + if (not self.get_param('youtube_include_hls_manifest', True) + or needs_live_processing == 'is_live' # These will be filtered out by YoutubeDL anyway + or needs_live_processing and skip_bad_formats): + skip_manifests.add('hls') + + if not self.get_param('youtube_include_dash_manifest', True): + skip_manifests.add('dash') + if self._configuration_arg('include_live_dash'): + self._downloader.deprecated_feature('[youtube] include_live_dash extractor argument is deprecated. 
' + 'Use include_incomplete_formats extractor argument instead') + elif skip_bad_formats and live_status == 'is_live' and needs_live_processing != 'is_live': + skip_manifests.add('dash') def process_manifest_format(f, proto, itag): - if itag in itags: - if itags[itag] == proto or f'{itag}-{proto}' in itags: - return False - itag = f'{itag}-{proto}' - if itag: + key = (proto, f.get('language')) + if key in itags[itag]: + return False + itags[itag].add(key) + + if any(p != proto for p, _ in itags[itag]): + f['format_id'] = f'{itag}-{proto}' + elif itag: f['format_id'] = itag - itags[itag] = proto - f['quality'] = next(( - q(qdict[val]) - for val, qdict in ((f.get('format_id', '').split('-')[0], itag_qualities), (f.get('height'), res_qualities)) - if val in qdict), -1) + f['quality'] = q(itag_qualities.get(try_get(f, lambda f: f['format_id'].split('-')[0]), -1)) + if f['quality'] == -1 and f.get('height'): + f['quality'] = q(res_qualities[min(res_qualities, key=lambda x: abs(x - f['height']))]) return True + subtitles = {} for sd in streaming_data: - hls_manifest_url = get_hls and sd.get('hlsManifestUrl') + hls_manifest_url = 'hls' not in skip_manifests and sd.get('hlsManifestUrl') if hls_manifest_url: - for f in self._extract_m3u8_formats(hls_manifest_url, video_id, 'mp4', fatal=False): + fmts, subs = self._extract_m3u8_formats_and_subtitles( + hls_manifest_url, video_id, 'mp4', fatal=False, live=live_status == 'is_live') + subtitles = self._merge_subtitles(subs, subtitles) + for f in fmts: if process_manifest_format(f, 'hls', self._search_regex( r'/itag/(\d+)', f['url'], 'itag', default=None)): yield f - dash_manifest_url = get_dash and sd.get('dashManifestUrl') + dash_manifest_url = 'dash' not in skip_manifests and sd.get('dashManifestUrl') if dash_manifest_url: - for f in self._extract_mpd_formats(dash_manifest_url, video_id, fatal=False): + formats, subs = self._extract_mpd_formats_and_subtitles(dash_manifest_url, video_id, fatal=False) + subtitles = self._merge_subtitles(subs, subtitles) # Prioritize HLS subs over DASH + for f in formats: if process_manifest_format(f, 'dash', f['format_id']): f['filesize'] = int_or_none(self._search_regex( r'/clen/(\d+)', f.get('fragment_base_url') or f['url'], 'file size', default=None)) - if live_from_start: + if needs_live_processing: f['is_from_start'] = True yield f + yield subtitles def _extract_storyboard(self, player_responses, duration): spec = get_first( @@ -3241,6 +3770,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'url': url, 'width': width, 'height': height, + 'fps': frame_count / duration, + 'rows': rows, + 'columns': cols, 'fragments': [{ 'url': url.replace('$M', str(j)), 'duration': min(fragment_duration, duration - (j * fragment_duration)), @@ -3250,14 +3782,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _download_player_responses(self, url, smuggled_data, video_id, webpage_url): webpage = None if 'webpage' not in self._configuration_arg('player_skip'): + query = {'bpctr': '9999999999', 'has_verified': '1'} + if smuggled_data.get('is_story'): + query['pp'] = self._STORY_PLAYER_PARAMS webpage = self._download_webpage( - webpage_url + '&bpctr=9999999999&has_verified=1', video_id, fatal=False) + webpage_url, video_id, fatal=False, query=query) master_ytcfg = self.extract_ytcfg(video_id, webpage) or self._get_default_ytcfg() player_responses, player_url = self._extract_player_responses( self._get_requested_clients(url, smuggled_data), - video_id, webpage, master_ytcfg) + video_id, webpage, master_ytcfg, smuggled_data) return 
webpage, master_ytcfg, player_responses, player_url @@ -3266,11 +3801,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): is_live = get_first(video_details, 'isLive') if is_live is None: is_live = get_first(live_broadcast_details, 'isLiveNow') - + live_content = get_first(video_details, 'isLiveContent') + is_upcoming = get_first(video_details, 'isUpcoming') + post_live = get_first(video_details, 'isPostLiveDvr') + live_status = ('post_live' if post_live + else 'is_live' if is_live + else 'is_upcoming' if is_upcoming + else 'was_live' if live_content + else 'not_live' if False in (is_live, live_content) + else None) streaming_data = traverse_obj(player_responses, (..., 'streamingData'), default=[]) - formats = list(self._extract_formats(streaming_data, video_id, player_url, is_live, duration)) + *formats, subtitles = self._extract_formats_and_subtitles(streaming_data, video_id, player_url, live_status, duration) - return live_broadcast_details, is_live, streaming_data, formats + return live_broadcast_details, live_status, streaming_data, formats, subtitles def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -3300,11 +3843,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): microformats = traverse_obj( player_responses, (..., 'microformat', 'playerMicroformatRenderer'), expected_type=dict, default=[]) - video_title = ( - get_first(video_details, 'title') - or self._get_text(microformats, (..., 'title')) - or search_meta(['og:title', 'twitter:title', 'title'])) - video_description = get_first(video_details, 'shortDescription') + + translated_title = self._get_text(microformats, (..., 'title')) + video_title = (self._preferred_lang and translated_title + or get_first(video_details, 'title') # primary + or translated_title + or search_meta(['og:title', 'twitter:title', 'title'])) + translated_description = self._get_text(microformats, (..., 'description')) + original_description = get_first(video_details, 'shortDescription') + video_description = ( + self._preferred_lang and translated_description + # If original description is blank, it will be an empty string. + # Do not prefer translated description in this case. 
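The operator precedence in the video_description expression here is easy to misread; a behavior-equivalent sketch (pick_description is an illustrative name):

def pick_description(preferred_lang, original, translated):
    # A missing original description is None; an empty string is a real
    # value and must not be replaced by the translation
    if original is None:
        return translated
    # Otherwise the translation wins only when a preferred language is set
    if preferred_lang and translated:
        return translated
    return original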
+ or original_description if original_description is not None else translated_description) multifeed_metadata_list = get_first( player_responses, @@ -3320,12 +3871,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Unquote should take place before split on comma (,) since textual # fields may contain comma as well (see # https://github.com/ytdl-org/youtube-dl/issues/8536) - feed_data = compat_parse_qs( - compat_urllib_parse_unquote_plus(feed)) + feed_data = urllib.parse.parse_qs( + urllib.parse.unquote_plus(feed)) def feed_entry(name): return try_get( - feed_data, lambda x: x[name][0], compat_str) + feed_data, lambda x: x[name][0], str) feed_id = feed_entry('id') if not feed_id: @@ -3349,13 +3900,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return self.playlist_result( entries, video_id, video_title, video_description) - duration = int_or_none( - get_first(video_details, 'lengthSeconds') - or get_first(microformats, 'lengthSeconds') - or parse_duration(search_meta('duration'))) or None + duration = (int_or_none(get_first(video_details, 'lengthSeconds')) + or int_or_none(get_first(microformats, 'lengthSeconds')) + or parse_duration(search_meta('duration')) or None) - live_broadcast_details, is_live, streaming_data, formats = self._list_formats( - video_id, microformats, video_details, player_responses, player_url, duration) + live_broadcast_details, live_status, streaming_data, formats, automatic_captions = \ + self._list_formats(video_id, microformats, video_details, player_responses, player_url, duration) + if live_status == 'post_live': + self.write_debug(f'{video_id}: Video is in Post-Live Manifestless mode') if not formats: if not self.get_param('allow_unplayable_formats') and traverse_obj(streaming_data, (..., 'licenseInfos')): @@ -3402,19 +3954,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): original_thumbnails = thumbnails.copy() # The best resolution thumbnails sometimes does not appear in the webpage - # See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/hypervideo/hypervideo/issues/340 + # See: https://github.com/hypervideo/hypervideo/issues/340 # List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029> thumbnail_names = [ - 'maxresdefault', 'hq720', 'sddefault', 'sd1', 'sd2', 'sd3', - 'hqdefault', 'hq1', 'hq2', 'hq3', '0', - 'mqdefault', 'mq1', 'mq2', 'mq3', - 'default', '1', '2', '3' + # While the *1,*2,*3 thumbnails are just below their corresponding "*default" variants + # in resolution, these are not the custom thumbnail. 
So de-prioritize them + 'maxresdefault', 'hq720', 'sddefault', 'hqdefault', '0', 'mqdefault', 'default', + 'sd1', 'sd2', 'sd3', 'hq1', 'hq2', 'hq3', 'mq1', 'mq2', 'mq3', '1', '2', '3' ] n_thumbnail_names = len(thumbnail_names) thumbnails.extend({ 'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format( video_id=video_id, name=name, ext=ext, - webp='_webp' if ext == 'webp' else '', live='_live' if is_live else ''), + webp='_webp' if ext == 'webp' else '', live='_live' if live_status == 'is_live' else ''), } for name in thumbnail_names for ext in ('webp', 'jpg')) for thumb in thumbnails: i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names) @@ -3429,26 +3981,29 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or search_meta('channelId')) owner_profile_url = get_first(microformats, 'ownerProfileUrl') - live_content = get_first(video_details, 'isLiveContent') - is_upcoming = get_first(video_details, 'isUpcoming') - if is_live is None: - if is_upcoming or live_content is False: - is_live = False - if is_upcoming is None and (live_content or is_live): - is_upcoming = False live_start_time = parse_iso8601(get_first(live_broadcast_details, 'startTimestamp')) live_end_time = parse_iso8601(get_first(live_broadcast_details, 'endTimestamp')) if not duration and live_end_time and live_start_time: duration = live_end_time - live_start_time - if is_live and self.get_param('live_from_start'): - self._prepare_live_from_start_formats(formats, video_id, live_start_time, url, webpage_url, smuggled_data) + needs_live_processing = self._needs_live_processing(live_status, duration) - formats.extend(self._extract_storyboard(player_responses, duration)) + def is_bad_format(fmt): + if needs_live_processing and not fmt.get('is_from_start'): + return True + elif (live_status == 'is_live' and needs_live_processing != 'is_live' + and fmt.get('protocol') == 'http_dash_segments'): + return True + + for fmt in filter(is_bad_format, formats): + fmt['preference'] = (fmt.get('preference') or -1) - 10 + fmt['format_note'] = join_nonempty(fmt.get('format_note'), '(Last 4 hours)', delim=' ') - # Source is given priority since formats that throttle are given lower source_preference - # When throttling issue is fully fixed, remove this - self._sort_formats(formats, ('quality', 'res', 'fps', 'hdr:12', 'source', 'codec:vp9.2', 'lang', 'proto')) + if needs_live_processing: + self._prepare_live_from_start_formats( + formats, video_id, live_start_time, url, webpage_url, smuggled_data, live_status == 'is_live') + + formats.extend(self._extract_storyboard(player_responses, duration)) info = { 'id': video_id, @@ -3463,7 +4018,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'uploader_id': self._search_regex(r'/(?:channel|user)/([^/?&#]+)', owner_profile_url, 'uploader id') if owner_profile_url else None, 'uploader_url': owner_profile_url, 'channel_id': channel_id, - 'channel_url': format_field(channel_id, template='https://www.youtube.com/channel/%s'), + 'channel_url': format_field(channel_id, None, 'https://www.youtube.com/channel/%s'), 'duration': duration, 'view_count': int_or_none( get_first((video_details, microformats), (..., 'viewCount')) @@ -3477,14 +4032,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'categories': [category] if category else None, 'tags': keywords, 'playable_in_embed': get_first(playability_statuses, 'playableInEmbed'), - 'is_live': is_live, - 'was_live': (False if is_live or is_upcoming or live_content is False - else None if is_live is 
None or is_upcoming is None - else live_content), - 'live_status': 'is_upcoming' if is_upcoming else None, # rest will be set by YoutubeDL + 'live_status': live_status, 'release_timestamp': live_start_time, + '_format_sort_fields': ( # source_preference is lower for throttled/potentially damaged formats + 'quality', 'res', 'fps', 'hdr:12', 'source', 'vcodec:vp9.2', 'channels', 'acodec', 'lang', 'proto') } + subtitles = {} pctr = traverse_obj(player_responses, (..., 'captions', 'playerCaptionsTracklistRenderer'), expected_type=dict) if pctr: def get_lang_code(track): @@ -3511,7 +4065,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'name': sub_name, }) - subtitles, automatic_captions = {}, {} + # NB: Constructing the full subtitle dictionary is slow + get_translated_subs = 'translated_subs' not in self._configuration_arg('skip') and ( + self.get_param('writeautomaticsub', False) or self.get_param('listsubtitles')) for lang_code, caption_track in captions.items(): base_url = caption_track.get('baseUrl') orig_lang = parse_qs(base_url).get('lang', [None])[-1] @@ -3529,11 +4085,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not trans_code: continue orig_trans_code = trans_code - if caption_track.get('kind') != 'asr': - if 'translated_subs' in self._configuration_arg('skip'): + if caption_track.get('kind') != 'asr' and trans_code != 'und': + if not get_translated_subs: continue trans_code += f'-{lang_code}' - trans_name += format_field(lang_name, template=' from %s') + trans_name += format_field(lang_name, None, ' from %s') # Add an "-orig" label to the original language so that it can be distinguished. # The subs are returned without "-orig" as well for compatibility if lang_code == f'a-{orig_trans_code}': @@ -3542,12 +4098,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Setting tlang=lang returns damaged subtitles. process_language(automatic_captions, base_url, trans_code, trans_name, {} if orig_lang == orig_trans_code else {'tlang': trans_code}) - info['automatic_captions'] = automatic_captions - info['subtitles'] = subtitles - parsed_url = compat_urllib_parse_urlparse(url) + info['automatic_captions'] = automatic_captions + info['subtitles'] = subtitles + + parsed_url = urllib.parse.urlparse(url) for component in [parsed_url.fragment, parsed_url.query]: - query = compat_parse_qs(component) + query = urllib.parse.parse_qs(component) for k, v in query.items(): for d_k, s_ks in [('start', ('start', 't')), ('end', ('end',))]: d_k += '_time' @@ -3556,7 +4113,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Youtube Music Auto-generated description if video_description: - mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) + mobj = re.search( + r'''(?xs) + (?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+ + (?P<album>[^\n]+) + (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))? + (?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))? + (.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))? 
+ .+\nAuto-generated\ by\ YouTube\.\s*$ + ''', video_description) if mobj: release_year = mobj.group('release_year') release_date = mobj.group('release_date') @@ -3574,9 +4139,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): initial_data = None if webpage: - initial_data = self._extract_yt_initial_variable( - webpage, self._YT_INITIAL_DATA_RE, video_id, - 'yt initial data') + initial_data = self.extract_yt_initial_data(video_id, webpage, fatal=False) if not initial_data: query = {'videoId': video_id} query.update(self._get_checkok_params()) @@ -3586,22 +4149,33 @@ class YoutubeIE(YoutubeBaseInfoExtractor): headers=self.generate_api_headers(ytcfg=master_ytcfg), note='Downloading initial data API JSON') - try: - # This will error if there is no livechat + info['comment_count'] = traverse_obj(initial_data, ( + 'contents', 'twoColumnWatchNextResults', 'results', 'results', 'contents', ..., 'itemSectionRenderer', + 'contents', ..., 'commentsEntryPointHeaderRenderer', 'commentCount', 'simpleText' + ), ( + 'engagementPanels', lambda _, v: v['engagementPanelSectionListRenderer']['panelIdentifier'] == 'comment-item-section', + 'engagementPanelSectionListRenderer', 'header', 'engagementPanelTitleHeaderRenderer', 'contextualInfo', 'runs', ..., 'text' + ), expected_type=int_or_none, get_all=False) + + try: # This will error if there is no livechat initial_data['contents']['twoColumnWatchNextResults']['conversationBar']['liveChatRenderer']['continuations'][0]['reloadContinuationData']['continuation'] + except (KeyError, IndexError, TypeError): + pass + else: info.setdefault('subtitles', {})['live_chat'] = [{ - 'url': 'https://www.youtube.com/watch?v=%s' % video_id, # url is needed to set cookies + # url is needed to set cookies + 'url': f'https://www.youtube.com/watch?v={video_id}&bpctr=9999999999&has_verified=1', 'video_id': video_id, 'ext': 'json', - 'protocol': 'youtube_live_chat' if is_live or is_upcoming else 'youtube_live_chat_replay', + 'protocol': ('youtube_live_chat' if live_status in ('is_live', 'is_upcoming') + else 'youtube_live_chat_replay'), }] - except (KeyError, IndexError, TypeError): - pass if initial_data: info['chapters'] = ( self._extract_chapters_from_json(initial_data, duration) or self._extract_chapters_from_engagement_panel(initial_data, duration) + or self._extract_chapters_from_description(video_description, duration) or None) contents = traverse_obj( @@ -3618,7 +4192,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): lambda x: x['superTitleIcon']['iconType']) == 'LOCATION_PIN': info['location'] = stl else: - mobj = re.search(r'(.+?)\s*S(\d+)\s*•\s*E(\d+)', stl) + mobj = re.search(r'(.+?)\s*S(\d+)\s*•?\s*E(\d+)', stl) if mobj: info.update({ 'series': mobj.group(1), @@ -3629,19 +4203,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor): vpir, lambda x: x['videoActions']['menuRenderer']['topLevelButtons'], list) or []): - tbr = tlb.get('toggleButtonRenderer') or {} - for getter, regex in [( - lambda x: x['defaultText']['accessibility']['accessibilityData'], - r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ - lambda x: x['accessibility'], - lambda x: x['accessibilityData']['accessibilityData'], - ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: - label = (try_get(tbr, getter, dict) or {}).get('label') - if label: - mobj = re.match(regex, label) - if mobj: - info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) - break + tbrs = variadic( + traverse_obj( + tlb, 'toggleButtonRenderer', + ('segmentedLikeDislikeButtonRenderer', ..., 
'toggleButtonRenderer'), + default=[])) + for tbr in tbrs: + for getter, regex in [( + lambda x: x['defaultText']['accessibility']['accessibilityData'], + r'(?P<count>[\d,]+)\s*(?P<type>(?:dis)?like)'), ([ + lambda x: x['accessibility'], + lambda x: x['accessibilityData']['accessibilityData'], + ], r'(?P<type>(?:dis)?like) this video along with (?P<count>[\d,]+) other people')]: + label = (try_get(tbr, getter, dict) or {}).get('label') + if label: + mobj = re.match(regex, label) + if mobj: + info[mobj.group('type') + '_count'] = str_to_int(mobj.group('count')) + break sbr_tooltip = try_get( vpir, lambda x: x['sentimentBar']['sentimentBarRenderer']['tooltip']) if sbr_tooltip: @@ -3650,6 +4229,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'like_count': str_to_int(like_count), 'dislike_count': str_to_int(dislike_count), }) + vcr = traverse_obj(vpir, ('viewCount', 'videoViewCountRenderer')) + if vcr: + vc = self._get_count(vcr, 'viewCount') + # Upcoming premieres with waiting count are treated as live here + if vcr.get('isLive'): + info['concurrent_view_count'] = vc + elif info.get('view_count') is None: + info['view_count'] = vc + vsir = get_first(contents, 'videoSecondaryInfoRenderer') if vsir: vor = traverse_obj(vsir, ('owner', 'videoOwnerRenderer')) @@ -3695,8 +4283,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): upload_date = ( unified_strdate(get_first(microformats, 'uploadDate')) or unified_strdate(search_meta('uploadDate'))) - if not upload_date or (not info.get('is_live') and not info.get('was_live') and info.get('live_status') != 'is_upcoming'): - upload_date = strftime_or_none(self._extract_time_text(vpir, 'dateText')[0], '%Y%m%d') + if not upload_date or ( + live_status in ('not_live', None) + and 'no-youtube-prefer-utc-upload-date' not in self.get_param('compat_opts', []) + ): + upload_date = strftime_or_none( + self._parse_time_text(self._get_text(vpir, 'dateText')), '%Y%m%d') or upload_date info['upload_date'] = upload_date for to, frm in fallbacks.items(): @@ -3708,33 +4300,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if v: info[d_k] = v - is_private = get_first(video_details, 'isPrivate', expected_type=bool) - is_unlisted = get_first(microformats, 'isUnlisted', expected_type=bool) - is_membersonly = None - is_premium = None - if initial_data and is_private is not None: - is_membersonly = False - is_premium = False - contents = try_get(initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or [] - badge_labels = set() - for content in contents: - if not isinstance(content, dict): - continue - badge_labels.update(self._extract_badges(content.get('videoPrimaryInfoRenderer'))) - for badge_label in badge_labels: - if badge_label.lower() == 'members only': - is_membersonly = True - elif badge_label.lower() == 'premium': - is_premium = True - elif badge_label.lower() == 'unlisted': - is_unlisted = True - - info['availability'] = self._availability( - is_private=is_private, - needs_premium=is_premium, - needs_subscription=is_membersonly, - needs_auth=info['age_limit'] >= 18, - is_unlisted=None if is_private is None else is_unlisted) + badges = self._extract_badges(traverse_obj(contents, (..., 'videoPrimaryInfoRenderer'), get_all=False)) + + is_private = (self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) + or get_first(video_details, 'isPrivate', expected_type=bool)) + + info['availability'] = ( + 'public' if self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + else self._availability( + is_private=is_private, + 
needs_premium=( + self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) + or False if initial_data and is_private is not None else None), + needs_subscription=( + self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) + or False if initial_data and is_private is not None else None), + needs_auth=info['age_limit'] >= 18, + is_unlisted=None if is_private is None else ( + self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) + or get_first(microformats, 'isUnlisted', expected_type=bool)))) info['__post_extractor'] = self.extract_comments(master_ytcfg, video_id, contents, webpage) @@ -3744,15 +4328,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): - @staticmethod def passthrough_smuggled_data(func): - def _smuggle(entries, smuggled_data): - for entry in entries: - # TODO: Convert URL to music.youtube instead. - # Do we need to passthrough any other smuggled_data? - entry['url'] = smuggle_url(entry['url'], smuggled_data) - yield entry + def _smuggle(info, smuggled_data): + if info.get('_type') not in ('url', 'url_transparent'): + return info + if smuggled_data.get('is_music_url'): + parsed_url = urllib.parse.urlparse(info['url']) + if parsed_url.netloc in ('www.youtube.com', 'music.youtube.com'): + smuggled_data.pop('is_music_url') + info['url'] = urllib.parse.urlunparse(parsed_url._replace(netloc='music.youtube.com')) + if smuggled_data: + info['url'] = smuggle_url(info['url'], smuggled_data) + return info @functools.wraps(func) def wrapper(self, url): @@ -3760,8 +4348,10 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): if self.is_music_url(url): smuggled_data['is_music_url'] = True info_dict = func(self, url, smuggled_data) - if smuggled_data and info_dict.get('entries'): - info_dict['entries'] = _smuggle(info_dict['entries'], smuggled_data) + if smuggled_data: + _smuggle(info_dict, smuggled_data) + if info_dict.get('entries'): + info_dict['entries'] = (_smuggle(i, smuggled_data.copy()) for i in info_dict['entries']) return info_dict return wrapper @@ -3824,7 +4414,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): # generic endpoint URL support ep_url = urljoin('https://www.youtube.com/', try_get( renderer, lambda x: x['navigationEndpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str)) + str)) if ep_url: for ie in (YoutubeTabIE, YoutubePlaylistIE, YoutubeIE): if ie.suitable(ep_url): @@ -3859,8 +4449,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): # TODO: add support for nested playlists so each shelf is processed # as separate playlist # TODO: this includes only first N items - for entry in self._grid_entries(renderer): - yield entry + yield from self._grid_entries(renderer) renderer = content.get('horizontalListRenderer') if renderer: # TODO @@ -3869,7 +4458,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _shelf_entries(self, shelf_renderer, skip_channels=False): ep = try_get( shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str) + str) shelf_url = urljoin('https://www.youtube.com', ep) if shelf_url: # Skipping links to another channels, note that checking for @@ -3880,8 +4469,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): title = self._get_text(shelf_renderer, 'title') yield self.url_result(shelf_url, video_title=title) # Shelf may not contain shelf URL, fallback to extraction from content - for entry in self._shelf_entries_from_content(shelf_renderer): - yield entry 
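The netloc swap performed by _smuggle above, as a standalone sketch (to_music_url is an illustrative name):

import urllib.parse

def to_music_url(url):
    # Mirror the parsed_url._replace(netloc='music.youtube.com') step above
    parsed = urllib.parse.urlparse(url)
    if parsed.netloc in ('www.youtube.com', 'music.youtube.com'):
        parsed = parsed._replace(netloc='music.youtube.com')
    return urllib.parse.urlunparse(parsed)

# e.g. to_music_url('https://www.youtube.com/watch?v=abcdefghijk')
#  -> 'https://music.youtube.com/watch?v=abcdefghijk'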
+ yield from self._shelf_entries_from_content(shelf_renderer) def _playlist_entries(self, video_list_renderer): for content in video_list_renderer['contents']: @@ -3896,8 +4484,8 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): yield self._extract_video(renderer) def _rich_entries(self, rich_grid_renderer): - renderer = try_get( - rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {} + renderer = traverse_obj( + rich_grid_renderer, ('content', ('videoRenderer', 'reelItemRenderer')), get_all=False) or {} video_id = renderer.get('videoId') if not video_id: return @@ -3930,7 +4518,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): yield entry # playlist attachment playlist_id = try_get( - post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], compat_str) + post_renderer, lambda x: x['backstageAttachment']['playlistRenderer']['playlistId'], str) if playlist_id: yield self.url_result( 'https://www.youtube.com/playlist?list=%s' % playlist_id, @@ -3941,7 +4529,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): if not isinstance(run, dict): continue ep_url = try_get( - run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str) + run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], str) if not ep_url: continue if not YoutubeIE.suitable(ep_url): @@ -3957,10 +4545,12 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): return for content in contents: renderer = content.get('backstagePostThreadRenderer') - if not isinstance(renderer, dict): + if isinstance(renderer, dict): + yield from self._post_thread_entries(renderer) continue - for entry in self._post_thread_entries(renderer): - yield entry + renderer = content.get('videoRenderer') + if isinstance(renderer, dict): + yield self._video_entry(renderer) r''' # unused def _rich_grid_entries(self, contents): @@ -3972,6 +4562,13 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): yield entry ''' + def _report_history_entries(self, renderer): + for url in traverse_obj(renderer, ( + 'rows', ..., 'reportHistoryTableRowRenderer', 'cells', ..., + 'reportHistoryTableCellRenderer', 'cell', 'reportHistoryTableTextCellRenderer', 'text', 'runs', ..., + 'navigationEndpoint', 'commandMetadata', 'webCommandMetadata', 'url')): + yield self.url_result(urljoin('https://www.youtube.com', url), YoutubeIE) + def _extract_entries(self, parent_renderer, continuation_list): # continuation_list is modified in-place with continuation_list = [continuation_token] continuation_list[:] = [None] @@ -3983,12 +4580,16 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): content, 'itemSectionRenderer', 'musicShelfRenderer', 'musicShelfContinuation', expected_type=dict) if not is_renderer: - renderer = content.get('richItemRenderer') - if renderer: - for entry in self._rich_entries(renderer): + if content.get('richItemRenderer'): + for entry in self._rich_entries(content['richItemRenderer']): yield entry continuation_list[0] = self._extract_continuation(parent_renderer) + elif content.get('reportHistorySectionRenderer'): # https://www.youtube.com/reporthistory + table = traverse_obj(content, ('reportHistorySectionRenderer', 'table', 'tableRenderer')) + yield from self._report_history_entries(table) + continuation_list[0] = self._extract_continuation(table) continue + isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or [] for isr_content in isr_contents: if not isinstance(isr_content, dict): @@ -4030,8 +4631,7 @@ class 
YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): parent_renderer = ( try_get(tab_content, lambda x: x['sectionListRenderer'], dict) or try_get(tab_content, lambda x: x['richGridRenderer'], dict) or {}) - for entry in extract_entries(parent_renderer): - yield entry + yield from extract_entries(parent_renderer) continuation = continuation_list[0] for page_num in itertools.count(1): @@ -4040,7 +4640,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): headers = self.generate_api_headers( ytcfg=ytcfg, account_syncid=account_syncid, visitor_data=visitor_data) response = self._extract_response( - item_id='%s page %s' % (item_id, page_num), + item_id=f'{item_id} page {page_num}', query=continuation, headers=headers, ytcfg=ytcfg, check_get_keys=('continuationContents', 'onResponseReceivedActions', 'onResponseReceivedEndpoints')) @@ -4050,27 +4650,6 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): # See: https://github.com/ytdl-org/youtube-dl/issues/28702 visitor_data = self._extract_visitor_data(response) or visitor_data - known_continuation_renderers = { - 'playlistVideoListContinuation': self._playlist_entries, - 'gridContinuation': self._grid_entries, - 'itemSectionContinuation': self._post_thread_continuation_entries, - 'sectionListContinuation': extract_entries, # for feeds - } - continuation_contents = try_get( - response, lambda x: x['continuationContents'], dict) or {} - continuation_renderer = None - for key, value in continuation_contents.items(): - if key not in known_continuation_renderers: - continue - continuation_renderer = value - continuation_list = [None] - for entry in known_continuation_renderers[key](continuation_renderer): - yield entry - continuation = continuation_list[0] or self._extract_continuation(continuation_renderer) - break - if continuation_renderer: - continue - known_renderers = { 'videoRenderer': (self._grid_entries, 'items'), # for membership tab 'gridPlaylistRenderer': (self._grid_entries, 'items'), @@ -4079,79 +4658,81 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): 'playlistVideoRenderer': (self._playlist_entries, 'contents'), 'itemSectionRenderer': (extract_entries, 'contents'), # for feeds 'richItemRenderer': (extract_entries, 'contents'), # for hashtag - 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents') + 'backstagePostThreadRenderer': (self._post_thread_continuation_entries, 'contents'), + 'reportHistoryTableRowRenderer': (self._report_history_entries, 'rows'), + 'playlistVideoListContinuation': (self._playlist_entries, None), + 'gridContinuation': (self._grid_entries, None), + 'itemSectionContinuation': (self._post_thread_continuation_entries, None), + 'sectionListContinuation': (extract_entries, None), # for feeds } - on_response_received = dict_get(response, ('onResponseReceivedActions', 'onResponseReceivedEndpoints')) - continuation_items = try_get( - on_response_received, lambda x: x[0]['appendContinuationItemsAction']['continuationItems'], list) - continuation_item = try_get(continuation_items, lambda x: x[0], dict) or {} + + continuation_items = traverse_obj(response, ( + ('onResponseReceivedActions', 'onResponseReceivedEndpoints'), ..., + 'appendContinuationItemsAction', 'continuationItems' + ), 'continuationContents', get_all=False) + continuation_item = traverse_obj(continuation_items, 0, None, expected_type=dict, default={}) + video_items_renderer = None - for key, value in continuation_item.items(): + for key in continuation_item.keys(): if key not in 
known_renderers: continue - video_items_renderer = {known_renderers[key][1]: continuation_items} + func, parent_key = known_renderers[key] + video_items_renderer = {parent_key: continuation_items} if parent_key else continuation_items continuation_list = [None] - for entry in known_renderers[key][0](video_items_renderer): - yield entry + yield from func(video_items_renderer) continuation = continuation_list[0] or self._extract_continuation(video_items_renderer) + + if not video_items_renderer: break - if video_items_renderer: - continue - break @staticmethod def _extract_selected_tab(tabs, fatal=True): - for tab in tabs: - renderer = dict_get(tab, ('tabRenderer', 'expandableTabRenderer')) or {} - if renderer.get('selected') is True: - return renderer - else: - if fatal: - raise ExtractorError('Unable to find selected tab') + for tab_renderer in tabs: + if tab_renderer.get('selected'): + return tab_renderer + if fatal: + raise ExtractorError('Unable to find selected tab') - @classmethod - def _extract_uploader(cls, data): - uploader = {} - renderer = cls._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer') or {} - owner = try_get( - renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict) - if owner: - uploader['uploader'] = owner.get('text') - uploader['uploader_id'] = try_get( - owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str) - uploader['uploader_url'] = urljoin( - 'https://www.youtube.com/', - try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str)) - return {k: v for k, v in uploader.items() if v is not None} + @staticmethod + def _extract_tab_renderers(response): + return traverse_obj( + response, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs', ..., ('tabRenderer', 'expandableTabRenderer')), expected_type=dict) def _extract_from_tabs(self, item_id, ytcfg, data, tabs): - playlist_id = title = description = channel_url = channel_name = channel_id = None - tags = [] + metadata = self._extract_metadata_from_tabs(item_id, data) selected_tab = self._extract_selected_tab(tabs) - primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') - renderer = try_get( - data, lambda x: x['metadata']['channelMetadataRenderer'], dict) - if renderer: - channel_name = renderer.get('title') - channel_url = renderer.get('channelUrl') - channel_id = renderer.get('externalId') - else: - renderer = try_get( - data, lambda x: x['metadata']['playlistMetadataRenderer'], dict) + metadata['title'] += format_field(selected_tab, 'title', ' - %s') + metadata['title'] += format_field(selected_tab, 'expandedText', ' - %s') - if renderer: - title = renderer.get('title') - description = renderer.get('description', '') - playlist_id = channel_id - tags = renderer.get('keywords', '').split() + return self.playlist_result( + self._entries( + selected_tab, metadata['id'], ytcfg, + self._extract_account_syncid(ytcfg, data), + self._extract_visitor_data(data, ytcfg)), + **metadata) + + def _extract_metadata_from_tabs(self, item_id, data): + info = {'id': item_id} + + metadata_renderer = traverse_obj(data, ('metadata', 'channelMetadataRenderer'), expected_type=dict) + if metadata_renderer: + info.update({ + 'uploader': metadata_renderer.get('title'), + 'uploader_id': metadata_renderer.get('externalId'), + 'uploader_url': metadata_renderer.get('channelUrl'), + }) + if info['uploader_id']: + info['id'] = info['uploader_id'] + else: + 
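The continuation dispatch above leans on traverse_obj's branching paths. A self-contained sketch of the semantics it uses, with a made-up miniature response (not real API output):

from hypervideo_dl.utils import traverse_obj

response = {'onResponseReceivedActions': [
    {'appendContinuationItemsAction': {'continuationItems': [{'gridVideoRenderer': {}}]}}]}
# A tuple of keys branches over alternatives, `...` fans out over list items,
# and get_all=False returns the first successful branch instead of a list.
items = traverse_obj(response, (
    ('onResponseReceivedActions', 'onResponseReceivedEndpoints'), ...,
    'appendContinuationItemsAction', 'continuationItems'), get_all=False)
assert items == [{'gridVideoRenderer': {}}]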
metadata_renderer = traverse_obj(data, ('metadata', 'playlistMetadataRenderer'), expected_type=dict) # We can get the uncropped banner/avatar by replacing the crop params with '=s0' # See: https://github.com/hypervideo/hypervideo/issues/2237#issuecomment-1013694714 def _get_uncropped(url): return url_or_none((url or '').split('=')[0] + '=s0') - avatar_thumbnails = self._extract_thumbnails(renderer, 'avatar') + avatar_thumbnails = self._extract_thumbnails(metadata_renderer, 'avatar') if avatar_thumbnails: uncropped_avatar = _get_uncropped(avatar_thumbnails[0]['url']) if uncropped_avatar: @@ -4162,7 +4743,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): }) channel_banners = self._extract_thumbnails( - data, ('header', ..., ['banner', 'mobileBanner', 'tvBanner'])) + data, ('header', ..., ('banner', 'mobileBanner', 'tvBanner'))) for banner in channel_banners: banner['preference'] = -10 @@ -4175,48 +4756,66 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): 'preference': -5 }) + # Deprecated - remove primary_sidebar_renderer when layout discontinued + primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') + playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer'), expected_type=dict) + primary_thumbnails = self._extract_thumbnails( primary_sidebar_renderer, ('thumbnailRenderer', ('playlistVideoThumbnailRenderer', 'playlistCustomThumbnailRenderer'), 'thumbnail')) + playlist_thumbnails = self._extract_thumbnails( + playlist_header_renderer, ('playlistHeaderBanner', 'heroPlaylistThumbnailRenderer', 'thumbnail')) - if playlist_id is None: - playlist_id = item_id - - playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats') - last_updated_unix, _ = self._extract_time_text(playlist_stats, 2) - if title is None: - title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id - title += format_field(selected_tab, 'title', ' - %s') - title += format_field(selected_tab, 'expandedText', ' - %s') - - metadata = { - 'playlist_id': playlist_id, - 'playlist_title': title, - 'playlist_description': description, - 'uploader': channel_name, - 'uploader_id': channel_id, - 'uploader_url': channel_url, - 'thumbnails': primary_thumbnails + avatar_thumbnails + channel_banners, - 'tags': tags, - 'view_count': self._get_count(playlist_stats, 1), + info.update({ + 'title': (traverse_obj(metadata_renderer, 'title') + or self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) + or info['id']), 'availability': self._extract_availability(data), - 'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'), - 'playlist_count': self._get_count(playlist_stats, 0), 'channel_follower_count': self._get_count(data, ('header', ..., 'subscriberCountText')), - } - if not channel_id: - metadata.update(self._extract_uploader(data)) - metadata.update({ - 'channel': metadata['uploader'], - 'channel_id': metadata['uploader_id'], - 'channel_url': metadata['uploader_url']}) - return self.playlist_result( - self._entries( - selected_tab, playlist_id, ytcfg, - self._extract_account_syncid(ytcfg, data), - self._extract_visitor_data(data, ytcfg)), - **metadata) + 'description': try_get(metadata_renderer, lambda x: x.get('description', '')), + 'tags': try_get(metadata_renderer or {}, lambda x: x.get('keywords', '').split()), + 'thumbnails': (primary_thumbnails or playlist_thumbnails) + avatar_thumbnails + channel_banners, + }) - def _extract_mix_playlist(self, playlist, playlist_id, 
data, ytcfg): + # Playlist stats is a text runs array containing [video count, view count, last updated]. + # last updated or (view count and last updated) may be missing. + playlist_stats = get_first( + (primary_sidebar_renderer, playlist_header_renderer), (('stats', 'briefStats', 'numVideosText'), )) + + last_updated_unix = self._parse_time_text( + self._get_text(playlist_stats, 2) # deprecated, remove when old layout discontinued + or self._get_text(playlist_header_renderer, ('byline', 1, 'playlistBylineRenderer', 'text'))) + info['modified_date'] = strftime_or_none(last_updated_unix, '%Y%m%d') + + info['view_count'] = self._get_count(playlist_stats, 1) + if info['view_count'] is None: # 0 is allowed + info['view_count'] = self._get_count(playlist_header_renderer, 'viewCountText') + + info['playlist_count'] = self._get_count(playlist_stats, 0) + if info['playlist_count'] is None: # 0 is allowed + info['playlist_count'] = self._get_count(playlist_header_renderer, ('byline', 0, 'playlistBylineRenderer', 'text')) + + if not info.get('uploader_id'): + owner = traverse_obj(playlist_header_renderer, 'ownerText') + if not owner: # Deprecated + owner = traverse_obj( + self._extract_sidebar_info_renderer(data, 'playlistSidebarSecondaryInfoRenderer'), + ('videoOwner', 'videoOwnerRenderer', 'title')) + owner_text = self._get_text(owner) + browse_ep = traverse_obj(owner, ('runs', 0, 'navigationEndpoint', 'browseEndpoint')) or {} + info.update({ + 'uploader': self._search_regex(r'^by (.+) and \d+ others?$', owner_text, 'uploader', default=owner_text), + 'uploader_id': browse_ep.get('browseId'), + 'uploader_url': urljoin('https://www.youtube.com', browse_ep.get('canonicalBaseUrl')) + }) + + info.update({ + 'channel': info['uploader'], + 'channel_id': info['uploader_id'], + 'channel_url': info['uploader_url'] + }) + return info + + def _extract_inline_playlist(self, playlist, playlist_id, data, ytcfg): first_id = last_id = response = None for page_num in itertools.count(1): videos = list(self._playlist_entries(playlist)) @@ -4225,11 +4824,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): start = next((i for i, v in enumerate(videos) if v['id'] == last_id), -1) + 1 if start >= len(videos): return - for video in videos[start:]: - if video['id'] == first_id: - self.to_screen('First video %s found again; Assuming end of Mix' % first_id) - return - yield video + yield from videos[start:] first_id = first_id or videos[0]['id'] last_id = videos[-1]['id'] watch_endpoint = try_get( @@ -4253,20 +4848,25 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _extract_from_playlist(self, item_id, url, data, playlist, ytcfg): title = playlist.get('title') or try_get( - data, lambda x: x['titleText']['simpleText'], compat_str) + data, lambda x: x['titleText']['simpleText'], str) playlist_id = playlist.get('playlistId') or item_id # Delegating everything except mix playlists to regular tab-based playlist URL playlist_url = urljoin(url, try_get( playlist, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'], - compat_str)) - if playlist_url and playlist_url != url: + str)) + + # Some playlists are unviewable but YouTube still provides a link to the (broken) playlist page [1] + # [1] MLCT, RLTDwFCb4jeqaKWnciAYM-ZVHg + is_known_unviewable = re.fullmatch(r'MLCT|RLTD[\w-]{22}', playlist_id) + + if playlist_url and playlist_url != url and not is_known_unviewable: return self.url_result( playlist_url, ie=YoutubeTabIE.ie_key(), video_id=playlist_id, video_title=title) return 
self.playlist_result( - self._extract_mix_playlist(playlist, playlist_id, data, ytcfg), + self._extract_inline_playlist(playlist, playlist_id, data, ytcfg), playlist_id=playlist_id, playlist_title=title) def _extract_availability(self, data): @@ -4275,31 +4875,40 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): Note: Unless YouTube tells us explicitly, we do not assume it is public @param data: response """ - is_private = is_unlisted = None - renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {} - badge_labels = self._extract_badges(renderer) + sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') or {} + playlist_header_renderer = traverse_obj(data, ('header', 'playlistHeaderRenderer')) or {} + player_header_privacy = playlist_header_renderer.get('privacy') + + badges = self._extract_badges(sidebar_renderer) # Personal playlists, when authenticated, have a dropdown visibility selector instead of a badge - privacy_dropdown_entries = try_get( - renderer, lambda x: x['privacyForm']['dropdownFormFieldRenderer']['dropdown']['dropdownRenderer']['entries'], list) or [] - for renderer_dict in privacy_dropdown_entries: - is_selected = try_get( - renderer_dict, lambda x: x['privacyDropdownItemRenderer']['isSelected'], bool) or False - if not is_selected: - continue - label = self._get_text(renderer_dict, ('privacyDropdownItemRenderer', 'label')) - if label: - badge_labels.add(label.lower()) - break + privacy_setting_icon = get_first( + (playlist_header_renderer, sidebar_renderer), + ('privacyForm', 'dropdownFormFieldRenderer', 'dropdown', 'dropdownRenderer', 'entries', + lambda _, v: v['privacyDropdownItemRenderer']['isSelected'], 'privacyDropdownItemRenderer', 'icon', 'iconType'), + expected_type=str) - for badge_label in badge_labels: - if badge_label == 'unlisted': - is_unlisted = True - elif badge_label == 'private': - is_private = True - elif badge_label == 'public': - is_unlisted = is_private = False - return self._availability(is_private, False, False, False, is_unlisted) + microformats_is_unlisted = traverse_obj( + data, ('microformat', 'microformatDataRenderer', 'unlisted'), expected_type=bool) + + return ( + 'public' if ( + self._has_badge(badges, BadgeType.AVAILABILITY_PUBLIC) + or player_header_privacy == 'PUBLIC' + or privacy_setting_icon == 'PRIVACY_PUBLIC') + else self._availability( + is_private=( + self._has_badge(badges, BadgeType.AVAILABILITY_PRIVATE) + or player_header_privacy == 'PRIVATE' if player_header_privacy is not None + else privacy_setting_icon == 'PRIVACY_PRIVATE' if privacy_setting_icon is not None else None), + is_unlisted=( + self._has_badge(badges, BadgeType.AVAILABILITY_UNLISTED) + or player_header_privacy == 'UNLISTED' if player_header_privacy is not None + else privacy_setting_icon == 'PRIVACY_UNLISTED' if privacy_setting_icon is not None + else microformats_is_unlisted if microformats_is_unlisted is not None else None), + needs_subscription=self._has_badge(badges, BadgeType.AVAILABILITY_SUBSCRIPTION) or None, + needs_premium=self._has_badge(badges, BadgeType.AVAILABILITY_PREMIUM) or None, + needs_auth=False)) @staticmethod def _extract_sidebar_info_renderer(data, info_renderer, expected_type=dict): @@ -4312,94 +4921,75 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): def _reload_with_unavailable_videos(self, item_id, data, ytcfg): """ - Get playlist with unavailable videos if the 'show unavailable videos' button exists. 
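The rewritten _extract_availability above chains conditional expressions that are easy to misread: since `or` binds tighter than `... if ... else ...`, each badge test only takes effect on the branch where the header signal is present. A dependency-free restatement of the 'unlisted' leg in isolation (function and parameter names are illustrative, not the extractor's API):

def is_unlisted(has_unlisted_badge, header_privacy, privacy_icon, microformat_unlisted):
    # Strongest signal first; each weaker one is consulted only when
    # every stronger one is absent (None means 'unknown').
    if header_privacy is not None:
        return has_unlisted_badge or header_privacy == 'UNLISTED'
    if privacy_icon is not None:
        return privacy_icon == 'PRIVACY_UNLISTED'
    return microformat_unlisted

assert is_unlisted(False, None, 'PRIVACY_UNLISTED', None) is True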
+ Reload playlists with unavailable videos (e.g. private videos, region blocked, etc.) """ - browse_id = params = None - renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') - if not renderer: + is_playlist = bool(traverse_obj( + data, ('metadata', 'playlistMetadataRenderer'), ('header', 'playlistHeaderRenderer'))) + if not is_playlist: return - menu_renderer = try_get( - renderer, lambda x: x['menu']['menuRenderer']['items'], list) or [] - for menu_item in menu_renderer: - if not isinstance(menu_item, dict): - continue - nav_item_renderer = menu_item.get('menuNavigationItemRenderer') - text = try_get( - nav_item_renderer, lambda x: x['text']['simpleText'], compat_str) - if not text or text.lower() != 'show unavailable videos': - continue - browse_endpoint = try_get( - nav_item_renderer, lambda x: x['navigationEndpoint']['browseEndpoint'], dict) or {} - browse_id = browse_endpoint.get('browseId') - params = browse_endpoint.get('params') - break - headers = self.generate_api_headers( ytcfg=ytcfg, account_syncid=self._extract_account_syncid(ytcfg, data), visitor_data=self._extract_visitor_data(data, ytcfg)) query = { - 'params': params or 'wgYCCAA=', - 'browseId': browse_id or 'VL%s' % item_id + 'params': 'wgYCCAA=', + 'browseId': f'VL{item_id}' } return self._extract_response( item_id=item_id, headers=headers, query=query, check_get_keys='contents', fatal=False, ytcfg=ytcfg, - note='Downloading API JSON with unavailable videos') + note='Redownloading playlist API JSON with unavailable videos') + + @functools.cached_property + def skip_webpage(self): + return 'webpage' in self._configuration_arg('skip', ie_key=YoutubeTabIE.ie_key()) def _extract_webpage(self, url, item_id, fatal=True): - retries = self.get_param('extractor_retries', 3) - count = -1 - webpage = data = last_error = None - while count < retries: - count += 1 - # Sometimes youtube returns a webpage with incomplete ytInitialData - # See: https://github.com/hypervideo/hypervideo/issues/116 - if last_error: - self.report_warning('%s. Retrying ...' 
% last_error) + webpage, data = None, None + for retry in self.RetryManager(fatal=fatal): try: - webpage = self._download_webpage( - url, item_id, - note='Downloading webpage%s' % (' (retry #%d)' % count if count else '',)) + webpage = self._download_webpage(url, item_id, note='Downloading webpage') data = self.extract_yt_initial_data(item_id, webpage or '', fatal=fatal) or {} except ExtractorError as e: if isinstance(e.cause, network_exceptions): - if not isinstance(e.cause, compat_HTTPError) or e.cause.code not in (403, 429): - last_error = error_to_compat_str(e.cause or e.msg) - if count < retries: - continue - if fatal: - raise - self.report_warning(error_to_compat_str(e)) + if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code not in (403, 429): + retry.error = e + continue + self._error_or_warning(e, fatal=fatal) break - else: - try: - self._extract_and_report_alerts(data) - except ExtractorError as e: - if fatal: - raise - self.report_warning(error_to_compat_str(e)) - break - if dict_get(data, ('contents', 'currentVideoEndpoint', 'onResponseReceivedActions')): - break + try: + self._extract_and_report_alerts(data) + except ExtractorError as e: + self._error_or_warning(e, fatal=fatal) + break - last_error = 'Incomplete yt initial data received' - if count >= retries: - if fatal: - raise ExtractorError(last_error) - self.report_warning(last_error) - break + # Sometimes youtube returns a webpage with incomplete ytInitialData + # See: https://github.com/hypervideo/hypervideo/issues/116 + if not traverse_obj(data, 'contents', 'currentVideoEndpoint', 'onResponseReceivedActions'): + retry.error = ExtractorError('Incomplete yt initial data received') + continue return webpage, data + def _report_playlist_authcheck(self, ytcfg, fatal=True): + """Use if failed to extract ytcfg (and data) from initial webpage""" + if not ytcfg and self.is_authenticated: + msg = 'Playlists that require authentication may not extract correctly without a successful webpage download' + if 'authcheck' not in self._configuration_arg('skip', ie_key=YoutubeTabIE.ie_key()) and fatal: + raise ExtractorError( + f'{msg}. 
If you are not downloading private content, or ' + 'your cookies are only for the first account and channel,' + ' pass "--extractor-args youtubetab:skip=authcheck" to skip this check', + expected=True) + self.report_warning(msg, only_once=True) + def _extract_data(self, url, item_id, ytcfg=None, fatal=True, webpage_fatal=False, default_client='web'): data = None - if 'webpage' not in self._configuration_arg('skip'): + if not self.skip_webpage: webpage, data = self._extract_webpage(url, item_id, fatal=webpage_fatal) ytcfg = ytcfg or self.extract_ytcfg(item_id, webpage) # Reject webpage data if redirected to home page without explicitly requesting - selected_tab = self._extract_selected_tab(traverse_obj( - data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list, default=[]), fatal=False) or {} + selected_tab = self._extract_selected_tab(self._extract_tab_renderers(data), fatal=False) or {} if (url != 'https://www.youtube.com/feed/recommended' and selected_tab.get('tabIdentifier') == 'FEwhat_to_watch' # Home page and 'no-youtube-channel-redirect' not in self.get_param('compat_opts', [])): @@ -4408,14 +4998,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): raise ExtractorError(msg, expected=True) self.report_warning(msg, only_once=True) if not data: - if not ytcfg and self.is_authenticated: - msg = 'Playlists that require authentication may not extract correctly without a successful webpage download.' - if 'authcheck' not in self._configuration_arg('skip') and fatal: - raise ExtractorError( - msg + ' If you are not downloading private content, or your cookies are only for the first account and channel,' - ' pass "--extractor-args youtubetab:skip=authcheck" to skip this check', - expected=True) - self.report_warning(msg, only_once=True) + self._report_playlist_authcheck(ytcfg, fatal=fatal) data = self._extract_tab_endpoint(url, item_id, ytcfg, fatal=fatal, default_client=default_client) return data, ytcfg @@ -4453,14 +5036,20 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): ('contents', 'tabbedSearchResultsRenderer', 'tabs', 0, 'tabRenderer', 'content', 'sectionListRenderer', 'contents'), ('continuationContents', ), ) - check_get_keys = tuple(set(keys[0] for keys in content_keys)) + display_id = f'query "{query}"' + check_get_keys = tuple({keys[0] for keys in content_keys}) + ytcfg = self._download_ytcfg(default_client, display_id) if not self.skip_webpage else {} + self._report_playlist_authcheck(ytcfg, fatal=False) continuation_list = [None] + search = None for page_num in itertools.count(1): data.update(continuation_list[0] or {}) + headers = self.generate_api_headers( + ytcfg=ytcfg, visitor_data=self._extract_visitor_data(search), default_client=default_client) search = self._extract_response( - item_id='query "%s" page %s' % (query, page_num), ep='search', query=data, - default_client=default_client, check_get_keys=check_get_keys) + item_id=f'{display_id} page {page_num}', ep='search', query=data, + default_client=default_client, check_get_keys=check_get_keys, ytcfg=ytcfg, headers=headers) slr_contents = traverse_obj(search, *content_keys) yield from self._extract_entries({'contents': list(variadic(slr_contents))}, continuation_list) if not continuation_list[0]: @@ -4578,6 +5167,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 
'availability': 'public', }, 'playlist_count': 1, }, { @@ -4595,6 +5185,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCmlqkdCBesrv2Lak1mF_MxA', 'channel_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', 'uploader_url': 'https://www.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA', + 'availability': 'public', }, 'playlist_count': 0, }, { @@ -4741,6 +5332,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UCEPzS1rYsrkqzSLNp76nrcg', 'channel_url': 'https://www.youtube.com/c/ChRiStIaAn008', 'channel': 'Christiaan008', + 'availability': 'public', }, 'playlist_count': 96, }, { @@ -4759,6 +5351,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'view_count': int, 'description': '', 'channel_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q', + 'availability': 'public', }, 'playlist_mincount': 1123, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -4782,6 +5375,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': 'Interstellar Movie', 'description': '', 'modified_date': r're:\d{8}', + 'availability': 'public', }, 'playlist_mincount': 21, }, { @@ -4800,6 +5394,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UCTYLiWFZy8xtPwxFwX9rV7Q', 'channel_id': 'UCTYLiWFZy8xtPwxFwX9rV7Q', 'modified_date': r're:\d{8}', + 'availability': 'public', }, 'playlist_mincount': 200, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -4819,6 +5414,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader_url': 'https://www.youtube.com/c/blanktv', 'modified_date': r're:\d{8}', 'description': '', + 'availability': 'public', }, 'playlist_mincount': 1000, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -4837,6 +5433,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UC9-y-6csu5WGm29I7JiwpnA', 'channel_url': 'https://www.youtube.com/user/Computerphile', 'channel': 'Computerphile', + 'availability': 'public', + 'modified_date': '20190712', }, 'playlist_mincount': 11, }, { @@ -4874,7 +5472,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): }, { 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live', 'info_dict': { - 'id': 'GgL890LIznQ', # This will keep changing + 'id': 'Wq15eF5vCbI', # This will keep changing 'ext': 'mp4', 'title': str, 'uploader': 'Sky News', @@ -4885,18 +5483,19 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'categories': ['News & Politics'], 'tags': list, 'like_count': int, - 'release_timestamp': 1642502819, + 'release_timestamp': int, 'channel': 'Sky News', 'channel_id': 'UCoMdktPbSTixAyNGwb-UYkQ', 'age_limit': 0, 'view_count': int, - 'thumbnail': 'https://i.ytimg.com/vi/GgL890LIznQ/maxresdefault_live.jpg', + 'thumbnail': r're:https?://i\.ytimg\.com/vi/[^/]+/maxresdefault(?:_live)?\.jpg', 'playable_in_embed': True, - 'release_date': '20220118', + 'release_date': r're:\d+', 'availability': 'public', 'live_status': 'is_live', 'channel_url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ', - 'channel_follower_count': int + 'channel_follower_count': int, + 'concurrent_view_count': int, }, 'params': { 'skip_download': True, @@ -4974,7 +5573,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': '#cctv9', 'tags': [], }, - 'playlist_mincount': 350, + 'playlist_mincount': 300, # not consistent but should be over 300 }, { 'url': 'https://www.youtube.com/watch?list=PLW4dVinRY435CBE_JD3t-0SRXKfnZHS1P&feature=youtu.be&v=M9cJMXmQ_ZU', 'only_matching': True, @@ -4994,7 +5593,7 @@ 
class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'uploader': 'NoCopyrightSounds', 'description': 'Providing you with copyright free / safe music for gaming, live streaming, studying and more!', 'uploader_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', - 'title': 'NCS Releases', + 'title': 'NCS : All Releases 💿', 'uploader_url': 'https://www.youtube.com/c/NoCopyrightSounds', 'channel_url': 'https://www.youtube.com/c/NoCopyrightSounds', 'modified_date': r're:\d{8}', @@ -5002,6 +5601,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel_id': 'UC_aEa8K-EOJ3D6gOs7HcyNg', 'tags': [], 'channel': 'NoCopyrightSounds', + 'availability': 'public', }, 'playlist_mincount': 166, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], @@ -5022,23 +5622,18 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'modified_date': r're:\d{8}', 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', 'description': '', + 'availability': 'public', }, - 'expected_warnings': [ - 'The URL does not have a videos tab', - r'[Uu]navailable videos (are|will be) hidden', - ], 'playlist_mincount': 101, }, { - 'note': 'Topic without a UU playlist', + # Destination channel with only a hidden self tab (tab id is UCtFRv9O2AHqOZjjynzrv-xg) + # Treat as a general feed 'url': 'https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg', 'info_dict': { 'id': 'UCtFRv9O2AHqOZjjynzrv-xg', 'title': 'UCtFRv9O2AHqOZjjynzrv-xg', 'tags': [], }, - 'expected_warnings': [ - 'the playlist redirect gave error', - ], 'playlist_mincount': 9, }, { 'note': 'Youtube music Album', @@ -5063,7 +5658,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'title': 'hypervideo unlisted playlist test', 'availability': 'unlisted', 'tags': [], - 'modified_date': '20211208', + 'modified_date': '20220418', 'channel': 'colethedj', 'view_count': int, 'description': '', @@ -5106,6 +5701,7 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'skip_download': True, 'extractor_args': {'youtubetab': {'skip': ['webpage']}} }, + 'skip': 'Query for sorting no longer works', }, { 'note': 'API Fallback: Topic, should redirect to playlist?list=UU...', 'url': 'https://music.youtube.com/browse/UC9ALqqC4aIeG5iDs7i90Bfw', @@ -5122,11 +5718,8 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'channel': 'Royalty Free Music - Topic', 'view_count': int, 'uploader_url': 'https://www.youtube.com/channel/UC9ALqqC4aIeG5iDs7i90Bfw', + 'availability': 'public', }, - 'expected_warnings': [ - 'does not have a videos tab', - r'[Uu]navailable videos (are|will be) hidden', - ], 'playlist_mincount': 101, 'params': { 'skip_download': True, @@ -5136,130 +5729,429 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): 'note': 'non-standard redirect to regional channel', 'url': 'https://www.youtube.com/channel/UCwVVpHQ2Cs9iGJfpdFngePQ', 'only_matching': True + }, { + 'note': 'collaborative playlist (uploader name in the form "by <uploader> and x other(s)")', + 'url': 'https://www.youtube.com/playlist?list=PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', + 'info_dict': { + 'id': 'PLx-_-Kk4c89oOHEDQAojOXzEzemXxoqx6', + 'modified_date': '20220407', + 'channel_url': 'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', + 'tags': [], + 'uploader_id': 'UCKcqXmCcyqnhgpA5P0oHH_Q', + 'uploader': 'pukkandan', + 'availability': 'unlisted', + 'channel_id': 'UCKcqXmCcyqnhgpA5P0oHH_Q', + 'channel': 'pukkandan', + 'description': 'Test for collaborative playlist', + 'title': 'hypervideo test - collaborative playlist', + 'view_count': int, + 'uploader_url': 
'https://www.youtube.com/channel/UCKcqXmCcyqnhgpA5P0oHH_Q', + }, + 'playlist_mincount': 2 + }, { + 'note': 'translated tab name', + 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/playlists', + 'info_dict': { + 'id': 'UCiu-3thuViMebBjw_5nWYrA', + 'tags': [], + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'description': 'test description', + 'title': 'cole-dlp-test-acc - 再生リスト', + 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel': 'cole-dlp-test-acc', + 'channel_follower_count': int, + }, + 'playlist_mincount': 1, + 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, + 'expected_warnings': ['Preferring "ja"'], + }, { + # XXX: this should really check flat playlist entries, but the test suite doesn't support that + 'note': 'preferred lang set with playlist with translated video titles', + 'url': 'https://www.youtube.com/playlist?list=PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', + 'info_dict': { + 'id': 'PLt5yu3-wZAlQAaPZ5Z-rJoTdbT-45Q7c0', + 'tags': [], + 'view_count': int, + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'uploader': 'cole-dlp-test-acc', + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel': 'cole-dlp-test-acc', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'description': 'test', + 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'title': 'dlp test playlist', + 'availability': 'public', + }, + 'playlist_mincount': 1, + 'params': {'extractor_args': {'youtube': {'lang': ['ja']}}}, + 'expected_warnings': ['Preferring "ja"'], + }, { + # shorts audio pivot for 2GtVksBMYFM. + 'url': 'https://www.youtube.com/feed/sfv_audio_pivot?bp=8gUrCikSJwoLMkd0VmtzQk1ZRk0SCzJHdFZrc0JNWUZNGgsyR3RWa3NCTVlGTQ==', + 'info_dict': { + 'id': 'sfv_audio_pivot', + 'title': 'sfv_audio_pivot', + 'tags': [], + }, + 'playlist_mincount': 50, + + }, { + # Channel with a real live tab (not to be mistaken with streams tab) + # Do not treat like it should redirect to live stream + 'url': 'https://www.youtube.com/channel/UCEH7P7kyJIkS_gJf93VYbmg/live', + 'info_dict': { + 'id': 'UCEH7P7kyJIkS_gJf93VYbmg', + 'title': 'UCEH7P7kyJIkS_gJf93VYbmg - Live', + 'tags': [], + }, + 'playlist_mincount': 20, + }, { + # Tab name is not the same as tab id + 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/letsplay', + 'info_dict': { + 'id': 'UCQvWX73GQygcwXOTSf_VDVg', + 'title': 'UCQvWX73GQygcwXOTSf_VDVg - Let\'s play', + 'tags': [], + }, + 'playlist_mincount': 8, + }, { + # Home tab id is literally home. Not to get mistaken with featured + 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/home', + 'info_dict': { + 'id': 'UCQvWX73GQygcwXOTSf_VDVg', + 'title': 'UCQvWX73GQygcwXOTSf_VDVg - Home', + 'tags': [], + }, + 'playlist_mincount': 8, + }, { + # Should get three playlists for videos, shorts and streams tabs + 'url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', + 'info_dict': { + 'id': 'UCK9V2B22uJYu3N7eR_BT9QA', + 'title': 'Polka Ch. 尾丸ポルカ', + 'channel_follower_count': int, + 'channel_id': 'UCK9V2B22uJYu3N7eR_BT9QA', + 'channel_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', + 'uploader': 'Polka Ch. 尾丸ポルカ', + 'description': 'md5:3b8df1ac5af337aa206e37ee3d181ec9', + 'channel': 'Polka Ch. 
尾丸ポルカ', + 'tags': 'count:35', + 'uploader_url': 'https://www.youtube.com/channel/UCK9V2B22uJYu3N7eR_BT9QA', + 'uploader_id': 'UCK9V2B22uJYu3N7eR_BT9QA', + }, + 'playlist_count': 3, + }, { + # Shorts tab with channel with handle + 'url': 'https://www.youtube.com/@NotJustBikes/shorts', + 'info_dict': { + 'id': 'UC0intLFzLaudFG-xAvUEO-A', + 'title': 'Not Just Bikes - Shorts', + 'tags': 'count:12', + 'uploader': 'Not Just Bikes', + 'channel_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', + 'description': 'md5:7513148b1f02b924783157d84c4ea555', + 'channel_follower_count': int, + 'uploader_id': 'UC0intLFzLaudFG-xAvUEO-A', + 'channel_id': 'UC0intLFzLaudFG-xAvUEO-A', + 'uploader_url': 'https://www.youtube.com/channel/UC0intLFzLaudFG-xAvUEO-A', + 'channel': 'Not Just Bikes', + }, + 'playlist_mincount': 10, + }, { + # Streams tab + 'url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig/streams', + 'info_dict': { + 'id': 'UC3eYAvjCVwNHgkaGbXX3sig', + 'title': '中村悠一 - Live', + 'tags': 'count:7', + 'channel_id': 'UC3eYAvjCVwNHgkaGbXX3sig', + 'channel_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', + 'uploader_id': 'UC3eYAvjCVwNHgkaGbXX3sig', + 'channel': '中村悠一', + 'uploader_url': 'https://www.youtube.com/channel/UC3eYAvjCVwNHgkaGbXX3sig', + 'channel_follower_count': int, + 'uploader': '中村悠一', + 'description': 'md5:e744f6c93dafa7a03c0c6deecb157300', + }, + 'playlist_mincount': 60, + }, { + # Channel with no uploads and hence no videos, streams, shorts tabs or uploads playlist. This should fail. + # See test_youtube_lists + 'url': 'https://www.youtube.com/channel/UC2yXPzFejc422buOIzn_0CA', + 'only_matching': True, + }, { + # No uploads and no UCID given. Should fail with no uploads error + # See test_youtube_lists + 'url': 'https://www.youtube.com/news', + 'only_matching': True + }, { + # No videos tab but has a shorts tab + 'url': 'https://www.youtube.com/c/TKFShorts', + 'info_dict': { + 'id': 'UCgJ5_1F6yJhYLnyMszUdmUg', + 'title': 'Shorts Break - Shorts', + 'tags': 'count:32', + 'channel_id': 'UCgJ5_1F6yJhYLnyMszUdmUg', + 'channel': 'Shorts Break', + 'description': 'md5:a6c234cf3d50d878ef8721e34457cd11', + 'uploader': 'Shorts Break', + 'channel_follower_count': int, + 'uploader_id': 'UCgJ5_1F6yJhYLnyMszUdmUg', + 'uploader_url': 'https://www.youtube.com/channel/UCgJ5_1F6yJhYLnyMszUdmUg', + 'channel_url': 'https://www.youtube.com/channel/UCgJ5_1F6yJhYLnyMszUdmUg', + }, + 'playlist_mincount': 30, + }, { + # Trending Now Tab. tab id is empty + 'url': 'https://www.youtube.com/feed/trending', + 'info_dict': { + 'id': 'trending', + 'title': 'trending - Now', + 'tags': [], + }, + 'playlist_mincount': 30, + }, { + # Trending Gaming Tab. 
tab id is empty + 'url': 'https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D', + 'info_dict': { + 'id': 'trending', + 'title': 'trending - Gaming', + 'tags': [], + }, + 'playlist_mincount': 30, + }, { + # Shorts url result in shorts tab + 'url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA/shorts', + 'info_dict': { + 'id': 'UCiu-3thuViMebBjw_5nWYrA', + 'title': 'cole-dlp-test-acc - Shorts', + 'uploader_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel': 'cole-dlp-test-acc', + 'channel_follower_count': int, + 'description': 'test description', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'tags': [], + 'uploader': 'cole-dlp-test-acc', + 'uploader_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + + }, + 'playlist': [{ + 'info_dict': { + '_type': 'url', + 'ie_key': 'Youtube', + 'url': 'https://www.youtube.com/shorts/sSM9J5YH_60', + 'id': 'sSM9J5YH_60', + 'channel_id': 'UCiu-3thuViMebBjw_5nWYrA', + 'title': 'SHORT short', + 'channel': 'cole-dlp-test-acc', + 'channel_url': 'https://www.youtube.com/channel/UCiu-3thuViMebBjw_5nWYrA', + 'view_count': int, + 'thumbnails': list, + } + }], + 'params': {'extract_flat': True}, + }, { + # Live video status should be extracted + 'url': 'https://www.youtube.com/channel/UCQvWX73GQygcwXOTSf_VDVg/live', + 'info_dict': { + 'id': 'UCQvWX73GQygcwXOTSf_VDVg', + 'title': 'UCQvWX73GQygcwXOTSf_VDVg - Live', # TODO, should be Minecraft - Live or Minecraft - Topic - Live + 'tags': [] + }, + 'playlist': [{ + 'info_dict': { + '_type': 'url', + 'ie_key': 'Youtube', + 'url': 'startswith:https://www.youtube.com/watch?v=', + 'id': str, + 'title': str, + 'live_status': 'is_live', + 'channel_id': str, + 'channel_url': str, + 'concurrent_view_count': int, + 'channel': str, + } + }], + 'params': {'extract_flat': True}, + 'playlist_mincount': 1 }] @classmethod def suitable(cls, url): - return False if YoutubeIE.suitable(url) else super( - YoutubeTabIE, cls).suitable(url) + return False if YoutubeIE.suitable(url) else super().suitable(url) + + _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(not_channel)|(?P<tab>/[^?#/]+))?(?P<post>.*)$') + + def _get_url_mobj(self, url): + mobj = self._URL_RE.match(url).groupdict() + mobj.update((k, '') for k, v in mobj.items() if v is None) + return mobj + + def _extract_tab_id_and_name(self, tab, base_url='https://www.youtube.com'): + tab_name = (tab.get('title') or '').lower() + tab_url = urljoin(base_url, traverse_obj( + tab, ('endpoint', 'commandMetadata', 'webCommandMetadata', 'url'))) + + tab_id = (tab_url and self._get_url_mobj(tab_url)['tab'][1:] + or traverse_obj(tab, 'tabIdentifier', expected_type=str)) + if tab_id: + return { + 'TAB_ID_SPONSORSHIPS': 'membership', + }.get(tab_id, tab_id), tab_name + + # Fallback to tab name if we cannot get the tab id. + # XXX: should we strip non-ascii letters? e.g. in case of 'let's play' tab example on special gaming channel + # Note that in the case of translated tab name this may result in an empty string, which we don't want. 
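A standalone sketch of the _get_url_mobj normalisation defined above, using a hypothetical simplified pattern in place of the real _URL_RE (which interpolates _VALID_URL):

import re

# Hypothetical stand-in for _URL_RE
URL_RE = re.compile(r'(?P<pre>https?://www\.youtube\.com/channel/[\w-]+)(?P<tab>/[^?#/]+)?(?P<post>.*)$')

def get_url_mobj(url):
    mobj = URL_RE.match(url).groupdict()
    # Optional groups that did not participate are None; blank them so that
    # later concatenation such as pre + tab + post is safe.
    mobj.update((k, '') for k, v in mobj.items() if v is None)
    return mobj

assert get_url_mobj('https://www.youtube.com/channel/UC1234/streams')['tab'] == '/streams'
assert get_url_mobj('https://www.youtube.com/channel/UC1234')['tab'] == ''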
+ if tab_name: + self.write_debug(f'Falling back to selected tab name: {tab_name}') + return { + 'home': 'featured', + 'live': 'streams', + }.get(tab_name, tab_name), tab_name - _URL_RE = re.compile(rf'(?P<pre>{_VALID_URL})(?(not_channel)|(?P<tab>/\w+))?(?P<post>.*)$') + def _has_tab(self, tabs, tab_id): + return any(self._extract_tab_id_and_name(tab)[0] == tab_id for tab in tabs) @YoutubeTabBaseInfoExtractor.passthrough_smuggled_data def _real_extract(self, url, smuggled_data): item_id = self._match_id(url) - url = compat_urlparse.urlunparse( - compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com')) + url = urllib.parse.urlunparse( + urllib.parse.urlparse(url)._replace(netloc='www.youtube.com')) compat_opts = self.get_param('compat_opts', []) - def get_mobj(url): - mobj = self._URL_RE.match(url).groupdict() - mobj.update((k, '') for k, v in mobj.items() if v is None) - return mobj - - mobj, redirect_warning = get_mobj(url), None - # Youtube returns incomplete data if tabname is not lower case - pre, tab, post, is_channel = mobj['pre'], mobj['tab'].lower(), mobj['post'], not mobj['not_channel'] - if is_channel: - if smuggled_data.get('is_music_url'): - if item_id[:2] == 'VL': # Youtube music VL channels have an equivalent playlist - item_id = item_id[2:] - pre, tab, post, is_channel = f'https://www.youtube.com/playlist?list={item_id}', '', '', False - elif item_id[:2] == 'MP': # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist - mdata = self._extract_tab_endpoint( - f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music') - murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), - get_all=False, expected_type=compat_str) - if not murl: - raise ExtractorError('Failed to resolve album to playlist') - return self.url_result(murl, ie=YoutubeTabIE.ie_key()) - elif mobj['channel_type'] == 'browse': # Youtube music /browse/ should be changed to /channel/ - pre = f'https://www.youtube.com/channel/{item_id}' - - original_tab_name = tab + mobj = self._get_url_mobj(url) + pre, tab, post, is_channel = mobj['pre'], mobj['tab'], mobj['post'], not mobj['not_channel'] + if is_channel and smuggled_data.get('is_music_url'): + if item_id[:2] == 'VL': # Youtube music VL channels have an equivalent playlist + return self.url_result( + f'https://music.youtube.com/playlist?list={item_id[2:]}', YoutubeTabIE, item_id[2:]) + elif item_id[:2] == 'MP': # Resolve albums (/[channel/browse]/MP...) to their equivalent playlist + mdata = self._extract_tab_endpoint( + f'https://music.youtube.com/channel/{item_id}', item_id, default_client='web_music') + murl = traverse_obj(mdata, ('microformat', 'microformatDataRenderer', 'urlCanonical'), + get_all=False, expected_type=str) + if not murl: + raise ExtractorError('Failed to resolve album to playlist') + return self.url_result(murl, YoutubeTabIE) + elif mobj['channel_type'] == 'browse': # Youtube music /browse/ should be changed to /channel/ + return self.url_result( + f'https://music.youtube.com/channel/{item_id}{tab}{post}', YoutubeTabIE, item_id) + + original_tab_id, display_id = tab[1:], f'{item_id}{tab}' if is_channel and not tab and 'no-youtube-channel-redirect' not in compat_opts: - # Home URLs should redirect to /videos/ - redirect_warning = ('A channel/user page was given. All the channel\'s videos will be downloaded. 
' - 'To download only the videos in the home page, add a "/featured" to the URL') - tab = '/videos' - - url = ''.join((pre, tab, post)) - mobj = get_mobj(url) + url = f'{pre}/videos{post}' # Handle both video/playlist URLs qs = parse_qs(url) - video_id, playlist_id = [qs.get(key, [None])[0] for key in ('v', 'list')] - + video_id, playlist_id = [traverse_obj(qs, (key, 0)) for key in ('v', 'list')] if not video_id and mobj['not_channel'].startswith('watch'): if not playlist_id: # If there is neither video or playlist ids, youtube redirects to home page, which is undesirable - raise ExtractorError('Unable to recognize tab page') + raise ExtractorError('A video URL was given without video ID', expected=True) # Common mistake: https://www.youtube.com/watch?list=playlist_id self.report_warning(f'A video URL was given without video ID. Trying to download playlist {playlist_id}') - url = f'https://www.youtube.com/playlist?list={playlist_id}' - mobj = get_mobj(url) + return self.url_result( + f'https://www.youtube.com/playlist?list={playlist_id}', YoutubeTabIE, playlist_id) - if video_id and playlist_id: - if self.get_param('noplaylist'): - self.to_screen(f'Downloading just video {video_id} because of --no-playlist') - return self.url_result(f'https://www.youtube.com/watch?v={video_id}', - ie=YoutubeIE.ie_key(), video_id=video_id) - self.to_screen(f'Downloading playlist {playlist_id}; add --no-playlist to just download video {video_id}') + if not self._yes_playlist(playlist_id, video_id): + return self.url_result( + f'https://www.youtube.com/watch?v={video_id}', YoutubeIE, video_id) - data, ytcfg = self._extract_data(url, item_id) + data, ytcfg = self._extract_data(url, display_id) # YouTube may provide a non-standard redirect to the regional channel # See: https://github.com/hypervideo/hypervideo/issues/2694 + # https://support.google.com/youtube/answer/2976814#zippy=,conditional-redirects redirect_url = traverse_obj( data, ('onResponseReceivedActions', ..., 'navigateAction', 'endpoint', 'commandMetadata', 'webCommandMetadata', 'url'), get_all=False) if redirect_url and 'no-youtube-channel-redirect' not in compat_opts: - redirect_url = ''.join(( - urljoin('https://www.youtube.com', redirect_url), mobj['tab'], mobj['post'])) - self.to_screen(f'This playlist is likely not available in your region. Following redirect to regional playlist {redirect_url}') - return self.url_result(redirect_url, ie=YoutubeTabIE.ie_key()) + redirect_url = ''.join((urljoin('https://www.youtube.com', redirect_url), tab, post)) + self.to_screen(f'This playlist is likely not available in your region. 
Following conditional redirect to {redirect_url}') + return self.url_result(redirect_url, YoutubeTabIE) - tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list) - if tabs: + tabs, extra_tabs = self._extract_tab_renderers(data), [] + if is_channel and tabs and 'no-youtube-channel-redirect' not in compat_opts: selected_tab = self._extract_selected_tab(tabs) - selected_tab_name = selected_tab.get('title', '').lower() - if selected_tab_name == 'home': - selected_tab_name = 'featured' - requested_tab_name = mobj['tab'][1:] - if 'no-youtube-channel-redirect' not in compat_opts: - if requested_tab_name == 'live': - # Live tab should have redirected to the video - raise ExtractorError('The channel is not currently live', expected=True) - if requested_tab_name not in ('', selected_tab_name): - redirect_warning = f'The channel does not have a {requested_tab_name} tab' - if not original_tab_name: - if item_id[:2] == 'UC': - # Topic channels don't have /videos. Use the equivalent playlist instead - pl_id = f'UU{item_id[2:]}' - pl_url = f'https://www.youtube.com/playlist?list={pl_id}' - try: - data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True) - except ExtractorError: - redirect_warning += ' and the playlist redirect gave error' - else: - item_id, url, selected_tab_name = pl_id, pl_url, requested_tab_name - redirect_warning += f'. Redirecting to playlist {pl_id} instead' - if selected_tab_name and selected_tab_name != requested_tab_name: - redirect_warning += f'. {selected_tab_name} tab is being downloaded instead' + selected_tab_id, selected_tab_name = self._extract_tab_id_and_name(selected_tab, url) # NB: Name may be translated + self.write_debug(f'Selected tab: {selected_tab_id!r} ({selected_tab_name}), Requested tab: {original_tab_id!r}') + + if not original_tab_id and selected_tab_name: + self.to_screen('Downloading all uploads of the channel. ' + 'To download only the videos in a specific tab, pass the tab\'s URL') + if self._has_tab(tabs, 'streams'): + extra_tabs.append(''.join((pre, '/streams', post))) + if self._has_tab(tabs, 'shorts'): + extra_tabs.append(''.join((pre, '/shorts', post))) + # XXX: Members-only tab should also be extracted + + if not extra_tabs and selected_tab_id != 'videos': + # Channel does not have streams, shorts or videos tabs + if item_id[:2] != 'UC': + raise ExtractorError('This channel has no uploads', expected=True) + + # Topic channels don't have /videos. Use the equivalent playlist instead + pl_id = f'UU{item_id[2:]}' + pl_url = f'https://www.youtube.com/playlist?list={pl_id}' + try: + data, ytcfg = self._extract_data(pl_url, pl_id, ytcfg=ytcfg, fatal=True, webpage_fatal=True) + except ExtractorError: + raise ExtractorError('This channel has no uploads', expected=True) else: - raise ExtractorError(redirect_warning, expected=True) + item_id, url = pl_id, pl_url + self.to_screen( + f'The channel does not have a videos, shorts, or live tab. 
Redirecting to playlist {pl_id} instead') + + elif extra_tabs and selected_tab_id != 'videos': + # When there are shorts/live tabs but not videos tab + url, data = f'{pre}{post}', None + + elif (original_tab_id or 'videos') != selected_tab_id: + if original_tab_id == 'live': + # Live tab should have redirected to the video + # Except in the case the channel has an actual live tab + # Example: https://www.youtube.com/channel/UCEH7P7kyJIkS_gJf93VYbmg/live + raise UserNotLive(video_id=item_id) + elif selected_tab_name: + raise ExtractorError(f'This channel does not have a {original_tab_id} tab', expected=True) - if redirect_warning: - self.to_screen(redirect_warning) - self.write_debug(f'Final URL: {url}') + # For channels such as https://www.youtube.com/channel/UCtFRv9O2AHqOZjjynzrv-xg + url = f'{pre}{post}' # YouTube sometimes provides a button to reload playlist with unavailable videos. if 'no-youtube-unavailable-videos' not in compat_opts: - data = self._reload_with_unavailable_videos(item_id, data, ytcfg) or data + data = self._reload_with_unavailable_videos(display_id, data, ytcfg) or data self._extract_and_report_alerts(data, only_once=True) - tabs = traverse_obj(data, ('contents', 'twoColumnBrowseResultsRenderer', 'tabs'), expected_type=list) - if tabs: - return self._extract_from_tabs(item_id, ytcfg, data, tabs) + tabs, entries = self._extract_tab_renderers(data), [] + if tabs: + entries = [self._extract_from_tabs(item_id, ytcfg, data, tabs)] + entries[0].update({ + 'extractor_key': YoutubeTabIE.ie_key(), + 'extractor': YoutubeTabIE.IE_NAME, + 'webpage_url': url, + }) + if self.get_param('playlist_items') == '0': + entries.extend(self.url_result(u, YoutubeTabIE) for u in extra_tabs) + else: # Users expect to get all `video_id`s even with `--flat-playlist`. So don't return `url_result` + entries.extend(map(self._real_extract, extra_tabs)) + + if len(entries) == 1: + return entries[0] + elif entries: + metadata = self._extract_metadata_from_tabs(item_id, data) + uploads_url = 'the Uploads (UU) playlist URL' + if try_get(metadata, lambda x: x['channel_id'].startswith('UC')): + uploads_url = f'https://www.youtube.com/playlist?list=UU{metadata["channel_id"][2:]}' + self.to_screen( + 'Downloading as multiple playlists, separated by tabs. ' + f'To download as a single playlist instead, pass {uploads_url}') + return self.playlist_result(entries, item_id, **metadata) + + # Inline playlist playlist = traverse_obj( data, ('contents', 'twoColumnWatchNextResults', 'playlist', 'playlist'), expected_type=dict) if playlist: @@ -5268,10 +6160,9 @@ class YoutubeTabIE(YoutubeTabBaseInfoExtractor): video_id = traverse_obj( data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) or video_id if video_id: - if mobj['tab'] != '/live': # live tab is expected to redirect to video + if tab != '/live': # live tab is expected to redirect to video self.report_warning(f'Unable to recognize playlist. 
Downloading just video {video_id}') - return self.url_result(f'https://www.youtube.com/watch?v={video_id}', - ie=YoutubeIE.ie_key(), video_id=video_id) + return self.url_result(f'https://www.youtube.com/watch?v={video_id}', YoutubeIE, video_id) raise ExtractorError('Unable to recognize tab page') @@ -5304,12 +6195,13 @@ class YoutubePlaylistIE(InfoExtractor): 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', 'description': 'md5:8fa6f52abb47a9552002fa3ddfc57fc2', 'view_count': int, - 'uploader_url': 'https://www.youtube.com/user/Wickydoo', + 'uploader_url': 'https://www.youtube.com/c/WickmanVT', 'modified_date': r're:\d{8}', 'channel_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q', 'channel': 'Wickman', 'tags': [], - 'channel_url': 'https://www.youtube.com/user/Wickydoo', + 'channel_url': 'https://www.youtube.com/c/WickmanVT', + 'availability': 'public', }, 'playlist_mincount': 29, }, { @@ -5337,11 +6229,12 @@ class YoutubePlaylistIE(InfoExtractor): 'channel': 'milan', 'channel_id': 'UCEI1-PVPcYXjB73Hfelbmaw', 'uploader_url': 'https://www.youtube.com/channel/UCEI1-PVPcYXjB73Hfelbmaw', + 'availability': 'public', }, - 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], + 'expected_warnings': [r'[Uu]navailable videos? (is|are|will be) hidden'], }, { 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', - 'playlist_mincount': 654, + 'playlist_mincount': 455, 'info_dict': { 'title': '2018 Chinese New Singles (11/6 updated)', 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', @@ -5355,6 +6248,7 @@ class YoutubePlaylistIE(InfoExtractor): 'uploader_url': 'https://www.youtube.com/c/愛低音的國王', 'channel_id': 'UC21nz3_MesPLqtDqwdvnoxA', 'modified_date': r're:\d{8}', + 'availability': 'public', }, 'expected_warnings': [r'[Uu]navailable videos (are|will be) hidden'], }, { @@ -5374,7 +6268,7 @@ class YoutubePlaylistIE(InfoExtractor): qs = parse_qs(url) if qs.get('v', [None])[0]: return False - return super(YoutubePlaylistIE, cls).suitable(url) + return super().suitable(url) def _real_extract(self, url): playlist_id = self._match_id(url) @@ -5414,6 +6308,8 @@ class YoutubeYtBeIE(InfoExtractor): 'channel_url': 'https://www.youtube.com/channel/UCEfMCQ9bs3tjvjy1s451zaw', 'availability': 'public', 'duration': 59, + 'comment_count': int, + 'channel_follower_count': int }, 'params': { 'noplaylist': True, @@ -5462,9 +6358,7 @@ class YoutubeYtUserIE(InfoExtractor): def _real_extract(self, url): user_id = self._match_id(url) - return self.url_result( - 'https://www.youtube.com/user/%s/videos' % user_id, - ie=YoutubeTabIE.ie_key(), video_id=user_id) + return self.url_result(f'https://www.youtube.com/user/{user_id}', YoutubeTabIE, user_id) class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): @@ -5486,6 +6380,97 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): ie=YoutubeTabIE.ie_key()) +class YoutubeNotificationsIE(YoutubeTabBaseInfoExtractor): + IE_NAME = 'youtube:notif' + IE_DESC = 'YouTube notifications; ":ytnotif" keyword (requires cookies)' + _VALID_URL = r':ytnotif(?:ication)?s?' 
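The new ':ytnotif' pseudo-URL pattern just above accepts four spellings; a quick standalone check of the regex as written:

import re

assert all(re.fullmatch(r':ytnotif(?:ication)?s?', s) for s in (
    ':ytnotif', ':ytnotifs', ':ytnotification', ':ytnotifications'))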
+ _LOGIN_REQUIRED = True + _TESTS = [{ + 'url': ':ytnotif', + 'only_matching': True, + }, { + 'url': ':ytnotifications', + 'only_matching': True, + }] + + def _extract_notification_menu(self, response, continuation_list): + notification_list = traverse_obj( + response, + ('actions', 0, 'openPopupAction', 'popup', 'multiPageMenuRenderer', 'sections', 0, 'multiPageMenuNotificationSectionRenderer', 'items'), + ('actions', 0, 'appendContinuationItemsAction', 'continuationItems'), + expected_type=list) or [] + continuation_list[0] = None + for item in notification_list: + entry = self._extract_notification_renderer(item.get('notificationRenderer')) + if entry: + yield entry + continuation = item.get('continuationItemRenderer') + if continuation: + continuation_list[0] = continuation + + def _extract_notification_renderer(self, notification): + video_id = traverse_obj( + notification, ('navigationEndpoint', 'watchEndpoint', 'videoId'), expected_type=str) + url = f'https://www.youtube.com/watch?v={video_id}' + channel_id = None + if not video_id: + browse_ep = traverse_obj( + notification, ('navigationEndpoint', 'browseEndpoint'), expected_type=dict) + channel_id = traverse_obj(browse_ep, 'browseId', expected_type=str) + post_id = self._search_regex( + r'/post/(.+)', traverse_obj(browse_ep, 'canonicalBaseUrl', expected_type=str), + 'post id', default=None) + if not channel_id or not post_id: + return + # The direct /post url redirects to this in the browser + url = f'https://www.youtube.com/channel/{channel_id}/community?lb={post_id}' + + channel = traverse_obj( + notification, ('contextualMenu', 'menuRenderer', 'items', 1, 'menuServiceItemRenderer', 'text', 'runs', 1, 'text'), + expected_type=str) + notification_title = self._get_text(notification, 'shortMessage') + if notification_title: + notification_title = notification_title.replace('\xad', '') # remove soft hyphens + # TODO: handle recommended videos + title = self._search_regex( + rf'{re.escape(channel or "")}[^:]+: (.+)', notification_title, + 'video title', default=None) + timestamp = (self._parse_time_text(self._get_text(notification, 'sentTimeText')) + if self._configuration_arg('approximate_date', ie_key=YoutubeTabIE) + else None) + return { + '_type': 'url', + 'url': url, + 'ie_key': (YoutubeIE if video_id else YoutubeTabIE).ie_key(), + 'video_id': video_id, + 'title': title, + 'channel_id': channel_id, + 'channel': channel, + 'thumbnails': self._extract_thumbnails(notification, 'videoThumbnail'), + 'timestamp': timestamp, + } + + def _notification_menu_entries(self, ytcfg): + continuation_list = [None] + response = None + for page in itertools.count(1): + ctoken = traverse_obj( + continuation_list, (0, 'continuationEndpoint', 'getNotificationMenuEndpoint', 'ctoken'), expected_type=str) + response = self._extract_response( + item_id=f'page {page}', query={'ctoken': ctoken} if ctoken else {}, ytcfg=ytcfg, + ep='notification/get_notification_menu', check_get_keys='actions', + headers=self.generate_api_headers(ytcfg=ytcfg, visitor_data=self._extract_visitor_data(response))) + yield from self._extract_notification_menu(response, continuation_list) + if not continuation_list[0]: + break + + def _real_extract(self, url): + display_id = 'notifications' + ytcfg = self._download_ytcfg('web', display_id) if not self.skip_webpage else {} + self._report_playlist_authcheck(ytcfg) + return self.playlist_result(self._notification_menu_entries(ytcfg), display_id, display_id) + + class YoutubeSearchIE(YoutubeTabBaseInfoExtractor, 
SearchInfoExtractor): IE_DESC = 'YouTube search' IE_NAME = 'youtube:search' @@ -5540,10 +6525,11 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): 'info_dict': { 'id': '#cats', 'title': '#cats', - 'entries': [{ - 'url': r're:https://(www\.)?youtube\.com/hashtag/cats', - 'title': '#cats', - }], + # The test suite does not have support for nested playlists + # 'entries': [{ + # 'url': r're:https://(www\.)?youtube\.com/hashtag/cats', + # 'title': '#cats', + # }], }, }, { 'url': 'https://www.youtube.com/results?q=test&sp=EgQIBBgB', @@ -5557,7 +6543,7 @@ class YoutubeSearchURLIE(YoutubeTabBaseInfoExtractor): class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor): - IE_DESC = 'YouTube music search URLs with selectable sections (Eg: #songs)' + IE_DESC = 'YouTube music search URLs with selectable sections, e.g. #songs' IE_NAME = 'youtube:music:search_url' _VALID_URL = r'https?://music\.youtube\.com/search\?([^#]+&)?(?:search_query|q)=(?:[^&]+)(?:[&#]|$)' _TESTS = [{ @@ -5601,7 +6587,7 @@ class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor): if params: section = next((k for k, v in self._SECTIONS.items() if v == params), params) else: - section = compat_urllib_parse_unquote_plus((url.split('#') + [''])[1]).lower() + section = urllib.parse.unquote_plus((url.split('#') + [''])[1]).lower() params = self._SECTIONS.get(section) if not params: section = None @@ -5612,14 +6598,17 @@ class YoutubeMusicSearchURLIE(YoutubeTabBaseInfoExtractor): class YoutubeFeedsInfoExtractor(InfoExtractor): """ Base class for feed extractors - Subclasses must define the _FEED_NAME property. + Subclasses must re-define the _FEED_NAME property. """ _LOGIN_REQUIRED = True - _TESTS = [] + _FEED_NAME = 'feeds' + + def _real_initialize(self): + YoutubeBaseInfoExtractor._check_login_required(self) - @property + @classproperty def IE_NAME(self): - return 'youtube:%s' % self._FEED_NAME + return f'youtube:{self._FEED_NAME}' def _real_extract(self, url): return self.url_result( @@ -5680,6 +6669,46 @@ class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): }] +class YoutubeStoriesIE(InfoExtractor): + IE_DESC = 'YouTube channel stories; "ytstories:" prefix' + IE_NAME = 'youtube:stories' + _VALID_URL = r'ytstories:UC(?P<id>[A-Za-z0-9_-]{21}[AQgw])$' + _TESTS = [{ + 'url': 'ytstories:UCwFCb4jeqaKWnciAYM-ZVHg', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = f'RLTD{self._match_id(url)}' + return self.url_result( + smuggle_url(f'https://www.youtube.com/playlist?list={playlist_id}&playnext=1', {'is_story': True}), + ie=YoutubeTabIE, video_id=playlist_id) + + +class YoutubeShortsAudioPivotIE(InfoExtractor): + IE_DESC = 'YouTube Shorts audio pivot (Shorts using audio of a given video)' + IE_NAME = 'youtube:shorts:pivot:audio' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/source/(?P<id>[\w-]{11})/shorts' + _TESTS = [{ + 'url': 'https://www.youtube.com/source/Lyj-MZSAA9o/shorts', + 'only_matching': True, + }] + + @staticmethod + def _generate_audio_pivot_params(video_id): + """ + Generates sfv_audio_pivot browse params for this video id + """ + pb_params = b'\xf2\x05+\n)\x12\'\n\x0b%b\x12\x0b%b\x1a\x0b%b' % ((video_id.encode(),) * 3) + return urllib.parse.quote(base64.b64encode(pb_params).decode()) + + def _real_extract(self, url): + video_id = self._match_id(url) + return self.url_result( + f'https://www.youtube.com/feed/sfv_audio_pivot?bp={self._generate_audio_pivot_params(video_id)}', + ie=YoutubeTabIE) + + class YoutubeTruncatedURLIE(InfoExtractor): IE_NAME = 'youtube:truncated_url' IE_DESC = 
False # Do not list @@ -5729,14 +6758,62 @@ class YoutubeTruncatedURLIE(InfoExtractor): expected=True) -class YoutubeClipIE(InfoExtractor): +class YoutubeClipIE(YoutubeTabBaseInfoExtractor): IE_NAME = 'youtube:clip' - IE_DESC = False # Do not list - _VALID_URL = r'https?://(?:www\.)?youtube\.com/clip/' + _VALID_URL = r'https?://(?:www\.)?youtube\.com/clip/(?P<id>[^/?#]+)' + _TESTS = [{ + # FIXME: Other metadata should be extracted from the clip, not from the base video + 'url': 'https://www.youtube.com/clip/UgytZKpehg-hEMBSn3F4AaABCQ', + 'info_dict': { + 'id': 'UgytZKpehg-hEMBSn3F4AaABCQ', + 'ext': 'mp4', + 'section_start': 29.0, + 'section_end': 39.7, + 'duration': 10.7, + 'age_limit': 0, + 'availability': 'public', + 'categories': ['Gaming'], + 'channel': 'Scott The Woz', + 'channel_id': 'UC4rqhyiTs7XyuODcECvuiiQ', + 'channel_url': 'https://www.youtube.com/channel/UC4rqhyiTs7XyuODcECvuiiQ', + 'description': 'md5:7a4517a17ea9b4bd98996399d8bb36e7', + 'like_count': int, + 'playable_in_embed': True, + 'tags': 'count:17', + 'thumbnail': 'https://i.ytimg.com/vi_webp/ScPX26pdQik/maxresdefault.webp', + 'title': 'Mobile Games on Console - Scott The Woz', + 'upload_date': '20210920', + 'uploader': 'Scott The Woz', + 'uploader_id': 'scottthewoz', + 'uploader_url': 'http://www.youtube.com/user/scottthewoz', + 'view_count': int, + 'live_status': 'not_live', + 'channel_follower_count': int + } + }] def _real_extract(self, url): - self.report_warning('YouTube clips are not currently supported. The entire video will be downloaded instead') - return self.url_result(url, 'Generic') + clip_id = self._match_id(url) + _, data = self._extract_webpage(url, clip_id) + + video_id = traverse_obj(data, ('currentVideoEndpoint', 'watchEndpoint', 'videoId')) + if not video_id: + raise ExtractorError('Unable to find video ID') + + clip_data = traverse_obj(data, ( + 'engagementPanels', ..., 'engagementPanelSectionListRenderer', 'content', 'clipSectionRenderer', + 'contents', ..., 'clipAttributionRenderer', 'onScrubExit', 'commandExecutorCommand', 'commands', ..., + 'openPopupAction', 'popup', 'notificationActionRenderer', 'actionButton', 'buttonRenderer', 'command', + 'commandExecutorCommand', 'commands', ..., 'loopCommand'), get_all=False) + + return { + '_type': 'url_transparent', + 'url': f'https://www.youtube.com/watch?v={video_id}', + 'ie_key': YoutubeIE.ie_key(), + 'id': clip_id, + 'section_start': int(clip_data['startTimeMs']) / 1000, + 'section_end': int(clip_data['endTimeMs']) / 1000, + } class YoutubeTruncatedIDIE(InfoExtractor): @@ -5752,5 +6829,5 @@ class YoutubeTruncatedIDIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) raise ExtractorError( - 'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url), + f'Incomplete YouTube ID {video_id}. 
URL {url} looks truncated.', expected=True) diff --git a/hypervideo_dl/extractor/zapiks.py b/hypervideo_dl/extractor/zapiks.py index 161b011..88f526b 100644 --- a/hypervideo_dl/extractor/zapiks.py +++ b/hypervideo_dl/extractor/zapiks.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -15,6 +12,7 @@ from ..utils import ( class ZapiksIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?zapiks\.(?:fr|com)/(?:(?:[a-z]{2}/)?(?P<display_id>.+?)\.html|index\.php\?.*\bmedia_id=(?P<id>\d+))' + _EMBED_REGEX = [r'<iframe[^>]+src="(?P<url>https?://(?:www\.)?zapiks\.fr/index\.php\?.+?)"'] _TESTS = [ { 'url': 'http://www.zapiks.fr/ep2s3-bon-appetit-eh-be-viva.html', @@ -94,7 +92,6 @@ class ZapiksIE(InfoExtractor): if m: f['height'] = int(m.group('height')) formats.append(f) - self._sort_formats(formats) return { 'id': video_id, diff --git a/hypervideo_dl/extractor/zaq1.py b/hypervideo_dl/extractor/zaq1.py deleted file mode 100644 index 889aff5..0000000 --- a/hypervideo_dl/extractor/zaq1.py +++ /dev/null @@ -1,101 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_timestamp, -) - - -class Zaq1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)' - _TESTS = [{ - 'url': 'http://zaq1.pl/video/xev0e', - 'md5': '24a5eb3f052e604ae597c4d0d19b351e', - 'info_dict': { - 'id': 'xev0e', - 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', - 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', - 'ext': 'mp4', - 'duration': 511, - 'timestamp': 1490896361, - 'uploader': 'Anonim', - 'upload_date': '20170330', - 'view_count': int, - } - }, { - # malformed JSON-LD - 'url': 'http://zaq1.pl/video/x81vn', - 'info_dict': { - 'id': 'x81vn', - 'title': 'SEKRETNE ŻYCIE WALTERA MITTY', - 'ext': 'mp4', - 'duration': 6234, - 'timestamp': 1493494860, - 'uploader': 'Anonim', - 'upload_date': '20170429', - 'view_count': int, - }, - 'params': { - 'skip_download': True, - }, - 'expected_warnings': ['Failed to parse JSON'], - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - video_url = self._search_regex( - r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, - 'video url', group='url') - - info = self._search_json_ld(webpage, video_id, fatal=False) - - def extract_data(field, name, fatal=False): - return self._search_regex( - r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field, - webpage, field, fatal=fatal, group='field') - - if not info.get('title'): - info['title'] = extract_data('file-name', 'title', fatal=True) - - if not info.get('duration'): - info['duration'] = int_or_none(extract_data('duration', 'duration')) - - if not info.get('thumbnail'): - info['thumbnail'] = extract_data('photo-url', 'thumbnail') - - if not info.get('timestamp'): - info['timestamp'] = unified_timestamp(self._html_search_meta( - 'uploadDate', webpage, 'timestamp')) - - if not info.get('interactionCount'): - info['view_count'] = int_or_none(self._html_search_meta( - 'interactionCount', webpage, 'view count')) - - uploader = self._html_search_regex( - r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader', - fatal=False) - - width = int_or_none(self._html_search_meta( - 'width', webpage, fatal=False)) - height = int_or_none(self._html_search_meta( - 'height', webpage, fatal=False)) - - 
info.update({ - 'id': video_id, - 'formats': [{ - 'url': video_url, - 'width': width, - 'height': height, - 'http_headers': { - 'Referer': url, - }, - }], - 'uploader': uploader, - }) - - return info diff --git a/hypervideo_dl/extractor/zattoo.py b/hypervideo_dl/extractor/zattoo.py index c02b4ca..22620c0 100644 --- a/hypervideo_dl/extractor/zattoo.py +++ b/hypervideo_dl/extractor/zattoo.py @@ -1,14 +1,8 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from uuid import uuid4 from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_str, -) +from ..compat import compat_HTTPError, compat_str from ..utils import ( ExtractorError, int_or_none, @@ -51,25 +45,30 @@ class ZattooPlatformBaseIE(InfoExtractor): self._power_guide_hash = data['session']['power_guide_hash'] def _initialize_pre_login(self): - webpage = self._download_webpage( - self._host_url(), None, 'Downloading app token') - app_token = self._html_search_regex( - r'appToken\s*=\s*(["\'])(?P<token>(?:(?!\1).)+?)\1', - webpage, 'app token', group='token') - app_version = self._html_search_regex( - r'<!--\w+-(.+?)-', webpage, 'app version', default='2.8.2') + session_token = self._download_json( + f'{self._host_url()}/token.json', None, 'Downloading session token')['session_token'] # Will setup appropriate cookies self._request_webpage( - '%s/zapi/v2/session/hello' % self._host_url(), None, + '%s/zapi/v3/session/hello' % self._host_url(), None, 'Opening session', data=urlencode_postdata({ - 'client_app_token': app_token, 'uuid': compat_str(uuid4()), 'lang': 'en', - 'app_version': app_version, + 'app_version': '1.8.2', 'format': 'json', + 'client_app_token': session_token, })) + def _extract_video_id_from_recording(self, recid): + playlist = self._download_json( + f'{self._host_url()}/zapi/v2/playlist', recid, 'Downloading playlist') + try: + return next( + str(item['program_id']) for item in playlist['recordings'] + if item.get('program_id') and str(item.get('id')) == recid) + except (StopIteration, KeyError): + raise ExtractorError('Could not extract video id from recording') + def _extract_cid(self, video_id, channel_name): channel_groups = self._download_json( '%s/zapi/v2/cached/channels/%s' % (self._host_url(), @@ -118,7 +117,26 @@ class ZattooPlatformBaseIE(InfoExtractor): return cid, info_dict - def _extract_formats(self, cid, video_id, record_id=None, is_live=False): + def _extract_ondemand_info(self, ondemand_id): + """ + @returns (ondemand_token, ondemand_type, info_dict) + """ + data = self._download_json( + '%s/zapi/vod/movies/%s' % (self._host_url(), ondemand_id), + ondemand_id, 'Downloading ondemand information') + info_dict = { + 'id': ondemand_id, + 'title': data.get('title'), + 'description': data.get('description'), + 'duration': int_or_none(data.get('duration')), + 'release_year': int_or_none(data.get('year')), + 'episode_number': int_or_none(data.get('episode_number')), + 'season_number': int_or_none(data.get('season_number')), + 'categories': try_get(data, lambda x: x['categories'], list), + } + return data['terms_catalog'][0]['terms'][0]['token'], data['type'], info_dict + + def _extract_formats(self, cid, video_id, record_id=None, ondemand_id=None, ondemand_termtoken=None, ondemand_type=None, is_live=False): postdata_common = { 'https_watch_urls': True, } @@ -128,11 +146,18 @@ class ZattooPlatformBaseIE(InfoExtractor): url = '%s/zapi/watch/live/%s' % (self._host_url(), cid) elif record_id: url = '%s/zapi/watch/recording/%s' % (self._host_url(), record_id) + elif 
ondemand_id: + postdata_common.update({ + 'teasable_id': ondemand_id, + 'term_token': ondemand_termtoken, + 'teasable_type': ondemand_type + }) + url = '%s/zapi/watch/vod/video' % self._host_url() else: - url = '%s/zapi/watch/recall/%s/%s' % (self._host_url(), cid, video_id) - + url = '%s/zapi/v3/watch/replay/%s/%s' % (self._host_url(), cid, video_id) formats = [] - for stream_type in ('dash', 'hls', 'hls5', 'hds'): + subtitles = {} + for stream_type in ('dash', 'hls7'): postdata = postdata_common.copy() postdata['stream_type'] = stream_type @@ -156,14 +181,16 @@ class ZattooPlatformBaseIE(InfoExtractor): audio_channel = watch.get('audio_channel') preference = 1 if audio_channel == 'A' else None format_id = join_nonempty(stream_type, watch.get('maxrate'), audio_channel) - if stream_type in ('dash', 'dash_widevine', 'dash_playready'): - this_formats = self._extract_mpd_formats( + if stream_type.startswith('dash'): + this_formats, subs = self._extract_mpd_formats_and_subtitles( watch_url, video_id, mpd_id=format_id, fatal=False) - elif stream_type in ('hls', 'hls5', 'hls5_fairplay'): - this_formats = self._extract_m3u8_formats( + self._merge_subtitles(subs, target=subtitles) + elif stream_type.startswith('hls'): + this_formats, subs = self._extract_m3u8_formats_and_subtitles( watch_url, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id=format_id, fatal=False) + self._merge_subtitles(subs, target=subtitles) elif stream_type == 'hds': this_formats = self._extract_f4m_formats( watch_url, video_id, f4m_id=format_id, fatal=False) @@ -175,58 +202,48 @@ class ZattooPlatformBaseIE(InfoExtractor): for this_format in this_formats: this_format['quality'] = preference formats.extend(this_formats) - self._sort_formats(formats) - return formats + return formats, subtitles - def _extract_video(self, channel_name, video_id, record_id=None, is_live=False): - if is_live: - cid = self._extract_cid(video_id, channel_name) - info_dict = { - 'id': channel_name, - 'title': channel_name, - 'is_live': True, - } - else: - cid, info_dict = self._extract_cid_and_video_info(video_id) - formats = self._extract_formats( - cid, video_id, record_id=record_id, is_live=is_live) - info_dict['formats'] = formats + def _extract_video(self, video_id, record_id=None): + cid, info_dict = self._extract_cid_and_video_info(video_id) + info_dict['formats'], info_dict['subtitles'] = self._extract_formats(cid, video_id, record_id=record_id) return info_dict + def _extract_live(self, channel_name): + cid = self._extract_cid(channel_name, channel_name) + formats, subtitles = self._extract_formats(cid, cid, is_live=True) + return { + 'id': channel_name, + 'title': channel_name, + 'is_live': True, + 'formats': formats, + 'subtitles': subtitles + } -class QuicklineBaseIE(ZattooPlatformBaseIE): - _NETRC_MACHINE = 'quickline' - _HOST = 'mobiltv.quickline.com' - - -class QuicklineIE(QuicklineBaseIE): - _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+)/(?P<id>[0-9]+)' % re.escape(QuicklineBaseIE._HOST) + def _extract_record(self, record_id): + video_id = self._extract_video_id_from_recording(record_id) + cid, info_dict = self._extract_cid_and_video_info(video_id) + info_dict['formats'], info_dict['subtitles'] = self._extract_formats(cid, video_id, record_id=record_id) + return info_dict - _TEST = { - 'url': 'https://mobiltv.quickline.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', - 'only_matching': True, - } + def _extract_ondemand(self, ondemand_id): + ondemand_termtoken, ondemand_type, 
info_dict = self._extract_ondemand_info(ondemand_id) + info_dict['formats'], info_dict['subtitles'] = self._extract_formats( + None, ondemand_id, ondemand_id=ondemand_id, + ondemand_termtoken=ondemand_termtoken, ondemand_type=ondemand_type) + return info_dict def _real_extract(self, url): - channel_name, video_id = self._match_valid_url(url).groups() - return self._extract_video(channel_name, video_id) - + video_id, record_id = self._match_valid_url(url).groups() + return getattr(self, f'_extract_{self._TYPE}')(video_id or record_id) -class QuicklineLiveIE(QuicklineBaseIE): - _VALID_URL = r'https?://(?:www\.)?%s/watch/(?P<id>[^/]+)' % re.escape(QuicklineBaseIE._HOST) - _TEST = { - 'url': 'https://mobiltv.quickline.com/watch/srf1', - 'only_matching': True, - } - - @classmethod - def suitable(cls, url): - return False if QuicklineIE.suitable(url) else super(QuicklineLiveIE, cls).suitable(url) - - def _real_extract(self, url): - channel_name = video_id = self._match_id(url) - return self._extract_video(channel_name, video_id, is_live=True) +def _create_valid_url(host, match, qs, base_re=None): + match_base = fr'|{base_re}/(?P<vid1>{match})' if base_re else '(?P<vid1>)' + return rf'''(?x)https?://(?:www\.)?{re.escape(host)}/(?: + [^?#]+\?(?:[^#]+&)?{qs}=(?P<vid2>{match}) + {match_base} + )''' class ZattooBaseIE(ZattooPlatformBaseIE): @@ -234,191 +251,614 @@ class ZattooBaseIE(ZattooPlatformBaseIE): _HOST = 'zattoo.com' -def _make_valid_url(tmpl, host): - return tmpl % re.escape(host) - - class ZattooIE(ZattooBaseIE): - _VALID_URL_TEMPLATE = r'https?://(?:www\.)?%s/watch/(?P<channel>[^/]+?)/(?P<id>[0-9]+)[^/]+(?:/(?P<recid>[0-9]+))?' - _VALID_URL = _make_valid_url(_VALID_URL_TEMPLATE, ZattooBaseIE._HOST) - - # Since regular videos are only available for 7 days and recorded videos - # are only available for a specific user, we cannot have detailed tests. 
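+ # Editorial annotation, not part of the upstream patch: _create_valid_url()
+ # (defined above) folds the query-string form and, when base_re is given, the
+ # path form of a page URL into one verbose regex, and _real_extract() dispatches
+ # on the class-level _TYPE ('video', 'live', 'ondemand', 'record') via
+ # getattr(self, f'_extract_{self._TYPE}'). Hand-expanded for the call below,
+ # so treat as illustrative only:
+ #
+ #   (?x)https?://(?:www\.)?zattoo\.com/(?:
+ #       [^?#]+\?(?:[^#]+&)?program=(?P<vid2>\d+)    # e.g. /guide/german?channel=srf1&program=169860555
+ #       |(?:program|watch)/[^/]+/(?P<vid1>\d+)      # e.g. /program/zdf/250170418
+ #   )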
+ _VALID_URL = _create_valid_url(ZattooBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://zattoo.com/watch/prosieben/130671867-maze-runner-die-auserwaehlten-in-der-brandwueste', + 'url': 'https://zattoo.com/program/zdf/250170418', + 'info_dict': { + 'id': '250170418', + 'ext': 'mp4', + 'title': 'Markus Lanz', + 'description': 'md5:e41cb1257de008ca62a73bb876ffa7fc', + 'thumbnail': 're:http://images.zattic.com/cms/.+/format_480x360.jpg', + 'creator': 'ZDF HD', + 'release_year': 2022, + 'episode': 'Folge 1655', + 'categories': 'count:1', + 'tags': 'count:2' + }, + 'params': {'skip_download': 'm3u8'} + }, { + 'url': 'https://zattoo.com/program/daserste/210177916', 'only_matching': True, }, { - 'url': 'https://zattoo.com/watch/srf_zwei/132905652-eishockey-spengler-cup/102791477/1512211800000/1514433500000/92000', + 'url': 'https://zattoo.com/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] - def _real_extract(self, url): - channel_name, video_id, record_id = self._match_valid_url(url).groups() - return self._extract_video(channel_name, video_id, record_id) - class ZattooLiveIE(ZattooBaseIE): - _VALID_URL = r'https?://(?:www\.)?zattoo\.com/watch/(?P<id>[^/]+)' - - _TEST = { - 'url': 'https://zattoo.com/watch/srf1', + _VALID_URL = _create_valid_url(ZattooBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://zattoo.com/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://zattoo.com/live/srf1', 'only_matching': True, - } + }] @classmethod def suitable(cls, url): - return False if ZattooIE.suitable(url) else super(ZattooLiveIE, cls).suitable(url) + return False if ZattooIE.suitable(url) else super().suitable(url) + + +class ZattooMoviesIE(ZattooBaseIE): + _VALID_URL = _create_valid_url(ZattooBaseIE._HOST, r'\w+', 'movie_id', 'vod/movies') + _TYPE = 'ondemand' + _TESTS = [{ + 'url': 'https://zattoo.com/vod/movies/7521', + 'only_matching': True, + }, { + 'url': 'https://zattoo.com/ondemand?movie_id=7521&term_token=9f00f43183269484edde', + 'only_matching': True, + }] - def _real_extract(self, url): - channel_name = video_id = self._match_id(url) - return self._extract_video(channel_name, video_id, is_live=True) +class ZattooRecordingsIE(ZattooBaseIE): + _VALID_URL = _create_valid_url('zattoo.com', r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://zattoo.com/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://zattoo.com/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] -class NetPlusIE(ZattooIE): + +class NetPlusTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'netplus' _HOST = 'netplus.tv' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class NetPlusTVIE(NetPlusTVBaseIE): + _VALID_URL = _create_valid_url(NetPlusTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' + _TESTS = [{ + 'url': 'https://netplus.tv/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://netplus.tv/guide/german?channel=srf1&program=169860555', + 'only_matching': True, + }] + + +class NetPlusTVLiveIE(NetPlusTVBaseIE): + _VALID_URL = _create_valid_url(NetPlusTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://netplus.tv/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://netplus.tv/live/srf1', + 'only_matching': True, 
+ }] + + @classmethod + def suitable(cls, url): + return False if NetPlusTVIE.suitable(url) else super().suitable(url) + + +class NetPlusTVRecordingsIE(NetPlusTVBaseIE): + _VALID_URL = _create_valid_url(NetPlusTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' _TESTS = [{ - 'url': 'https://www.netplus.tv/watch/abc/123-abc', + 'url': 'https://netplus.tv/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://netplus.tv/tc/ptc_recordings_all_recordings?recording=193615420', 'only_matching': True, }] -class MNetTVIE(ZattooIE): +class MNetTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'mnettv' _HOST = 'tvplus.m-net.de' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class MNetTVIE(MNetTVBaseIE): + _VALID_URL = _create_valid_url(MNetTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://tvplus.m-net.de/watch/abc/123-abc', + 'url': 'https://tvplus.m-net.de/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://tvplus.m-net.de/guide/german?channel=srf1&program=169860555', + 'only_matching': True, + }] + + +class MNetTVLiveIE(MNetTVBaseIE): + _VALID_URL = _create_valid_url(MNetTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://tvplus.m-net.de/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://tvplus.m-net.de/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MNetTVIE.suitable(url) else super().suitable(url) + + +class MNetTVRecordingsIE(MNetTVBaseIE): + _VALID_URL = _create_valid_url(MNetTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://tvplus.m-net.de/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://tvplus.m-net.de/tc/ptc_recordings_all_recordings?recording=193615420', 'only_matching': True, }] -class WalyTVIE(ZattooIE): +class WalyTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'walytv' _HOST = 'player.waly.tv' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class WalyTVIE(WalyTVBaseIE): + _VALID_URL = _create_valid_url(WalyTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' + _TESTS = [{ + 'url': 'https://player.waly.tv/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://player.waly.tv/guide/german?channel=srf1&program=169860555', + 'only_matching': True, + }] + + +class WalyTVLiveIE(WalyTVBaseIE): + _VALID_URL = _create_valid_url(WalyTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' _TESTS = [{ - 'url': 'https://player.waly.tv/watch/abc/123-abc', + 'url': 'https://player.waly.tv/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://player.waly.tv/live/srf1', 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if WalyTVIE.suitable(url) else super().suitable(url) + -class BBVTVIE(ZattooIE): +class WalyTVRecordingsIE(WalyTVBaseIE): + _VALID_URL = _create_valid_url(WalyTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://player.waly.tv/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://player.waly.tv/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class BBVTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'bbvtv' _HOST = 'bbv-tv.net' _API_HOST = 'www.%s' % _HOST - _VALID_URL = 
_make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class BBVTVIE(BBVTVBaseIE): + _VALID_URL = _create_valid_url(BBVTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' + _TESTS = [{ + 'url': 'https://bbv-tv.net/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://bbv-tv.net/guide/german?channel=srf1&program=169860555', + 'only_matching': True, + }] + + +class BBVTVLiveIE(BBVTVBaseIE): + _VALID_URL = _create_valid_url(BBVTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' _TESTS = [{ - 'url': 'https://www.bbv-tv.net/watch/abc/123-abc', + 'url': 'https://bbv-tv.net/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://bbv-tv.net/live/srf1', 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if BBVTVIE.suitable(url) else super().suitable(url) + -class VTXTVIE(ZattooIE): +class BBVTVRecordingsIE(BBVTVBaseIE): + _VALID_URL = _create_valid_url(BBVTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://bbv-tv.net/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://bbv-tv.net/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class VTXTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'vtxtv' _HOST = 'vtxtv.ch' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class VTXTVIE(VTXTVBaseIE): + _VALID_URL = _create_valid_url(VTXTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://www.vtxtv.ch/watch/abc/123-abc', + 'url': 'https://vtxtv.ch/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://vtxtv.ch/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class MyVisionTVIE(ZattooIE): - _NETRC_MACHINE = 'myvisiontv' - _HOST = 'myvisiontv.ch' - _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) +class VTXTVLiveIE(VTXTVBaseIE): + _VALID_URL = _create_valid_url(VTXTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://vtxtv.ch/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://vtxtv.ch/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if VTXTVIE.suitable(url) else super().suitable(url) + +class VTXTVRecordingsIE(VTXTVBaseIE): + _VALID_URL = _create_valid_url(VTXTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' _TESTS = [{ - 'url': 'https://www.myvisiontv.ch/watch/abc/123-abc', + 'url': 'https://vtxtv.ch/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://vtxtv.ch/tc/ptc_recordings_all_recordings?recording=193615420', 'only_matching': True, }] -class GlattvisionTVIE(ZattooIE): +class GlattvisionTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'glattvisiontv' _HOST = 'iptv.glattvision.ch' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class GlattvisionTVIE(GlattvisionTVBaseIE): + _VALID_URL = _create_valid_url(GlattvisionTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' + _TESTS = [{ + 'url': 'https://iptv.glattvision.ch/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://iptv.glattvision.ch/guide/german?channel=srf1&program=169860555', + 'only_matching': True, + }] + + +class GlattvisionTVLiveIE(GlattvisionTVBaseIE): + _VALID_URL = 
_create_valid_url(GlattvisionTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://iptv.glattvision.ch/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://iptv.glattvision.ch/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if GlattvisionTVIE.suitable(url) else super().suitable(url) + + +class GlattvisionTVRecordingsIE(GlattvisionTVBaseIE): + _VALID_URL = _create_valid_url(GlattvisionTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' _TESTS = [{ - 'url': 'https://iptv.glattvision.ch/watch/abc/123-abc', + 'url': 'https://iptv.glattvision.ch/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://iptv.glattvision.ch/tc/ptc_recordings_all_recordings?recording=193615420', 'only_matching': True, }] -class SAKTVIE(ZattooIE): +class SAKTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'saktv' _HOST = 'saktv.ch' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class SAKTVIE(SAKTVBaseIE): + _VALID_URL = _create_valid_url(SAKTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' + _TESTS = [{ + 'url': 'https://saktv.ch/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://saktv.ch/guide/german?channel=srf1&program=169860555', + 'only_matching': True, + }] + + +class SAKTVLiveIE(SAKTVBaseIE): + _VALID_URL = _create_valid_url(SAKTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://saktv.ch/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://saktv.ch/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SAKTVIE.suitable(url) else super().suitable(url) + + +class SAKTVRecordingsIE(SAKTVBaseIE): + _VALID_URL = _create_valid_url(SAKTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' _TESTS = [{ - 'url': 'https://www.saktv.ch/watch/abc/123-abc', + 'url': 'https://saktv.ch/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://saktv.ch/tc/ptc_recordings_all_recordings?recording=193615420', 'only_matching': True, }] -class EWETVIE(ZattooIE): +class EWETVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'ewetv' _HOST = 'tvonline.ewe.de' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class EWETVIE(EWETVBaseIE): + _VALID_URL = _create_valid_url(EWETVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' + _TESTS = [{ + 'url': 'https://tvonline.ewe.de/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://tvonline.ewe.de/guide/german?channel=srf1&program=169860555', + 'only_matching': True, + }] + + +class EWETVLiveIE(EWETVBaseIE): + _VALID_URL = _create_valid_url(EWETVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://tvonline.ewe.de/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://tvonline.ewe.de/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if EWETVIE.suitable(url) else super().suitable(url) + + +class EWETVRecordingsIE(EWETVBaseIE): + _VALID_URL = _create_valid_url(EWETVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' _TESTS = [{ - 'url': 'https://tvonline.ewe.de/watch/abc/123-abc', + 'url': 'https://tvonline.ewe.de/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 
'https://tvonline.ewe.de/tc/ptc_recordings_all_recordings?recording=193615420', 'only_matching': True, }] -class QuantumTVIE(ZattooIE): +class QuantumTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'quantumtv' _HOST = 'quantum-tv.com' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class QuantumTVIE(QuantumTVBaseIE): + _VALID_URL = _create_valid_url(QuantumTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://www.quantum-tv.com/watch/abc/123-abc', + 'url': 'https://quantum-tv.com/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://quantum-tv.com/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class OsnatelTVIE(ZattooIE): +class QuantumTVLiveIE(QuantumTVBaseIE): + _VALID_URL = _create_valid_url(QuantumTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://quantum-tv.com/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://quantum-tv.com/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if QuantumTVIE.suitable(url) else super().suitable(url) + + +class QuantumTVRecordingsIE(QuantumTVBaseIE): + _VALID_URL = _create_valid_url(QuantumTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://quantum-tv.com/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://quantum-tv.com/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class OsnatelTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'osnateltv' _HOST = 'tvonline.osnatel.de' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class OsnatelTVIE(OsnatelTVBaseIE): + _VALID_URL = _create_valid_url(OsnatelTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' + _TESTS = [{ + 'url': 'https://tvonline.osnatel.de/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://tvonline.osnatel.de/guide/german?channel=srf1&program=169860555', + 'only_matching': True, + }] + + +class OsnatelTVLiveIE(OsnatelTVBaseIE): + _VALID_URL = _create_valid_url(OsnatelTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://tvonline.osnatel.de/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://tvonline.osnatel.de/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if OsnatelTVIE.suitable(url) else super().suitable(url) + + +class OsnatelTVRecordingsIE(OsnatelTVBaseIE): + _VALID_URL = _create_valid_url(OsnatelTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' _TESTS = [{ - 'url': 'https://tvonline.osnatel.de/watch/abc/123-abc', + 'url': 'https://tvonline.osnatel.de/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://tvonline.osnatel.de/tc/ptc_recordings_all_recordings?recording=193615420', 'only_matching': True, }] -class EinsUndEinsTVIE(ZattooIE): +class EinsUndEinsTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = '1und1tv' _HOST = '1und1.tv' _API_HOST = 'www.%s' % _HOST - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class EinsUndEinsTVIE(EinsUndEinsTVBaseIE): + _VALID_URL = _create_valid_url(EinsUndEinsTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://www.1und1.tv/watch/abc/123-abc', + 'url': 
'https://1und1.tv/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://1und1.tv/guide/german?channel=srf1&program=169860555', 'only_matching': True, }] -class SaltTVIE(ZattooIE): +class EinsUndEinsTVLiveIE(EinsUndEinsTVBaseIE): + _VALID_URL = _create_valid_url(EinsUndEinsTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://1und1.tv/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://1und1.tv/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if EinsUndEinsTVIE.suitable(url) else super().suitable(url) + + +class EinsUndEinsTVRecordingsIE(EinsUndEinsTVBaseIE): + _VALID_URL = _create_valid_url(EinsUndEinsTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://1und1.tv/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://1und1.tv/tc/ptc_recordings_all_recordings?recording=193615420', + 'only_matching': True, + }] + + +class SaltTVBaseIE(ZattooPlatformBaseIE): _NETRC_MACHINE = 'salttv' _HOST = 'tv.salt.ch' - _VALID_URL = _make_valid_url(ZattooIE._VALID_URL_TEMPLATE, _HOST) + +class SaltTVIE(SaltTVBaseIE): + _VALID_URL = _create_valid_url(SaltTVBaseIE._HOST, r'\d+', 'program', '(?:program|watch)/[^/]+') + _TYPE = 'video' _TESTS = [{ - 'url': 'https://tv.salt.ch/watch/abc/123-abc', + 'url': 'https://tv.salt.ch/program/daserste/210177916', + 'only_matching': True, + }, { + 'url': 'https://tv.salt.ch/guide/german?channel=srf1&program=169860555', + 'only_matching': True, + }] + + +class SaltTVLiveIE(SaltTVBaseIE): + _VALID_URL = _create_valid_url(SaltTVBaseIE._HOST, r'[^/?&#]+', 'channel', 'live') + _TYPE = 'live' + _TESTS = [{ + 'url': 'https://tv.salt.ch/channels/german?channel=srf_zwei', + 'only_matching': True, + }, { + 'url': 'https://tv.salt.ch/live/srf1', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if SaltTVIE.suitable(url) else super().suitable(url) + + +class SaltTVRecordingsIE(SaltTVBaseIE): + _VALID_URL = _create_valid_url(SaltTVBaseIE._HOST, r'\d+', 'recording') + _TYPE = 'record' + _TESTS = [{ + 'url': 'https://tv.salt.ch/recordings?recording=193615508', + 'only_matching': True, + }, { + 'url': 'https://tv.salt.ch/tc/ptc_recordings_all_recordings?recording=193615420', 'only_matching': True, }] diff --git a/hypervideo_dl/extractor/zdf.py b/hypervideo_dl/extractor/zdf.py index 5f4d266..fca426a 100644 --- a/hypervideo_dl/extractor/zdf.py +++ b/hypervideo_dl/extractor/zdf.py @@ -1,18 +1,16 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor from ..compat import compat_str from ..utils import ( + NO_DEFAULT, + ExtractorError, determine_ext, + extract_attributes, float_or_none, int_or_none, join_nonempty, merge_dicts, - NO_DEFAULT, - orderedSet, parse_codecs, qualities, traverse_obj, @@ -72,6 +70,7 @@ class ZDFBaseIE(InfoExtractor): f.update({ 'url': format_url, 'format_id': join_nonempty('http', meta.get('type'), meta.get('quality')), + 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)) }) new_formats = [f] formats.extend(merge_dicts(f, { @@ -111,7 +110,6 @@ class ZDFBaseIE(InfoExtractor): 'class': track.get('class'), 'language': track.get('language'), }) - self._sort_formats(formats, ('hasaud', 'res', 'quality', 'language_preference')) duration = float_or_none(try_get( ptmd, lambda x: x['attributes']['duration']['value']), scale=1000) @@ -122,6 
+120,7 @@ class ZDFBaseIE(InfoExtractor): 'duration': duration, 'formats': formats, 'subtitles': self._extract_subtitles(ptmd), + '_format_sort_fields': ('tbr', 'res', 'quality', 'language_preference'), } def _extract_player(self, webpage, video_id, fatal=True): @@ -190,7 +189,7 @@ class ZDFIE(ZDFBaseIE): }, }, { 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', - 'md5': '3d6f1049e9682178a11c54b91f3dd065', + 'md5': '1b93bdec7d02fc0b703c5e7687461628', 'info_dict': { 'ext': 'mp4', 'id': 'video_funk_1770473', @@ -233,23 +232,34 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1641355200, 'upload_date': '20220105', }, + 'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"' + }, { + 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', + 'info_dict': { + 'id': '191205_1800_sendung_sok8', + 'ext': 'mp4', + 'title': 'Das Geld anderer Leute', + 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', + 'duration': 2581.0, + 'timestamp': 1654790700, + 'upload_date': '20220609', + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350', + }, }] def _extract_entry(self, url, player, content, video_id): title = content.get('title') or content['teaserHeadline'] t = content['mainVideoContent']['http://zdf.de/rels/target'] - - ptmd_path = t.get('http://zdf.de/rels/streams/ptmd') - + ptmd_path = traverse_obj(t, ( + (('streams', 'default'), None), + ('http://zdf.de/rels/streams/ptmd', 'http://zdf.de/rels/streams/ptmd-template') + ), get_all=False) if not ptmd_path: - ptmd_path = traverse_obj( - t, ('streams', 'default', 'http://zdf.de/rels/streams/ptmd-template'), - 'http://zdf.de/rels/streams/ptmd-template').replace( - '{playerId}', 'ngplayer_2_4') + raise ExtractorError('Could not extract ptmd_path') info = self._extract_ptmd( - urljoin(url, ptmd_path), video_id, player['apiToken'], url) + urljoin(url, ptmd_path.replace('{playerId}', 'ngplayer_2_4')), video_id, player['apiToken'], url) thumbnails = [] layouts = try_get( @@ -298,16 +308,16 @@ class ZDFIE(ZDFBaseIE): 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, video_id) - document = video['document'] - - title = document['titel'] - content_id = document['basename'] - formats = [] - format_urls = set() - for f in document['formitaeten']: - self._extract_format(content_id, formats, format_urls, f) - self._sort_formats(formats) + formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) + document = formitaeten and video['document'] + if formitaeten: + title = document['titel'] + content_id = document['basename'] + + format_urls = set() + for f in formitaeten or []: + self._extract_format(content_id, formats, format_urls, f) thumbnails = [] teaser_bild = document.get('teaserBild') @@ -353,9 +363,9 @@ class ZDFChannelIE(ZDFBaseIE): 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { 'id': 'das-aktuelle-sportstudio', - 'title': 'das aktuelle sportstudio | ZDF', + 'title': 'das aktuelle sportstudio', }, - 'playlist_mincount': 23, + 'playlist_mincount': 18, }, { 'url': 'https://www.zdf.de/dokumentation/planet-e', 'info_dict': { @@ -364,6 +374,14 @@ class ZDFChannelIE(ZDFBaseIE): }, 'playlist_mincount': 50, }, { + 'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest', + 'info_dict': { + 'id': 'aktenzeichen-xy-ungeloest', + 'title': 'Aktenzeichen XY... 
ungelöst', + 'entries': "lambda x: not any('xy580-fall1-kindermoerder-gesucht-100' in e['url'] for e in x)", + }, + 'playlist_mincount': 2, + }, { 'url': 'https://www.zdf.de/filme/taunuskrimi/', 'only_matching': True, }] @@ -372,60 +390,36 @@ class ZDFChannelIE(ZDFBaseIE): def suitable(cls, url): return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url) + def _og_search_title(self, webpage, fatal=False): + title = super(ZDFChannelIE, self)._og_search_title(webpage, fatal=fatal) + return re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', title or '')[0] or None + def _real_extract(self, url): channel_id = self._match_id(url) webpage = self._download_webpage(url, channel_id) - entries = [ - self.url_result(item_url, ie=ZDFIE.ie_key()) - for item_url in orderedSet(re.findall( - r'data-plusbar-url=["\'](http.+?\.html)', webpage))] - - return self.playlist_result( - entries, channel_id, self._og_search_title(webpage, fatal=False)) - - r""" - player = self._extract_player(webpage, channel_id) - - channel_id = self._search_regex( - r'docId\s*:\s*(["\'])(?P<id>(?!\1).+?)\1', webpage, - 'channel id', group='id') - - channel = self._call_api( - 'https://api.zdf.de/content/documents/%s.json' % channel_id, - player, url, channel_id) - - items = [] - for module in channel['module']: - for teaser in try_get(module, lambda x: x['teaser'], list) or []: - t = try_get( - teaser, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - items.extend(try_get( - t, - lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - items.extend(try_get( - module, - lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - - entries = [] - entry_urls = set() - for item in items: - t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - sharing_url = t.get('http://zdf.de/rels/sharing-url') - if not sharing_url or not isinstance(sharing_url, compat_str): - continue - if sharing_url in entry_urls: - continue - entry_urls.add(sharing_url) - entries.append(self.url_result( - sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id'))) - - return self.playlist_result(entries, channel_id, channel.get('title')) - """ + matches = re.finditer( + r'''<div\b[^>]*?\sdata-plusbar-id\s*=\s*(["'])(?P<p_id>[\w-]+)\1[^>]*?\sdata-plusbar-url=\1(?P<url>%s)\1''' % ZDFIE._VALID_URL, + webpage) + + if self._downloader.params.get('noplaylist', False): + entry = next( + (self.url_result(m.group('url'), ie=ZDFIE.ie_key()) for m in matches), + None) + self.to_screen('Downloading just the main video because of --no-playlist') + if entry: + return entry + else: + self.to_screen('Downloading playlist %s - add --no-playlist to download just the main video' % (channel_id, )) + + def check_video(m): + v_ref = self._search_regex( + r'''(<a\b[^>]*?\shref\s*=[^>]+?\sdata-target-id\s*=\s*(["'])%s\2[^>]*>)''' % (m.group('p_id'), ), + webpage, 'check id', default='') + v_ref = extract_attributes(v_ref) + return v_ref.get('data-target-video-type') != 'novideo' + + return self.playlist_from_matches( + (m.group('url') for m in matches if check_video(m)), + channel_id, self._og_search_title(webpage, fatal=False)) diff --git a/hypervideo_dl/extractor/zee5.py b/hypervideo_dl/extractor/zee5.py index 3e3f11b..a64eb9e 100644 --- a/hypervideo_dl/extractor/zee5.py +++ b/hypervideo_dl/extractor/zee5.py @@ -1,7 +1,6 @@ -# coding: utf-8 -from __future__ import unicode_literals - import json +import random +import string from .common import 
InfoExtractor from ..compat import compat_str @@ -24,25 +23,25 @@ class Zee5IE(InfoExtractor): https?://(?:www\.)?zee5\.com/(?:[^#?]+/)? (?: (?:tv-shows|kids|web-series|zee5originals)(?:/[^#/?]+){3} - |movies/[^#/?]+ + |(?:movies|kids|videos|news|music-videos)/(?!kids-shows)[^#/?]+ )/(?P<display_id>[^#/?]+)/ ) (?P<id>[^#/?]+)/?(?:$|[?#]) ''' _TESTS = [{ - 'url': 'https://www.zee5.com/movies/details/krishna-the-birth/0-0-63098', + 'url': 'https://www.zee5.com/movies/details/adavari-matalaku-ardhale-verule/0-0-movie_1143162669', 'info_dict': { - 'id': '0-0-63098', + 'id': '0-0-movie_1143162669', 'ext': 'mp4', - 'display_id': 'krishna-the-birth', - 'title': 'Krishna - The Birth', - 'duration': 4368, + 'display_id': 'adavari-matalaku-ardhale-verule', + 'title': 'Adavari Matalaku Ardhale Verule', + 'duration': 9360, 'description': compat_str, - 'alt_title': 'Krishna - The Birth', + 'alt_title': 'Adavari Matalaku Ardhale Verule', 'uploader': 'Zee Entertainment Enterprises Ltd', - 'release_date': '20060101', - 'upload_date': '20060101', - 'timestamp': 1136073600, + 'release_date': '20070427', + 'upload_date': '20070427', + 'timestamp': 1177632000, 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 0, 'episode': 'Episode 0', @@ -85,9 +84,18 @@ class Zee5IE(InfoExtractor): }, { 'url': 'https://www.zee5.com/web-series/details/mithya/0-6-4z587408/maine-dekhi-hai-uski-mrityu/0-1-6z587412', 'only_matching': True + }, { + 'url': 'https://www.zee5.com/kids/kids-movies/maya-bommalu/0-0-movie_1040370005', + 'only_matching': True + }, { + 'url': 'https://www.zee5.com/news/details/jana-sena-chief-pawan-kalyan-shows-slippers-to-ysrcp-leaders/0-0-newsauto_6ettj4242oo0', + 'only_matching': True + }, { + 'url': 'https://www.zee5.com/music-videos/details/adhento-gaani-vunnapaatuga-jersey-nani-shraddha-srinath/0-0-56973', + 'only_matching': True }] - _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false' - _DEVICE_ID = 'iIxsxYf40cqO3koIkwzKHZhnJzHN13zb' + _DETAIL_API_URL = 'https://spapi.zee5.com/singlePlayback/getDetails/secure?content_id={}&device_id={}&platform_name=desktop_web&country=IN&check_parental_control=false' + _DEVICE_ID = ''.join(random.choices(string.ascii_letters + string.digits, k=20)).ljust(32, '0') _USER_TOKEN = None _LOGIN_HINT = 'Use "--username <mobile_number>" to login using otp or "--username token" and "--password <user_token>" to login using user token.' 
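# Editorial annotation, not part of the upstream patch: _DEVICE_ID (replaced a few
# lines above) is no longer a hard-coded token; it is now 20 random alphanumeric
# characters left-justified to 32 with '0' padding, so every run generates a fresh
# id. The same construction in isolation, assuming only the stdlib:
#
#   import random, string
#   device_id = ''.join(random.choices(string.ascii_letters + string.digits, k=20)).ljust(32, '0')
#   assert len(device_id) == 32 and device_id.endswith('0' * 12)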
_NETRC_MACHINE = 'zee5' @@ -96,14 +104,14 @@ class Zee5IE(InfoExtractor): def _perform_login(self, username, password): if len(username) == 10 and username.isdigit() and self._USER_TOKEN is None: self.report_login() - otp_request_json = self._download_json('https://b2bapi.zee5.com/device/sendotp_v1.php?phoneno=91{}'.format(username), + otp_request_json = self._download_json(f'https://b2bapi.zee5.com/device/sendotp_v1.php?phoneno=91{username}', None, note='Sending OTP') if otp_request_json['code'] == 0: self.to_screen(otp_request_json['message']) else: raise ExtractorError(otp_request_json['message'], expected=True) otp_code = self._get_tfa_info('OTP') - otp_verify_json = self._download_json('https://b2bapi.zee5.com/device/verifyotp_v1.php?phoneno=91{}&otp={}&guest_token={}&platform=web'.format(username, otp_code, self._DEVICE_ID), + otp_verify_json = self._download_json(f'https://b2bapi.zee5.com/device/verifyotp_v1.php?phoneno=91{username}&otp={otp_code}&guest_token={self._DEVICE_ID}&platform=web', None, note='Verifying OTP', fatal=False) if not otp_verify_json: raise ExtractorError('Unable to verify OTP.', expected=True) @@ -138,7 +146,6 @@ class Zee5IE(InfoExtractor): if not asset_data.get('hls_url'): self.raise_login_required(self._LOGIN_HINT, metadata_available=True, method=None) formats, m3u8_subs = self._extract_m3u8_formats_and_subtitles(asset_data['hls_url'], video_id, 'mp4', fatal=False) - self._sort_formats(formats) subtitles = {} for sub in asset_data.get('subtitle_url', []): @@ -177,7 +184,7 @@ class Zee5SeriesIE(InfoExtractor): (?: zee5:series:| https?://(?:www\.)?zee5\.com/(?:[^#?]+/)? - (?:tv-shows|web-series|kids|zee5originals)(?:/[^#/?]+){2}/ + (?:tv-shows|web-series|kids|zee5originals)/(?!kids-movies)(?:[^#/?]+/){2} ) (?P<id>[^#/?]+)(?:/episodes)?/?(?:$|[?#]) ''' @@ -227,13 +234,13 @@ class Zee5SeriesIE(InfoExtractor): 'X-Access-Token': access_token_request['token'], 'Referer': 'https://www.zee5.com/', } - show_url = 'https://gwapi.zee5.com/content/tvshow/{}?translation=en&country=IN'.format(show_id) + show_url = f'https://gwapi.zee5.com/content/tvshow/{show_id}?translation=en&country=IN' page_num = 0 show_json = self._download_json(show_url, video_id=show_id, headers=headers) for season in show_json.get('seasons') or []: season_id = try_get(season, lambda x: x['id'], compat_str) - next_url = 'https://gwapi.zee5.com/content/tvshow/?season_id={}&type=episode&translation=en&country=IN&on_air=false&asset_subtype=tvshow&page=1&limit=100'.format(season_id) + next_url = f'https://gwapi.zee5.com/content/tvshow/?season_id={season_id}&type=episode&translation=en&country=IN&on_air=false&asset_subtype=tvshow&page=1&limit=100' while next_url: page_num += 1 episodes_json = self._download_json( diff --git a/hypervideo_dl/extractor/zeenews.py b/hypervideo_dl/extractor/zeenews.py new file mode 100644 index 0000000..1616dbf --- /dev/null +++ b/hypervideo_dl/extractor/zeenews.py @@ -0,0 +1,57 @@ +from .common import InfoExtractor +from ..utils import ExtractorError, traverse_obj + + +class ZeeNewsIE(InfoExtractor): + _VALID_URL = r'https?://zeenews\.india\.com/[^#?]+/video/(?P<display_id>[^#/?]+)/(?P<id>\d+)' + _TESTS = [ + { + 'url': 'https://zeenews.india.com/hindi/india/delhi-ncr-haryana/delhi-ncr/video/greater-noida-video-viral-on-social-media-attackers-beat-businessman-and-his-son-oppose-market-closed-atdnh/1402138', + 'info_dict': { + 'id': '1402138', + 'ext': 'mp4', + 'title': 'Greater Noida Video: हमलावरों ने दिनदहाड़े दुकान में घुसकर की मारपीट, देखें वीडियो', + 'display_id': 
'greater-noida-video-viral-on-social-media-attackers-beat-businessman-and-his-son-oppose-market-closed-atdnh', + 'upload_date': '20221019', + 'thumbnail': r're:^https?://.*\.jpg*', + 'timestamp': 1666174501, + 'view_count': int, + 'duration': 97, + 'description': 'ग्रेटर नोएडा जारचा थाना क्षेत्र के प्याबली में दिनदहाड़े दुकान में घुसकर अज्ञात हमलावरों ने हमला कर', + } + }, + { + 'url': 'https://zeenews.india.com/hindi/india/video/videsh-superfast-queen-elizabeth-iis-funeral-today/1357710', + 'info_dict': { + 'id': '1357710', + 'ext': 'mp4', + 'title': 'Videsh Superfast: महारानी के अंतिम संस्कार की तैयारी शुरू', + 'display_id': 'videsh-superfast-queen-elizabeth-iis-funeral-today', + 'upload_date': '20220919', + 'thumbnail': r're:^https?://.*\.jpg*', + 'timestamp': 1663556881, + 'view_count': int, + 'duration': 133, + 'description': 'सेगमेंट विदेश सुपराफास्ट में देखिए देश और दुनिया की सभी बड़ी खबरें, वो भी हर खबर फटाफट अंदाज में.', + } + } + ] + + def _real_extract(self, url): + content_id, display_id = self._match_valid_url(url).group('id', 'display_id') + webpage = self._download_webpage(url, content_id) + json_ld_list = list(self._yield_json_ld(webpage, display_id)) + + embed_url = traverse_obj( + json_ld_list, (lambda _, v: v['@type'] == 'VideoObject', 'embedUrl'), get_all=False) + if not embed_url: + raise ExtractorError('No video found', expected=True) + + formats = self._extract_m3u8_formats(embed_url, content_id, 'mp4') + + return { + **self._json_ld(json_ld_list, display_id), + 'id': content_id, + 'display_id': display_id, + 'formats': formats, + } diff --git a/hypervideo_dl/extractor/zhihu.py b/hypervideo_dl/extractor/zhihu.py index 278a943..c24b338 100644 --- a/hypervideo_dl/extractor/zhihu.py +++ b/hypervideo_dl/extractor/zhihu.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - from .common import InfoExtractor from ..utils import format_field, float_or_none, int_or_none @@ -48,7 +45,6 @@ class ZhihuIE(InfoExtractor): 'url': play_url, 'width': int_or_none(q.get('width')), }) - self._sort_formats(formats) author = zvideo.get('author') or {} url_token = author.get('url_token') @@ -61,7 +57,7 @@ class ZhihuIE(InfoExtractor): 'uploader': author.get('name'), 'timestamp': int_or_none(zvideo.get('published_at')), 'uploader_id': author.get('id'), - 'uploader_url': format_field(url_token, template='https://www.zhihu.com/people/%s'), + 'uploader_url': format_field(url_token, None, 'https://www.zhihu.com/people/%s'), 'duration': float_or_none(video.get('duration')), 'view_count': int_or_none(zvideo.get('play_count')), 'like_count': int_or_none(zvideo.get('liked_count')), diff --git a/hypervideo_dl/extractor/zingmp3.py b/hypervideo_dl/extractor/zingmp3.py index 419bf30..a818c9f 100644 --- a/hypervideo_dl/extractor/zingmp3.py +++ b/hypervideo_dl/extractor/zingmp3.py @@ -1,131 +1,77 @@ -# coding: utf-8 -from __future__ import unicode_literals - +import functools import hashlib import hmac +import json import urllib.parse from .common import InfoExtractor from ..utils import ( + OnDemandPagedList, int_or_none, traverse_obj, + urljoin, ) class ZingMp3BaseIE(InfoExtractor): - _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>(?:%s))/[^/]+/(?P<id>\w+)(?:\.html|\?)' + _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>(?:%s))/[^/?#]+/(?P<id>\w+)(?:\.html|\?)' _GEO_COUNTRIES = ['VN'] _DOMAIN = 'https://zingmp3.vn' - _SLUG_API = { + _PER_PAGE = 50 + _API_SLUGS = { + # Audio/video 'bai-hat': '/api/v2/page/get/song', 'embed': 
'/api/v2/page/get/song', 'video-clip': '/api/v2/page/get/video', + 'lyric': '/api/v2/lyric/get/lyric', + 'song-streaming': '/api/v2/song/get/streaming', + # Playlist 'playlist': '/api/v2/page/get/playlist', 'album': '/api/v2/page/get/playlist', - 'lyric': '/api/v2/lyric/get/lyric', - 'song_streaming': '/api/v2/song/get/streaming', + # Chart + 'zing-chart': '/api/v2/page/get/chart-home', + 'zing-chart-tuan': '/api/v2/page/get/week-chart', + 'moi-phat-hanh': '/api/v2/page/get/newrelease-chart', + 'the-loai-video': '/api/v2/video/get/list', + # User + 'info-artist': '/api/v2/page/get/artist', + 'user-list-song': '/api/v2/song/get/list', + 'user-list-video': '/api/v2/video/get/list', } - _API_KEY = '88265e23d4284f25963e6eedac8fbfa3' - _SECRET_KEY = b'2aa2d1c561e809b267f3638c4a307aab' - - def _extract_item(self, item, song_id, type_url, fatal): - item_id = item.get('encodeId') or song_id - title = item.get('title') or item.get('alias') - - if type_url == 'video-clip': - source = item.get('streaming') - else: - api = self.get_api_with_signature(name_api=self._SLUG_API.get('song_streaming'), param={'id': item_id}) - source = self._download_json(api, video_id=item_id).get('data') - - formats = [] - for k, v in (source or {}).items(): - if not v: - continue - if k in ('mp4', 'hls'): - for res, video_url in v.items(): - if not video_url: - continue - if k == 'hls': - formats.extend(self._extract_m3u8_formats( - video_url, item_id, 'mp4', - 'm3u8_native', m3u8_id=k, fatal=False)) - elif k == 'mp4': - formats.append({ - 'format_id': 'mp4-' + res, - 'url': video_url, - 'height': int_or_none(self._search_regex( - r'^(\d+)p', res, 'resolution', default=None)), - }) - continue - elif v == 'VIP': - continue - formats.append({ - 'ext': 'mp3', - 'format_id': k, - 'tbr': int_or_none(k), - 'url': self._proto_relative_url(v), - 'vcodec': 'none', - }) - if not formats: - if not fatal: - return - msg = item.get('msg') - if msg == 'Sorry, this content is not available in your country.': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) - self.raise_no_formats(msg, expected=True) - self._sort_formats(formats) - - lyric = item.get('lyric') - if not lyric: - api = self.get_api_with_signature(name_api=self._SLUG_API.get("lyric"), param={'id': item_id}) - info_lyric = self._download_json(api, video_id=item_id) - lyric = traverse_obj(info_lyric, ('data', 'file')) - subtitles = { - 'origin': [{ - 'url': lyric, - }], - } if lyric else None - - album = item.get('album') or {} - - return { - 'id': item_id, - 'title': title, - 'formats': formats, - 'thumbnail': traverse_obj(item, 'thumbnail', 'thumbnailM'), - 'subtitles': subtitles, - 'duration': int_or_none(item.get('duration')), - 'track': title, - 'artist': traverse_obj(item, 'artistsNames', 'artists_names'), - 'album': traverse_obj(album, 'name', 'title'), - 'album_artist': traverse_obj(album, 'artistsNames', 'artists_names'), + def _api_url(self, url_type, params): + api_slug = self._API_SLUGS[url_type] + params.update({'ctime': '1'}) + sha256 = hashlib.sha256( + ''.join(f'{k}={v}' for k, v in sorted(params.items())).encode()).hexdigest() + data = { + **params, + 'apiKey': '88265e23d4284f25963e6eedac8fbfa3', + 'sig': hmac.new( + b'2aa2d1c561e809b267f3638c4a307aab', f'{api_slug}{sha256}'.encode(), hashlib.sha512).hexdigest(), } + return f'{self._DOMAIN}{api_slug}?{urllib.parse.urlencode(data)}' + + def _call_api(self, url_type, params, display_id=None, **kwargs): + resp = self._download_json( + self._api_url(url_type, params), display_id 
or params.get('id'), + note=f'Downloading {url_type} JSON metadata', **kwargs) + return (resp or {}).get('data') or {} def _real_initialize(self): - if not self.get_param('cookiefile') and not self.get_param('cookiesfrombrowser'): - self._request_webpage(self.get_api_with_signature(name_api=self._SLUG_API['bai-hat'], param={'id': ''}), - None, note='Updating cookies') + if not self._cookies_passed: + self._request_webpage( + self._api_url('bai-hat', {'id': ''}), None, note='Updating cookies') - def _real_extract(self, url): - song_id, type_url = self._match_valid_url(url).group('id', 'type') - api = self.get_api_with_signature(name_api=self._SLUG_API[type_url], param={'id': song_id}) - return self._process_data(self._download_json(api, song_id)['data'], song_id, type_url) - - def get_api_with_signature(self, name_api, param): - param.update({'ctime': '1'}) - sha256 = hashlib.sha256(''.join(f'{i}={param[i]}' for i in sorted(param)).encode('utf-8')).hexdigest() - data = { - 'apiKey': self._API_KEY, - 'sig': hmac.new(self._SECRET_KEY, f'{name_api}{sha256}'.encode('utf-8'), hashlib.sha512).hexdigest(), - **param, - } - return f'{self._DOMAIN}{name_api}?{urllib.parse.urlencode(data)}' + def _parse_items(self, items): + for url in traverse_obj(items, (..., 'link')) or []: + yield self.url_result(urljoin(self._DOMAIN, url)) class ZingMp3IE(ZingMp3BaseIE): _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip|embed' + IE_NAME = 'zingmp3' + IE_DESC = 'zingmp3.vn' _TESTS = [{ 'url': 'https://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'md5': 'ead7ae13693b3205cbc89536a077daed', @@ -147,7 +93,7 @@ class ZingMp3IE(ZingMp3BaseIE): }, }, { 'url': 'https://zingmp3.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html', - 'md5': 'c7f23d971ac1a4f675456ed13c9b9612', + 'md5': '3c2081e79471a2f4a3edd90b70b185ea', 'info_dict': { 'id': 'ZO8ZF7C7', 'title': 'Sương Hoa Đưa Lối', @@ -180,11 +126,63 @@ class ZingMp3IE(ZingMp3BaseIE): 'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html', 'only_matching': True, }] - IE_NAME = 'zingmp3' - IE_DESC = 'zingmp3.vn' - def _process_data(self, data, song_id, type_url): - return self._extract_item(data, song_id, type_url, True) + def _real_extract(self, url): + song_id, url_type = self._match_valid_url(url).group('id', 'type') + item = self._call_api(url_type, {'id': song_id}) + + item_id = item.get('encodeId') or song_id + if url_type == 'video-clip': + source = item.get('streaming') + source['mp4'] = self._download_json( + 'http://api.mp3.zing.vn/api/mobile/video/getvideoinfo', item_id, + query={'requestdata': json.dumps({'id': item_id})}, + note='Downloading mp4 JSON metadata').get('source') + else: + source = self._call_api('song-streaming', {'id': item_id}) + + formats = [] + for k, v in (source or {}).items(): + if not v or v == 'VIP': + continue + if k not in ('mp4', 'hls'): + formats.append({ + 'ext': 'mp3', + 'format_id': k, + 'tbr': int_or_none(k), + 'url': self._proto_relative_url(v), + 'vcodec': 'none', + }) + continue + for res, video_url in v.items(): + if not video_url: + continue + if k == 'hls': + formats.extend(self._extract_m3u8_formats(video_url, item_id, 'mp4', m3u8_id=k, fatal=False)) + continue + formats.append({ + 'format_id': f'mp4-{res}', + 'url': video_url, + 'height': int_or_none(res), + }) + + if not formats and item.get('msg') == 'Sorry, this content is not available in your country.': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True) + + lyric = item.get('lyric') or 
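
Note: a toy walk-through of the format loop above, with a made-up `source` payload (the stream.example URLs are placeholders). Keys named `mp4`/`hls` carry video renditions, any other key is treated as an mp3 bitrate, and empty values or the literal string 'VIP' are skipped:

    def int_or_none(v):
        try:
            return int(v)
        except (TypeError, ValueError):
            return None

    source = {
        '128': '//stream.example/song-128.mp3',   # free mp3 rendition
        '320': 'VIP',                             # paid-only, skipped
        'mp4': {'720': 'https://stream.example/clip-720.mp4'},
        'hls': None,                              # empty, skipped
    }
    formats = []
    for k, v in source.items():
        if not v or v == 'VIP':
            continue
        if k == 'mp4':
            formats += [{'format_id': f'mp4-{res}', 'url': url, 'height': int_or_none(res)}
                        for res, url in v.items()]
        elif k != 'hls':  # the real code hands hls manifests to _extract_m3u8_formats
            formats.append({'format_id': k, 'ext': 'mp3', 'tbr': int_or_none(k), 'vcodec': 'none'})
    print([f['format_id'] for f in formats])  # ['128', 'mp4-720']
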
self._call_api('lyric', {'id': item_id}, fatal=False).get('file') + + return { + 'id': item_id, + 'title': traverse_obj(item, 'title', 'alias'), + 'thumbnail': traverse_obj(item, 'thumbnail', 'thumbnailM'), + 'duration': int_or_none(item.get('duration')), + 'track': traverse_obj(item, 'title', 'alias'), + 'artist': traverse_obj(item, 'artistsNames', 'artists_names'), + 'album': traverse_obj(item, ('album', ('name', 'title')), get_all=False), + 'album_artist': traverse_obj(item, ('album', ('artistsNames', 'artists_names')), get_all=False), + 'formats': formats, + 'subtitles': {'origin': [{'url': lyric}]} if lyric else None, + } class ZingMp3AlbumIE(ZingMp3BaseIE): @@ -192,19 +190,17 @@ class ZingMp3AlbumIE(ZingMp3BaseIE): _TESTS = [{ 'url': 'http://mp3.zing.vn/album/Lau-Dai-Tinh-Ai-Bang-Kieu-Minh-Tuyet/ZWZBWDAF.html', 'info_dict': { - '_type': 'playlist', 'id': 'ZWZBWDAF', 'title': 'Lâu Đài Tình Ái', }, - 'playlist_count': 9, + 'playlist_mincount': 9, }, { 'url': 'https://zingmp3.vn/album/Nhung-Bai-Hat-Hay-Nhat-Cua-Mr-Siro-Mr-Siro/ZWZAEZZD.html', 'info_dict': { - '_type': 'playlist', 'id': 'ZWZAEZZD', 'title': 'Những Bài Hát Hay Nhất Của Mr. Siro', }, - 'playlist_count': 49, + 'playlist_mincount': 49, }, { 'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html', 'only_matching': True, @@ -214,12 +210,176 @@ class ZingMp3AlbumIE(ZingMp3BaseIE): }] IE_NAME = 'zingmp3:album' - def _process_data(self, data, song_id, type_url): - def entries(): - for item in traverse_obj(data, ('song', 'items')) or []: - entry = self._extract_item(item, song_id, type_url, False) - if entry: - yield entry + def _real_extract(self, url): + song_id, url_type = self._match_valid_url(url).group('id', 'type') + data = self._call_api(url_type, {'id': song_id}) + return self.playlist_result( + self._parse_items(traverse_obj(data, ('song', 'items'))), + traverse_obj(data, 'id', 'encodeId'), traverse_obj(data, 'name', 'title')) + + +class ZingMp3ChartHomeIE(ZingMp3BaseIE): + _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<id>(?:zing-chart|moi-phat-hanh))/?(?:[#?]|$)' + _TESTS = [{ + 'url': 'https://zingmp3.vn/zing-chart', + 'info_dict': { + 'id': 'zing-chart', + }, + 'playlist_mincount': 100, + }, { + 'url': 'https://zingmp3.vn/moi-phat-hanh', + 'info_dict': { + 'id': 'moi-phat-hanh', + }, + 'playlist_mincount': 100, + }] + IE_NAME = 'zingmp3:chart-home' + + def _real_extract(self, url): + url_type = self._match_id(url) + data = self._call_api(url_type, {'id': url_type}) + items = traverse_obj(data, ('RTChart', 'items') if url_type == 'zing-chart' else 'items') + return self.playlist_result(self._parse_items(items), url_type) + - return self.playlist_result(entries(), traverse_obj(data, 'id', 'encodeId'), - traverse_obj(data, 'name', 'title')) +class ZingMp3WeekChartIE(ZingMp3BaseIE): + _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'zing-chart-tuan' + IE_NAME = 'zingmp3:week-chart' + _TESTS = [{ + 'url': 'https://zingmp3.vn/zing-chart-tuan/Bai-hat-Viet-Nam/IWZ9Z08I.html', + 'info_dict': { + 'id': 'IWZ9Z08I', + 'title': 'zing-chart-vn', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://zingmp3.vn/zing-chart-tuan/Bai-hat-US-UK/IWZ9Z0BW.html', + 'info_dict': { + 'id': 'IWZ9Z0BW', + 'title': 'zing-chart-us', + }, + 'playlist_mincount': 10, + }, { + 'url': 'https://zingmp3.vn/zing-chart-tuan/Bai-hat-KPop/IWZ9Z0BO.html', + 'info_dict': { + 'id': 'IWZ9Z0BO', + 'title': 'zing-chart-korea', + }, + 'playlist_mincount': 10, + }] + + def _real_extract(self, url): + song_id, url_type = 
self._match_valid_url(url).group('id', 'type') + data = self._call_api(url_type, {'id': song_id}) + return self.playlist_result( + self._parse_items(data['items']), song_id, f'zing-chart-{data.get("country", "")}') + + +class ZingMp3ChartMusicVideoIE(ZingMp3BaseIE): + _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>the-loai-video)/(?P<regions>[^/]+)/(?P<id>[^\.]+)' + IE_NAME = 'zingmp3:chart-music-video' + _TESTS = [{ + 'url': 'https://zingmp3.vn/the-loai-video/Viet-Nam/IWZ9Z08I.html', + 'info_dict': { + 'id': 'IWZ9Z08I', + 'title': 'the-loai-video_Viet-Nam', + }, + 'playlist_mincount': 400, + }, { + 'url': 'https://zingmp3.vn/the-loai-video/Au-My/IWZ9Z08O.html', + 'info_dict': { + 'id': 'IWZ9Z08O', + 'title': 'the-loai-video_Au-My', + }, + 'playlist_mincount': 40, + }, { + 'url': 'https://zingmp3.vn/the-loai-video/Han-Quoc/IWZ9Z08W.html', + 'info_dict': { + 'id': 'IWZ9Z08W', + 'title': 'the-loai-video_Han-Quoc', + }, + 'playlist_mincount': 30, + }, { + 'url': 'https://zingmp3.vn/the-loai-video/Khong-Loi/IWZ9Z086.html', + 'info_dict': { + 'id': 'IWZ9Z086', + 'title': 'the-loai-video_Khong-Loi', + }, + 'playlist_mincount': 10, + }] + + def _fetch_page(self, song_id, url_type, page): + return self._parse_items(self._call_api(url_type, { + 'id': song_id, + 'type': 'genre', + 'page': page + 1, + 'count': self._PER_PAGE + }).get('items')) + + def _real_extract(self, url): + song_id, regions, url_type = self._match_valid_url(url).group('id', 'regions', 'type') + return self.playlist_result( + OnDemandPagedList(functools.partial(self._fetch_page, song_id, url_type), self._PER_PAGE), + song_id, f'{url_type}_{regions}') + + +class ZingMp3UserIE(ZingMp3BaseIE): + _VALID_URL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<user>[^/]+)/(?P<type>bai-hat|single|album|video)/?(?:[?#]|$)' + IE_NAME = 'zingmp3:user' + _TESTS = [{ + 'url': 'https://zingmp3.vn/Mr-Siro/bai-hat', + 'info_dict': { + 'id': 'IWZ98609', + 'title': 'Mr. Siro - bai-hat', + 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + }, + 'playlist_mincount': 91, + }, { + 'url': 'https://zingmp3.vn/Mr-Siro/album', + 'info_dict': { + 'id': 'IWZ98609', + 'title': 'Mr. Siro - album', + 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + }, + 'playlist_mincount': 3, + }, { + 'url': 'https://zingmp3.vn/Mr-Siro/single', + 'info_dict': { + 'id': 'IWZ98609', + 'title': 'Mr. Siro - single', + 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://zingmp3.vn/Mr-Siro/video', + 'info_dict': { + 'id': 'IWZ98609', + 'title': 'Mr. 
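
Note: the chart and user extractors page through the API with `OnDemandPagedList(functools.partial(self._fetch_page, ...), self._PER_PAGE)`, where `_fetch_page` receives a zero-based page index and asks the API for `page + 1`. A minimal eager-generator sketch of the same contract (upstream's `OnDemandPagedList` is lazier and slice-aware; the three-page fake API is invented for illustration):

    def paged_entries(fetch_page, per_page):
        # fetch_page(page) -> entries for 0-based page; stop on a short page
        page = 0
        while True:
            items = list(fetch_page(page))
            yield from items
            if len(items) < per_page:
                return
            page += 1

    pages = [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]
    print(list(paged_entries(lambda p: pages[p] if p < len(pages) else [], per_page=3)))
    # ['a', 'b', 'c', 'd', 'e', 'f', 'g']
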
Siro - video', + 'description': 'md5:85ab29bd7b21725c12bf76fd1d6922e5', + }, + 'playlist_mincount': 15, + }] + + def _fetch_page(self, user_id, url_type, page): + url_type = 'user-list-song' if url_type == 'bai-hat' else 'user-list-video' + return self._parse_items(self._call_api(url_type, { + 'id': user_id, + 'type': 'artist', + 'page': page + 1, + 'count': self._PER_PAGE + }, query={'sort': 'new', 'sectionId': 'aSong'}).get('items')) + + def _real_extract(self, url): + user_alias, url_type = self._match_valid_url(url).group('user', 'type') + if not url_type: + url_type = 'bai-hat' + + user_info = self._call_api('info-artist', {}, user_alias, query={'alias': user_alias}) + if url_type in ('bai-hat', 'video'): + entries = OnDemandPagedList( + functools.partial(self._fetch_page, user_info['id'], url_type), self._PER_PAGE) + else: + entries = self._parse_items(traverse_obj(user_info, ( + 'sections', lambda _, v: v['link'] == f'/{user_alias}/{url_type}', 'items', ...))) + return self.playlist_result( + entries, user_info['id'], f'{user_info.get("name")} - {url_type}', user_info.get('biography')) diff --git a/hypervideo_dl/extractor/zoom.py b/hypervideo_dl/extractor/zoom.py index c005488..ef8b715 100644 --- a/hypervideo_dl/extractor/zoom.py +++ b/hypervideo_dl/extractor/zoom.py @@ -1,7 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -90,8 +86,6 @@ class ZoomIE(InfoExtractor): 'preference': -1 }) - self._sort_formats(formats) - return { 'id': play_id, 'title': data.get('topic'), diff --git a/hypervideo_dl/extractor/zype.py b/hypervideo_dl/extractor/zype.py index 7663cb3..8cf9945 100644 --- a/hypervideo_dl/extractor/zype.py +++ b/hypervideo_dl/extractor/zype.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals - import re from .common import InfoExtractor @@ -18,6 +15,7 @@ class ZypeIE(InfoExtractor): _ID_RE = r'[\da-fA-F]+' _COMMON_RE = r'//player\.zype\.com/embed/%s\.(?:js|json|html)\?.*?(?:access_token|(?:ap[ip]|player)_key)=' _VALID_URL = r'https?:%s[^&]+' % (_COMMON_RE % ('(?P<id>%s)' % _ID_RE)) + _EMBED_REGEX = [fr'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?{_COMMON_RE % _ID_RE}.+?)\1'] _TEST = { 'url': 'https://player.zype.com/embed/5b400b834b32992a310622b9.js?api_key=jZ9GUhRmxcPvX7M3SlfejB6Hle9jyHTdk2jVxG7wOHPLODgncEKVdPYBhuz9iWXQ&autoplay=false&controls=true&da=false', 'md5': 'eaee31d474c76a955bdaba02a505c595', @@ -32,14 +30,6 @@ class ZypeIE(InfoExtractor): }, } - @staticmethod - def _extract_urls(webpage): - return [ - mobj.group('url') - for mobj in re.finditer( - r'<script[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.+?)\1' % (ZypeIE._COMMON_RE % ZypeIE._ID_RE), - webpage)] - def _real_extract(self, url): video_id = self._match_id(url) @@ -107,7 +97,6 @@ class ZypeIE(InfoExtractor): if text_tracks: text_tracks = self._parse_json( text_tracks, video_id, js_to_json, False) - self._sort_formats(formats) if text_tracks: for text_track in text_tracks: diff --git a/hypervideo_dl/jsinterp.py b/hypervideo_dl/jsinterp.py index 46834f8..adc5a19 100644 --- a/hypervideo_dl/jsinterp.py +++ b/hypervideo_dl/jsinterp.py @@ -1,31 +1,136 @@ -from collections.abc import MutableMapping +import collections +import contextlib +import itertools import json +import math import operator import re from .utils import ( + NO_DEFAULT, ExtractorError, + js_to_json, remove_quotes, + truncate_string, + unified_timestamp, + write_string, ) -_OPERATORS = [ - ('|', operator.or_), - ('^', 
operator.xor), - ('&', operator.and_), - ('>>', operator.rshift), - ('<<', operator.lshift), - ('-', operator.sub), - ('+', operator.add), - ('%', operator.mod), - ('/', operator.truediv), - ('*', operator.mul), -] -_ASSIGN_OPERATORS = [(op + '=', opfunc) for op, opfunc in _OPERATORS] -_ASSIGN_OPERATORS.append(('=', (lambda cur, right: right))) -_NAME_RE = r'[a-zA-Z_$][a-zA-Z_$0-9]*' +def _js_bit_op(op): + def zeroise(x): + return 0 if x in (None, JS_Undefined) else x -_MATCHING_PARENS = dict(zip('({[', ')}]')) + def wrapped(a, b): + return op(zeroise(a), zeroise(b)) & 0xffffffff + + return wrapped + + +def _js_arith_op(op): + + def wrapped(a, b): + if JS_Undefined in (a, b): + return float('nan') + return op(a or 0, b or 0) + + return wrapped + + +def _js_div(a, b): + if JS_Undefined in (a, b) or not (a and b): + return float('nan') + return (a or 0) / b if b else float('inf') + + +def _js_mod(a, b): + if JS_Undefined in (a, b) or not b: + return float('nan') + return (a or 0) % b + + +def _js_exp(a, b): + if not b: + return 1 # even 0 ** 0 !! + elif JS_Undefined in (a, b): + return float('nan') + return (a or 0) ** b + + +def _js_eq_op(op): + + def wrapped(a, b): + if {a, b} <= {None, JS_Undefined}: + return op(a, a) + return op(a, b) + + return wrapped + + +def _js_comp_op(op): + + def wrapped(a, b): + if JS_Undefined in (a, b): + return False + if isinstance(a, str) or isinstance(b, str): + return op(str(a or 0), str(b or 0)) + return op(a or 0, b or 0) + + return wrapped + + +def _js_ternary(cndn, if_true=True, if_false=False): + """Simulate JS's ternary operator (cndn?if_true:if_false)""" + if cndn in (False, None, 0, '', JS_Undefined): + return if_false + with contextlib.suppress(TypeError): + if math.isnan(cndn): # NB: NaN cannot be checked by membership + return if_false + return if_true + + +# Ref: https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/Operator_Precedence +_OPERATORS = { # None => Defined in JSInterpreter._operator + '?': None, + '??': None, + '||': None, + '&&': None, + + '|': _js_bit_op(operator.or_), + '^': _js_bit_op(operator.xor), + '&': _js_bit_op(operator.and_), + + '===': operator.is_, + '!==': operator.is_not, + '==': _js_eq_op(operator.eq), + '!=': _js_eq_op(operator.ne), + + '<=': _js_comp_op(operator.le), + '>=': _js_comp_op(operator.ge), + '<': _js_comp_op(operator.lt), + '>': _js_comp_op(operator.gt), + + '>>': _js_bit_op(operator.rshift), + '<<': _js_bit_op(operator.lshift), + + '+': _js_arith_op(operator.add), + '-': _js_arith_op(operator.sub), + + '*': _js_arith_op(operator.mul), + '%': _js_mod, + '/': _js_div, + '**': _js_exp, +} + +_COMP_OPERATORS = {'===', '!==', '==', '!=', '<=', '>=', '<', '>'} + +_NAME_RE = r'[a-zA-Z_$][\w$]*' +_MATCHING_PARENS = dict(zip(*zip('()', '{}', '[]'))) +_QUOTES = '\'"/' + + +class JS_Undefined: + pass class JS_Break(ExtractorError): @@ -38,47 +143,79 @@ class JS_Continue(ExtractorError): ExtractorError.__init__(self, 'Invalid continue') -class LocalNameSpace(MutableMapping): - def __init__(self, *stack): - self.stack = tuple(stack) +class JS_Throw(ExtractorError): + def __init__(self, e): + self.error = e + ExtractorError.__init__(self, f'Uncaught exception {e}') - def __getitem__(self, key): - for scope in self.stack: - if key in scope: - return scope[key] - raise KeyError(key) +class LocalNameSpace(collections.ChainMap): def __setitem__(self, key, value): - for scope in self.stack: + for scope in self.maps: if key in scope: scope[key] = value - break - else: - self.stack[0][key] = value - 
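
Note: the `_js_*` wrappers above encode JavaScript's coercion rules rather than Python's: `null` (None) coerces to 0, while `undefined` poisons arithmetic into NaN. Demonstrated with the same helpers:

    import math
    import operator

    class JS_Undefined:
        pass

    def _js_arith_op(op):
        def wrapped(a, b):
            if JS_Undefined in (a, b):
                return float('nan')
            return op(a or 0, b or 0)
        return wrapped

    add = _js_arith_op(operator.add)
    print(add(None, 5))                      # 5    -- JS: null + 5 == 5
    print(math.isnan(add(JS_Undefined, 5)))  # True -- JS: undefined + 5 is NaN
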
return value + return + self.maps[0][key] = value def __delitem__(self, key): raise NotImplementedError('Deleting is not supported') - def __iter__(self): - for scope in self.stack: - yield from scope - - def __len__(self, key): - return len(iter(self)) - def __repr__(self): - return f'LocalNameSpace{self.stack}' +class Debugger: + import sys + ENABLED = False and 'pytest' in sys.modules + @staticmethod + def write(*args, level=100): + write_string(f'[debug] JS: {" " * (100 - level)}' + f'{" ".join(truncate_string(str(x), 50, 50) for x in args)}\n') + + @classmethod + def wrap_interpreter(cls, f): + def interpret_statement(self, stmt, local_vars, allow_recursion, *args, **kwargs): + if cls.ENABLED and stmt.strip(): + cls.write(stmt, level=allow_recursion) + try: + ret, should_ret = f(self, stmt, local_vars, allow_recursion, *args, **kwargs) + except Exception as e: + if cls.ENABLED: + if isinstance(e, ExtractorError): + e = e.orig_msg + cls.write('=> Raises:', e, '<-|', stmt, level=allow_recursion) + raise + if cls.ENABLED and stmt.strip(): + cls.write(['->', '=>'][should_ret], repr(ret), '<-|', stmt, level=allow_recursion) + return ret, should_ret + return interpret_statement + + +class JSInterpreter: + __named_object_counter = 0 + + _RE_FLAGS = { + # special knowledge: Python's re flags are bitmask values, current max 128 + # invent new bitmask values well above that for literal parsing + # TODO: new pattern class to execute matches with these flags + 'd': 1024, # Generate indices for substring matches + 'g': 2048, # Global search + 'i': re.I, # Case-insensitive search + 'm': re.M, # Multi-line search + 's': re.S, # Allows . to match newline characters + 'u': re.U, # Treat a pattern as a sequence of unicode code points + 'y': 4096, # Perform a "sticky" search that matches starting at the current position in the target string + } + + _EXC_NAME = '__hypervideo_dl_exception__' -class JSInterpreter(object): def __init__(self, code, objects=None): - if objects is None: - objects = {} - self.code = code - self._functions = {} - self._objects = objects - self.__named_object_counter = 0 + self.code, self._functions = code, {} + self._objects = {} if objects is None else objects + + class Exception(ExtractorError): + def __init__(self, msg, expr=None, *args, **kwargs): + if expr is not None: + msg = f'{msg.rstrip()} in: {truncate_string(expr, 50, 50)}' + super().__init__(msg, *args, **kwargs) def _named_object(self, namespace, obj): self.__named_object_counter += 1 @@ -86,18 +223,42 @@ class JSInterpreter(object): namespace[name] = obj return name + @classmethod + def _regex_flags(cls, expr): + flags = 0 + if not expr: + return flags, expr + for idx, ch in enumerate(expr): + if ch not in cls._RE_FLAGS: + break + flags |= cls._RE_FLAGS[ch] + return flags, expr[idx + 1:] + @staticmethod def _separate(expr, delim=',', max_split=None): + OP_CHARS = '+-*/%&|^=<>!,;{}:[' if not expr: return counters = {k: 0 for k in _MATCHING_PARENS.values()} start, splits, pos, delim_len = 0, 0, 0, len(delim) - 1 + in_quote, escaping, after_op, in_regex_char_group = None, False, True, False for idx, char in enumerate(expr): - if char in _MATCHING_PARENS: + if not in_quote and char in _MATCHING_PARENS: counters[_MATCHING_PARENS[char]] += 1 - elif char in counters: - counters[char] -= 1 - if char != delim[pos] or any(counters.values()): + elif not in_quote and char in counters: + # Something's wrong if we get negative, but ignore it anyway + if counters[char]: + counters[char] -= 1 + elif not escaping: + if char in 
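
Note: rebasing `LocalNameSpace` on `collections.ChainMap` keeps lookups standard (the innermost scope that defines a name wins) while the overridden `__setitem__` writes through to whichever existing scope owns the name, creating new names in the innermost scope:

    import collections

    class LocalNameSpace(collections.ChainMap):
        def __setitem__(self, key, value):
            for scope in self.maps:
                if key in scope:
                    scope[key] = value
                    return
            self.maps[0][key] = value

    inner, outer = {}, {'x': 1}
    ns = LocalNameSpace(inner, outer)
    ns['x'] = 2   # updates the enclosing scope that already defines x
    ns['y'] = 3   # new names land in the innermost scope
    print(outer, inner)   # {'x': 2} {'y': 3}
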
_QUOTES and in_quote in (char, None): + if in_quote or after_op or char != '/': + in_quote = None if in_quote and not in_regex_char_group else char + elif in_quote == '/' and char in '[]': + in_regex_char_group = char == '[' + escaping = not escaping and in_quote and char == '\\' + after_op = not in_quote and char in OP_CHARS or (char.isspace() and after_op) + + if char != delim[pos] or any(counters.values()) or in_quote: pos = 0 continue elif pos != delim_len: @@ -110,150 +271,241 @@ class JSInterpreter(object): break yield expr[start:] - @staticmethod - def _separate_at_paren(expr, delim): - separated = list(JSInterpreter._separate(expr, delim, 1)) + @classmethod + def _separate_at_paren(cls, expr, delim=None): + if delim is None: + delim = expr and _MATCHING_PARENS[expr[0]] + separated = list(cls._separate(expr, delim, 1)) if len(separated) < 2: - raise ExtractorError(f'No terminating paren {delim} in {expr}') + raise cls.Exception(f'No terminating paren {delim}', expr) return separated[0][1:].strip(), separated[1].strip() + def _operator(self, op, left_val, right_expr, expr, local_vars, allow_recursion): + if op in ('||', '&&'): + if (op == '&&') ^ _js_ternary(left_val): + return left_val # short circuiting + elif op == '??': + if left_val not in (None, JS_Undefined): + return left_val + elif op == '?': + right_expr = _js_ternary(left_val, *self._separate(right_expr, ':', 1)) + + right_val = self.interpret_expression(right_expr, local_vars, allow_recursion) + if not _OPERATORS.get(op): + return right_val + + try: + return _OPERATORS[op](left_val, right_val) + except Exception as e: + raise self.Exception(f'Failed to evaluate {left_val!r} {op} {right_val!r}', expr, cause=e) + + def _index(self, obj, idx, allow_undefined=False): + if idx == 'length': + return len(obj) + try: + return obj[int(idx)] if isinstance(obj, list) else obj[idx] + except Exception as e: + if allow_undefined: + return JS_Undefined + raise self.Exception(f'Cannot get index {idx}', repr(obj), cause=e) + + def _dump(self, obj, namespace): + try: + return json.dumps(obj) + except TypeError: + return self._named_object(namespace, obj) + + @Debugger.wrap_interpreter def interpret_statement(self, stmt, local_vars, allow_recursion=100): if allow_recursion < 0: - raise ExtractorError('Recursion limit reached') + raise self.Exception('Recursion limit reached') + allow_recursion -= 1 + + should_return = False + sub_statements = list(self._separate(stmt, ';')) or [''] + expr = stmt = sub_statements.pop().strip() - sub_statements = list(self._separate(stmt, ';')) - stmt = (sub_statements or ['']).pop() for sub_stmt in sub_statements: - ret, should_abort = self.interpret_statement(sub_stmt, local_vars, allow_recursion - 1) - if should_abort: - return ret + ret, should_return = self.interpret_statement(sub_stmt, local_vars, allow_recursion) + if should_return: + return ret, should_return - should_abort = False - stmt = stmt.lstrip() - stmt_m = re.match(r'var\s', stmt) - if stmt_m: - expr = stmt[len(stmt_m.group(0)):] - else: - return_m = re.match(r'return(?:\s+|$)', stmt) - if return_m: - expr = stmt[len(return_m.group(0)):] - should_abort = True - else: - # Try interpreting it as an expression - expr = stmt + m = re.match(r'(?P<var>(?:var|const|let)\s)|return(?:\s+|(?=["\'])|$)|(?P<throw>throw\s+)', stmt) + if m: + expr = stmt[len(m.group(0)):].strip() + if m.group('throw'): + raise JS_Throw(self.interpret_expression(expr, local_vars, allow_recursion)) + should_return = not m.group('var') + if not expr: + return None, 
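
Note: `_separate` is in effect a bracket- and quote-aware splitter: a delimiter only counts when every ()/{}/[] counter is balanced and the scanner is outside string and regex literals. A simplified version (no regex literals, no escape handling beyond backslash-before-quote) shows the core idea:

    def split_top_level(expr, delim=','):
        pairs = {'(': ')', '{': '}', '[': ']'}
        depth, quote, start = 0, None, 0
        for i, ch in enumerate(expr):
            if quote:
                if ch == quote and expr[i - 1] != '\\':
                    quote = None
            elif ch in '\'"':
                quote = ch
            elif ch in pairs:
                depth += 1
            elif ch in pairs.values():
                depth -= 1
            elif ch == delim and depth == 0:
                yield expr[start:i]
                start = i + 1
        yield expr[start:]

    print(list(split_top_level('a, f(b, c), [d, e], "x,y"')))
    # ['a', ' f(b, c)', ' [d, e]', ' "x,y"']
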
should_return - v = self.interpret_expression(expr, local_vars, allow_recursion) - return v, should_abort + if expr[0] in _QUOTES: + inner, outer = self._separate(expr, expr[0], 1) + if expr[0] == '/': + flags, outer = self._regex_flags(outer) + inner = re.compile(inner[1:], flags=flags) + else: + inner = json.loads(js_to_json(f'{inner}{expr[0]}', strict=True)) + if not outer: + return inner, should_return + expr = self._named_object(local_vars, inner) + outer + + if expr.startswith('new '): + obj = expr[4:] + if obj.startswith('Date('): + left, right = self._separate_at_paren(obj[4:]) + expr = unified_timestamp( + self.interpret_expression(left, local_vars, allow_recursion), False) + if not expr: + raise self.Exception(f'Failed to parse date {left!r}', expr) + expr = self._dump(int(expr * 1000), local_vars) + right + else: + raise self.Exception(f'Unsupported object {obj}', expr) - def interpret_expression(self, expr, local_vars, allow_recursion): - expr = expr.strip() - if expr == '': # Empty expression - return None + if expr.startswith('void '): + left = self.interpret_expression(expr[5:], local_vars, allow_recursion) + return None, should_return if expr.startswith('{'): - inner, outer = self._separate_at_paren(expr, '}') - inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion - 1) + inner, outer = self._separate_at_paren(expr) + # try for object expression (Map) + sub_expressions = [list(self._separate(sub_expr.strip(), ':', 1)) for sub_expr in self._separate(inner)] + if all(len(sub_expr) == 2 for sub_expr in sub_expressions): + def dict_item(key, val): + val = self.interpret_expression(val, local_vars, allow_recursion) + if re.match(_NAME_RE, key): + return key, val + return self.interpret_expression(key, local_vars, allow_recursion), val + + return dict(dict_item(k, v) for k, v in sub_expressions), should_return + + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) if not outer or should_abort: - return inner + return inner, should_abort or should_return else: - expr = json.dumps(inner) + outer + expr = self._dump(inner, local_vars) + outer if expr.startswith('('): - inner, outer = self._separate_at_paren(expr, ')') - inner = self.interpret_expression(inner, local_vars, allow_recursion) - if not outer: - return inner + inner, outer = self._separate_at_paren(expr) + inner, should_abort = self.interpret_statement(inner, local_vars, allow_recursion) + if not outer or should_abort: + return inner, should_abort or should_return else: - expr = json.dumps(inner) + outer + expr = self._dump(inner, local_vars) + outer if expr.startswith('['): - inner, outer = self._separate_at_paren(expr, ']') + inner, outer = self._separate_at_paren(expr) name = self._named_object(local_vars, [ self.interpret_expression(item, local_vars, allow_recursion) for item in self._separate(inner)]) expr = name + outer - m = re.match(r'try\s*', expr) - if m: - if expr[m.end()] == '{': - try_expr, expr = self._separate_at_paren(expr[m.end():], '}') - else: - try_expr, expr = expr[m.end() - 1:], '' - ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion - 1) + m = re.match(r'''(?x) + (?P<try>try)\s*\{| + (?P<switch>switch)\s*\(| + (?P<for>for)\s*\( + ''', expr) + md = m.groupdict() if m else {} + if md.get('try'): + try_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + err = None + try: + ret, should_abort = self.interpret_statement(try_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + except 
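
Note: `_operator` evaluates the right-hand side lazily, so `&&`, `||` and `??` short-circuit exactly as in JS, and `?` picks a branch by splitting the right side at ':'. A condensed sketch of just the short-circuit decisions (`js_truthy`/`decide` are illustrative names, not upstream API):

    class JS_Undefined:
        pass

    def js_truthy(v):
        # JS falsy values; `v == v` is False only for NaN
        return v not in (False, None, 0, '', JS_Undefined) and v == v

    def decide(op, left):
        # (short_circuited, value): the rhs may only be evaluated when the
        # left operand does not already decide the result
        if op == '&&' and not js_truthy(left):
            return True, left
        if op == '||' and js_truthy(left):
            return True, left
        if op == '??' and left not in (None, JS_Undefined):
            return True, left
        return False, None

    print(decide('??', 0))   # (True, 0)     -- 0 is falsy but not nullish
    print(decide('||', 0))   # (False, None) -- rhs must be evaluated
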
Exception as e: + # XXX: This works for now, but makes debugging future issues very hard + err = e + + pending = (None, False) + m = re.match(r'catch\s*(?P<err>\(\s*{_NAME_RE}\s*\))?\{{'.format(**globals()), expr) + if m: + sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + if err: + catch_vars = {} + if m.group('err'): + catch_vars[m.group('err')] = err.error if isinstance(err, JS_Throw) else err + catch_vars = local_vars.new_child(catch_vars) + err, pending = None, self.interpret_statement(sub_expr, catch_vars, allow_recursion) + + m = re.match(r'finally\s*\{', expr) + if m: + sub_expr, expr = self._separate_at_paren(expr[m.end() - 1:]) + ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + + ret, should_abort = pending if should_abort: - return ret - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + return ret, True - m = re.match(r'catch\s*\(', expr) - if m: - # We ignore the catch block - _, expr = self._separate_at_paren(expr, '}') - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + if err: + raise err - m = re.match(r'for\s*\(', expr) - if m: - constructor, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') + elif md.get('for'): + constructor, remaining = self._separate_at_paren(expr[m.end() - 1:]) if remaining.startswith('{'): - body, expr = self._separate_at_paren(remaining, '}') + body, expr = self._separate_at_paren(remaining) else: - m = re.match(r'switch\s*\(', remaining) # FIXME - if m: - switch_val, remaining = self._separate_at_paren(remaining[m.end() - 1:], ')') + switch_m = re.match(r'switch\s*\(', remaining) # FIXME + if switch_m: + switch_val, remaining = self._separate_at_paren(remaining[switch_m.end() - 1:]) body, expr = self._separate_at_paren(remaining, '}') body = 'switch(%s){%s}' % (switch_val, body) else: body, expr = remaining, '' start, cndn, increment = self._separate(constructor, ';') - if self.interpret_statement(start, local_vars, allow_recursion - 1)[1]: - raise ExtractorError( - f'Premature return in the initialization of a for loop in {constructor!r}') + self.interpret_expression(start, local_vars, allow_recursion) while True: - if not self.interpret_expression(cndn, local_vars, allow_recursion): + if not _js_ternary(self.interpret_expression(cndn, local_vars, allow_recursion)): break try: - ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion - 1) + ret, should_abort = self.interpret_statement(body, local_vars, allow_recursion) if should_abort: - return ret + return ret, True except JS_Break: break except JS_Continue: pass - if self.interpret_statement(increment, local_vars, allow_recursion - 1)[1]: - raise ExtractorError( - f'Premature return in the initialization of a for loop in {constructor!r}') - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + self.interpret_expression(increment, local_vars, allow_recursion) - m = re.match(r'switch\s*\(', expr) - if m: - switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:], ')') + elif md.get('switch'): + switch_val, remaining = self._separate_at_paren(expr[m.end() - 1:]) switch_val = self.interpret_expression(switch_val, local_vars, allow_recursion) body, expr = self._separate_at_paren(remaining, '}') items = body.replace('default:', 'case default:').split('case ')[1:] for default in (False, True): matched = False for item in items: - case, stmt = [i.strip() for i in self._separate(item, ':', 1)] + case, 
stmt = (i.strip() for i in self._separate(item, ':', 1)) if default: matched = matched or case == 'default' elif not matched: - matched = case != 'default' and switch_val == self.interpret_expression(case, local_vars, allow_recursion) + matched = (case != 'default' + and switch_val == self.interpret_expression(case, local_vars, allow_recursion)) if not matched: continue try: - ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion - 1) + ret, should_abort = self.interpret_statement(stmt, local_vars, allow_recursion) if should_abort: return ret except JS_Break: break if matched: break - return self.interpret_statement(expr, local_vars, allow_recursion - 1)[0] + + if md: + ret, should_abort = self.interpret_statement(expr, local_vars, allow_recursion) + return ret, should_abort or should_return # Comma separated statements sub_expressions = list(self._separate(expr)) - expr = sub_expressions.pop().strip() if sub_expressions else '' - for sub_expr in sub_expressions: - self.interpret_expression(sub_expr, local_vars, allow_recursion) + if len(sub_expressions) > 1: + for sub_expr in sub_expressions: + ret, should_abort = self.interpret_statement(sub_expr, local_vars, allow_recursion) + if should_abort: + return ret, True + return ret, False for m in re.finditer(rf'''(?x) (?P<pre_sign>\+\+|--)(?P<var1>{_NAME_RE})| @@ -265,107 +517,123 @@ class JSInterpreter(object): local_vars[var] += 1 if sign[0] == '+' else -1 if m.group('pre_sign'): ret = local_vars[var] - expr = expr[:start] + json.dumps(ret) + expr[end:] - - for op, opfunc in _ASSIGN_OPERATORS: - m = re.match(r'''(?x) - (?P<out>%s)(?:\[(?P<index>[^\]]+?)\])? - \s*%s - (?P<expr>.*)$''' % (_NAME_RE, re.escape(op)), expr) - if not m: - continue - right_val = self.interpret_expression(m.group('expr'), local_vars, allow_recursion) - - if m.groupdict().get('index'): - lvar = local_vars[m.group('out')] - idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) - if not isinstance(idx, int): - raise ExtractorError(f'List indices must be integers: {idx}') - cur = lvar[idx] - val = opfunc(cur, right_val) - lvar[idx] = val - return val - else: - cur = local_vars.get(m.group('out')) - val = opfunc(cur, right_val) - local_vars[m.group('out')] = val - return val + expr = expr[:start] + self._dump(ret, local_vars) + expr[end:] - if expr.isdigit(): - return int(expr) - - if expr == 'break': + if not expr: + return None, should_return + + m = re.match(fr'''(?x) + (?P<assign> + (?P<out>{_NAME_RE})(?:\[(?P<index>[^\]]+?)\])?\s* + (?P<op>{"|".join(map(re.escape, set(_OPERATORS) - _COMP_OPERATORS))})? 
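
Note: the switch handling above makes two passes over the case list: first matching concrete cases with JS fall-through (once one case matches, every following statement runs until a break), then, if nothing matched, re-running with the default clause armed. In miniature (JS_Break is modelled here as a plain 'break' sentinel):

    def js_switch(value, items):
        # items: (case_label, stmt) pairs; 'default' marks the default clause
        ran = []
        for default in (False, True):
            matched = False
            for case, stmt in items:
                if default:
                    matched = matched or case == 'default'
                elif not matched:
                    matched = case != 'default' and case == value
                if not matched:
                    continue
                if stmt == 'break':
                    return ran
                ran.append(stmt)
            if matched:
                break
        return ran

    items = [('1', 'one'), ('2', 'two'), ('3', 'three'), ('3', 'break'), ('default', 'dflt')]
    print(js_switch('2', items))   # ['two', 'three'] -- falls through until the break
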
+ =(?!=)(?P<expr>.*)$ + )|(?P<return> + (?!if|return|true|false|null|undefined|NaN)(?P<name>{_NAME_RE})$ + )|(?P<indexing> + (?P<in>{_NAME_RE})\[(?P<idx>.+)\]$ + )|(?P<attribute> + (?P<var>{_NAME_RE})(?:(?P<nullish>\?)?\.(?P<member>[^(]+)|\[(?P<member2>[^\]]+)\])\s* + )|(?P<function> + (?P<fname>{_NAME_RE})\((?P<args>.*)\)$ + )''', expr) + if m and m.group('assign'): + left_val = local_vars.get(m.group('out')) + + if not m.group('index'): + local_vars[m.group('out')] = self._operator( + m.group('op'), left_val, m.group('expr'), expr, local_vars, allow_recursion) + return local_vars[m.group('out')], should_return + elif left_val in (None, JS_Undefined): + raise self.Exception(f'Cannot index undefined variable {m.group("out")}', expr) + + idx = self.interpret_expression(m.group('index'), local_vars, allow_recursion) + if not isinstance(idx, (int, float)): + raise self.Exception(f'List index {idx} must be integer', expr) + idx = int(idx) + left_val[idx] = self._operator( + m.group('op'), self._index(left_val, idx), m.group('expr'), expr, local_vars, allow_recursion) + return left_val[idx], should_return + + elif expr.isdigit(): + return int(expr), should_return + + elif expr == 'break': raise JS_Break() elif expr == 'continue': raise JS_Continue() + elif expr == 'undefined': + return JS_Undefined, should_return + elif expr == 'NaN': + return float('NaN'), should_return - var_m = re.match( - r'(?!if|return|true|false|null)(?P<name>%s)$' % _NAME_RE, - expr) - if var_m: - return local_vars[var_m.group('name')] + elif m and m.group('return'): + return local_vars.get(m.group('name'), JS_Undefined), should_return - try: - return json.loads(expr) - except ValueError: - pass + with contextlib.suppress(ValueError): + return json.loads(js_to_json(expr, strict=True)), should_return - m = re.match( - r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr) - if m: + if m and m.group('indexing'): val = local_vars[m.group('in')] idx = self.interpret_expression(m.group('idx'), local_vars, allow_recursion) - return val[idx] + return self._index(val, idx), should_return - for op, opfunc in _OPERATORS: + for op in _OPERATORS: separated = list(self._separate(expr, op)) - if len(separated) < 2: + right_expr = separated.pop() + while True: + if op in '?<>*-' and len(separated) > 1 and not separated[-1].strip(): + separated.pop() + elif not (separated and op == '?' 
and right_expr.startswith('.')): + break + right_expr = f'{op}{right_expr}' + if op != '-': + right_expr = f'{separated.pop()}{op}{right_expr}' + if not separated: continue - right_val = separated.pop() - left_val = op.join(separated) - left_val, should_abort = self.interpret_statement( - left_val, local_vars, allow_recursion - 1) - if should_abort: - raise ExtractorError(f'Premature left-side return of {op} in {expr!r}') - right_val, should_abort = self.interpret_statement( - right_val, local_vars, allow_recursion - 1) - if should_abort: - raise ExtractorError(f'Premature right-side return of {op} in {expr!r}') - return opfunc(left_val or 0, right_val) + left_val = self.interpret_expression(op.join(separated), local_vars, allow_recursion) + return self._operator(op, left_val, right_expr, expr, local_vars, allow_recursion), should_return - m = re.match( - r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*' % _NAME_RE, - expr) - if m: - variable = m.group('var') - member = remove_quotes(m.group('member') or m.group('member2')) + if m and m.group('attribute'): + variable, member, nullish = m.group('var', 'member', 'nullish') + if not member: + member = self.interpret_expression(m.group('member2'), local_vars, allow_recursion) arg_str = expr[m.end():] if arg_str.startswith('('): - arg_str, remaining = self._separate_at_paren(arg_str, ')') + arg_str, remaining = self._separate_at_paren(arg_str) else: arg_str, remaining = None, arg_str def assertion(cndn, msg): """ assert, but without risk of getting optimized out """ if not cndn: - raise ExtractorError(f'{member} {msg}: {expr}') + raise self.Exception(f'{member} {msg}', expr) def eval_method(): - nonlocal member - if variable == 'String': - obj = str - elif variable in local_vars: - obj = local_vars[variable] - else: + if (variable, member) == ('console', 'debug'): + if Debugger.ENABLED: + Debugger.write(self.interpret_expression(f'[{arg_str}]', local_vars, allow_recursion)) + return + + types = { + 'String': str, + 'Math': float, + } + obj = local_vars.get(variable, types.get(variable, NO_DEFAULT)) + if obj is NO_DEFAULT: if variable not in self._objects: - self._objects[variable] = self.extract_object(variable) - obj = self._objects[variable] + try: + self._objects[variable] = self.extract_object(variable) + except self.Exception: + if not nullish: + raise + obj = self._objects.get(variable, JS_Undefined) + + if nullish and obj is JS_Undefined: + return JS_Undefined + # Member access if arg_str is None: - # Member access - if member == 'length': - return len(obj) - return obj[member] + return self._index(obj, member, nullish) # Function call argvals = [ @@ -376,12 +644,17 @@ class JSInterpreter(object): if member == 'fromCharCode': assertion(argvals, 'takes one or more arguments') return ''.join(map(chr, argvals)) - raise ExtractorError(f'Unsupported string method {member}') + raise self.Exception(f'Unsupported String method {member}', expr) + elif obj == float: + if member == 'pow': + assertion(len(argvals) == 2, 'takes two arguments') + return argvals[0] ** argvals[1] + raise self.Exception(f'Unsupported Math method {member}', expr) if member == 'split': assertion(argvals, 'takes one or more arguments') - assertion(argvals == [''], 'with arguments is not implemented') - return list(obj) + assertion(len(argvals) == 1, 'with limit argument is not implemented') + return obj.split(argvals[0]) if argvals[0] else list(obj) elif member == 'join': assertion(isinstance(obj, list), 'must be applied on a list') assertion(len(argvals) == 
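
Note: the `split` emulation above papers over a JS/Python mismatch: JS `'abc'.split('')` explodes the string into characters, whereas Python's `str.split('')` raises ValueError. Reduced to a helper:

    def js_split(s, sep):
        # empty separator means split into characters, as in JS
        return s.split(sep) if sep else list(s)

    print(js_split('a,b,c', ','))   # ['a', 'b', 'c']
    print(js_split('abc', ''))      # ['a', 'b', 'c']
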
1, 'takes exactly one argument') @@ -427,7 +700,7 @@ class JSInterpreter(object): assertion(argvals, 'takes one or more arguments') assertion(len(argvals) <= 2, 'takes at-most 2 arguments') f, this = (argvals + [''])[:2] - return [f((item, idx, obj), this=this) for idx, item in enumerate(obj)] + return [f((item, idx, obj), {'this': this}, allow_recursion) for idx, item in enumerate(obj)] elif member == 'indexOf': assertion(argvals, 'takes one or more arguments') assertion(len(argvals) <= 2, 'takes at-most 2 arguments') @@ -436,32 +709,43 @@ class JSInterpreter(object): return obj.index(idx, start) except ValueError: return -1 + elif member == 'charCodeAt': + assertion(isinstance(obj, str), 'must be applied on a string') + assertion(len(argvals) == 1, 'takes exactly one argument') + idx = argvals[0] if isinstance(argvals[0], int) else 0 + if idx >= len(obj): + return None + return ord(obj[idx]) - if isinstance(obj, list): - member = int(member) - return obj[member](argvals) + idx = int(member) if isinstance(obj, list) else member + return obj[idx](argvals, allow_recursion=allow_recursion) if remaining: - return self.interpret_expression( + ret, should_abort = self.interpret_statement( self._named_object(local_vars, eval_method()) + remaining, local_vars, allow_recursion) + return ret, should_return or should_abort else: - return eval_method() + return eval_method(), should_return - m = re.match(r'^(?P<func>%s)\((?P<args>[a-zA-Z0-9_$,]*)\)$' % _NAME_RE, expr) - if m: - fname = m.group('func') - argvals = tuple([ - int(v) if v.isdigit() else local_vars[v] - for v in self._separate(m.group('args'))]) + elif m and m.group('function'): + fname = m.group('fname') + argvals = [self.interpret_expression(v, local_vars, allow_recursion) + for v in self._separate(m.group('args'))] if fname in local_vars: - return local_vars[fname](argvals) + return local_vars[fname](argvals, allow_recursion=allow_recursion), should_return elif fname not in self._functions: self._functions[fname] = self.extract_function(fname) - return self._functions[fname](argvals) + return self._functions[fname](argvals, allow_recursion=allow_recursion), should_return + + raise self.Exception( + f'Unsupported JS expression {truncate_string(expr, 20, 20) if expr != stmt else ""}', stmt) - if expr: - raise ExtractorError('Unsupported JS expression %r' % expr) + def interpret_expression(self, expr, local_vars, allow_recursion): + ret, should_return = self.interpret_statement(expr, local_vars, allow_recursion) + if should_return: + raise self.Exception('Cannot return from an expression', expr) + return ret def extract_object(self, objname): _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')''' @@ -473,12 +757,14 @@ class JSInterpreter(object): }\s*; ''' % (re.escape(objname), _FUNC_NAME_RE), self.code) + if not obj_m: + raise self.Exception(f'Could not find object {objname}') fields = obj_m.group('fields') # Currently, it only supports function definitions fields_m = re.finditer( r'''(?x) - (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)} - ''' % _FUNC_NAME_RE, + (?P<key>%s)\s*:\s*function\s*\((?P<args>(?:%s|,)*)\){(?P<code>[^}]+)} + ''' % (_FUNC_NAME_RE, _NAME_RE), fields) for f in fields_m: argnames = f.group('args').split(',') @@ -489,16 +775,19 @@ class JSInterpreter(object): def extract_function_code(self, funcname): """ @returns argnames, code """ func_m = re.search( - r'''(?x) - (?:function\s+%s|[{;,]\s*%s\s*=\s*function|var\s+%s\s*=\s*function)\s* + r'''(?xs) + (?: + function\s+%(name)s| + 
[{;,]\s*%(name)s\s*=\s*function| + (?:var|const|let)\s+%(name)s\s*=\s*function + )\s* \((?P<args>[^)]*)\)\s* - (?P<code>\{(?:(?!};)[^"]|"([^"]|\\")*")+\})''' % ( - re.escape(funcname), re.escape(funcname), re.escape(funcname)), + (?P<code>{.+})''' % {'name': re.escape(funcname)}, self.code) - code, _ = self._separate_at_paren(func_m.group('code'), '}') # refine the match + code, _ = self._separate_at_paren(func_m.group('code')) if func_m is None: - raise ExtractorError('Could not find JS function %r' % funcname) - return func_m.group('args').split(','), code + raise self.Exception(f'Could not find JS function "{funcname}"') + return [x.strip() for x in func_m.group('args').split(',')], code def extract_function(self, funcname): return self.extract_function_from_code(*self.extract_function_code(funcname)) @@ -510,12 +799,10 @@ class JSInterpreter(object): if mobj is None: break start, body_start = mobj.span() - body, remaining = self._separate_at_paren(code[body_start - 1:], '}') - name = self._named_object( - local_vars, - self.extract_function_from_code( - [str.strip(x) for x in mobj.group('args').split(',')], - body, local_vars, *global_stack)) + body, remaining = self._separate_at_paren(code[body_start - 1:]) + name = self._named_object(local_vars, self.extract_function_from_code( + [x.strip() for x in mobj.group('args').split(',')], + body, local_vars, *global_stack)) code = code[:start] + name + remaining return self.build_function(argnames, code, local_vars, *global_stack) @@ -524,17 +811,13 @@ class JSInterpreter(object): def build_function(self, argnames, code, *global_stack): global_stack = list(global_stack) or [{}] - local_vars = global_stack.pop(0) - - def resf(args, **kwargs): - local_vars.update({ - **dict(zip(argnames, args)), - **kwargs - }) - var_stack = LocalNameSpace(local_vars, *global_stack) - for stmt in self._separate(code.replace('\n', ''), ';'): - ret, should_abort = self.interpret_statement(stmt, var_stack) - if should_abort: - break - return ret + argnames = tuple(argnames) + + def resf(args, kwargs={}, allow_recursion=100): + global_stack[0].update(itertools.zip_longest(argnames, args, fillvalue=None)) + global_stack[0].update(kwargs) + var_stack = LocalNameSpace(*global_stack) + ret, should_abort = self.interpret_statement(code.replace('\n', ' '), var_stack, allow_recursion - 1) + if should_abort: + return ret return resf diff --git a/hypervideo_dl/minicurses.py b/hypervideo_dl/minicurses.py index f9f99e3..7db02cb 100644 --- a/hypervideo_dl/minicurses.py +++ b/hypervideo_dl/minicurses.py @@ -1,7 +1,7 @@ import functools from threading import Lock -from .utils import supports_terminal_sequences, write_string +from .utils import supports_terminal_sequences, write_string CONTROL_SEQUENCES = { 'DOWN': '\n', @@ -34,7 +34,7 @@ def format_text(text, f): ''' @param f String representation of formatting to apply in the form: [style] [light] font_color [on [light] bg_color] - Eg: "red", "bold green on light blue" + E.g. 
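
Note: taken together, `extract_function_code`/`build_function` let callers run small JS snippets. A plausible usage sketch, assuming the `call_function` convenience wrapper defined elsewhere in this module:

    from hypervideo_dl.jsinterp import JSInterpreter

    jsi = JSInterpreter('function f(x){var y = x * 2; return y + 1}')
    print(jsi.call_function('f', 20))   # 41
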
"red", "bold green on light blue" ''' f = f.upper() tokens = f.strip().split() @@ -69,6 +69,7 @@ def format_text(text, f): raise SyntaxError(f'Invalid format {" ".join(tokens)!r} in {f!r}') if fg_color or bg_color: + text = text.replace(CONTROL_SEQUENCES['RESET'], f'{fg_color}{bg_color}') return f'{fg_color}{bg_color}{text}{CONTROL_SEQUENCES["RESET"]}' else: return text @@ -178,4 +179,4 @@ class MultilinePrinter(MultilinePrinterBase): *text, CONTROL_SEQUENCES['ERASE_LINE'], f'{CONTROL_SEQUENCES["UP"]}{CONTROL_SEQUENCES["ERASE_LINE"]}' * self.maximum) else: - self.write(*text, ' ' * self._lastlength) + self.write('\r', ' ' * self._lastlength, '\r') diff --git a/hypervideo_dl/options.py b/hypervideo_dl/options.py index b91193a..bf8684c 100644 --- a/hypervideo_dl/options.py +++ b/hypervideo_dl/options.py @@ -1,54 +1,49 @@ -from __future__ import unicode_literals - -import os.path +import collections +import contextlib import optparse +import os.path import re +import shlex +import shutil +import string import sys -from .compat import ( - compat_expanduser, - compat_get_terminal_size, - compat_getenv, - compat_kwargs, - compat_shlex_split, -) -from .utils import ( - Config, - expand_path, - get_executable_path, - OUTTMPL_TYPES, - POSTPROCESS_WHEN, - remove_end, - write_string, -) +from .compat import compat_expanduser from .cookies import SUPPORTED_BROWSERS, SUPPORTED_KEYRINGS -from .version import __version__ - from .downloader.external import list_external_downloaders from .postprocessor import ( FFmpegExtractAudioPP, + FFmpegMergerPP, FFmpegSubtitlesConvertorPP, FFmpegThumbnailsConvertorPP, FFmpegVideoRemuxerPP, SponsorBlockPP, ) from .postprocessor.modify_chapters import DEFAULT_SPONSORBLOCK_CHAPTER_TITLE +from .utils import ( + OUTTMPL_TYPES, + POSTPROCESS_WHEN, + Config, + deprecation_warning, + expand_path, + format_field, + get_executable_path, + join_nonempty, + orderedSet_from_options, + remove_end, + write_string, +) +from .version import __version__ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): - parser = create_parser() - root = Config(parser) - + root = Config(create_parser()) if ignore_config_files == 'if_override': ignore_config_files = overrideArguments is not None - if overrideArguments: - root.append_config(overrideArguments, label='Override') - else: - root.append_config(sys.argv[1:], label='Command-line') def _readUserConf(package_name, default=[]): # .config - xdg_config_home = compat_getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') + xdg_config_home = os.getenv('XDG_CONFIG_HOME') or compat_expanduser('~/.config') userConfFile = os.path.join(xdg_config_home, package_name, 'config') if not os.path.isfile(userConfFile): userConfFile = os.path.join(xdg_config_home, '%s.conf' % package_name) @@ -57,7 +52,7 @@ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): return userConf, userConfFile # appdata - appdata_dir = compat_getenv('appdata') + appdata_dir = os.getenv('appdata') if appdata_dir: userConfFile = os.path.join(appdata_dir, package_name, 'config') userConf = Config.read_file(userConfFile, default=None) @@ -80,10 +75,10 @@ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): def add_config(label, path, user=False): """ Adds config and returns whether to continue """ - if root.parse_args()[0].ignoreconfig: + if root.parse_known_args()[0].ignoreconfig: return False # Multiple package names can be given here - # Eg: ('hypervideo', 'youtube-dlc', 'youtube-dl') will look for + # E.g. 
('hypervideo', 'youtube-dlc', 'youtube-dl') will look for # the configuration file of any of these three packages for package in ('hypervideo',): if user: @@ -99,55 +94,138 @@ def parseOpts(overrideArguments=None, ignore_config_files='if_override'): def load_configs(): yield not ignore_config_files yield add_config('Portable', get_executable_path()) - yield add_config('Home', expand_path(root.parse_args()[0].paths.get('home', '')).strip()) + yield add_config('Home', expand_path(root.parse_known_args()[0].paths.get('home', '')).strip()) yield add_config('User', None, user=True) yield add_config('System', '/etc') - if all(load_configs()): - # If ignoreconfig is found inside the system configuration file, - # the user configuration is removed - if root.parse_args()[0].ignoreconfig: - user_conf = next((i for i, conf in enumerate(root.configs) if conf.label == 'User'), None) - if user_conf is not None: - root.configs.pop(user_conf) + opts = optparse.Values({'verbose': True, 'print_help': False}) + try: + try: + if overrideArguments: + root.append_config(overrideArguments, label='Override') + else: + root.append_config(sys.argv[1:], label='Command-line') + loaded_all_configs = all(load_configs()) + except ValueError as err: + raise root.parser.error(err) + + if loaded_all_configs: + # If ignoreconfig is found inside the system configuration file, + # the user configuration is removed + if root.parse_known_args()[0].ignoreconfig: + user_conf = next((i for i, conf in enumerate(root.configs) if conf.label == 'User'), None) + if user_conf is not None: + root.configs.pop(user_conf) + + try: + root.configs[0].load_configs() # Resolve any aliases using --config-location + except ValueError as err: + raise root.parser.error(err) + + opts, args = root.parse_args() + except optparse.OptParseError: + with contextlib.suppress(optparse.OptParseError): + opts, _ = root.parse_known_args(strict=False) + raise + except (SystemExit, KeyboardInterrupt): + opts.verbose = False + raise + finally: + verbose = opts.verbose and f'\n{root}'.replace('\n| ', '\n[debug] ')[1:] + if verbose: + write_string(f'{verbose}\n') + if opts.print_help: + if verbose: + write_string('\n') + root.parser.print_help() + if opts.print_help: + sys.exit() + return root.parser, opts, args + - opts, args = root.parse_args() - if opts.verbose: - write_string(f'\n{root}'.replace('\n| ', '\n[debug] ')[1:] + '\n') - return parser, opts, args +class _YoutubeDLHelpFormatter(optparse.IndentedHelpFormatter): + def __init__(self): + # No need to wrap help messages if we're on a wide console + max_width = shutil.get_terminal_size().columns or 80 + # The % is chosen to get a pretty output in README.md + super().__init__(width=max_width, max_help_position=int(0.45 * max_width)) + + @staticmethod + def format_option_strings(option): + """ ('-o', '--option') -> -o, --format METAVAR """ + opts = join_nonempty( + option._short_opts and option._short_opts[0], + option._long_opts and option._long_opts[0], + delim=', ') + if option.takes_value(): + opts += f' {option.metavar}' + return opts class _YoutubeDLOptionParser(optparse.OptionParser): # optparse is deprecated since python 3.2. 
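
Note: `_readUserConf` resolves the user configuration per the XDG convention, now via plain `os.getenv` instead of the removed `compat_getenv`. The lookup order, extracted (using `os.path.expanduser` in place of `compat_expanduser`):

    import os
    import os.path

    def user_conf_file(package_name='hypervideo'):
        # $XDG_CONFIG_HOME/<pkg>/config first, then $XDG_CONFIG_HOME/<pkg>.conf;
        # XDG_CONFIG_HOME defaults to ~/.config when unset
        xdg = os.getenv('XDG_CONFIG_HOME') or os.path.expanduser('~/.config')
        candidate = os.path.join(xdg, package_name, 'config')
        if not os.path.isfile(candidate):
            candidate = os.path.join(xdg, f'{package_name}.conf')
        return candidate

    print(user_conf_file())
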
So assume a stable interface even for private methods + ALIAS_DEST = '_triggered_aliases' + ALIAS_TRIGGER_LIMIT = 100 + + def __init__(self): + super().__init__( + prog='hypervideo', + version=__version__, + usage='%prog [OPTIONS] URL [URL...]', + epilog='See full documentation at https://github.com/hypervideo/hypervideo#readme', + formatter=_YoutubeDLHelpFormatter(), + conflict_handler='resolve', + ) + self.set_default(self.ALIAS_DEST, collections.defaultdict(int)) + + _UNKNOWN_OPTION = (optparse.BadOptionError, optparse.AmbiguousOptionError) + _BAD_OPTION = optparse.OptionValueError + + def parse_known_args(self, args=None, values=None, strict=True): + """Same as parse_args, but ignore unknown switches. Similar to argparse.parse_known_args""" + self.rargs, self.largs = self._get_args(args), [] + self.values = values or self.get_default_values() + while self.rargs: + arg = self.rargs[0] + try: + if arg == '--': + del self.rargs[0] + break + elif arg.startswith('--'): + self._process_long_opt(self.rargs, self.values) + elif arg.startswith('-') and arg != '-': + self._process_short_opts(self.rargs, self.values) + elif self.allow_interspersed_args: + self.largs.append(self.rargs.pop(0)) + else: + break + except optparse.OptParseError as err: + if isinstance(err, self._UNKNOWN_OPTION): + self.largs.append(err.opt_str) + elif strict: + if isinstance(err, self._BAD_OPTION): + self.error(str(err)) + raise + return self.check_values(self.values, self.largs) + + def error(self, msg): + msg = f'{self.get_prog_name()}: error: {str(msg).strip()}\n' + raise optparse.OptParseError(f'{self.get_usage()}\n{msg}' if self.usage else msg) + + def _get_args(self, args): + return sys.argv[1:] if args is None else list(args) def _match_long_opt(self, opt): - """Improve ambigious argument resolution by comparing option objects instead of argument strings""" + """Improve ambiguous argument resolution by comparing option objects instead of argument strings""" try: return super()._match_long_opt(opt) except optparse.AmbiguousOptionError as e: - if len(set(self._long_opt[p] for p in e.possibilities)) == 1: + if len({self._long_opt[p] for p in e.possibilities}) == 1: return e.possibilities[0] raise def create_parser(): - def _format_option_string(option): - ''' ('-o', '--option') -> -o, --format METAVAR''' - - opts = [] - - if option._short_opts: - opts.append(option._short_opts[0]) - if option._long_opts: - opts.append(option._long_opts[0]) - if len(opts) > 1: - opts.insert(1, ', ') - - if option.takes_value(): - opts.append(' %s' % option.metavar) - - return ''.join(opts) - def _list_from_options_callback(option, opt_str, value, parser, append=True, delim=',', process=str.strip): # append can be True, False or -1 (prepend) current = list(getattr(parser.values, option.dest)) if append else [] @@ -157,30 +235,16 @@ def create_parser(): current + value if append is True else value + current) def _set_from_options_callback( - option, opt_str, value, parser, delim=',', allowed_values=None, aliases={}, + option, opt_str, value, parser, allowed_values, delim=',', aliases={}, process=lambda x: x.lower().strip()): - current = set(getattr(parser.values, option.dest)) - values = [process(value)] if delim is None else list(map(process, value.split(delim)[::-1])) - while values: - actual_val = val = values.pop() - if not val: - raise optparse.OptionValueError(f'Invalid {option.metavar} for {opt_str}: {value}') - if val == 'all': - current.update(allowed_values) - elif val == '-all': - current = set() - elif val in aliases: - 
values.extend(aliases[val]) - else: - if val[0] == '-': - val = val[1:] - current.discard(val) - else: - current.update([val]) - if allowed_values is not None and val not in allowed_values: - raise optparse.OptionValueError(f'wrong {option.metavar} for {opt_str}: {actual_val}') + values = [process(value)] if delim is None else map(process, value.split(delim)) + try: + requested = orderedSet_from_options(values, collections.ChainMap(aliases, {'all': allowed_values}), + start=getattr(parser.values, option.dest)) + except ValueError as e: + raise optparse.OptionValueError(f'wrong {option.metavar} for {opt_str}: {e.args[0]}') - setattr(parser.values, option.dest, current) + setattr(parser.values, option.dest, set(requested)) def _dict_from_options_callback( option, opt_str, value, parser, @@ -190,9 +254,9 @@ def create_parser(): out_dict = dict(getattr(parser.values, option.dest)) multiple_args = not isinstance(value, str) if multiple_keys: - allowed_keys = r'(%s)(,(%s))*' % (allowed_keys, allowed_keys) + allowed_keys = fr'({allowed_keys})(,({allowed_keys}))*' mobj = re.match( - r'(?i)(?P<keys>%s)%s(?P<val>.*)$' % (allowed_keys, delimiter), + fr'(?i)(?P<keys>{allowed_keys}){delimiter}(?P<val>.*)$', value[0] if multiple_args else value) if mobj is not None: keys, val = mobj.group('keys').split(','), mobj.group('val') @@ -202,7 +266,7 @@ def create_parser(): keys, val = [default_key], value else: raise optparse.OptionValueError( - 'wrong %s formatting; it should be %s, not "%s"' % (opt_str, option.metavar, value)) + f'wrong {opt_str} formatting; it should be {option.metavar}, not "{value}"') try: keys = map(process_key, keys) if process_key else keys val = process(val) if process else val @@ -212,30 +276,45 @@ def create_parser(): out_dict[key] = out_dict.get(key, []) + [val] if append else val setattr(parser.values, option.dest, out_dict) - # No need to wrap help messages if we're on a wide console - columns = compat_get_terminal_size().columns - max_width = columns if columns else 80 - # 47% is chosen because that is how README.md is currently formatted - # and moving help text even further to the right is undesirable. 
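
Note: the rewritten `_set_from_options_callback` delegates the add/remove/alias logic to `orderedSet_from_options`: each value adds a name, removes it when prefixed with '-', or expands through an alias map such as {'all': allowed_values}. A minimal, non-recursive sketch of that contract (upstream also expands aliases recursively and validates against the allowed values):

    def ordered_set_from_options(values, aliases, start=()):
        requested = list(start)
        for val in values:
            discard = val.startswith('-')
            for item in aliases.get(val.lstrip('-'), [val.lstrip('-')]):
                if discard:
                    while item in requested:
                        requested.remove(item)
                elif item not in requested:
                    requested.append(item)
        return requested

    allowed = ['filename', 'multistreams', 'no-live-chat']
    print(ordered_set_from_options(['all', '-multistreams'], {'all': allowed}))
    # ['filename', 'no-live-chat'] -- i.e. the 'youtube-dl' compat alias above
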
- # This can be reduced in the future to get a prettier output - max_help_position = int(0.47 * max_width) + parser = _YoutubeDLOptionParser() + alias_group = optparse.OptionGroup(parser, 'Aliases') + Formatter = string.Formatter() - fmt = optparse.IndentedHelpFormatter(width=max_width, max_help_position=max_help_position) - fmt.format_option_strings = _format_option_string + def _create_alias(option, opt_str, value, parser): + aliases, opts = value + try: + nargs = len({i if f == '' else f + for i, (_, f, _, _) in enumerate(Formatter.parse(opts)) if f is not None}) + opts.format(*map(str, range(nargs))) # validate + except Exception as err: + raise optparse.OptionValueError(f'wrong {opt_str} OPTIONS formatting; {err}') + if alias_group not in parser.option_groups: + parser.add_option_group(alias_group) - kw = { - 'version': __version__, - 'formatter': fmt, - 'usage': '%prog [OPTIONS] URL [URL...]', - 'conflict_handler': 'resolve', - } + aliases = (x if x.startswith('-') else f'--{x}' for x in map(str.strip, aliases.split(','))) + try: + args = [f'ARG{i}' for i in range(nargs)] + alias_group.add_option( + *aliases, nargs=nargs, dest=parser.ALIAS_DEST, type='str' if nargs else None, + metavar=' '.join(args), help=opts.format(*args), action='callback', + callback=_alias_callback, callback_kwargs={'opts': opts, 'nargs': nargs}) + except Exception as err: + raise optparse.OptionValueError(f'wrong {opt_str} formatting; {err}') - parser = _YoutubeDLOptionParser(**compat_kwargs(kw)) + def _alias_callback(option, opt_str, value, parser, opts, nargs): + counter = getattr(parser.values, option.dest) + counter[opt_str] += 1 + if counter[opt_str] > parser.ALIAS_TRIGGER_LIMIT: + raise optparse.OptionValueError(f'Alias {opt_str} exceeded invocation limit') + if nargs == 1: + value = [value] + assert (nargs == 0 and value is None) or len(value) == nargs + parser.rargs[:0] = shlex.split( + opts if value is None else opts.format(*map(shlex.quote, value))) general = optparse.OptionGroup(parser, 'General Options') general.add_option( - '-h', '--help', - action='help', + '-h', '--help', dest='print_help', action='store_true', help='Print this help text and exit') general.add_option( '--version', @@ -266,13 +345,28 @@ def create_parser(): action='store_true', dest='list_extractor_descriptions', default=False, help='Output descriptions of all supported extractors and exit') general.add_option( + '--use-extractors', '--ies', + action='callback', dest='allowed_extractors', metavar='NAMES', type='str', + default=[], callback=_list_from_options_callback, + help=( + 'Extractor names to use separated by commas. ' + 'You can also use regexes, "all", "default" and "end" (end URL matching); ' + 'e.g. --ies "holodex.*,end,youtube". ' + 'Prefix the name with a "-" to exclude it, e.g. --ies default,-generic. ' + 'Use --list-extractors for a list of extractor names. (Alias: --ies)')) + general.add_option( '--force-generic-extractor', action='store_true', dest='force_generic_extractor', default=False, - help='Force extraction to use the generic extractor') + help=optparse.SUPPRESS_HELP) general.add_option( '--default-search', dest='default_search', metavar='PREFIX', - help='Use this prefix for unqualified URLs. For example "gvsearch2:" downloads two videos from google videos for the search term "large apple". Use the value "auto" to let hypervideo guess ("auto_warning" to emit a warning when guessing). "error" just throws an error. 
The default value "fixup_error" repairs broken URLs, but emits an error if this is not possible instead of searching') + help=( + 'Use this prefix for unqualified URLs. ' + 'E.g. "gvsearch2:python" downloads two videos from google videos for the search term "python". ' + 'Use the value "auto" to let hypervideo guess ("auto_warning" to emit a warning when guessing). ' + '"error" just throws an error. The default value "fixup_error" repairs broken URLs, ' + 'but emits an error if this is not possible instead of searching')) general.add_option( '--ignore-config', '--no-config', action='store_true', dest='ignoreconfig', @@ -290,8 +384,8 @@ def create_parser(): '--config-locations', dest='config_locations', metavar='PATH', action='append', help=( - 'Location of the main configuration file; either the path to the config or its containing directory. ' - 'Can be used multiple times and inside other configuration files')) + 'Location of the main configuration file; either the path to the config or its containing directory ' + '("-" for stdin). Can be used multiple times and inside other configuration files')) general.add_option( '--flat-playlist', action='store_const', dest='extract_flat', const='in_playlist', default=False, @@ -327,9 +421,9 @@ def create_parser(): action='store_false', dest='mark_watched', help='Do not mark videos watched (default)') general.add_option( - '--no-colors', + '--no-colors', '--no-colours', action='store_true', dest='no_color', default=False, - help='Do not emit color codes in output') + help='Do not emit color codes in output (Alias: --no-colours)') general.add_option( '--compat-options', metavar='OPTS', dest='compat_opts', default=set(), type='str', @@ -338,26 +432,37 @@ def create_parser(): 'allowed_values': { 'filename', 'filename-sanitization', 'format-sort', 'abort-on-error', 'format-spec', 'no-playlist-metafiles', 'multistreams', 'no-live-chat', 'playlist-index', 'list-formats', 'no-direct-merge', - 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-attach-info-json', 'embed-metadata', - 'embed-thumbnail-atomicparsley', 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', + 'no-attach-info-json', 'embed-metadata', 'embed-thumbnail-atomicparsley', + 'seperate-video-versions', 'no-clean-infojson', 'no-keep-subs', 'no-certifi', + 'no-youtube-channel-redirect', 'no-youtube-unavailable-videos', 'no-youtube-prefer-utc-upload-date', }, 'aliases': { - 'youtube-dl': ['-multistreams', 'all'], - 'youtube-dlc': ['-no-youtube-channel-redirect', '-no-live-chat', 'all'], + 'youtube-dl': ['all', '-multistreams'], + 'youtube-dlc': ['all', '-no-youtube-channel-redirect', '-no-live-chat'], } }, help=( 'Options that can help keep compatibility with youtube-dl or youtube-dlc ' 'configurations by reverting some of the changes made in hypervideo. ' 'See "Differences in default behavior" for details')) + general.add_option( + '--alias', metavar='ALIASES OPTIONS', dest='_', type='str', nargs=2, + action='callback', callback=_create_alias, + help=( + 'Create aliases for an option string. Unless an alias starts with a dash "-", it is prefixed with "--". ' + 'Arguments are parsed according to the Python string formatting mini-language. ' + 'E.g. --alias get-audio,-X "-S=aext:{0},abr -x --audio-format {0}" creates options ' + '"--get-audio" and "-X" that takes an argument (ARG0) and expands to ' + '"-S=aext:ARG0,abr -x --audio-format ARG0". All defined aliases are listed in the --help output. 
+ general.add_option(
+ '--alias', metavar='ALIASES OPTIONS', dest='_', type='str', nargs=2,
+ action='callback', callback=_create_alias,
+ help=(
+ 'Create aliases for an option string. Unless an alias starts with a dash "-", it is prefixed with "--". '
+ 'Arguments are parsed according to the Python string formatting mini-language. '
+ 'E.g. --alias get-audio,-X "-S=aext:{0},abr -x --audio-format {0}" creates options '
+ '"--get-audio" and "-X" that take an argument (ARG0) and expand to '
+ '"-S=aext:ARG0,abr -x --audio-format ARG0". All defined aliases are listed in the --help output. '
+ 'Alias options can trigger more aliases; so be careful to avoid defining recursive options. '
+ f'As a safety measure, each alias may be triggered a maximum of {_YoutubeDLOptionParser.ALIAS_TRIGGER_LIMIT} times. '
+ 'This option can be used multiple times'))

 network = optparse.OptionGroup(parser, 'Network Options')
 network.add_option(
 '--proxy', dest='proxy',
 default=None, metavar='URL',
 help=(
- 'Use the specified HTTP/HTTPS/SOCKS proxy. To enable '
- 'SOCKS proxy, specify a proper scheme. For example '
- 'socks5://user:pass@127.0.0.1:1080/. Pass in an empty string (--proxy "") '
- 'for direct connection'))
+ 'Use the specified HTTP/HTTPS/SOCKS proxy. To enable SOCKS proxy, specify a proper scheme, '
+ 'e.g. socks5://user:pass@127.0.0.1:1080/. Pass in an empty string (--proxy "") for direct connection'))
 network.add_option(
 '--socket-timeout',
 dest='socket_timeout', type=float, default=None, metavar='SECONDS',
@@ -410,15 +515,19 @@ def create_parser():
 selection.add_option(
 '--playlist-start',
 dest='playliststart', metavar='NUMBER', default=1, type=int,
- help='Playlist video to start at (default is %default)')
+ help=optparse.SUPPRESS_HELP)
 selection.add_option(
 '--playlist-end',
 dest='playlistend', metavar='NUMBER', default=None, type=int,
- help='Playlist video to end at (default is last)')
+ help=optparse.SUPPRESS_HELP)
 selection.add_option(
- '--playlist-items',
+ '-I', '--playlist-items',
 dest='playlist_items', metavar='ITEM_SPEC', default=None,
- help='Playlist video items to download. Specify indices of the videos in the playlist separated by commas like: "--playlist-items 1,2,5,8" if you want to download videos indexed 1, 2, 5, 8 in the playlist. You can specify range: "--playlist-items 1-3,7,10-13", it will download the videos at index 1, 2, 3, 7, 10, 11, 12 and 13')
+ help=(
+ 'Comma separated playlist_index of the videos to download. '
+ 'You can specify a range using "[START]:[STOP][:STEP]". For backward compatibility, START-STOP is also supported. '
+ 'Use negative indices to count from the right and negative STEP to download in reverse order. '
+ 'E.g. "-I 1:3,7,-5::2" used on a playlist of size 15 will download the videos at index 1,2,3,7,11,13,15'))
 selection.add_option(
 '--match-title',
 dest='matchtitle', metavar='REGEX',
@@ -430,18 +539,17 @@ def create_parser():
 selection.add_option(
 '--min-filesize',
 metavar='SIZE', dest='min_filesize', default=None,
- help='Do not download any videos smaller than SIZE (e.g. 50k or 44.6m)')
+ help='Abort download if filesize is smaller than SIZE, e.g. 50k or 44.6M')
 selection.add_option(
 '--max-filesize',
 metavar='SIZE', dest='max_filesize', default=None,
- help='Do not download any videos larger than SIZE (e.g. 50k or 44.6m)')
+ help='Abort download if filesize is larger than SIZE, e.g. 50k or 44.6M')
 selection.add_option(
 '--date',
 metavar='DATE', dest='date', default=None,
 help=(
- 'Download only videos uploaded on this date. '
- 'The date can be "YYYYMMDD" or in the format '
- '"(now|today)[+-][0-9](day|week|month|year)(s)?"'))
+ 'Download only videos uploaded on this date. The date can be "YYYYMMDD" or in the format '
+ '[now|today|yesterday][-N[day|week|month|year]]. E.g. --date today-2weeks'))
 selection.add_option(
 '--datebefore',
 metavar='DATE', dest='datebefore', default=None,
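The inclusive "[START]:[STOP][:STEP]" ranges documented for `-I` above behave like 1-based Python slices whose negative indices count from the right. A hypothetical parser that reproduces the example from the help text (hypervideo itself resolves item specs through its playlist-entry machinery, not this function):

```python
def parse_playlist_items(spec, playlist_size):
    """Illustrative expansion of an ITEM_SPEC like "1:3,7,-5::2"."""
    indices = []
    for part in spec.split(','):
        if ':' not in part:
            i = int(part)
            indices.append(i + playlist_size + 1 if i < 0 else i)
            continue
        start, stop, *rest = part.split(':')
        step = int(rest[0]) if rest and rest[0] else 1
        start = int(start) if start else (1 if step > 0 else playlist_size)
        stop = int(stop) if stop else (playlist_size if step > 0 else 1)
        if start < 0:  # negative indices count from the right
            start += playlist_size + 1
        if stop < 0:
            stop += playlist_size + 1
        indices.extend(range(start, stop + (1 if step > 0 else -1), step))
    return indices


print(parse_playlist_items('1:3,7,-5::2', 15))  # [1, 2, 3, 7, 11, 13, 15]
```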
Any field (see "OUTPUT TEMPLATE") can be compared with a ' - 'number or a string using the operators defined in "Filtering formats". ' + 'Generic video filter. Any "OUTPUT TEMPLATE" field can be compared with a ' + 'number or a string using the operators defined in "Filtering Formats". ' 'You can also simply specify a field to match if the field is present, ' 'use "!field" to check if the field is not present, and "&" to check multiple conditions. ' 'Use a "\\" to escape "&" or quotes if needed. If used multiple times, ' - 'the filter matches if atleast one of the conditions are met. Eg: --match-filter ' + 'the filter matches if atleast one of the conditions are met. E.g. --match-filter ' '!is_live --match-filter "like_count>?100 & description~=\'(?i)\\bcats \\& dogs\\b\'" ' 'matches only videos that are not live OR those that have a like count more than 100 ' '(or the like field is not available) and also has a description ' - 'that contains the phrase "cats & dogs" (ignoring case)')) + 'that contains the phrase "cats & dogs" (caseless). ' + 'Use "--match-filter -" to interactively ask whether to download each video')) selection.add_option( '--no-match-filter', metavar='FILTER', dest='match_filter', action='store_const', const=None, @@ -515,11 +624,11 @@ def create_parser(): selection.add_option( '--break-per-input', action='store_true', dest='break_per_url', default=False, - help='Make --break-on-existing and --break-on-reject act only on the current input URL') + help='--break-on-existing, --break-on-reject, --max-downloads, and autonumber resets per input URL') selection.add_option( '--no-break-per-input', action='store_false', dest='break_per_url', - help='--break-on-existing and --break-on-reject terminates the entire download queue') + help='--break-on-existing and similar options terminates the entire download queue') selection.add_option( '--skip-playlist-after-errors', metavar='N', dest='skip_playlist_after_errors', default=None, type=int, @@ -574,6 +683,19 @@ def create_parser(): '--ap-list-mso', action='store_true', dest='ap_list_mso', default=False, help='List all supported multiple-system operators') + authentication.add_option( + '--client-certificate', + dest='client_certificate', metavar='CERTFILE', + help='Path to client certificate file in PEM format. May include the private key') + authentication.add_option( + '--client-certificate-key', + dest='client_certificate_key', metavar='KEYFILE', + help='Path to private key file for client certificate') + authentication.add_option( + '--client-certificate-password', + dest='client_certificate_password', metavar='PASSWORD', + help='Password for client certificate private key, if encrypted. 
' + 'If not provided, and the key is encrypted, hypervideo will ask interactively') video_format = optparse.OptionGroup(parser, 'Video Format Options') video_format.add_option( @@ -590,13 +712,11 @@ def create_parser(): action='store_true', dest='format_sort_force', metavar='FORMAT', default=False, help=( 'Force user specified sort order to have precedence over all fields, ' - 'see "Sorting Formats" for more details')) + 'see "Sorting Formats" for more details (Alias: --S-force)')) video_format.add_option( '--no-format-sort-force', action='store_false', dest='format_sort_force', metavar='FORMAT', default=False, - help=( - 'Some fields have precedence over the user specified sort order (default), ' - 'see "Sorting Formats" for more details')) + help='Some fields have precedence over the user specified sort order (default)') video_format.add_option( '--video-multistreams', action='store_true', dest='allow_multiple_video_streams', default=None, @@ -630,7 +750,7 @@ def create_parser(): video_format.add_option( '--check-formats', action='store_const', const='selected', dest='check_formats', default=None, - help='Check that the selected formats are actually downloadable') + help='Make sure formats are selected only from those that are actually downloadable') video_format.add_option( '--check-all-formats', action='store_true', dest='check_formats', @@ -655,9 +775,9 @@ def create_parser(): '--merge-output-format', action='store', dest='merge_output_format', metavar='FORMAT', default=None, help=( - 'If a merge is required (e.g. bestvideo+bestaudio), ' - 'output to given container format. One of mkv, mp4, ogg, webm, flv. ' - 'Ignored if no merge is required')) + 'Containers that may be used when merging formats, separated by "/", e.g. "mp4/mkv". ' + 'Ignored if no merge is required. ' + f'(currently supported: {", ".join(sorted(FFmpegMergerPP.SUPPORTED_EXTS))})')) video_format.add_option( '--allow-unplayable-formats', action='store_true', dest='allow_unplayable_formats', default=False, @@ -695,14 +815,14 @@ def create_parser(): subtitles.add_option( '--sub-format', action='store', dest='subtitlesformat', metavar='FORMAT', default='best', - help='Subtitle format, accepts formats preference, for example: "srt" or "ass/srt/best"') + help='Subtitle format; accepts formats preference, e.g. "srt" or "ass/srt/best"') subtitles.add_option( '--sub-langs', '--srt-langs', action='callback', dest='subtitleslangs', metavar='LANGS', type='str', default=[], callback=_list_from_options_callback, help=( - 'Languages of the subtitles to download (can be regex) or "all" separated by commas. (Eg: --sub-langs "en.*,ja") ' - 'You can prefix the language code with a "-" to exempt it from the requested languages. (Eg: --sub-langs all,-live_chat) ' + 'Languages of the subtitles to download (can be regex) or "all" separated by commas, e.g. --sub-langs "en.*,ja". ' + 'You can prefix the language code with a "-" to exclude it from the requested languages, e.g. --sub-langs all,-live_chat. ' 'Use --list-subs for a list of available language tags')) downloader = optparse.OptionGroup(parser, 'Download Options') @@ -713,11 +833,11 @@ def create_parser(): downloader.add_option( '-r', '--limit-rate', '--rate-limit', dest='ratelimit', metavar='RATE', - help='Maximum download rate in bytes per second (e.g. 50K or 4.2M)') + help='Maximum download rate in bytes per second, e.g. 
50K or 4.2M') downloader.add_option( '--throttled-rate', dest='throttledratelimit', metavar='RATE', - help='Minimum download rate in bytes per second below which throttling is assumed and the video data is re-extracted (e.g. 100K)') + help='Minimum download rate in bytes per second below which throttling is assumed and the video data is re-extracted, e.g. 100K') downloader.add_option( '-R', '--retries', dest='retries', metavar='RETRIES', default=10, @@ -731,13 +851,26 @@ def create_parser(): dest='fragment_retries', metavar='RETRIES', default=10, help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)') downloader.add_option( + '--retry-sleep', + dest='retry_sleep', metavar='[TYPE:]EXPR', default={}, type='str', + action='callback', callback=_dict_from_options_callback, + callback_kwargs={ + 'allowed_keys': 'http|fragment|file_access|extractor', + 'default_key': 'http', + }, help=( + 'Time to sleep between retries in seconds (optionally) prefixed by the type of retry ' + '(http (default), fragment, file_access, extractor) to apply the sleep to. ' + 'EXPR can be a number, linear=START[:END[:STEP=1]] or exp=START[:END[:BASE=2]]. ' + 'This option can be used multiple times to set the sleep for the different retry types, ' + 'e.g. --retry-sleep linear=1::2 --retry-sleep fragment:exp=1:20')) + downloader.add_option( '--skip-unavailable-fragments', '--no-abort-on-unavailable-fragment', action='store_true', dest='skip_unavailable_fragments', default=True, - help='Skip unavailable fragments for DASH, hlsnative and ISM (default) (Alias: --no-abort-on-unavailable-fragment)') + help='Skip unavailable fragments for DASH, hlsnative and ISM downloads (default) (Alias: --no-abort-on-unavailable-fragment)') downloader.add_option( '--abort-on-unavailable-fragment', '--no-skip-unavailable-fragments', action='store_false', dest='skip_unavailable_fragments', - help='Abort downloading if a fragment is unavailable (Alias: --no-skip-unavailable-fragments)') + help='Abort download if a fragment is unavailable (Alias: --no-skip-unavailable-fragments)') downloader.add_option( '--keep-fragments', action='store_true', dest='keep_fragments', default=False, @@ -749,7 +882,7 @@ def create_parser(): downloader.add_option( '--buffer-size', dest='buffersize', metavar='SIZE', default='1024', - help='Size of download buffer (e.g. 1024 or 16K) (default is %default)') + help='Size of download buffer, e.g. 1024 or 16K (default is %default)') downloader.add_option( '--resize-buffer', action='store_false', dest='noresizebuffer', @@ -762,7 +895,7 @@ def create_parser(): '--http-chunk-size', dest='http_chunk_size', metavar='SIZE', default=None, help=( - 'Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). ' + 'Size of a chunk for chunk-based HTTP downloading, e.g. 10485760 or 10M (default is disabled). 
'
 'May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)'))
 downloader.add_option(
 '--test',
 action='store_true', dest='test', default=False,
 help=optparse.SUPPRESS_HELP)
 downloader.add_option(
 '--playlist-reverse',
- action='store_true',
- help='Download playlist videos in reverse order')
+ action='store_true', dest='playlist_reverse',
+ help=optparse.SUPPRESS_HELP)
 downloader.add_option(
 '--no-playlist-reverse',
 action='store_false', dest='playlist_reverse',
- help='Download playlist videos in default order (default)')
+ help=optparse.SUPPRESS_HELP)
 downloader.add_option(
 '--playlist-random',
- action='store_true',
+ action='store_true', dest='playlist_random',
 help='Download playlist videos in random order')
 downloader.add_option(
+ '--lazy-playlist',
+ action='store_true', dest='lazy_playlist',
+ help='Process entries in the playlist as they are received. This disables n_entries, --playlist-random and --playlist-reverse')
+ downloader.add_option(
+ '--no-lazy-playlist',
+ action='store_false', dest='lazy_playlist',
+ help='Process videos in the playlist only after the entire playlist is parsed (default)')
+ downloader.add_option(
 '--xattr-set-filesize',
 dest='xattr_set_filesize', action='store_true',
 help='Set file xattribute ytdl.filesize with expected file size')
@@ -807,6 +948,14 @@ def create_parser():
 'Do not use the mpegts container for HLS videos. '
 'This is default when not downloading live streams'))
 downloader.add_option(
+ '--download-sections',
+ metavar='REGEX', dest='download_ranges', action='append',
+ help=(
+ 'Download only chapters whose title matches the given regular expression. '
+ 'Time ranges prefixed by a "*" can also be used in place of chapters to download the specified range. '
+ 'Needs ffmpeg. This option can be used multiple times to download multiple sections, '
+ 'e.g. --download-sections "*10:15-inf" --download-sections "intro"'))
+ downloader.add_option(
 '--downloader', '--external-downloader',
 dest='external_downloader', metavar='[PROTO:]NAME', default={}, type='str',
 action='callback', callback=_dict_from_options_callback,
 callback_kwargs={
@@ -817,11 +966,11 @@ def create_parser():
 }, help=(
 'Name or path of the external downloader to use (optionally) prefixed by '
 'the protocols (http, ftp, m3u8, dash, rtsp, rtmp, mms) to use it for. '
- 'Currently supports native, %s (Recommended: aria2c). '
+ f'Currently supports native, {", ".join(sorted(list_external_downloaders()))}. '
 'You can use this option multiple times to set different downloaders for different protocols. '
- 'For example, --downloader aria2c --downloader "dash,m3u8:native" will use '
+ 'E.g. --downloader aria2c --downloader "dash,m3u8:native" will use '
 'aria2c for http/ftp downloads, and the native downloader for dash/m3u8 downloads '
- '(Alias: --external-downloader)' % ', '.join(list_external_downloaders())))
+ '(Alias: --external-downloader)'))
 downloader.add_option(
 '--downloader-args', '--external-downloader-args',
 metavar='NAME:ARGS', dest='external_downloader_args', default={}, type='str',
@@ -829,7 +978,7 @@ def create_parser():
 callback_kwargs={
 'allowed_keys': r'ffmpeg_[io]\d*|%s' % '|'.join(map(re.escape, list_external_downloaders())),
 'default_key': 'default',
- 'process': compat_shlex_split
+ 'process': shlex.split
 }, help=(
 'Give these arguments to the external downloader. '
 'Specify the downloader name and the arguments separated by a colon ":". '
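`--download-sections` overloads a single option with two spec kinds: a chapter-title regex, or a "*START-END" time range where END may be "inf". A rough sketch of that dispatch (the function name and return shape are illustrative; the actual range handling lives in hypervideo's utility layer):

```python
import re


def parse_section_spec(spec):
    """Illustrative split of one --download-sections value into its two cases"""
    if not spec.startswith('*'):
        return 'chapter_regex', re.compile(spec)

    def to_seconds(ts):  # [[HH:]MM:]SS, or "inf" for the end of the video
        if ts in ('inf', 'infinite'):
            return float('inf')
        secs = 0.0
        for part in ts.split(':'):
            secs = secs * 60 + float(part)
        return secs

    start, _, end = spec[1:].partition('-')
    return 'time_range', (to_seconds(start) if start else 0.0,
                          to_seconds(end) if end else float('inf'))


print(parse_section_spec('*10:15-inf'))  # ('time_range', (615.0, inf))
print(parse_section_spec('intro'))       # ('chapter_regex', re.compile('intro'))
```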
@@ -936,7 +1085,8 @@ def create_parser():
 }, help=(
 'Field name or output template to print to screen, optionally prefixed with when to print it, separated by a ":". '
 'Supported values of "WHEN" are the same as that of --use-postprocessor, and "video" (default). '
- 'Implies --quiet and --simulate (unless --no-simulate is used). This option can be used multiple times'))
+ 'Implies --quiet. Implies --simulate unless --no-simulate or later stages of WHEN are used. '
+ 'This option can be used multiple times'))
 verbosity.add_option(
 '--print-to-file',
 metavar='[WHEN:]TEMPLATE FILE', dest='print_to_file', default={}, type='str', nargs=2,
@@ -1028,7 +1178,7 @@ def create_parser():
 'Template for progress outputs, optionally prefixed with one of "download:" (default), '
 '"download-title:" (the console title), "postprocess:", or "postprocess-title:". '
 'The video\'s fields are accessible under the "info" key and '
- 'the progress attributes are accessible under "progress" key. E.g.: '
+ 'the progress attributes are accessible under "progress" key. E.g. '
 # TODO: Document the fields inside "progress"
 '--console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s"'))
 verbosity.add_option(
@@ -1044,6 +1194,10 @@ def create_parser():
 action='store_true', dest='write_pages', default=False,
 help='Write downloaded intermediary pages to files in the current directory to debug problems')
 verbosity.add_option(
+ '--load-pages',
+ action='store_true', dest='load_pages', default=False,
+ help=optparse.SUPPRESS_HELP)
+ verbosity.add_option(
 '--youtube-print-sig-code',
 action='store_true', dest='youtube_print_sig_code', default=False,
 help=optparse.SUPPRESS_HELP)
@@ -1054,7 +1208,7 @@ def create_parser():
 verbosity.add_option(
 '-C', '--call-home',
 dest='call_home', action='store_true', default=False,
- # help='[Broken] Contact the hypervideo server for debugging')
+ # help='Contact the hypervideo server for debugging')
 help=optparse.SUPPRESS_HELP)
 verbosity.add_option(
 '--no-call-home',
@@ -1102,7 +1256,7 @@ def create_parser():
 filesystem.add_option(
 '--output-na-placeholder',
 dest='outtmpl_na_placeholder', metavar='TEXT', default='NA',
- help=('Placeholder value for unavailable meta fields in output filename template (default: "%default")'))
+ help=('Placeholder for unavailable fields in "OUTPUT TEMPLATE" (default: "%default")'))
 filesystem.add_option(
 '--autonumber-size',
 dest='autonumber_size', metavar='NUMBER', type=int,
@@ -1237,14 +1391,15 @@ def create_parser():
 help='Do not read/dump cookies from/to file (default)')
 filesystem.add_option(
 '--cookies-from-browser',
- dest='cookiesfrombrowser', metavar='BROWSER[+KEYRING][:PROFILE]',
+ dest='cookiesfrombrowser', metavar='BROWSER[+KEYRING][:PROFILE][::CONTAINER]',
 help=(
- 'The name of the browser and (optionally) the name/path of '
- 'the profile to load cookies from, separated by a ":". '
+ 'The name of the browser to load cookies from. '
 f'Currently supported browsers are: {", ".join(sorted(SUPPORTED_BROWSERS))}. '
- 'By default, the most recently accessed profile is used. '
- 'The keyring used for decrypting Chromium cookies on Linux can be '
- '(optionally) specified after the browser name separated by a "+". '
+ 'Optionally, the KEYRING used for decrypting Chromium cookies on Linux, '
+ 'the name/path of the PROFILE to load cookies from, '
+ 'and the CONTAINER name (if Firefox) ("none" for no container) '
+ 'can be given with their respective separators. 
' + 'By default, all containers of the most recently accessed profile are used. ' f'Currently supported keyrings are: {", ".join(map(str.lower, sorted(SUPPORTED_KEYRINGS)))}')) filesystem.add_option( '--no-cookies-from-browser', @@ -1252,7 +1407,9 @@ def create_parser(): help='Do not load cookies from browser (default)') filesystem.add_option( '--cache-dir', dest='cachedir', default=None, metavar='DIR', - help='Location in the filesystem where youtube-dl can store some downloaded information (such as client ids and signatures) permanently. By default $XDG_CACHE_HOME/hypervideo or ~/.cache/hypervideo') + help=( + 'Location in the filesystem where hypervideo can store some downloaded information ' + '(such as client ids and signatures) permanently. By default ${XDG_CACHE_HOME}/hypervideo')) filesystem.add_option( '--no-cache-dir', action='store_false', dest='cachedir', help='Disable filesystem caching') @@ -1308,26 +1465,27 @@ def create_parser(): postproc.add_option( '--audio-format', metavar='FORMAT', dest='audioformat', default='best', help=( - 'Specify audio format to convert the audio to when -x is used. Currently supported formats are: ' - 'best (default) or one of %s' % ', '.join(FFmpegExtractAudioPP.SUPPORTED_EXTS))) + 'Format to convert the audio to when -x is used. ' + f'(currently supported: best (default), {", ".join(sorted(FFmpegExtractAudioPP.SUPPORTED_EXTS))}). ' + 'You can specify multiple rules using similar syntax as --remux-video')) postproc.add_option( '--audio-quality', metavar='QUALITY', dest='audioquality', default='5', - help='Specify ffmpeg audio quality to use when converting the audio with -x. Insert a value between 0 (best) and 10 (worst) for VBR or a specific bitrate like 128K (default %default)') + help=( + 'Specify ffmpeg audio quality to use when converting the audio with -x. ' + 'Insert a value between 0 (best) and 10 (worst) for VBR or a specific bitrate like 128K (default %default)')) postproc.add_option( '--remux-video', metavar='FORMAT', dest='remuxvideo', default=None, help=( - 'Remux the video into another container if necessary (currently supported: %s). ' - 'If target container does not support the video/audio codec, remuxing will fail. ' - 'You can specify multiple rules; Eg. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 ' - 'and anything else to mkv.' % ', '.join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS))) + 'Remux the video into another container if necessary ' + f'(currently supported: {", ".join(FFmpegVideoRemuxerPP.SUPPORTED_EXTS)}). ' + 'If target container does not support the video/audio codec, remuxing will fail. You can specify multiple rules; ' + 'e.g. "aac>m4a/mov>mp4/mkv" will remux aac to m4a, mov to mp4 and anything else to mkv')) postproc.add_option( '--recode-video', metavar='FORMAT', dest='recodevideo', default=None, - help=( - 'Re-encode the video into another format if re-encoding is necessary. ' - 'The syntax and supported formats are the same as --remux-video')) + help='Re-encode the video into another format if necessary. The syntax and supported formats are the same as --remux-video') postproc.add_option( '--postprocessor-args', '--ppa', metavar='NAME:ARGS', dest='postprocessor_args', default={}, type='str', @@ -1335,7 +1493,7 @@ def create_parser(): callback_kwargs={ 'allowed_keys': r'\w+(?:\+\w+)?', 'default_key': 'default-compat', - 'process': compat_shlex_split, + 'process': shlex.split, 'multiple_keys': False }, help=( 'Give these arguments to the postprocessors. 
' @@ -1348,7 +1506,7 @@ def create_parser(): 'You can also specify "PP+EXE:ARGS" to give the arguments to the specified executable ' 'only when being used by the specified postprocessor. Additionally, for ffmpeg/ffprobe, ' '"_i"/"_o" can be appended to the prefix optionally followed by a number to pass the argument ' - 'before the specified input/output file. Eg: --ppa "Merger+ffmpeg_i1:-v quiet". ' + 'before the specified input/output file, e.g. --ppa "Merger+ffmpeg_i1:-v quiet". ' 'You can use this option multiple times to give different arguments to different ' 'postprocessors. (Alias: --ppa)')) postproc.add_option( @@ -1424,7 +1582,7 @@ def create_parser(): dest='parse_metadata', metavar='FIELDS REGEX REPLACE', action='append', nargs=3, help='Replace text in a metadata field using the given regex. This option can be used multiple times') postproc.add_option( - '--xattrs', + '--xattrs', '--xattr', action='store_true', dest='xattrs', default=False, help='Write metadata to the video file\'s xattrs (using dublin core and xdg standards)') postproc.add_option( @@ -1491,13 +1649,14 @@ def create_parser(): metavar='FORMAT', dest='convertsubtitles', default=None, help=( 'Convert the subtitles to another format (currently supported: %s) ' - '(Alias: --convert-subtitles)' % ', '.join(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS))) + '(Alias: --convert-subtitles)' % ', '.join(sorted(FFmpegSubtitlesConvertorPP.SUPPORTED_EXTS)))) postproc.add_option( '--convert-thumbnails', metavar='FORMAT', dest='convertthumbnails', default=None, help=( 'Convert the thumbnails to another format ' - '(currently supported: %s) ' % ', '.join(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS))) + f'(currently supported: {", ".join(sorted(FFmpegThumbnailsConvertorPP.SUPPORTED_EXTS))}). ' + 'You can specify multiple rules using similar syntax as --remux-video')) postproc.add_option( '--split-chapters', '--split-tracks', dest='split_chapters', action='store_true', default=False, @@ -1514,9 +1673,7 @@ def create_parser(): metavar='REGEX', dest='remove_chapters', action='append', help=( 'Remove chapters whose title matches the given regular expression. ' - 'Time ranges prefixed by a "*" can also be used in place of chapters to remove the specified range. ' - 'Eg: --remove-chapters "*10:15-15:00" --remove-chapters "intro". ' - 'This option can be used multiple times')) + 'The syntax is the same as --download-sections. This option can be used multiple times')) postproc.add_option( '--no-remove-chapters', dest='remove_chapters', action='store_const', const=None, help='Do not remove any chapters from the file (default)') @@ -1524,9 +1681,8 @@ def create_parser(): '--force-keyframes-at-cuts', action='store_true', dest='force_keyframes_at_cuts', default=False, help=( - 'Force keyframes around the chapters before removing/splitting them. ' - 'Requires a re-encode and thus is very slow, but the resulting video ' - 'may have fewer artifacts around the cuts')) + 'Force keyframes at cuts when downloading/splitting/removing sections. ' + 'This is slow due to needing a re-encode, but the resulting video may have fewer artifacts around the cuts')) postproc.add_option( '--no-force-keyframes-at-cuts', action='store_false', dest='force_keyframes_at_cuts', @@ -1564,14 +1720,14 @@ def create_parser(): 'aliases': {'default': ['all']} }, help=( 'SponsorBlock categories to create chapters for, separated by commas. ' - f'Available categories are all, default(=all), {", ".join(SponsorBlockPP.CATEGORIES.keys())}. 
' - 'You can prefix the category with a "-" to exempt it. See [1] for description of the categories. ' - 'Eg: --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories')) + f'Available categories are {", ".join(SponsorBlockPP.CATEGORIES.keys())}, all and default (=all). ' + 'You can prefix the category with a "-" to exclude it. See [1] for description of the categories. ' + 'E.g. --sponsorblock-mark all,-preview [1] https://wiki.sponsor.ajay.app/w/Segment_Categories')) sponsorblock.add_option( '--sponsorblock-remove', metavar='CATS', dest='sponsorblock_remove', default=set(), action='callback', type='str', callback=_set_from_options_callback, callback_kwargs={ - 'allowed_values': set(SponsorBlockPP.CATEGORIES.keys()) - set(SponsorBlockPP.POI_CATEGORIES.keys()), + 'allowed_values': set(SponsorBlockPP.CATEGORIES.keys()) - set(SponsorBlockPP.NON_SKIPPABLE_CATEGORIES.keys()), # Note: From https://wiki.sponsor.ajay.app/w/Types: # The filler category is very aggressive. # It is strongly recommended to not use this in a client by default. @@ -1581,14 +1737,14 @@ def create_parser(): 'If a category is present in both mark and remove, remove takes precedence. ' 'The syntax and available categories are the same as for --sponsorblock-mark ' 'except that "default" refers to "all,-filler" ' - f'and {", ".join(SponsorBlockPP.POI_CATEGORIES.keys())} is not available')) + f'and {", ".join(SponsorBlockPP.NON_SKIPPABLE_CATEGORIES.keys())} are not available')) sponsorblock.add_option( '--sponsorblock-chapter-title', metavar='TEMPLATE', default=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, dest='sponsorblock_chapter_title', help=( - 'The title template for SponsorBlock chapters created by --sponsorblock-mark. ' - 'The same syntax as the output template is used, but the only available fields are ' - 'start_time, end_time, category, categories, name, category_names. Defaults to "%default"')) + 'An output template for the title of the SponsorBlock chapters created by --sponsorblock-mark. ' + 'The only available fields are start_time, end_time, category, categories, name, category_names. ' + 'Defaults to "%default"')) sponsorblock.add_option( '--no-sponsorblock', default=False, action='store_true', dest='no_sponsorblock', @@ -1656,14 +1812,14 @@ def create_parser(): val.replace(r'\,', ',').strip() for val in re.split(r'(?<!\\),', vals)]) extractor.add_option( '--extractor-args', - metavar='KEY:ARGS', dest='extractor_args', default={}, type='str', + metavar='IE_KEY:ARGS', dest='extractor_args', default={}, type='str', action='callback', callback=_dict_from_options_callback, callback_kwargs={ 'multiple_keys': False, 'process': lambda val: dict( _extractor_arg_parser(*arg.split('=', 1)) for arg in val.split(';')) }, help=( - 'Pass these arguments to the extractor. See "EXTRACTOR ARGUMENTS" for details. ' + 'Pass ARGS arguments to the IE_KEY extractor. See "EXTRACTOR ARGUMENTS" for details. ' 'You can use this option multiple times to give arguments for different extractors')) extractor.add_option( '--youtube-include-dash-manifest', '--no-youtube-skip-dash-manifest', @@ -1703,7 +1859,6 @@ def create_parser(): def _hide_login_info(opts): - write_string( - 'DeprecationWarning: "hypervideo_dl.options._hide_login_info" is deprecated and may be removed in a future version. ' - 'Use "hypervideo_dl.utils.Config.hide_login_info" instead\n') + deprecation_warning(f'"{__name__}._hide_login_info" is deprecated and may be removed ' + 'in a future version. 
Use "hypervideo_dl.utils.Config.hide_login_info" instead') return Config.hide_login_info(opts) diff --git a/hypervideo_dl/postprocessor/__init__.py b/hypervideo_dl/postprocessor/__init__.py index e47631e..f168be4 100644 --- a/hypervideo_dl/postprocessor/__init__.py +++ b/hypervideo_dl/postprocessor/__init__.py @@ -1,27 +1,25 @@ # flake8: noqa: F401 -from ..utils import load_plugins - from .common import PostProcessor from .embedthumbnail import EmbedThumbnailPP -from .exec import ExecPP, ExecAfterDownloadPP +from .exec import ExecAfterDownloadPP, ExecPP from .ffmpeg import ( - FFmpegPostProcessor, - FFmpegCopyStreamPP, FFmpegConcatPP, + FFmpegCopyStreamPP, FFmpegEmbedSubtitlePP, FFmpegExtractAudioPP, FFmpegFixupDuplicateMoovPP, FFmpegFixupDurationPP, - FFmpegFixupStretchedPP, - FFmpegFixupTimestampPP, FFmpegFixupM3u8PP, FFmpegFixupM4aPP, + FFmpegFixupStretchedPP, + FFmpegFixupTimestampPP, FFmpegMergerPP, FFmpegMetadataPP, + FFmpegPostProcessor, + FFmpegSplitChaptersPP, FFmpegSubtitlesConvertorPP, FFmpegThumbnailsConvertorPP, - FFmpegSplitChaptersPP, FFmpegVideoConvertorPP, FFmpegVideoRemuxerPP, ) @@ -35,6 +33,7 @@ from .movefilesafterdownload import MoveFilesAfterDownloadPP from .sponskrub import SponSkrubPP from .sponsorblock import SponsorBlockPP from .xattrpp import XAttrMetadataPP +from ..utils import load_plugins _PLUGIN_CLASSES = load_plugins('postprocessor', 'PP', globals()) diff --git a/hypervideo_dl/postprocessor/common.py b/hypervideo_dl/postprocessor/common.py index 3899646..c3fca35 100644 --- a/hypervideo_dl/postprocessor/common.py +++ b/hypervideo_dl/postprocessor/common.py @@ -1,19 +1,16 @@ -from __future__ import unicode_literals - import functools -import itertools import json import os -import time import urllib.error from ..utils import ( + PostProcessingError, + RetryManager, _configuration_args, + deprecation_warning, encodeFilename, network_exceptions, - PostProcessingError, sanitized_Request, - write_string, ) @@ -47,9 +44,6 @@ class PostProcessor(metaclass=PostProcessorMetaClass): an initial argument and then with the returned value of the previous PostProcessor. - The chain will be stopped if one of them ever returns None or the end - of the chain is reached. - PostProcessor objects follow a "mutual registration" process similar to InfoExtractor objects. @@ -71,21 +65,26 @@ class PostProcessor(metaclass=PostProcessorMetaClass): return name[6:] if name[:6].lower() == 'ffmpeg' else name def to_screen(self, text, prefix=True, *args, **kwargs): - tag = '[%s] ' % self.PP_NAME if prefix else '' if self._downloader: - return self._downloader.to_screen('%s%s' % (tag, text), *args, **kwargs) + tag = '[%s] ' % self.PP_NAME if prefix else '' + return self._downloader.to_screen(f'{tag}{text}', *args, **kwargs) def report_warning(self, text, *args, **kwargs): if self._downloader: return self._downloader.report_warning(text, *args, **kwargs) - def deprecation_warning(self, text): + def deprecation_warning(self, msg): + warn = getattr(self._downloader, 'deprecation_warning', deprecation_warning) + return warn(msg, stacklevel=1) + + def deprecated_feature(self, msg): if self._downloader: - return self._downloader.deprecation_warning(text) - write_string(f'DeprecationWarning: {text}') + return self._downloader.deprecated_feature(msg) + return deprecation_warning(msg, stacklevel=1) def report_error(self, text, *args, **kwargs): - # Exists only for compatibility. Do not use + self.deprecation_warning('"hypervideo_dl.postprocessor.PostProcessor.report_error" is deprecated. 
' + 'raise "hypervideo_dl.utils.PostProcessingError" instead') if self._downloader: return self._downloader.report_error(text, *args, **kwargs) @@ -93,6 +92,12 @@ class PostProcessor(metaclass=PostProcessorMetaClass): if self._downloader: return self._downloader.write_debug(text, *args, **kwargs) + def _delete_downloaded_files(self, *files_to_delete, **kwargs): + if self._downloader: + return self._downloader._delete_downloaded_files(*files_to_delete, **kwargs) + for filename in set(filter(None, files_to_delete)): + os.remove(filename) + def get_param(self, name, default=None, *args, **kwargs): if self._downloader: return self._downloader.params.get(name, default, *args, **kwargs) @@ -171,6 +176,8 @@ class PostProcessor(metaclass=PostProcessorMetaClass): def report_progress(self, s): s['_default_template'] = '%(postprocessor)s %(status)s' % s + if not self._downloader: + return progress_dict = s.copy() progress_dict.pop('info_dict') @@ -179,34 +186,31 @@ class PostProcessor(metaclass=PostProcessorMetaClass): progress_template = self.get_param('progress_template', {}) tmpl = progress_template.get('postprocess') if tmpl: - self._downloader.to_stdout(self._downloader.evaluate_outtmpl(tmpl, progress_dict)) + self._downloader.to_screen( + self._downloader.evaluate_outtmpl(tmpl, progress_dict), skip_eol=True, quiet=False) self._downloader.to_console_title(self._downloader.evaluate_outtmpl( progress_template.get('postprocess-title') or 'hypervideo %(progress._default_template)s', progress_dict)) - def _download_json(self, url, *, expected_http_errors=(404,)): + def _retry_download(self, err, count, retries): # While this is not an extractor, it behaves similar to one and - # so obey extractor_retries and sleep_interval_requests - max_retries = self.get_param('extractor_retries', 3) - sleep_interval = self.get_param('sleep_interval_requests') or 0 + # so obey extractor_retries and "--retry-sleep extractor" + RetryManager.report_retry(err, count, retries, info=self.to_screen, warn=self.report_warning, + sleep_func=self.get_param('retry_sleep_functions', {}).get('extractor')) + def _download_json(self, url, *, expected_http_errors=(404,)): self.write_debug(f'{self.PP_NAME} query: {url}') - for retries in itertools.count(): + for retry in RetryManager(self.get_param('extractor_retries', 3), self._retry_download): try: rsp = self._downloader.urlopen(sanitized_Request(url)) - return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8')) except network_exceptions as e: if isinstance(e, urllib.error.HTTPError) and e.code in expected_http_errors: return None - if retries < max_retries: - self.report_warning(f'{e}. 
Retrying...') - if sleep_interval > 0: - self.to_screen(f'Sleeping {sleep_interval} seconds ...') - time.sleep(sleep_interval) - continue - raise PostProcessingError(f'Unable to communicate with {self.PP_NAME} API: {e}') + retry.error = PostProcessingError(f'Unable to communicate with {self.PP_NAME} API: {e}') + continue + return json.loads(rsp.read().decode(rsp.info().get_param('charset') or 'utf-8')) -class AudioConversionError(PostProcessingError): +class AudioConversionError(PostProcessingError): # Deprecated pass diff --git a/hypervideo_dl/postprocessor/embedthumbnail.py b/hypervideo_dl/postprocessor/embedthumbnail.py index 815221d..7cd3952 100644 --- a/hypervideo_dl/postprocessor/embedthumbnail.py +++ b/hypervideo_dl/postprocessor/embedthumbnail.py @@ -1,37 +1,29 @@ -# coding: utf-8 -from __future__ import unicode_literals - import base64 -import imghdr import os -import subprocess import re - -try: - from mutagen.flac import Picture, FLAC - from mutagen.mp4 import MP4, MP4Cover - from mutagen.oggopus import OggOpus - from mutagen.oggvorbis import OggVorbis - has_mutagen = True -except ImportError: - has_mutagen = False +import subprocess from .common import PostProcessor -from .ffmpeg import ( - FFmpegPostProcessor, - FFmpegThumbnailsConvertorPP, -) +from .ffmpeg import FFmpegPostProcessor, FFmpegThumbnailsConvertorPP +from ..compat import imghdr +from ..dependencies import mutagen from ..utils import ( + Popen, + PostProcessingError, check_executable, encodeArgument, encodeFilename, error_to_compat_str, - Popen, - PostProcessingError, prepend_extension, shell_quote, ) +if mutagen: + from mutagen.flac import FLAC, Picture + from mutagen.mp4 import MP4, MP4Cover + from mutagen.oggopus import OggOpus + from mutagen.oggvorbis import OggVorbis + class EmbedThumbnailPPError(PostProcessingError): pass @@ -61,7 +53,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): return int(mobj.group('w')), int(mobj.group('h')) def _report_run(self, exe, filename): - self.to_screen('%s: Adding thumbnail to "%s"' % (exe, filename)) + self.to_screen(f'{exe}: Adding thumbnail to "{filename}"') @PostProcessor._restrict_to(images=False) def run(self, info): @@ -87,12 +79,10 @@ class EmbedThumbnailPP(FFmpegPostProcessor): original_thumbnail = thumbnail_filename = info['thumbnails'][idx]['filepath'] - # Convert unsupported thumbnail formats to PNG (see #25687, #25717) - # Original behavior was to convert to JPG, but since JPG is a lossy - # format, there will be some additional data loss. - # PNG, on the other hand, is lossless. 
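The `run()` hunk below tries mutagen first for m4a/mp4/mov (imported at the top of this file via `..dependencies`), before falling back to AtomicParsley and ffmpeg. A self-contained sketch of the underlying mutagen call, with a hypothetical helper name; hypervideo's actual code adds error handling, compat checks and thumbnail conversion around this:

```python
from mutagen.mp4 import MP4, MP4Cover


def embed_cover_mp4(video_path, thumbnail_path):
    """Minimal cover-art embed for m4a/mp4/mov via mutagen (sketch only)"""
    with open(thumbnail_path, 'rb') as thumbfile:
        data = thumbfile.read()
    fmt = MP4Cover.FORMAT_PNG if thumbnail_path.endswith('.png') else MP4Cover.FORMAT_JPEG
    meta = MP4(video_path)
    meta['covr'] = [MP4Cover(data, imageformat=fmt)]  # the iTunes cover-art atom
    meta.save()
```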
+ # Convert unsupported thumbnail formats (see #25687, #25717) + # PNG is preferred since JPEG is lossy thumbnail_ext = os.path.splitext(thumbnail_filename)[1][1:] - if thumbnail_ext not in ('jpg', 'jpeg', 'png'): + if info['ext'] not in ('mkv', 'mka') and thumbnail_ext not in ('jpg', 'jpeg', 'png'): thumbnail_filename = convertor.convert_thumbnail(thumbnail_filename, 'png') thumbnail_ext = 'png' @@ -101,8 +91,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor): success = True if info['ext'] == 'mp3': options = [ - '-c', 'copy', '-map', '0:0', '-map', '1:0', '-id3v2_version', '3', - '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (front)"'] + '-c', 'copy', '-map', '0:0', '-map', '1:0', '-write_id3v1', '1', '-id3v2_version', '3', + '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment=Cover (front)'] self._report_run('ffmpeg', filename) self.run_ffmpeg_multiple_files([filename, thumbnail_filename], temp_filename, options) @@ -110,7 +100,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): elif info['ext'] in ['mkv', 'mka']: options = list(self.stream_copy_opts()) - mimetype = 'image/%s' % ('png' if thumbnail_ext == 'png' else 'jpeg') + mimetype = f'image/{thumbnail_ext.replace("jpg", "jpeg")}' old_stream, new_stream = self.get_stream_number( filename, ('tags', 'mimetype'), mimetype) if old_stream is not None: @@ -127,7 +117,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): elif info['ext'] in ['m4a', 'mp4', 'mov']: prefer_atomicparsley = 'embed-thumbnail-atomicparsley' in self.get_param('compat_opts', []) # Method 1: Use mutagen - if not has_mutagen or prefer_atomicparsley: + if not mutagen or prefer_atomicparsley: success = False else: try: @@ -149,7 +139,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor): if not success: success = True atomicparsley = next(( - x for x in ['AtomicParsley', 'atomicparsley'] + # libatomicparsley.so : See https://github.com/xibr/ytdlp-lazy/issues/1 + x for x in ['AtomicParsley', 'atomicparsley', 'libatomicparsley.so'] if check_executable(x, ['-v'])), None) if atomicparsley is None: self.to_screen('Neither mutagen nor AtomicParsley was found. Falling back to ffmpeg') @@ -167,14 +158,12 @@ class EmbedThumbnailPP(FFmpegPostProcessor): self._report_run('atomicparsley', filename) self.write_debug('AtomicParsley command line: %s' % shell_quote(cmd)) - p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - stdout, stderr = p.communicate_or_kill() - if p.returncode != 0: - msg = stderr.decode('utf-8', 'replace').strip() - self.report_warning(f'Unable to embed thumbnails using AtomicParsley; {msg}') + stdout, stderr, returncode = Popen.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + if returncode: + self.report_warning(f'Unable to embed thumbnails using AtomicParsley; {stderr.strip()}') # for formats that don't support thumbnails (like 3gp) AtomicParsley # won't create to the temporary file - if b'No changes' in stdout: + if 'No changes' in stdout: self.report_warning('The file format doesn\'t support embedding a thumbnail') success = False @@ -200,7 +189,7 @@ class EmbedThumbnailPP(FFmpegPostProcessor): raise EmbedThumbnailPPError(f'Unable to embed using ffprobe & ffmpeg; {err}') elif info['ext'] in ['ogg', 'opus', 'flac']: - if not has_mutagen: + if not mutagen: raise EmbedThumbnailPPError('module mutagen was not found. 
Please install using `python -m pip install mutagen`') self._report_run('mutagen', filename) @@ -230,11 +219,9 @@ class EmbedThumbnailPP(FFmpegPostProcessor): os.replace(temp_filename, filename) self.try_utime(filename, mtime, mtime) - - files_to_delete = [thumbnail_filename] - if self._already_have_thumbnail: - if original_thumbnail == thumbnail_filename: - files_to_delete = [] - elif original_thumbnail != thumbnail_filename: - files_to_delete.append(original_thumbnail) - return files_to_delete, info + converted = original_thumbnail != thumbnail_filename + self._delete_downloaded_files( + thumbnail_filename if converted or not self._already_have_thumbnail else None, + original_thumbnail if converted and not self._already_have_thumbnail else None, + info=info) + return [], info diff --git a/hypervideo_dl/postprocessor/exec.py b/hypervideo_dl/postprocessor/exec.py index c0bd6df..65fe6d4 100644 --- a/hypervideo_dl/postprocessor/exec.py +++ b/hypervideo_dl/postprocessor/exec.py @@ -1,14 +1,8 @@ -from __future__ import unicode_literals - import subprocess from .common import PostProcessor from ..compat import compat_shlex_quote -from ..utils import ( - encodeArgument, - PostProcessingError, - variadic, -) +from ..utils import PostProcessingError, encodeArgument, variadic class ExecPP(PostProcessor): diff --git a/hypervideo_dl/postprocessor/execafterdownload.py b/hypervideo_dl/postprocessor/execafterdownload.py deleted file mode 100644 index 64dabe7..0000000 --- a/hypervideo_dl/postprocessor/execafterdownload.py +++ /dev/null @@ -1,31 +0,0 @@ -from __future__ import unicode_literals - -import subprocess - -from .common import PostProcessor -from ..compat import compat_shlex_quote -from ..utils import ( - encodeArgument, - PostProcessingError, -) - - -class ExecAfterDownloadPP(PostProcessor): - def __init__(self, downloader, exec_cmd): - super(ExecAfterDownloadPP, self).__init__(downloader) - self.exec_cmd = exec_cmd - - def run(self, information): - cmd = self.exec_cmd - if '{}' not in cmd: - cmd += ' {}' - - cmd = cmd.replace('{}', compat_shlex_quote(information['filepath'])) - - self._downloader.to_screen('[exec] Executing command: %s' % cmd) - retCode = subprocess.call(encodeArgument(cmd), shell=True) - if retCode != 0: - raise PostProcessingError( - 'Command returned error code %d' % retCode) - - return [], information diff --git a/hypervideo_dl/postprocessor/ffmpeg.py b/hypervideo_dl/postprocessor/ffmpeg.py index 3e6edcf..0471594 100644 --- a/hypervideo_dl/postprocessor/ffmpeg.py +++ b/hypervideo_dl/postprocessor/ffmpeg.py @@ -1,30 +1,30 @@ -from __future__ import unicode_literals - import collections -import io +import contextvars import itertools +import json import os +import re import subprocess import time -import re -import json -from .common import AudioConversionError, PostProcessor - -from ..compat import compat_str +from .common import PostProcessor +from ..compat import functools, imghdr from ..utils import ( + MEDIA_EXTENSIONS, + ISO639Utils, + Popen, + PostProcessingError, + _get_exe_version_output, + deprecation_warning, + detect_exe_version, determine_ext, dfxp2srt, encodeArgument, encodeFilename, + filter_dict, float_or_none, - _get_exe_version_output, - detect_exe_version, is_outdated_version, - ISO639Utils, orderedSet, - Popen, - PostProcessingError, prepend_extension, replace_extension, shell_quote, @@ -33,7 +33,6 @@ from ..utils import ( write_json_file, ) - EXT_TO_OUT_FORMATS = { 'aac': 'adts', 'flac': 'flac', @@ -48,36 +47,48 @@ EXT_TO_OUT_FORMATS = { 'vtt': 'webvtt', 
} ACODECS = { - 'mp3': 'libmp3lame', - 'aac': 'aac', - 'flac': 'flac', - 'm4a': 'aac', - 'opus': 'libopus', - 'vorbis': 'libvorbis', - 'wav': None, - 'alac': None, + # name: (ext, encoder, opts) + 'mp3': ('mp3', 'libmp3lame', ()), + 'aac': ('m4a', 'aac', ('-f', 'adts')), + 'm4a': ('m4a', 'aac', ('-bsf:a', 'aac_adtstoasc')), + 'opus': ('opus', 'libopus', ()), + 'vorbis': ('ogg', 'libvorbis', ()), + 'flac': ('flac', 'flac', ()), + 'alac': ('m4a', None, ('-acodec', 'alac')), + 'wav': ('wav', None, ('-f', 'wav')), } +def create_mapping_re(supported): + return re.compile(r'{0}(?:/{0})*$'.format(r'(?:\s*\w+\s*>)?\s*(?:%s)\s*' % '|'.join(supported))) + + +def resolve_mapping(source, mapping): + """ + Get corresponding item from a mapping string like 'A>B/C>D/E' + @returns (target, error_message) + """ + for pair in mapping.lower().split('/'): + kv = pair.split('>', 1) + if len(kv) == 1 or kv[0].strip() == source: + target = kv[-1].strip() + if target == source: + return target, f'already is in target format {source}' + return target, None + return None, f'could not find a mapping for {source}' + + class FFmpegPostProcessorError(PostProcessingError): pass class FFmpegPostProcessor(PostProcessor): + _ffmpeg_location = contextvars.ContextVar('ffmpeg_location', default=None) + def __init__(self, downloader=None): PostProcessor.__init__(self, downloader) - self._determine_executables() - - def check_version(self): - if not self.available: - raise FFmpegPostProcessorError('ffmpeg not found. Please install or provide the path using --ffmpeg-location') - - required_version = '10-0' if self.basename == 'avconv' else '1.0' - if is_outdated_version( - self._versions[self.basename], required_version): - warning = 'Your copy of %s is outdated, update %s to version %s or newer if you encounter any errors.' % ( - self.basename, self.basename, required_version) - self.report_warning(warning) + self._prefer_ffmpeg = self.get_param('prefer_ffmpeg', True) + self._paths = self._determine_executables() @staticmethod def get_versions_and_features(downloader=None): @@ -88,87 +99,105 @@ class FFmpegPostProcessor(PostProcessor): def get_versions(downloader=None): return FFmpegPostProcessor.get_versions_and_features(downloader)[0] - _version_cache, _features_cache = {}, {} + _ffmpeg_to_avconv = {'ffmpeg': 'avconv', 'ffprobe': 'avprobe'} def _determine_executables(self): - programs = ['avprobe', 'avconv', 'ffmpeg', 'ffprobe'] - - def get_ffmpeg_version(path, prog): - if path in self._version_cache: - self._versions[prog], self._features = self._version_cache[path], self._features_cache.get(path, {}) - return - out = _get_exe_version_output(path, ['-bsfs'], to_screen=self.write_debug) - ver = detect_exe_version(out) if out else False - if ver: - regexs = [ - r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1] - r'n([0-9.]+)$', # Arch Linux - # 1. http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/ - ] - for regex in regexs: - mobj = re.match(regex, ver) - if mobj: - ver = mobj.group(1) - self._versions[prog] = self._version_cache[path] = ver - if prog != 'ffmpeg' or not out: - return + programs = [*self._ffmpeg_to_avconv.keys(), *self._ffmpeg_to_avconv.values()] - mobj = re.search(r'(?m)^\s+libavformat\s+(?:[0-9. ]+)\s+/\s+(?P<runtime>[0-9. 
]+)', out) - lavf_runtime_version = mobj.group('runtime').replace(' ', '') if mobj else None - self._features = self._features_cache[path] = { - 'fdk': '--enable-libfdk-aac' in out, - 'setts': 'setts' in out.splitlines(), - 'needs_adtstoasc': is_outdated_version(lavf_runtime_version, '57.56.100', False), - } - - self.basename = None - self.probe_basename = None - self._paths = None - self._versions = None - self._features = {} - - prefer_ffmpeg = self.get_param('prefer_ffmpeg', True) - location = self.get_param('ffmpeg_location') + location = self.get_param('ffmpeg_location', self._ffmpeg_location.get()) if location is None: - self._paths = {p: p for p in programs} + return {p: p for p in programs} + + if not os.path.exists(location): + self.report_warning( + f'ffmpeg-location {location} does not exist! Continuing without ffmpeg', only_once=True) + return {} + elif os.path.isdir(location): + dirname, basename, filename = location, None, None else: - if not os.path.exists(location): - self.report_warning( - 'ffmpeg-location %s does not exist! ' - 'Continuing without ffmpeg.' % (location)) - self._versions = {} - return - elif os.path.isdir(location): - dirname, basename = location, None - else: - basename = os.path.splitext(os.path.basename(location))[0] - basename = next((p for p in programs if basename.startswith(p)), 'ffmpeg') - dirname = os.path.dirname(os.path.abspath(location)) - if basename in ('ffmpeg', 'ffprobe'): - prefer_ffmpeg = True - - self._paths = dict( - (p, os.path.join(dirname, p)) for p in programs) - if basename: - self._paths[basename] = location - - self._versions = {} - executables = {'basename': ('ffmpeg', 'avconv'), 'probe_basename': ('ffprobe', 'avprobe')} - if prefer_ffmpeg is False: - executables = {k: v[::-1] for k, v in executables.items()} - for var, prefs in executables.items(): - for p in prefs: - get_ffmpeg_version(self._paths[p], p) - if self._versions[p]: - setattr(self, var, p) - break - - if self.basename == 'avconv': - self.deprecation_warning( - 'Support for avconv is deprecated and may be removed in a future version. Use ffmpeg instead') - if self.probe_basename == 'avprobe': - self.deprecation_warning( - 'Support for avprobe is deprecated and may be removed in a future version. Use ffprobe instead') + filename = os.path.basename(location) + basename = next((p for p in programs if p in filename), 'ffmpeg') + dirname = os.path.dirname(os.path.abspath(location)) + if basename in self._ffmpeg_to_avconv.keys(): + self._prefer_ffmpeg = True + + paths = {p: os.path.join(dirname, p) for p in programs} + if basename and basename in filename: + for p in programs: + path = os.path.join(dirname, filename.replace(basename, p)) + if os.path.exists(path): + paths[p] = path + if basename: + paths[basename] = location + return paths + + _version_cache, _features_cache = {None: None}, {} + + def _get_ffmpeg_version(self, prog): + path = self._paths.get(prog) + if path in self._version_cache: + return self._version_cache[path], self._features_cache.get(path, {}) + out = _get_exe_version_output(path, ['-bsfs']) + ver = detect_exe_version(out) if out else False + if ver: + regexs = [ + r'(?:\d+:)?([0-9.]+)-[0-9]+ubuntu[0-9.]+$', # Ubuntu, see [1] + r'n([0-9.]+)$', # Arch Linux + # 1. 
http://www.ducea.com/2006/06/17/ubuntu-package-version-naming-explanation/ + ] + for regex in regexs: + mobj = re.match(regex, ver) + if mobj: + ver = mobj.group(1) + self._version_cache[path] = ver + if prog != 'ffmpeg' or not out: + return ver, {} + + mobj = re.search(r'(?m)^\s+libavformat\s+(?:[0-9. ]+)\s+/\s+(?P<runtime>[0-9. ]+)', out) + lavf_runtime_version = mobj.group('runtime').replace(' ', '') if mobj else None + self._features_cache[path] = features = { + 'fdk': '--enable-libfdk-aac' in out, + 'setts': 'setts' in out.splitlines(), + 'needs_adtstoasc': is_outdated_version(lavf_runtime_version, '57.56.100', False), + } + return ver, features + + @property + def _versions(self): + return filter_dict({self.basename: self._version, self.probe_basename: self._probe_version}) + + @functools.cached_property + def basename(self): + self._version # run property + return self.basename + + @functools.cached_property + def probe_basename(self): + self._probe_version # run property + return self.probe_basename + + def _get_version(self, kind): + executables = (kind, ) + if not self._prefer_ffmpeg: + executables = (kind, self._ffmpeg_to_avconv[kind]) + basename, version, features = next(filter( + lambda x: x[1], ((p, *self._get_ffmpeg_version(p)) for p in executables)), (None, None, {})) + if kind == 'ffmpeg': + self.basename, self._features = basename, features + else: + self.probe_basename = basename + if basename == self._ffmpeg_to_avconv[kind]: + self.deprecated_feature(f'Support for {self._ffmpeg_to_avconv[kind]} is deprecated and ' + f'may be removed in a future version. Use {kind} instead') + return version + + @functools.cached_property + def _version(self): + return self._get_version('ffmpeg') + + @functools.cached_property + def _probe_version(self): + return self._get_version('ffprobe') @property def available(self): @@ -176,7 +205,7 @@ class FFmpegPostProcessor(PostProcessor): @property def executable(self): - return self._paths[self.basename] + return self._paths.get(self.basename) @property def probe_available(self): @@ -184,7 +213,7 @@ class FFmpegPostProcessor(PostProcessor): @property def probe_executable(self): - return self._paths[self.probe_basename] + return self._paths.get(self.probe_basename) @staticmethod def stream_copy_opts(copy=True, *, ext=None): @@ -194,10 +223,18 @@ class FFmpegPostProcessor(PostProcessor): yield from ('-dn', '-ignore_unknown') if copy: yield from ('-c', 'copy') - # For some reason, '-c copy -map 0' is not enough to copy subtitles - if ext in ('mp4', 'mov'): + if ext in ('mp4', 'mov', 'm4a'): yield from ('-c:s', 'mov_text') + def check_version(self): + if not self.available: + raise FFmpegPostProcessorError('ffmpeg not found. Please install or provide the path using --ffmpeg-location') + + required_version = '10-0' if self.basename == 'avconv' else '1.0' + if is_outdated_version(self._version, required_version): + self.report_warning(f'Your copy of {self.basename} is outdated, update {self.basename} ' + f'to version {required_version} or newer if you encounter any errors') + def get_audio_codec(self, path): if not self.probe_available and not self.available: raise PostProcessingError('ffprobe and ffmpeg not found. 
Please install or provide the path using --ffmpeg-location')
@@ -211,15 +248,14 @@ class FFmpegPostProcessor(PostProcessor):
 encodeFilename(self.executable, True),
 encodeArgument('-i')]
 cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True))
- self.write_debug('%s command line: %s' % (self.basename, shell_quote(cmd)))
- handle = Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- stdout_data, stderr_data = handle.communicate_or_kill()
- expected_ret = 0 if self.probe_available else 1
- if handle.wait() != expected_ret:
+ self.write_debug(f'{self.basename} command line: {shell_quote(cmd)}')
+ stdout, stderr, returncode = Popen.run(
+ cmd, text=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ if returncode != (0 if self.probe_available else 1):
 return None
- except (IOError, OSError):
+ except OSError:
 return None
- output = (stdout_data if self.probe_available else stderr_data).decode('ascii', 'ignore')
+ output = stdout if self.probe_available else stderr
 if self.probe_available:
 audio_codec = None
 for line in output.split('\n'):
@@ -253,11 +289,10 @@ class FFmpegPostProcessor(PostProcessor):
 ]
 cmd += opts
- cmd.append(encodeFilename(self._ffmpeg_filename_argument(path), True))
- self.write_debug('ffprobe command line: %s' % shell_quote(cmd))
- p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
- stdout, stderr = p.communicate()
- return json.loads(stdout.decode('utf-8', 'replace'))
+ cmd.append(self._ffmpeg_filename_argument(path))
+ self.write_debug(f'ffprobe command line: {shell_quote(cmd)}')
+ stdout, _, _ = Popen.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+ return json.loads(stdout)
 def get_stream_number(self, path, keys, value):
 streams = self.get_metadata_object(path)['streams']
@@ -277,12 +312,12 @@ class FFmpegPostProcessor(PostProcessor):
 if fatal:
 raise PostProcessingError(f'Unable to determine video duration: {e.msg}')
- def _duration_mismatch(self, d1, d2):
+ def _duration_mismatch(self, d1, d2, tolerance=2):
 if not d1 or not d2:
 return None
 # The duration is often only known to the nearest second. So there can be <1sec disparity naturally.
 # Further excuse an additional <1sec difference.
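 # e.g. a real duration of 300.4s against a reported 301.9s differs by 1.5s,
 # which still passes with the default tolerance of 2s; only a gap larger
 # than `tolerance` is treated as a genuine mismatch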
- return abs(d1 - d2) > 2 + return abs(d1 - d2) > tolerance def run_ffmpeg_multiple_files(self, input_paths, out_path, opts, **kwargs): return self.real_run_ffmpeg( @@ -319,16 +354,15 @@ class FFmpegPostProcessor(PostProcessor): for i, (path, opts) in enumerate(path_opts) if path) self.write_debug('ffmpeg command line: %s' % shell_quote(cmd)) - p = Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - stdout, stderr = p.communicate_or_kill() - if p.returncode not in variadic(expected_retcodes): - stderr = stderr.decode('utf-8', 'replace').strip() + _, stderr, returncode = Popen.run( + cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + if returncode not in variadic(expected_retcodes): self.write_debug(stderr) - raise FFmpegPostProcessorError(stderr.split('\n')[-1]) + raise FFmpegPostProcessorError(stderr.strip().splitlines()[-1]) for out_path, _ in output_path_opts: if out_path: self.try_utime(out_path, oldest_mtime, oldest_mtime) - return stderr.decode('utf-8', 'replace') + return stderr def run_ffmpeg(self, path, out_path, opts, **kwargs): return self.run_ffmpeg_multiple_files([path], out_path, opts, **kwargs) @@ -381,7 +415,7 @@ class FFmpegPostProcessor(PostProcessor): self.real_run_ffmpeg( [(concat_file, ['-hide_banner', '-nostdin', '-f', 'concat', '-safe', '0'])], [(out_file, out_flags)]) - os.remove(concat_file) + self._delete_downloaded_files(concat_file) @classmethod def _concat_spec(cls, in_files, concat_opts=None): @@ -397,12 +431,13 @@ class FFmpegPostProcessor(PostProcessor): class FFmpegExtractAudioPP(FFmpegPostProcessor): - COMMON_AUDIO_EXTS = ('wav', 'flac', 'm4a', 'aiff', 'mp3', 'ogg', 'mka', 'opus', 'wma') - SUPPORTED_EXTS = ('aac', 'flac', 'mp3', 'm4a', 'opus', 'vorbis', 'wav', 'alac') + COMMON_AUDIO_EXTS = MEDIA_EXTENSIONS.common_audio + ('wma', ) + SUPPORTED_EXTS = tuple(ACODECS.keys()) + FORMAT_RE = create_mapping_re(('best', *SUPPORTED_EXTS)) def __init__(self, downloader=None, preferredcodec=None, preferredquality=None, nopostoverwrites=False): FFmpegPostProcessor.__init__(self, downloader) - self._preferredcodec = preferredcodec or 'best' + self.mapping = preferredcodec or 'best' self._preferredquality = float_or_none(preferredquality) self._nopostoverwrites = nopostoverwrites @@ -437,71 +472,47 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): try: FFmpegPostProcessor.run_ffmpeg(self, path, out_path, opts) except FFmpegPostProcessorError as err: - raise AudioConversionError(err.msg) + raise PostProcessingError(f'audio conversion failed: {err.msg}') @PostProcessor._restrict_to(images=False) def run(self, information): orig_path = path = information['filepath'] - orig_ext = information['ext'] - - if self._preferredcodec == 'best' and orig_ext in self.COMMON_AUDIO_EXTS: - self.to_screen('Skipping audio extraction since the file is already in a common audio format') + target_format, _skip_msg = resolve_mapping(information['ext'], self.mapping) + if target_format == 'best' and information['ext'] in self.COMMON_AUDIO_EXTS: + target_format, _skip_msg = None, 'the file is already in a common audio format' + if not target_format: + self.to_screen(f'Not converting audio {orig_path}; {_skip_msg}') return [], information filecodec = self.get_audio_codec(path) if filecodec is None: raise PostProcessingError('WARNING: unable to obtain file audio codec with ffprobe') - more_opts = [] - if self._preferredcodec == 'best' or self._preferredcodec == filecodec or (self._preferredcodec == 'm4a' and filecodec == 
'aac'): - if filecodec == 'aac' and self._preferredcodec in ['m4a', 'best']: - # Lossless, but in another container - acodec = 'copy' - extension = 'm4a' - more_opts = ['-bsf:a', 'aac_adtstoasc'] - elif filecodec in ['aac', 'flac', 'mp3', 'vorbis', 'opus']: - # Lossless if possible - acodec = 'copy' - extension = filecodec - if filecodec == 'aac': - more_opts = ['-f', 'adts'] - if filecodec == 'vorbis': - extension = 'ogg' - elif filecodec == 'alac': - acodec = None - extension = 'm4a' - more_opts += ['-acodec', 'alac'] - else: - # MP3 otherwise. - acodec = 'libmp3lame' - extension = 'mp3' - more_opts = self._quality_args(acodec) + if filecodec == 'aac' and target_format in ('m4a', 'best'): + # Lossless, but in another container + extension, _, more_opts, acodec = *ACODECS['m4a'], 'copy' + elif target_format == 'best' or target_format == filecodec: + # Lossless if possible + try: + extension, _, more_opts, acodec = *ACODECS[filecodec], 'copy' + except KeyError: + extension, acodec, more_opts = ACODECS['mp3'] else: # We convert the audio (lossy if codec is lossy) - acodec = ACODECS[self._preferredcodec] + extension, acodec, more_opts = ACODECS[target_format] if acodec == 'aac' and self._features.get('fdk'): - acodec = 'libfdk_aac' - extension = self._preferredcodec + acodec, more_opts = 'libfdk_aac', [] + + more_opts = list(more_opts) + if acodec != 'copy': more_opts = self._quality_args(acodec) - if self._preferredcodec == 'aac': - more_opts += ['-f', 'adts'] - elif self._preferredcodec == 'm4a': - more_opts += ['-bsf:a', 'aac_adtstoasc'] - elif self._preferredcodec == 'vorbis': - extension = 'ogg' - elif self._preferredcodec == 'wav': - extension = 'wav' - more_opts += ['-f', 'wav'] - elif self._preferredcodec == 'alac': - extension = 'm4a' - more_opts += ['-acodec', 'alac'] - - prefix, sep, ext = path.rpartition('.') # not os.path.splitext, since the latter does not work on unicode in all setups - temp_path = new_path = prefix + sep + extension + + # not os.path.splitext, since the latter does not work on unicode in all setups + temp_path = new_path = f'{path.rpartition(".")[0]}.{extension}' if new_path == path: if acodec == 'copy': - self.to_screen(f'File is already in target format {self._preferredcodec}, skipping') + self.to_screen(f'Not converting audio {orig_path}; file is already in target format {target_format}') return [], information orig_path = prepend_extension(path, 'orig') temp_path = prepend_extension(path, 'temp') @@ -510,14 +521,8 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): self.to_screen('Post-process file %s exists, skipping' % new_path) return [], information - try: - self.to_screen(f'Destination: {new_path}') - self.run_ffmpeg(path, temp_path, acodec, more_opts) - except AudioConversionError as e: - raise PostProcessingError( - 'audio conversion failed: ' + e.msg) - except Exception: - raise PostProcessingError('error running ' + self.basename) + self.to_screen(f'Destination: {new_path}') + self.run_ffmpeg(path, temp_path, acodec, more_opts) os.replace(path, orig_path) os.replace(temp_path, new_path) @@ -527,26 +532,19 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor): # Try to update the date time for extracted audio file. 
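 # `filetime` is the remote modification time recorded by the downloader
 # (e.g. from a Last-Modified header, where available); mirroring it onto the
 # converted file keeps its timestamp consistent with the source download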
if information.get('filetime') is not None: self.try_utime( - new_path, time.time(), information['filetime'], - errnote='Cannot update utime of audio file') + new_path, time.time(), information['filetime'], errnote='Cannot update utime of audio file') return [orig_path], information class FFmpegVideoConvertorPP(FFmpegPostProcessor): - SUPPORTED_EXTS = ('mp4', 'mkv', 'flv', 'webm', 'mov', 'avi', 'mka', 'ogg', *FFmpegExtractAudioPP.SUPPORTED_EXTS) - FORMAT_RE = re.compile(r'{0}(?:/{0})*$'.format(r'(?:\w+>)?(?:%s)' % '|'.join(SUPPORTED_EXTS))) + SUPPORTED_EXTS = (*MEDIA_EXTENSIONS.common_video, *sorted(MEDIA_EXTENSIONS.common_audio + ('aac', 'vorbis'))) + FORMAT_RE = create_mapping_re(SUPPORTED_EXTS) _ACTION = 'converting' def __init__(self, downloader=None, preferedformat=None): - super(FFmpegVideoConvertorPP, self).__init__(downloader) - self._preferedformats = preferedformat.lower().split('/') - - def _target_ext(self, source_ext): - for pair in self._preferedformats: - kv = pair.split('>') - if len(kv) == 1 or kv[0].strip() == source_ext: - return kv[-1].strip() + super().__init__(downloader) + self.mapping = preferedformat @staticmethod def _options(target_ext): @@ -557,11 +555,7 @@ class FFmpegVideoConvertorPP(FFmpegPostProcessor): @PostProcessor._restrict_to(images=False) def run(self, info): filename, source_ext = info['filepath'], info['ext'].lower() - target_ext = self._target_ext(source_ext) - _skip_msg = ( - f'could not find a mapping for {source_ext}' if not target_ext - else f'already is in target format {source_ext}' if source_ext == target_ext - else None) + target_ext, _skip_msg = resolve_mapping(source_ext, self.mapping) if _skip_msg: self.to_screen(f'Not {self._ACTION} media file "{filename}"; {_skip_msg}') return [], info @@ -584,14 +578,16 @@ class FFmpegVideoRemuxerPP(FFmpegVideoConvertorPP): class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): + SUPPORTED_EXTS = ('mp4', 'mov', 'm4a', 'webm', 'mkv', 'mka') + def __init__(self, downloader=None, already_have_subtitle=False): - super(FFmpegEmbedSubtitlePP, self).__init__(downloader) + super().__init__(downloader) self._already_have_subtitle = already_have_subtitle @PostProcessor._restrict_to(images=False) def run(self, info): - if info['ext'] not in ('mp4', 'webm', 'mkv'): - self.to_screen('Subtitles can only be embedded in mp4, webm or mkv files') + if info['ext'] not in self.SUPPORTED_EXTS: + self.to_screen(f'Subtitles can only be embedded in {", ".join(self.SUPPORTED_EXTS)} files') return [], info subtitles = info.get('requested_subtitles') if not subtitles: @@ -600,7 +596,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor): filename = info['filepath'] - # Disabled temporarily. There needs to be a way to overide this + # Disabled temporarily. 
There needs to be a way to override this # in case of duration actually mismatching in extractor # See: https://github.com/hypervideo/hypervideo/issues/1870, https://github.com/hypervideo/hypervideo/issues/1385 ''' @@ -706,14 +702,13 @@ class FFmpegMetadataPP(FFmpegPostProcessor): self.run_ffmpeg_multiple_files( (filename, metadata_filename), temp_filename, itertools.chain(self._options(info['ext']), *options)) - for file in filter(None, files_to_delete): - os.remove(file) # Don't obey --keep-files + self._delete_downloaded_files(*files_to_delete) os.replace(temp_filename, filename) return [], info @staticmethod def _get_chapter_opts(chapters, metadata_filename): - with io.open(metadata_filename, 'wt', encoding='utf-8') as f: + with open(metadata_filename, 'wt', encoding='utf-8') as f: def ffmpeg_escape(text): return re.sub(r'([\\=;#\n])', r'\\\1', text) @@ -737,13 +732,13 @@ class FFmpegMetadataPP(FFmpegPostProcessor): str(info[key]) for key in [f'{meta_prefix}_'] + list(variadic(info_list or meta_list)) if info.get(key) is not None), None) if value not in ('', None): + value = value.replace('\0', '') # nul character cannot be passed in command line metadata['common'].update({meta_f: value for meta_f in variadic(meta_list)}) - # See [1-4] for some info on media metadata/metadata supported - # by ffmpeg. - # 1. https://kdenlive.org/en/project/adding-meta-data-to-mp4-video/ - # 2. https://wiki.multimedia.cx/index.php/FFmpeg_Metadata - # 3. https://kodi.wiki/view/Video_file_tagging + # Info on media metadata/metadata supported by ffmpeg: + # https://wiki.multimedia.cx/index.php/FFmpeg_Metadata + # https://kdenlive.org/en/project/adding-meta-data-to-mp4-video/ + # https://kodi.wiki/view/Video_file_tagging add('title', ('track', 'title')) add('date', 'upload_date') @@ -767,7 +762,10 @@ class FFmpegMetadataPP(FFmpegPostProcessor): for key, value in info.items(): mobj = re.fullmatch(meta_regex, key) if value is not None and mobj: - metadata[mobj.group('i') or 'common'][mobj.group('key')] = value + metadata[mobj.group('i') or 'common'][mobj.group('key')] = value.replace('\0', '') + + # Write id3v1 metadata also since Windows Explorer can't handle id3v2 tags + yield ('-write_id3v1', '1') for name, value in metadata['common'].items(): yield ('-metadata', f'{name}={value}') @@ -801,11 +799,16 @@ class FFmpegMetadataPP(FFmpegPostProcessor): yield ('-map', '-0:%d' % old_stream) new_stream -= 1 - yield ('-attach', infofn, - '-metadata:s:%d' % new_stream, 'mimetype=application/json') + yield ( + '-attach', infofn, + f'-metadata:s:{new_stream}', 'mimetype=application/json', + f'-metadata:s:{new_stream}', 'filename=info.json', + ) class FFmpegMergerPP(FFmpegPostProcessor): + SUPPORTED_EXTS = MEDIA_EXTENSIONS.common_video + @PostProcessor._restrict_to(images=False) def run(self, info): filename = info['filepath'] @@ -895,7 +898,7 @@ class FFmpegFixupTimestampPP(FFmpegFixupPostProcessor): def __init__(self, downloader=None, trim=0.001): # "trim" should be used when the video contains unintended packets - super(FFmpegFixupTimestampPP, self).__init__(downloader) + super().__init__(downloader) assert isinstance(trim, (int, float)) self.trim = str(trim) @@ -930,10 +933,10 @@ class FFmpegFixupDuplicateMoovPP(FFmpegCopyStreamPP): class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): - SUPPORTED_EXTS = ('srt', 'vtt', 'ass', 'lrc') + SUPPORTED_EXTS = MEDIA_EXTENSIONS.subtitles def __init__(self, downloader=None, format=None): - super(FFmpegSubtitlesConvertorPP, self).__init__(downloader) + 
super().__init__(downloader) self.format = format def run(self, info): @@ -975,7 +978,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): with open(dfxp_file, 'rb') as f: srt_data = dfxp2srt(f.read()) - with io.open(srt_file, 'wt', encoding='utf-8') as f: + with open(srt_file, 'wt', encoding='utf-8') as f: f.write(srt_data) old_file = srt_file @@ -992,7 +995,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor): self.run_ffmpeg(old_file, new_file, ['-f', new_format]) - with io.open(new_file, 'rt', encoding='utf-8') as f: + with open(new_file, encoding='utf-8') as f: subs[lang] = { 'ext': new_ext, 'data': f.read(), @@ -1029,8 +1032,8 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor): self.to_screen('Chapter %03d; Destination: %s' % (number, destination)) return ( destination, - ['-ss', compat_str(chapter['start_time']), - '-t', compat_str(chapter['end_time'] - chapter['start_time'])]) + ['-ss', str(chapter['start_time']), + '-t', str(chapter['end_time'] - chapter['start_time'])]) @PostProcessor._restrict_to(images=False) def run(self, info): @@ -1047,29 +1050,28 @@ class FFmpegSplitChaptersPP(FFmpegPostProcessor): destination, opts = self._ffmpeg_args_for_chapter(idx + 1, chapter, info) self.real_run_ffmpeg([(in_file, opts)], [(destination, self.stream_copy_opts())]) if in_file != info['filepath']: - os.remove(in_file) + self._delete_downloaded_files(in_file, msg=None) return [], info class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor): - SUPPORTED_EXTS = ('jpg', 'png', 'webp') + SUPPORTED_EXTS = MEDIA_EXTENSIONS.thumbnails + FORMAT_RE = create_mapping_re(SUPPORTED_EXTS) def __init__(self, downloader=None, format=None): - super(FFmpegThumbnailsConvertorPP, self).__init__(downloader) - self.format = format + super().__init__(downloader) + self.mapping = format - @staticmethod - def is_webp(path): - with open(encodeFilename(path), 'rb') as f: - b = f.read(12) - return b[0:4] == b'RIFF' and b[8:] == b'WEBP' + @classmethod + def is_webp(cls, path): + deprecation_warning(f'{cls.__module__}.{cls.__name__}.is_webp is deprecated') + return imghdr.what(path) == 'webp' def fixup_webp(self, info, idx=-1): thumbnail_filename = info['thumbnails'][idx]['filepath'] _, thumbnail_ext = os.path.splitext(thumbnail_filename) if thumbnail_ext: - thumbnail_ext = thumbnail_ext[1:].lower() - if thumbnail_ext != 'webp' and self.is_webp(thumbnail_filename): + if thumbnail_ext.lower() != '.webp' and imghdr.what(thumbnail_filename) == 'webp': self.to_screen('Correcting thumbnail "%s" extension to webp' % thumbnail_filename) webp_filename = replace_extension(thumbnail_filename, 'webp') os.replace(thumbnail_filename, webp_filename) @@ -1079,17 +1081,18 @@ class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor): @staticmethod def _options(target_ext): + yield from ('-update', '1') if target_ext == 'jpg': - return ['-bsf:v', 'mjpeg2jpeg'] - return [] + yield from ('-bsf:v', 'mjpeg2jpeg') def convert_thumbnail(self, thumbnail_filename, target_ext): thumbnail_conv_filename = replace_extension(thumbnail_filename, target_ext) - self.to_screen('Converting thumbnail "%s" to %s' % (thumbnail_filename, target_ext)) + self.to_screen(f'Converting thumbnail "{thumbnail_filename}" to {target_ext}') + _, source_ext = os.path.splitext(thumbnail_filename) self.real_run_ffmpeg( - [(thumbnail_filename, ['-f', 'image2', '-pattern_type', 'none'])], - [(thumbnail_conv_filename.replace('%', '%%'), self._options(target_ext))]) + [(thumbnail_filename, [] if source_ext == '.gif' else ['-f', 'image2', '-pattern_type', 'none'])], 
+ [(thumbnail_conv_filename, self._options(target_ext))]) return thumbnail_conv_filename def run(self, info): @@ -1102,18 +1105,18 @@ class FFmpegThumbnailsConvertorPP(FFmpegPostProcessor): continue has_thumbnail = True self.fixup_webp(info, idx) - _, thumbnail_ext = os.path.splitext(original_thumbnail) - if thumbnail_ext: - thumbnail_ext = thumbnail_ext[1:].lower() + original_thumbnail = thumbnail_dict['filepath'] # Path can change during fixup + thumbnail_ext = os.path.splitext(original_thumbnail)[1][1:].lower() if thumbnail_ext == 'jpeg': thumbnail_ext = 'jpg' - if thumbnail_ext == self.format: - self.to_screen('Thumbnail "%s" is already in the requested format' % original_thumbnail) + target_ext, _skip_msg = resolve_mapping(thumbnail_ext, self.mapping) + if _skip_msg: + self.to_screen(f'Not converting thumbnail "{original_thumbnail}"; {_skip_msg}') continue - thumbnail_dict['filepath'] = self.convert_thumbnail(original_thumbnail, self.format) + thumbnail_dict['filepath'] = self.convert_thumbnail(original_thumbnail, target_ext) files_to_delete.append(original_thumbnail) info['__files_to_move'][thumbnail_dict['filepath']] = replace_extension( - info['__files_to_move'][original_thumbnail], self.format) + info['__files_to_move'][original_thumbnail], target_ext) if not has_thumbnail: self.to_screen('There aren\'t any thumbnails to convert') @@ -1153,16 +1156,16 @@ class FFmpegConcatPP(FFmpegPostProcessor): entries = info.get('entries') or [] if not any(entries) or (self._only_multi_video and info['_type'] != 'multi_video'): return [], info - elif traverse_obj(entries, (..., 'requested_downloads', lambda _, v: len(v) > 1)): + elif traverse_obj(entries, (..., lambda k, v: k == 'requested_downloads' and len(v) > 1)): raise PostProcessingError('Concatenation is not supported when downloading multiple separate formats') in_files = traverse_obj(entries, (..., 'requested_downloads', 0, 'filepath')) or [] if len(in_files) < len(entries): raise PostProcessingError('Aborting concatenation because some downloads failed') - ie_copy = self._downloader._playlist_infodict(info) exts = traverse_obj(entries, (..., 'requested_downloads', 0, 'ext'), (..., 'ext')) - ie_copy['ext'] = exts[0] if len(set(exts)) == 1 else 'mkv' + ie_copy = collections.ChainMap({'ext': exts[0] if len(set(exts)) == 1 else 'mkv'}, + info, self._downloader._playlist_infodict(info)) out_file = self._downloader.prepare_filename(ie_copy, 'pl_video') files_to_delete = self.concat_files(in_files, out_file) diff --git a/hypervideo_dl/postprocessor/metadatafromtitle.py b/hypervideo_dl/postprocessor/metadatafromtitle.py deleted file mode 100644 index f5c14d9..0000000 --- a/hypervideo_dl/postprocessor/metadatafromtitle.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import PostProcessor - - -class MetadataFromTitlePP(PostProcessor): - def __init__(self, downloader, titleformat): - super(MetadataFromTitlePP, self).__init__(downloader) - self._titleformat = titleformat - self._titleregex = (self.format_to_regex(titleformat) - if re.search(r'%\(\w+\)s', titleformat) - else titleformat) - - def format_to_regex(self, fmt): - r""" - Converts a string like - '%(title)s - %(artist)s' - to a regex like - '(?P<title>.+)\ \-\ (?P<artist>.+)' - """ - lastpos = 0 - regex = '' - # replace %(..)s with regex group and escape other string parts - for match in re.finditer(r'%\((\w+)\)s', fmt): - regex += re.escape(fmt[lastpos:match.start()]) - regex += r'(?P<' + match.group(1) + '>.+)' - lastpos = 
match.end() - if lastpos < len(fmt): - regex += re.escape(fmt[lastpos:]) - return regex - - def run(self, info): - title = info['title'] - match = re.match(self._titleregex, title) - if match is None: - self._downloader.to_screen( - '[fromtitle] Could not interpret title of video as "%s"' - % self._titleformat) - return [], info - for attribute, value in match.groupdict().items(): - info[attribute] = value - self._downloader.to_screen( - '[fromtitle] parsed %s: %s' - % (attribute, value if value is not None else 'NA')) - - return [], info diff --git a/hypervideo_dl/postprocessor/metadataparser.py b/hypervideo_dl/postprocessor/metadataparser.py index 01ee6c1..381182b 100644 --- a/hypervideo_dl/postprocessor/metadataparser.py +++ b/hypervideo_dl/postprocessor/metadataparser.py @@ -1,31 +1,27 @@ import re -from enum import Enum from .common import PostProcessor +from ..utils import Namespace, filter_dict class MetadataParserPP(PostProcessor): - class Actions(Enum): - INTERPRET = 'interpretter' - REPLACE = 'replacer' - def __init__(self, downloader, actions): - PostProcessor.__init__(self, downloader) + super().__init__(downloader) self._actions = [] for f in actions: - action = f[0] - assert isinstance(action, self.Actions) - self._actions.append(getattr(self, action.value)(*f[1:])) + action, *args = f + assert action in self.Actions + self._actions.append(action(self, *args)) @classmethod def validate_action(cls, action, *data): - ''' Each action can be: + """Each action can be: (Actions.INTERPRET, from, to) OR (Actions.REPLACE, field, search, replace) - ''' - if not isinstance(action, cls.Actions): + """ + if action not in cls.Actions: raise ValueError(f'{action!r} is not a valid action') - getattr(cls, action.value)(cls, *data) # So this can raise error to validate + action(cls, *data) # So this can raise error to validate @staticmethod def field_to_template(tmpl): @@ -72,9 +68,9 @@ class MetadataParserPP(PostProcessor): if match is None: self.to_screen(f'Could not interpret {inp!r} as {out!r}') return - for attribute, value in match.groupdict().items(): + for attribute, value in filter_dict(match.groupdict()).items(): info[attribute] = value - self.to_screen('Parsed %s from %r: %r' % (attribute, template, value if value is not None else 'NA')) + self.to_screen(f'Parsed {attribute} from {template!r}: {value!r}') template = self.field_to_template(inp) out_re = re.compile(self.format_to_regex(out)) @@ -99,6 +95,8 @@ class MetadataParserPP(PostProcessor): search_re = re.compile(search) return f + Actions = Namespace(INTERPRET=interpretter, REPLACE=replacer) + class MetadataFromFieldPP(MetadataParserPP): @classmethod diff --git a/hypervideo_dl/postprocessor/modify_chapters.py b/hypervideo_dl/postprocessor/modify_chapters.py index 22506bc..a745b45 100644 --- a/hypervideo_dl/postprocessor/modify_chapters.py +++ b/hypervideo_dl/postprocessor/modify_chapters.py @@ -3,17 +3,9 @@ import heapq import os from .common import PostProcessor -from .ffmpeg import ( - FFmpegPostProcessor, - FFmpegSubtitlesConvertorPP -) +from .ffmpeg import FFmpegPostProcessor, FFmpegSubtitlesConvertorPP from .sponsorblock import SponsorBlockPP -from ..utils import ( - orderedSet, - PostProcessingError, - prepend_extension, -) - +from ..utils import PostProcessingError, orderedSet, prepend_extension _TINY_CHAPTER_DURATION = 1 DEFAULT_SPONSORBLOCK_CHAPTER_TITLE = '[SponsorBlock]: %(category_names)l' @@ -24,7 +16,7 @@ class ModifyChaptersPP(FFmpegPostProcessor): *, 
sponsorblock_chapter_title=DEFAULT_SPONSORBLOCK_CHAPTER_TITLE, force_keyframes=False): FFmpegPostProcessor.__init__(self, downloader) self._remove_chapters_patterns = set(remove_chapters_patterns or []) - self._remove_sponsor_segments = set(remove_sponsor_segments or []) - set(SponsorBlockPP.POI_CATEGORIES.keys()) + self._remove_sponsor_segments = set(remove_sponsor_segments or []) - set(SponsorBlockPP.NON_SKIPPABLE_CATEGORIES.keys()) self._ranges_to_remove = set(remove_ranges or []) self._sponsorblock_chapter_title = sponsorblock_chapter_title self._force_keyframes = force_keyframes @@ -40,14 +32,18 @@ class ModifyChaptersPP(FFmpegPostProcessor): real_duration = self._get_real_video_duration(info['filepath']) if not chapters: - chapters = [{'start_time': 0, 'end_time': real_duration, 'title': info['title']}] + chapters = [{'start_time': 0, 'end_time': info.get('duration') or real_duration, 'title': info['title']}] info['chapters'], cuts = self._remove_marked_arrange_sponsors(chapters + sponsor_chapters) if not cuts: return [], info + elif not info['chapters']: + self.report_warning('You have requested to remove the entire video, which is not possible') + return [], info - if self._duration_mismatch(real_duration, info.get('duration')): - if not self._duration_mismatch(real_duration, info['chapters'][-1]['end_time']): + original_duration, info['duration'] = info.get('duration'), info['chapters'][-1]['end_time'] + if self._duration_mismatch(real_duration, original_duration, 1): + if not self._duration_mismatch(real_duration, info['duration']): self.to_screen(f'Skipping {self.pp_key()} since the video appears to be already cut') return [], info if not info.get('__real_download'): @@ -106,7 +102,7 @@ class ModifyChaptersPP(FFmpegPostProcessor): 'start_time': start, 'end_time': end, 'category': 'manually_removed', - '_categories': [('manually_removed', start, end)], + '_categories': [('manually_removed', start, end, 'Manually removed')], 'remove': True, } for start, end in self._ranges_to_remove) @@ -297,13 +293,12 @@ class ModifyChaptersPP(FFmpegPostProcessor): c.pop('_was_cut', None) cats = c.pop('_categories', None) if cats: - category = min(cats, key=lambda c: c[2] - c[1])[0] - cats = orderedSet(x[0] for x in cats) + category, _, _, category_name = min(cats, key=lambda c: c[2] - c[1]) c.update({ 'category': category, - 'categories': cats, - 'name': SponsorBlockPP.CATEGORIES[category], - 'category_names': [SponsorBlockPP.CATEGORIES[c] for c in cats] + 'categories': orderedSet(x[0] for x in cats), + 'name': category_name, + 'category_names': orderedSet(x[3] for x in cats), }) c['title'] = self._downloader.evaluate_outtmpl(self._sponsorblock_chapter_title, c.copy()) # Merge identically named sponsors. 
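# To illustrate the selection above (a standalone sketch, not part of the patch;
# the `_categories` tuples below are hypothetical): among overlapping SponsorBlock
# categories, the shortest segment decides the primary `category`/`name`, while
# `categories`/`category_names` keep every contributor in order of appearance.
#
#   cats = [
#       ('sponsor', 10.0, 40.0, 'Sponsor'),                  # 30s long
#       ('selfpromo', 25.0, 35.0, 'Unpaid/Self Promotion'),  # 10s -> shortest
#   ]
#   category, _, _, category_name = min(cats, key=lambda c: c[2] - c[1])
#   # category == 'selfpromo', category_name == 'Unpaid/Self Promotion'
#   # orderedSet(x[0] for x in cats) == ['sponsor', 'selfpromo']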
@@ -322,7 +317,7 @@ class ModifyChaptersPP(FFmpegPostProcessor): self.to_screen(f'Removing chapters from {filename}') self.concat_files([in_file] * len(concat_opts), out_file, concat_opts) if in_file != filename: - os.remove(in_file) + self._delete_downloaded_files(in_file, msg=None) return out_file @staticmethod diff --git a/hypervideo_dl/postprocessor/movefilesafterdownload.py b/hypervideo_dl/postprocessor/movefilesafterdownload.py index 1064a8c..23b0924 100644 --- a/hypervideo_dl/postprocessor/movefilesafterdownload.py +++ b/hypervideo_dl/postprocessor/movefilesafterdownload.py @@ -1,13 +1,12 @@ -from __future__ import unicode_literals import os -import shutil from .common import PostProcessor +from ..compat import shutil from ..utils import ( + PostProcessingError, decodeFilename, encodeFilename, make_dir, - PostProcessingError, ) @@ -47,7 +46,7 @@ class MoveFilesAfterDownloadPP(PostProcessor): % (oldfile, newfile)) continue make_dir(newfile, PostProcessingError) - self.to_screen('Moving file "%s" to "%s"' % (oldfile, newfile)) + self.to_screen(f'Moving file "{oldfile}" to "{newfile}"') shutil.move(oldfile, newfile) # os.rename cannot move between volumes info['filepath'] = finalpath diff --git a/hypervideo_dl/postprocessor/sponskrub.py b/hypervideo_dl/postprocessor/sponskrub.py index 400cbcc..4ba2520 100644 --- a/hypervideo_dl/postprocessor/sponskrub.py +++ b/hypervideo_dl/postprocessor/sponskrub.py @@ -1,19 +1,18 @@ -from __future__ import unicode_literals import os +import shlex import subprocess from .common import PostProcessor -from ..compat import compat_shlex_split from ..utils import ( + Popen, + PostProcessingError, check_executable, cli_option, encodeArgument, encodeFilename, + prepend_extension, shell_quote, str_or_none, - Popen, - PostProcessingError, - prepend_extension, ) @@ -79,23 +78,21 @@ class SponSkrubPP(PostProcessor): if not self.cutout: cmd += ['-chapter'] cmd += cli_option(self._downloader.params, '-proxy', 'proxy') - cmd += compat_shlex_split(self.args) # For backward compatibility + cmd += shlex.split(self.args) # For backward compatibility cmd += self._configuration_args(self._exe_name, use_compat=False) cmd += ['--', information['id'], filename, temp_filename] cmd = [encodeArgument(i) for i in cmd] self.write_debug('sponskrub command line: %s' % shell_quote(cmd)) - pipe = None if self.get_param('verbose') else subprocess.PIPE - p = Popen(cmd, stdout=pipe) - stdout = p.communicate_or_kill()[0] + stdout, _, returncode = Popen.run(cmd, text=True, stdout=None if self.get_param('verbose') else subprocess.PIPE) - if p.returncode == 0: + if not returncode: os.replace(temp_filename, filename) self.to_screen('Sponsor sections have been %s' % ('removed' if self.cutout else 'marked')) - elif p.returncode == 3: + elif returncode == 3: self.to_screen('No segments in the SponsorBlock database') else: - msg = stdout.decode('utf-8', 'replace').strip() if stdout else '' - msg = msg.split('\n')[0 if msg.lower().startswith('unrecognised') else -1] - raise PostProcessingError(msg if msg else 'sponskrub failed with error code %s' % p.returncode) + raise PostProcessingError( + stdout.strip().splitlines()[0 if stdout.strip().lower().startswith('unrecognised') else -1] + or f'sponskrub failed with error code {returncode}') return [], information diff --git a/hypervideo_dl/postprocessor/sponsorblock.py b/hypervideo_dl/postprocessor/sponsorblock.py index 7943014..6ba87cd 100644 --- a/hypervideo_dl/postprocessor/sponsorblock.py +++ b/hypervideo_dl/postprocessor/sponsorblock.py @@ -1,9 
+1,9 @@ -from hashlib import sha256 +import hashlib import json import re +import urllib.parse from .ffmpeg import FFmpegPostProcessor -from ..compat import compat_urllib_parse_urlencode class SponsorBlockPP(FFmpegPostProcessor): @@ -14,6 +14,10 @@ class SponsorBlockPP(FFmpegPostProcessor): POI_CATEGORIES = { 'poi_highlight': 'Highlight', } + NON_SKIPPABLE_CATEGORIES = { + **POI_CATEGORIES, + 'chapter': 'Chapter', + } CATEGORIES = { 'sponsor': 'Sponsor', 'intro': 'Intermission/Intro Animation', @@ -23,7 +27,7 @@ class SponsorBlockPP(FFmpegPostProcessor): 'filler': 'Filler Tangent', 'interaction': 'Interaction Reminder', 'music_offtopic': 'Non-Music Section', - **POI_CATEGORIES, + **NON_SKIPPABLE_CATEGORIES } def __init__(self, downloader, categories=None, api='https://sponsor.ajay.app'): @@ -38,7 +42,7 @@ class SponsorBlockPP(FFmpegPostProcessor): return [], info self.to_screen('Fetching SponsorBlock segments') - info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info['duration']) + info['sponsorblock_chapters'] = self._get_sponsor_chapters(info, info.get('duration')) return [], info def _get_sponsor_chapters(self, info, duration): @@ -60,7 +64,8 @@ class SponsorBlockPP(FFmpegPostProcessor): if duration and duration - start_end[1] <= 1: start_end[1] = duration # SponsorBlock duration may be absent or it may deviate from the real one. - return s['videoDuration'] == 0 or not duration or abs(duration - s['videoDuration']) <= 1 + diff = abs(duration - s['videoDuration']) if s['videoDuration'] else 0 + return diff < 1 or (diff < 5 and diff / (start_end[1] - start_end[0]) < 0.05) duration_match = [s for s in segments if duration_filter(s)] if len(duration_match) != len(segments): @@ -68,28 +73,30 @@ class SponsorBlockPP(FFmpegPostProcessor): def to_chapter(s): (start, end), cat = s['segment'], s['category'] + title = s['description'] if cat == 'chapter' else self.CATEGORIES[cat] return { 'start_time': start, 'end_time': end, 'category': cat, - 'title': self.CATEGORIES[cat], - '_categories': [(cat, start, end)] + 'title': title, + 'type': s['actionType'], + '_categories': [(cat, start, end, title)], } sponsor_chapters = [to_chapter(s) for s in duration_match] if not sponsor_chapters: - self.to_screen('No segments were found in the SponsorBlock database') + self.to_screen('No matching segments were found in the SponsorBlock database') else: self.to_screen(f'Found {len(sponsor_chapters)} segments in the SponsorBlock database') return sponsor_chapters def _get_sponsor_segments(self, video_id, service): - hash = sha256(video_id.encode('ascii')).hexdigest() + hash = hashlib.sha256(video_id.encode('ascii')).hexdigest() # SponsorBlock API recommends using first 4 hash characters. - url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' + compat_urllib_parse_urlencode({ + url = f'{self._API_URL}/api/skipSegments/{hash[:4]}?' 
+ urllib.parse.urlencode({ 'service': service, 'categories': json.dumps(self._categories), - 'actionTypes': json.dumps(['skip', 'poi']) + 'actionTypes': json.dumps(['skip', 'poi', 'chapter']) }) for d in self._download_json(url) or []: if d['videoID'] == video_id: diff --git a/hypervideo_dl/postprocessor/xattrpp.py b/hypervideo_dl/postprocessor/xattrpp.py index 93acd6d..f822eff 100644 --- a/hypervideo_dl/postprocessor/xattrpp.py +++ b/hypervideo_dl/postprocessor/xattrpp.py @@ -1,78 +1,63 @@ -from __future__ import unicode_literals +import os from .common import PostProcessor from ..compat import compat_os_name from ..utils import ( - hyphenate_date, - write_xattr, PostProcessingError, XAttrMetadataError, XAttrUnavailableError, + hyphenate_date, + write_xattr, ) class XAttrMetadataPP(PostProcessor): - # - # More info about extended attributes for media: - # http://freedesktop.org/wiki/CommonExtendedAttributes/ - # http://www.freedesktop.org/wiki/PhreedomDraft/ - # http://dublincore.org/documents/usageguide/elements.shtml - # - # TODO: - # * capture youtube keywords and put them in 'user.dublincore.subject' (comma-separated) - # * figure out which xattrs can be used for 'duration', 'thumbnail', 'resolution' - # + """Set extended attributes on downloaded file (if xattr support is found) + + More info about extended attributes for media: + http://freedesktop.org/wiki/CommonExtendedAttributes/ + http://www.freedesktop.org/wiki/PhreedomDraft/ + http://dublincore.org/documents/usageguide/elements.shtml + + TODO: + * capture youtube keywords and put them in 'user.dublincore.subject' (comma-separated) + * figure out which xattrs can be used for 'duration', 'thumbnail', 'resolution' + """ + + XATTR_MAPPING = { + 'user.xdg.referrer.url': 'webpage_url', + # 'user.xdg.comment': 'description', + 'user.dublincore.title': 'title', + 'user.dublincore.date': 'upload_date', + 'user.dublincore.description': 'description', + 'user.dublincore.contributor': 'uploader', + 'user.dublincore.format': 'format', + } def run(self, info): - """ Set extended attributes on downloaded file (if xattr support is found). """ - - # Write the metadata to the file's xattrs + mtime = os.stat(info['filepath']).st_mtime self.to_screen('Writing metadata to file\'s xattrs') - - filename = info['filepath'] - try: - xattr_mapping = { - 'user.xdg.referrer.url': 'webpage_url', - # 'user.xdg.comment': 'description', - 'user.dublincore.title': 'title', - 'user.dublincore.date': 'upload_date', - 'user.dublincore.description': 'description', - 'user.dublincore.contributor': 'uploader', - 'user.dublincore.format': 'format', - } - - num_written = 0 - for xattrname, infoname in xattr_mapping.items(): - + for xattrname, infoname in self.XATTR_MAPPING.items(): value = info.get(infoname) - if value: if infoname == 'upload_date': value = hyphenate_date(value) - - byte_value = value.encode('utf-8') - write_xattr(filename, xattrname, byte_value) - num_written += 1 - - return [], info + write_xattr(info['filepath'], xattrname, value.encode()) except XAttrUnavailableError as e: raise PostProcessingError(str(e)) - except XAttrMetadataError as e: if e.reason == 'NO_SPACE': self.report_warning( 'There\'s no disk space left, disk quota exceeded or filesystem xattr limit exceeded. 
' - + (('Some ' if num_written else '') + 'extended attributes are not written.').capitalize()) + 'Some extended attributes are not written') elif e.reason == 'VALUE_TOO_LONG': - self.report_warning( - 'Unable to write extended attributes due to too long values.') + self.report_warning('Unable to write extended attributes due to too long values.') else: - msg = 'This filesystem doesn\'t support extended attributes. ' - if compat_os_name == 'nt': - msg += 'You need to use NTFS.' - else: - msg += '(You may have to enable them in your /etc/fstab)' - raise PostProcessingError(str(e)) - return [], info + tip = ('You need to use NTFS' if compat_os_name == 'nt' + else 'You may have to enable them in your "/etc/fstab"') + raise PostProcessingError(f'This filesystem doesn\'t support extended attributes. {tip}') + + self.try_utime(info['filepath'], mtime, mtime) + return [], info diff --git a/hypervideo_dl/socks.py b/hypervideo_dl/socks.py index 5d4adbe..f93328f 100644 --- a/hypervideo_dl/socks.py +++ b/hypervideo_dl/socks.py @@ -1,8 +1,5 @@ # Public Domain SOCKS proxy protocol implementation # Adapted from https://gist.github.com/bluec0re/cafd3764412967417fd3 - -from __future__ import unicode_literals - # References: # SOCKS4 protocol http://www.openssh.com/txt/socks4.protocol # SOCKS4A protocol http://www.openssh.com/txt/socks4a.protocol @@ -11,12 +8,9 @@ from __future__ import unicode_literals import collections import socket +import struct -from .compat import ( - compat_ord, - compat_struct_pack, - compat_struct_unpack, -) +from .compat import compat_ord __author__ = 'Timo Schmid <coding@timoschmid.de>' @@ -26,14 +20,14 @@ SOCKS4_REPLY_VERSION = 0x00 # if the client cannot resolve the destination host's domain name to find its # IP address, it should set the first three bytes of DSTIP to NULL and the last # byte to a non-zero value. -SOCKS4_DEFAULT_DSTIP = compat_struct_pack('!BBBB', 0, 0, 0, 0xFF) +SOCKS4_DEFAULT_DSTIP = struct.pack('!BBBB', 0, 0, 0, 0xFF) SOCKS5_VERSION = 5 SOCKS5_USER_AUTH_VERSION = 0x01 SOCKS5_USER_AUTH_SUCCESS = 0x00 -class Socks4Command(object): +class Socks4Command: CMD_CONNECT = 0x01 CMD_BIND = 0x02 @@ -42,14 +36,14 @@ class Socks5Command(Socks4Command): CMD_UDP_ASSOCIATE = 0x03 -class Socks5Auth(object): +class Socks5Auth: AUTH_NONE = 0x00 AUTH_GSSAPI = 0x01 AUTH_USER_PASS = 0x02 AUTH_NO_ACCEPTABLE = 0xFF # For server response -class Socks5AddressType(object): +class Socks5AddressType: ATYP_IPV4 = 0x01 ATYP_DOMAINNAME = 0x03 ATYP_IPV6 = 0x04 @@ -61,14 +55,14 @@ class ProxyError(socket.error): def __init__(self, code=None, msg=None): if code is not None and msg is None: msg = self.CODES.get(code) or 'unknown error' - super(ProxyError, self).__init__(code, msg) + super().__init__(code, msg) class InvalidVersionError(ProxyError): def __init__(self, expected_version, got_version): - msg = ('Invalid response version from server. Expected {0:02x} got ' - '{1:02x}'.format(expected_version, got_version)) - super(InvalidVersionError, self).__init__(0, msg) + msg = ('Invalid response version from server. 
Expected {:02x} got ' + '{:02x}'.format(expected_version, got_version)) + super().__init__(0, msg) class Socks4Error(ProxyError): @@ -98,7 +92,7 @@ class Socks5Error(ProxyError): } -class ProxyType(object): +class ProxyType: SOCKS4 = 0 SOCKS4A = 1 SOCKS5 = 2 @@ -111,7 +105,7 @@ Proxy = collections.namedtuple('Proxy', ( class sockssocket(socket.socket): def __init__(self, *args, **kwargs): self._proxy = None - super(sockssocket, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) def setproxy(self, proxytype, addr, port, rdns=True, username=None, password=None): assert proxytype in (ProxyType.SOCKS4, ProxyType.SOCKS4A, ProxyType.SOCKS5) @@ -123,17 +117,17 @@ class sockssocket(socket.socket): while len(data) < cnt: cur = self.recv(cnt - len(data)) if not cur: - raise EOFError('{0} bytes missing'.format(cnt - len(data))) + raise EOFError(f'{cnt - len(data)} bytes missing') data += cur return data def _recv_bytes(self, cnt): data = self.recvall(cnt) - return compat_struct_unpack('!{0}B'.format(cnt), data) + return struct.unpack(f'!{cnt}B', data) @staticmethod def _len_and_data(data): - return compat_struct_pack('!B', len(data)) + data + return struct.pack('!B', len(data)) + data def _check_response_version(self, expected_version, got_version): if got_version != expected_version: @@ -143,7 +137,7 @@ class sockssocket(socket.socket): def _resolve_address(self, destaddr, default, use_remote_dns): try: return socket.inet_aton(destaddr) - except socket.error: + except OSError: if use_remote_dns and self._proxy.remote_dns: return default else: @@ -154,17 +148,17 @@ class sockssocket(socket.socket): ipaddr = self._resolve_address(destaddr, SOCKS4_DEFAULT_DSTIP, use_remote_dns=is_4a) - packet = compat_struct_pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr + packet = struct.pack('!BBH', SOCKS4_VERSION, Socks4Command.CMD_CONNECT, port) + ipaddr - username = (self._proxy.username or '').encode('utf-8') + username = (self._proxy.username or '').encode() packet += username + b'\x00' if is_4a and self._proxy.remote_dns: - packet += destaddr.encode('utf-8') + b'\x00' + packet += destaddr.encode() + b'\x00' self.sendall(packet) - version, resp_code, dstport, dsthost = compat_struct_unpack('!BBHI', self.recvall(8)) + version, resp_code, dstport, dsthost = struct.unpack('!BBHI', self.recvall(8)) self._check_response_version(SOCKS4_REPLY_VERSION, version) @@ -178,14 +172,14 @@ class sockssocket(socket.socket): self._setup_socks4(address, is_4a=True) def _socks5_auth(self): - packet = compat_struct_pack('!B', SOCKS5_VERSION) + packet = struct.pack('!B', SOCKS5_VERSION) auth_methods = [Socks5Auth.AUTH_NONE] if self._proxy.username and self._proxy.password: auth_methods.append(Socks5Auth.AUTH_USER_PASS) - packet += compat_struct_pack('!B', len(auth_methods)) - packet += compat_struct_pack('!{0}B'.format(len(auth_methods)), *auth_methods) + packet += struct.pack('!B', len(auth_methods)) + packet += struct.pack(f'!{len(auth_methods)}B', *auth_methods) self.sendall(packet) @@ -199,9 +193,9 @@ class sockssocket(socket.socket): raise Socks5Error(Socks5Auth.AUTH_NO_ACCEPTABLE) if method == Socks5Auth.AUTH_USER_PASS: - username = self._proxy.username.encode('utf-8') - password = self._proxy.password.encode('utf-8') - packet = compat_struct_pack('!B', SOCKS5_USER_AUTH_VERSION) + username = self._proxy.username.encode() + password = self._proxy.password.encode() + packet = struct.pack('!B', SOCKS5_USER_AUTH_VERSION) packet += self._len_and_data(username) + self._len_and_data(password) 
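 # The assembled request follows RFC 1929 (username/password sub-negotiation):
 #   +----+------+----------+------+----------+
 #   |VER | ULEN |  UNAME   | PLEN |  PASSWD  |
 #   +----+------+----------+------+----------+
 # VER is SOCKS5_USER_AUTH_VERSION (0x01); _len_and_data() supplies the
 # one-byte length prefix for each of the two fields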
self.sendall(packet) @@ -221,14 +215,14 @@ class sockssocket(socket.socket): self._socks5_auth() reserved = 0 - packet = compat_struct_pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved) + packet = struct.pack('!BBB', SOCKS5_VERSION, Socks5Command.CMD_CONNECT, reserved) if ipaddr is None: - destaddr = destaddr.encode('utf-8') - packet += compat_struct_pack('!B', Socks5AddressType.ATYP_DOMAINNAME) + destaddr = destaddr.encode() + packet += struct.pack('!B', Socks5AddressType.ATYP_DOMAINNAME) packet += self._len_and_data(destaddr) else: - packet += compat_struct_pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr - packet += compat_struct_pack('!H', port) + packet += struct.pack('!B', Socks5AddressType.ATYP_IPV4) + ipaddr + packet += struct.pack('!H', port) self.sendall(packet) @@ -247,7 +241,7 @@ class sockssocket(socket.socket): destaddr = self.recvall(alen) elif atype == Socks5AddressType.ATYP_IPV6: destaddr = self.recvall(16) - destport = compat_struct_unpack('!H', self.recvall(2))[0] + destport = struct.unpack('!H', self.recvall(2))[0] return (destaddr, destport) diff --git a/hypervideo_dl/utils.py b/hypervideo_dl/utils.py index 6379872..45847f9 100644 --- a/hypervideo_dl/utils.py +++ b/hypervideo_dl/utils.py @@ -1,8 +1,3 @@ -#!/usr/bin/env python3 -# coding: utf-8 - -from __future__ import unicode_literals - import asyncio import atexit import base64 @@ -10,86 +5,59 @@ import binascii import calendar import codecs import collections +import collections.abc import contextlib -import ctypes import datetime -import email.utils import email.header +import email.utils import errno -import functools import gzip import hashlib import hmac +import html.entities +import html.parser +import http.client +import http.cookiejar import importlib.util +import inspect import io import itertools import json import locale import math +import mimetypes import operator import os import platform import random import re +import shlex import socket import ssl +import struct import subprocess import sys import tempfile import time import traceback +import types +import unicodedata +import urllib.error +import urllib.parse +import urllib.request import xml.etree.ElementTree import zlib -import mimetypes +from .compat import functools # isort: split from .compat import ( - compat_HTMLParseError, - compat_HTMLParser, - compat_HTTPError, - compat_basestring, - compat_brotli, - compat_chr, - compat_cookiejar, - compat_ctypes_WINFUNCTYPE, compat_etree_fromstring, compat_expanduser, - compat_html_entities, - compat_html_entities_html5, - compat_http_client, - compat_integer_types, - compat_numeric_types, - compat_kwargs, + compat_HTMLParseError, compat_os_name, - compat_parse_qs, - compat_shlex_split, compat_shlex_quote, - compat_str, - compat_struct_pack, - compat_struct_unpack, - compat_urllib_error, - compat_urllib_parse, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urllib_parse_urlunparse, - compat_urllib_parse_quote, - compat_urllib_parse_quote_plus, - compat_urllib_parse_unquote_plus, - compat_urllib_request, - compat_urlparse, - compat_websockets, - compat_xpath, -) - -from .socks import ( - ProxyType, - sockssocket, ) - -try: - import certifi - has_certifi = True -except ImportError: - has_certifi = False +from .dependencies import brotli, certifi, websockets, xattr +from .socks import ProxyType, sockssocket def register_socks_protocols(): @@ -97,8 +65,8 @@ def register_socks_protocols(): # In Python < 2.6.5, urlsplit() suffers from bug 
https://bugs.python.org/issue7904 # URLs with protocols not in urlparse.uses_netloc are not handled correctly for scheme in ('socks', 'socks4', 'socks4a', 'socks5'): - if scheme not in compat_urlparse.uses_netloc: - compat_urlparse.uses_netloc.append(scheme) + if scheme not in urllib.parse.uses_netloc: + urllib.parse.uses_netloc.append(scheme) # This is not clearly defined otherwise @@ -153,7 +121,7 @@ def random_user_agent(): SUPPORTED_ENCODINGS = [ 'gzip', 'deflate' ] -if compat_brotli: +if brotli: SUPPORTED_ENCODINGS.append('br') std_headers = { @@ -170,6 +138,7 @@ USER_AGENTS = { NO_DEFAULT = object() +IDENTITY = lambda x: x ENGLISH_MONTH_NAMES = [ 'January', 'February', 'March', 'April', 'May', 'June', @@ -180,22 +149,22 @@ MONTH_NAMES = { 'fr': [ 'janvier', 'février', 'mars', 'avril', 'mai', 'juin', 'juillet', 'août', 'septembre', 'octobre', 'novembre', 'décembre'], + # these follow the genitive grammatical case (dopełniacz) + # some websites might be using nominative, which will require another month list + # https://en.wikibooks.org/wiki/Polish/Noun_cases + 'pl': ['stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca', + 'lipca', 'sierpnia', 'września', 'października', 'listopada', 'grudnia'], } -KNOWN_EXTENSIONS = ( - 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'aac', - 'flv', 'f4v', 'f4a', 'f4b', - 'webm', 'ogg', 'ogv', 'oga', 'ogx', 'spx', 'opus', - 'mkv', 'mka', 'mk3d', - 'avi', 'divx', - 'mov', - 'asf', 'wmv', 'wma', - '3gp', '3g2', - 'mp3', - 'flac', - 'ape', - 'wav', - 'f4f', 'f4m', 'm3u8', 'smil') +# From https://github.com/python/cpython/blob/3.11/Lib/email/_parseaddr.py#L36-L42 +TIMEZONE_NAMES = { + 'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0, + 'AST': -4, 'ADT': -3, # Atlantic (used in Canada) + 'EST': -5, 'EDT': -4, # Eastern + 'CST': -6, 'CDT': -5, # Central + 'MST': -7, 'MDT': -6, # Mountain + 'PST': -8, 'PDT': -7 # Pacific +} # needed for sanitizing filenames in restricted mode ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐØŒÙÚÛÜŰÝÞßàáâãäåæçèéêëìíîïðñòóôõöőøœùúûüűýþÿ', @@ -255,6 +224,7 @@ DATE_FORMATS_DAY_FIRST.extend([ '%d/%m/%Y', '%d/%m/%y', '%d/%m/%Y %H:%M:%S', + '%d-%m-%Y %H:%M', ]) DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) @@ -267,9 +237,12 @@ DATE_FORMATS_MONTH_FIRST.extend([ ]) PACKED_CODES_RE = r"}\('(.+)',(\d+),(\d+),'([^']+)'\.split\('\|'\)" -JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>' +JSON_LD_RE = r'(?is)<script[^>]+type=(["\']?)application/ld\+json\1[^>]*>\s*(?P<json_ld>{.+?}|\[.+?\])\s*</script>' + +NUMBER_RE = r'\d+(?:\.\d+)?' +@functools.cache def preferredencoding(): """Get preferred encoding. @@ -288,37 +261,9 @@ def preferredencoding(): def write_json_file(obj, fn): """ Encode obj as JSON and write it to fn, atomically if possible """ - fn = encodeFilename(fn) - if sys.version_info < (3, 0) and sys.platform != 'win32': - encoding = get_filesystem_encoding() - # os.path.basename returns a bytes object, but NamedTemporaryFile - # will fail if the filename contains non ascii characters unless we - # use a unicode object - path_basename = lambda f: os.path.basename(fn).decode(encoding) - # the same for os.path.dirname - path_dirname = lambda f: os.path.dirname(fn).decode(encoding) - else: - path_basename = os.path.basename - path_dirname = os.path.dirname - - args = { - 'suffix': '.tmp', - 'prefix': path_basename(fn) + '.', - 'dir': path_dirname(fn), - 'delete': False, - } - - # In Python 2.x, json.dump expects a bytestream. 
- # In Python 3.x, it writes to a character stream - if sys.version_info < (3, 0): - args['mode'] = 'wb' - else: - args.update({ - 'mode': 'w', - 'encoding': 'utf-8', - }) - - tf = tempfile.NamedTemporaryFile(**compat_kwargs(args)) + tf = tempfile.NamedTemporaryFile( + prefix=f'{os.path.basename(fn)}.', dir=os.path.dirname(fn), + suffix='.tmp', delete=False, mode='w', encoding='utf-8') try: with tf: @@ -326,39 +271,24 @@ def write_json_file(obj, fn): if sys.platform == 'win32': # Need to remove existing file on Windows, else os.rename raises # WindowsError or FileExistsError. - try: + with contextlib.suppress(OSError): os.unlink(fn) - except OSError: - pass - try: + with contextlib.suppress(OSError): mask = os.umask(0) os.umask(mask) os.chmod(tf.name, 0o666 & ~mask) - except OSError: - pass os.rename(tf.name, fn) except Exception: - try: + with contextlib.suppress(OSError): os.remove(tf.name) - except OSError: - pass raise -if sys.version_info >= (2, 7): - def find_xpath_attr(node, xpath, key, val=None): - """ Find the xpath xpath[@key=val] """ - assert re.match(r'^[a-zA-Z_-]+$', key) - expr = xpath + ('[@%s]' % key if val is None else "[@%s='%s']" % (key, val)) - return node.find(expr) -else: - def find_xpath_attr(node, xpath, key, val=None): - for f in node.findall(compat_xpath(xpath)): - if key not in f.attrib: - continue - if val is None or f.attrib.get(key) == val: - return f - return None +def find_xpath_attr(node, xpath, key, val=None): + """ Find the xpath xpath[@key=val] """ + assert re.match(r'^[a-zA-Z_-]+$', key) + expr = xpath + ('[@%s]' % key if val is None else f"[@{key}='{val}']") + return node.find(expr) # On python2.6 the xml.etree.ElementTree.Element methods don't support # the namespace parameter @@ -378,9 +308,9 @@ def xpath_with_ns(path, ns_map): def xpath_element(node, xpath, name=None, fatal=False, default=NO_DEFAULT): def _find_xpath(xpath): - return node.find(compat_xpath(xpath)) + return node.find(xpath) - if isinstance(xpath, (str, compat_str)): + if isinstance(xpath, str): n = _find_xpath(xpath) else: for xp in xpath: @@ -420,21 +350,21 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT): if default is not NO_DEFAULT: return default elif fatal: - name = '%s[@%s]' % (xpath, key) if name is None else name + name = f'{xpath}[@{key}]' if name is None else name raise ExtractorError('Could not find XML attribute %s' % name) else: return None return n.attrib[key] -def get_element_by_id(id, html): +def get_element_by_id(id, html, **kwargs): """Return the content of the tag with the specified ID in the passed HTML document""" - return get_element_by_attribute('id', id, html) + return get_element_by_attribute('id', id, html, **kwargs) -def get_element_html_by_id(id, html): +def get_element_html_by_id(id, html, **kwargs): """Return the html of the tag with the specified ID in the passed HTML document""" - return get_element_html_by_attribute('id', id, html) + return get_element_html_by_attribute('id', id, html, **kwargs) def get_element_by_class(class_name, html): @@ -449,27 +379,27 @@ def get_element_html_by_class(class_name, html): return retval[0] if retval else None -def get_element_by_attribute(attribute, value, html, escape_value=True): - retval = get_elements_by_attribute(attribute, value, html, escape_value) +def get_element_by_attribute(attribute, value, html, **kwargs): + retval = get_elements_by_attribute(attribute, value, html, **kwargs) return retval[0] if retval else None -def get_element_html_by_attribute(attribute, value, html, 
escape_value=True): - retval = get_elements_html_by_attribute(attribute, value, html, escape_value) +def get_element_html_by_attribute(attribute, value, html, **kargs): + retval = get_elements_html_by_attribute(attribute, value, html, **kargs) return retval[0] if retval else None -def get_elements_by_class(class_name, html): +def get_elements_by_class(class_name, html, **kargs): """Return the content of all tags with the specified class in the passed HTML document as a list""" return get_elements_by_attribute( - 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name), html, escape_value=False) def get_elements_html_by_class(class_name, html): """Return the html of all tags with the specified class in the passed HTML document as a list""" return get_elements_html_by_attribute( - 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + 'class', r'[^\'"]*(?<=[\'"\s])%s(?=[\'"\s])[^\'"]*' % re.escape(class_name), html, escape_value=False) @@ -483,21 +413,23 @@ def get_elements_html_by_attribute(*args, **kwargs): return [whole for _, whole in get_elements_text_and_html_by_attribute(*args, **kwargs)] -def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value=True): +def get_elements_text_and_html_by_attribute(attribute, value, html, *, tag=r'[\w:.-]+', escape_value=True): """ Return the text (content) and the html (whole) of the tag with the specified attribute in the passed HTML document """ + if not value: + return - value_quote_optional = '' if re.match(r'''[\s"'`=<>]''', value) else '?' + quote = '' if re.match(r'''[\s"'`=<>]''', value) else '?' value = re.escape(value) if escape_value else value - partial_element_re = r'''(?x) - <(?P<tag>[a-zA-Z0-9:._-]+) + partial_element_re = rf'''(?x) + <(?P<tag>{tag}) (?:\s(?:[^>"']|"[^"]*"|'[^']*')*)? 
- \s%(attribute)s\s*=\s*(?P<_q>['"]%(vqo)s)(?-x:%(value)s)(?P=_q) - ''' % {'attribute': re.escape(attribute), 'value': value, 'vqo': value_quote_optional} + \s{re.escape(attribute)}\s*=\s*(?P<_q>['"]{quote})(?-x:{value})(?P=_q) + ''' for m in re.finditer(partial_element_re, html): content, whole = get_element_text_and_html_by_tag(m.group('tag'), html[m.start():]) @@ -508,7 +440,7 @@ def get_elements_text_and_html_by_attribute(attribute, value, html, escape_value ) -class HTMLBreakOnClosingTagParser(compat_HTMLParser): +class HTMLBreakOnClosingTagParser(html.parser.HTMLParser): """ HTML parser which raises HTMLBreakOnClosingTagException upon reaching the closing tag for the first opening tag it has encountered, and can be used @@ -520,7 +452,7 @@ class HTMLBreakOnClosingTagParser(compat_HTMLParser): def __init__(self): self.tagstack = collections.deque() - compat_HTMLParser.__init__(self) + html.parser.HTMLParser.__init__(self) def __enter__(self): return self @@ -550,6 +482,7 @@ class HTMLBreakOnClosingTagParser(compat_HTMLParser): raise self.HTMLBreakOnClosingTagException() +# XXX: This should be far less strict def get_element_text_and_html_by_tag(tag, html): """ For the first element with the specified tag in the passed HTML document @@ -585,22 +518,23 @@ def get_element_text_and_html_by_tag(tag, html): raise compat_HTMLParseError('unexpected end of html') -class HTMLAttributeParser(compat_HTMLParser): +class HTMLAttributeParser(html.parser.HTMLParser): """Trivial HTML parser to gather the attributes for a single element""" def __init__(self): self.attrs = {} - compat_HTMLParser.__init__(self) + html.parser.HTMLParser.__init__(self) def handle_starttag(self, tag, attrs): self.attrs = dict(attrs) + raise compat_HTMLParseError('done') -class HTMLListAttrsParser(compat_HTMLParser): +class HTMLListAttrsParser(html.parser.HTMLParser): """HTML parser to gather the attributes for the elements of a list""" def __init__(self): - compat_HTMLParser.__init__(self) + html.parser.HTMLParser.__init__(self) self.items = [] self._level = 0 @@ -626,16 +560,11 @@ def extract_attributes(html_element): 'empty': '', 'noval': None, 'entity': '&', 'sq': '"', 'dq': '\'' }. - NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions, - but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5. """ parser = HTMLAttributeParser() - try: + with contextlib.suppress(compat_HTMLParseError): parser.feed(html_element) parser.close() - # Older Python may throw HTMLParseError in case of malformed HTML - except compat_HTMLParseError: - pass return parser.attrs @@ -664,6 +593,24 @@ def clean_html(html): return html.strip() +class LenientJSONDecoder(json.JSONDecoder): + def __init__(self, *args, transform_source=None, ignore_extra=False, **kwargs): + self.transform_source, self.ignore_extra = transform_source, ignore_extra + super().__init__(*args, **kwargs) + + def decode(self, s): + if self.transform_source: + s = self.transform_source(s) + try: + if self.ignore_extra: + return self.raw_decode(s.lstrip())[0] + return super().decode(s) + except json.JSONDecodeError as e: + if e.pos is not None: + raise type(e)(f'{e.msg} in {s[e.pos-10:e.pos+10]!r}', s, e.pos) + raise + + def sanitize_open(filename, open_mode): """Try to open the given filename, and slightly tweak it if this fails. @@ -674,26 +621,33 @@ def sanitize_open(filename, open_mode): It returns the tuple (stream, definitive_file_name). 
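Example (an illustrative sketch only; the locking and filename-fallback behaviour are described above):

    stream, path = sanitize_open('-', 'wb')         # binary stdout
    stream, path = sanitize_open('out.mp4', 'wb')   # regular file; name may be sanitized on retry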
""" - try: - if filename == '-': - if sys.platform == 'win32': - import msvcrt + if filename == '-': + if sys.platform == 'win32': + import msvcrt + + # stdout may be any IO stream, e.g. when using contextlib.redirect_stdout + with contextlib.suppress(io.UnsupportedOperation): msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY) - return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) - stream = locked_file(filename, open_mode, block=False).open() - return (stream, filename) - except (IOError, OSError) as err: - if err.errno in (errno.EACCES,): - raise + return (sys.stdout.buffer if hasattr(sys.stdout, 'buffer') else sys.stdout, filename) - # In case of error, try to remove win32 forbidden chars - alt_filename = sanitize_path(filename) - if alt_filename == filename: - raise - else: - # An exception here should be caught in the caller - stream = locked_file(filename, open_mode, block=False).open() - return (stream, alt_filename) + for attempt in range(2): + try: + try: + if sys.platform == 'win32': + # FIXME: An exclusive lock also locks the file from being read. + # Since windows locks are mandatory, don't lock the file on windows (for now). + # Ref: https://github.com/hypervideo/hypervideo/issues/3124 + raise LockingUnsupportedError() + stream = locked_file(filename, open_mode, block=False).__enter__() + except OSError: + stream = open(filename, open_mode) + return stream, filename + except OSError as err: + if attempt or err.errno in (errno.EACCES,): + raise + old_filename, filename = filename, sanitize_path(filename) + if old_filename == filename: + raise def timeconvert(timestr): @@ -719,6 +673,9 @@ def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT): return ACCENT_CHARS[char] elif not restricted and char == '\n': return '\0 ' + elif is_id is NO_DEFAULT and not restricted and char in '"*:<>?|/\\': + # Replace with their full-width unicode counterparts + return {'/': '\u29F8', '\\': '\u29f9'}.get(char, chr(ord(char) + 0xfee0)) elif char == '?' 
or ord(char) < 32 or ord(char) == 127: return '' elif char == '"': @@ -731,11 +688,14 @@ def sanitize_filename(s, restricted=False, is_id=NO_DEFAULT): return '\0_' return char + # Replace look-alike Unicode glyphs + if restricted and (is_id is NO_DEFAULT or not is_id): + s = unicodedata.normalize('NFKC', s) s = re.sub(r'[0-9]+(?::[0-9]+)+', lambda m: m.group(0).replace(':', '_'), s) # Handle timestamps result = ''.join(map(replace_insane, s)) if is_id is NO_DEFAULT: - result = re.sub('(\0.)(?:(?=\\1)..)+', r'\1', result) # Remove repeated substitute chars - STRIP_RE = '(?:\0.|[ _-])*' + result = re.sub(r'(\0.)(?:(?=\1)..)+', r'\1', result) # Remove repeated substitute chars + STRIP_RE = r'(?:\0.|[ _-])*' result = re.sub(f'^\0.{STRIP_RE}|{STRIP_RE}\0.$', '', result) # Remove substitute chars from start/end result = result.replace('\0', '') or '_' @@ -759,8 +719,6 @@ def sanitize_path(s, force=False): if sys.platform == 'win32': force = False drive_or_unc, _ = os.path.splitdrive(s) - if sys.version_info < (2, 7) and not drive_or_unc: - drive_or_unc, _ = os.path.splitunc(s) elif force: drive_or_unc = '' else: @@ -774,16 +732,18 @@ def sanitize_path(s, force=False): for path_part in norm_path] if drive_or_unc: sanitized_path.insert(0, drive_or_unc + os.path.sep) - elif force and s[0] == os.path.sep: + elif force and s and s[0] == os.path.sep: sanitized_path.insert(0, os.path.sep) return os.path.join(*sanitized_path) -def sanitize_url(url): +def sanitize_url(url, *, scheme='http'): # Prepend protocol-less URLs with `http:` scheme in order to mitigate # the number of unwanted failures due to missing protocol - if url.startswith('//'): - return 'http:%s' % url + if url is None: + return + elif url.startswith('//'): + return f'{scheme}:{url}' # Fix some common typos seen so far COMMON_TYPOS = ( # https://github.com/ytdl-org/youtube-dl/issues/15649 @@ -798,15 +758,15 @@ def sanitize_url(url): def extract_basic_auth(url): - parts = compat_urlparse.urlsplit(url) + parts = urllib.parse.urlsplit(url) if parts.username is None: return url, None - url = compat_urlparse.urlunsplit(parts._replace(netloc=( + url = urllib.parse.urlunsplit(parts._replace(netloc=( parts.hostname if parts.port is None else '%s:%d' % (parts.hostname, parts.port)))) auth_payload = base64.b64encode( - ('%s:%s' % (parts.username, parts.password or '')).encode('utf-8')) - return url, 'Basic ' + auth_payload.decode('utf-8') + ('%s:%s' % (parts.username, parts.password or '')).encode()) + return url, f'Basic {auth_payload.decode()}' def sanitized_Request(url, *args, **kwargs): @@ -814,7 +774,7 @@ def sanitized_Request(url, *args, **kwargs): if auth_header is not None: headers = args[1] if len(args) >= 2 else kwargs.setdefault('headers', {}) headers['Authorization'] = auth_header - return compat_urllib_request.Request(url, *args, **kwargs) + return urllib.request.Request(url, *args, **kwargs) def expand_path(s): @@ -822,13 +782,16 @@ def expand_path(s): return os.path.expandvars(compat_expanduser(s)) -def orderedSet(iterable): - """ Remove all duplicates from the input iterable """ - res = [] - for el in iterable: - if el not in res: - res.append(el) - return res +def orderedSet(iterable, *, lazy=False): + """Remove all duplicates from the input iterable""" + def _iter(): + seen = [] # Do not use set since the items can be unhashable + for x in iterable: + if x not in seen: + seen.append(x) + yield x + + return _iter() if lazy else list(_iter()) def _htmlentity_transform(entity_with_semicolon): @@ -836,13 +799,13 @@ def 
_htmlentity_transform(entity_with_semicolon): entity = entity_with_semicolon[:-1] # Known non-numeric HTML entity - if entity in compat_html_entities.name2codepoint: - return compat_chr(compat_html_entities.name2codepoint[entity]) + if entity in html.entities.name2codepoint: + return chr(html.entities.name2codepoint[entity]) - # TODO: HTML5 allows entities without a semicolon. For example, - # 'Éric' should be decoded as 'Éric'. - if entity_with_semicolon in compat_html_entities_html5: - return compat_html_entities_html5[entity_with_semicolon] + # TODO: HTML5 allows entities without a semicolon. + # E.g. 'Éric' should be decoded as 'Éric'. + if entity_with_semicolon in html.entities.html5: + return html.entities.html5[entity_with_semicolon] mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) if mobj is not None: @@ -853,10 +816,8 @@ def _htmlentity_transform(entity_with_semicolon): else: base = 10 # See https://github.com/ytdl-org/youtube-dl/issues/7518 - try: - return compat_chr(int(numstr, base)) - except ValueError: - pass + with contextlib.suppress(ValueError): + return chr(int(numstr, base)) # Unknown entity in name, return its literal representation return '&%s;' % entity @@ -865,7 +826,7 @@ def _htmlentity_transform(entity_with_semicolon): def unescapeHTML(s): if s is None: return None - assert type(s) == compat_str + assert isinstance(s, str) return re.sub( r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) @@ -883,12 +844,9 @@ def escapeHTML(text): def process_communicate_or_kill(p, *args, **kwargs): - try: - return p.communicate(*args, **kwargs) - except BaseException: # Including KeyboardInterrupt - p.kill() - p.wait() - raise + deprecation_warning(f'"{__name__}.process_communicate_or_kill" is deprecated and may be removed ' + f'in a future version. 
Use "{__name__}.Popen.communicate_or_kill" instead') + return Popen.communicate_or_kill(p, *args, **kwargs) class Popen(subprocess.Popen): @@ -898,11 +856,54 @@ class Popen(subprocess.Popen): else: _startupinfo = None - def __init__(self, *args, **kwargs): - super(Popen, self).__init__(*args, **kwargs, startupinfo=self._startupinfo) + @staticmethod + def _fix_pyinstaller_ld_path(env): + """Restore LD_LIBRARY_PATH when using PyInstaller + Ref: https://github.com/pyinstaller/pyinstaller/blob/develop/doc/runtime-information.rst#ld_library_path--libpath-considerations + https://github.com/hypervideo/hypervideo/issues/4573 + """ + if not hasattr(sys, '_MEIPASS'): + return + + def _fix(key): + orig = env.get(f'{key}_ORIG') + if orig is None: + env.pop(key, None) + else: + env[key] = orig + + _fix('LD_LIBRARY_PATH') # Linux + _fix('DYLD_LIBRARY_PATH') # macOS + + def __init__(self, *args, env=None, text=False, **kwargs): + if env is None: + env = os.environ.copy() + self._fix_pyinstaller_ld_path(env) + + if text is True: + kwargs['universal_newlines'] = True # For 3.6 compatibility + kwargs.setdefault('encoding', 'utf-8') + kwargs.setdefault('errors', 'replace') + super().__init__(*args, env=env, **kwargs, startupinfo=self._startupinfo) def communicate_or_kill(self, *args, **kwargs): - return process_communicate_or_kill(self, *args, **kwargs) + try: + return self.communicate(*args, **kwargs) + except BaseException: # Including KeyboardInterrupt + self.kill(timeout=None) + raise + + def kill(self, *, timeout=0): + super().kill() + if timeout != 0: + self.wait(timeout=timeout) + + @classmethod + def run(cls, *args, timeout=None, **kwargs): + with cls(*args, **kwargs) as proc: + default = '' if proc.text_mode else b'' + stdout, stderr = proc.communicate_or_kill(timeout=timeout) + return stdout or default, stderr or default, proc.returncode def get_subprocess_encoding(): @@ -918,51 +919,23 @@ def get_subprocess_encoding(): def encodeFilename(s, for_subprocess=False): - """ - @param s The name of the file - """ - - assert type(s) == compat_str - - # Python 3 has a Unicode API - if sys.version_info >= (3, 0): - return s - - # Pass '' directly to use Unicode APIs on Windows 2000 and up - # (Detecting Windows NT 4 is tricky because 'major >= 4' would - # match Windows 9x series as well. Besides, NT 4 is obsolete.) 
- if not for_subprocess and sys.platform == 'win32' and sys.getwindowsversion()[0] >= 5: - return s - - # Jython assumes filenames are Unicode strings though reported as Python 2.x compatible - if sys.platform.startswith('java'): - return s - - return s.encode(get_subprocess_encoding(), 'ignore') + assert isinstance(s, str) + return s def decodeFilename(b, for_subprocess=False): - - if sys.version_info >= (3, 0): - return b - - if not isinstance(b, bytes): - return b - - return b.decode(get_subprocess_encoding(), 'ignore') + return b def encodeArgument(s): - if not isinstance(s, compat_str): - # Legacy code that uses byte strings - # Uncomment the following line after fixing all post processors - # assert False, 'Internal error: %r should be of type %r, is %r' % (s, compat_str, type(s)) - s = s.decode('ascii') - return encodeFilename(s, True) + # Legacy code that uses byte strings + # Uncomment the following line after fixing all post processors + # assert isinstance(s, str), 'Internal error: %r should be of type %r, is %r' % (s, str, type(s)) + return s if isinstance(s, str) else s.decode('ascii') def decodeArgument(b): - return decodeFilename(b, True) + return b def decodeOption(optval): @@ -971,7 +944,7 @@ def decodeOption(optval): if isinstance(optval, bytes): optval = optval.decode(preferredencoding()) - assert isinstance(optval, compat_str) + assert isinstance(optval, str) return optval @@ -1005,10 +978,8 @@ def _ssl_load_windows_store_certs(ssl_context, storename): except PermissionError: return for cert in certs: - try: + with contextlib.suppress(ssl.SSLError): ssl_context.load_verify_locations(cadata=cert) - except ssl.SSLError: - pass def make_HTTPS_handler(params, **kwargs): @@ -1017,6 +988,28 @@ def make_HTTPS_handler(params, **kwargs): context.check_hostname = opts_check_certificate if params.get('legacyserverconnect'): context.options |= 4 # SSL_OP_LEGACY_SERVER_CONNECT + # Allow use of weaker ciphers in Python 3.10+. See https://bugs.python.org/issue43998 + context.set_ciphers('DEFAULT') + elif ( + sys.version_info < (3, 10) + and ssl.OPENSSL_VERSION_INFO >= (1, 1, 1) + and not ssl.OPENSSL_VERSION.startswith('LibreSSL') + ): + # Backport the default SSL ciphers and minimum TLS version settings from Python 3.10 [1]. + # This is to ensure consistent behavior across Python versions, and help avoid fingerprinting + # in some situations [2][3]. + # Python 3.10 only supports OpenSSL 1.1.1+ [4]. Because this change is likely + # untested on older versions, we only apply this to OpenSSL 1.1.1+ to be safe. + # LibreSSL is excluded until further investigation due to cipher support issues [5][6]. + # 1. https://github.com/python/cpython/commit/e983252b516edb15d4338b0a47631b59ef1e2536 + # 2. https://github.com/hypervideo/hypervideo/issues/4627 + # 3. https://github.com/hypervideo/hypervideo/pull/5294 + # 4. https://peps.python.org/pep-0644/ + # 5. https://peps.python.org/pep-0644/#libressl-support + # 6. 
https://github.com/hypervideo/hypervideo/commit/5b9f253fa0aee996cf1ed30185d4b502e00609c4#commitcomment-89054368 + context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM') + context.minimum_version = ssl.TLSVersion.TLSv1_2 + context.verify_mode = ssl.CERT_REQUIRED if opts_check_certificate else ssl.CERT_NONE if opts_check_certificate: if has_certifi and 'no-certifi' not in params.get('compat_opts', []): @@ -1030,12 +1023,25 @@ def make_HTTPS_handler(params, **kwargs): except ssl.SSLError: # enum_certificates is not present in mingw python. See https://github.com/hypervideo/hypervideo/issues/1151 if sys.platform == 'win32' and hasattr(ssl, 'enum_certificates'): - # Create a new context to discard any certificates that were already loaded - context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) - context.check_hostname, context.verify_mode = True, ssl.CERT_REQUIRED for storename in ('CA', 'ROOT'): _ssl_load_windows_store_certs(context, storename) context.set_default_verify_paths() + + client_certfile = params.get('client_certificate') + if client_certfile: + try: + context.load_cert_chain( + client_certfile, keyfile=params.get('client_certificate_key'), + password=params.get('client_certificate_password')) + except ssl.SSLError: + raise YoutubeDLError('Unable to load client certificate') + + # Some servers may reject requests if ALPN extension is not sent. See: + # https://github.com/python/cpython/issues/85140 + # https://github.com/hypervideo/hypervideo/issues/3878 + with contextlib.suppress(NotImplementedError): + context.set_alpn_protocols(['http/1.1']) + return YoutubeDLHTTPSHandler(params, context=context, **kwargs) @@ -1063,7 +1069,7 @@ class YoutubeDLError(Exception): super().__init__(self.msg) -network_exceptions = [compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error] +network_exceptions = [urllib.error.URLError, http.client.HTTPException, socket.error] if hasattr(ssl, 'CertificateError'): network_exceptions.append(ssl.CertificateError) network_exceptions = tuple(network_exceptions) @@ -1086,13 +1092,18 @@ class ExtractorError(YoutubeDLError): self.video_id = video_id self.ie = ie self.exc_info = sys.exc_info() # preserve original exception + if isinstance(self.exc_info[1], ExtractorError): + self.exc_info = self.exc_info[1].exc_info + super().__init__(self.__msg) - super(ExtractorError, self).__init__(''.join(( - format_field(ie, template='[%s] '), - format_field(video_id, template='%s: '), - msg, - format_field(cause, template=' (caused by %r)'), - '' if expected else bug_reports_message()))) + @property + def __msg(self): + return ''.join(( + format_field(self.ie, None, '[%s] '), + format_field(self.video_id, None, '%s: '), + self.orig_msg, + format_field(self.cause, None, ' (caused by %r)'), + '' if self.expected else bug_reports_message())) def format_traceback(self): return join_nonempty( @@ -1100,10 +1111,16 @@ class ExtractorError(YoutubeDLError): self.cause and ''.join(traceback.format_exception(None, self.cause, self.cause.__traceback__)[1:]), delim='\n') or None + def __setattr__(self, name, value): + super().__setattr__(name, value) + if getattr(self, 'msg', None) and name not in ('msg', 'args'): + self.msg = self.__msg or type(self).__name__ + self.args = (self.msg, ) # Cannot be property + class UnsupportedError(ExtractorError): def __init__(self, url): - super(UnsupportedError, self).__init__( + super().__init__( 'Unsupported URL: %s' % url, expected=True) self.url = url @@ -1122,10 +1139,18 @@ 
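For reference, the TLS hardening added earlier in this hunk boils down to the following stdlib-ssl recipe (a condensed sketch, not the module's exact control flow; the cipher string assumes OpenSSL 1.1.1+ as noted above):

import contextlib
import ssl

context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
context.minimum_version = ssl.TLSVersion.TLSv1_2  # backported Python 3.10 default
context.set_ciphers('@SECLEVEL=2:ECDH+AESGCM:ECDH+CHACHA20:ECDH+AES:DHE+AES:!aNULL:!eNULL:!aDSS:!SHA1:!AESCCM')
with contextlib.suppress(NotImplementedError):
    context.set_alpn_protocols(['http/1.1'])  # some servers reject requests without ALPN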
class GeoRestrictedError(ExtractorError): def __init__(self, msg, countries=None, **kwargs): kwargs['expected'] = True - super(GeoRestrictedError, self).__init__(msg, **kwargs) + super().__init__(msg, **kwargs) self.countries = countries +class UserNotLive(ExtractorError): + """Error when a channel/user is not live""" + + def __init__(self, msg=None, **kwargs): + kwargs['expected'] = True + super().__init__(msg or 'The channel is not currently live', **kwargs) + + class DownloadError(YoutubeDLError): """Download Error exception. @@ -1136,7 +1161,7 @@ class DownloadError(YoutubeDLError): def __init__(self, msg, exc_info=None): """ exc_info, if given, is the original exception that caused the trouble (as returned by sys.exc_info()). """ - super(DownloadError, self).__init__(msg) + super().__init__(msg) self.exc_info = exc_info @@ -1230,9 +1255,7 @@ class ContentTooShortError(YoutubeDLError): """ def __init__(self, downloaded, expected): - super(ContentTooShortError, self).__init__( - 'Downloaded {0} bytes, expected {1} bytes'.format(downloaded, expected) - ) + super().__init__(f'Downloaded {downloaded} bytes, expected {expected} bytes') # Both in bytes self.downloaded = downloaded self.expected = expected @@ -1240,7 +1263,7 @@ class ContentTooShortError(YoutubeDLError): class XAttrMetadataError(YoutubeDLError): def __init__(self, code=None, msg='Unknown error'): - super(XAttrMetadataError, self).__init__(msg) + super().__init__(msg) self.code = code self.msg = msg @@ -1259,12 +1282,7 @@ class XAttrUnavailableError(YoutubeDLError): def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): - # Working around python 2 bug (see http://bugs.python.org/issue17849) by limiting - # expected HTTP responses to meet HTTP/1.0 or later (see also - # https://github.com/ytdl-org/youtube-dl/issues/6727) - if sys.version_info < (3, 0): - kwargs['strict'] = True - hc = http_class(*args, **compat_kwargs(kwargs)) + hc = http_class(*args, **kwargs) source_address = ydl_handler._params.get('source_address') if source_address is not None: @@ -1281,7 +1299,7 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): ip_addrs = [addr for addr in addrs if addr[0] == af] if addrs and not ip_addrs: ip_version = 'v4' if af == socket.AF_INET else 'v6' - raise socket.error( + raise OSError( "No remote IP%s addresses available for connect, can't use '%s' as source address" % (ip_version, source_address[0])) for res in ip_addrs: @@ -1295,30 +1313,17 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): sock.connect(sa) err = None # Explicitly break reference cycle return sock - except socket.error as _: + except OSError as _: err = _ if sock is not None: sock.close() if err is not None: raise err else: - raise socket.error('getaddrinfo returns an empty list') + raise OSError('getaddrinfo returns an empty list') if hasattr(hc, '_create_connection'): hc._create_connection = _create_connection - sa = (source_address, 0) - if hasattr(hc, 'source_address'): # Python 2.7+ - hc.source_address = sa - else: # Python 2.6 - def _hc_connect(self, *args, **kwargs): - sock = _create_connection( - (self.host, self.port), self.timeout, sa) - if is_https: - self.sock = ssl.wrap_socket( - sock, self.key_file, self.cert_file, - ssl_version=ssl.PROTOCOL_TLSv1) - else: - self.sock = sock - hc.connect = functools.partial(_hc_connect, hc) + hc.source_address = (source_address, 0) return hc @@ -1327,13 +1332,13 @@ def handle_youtubedl_headers(headers): filtered_headers = 
headers if 'Youtubedl-no-compression' in filtered_headers: - filtered_headers = dict((k, v) for k, v in filtered_headers.items() if k.lower() != 'accept-encoding') + filtered_headers = {k: v for k, v in filtered_headers.items() if k.lower() != 'accept-encoding'} del filtered_headers['Youtubedl-no-compression'] return filtered_headers -class YoutubeDLHandler(compat_urllib_request.HTTPHandler): +class YoutubeDLHandler(urllib.request.HTTPHandler): """Handler for HTTP requests and responses. This class, when installed with an OpenerDirector, automatically adds @@ -1352,11 +1357,11 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): """ def __init__(self, params, *args, **kwargs): - compat_urllib_request.HTTPHandler.__init__(self, *args, **kwargs) + urllib.request.HTTPHandler.__init__(self, *args, **kwargs) self._params = params def http_open(self, req): - conn_class = compat_http_client.HTTPConnection + conn_class = http.client.HTTPConnection socks_proxy = req.headers.get('Ytdl-socks-proxy') if socks_proxy: @@ -1380,7 +1385,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): def brotli(data): if not data: return data - return compat_brotli.decompress(data) + return brotli.decompress(data) def http_request(self, req): # According to RFC 3986, URLs cannot contain non-ASCII characters, however this is not @@ -1409,12 +1414,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): req.headers = handle_youtubedl_headers(req.headers) - if sys.version_info < (2, 7) and '#' in req.get_full_url(): - # Python 2.6 is brain-dead when it comes to fragments - req._Request__original = req._Request__original.partition('#')[0] - req._Request__r_type = req._Request__r_type.partition('#')[0] - - return req + return super().do_request_(req) def http_response(self, req, resp): old_resp = resp @@ -1424,30 +1424,30 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): gz = gzip.GzipFile(fileobj=io.BytesIO(content), mode='rb') try: uncompressed = io.BytesIO(gz.read()) - except IOError as original_ioerror: + except OSError as original_ioerror: # There may be junk at the end of the file # See http://stackoverflow.com/q/4928560/35070 for details for i in range(1, 1024): try: gz = gzip.GzipFile(fileobj=io.BytesIO(content[:-i]), mode='rb') uncompressed = io.BytesIO(gz.read()) - except IOError: + except OSError: continue break else: raise original_ioerror - resp = compat_urllib_request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) + resp = urllib.request.addinfourl(uncompressed, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg del resp.headers['Content-encoding'] # deflate if resp.headers.get('Content-encoding', '') == 'deflate': gz = io.BytesIO(self.deflate(resp.read())) - resp = compat_urllib_request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) + resp = urllib.request.addinfourl(gz, old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg del resp.headers['Content-encoding'] # brotli if resp.headers.get('Content-encoding', '') == 'br': - resp = compat_urllib_request.addinfourl( + resp = urllib.request.addinfourl( io.BytesIO(self.brotli(resp.read())), old_resp.headers, old_resp.url, old_resp.code) resp.msg = old_resp.msg del resp.headers['Content-encoding'] @@ -1457,15 +1457,10 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): location = resp.headers.get('Location') if location: # As of RFC 2616 default charset is iso-8859-1 that is respected by python 3 - if sys.version_info >= (3, 0): -
location = location.encode('iso-8859-1').decode('utf-8') - else: - location = location.decode('utf-8') + location = location.encode('iso-8859-1').decode() location_escaped = escape_url(location) if location != location_escaped: del resp.headers['Location'] - if sys.version_info < (3, 0): - location_escaped = location_escaped.encode('utf-8') resp.headers['Location'] = location_escaped return resp @@ -1475,9 +1470,9 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler): def make_socks_conn_class(base_class, socks_proxy): assert issubclass(base_class, ( - compat_http_client.HTTPConnection, compat_http_client.HTTPSConnection)) + http.client.HTTPConnection, http.client.HTTPSConnection)) - url_components = compat_urlparse.urlparse(socks_proxy) + url_components = urllib.parse.urlparse(socks_proxy) if url_components.scheme.lower() == 'socks5': socks_type = ProxyType.SOCKS5 elif url_components.scheme.lower() in ('socks', 'socks4'): @@ -1488,7 +1483,7 @@ def make_socks_conn_class(base_class, socks_proxy): def unquote_if_non_empty(s): if not s: return s - return compat_urllib_parse_unquote_plus(s) + return urllib.parse.unquote_plus(s) proxy_args = ( socks_type, @@ -1502,11 +1497,11 @@ def make_socks_conn_class(base_class, socks_proxy): def connect(self): self.sock = sockssocket() self.sock.setproxy(*proxy_args) - if type(self.timeout) in (int, float): + if isinstance(self.timeout, (int, float)): self.sock.settimeout(self.timeout) self.sock.connect((self.host, self.port)) - if isinstance(self, compat_http_client.HTTPSConnection): + if isinstance(self, http.client.HTTPSConnection): if hasattr(self, '_context'): # Python > 2.6 self.sock = self._context.wrap_socket( self.sock, server_hostname=self.host) @@ -1516,10 +1511,10 @@ def make_socks_conn_class(base_class, socks_proxy): return SocksConnection -class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): +class YoutubeDLHTTPSHandler(urllib.request.HTTPSHandler): def __init__(self, params, https_conn_class=None, *args, **kwargs): - compat_urllib_request.HTTPSHandler.__init__(self, *args, **kwargs) - self._https_conn_class = https_conn_class or compat_http_client.HTTPSConnection + urllib.request.HTTPSHandler.__init__(self, *args, **kwargs) + self._https_conn_class = https_conn_class or http.client.HTTPSConnection self._params = params def https_open(self, req): @@ -1536,12 +1531,21 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler): conn_class = make_socks_conn_class(conn_class, socks_proxy) del req.headers['Ytdl-socks-proxy'] - return self.do_open(functools.partial( - _create_http_connection, self, conn_class, True), - req, **kwargs) + try: + return self.do_open( + functools.partial(_create_http_connection, self, conn_class, True), req, **kwargs) + except urllib.error.URLError as e: + if (isinstance(e.reason, ssl.SSLError) + and getattr(e.reason, 'reason', None) == 'SSLV3_ALERT_HANDSHAKE_FAILURE'): + raise YoutubeDLError('SSLV3_ALERT_HANDSHAKE_FAILURE: Try using --legacy-server-connect') + raise -class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): +def is_path_like(f): + return isinstance(f, (str, bytes, os.PathLike)) + + +class YoutubeDLCookieJar(http.cookiejar.MozillaCookieJar): """ See [1] for cookie file format. 
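Each line of that format carries seven tab-separated fields, matching the _CookieFileEntry fields defined just below; a sketch with hypothetical values:

cookie_line = '\t'.join((
    '.example.com',  # domain_name
    'TRUE',          # include_subdomains: domain starts with '.'
    '/',             # path
    'FALSE',         # https_only
    '1735689600',    # expires_at (epoch seconds; 0 marks a session cookie)
    'session_id',    # name
    'abc123',        # value
))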
@@ -1557,57 +1561,67 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): 'CookieFileEntry', ('domain_name', 'include_subdomains', 'path', 'https_only', 'expires_at', 'name', 'value')) - def save(self, filename=None, ignore_discard=False, ignore_expires=False): + def __init__(self, filename=None, *args, **kwargs): + super().__init__(None, *args, **kwargs) + if is_path_like(filename): + filename = os.fspath(filename) + self.filename = filename + + @staticmethod + def _true_or_false(cndn): + return 'TRUE' if cndn else 'FALSE' + + @contextlib.contextmanager + def open(self, file, *, write=False): + if is_path_like(file): + with open(file, 'w' if write else 'r', encoding='utf-8') as f: + yield f + else: + if write: + file.truncate(0) + yield file + + def _really_save(self, f, ignore_discard=False, ignore_expires=False): + now = time.time() + for cookie in self: + if (not ignore_discard and cookie.discard + or not ignore_expires and cookie.is_expired(now)): + continue + name, value = cookie.name, cookie.value + if value is None: + # cookies.txt regards 'Set-Cookie: foo' as a cookie + # with no name, whereas http.cookiejar regards it as a + # cookie with no value. + name, value = '', name + f.write('%s\n' % '\t'.join(( + cookie.domain, + self._true_or_false(cookie.domain.startswith('.')), + cookie.path, + self._true_or_false(cookie.secure), + str_or_none(cookie.expires, default=''), + name, value + ))) + + def save(self, filename=None, *args, **kwargs): """ Save cookies to a file. + Code is taken from CPython 3.6 + https://github.com/python/cpython/blob/8d999cbf4adea053be6dbb612b9844635c4dfb8e/Lib/http/cookiejar.py#L2091-L2117 """ - Most of the code is taken from CPython 3.8 and slightly adapted - to support cookie files with UTF-8 in both python 2 and 3. - """ if filename is None: if self.filename is not None: filename = self.filename else: - raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT) + raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) - # Store session cookies with `expires` set to 0 instead of an empty - # string + # Store session cookies with `expires` set to 0 instead of an empty string for cookie in self: if cookie.expires is None: cookie.expires = 0 - with io.open(filename, 'w', encoding='utf-8') as f: + with self.open(filename, write=True) as f: f.write(self._HEADER) - now = time.time() - for cookie in self: - if not ignore_discard and cookie.discard: - continue - if not ignore_expires and cookie.is_expired(now): - continue - if cookie.secure: - secure = 'TRUE' - else: - secure = 'FALSE' - if cookie.domain.startswith('.'): - initial_dot = 'TRUE' - else: - initial_dot = 'FALSE' - if cookie.expires is not None: - expires = compat_str(cookie.expires) - else: - expires = '' - if cookie.value is None: - # cookies.txt regards 'Set-Cookie: foo' as a cookie - # with no name, whereas http.cookiejar regards it as a - # cookie with no value. 
- name = '' - value = cookie.name - else: - name = cookie.name - value = cookie.value - f.write( - '\t'.join([cookie.domain, initial_dot, cookie.path, - secure, expires, name, value]) + '\n') + self._really_save(f, *args, **kwargs) def load(self, filename=None, ignore_discard=False, ignore_expires=False): """Load cookies from a file.""" @@ -1615,7 +1629,7 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): if self.filename is not None: filename = self.filename else: - raise ValueError(compat_cookiejar.MISSING_FILENAME_TEXT) + raise ValueError(http.cookiejar.MISSING_FILENAME_TEXT) def prepare_line(line): if line.startswith(self._HTTPONLY_PREFIX): @@ -1625,21 +1639,23 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): return line cookie_list = line.split('\t') if len(cookie_list) != self._ENTRY_LEN: - raise compat_cookiejar.LoadError('invalid length %d' % len(cookie_list)) + raise http.cookiejar.LoadError('invalid length %d' % len(cookie_list)) cookie = self._CookieFileEntry(*cookie_list) if cookie.expires_at and not cookie.expires_at.isdigit(): - raise compat_cookiejar.LoadError('invalid expires at %s' % cookie.expires_at) + raise http.cookiejar.LoadError('invalid expires at %s' % cookie.expires_at) return line cf = io.StringIO() - with io.open(filename, encoding='utf-8') as f: + with self.open(filename) as f: for line in f: try: cf.write(prepare_line(line)) - except compat_cookiejar.LoadError as e: - write_string( - 'WARNING: skipping cookie file entry due to %s: %r\n' - % (e, line), sys.stderr) + except http.cookiejar.LoadError as e: + if f'{line.strip()} '[0] in '[{"': + raise http.cookiejar.LoadError( + 'Cookies file must be Netscape formatted, not JSON. See ' + 'https://github.com/hypervideo/hypervideo/wiki/FAQ#how-do-i-pass-cookies-to-hypervideo') + write_string(f'WARNING: skipping cookie file entry due to {e}: {line!r}\n') continue cf.seek(0) self._really_load(cf, filename, ignore_discard, ignore_expires) @@ -1659,31 +1675,18 @@ class YoutubeDLCookieJar(compat_cookiejar.MozillaCookieJar): cookie.discard = True -class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): +class YoutubeDLCookieProcessor(urllib.request.HTTPCookieProcessor): def __init__(self, cookiejar=None): - compat_urllib_request.HTTPCookieProcessor.__init__(self, cookiejar) + urllib.request.HTTPCookieProcessor.__init__(self, cookiejar) def http_response(self, request, response): - # Python 2 will choke on next HTTP request in row if there are non-ASCII - # characters in Set-Cookie HTTP header of last response (see - # https://github.com/ytdl-org/youtube-dl/issues/6769). - # In order to at least prevent crashing we will percent encode Set-Cookie - # header before HTTPCookieProcessor starts processing it. 
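The Netscape-vs-JSON check used by load() above can be seen in isolation; a minimal sketch (looks_like_json is a hypothetical helper):

def looks_like_json(line):
    # A JSON export starts with '[', '{' or '"'; a cookies.txt line never does.
    return f'{line.strip()} '[0] in '[{"'

assert looks_like_json('{"cookies": []}')
assert not looks_like_json('.example.com\tTRUE\t/\tFALSE\t0\tid\tvalue')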
- # if sys.version_info < (3, 0) and response.headers: - # for set_cookie_header in ('Set-Cookie', 'Set-Cookie2'): - # set_cookie = response.headers.get(set_cookie_header) - # if set_cookie: - # set_cookie_escaped = compat_urllib_parse.quote(set_cookie, b"%/;:@&=+$,!~*'()?#[] ") - # if set_cookie != set_cookie_escaped: - # del response.headers[set_cookie_header] - # response.headers[set_cookie_header] = set_cookie_escaped - return compat_urllib_request.HTTPCookieProcessor.http_response(self, request, response) - - https_request = compat_urllib_request.HTTPCookieProcessor.http_request + return urllib.request.HTTPCookieProcessor.http_response(self, request, response) + + https_request = urllib.request.HTTPCookieProcessor.http_request https_response = http_response -class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): +class YoutubeDLRedirectHandler(urllib.request.HTTPRedirectHandler): """YoutubeDL redirect handler The code is based on HTTPRedirectHandler implementation from CPython [1]. @@ -1698,7 +1701,7 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): 3. https://github.com/ytdl-org/youtube-dl/issues/28768 """ - http_error_301 = http_error_303 = http_error_307 = http_error_308 = compat_urllib_request.HTTPRedirectHandler.http_error_302 + http_error_301 = http_error_303 = http_error_307 = http_error_308 = urllib.request.HTTPRedirectHandler.http_error_302 def redirect_request(self, req, fp, code, msg, headers, newurl): """Return a Request or None in response to a redirect. @@ -1713,19 +1716,13 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): m = req.get_method() if (not (code in (301, 302, 303, 307, 308) and m in ("GET", "HEAD") or code in (301, 302, 303) and m == "POST")): - raise compat_HTTPError(req.full_url, code, msg, headers, fp) + raise urllib.error.HTTPError(req.full_url, code, msg, headers, fp) # Strictly (according to RFC 2616), 301 or 302 in response to # a POST MUST NOT cause a redirection without confirmation # from the user (of urllib.request, in this case). In practice, # essentially all clients do redirect in this case, so we do # the same. - # On python 2 urlh.geturl() may sometimes return redirect URL - # as byte string instead of unicode. This workaround allows - # to force it always return unicode. - if sys.version_info[0] < 3: - newurl = compat_str(newurl) - # Be conciliant with URIs containing a space. This is mainly # redundant with the more complete encoding done in http_error_302(), # but it is kept for compatibility with other callers. @@ -1733,11 +1730,22 @@ class YoutubeDLRedirectHandler(compat_urllib_request.HTTPRedirectHandler): CONTENT_HEADERS = ("content-length", "content-type") # NB: don't use dict comprehension for python 2.6 compatibility - newheaders = dict((k, v) for k, v in req.headers.items() - if k.lower() not in CONTENT_HEADERS) - return compat_urllib_request.Request( + newheaders = {k: v for k, v in req.headers.items() if k.lower() not in CONTENT_HEADERS} + + # A 303 must either use GET or HEAD for subsequent request + # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.4 + if code == 303 and m != 'HEAD': + m = 'GET' + # 301 and 302 redirects are commonly turned into a GET from a POST + # for subsequent requests by browsers, so we'll do the same. 
+ # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.2 + # https://datatracker.ietf.org/doc/html/rfc7231#section-6.4.3 + if code in (301, 302) and m == 'POST': + m = 'GET' + + return urllib.request.Request( newurl, headers=newheaders, origin_req_host=req.origin_req_host, - unverifiable=True) + unverifiable=True, method=m) def extract_timezone(date_str): @@ -1753,7 +1761,11 @@ def extract_timezone(date_str): $) ''', date_str) if not m: - timezone = datetime.timedelta() + m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str) + timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip()) + if timezone is not None: + date_str = date_str[:-len(m.group('tz'))] + timezone = datetime.timedelta(hours=timezone or 0) else: date_str = date_str[:-len(m.group('tz'))] if not m.group('sign'): @@ -1777,12 +1789,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): if timezone is None: timezone, date_str = extract_timezone(date_str) - try: - date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) + with contextlib.suppress(ValueError): + date_format = f'%Y-%m-%d{delimiter}%H:%M:%S' dt = datetime.datetime.strptime(date_str, date_format) - timezone return calendar.timegm(dt.timetuple()) - except ValueError: - pass def date_formats(day_first=True): @@ -1802,26 +1812,23 @@ def unified_strdate(date_str, day_first=True): _, date_str = extract_timezone(date_str) for expression in date_formats(day_first): - try: + with contextlib.suppress(ValueError): upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') - except ValueError: - pass if upload_date is None: timetuple = email.utils.parsedate_tz(date_str) if timetuple: - try: + with contextlib.suppress(ValueError): upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') - except ValueError: - pass if upload_date is not None: - return compat_str(upload_date) + return str(upload_date) def unified_timestamp(date_str, day_first=True): if date_str is None: return None - date_str = re.sub(r'[,|]', '', date_str) + date_str = re.sub(r'\s+', ' ', re.sub( + r'(?i)[,|]|(mon|tues?|wed(nes)?|thu(rs)?|fri|sat(ur)?)(day)?', '', date_str)) pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 timezone, date_str = extract_timezone(date_str) @@ -1840,14 +1847,13 @@ def unified_timestamp(date_str, day_first=True): date_str = m.group(1) for expression in date_formats(day_first): - try: + with contextlib.suppress(ValueError): dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) return calendar.timegm(dt.timetuple()) - except ValueError: - pass + timetuple = email.utils.parsedate_tz(date_str) if timetuple: - return calendar.timegm(timetuple) + pm_delta * 3600 + return calendar.timegm(timetuple) + pm_delta * 3600 - timezone.total_seconds() def determine_ext(url, default_ext='unknown_video'): @@ -1868,14 +1874,14 @@ def subtitles_filename(filename, sub_lang, sub_format, expected_real_ext=None): def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): - """ - Return a datetime object from a string in the format YYYYMMDD or - (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)? - - format: string date format used to return datetime object from - precision: round the time portion of a datetime object. - auto|microsecond|second|minute|hour|day. - auto: round to the unit provided in date_str (if applicable). + R""" + Return a datetime object from a string. 
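Illustrative inputs (per the grammar below): '20221202', 'now-1day', 'today+2weeks'.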
+ Supported format: + (now|today|yesterday|DATE)([+-]\d+(microsecond|second|minute|hour|day|week|month|year)s?)? + + @param format strftime format of DATE + @param precision Round the datetime object: auto|microsecond|second|minute|hour|day + auto: round to the unit provided in date_str (if applicable). """ auto_precision = False if precision == 'auto': @@ -1887,7 +1893,7 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): if date_str == 'yesterday': return today - datetime.timedelta(days=1) match = re.match( - r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)(s)?', + r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?', date_str) if match is not None: start_time = datetime_from_str(match.group('start'), precision, format) @@ -1910,16 +1916,14 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): def date_from_str(date_str, format='%Y%m%d', strict=False): - """ - Return a datetime object from a string in the format YYYYMMDD or - (now|today|yesterday|date)[+-][0-9](microsecond|second|minute|hour|day|week|month|year)(s)? - - If "strict", only (now|today)[+-][0-9](day|week|month|year)(s)? is allowed + R""" + Return a date object from a string using datetime_from_str - format: string date format used to return datetime object from + @param strict Restrict allowed patterns to "YYYYMMDD" and + (now|today|yesterday)(-\d+(day|week|month|year)s?)? """ - if strict and not re.fullmatch(r'\d{8}|(now|today)[+-]\d+(day|week|month|year)(s)?', date_str): - raise ValueError(f'Invalid date format {date_str}') + if strict and not re.fullmatch(r'\d{8}|(now|today|yesterday)(-\d+(day|week|month|year)s?)?', date_str): + raise ValueError(f'Invalid date format "{date_str}"') return datetime_from_str(date_str, precision='microsecond', format=format).date() @@ -1960,7 +1964,7 @@ def hyphenate_date(date_str): return date_str -class DateRange(object): +class DateRange: """Represents a time interval between two dates""" def __init__(self, start=None, end=None): @@ -1988,121 +1992,81 @@ class DateRange(object): return self.start <= date <= self.end def __str__(self): - return '%s - %s' % (self.start.isoformat(), self.end.isoformat()) + return f'{self.start.isoformat()} - {self.end.isoformat()}' + def __eq__(self, other): + return (isinstance(other, DateRange) + and self.start == other.start and self.end == other.end) -def platform_name(): - """ Returns the platform name as a compat_str """ - res = platform.platform() - if isinstance(res, bytes): - res = res.decode(preferredencoding()) - assert isinstance(res, compat_str) - return res +def platform_name(): + """ Returns the platform name as a str """ + deprecation_warning(f'"{__name__}.platform_name" is deprecated, use "platform.platform" instead') + return platform.platform() + + +@functools.cache +def system_identifier(): + python_implementation = platform.python_implementation() + if python_implementation == 'PyPy' and hasattr(sys, 'pypy_version_info'): + python_implementation += ' version %d.%d.%d' % sys.pypy_version_info[:3] + libc_ver = [] + with contextlib.suppress(OSError): # We may not have access to the executable + libc_ver = platform.libc_ver() + + return 'Python %s (%s %s %s) - %s (%s%s)' % ( + platform.python_version(), + python_implementation, + platform.machine(), + platform.architecture()[0], + platform.platform(), + ssl.OPENSSL_VERSION, + format_field(join_nonempty(*libc_ver, delim=' '), None, ', %s'), + ) 
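A short usage sketch of DateRange and its new __eq__ (dates are illustrative; strings are resolved through date_from_str):

rng = DateRange(start='20220101', end='20221231')
assert '20220615' in rng
assert DateRange('20220101', '20221231') == rng  # __eq__ added above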
+@functools.cache def get_windows_version(): - ''' Get Windows version. None if it's not running on Windows ''' + ''' Get Windows version. returns () if it's not running on Windows ''' if compat_os_name == 'nt': return version_tuple(platform.win32_ver()[1]) else: - return None - + return () -def _windows_write_string(s, out): - """ Returns True if the string was written using special methods, - False if it has yet to be written out.""" - # Adapted from http://stackoverflow.com/a/3259271/35070 - import ctypes.wintypes - - WIN_OUTPUT_IDS = { - 1: -11, - 2: -12, - } - - try: - fileno = out.fileno() - except AttributeError: - # If the output stream doesn't have a fileno, it's virtual - return False - except io.UnsupportedOperation: - # Some strange Windows pseudo files? - return False - if fileno not in WIN_OUTPUT_IDS: - return False - - GetStdHandle = compat_ctypes_WINFUNCTYPE( - ctypes.wintypes.HANDLE, ctypes.wintypes.DWORD)( - ('GetStdHandle', ctypes.windll.kernel32)) - h = GetStdHandle(WIN_OUTPUT_IDS[fileno]) - - WriteConsoleW = compat_ctypes_WINFUNCTYPE( - ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, ctypes.wintypes.LPWSTR, - ctypes.wintypes.DWORD, ctypes.POINTER(ctypes.wintypes.DWORD), - ctypes.wintypes.LPVOID)(('WriteConsoleW', ctypes.windll.kernel32)) - written = ctypes.wintypes.DWORD(0) - - GetFileType = compat_ctypes_WINFUNCTYPE(ctypes.wintypes.DWORD, ctypes.wintypes.DWORD)(('GetFileType', ctypes.windll.kernel32)) - FILE_TYPE_CHAR = 0x0002 - FILE_TYPE_REMOTE = 0x8000 - GetConsoleMode = compat_ctypes_WINFUNCTYPE( - ctypes.wintypes.BOOL, ctypes.wintypes.HANDLE, - ctypes.POINTER(ctypes.wintypes.DWORD))( - ('GetConsoleMode', ctypes.windll.kernel32)) - INVALID_HANDLE_VALUE = ctypes.wintypes.DWORD(-1).value - - def not_a_console(handle): - if handle == INVALID_HANDLE_VALUE or handle is None: - return True - return ((GetFileType(handle) & ~FILE_TYPE_REMOTE) != FILE_TYPE_CHAR - or GetConsoleMode(handle, ctypes.byref(ctypes.wintypes.DWORD())) == 0) +def write_string(s, out=None, encoding=None): + assert isinstance(s, str) + out = out or sys.stderr - if not_a_console(h): - return False + if compat_os_name == 'nt' and supports_terminal_sequences(out): + s = re.sub(r'([\r\n]+)', r' \1', s) - def next_nonbmp_pos(s): - try: - return next(i for i, c in enumerate(s) if ord(c) > 0xffff) - except StopIteration: - return len(s) - - while s: - count = min(next_nonbmp_pos(s), 1024) - - ret = WriteConsoleW( - h, s, count if count else 2, ctypes.byref(written), None) - if ret == 0: - raise OSError('Failed to write string') - if not count: # We just wrote a non-BMP character - assert written.value == 2 - s = s[1:] - else: - assert written.value > 0 - s = s[written.value:] - return True + enc, buffer = None, out + if 'b' in getattr(out, 'mode', ''): + enc = encoding or preferredencoding() + elif hasattr(out, 'buffer'): + buffer = out.buffer + enc = encoding or getattr(out, 'encoding', None) or preferredencoding() + buffer.write(s.encode(enc, 'ignore') if enc else s) + out.flush() -def write_string(s, out=None, encoding=None): - if out is None: - out = sys.stderr - assert type(s) == compat_str - if sys.platform == 'win32' and encoding is None and hasattr(out, 'fileno'): - if _windows_write_string(s, out): +def deprecation_warning(msg, *, printer=None, stacklevel=0, **kwargs): + from . 
import _IN_CLI + if _IN_CLI: + if msg in deprecation_warning._cache: return - - if ('b' in getattr(out, 'mode', '') - or sys.version_info[0] < 3): # Python 2 lies about mode of sys.stderr - byt = s.encode(encoding or preferredencoding(), 'ignore') - out.write(byt) - elif hasattr(out, 'buffer'): - enc = encoding or getattr(out, 'encoding', None) or preferredencoding() - byt = s.encode(enc, 'ignore') - out.buffer.write(byt) + deprecation_warning._cache.add(msg) + if printer: + return printer(f'{msg}{bug_reports_message()}', **kwargs) + return write_string(f'ERROR: {msg}{bug_reports_message()}\n', **kwargs) else: - out.write(s) - out.flush() + import warnings + warnings.warn(DeprecationWarning(msg), stacklevel=stacklevel + 3) + + +deprecation_warning._cache = set() def bytes_to_intlist(bs): @@ -2117,11 +2081,19 @@ def bytes_to_intlist(bs): def intlist_to_bytes(xs): if not xs: return b'' - return compat_struct_pack('%dB' % len(xs), *xs) + return struct.pack('%dB' % len(xs), *xs) + + +class LockingUnsupportedError(OSError): + msg = 'File locking is not supported' + + def __init__(self): + super().__init__(self.msg) # Cross-platform file locking if sys.platform == 'win32': + import ctypes import ctypes.wintypes import msvcrt @@ -2167,7 +2139,8 @@ if sys.platform == 'win32': if not LockFileEx(msvcrt.get_osfhandle(f.fileno()), (0x2 if exclusive else 0x0) | (0x0 if block else 0x1), 0, whole_low, whole_high, f._lock_file_overlapped_p): - raise BlockingIOError('Locking file failed: %r' % ctypes.FormatError()) + # NB: No argument form of "ctypes.FormatError" does not work on PyPy + raise BlockingIOError(f'Locking file failed: {ctypes.FormatError(ctypes.GetLastError())!r}') def _unlock_file(f): assert f._lock_file_overlapped_p @@ -2180,18 +2153,15 @@ else: import fcntl def _lock_file(f, exclusive, block): + flags = fcntl.LOCK_EX if exclusive else fcntl.LOCK_SH + if not block: + flags |= fcntl.LOCK_NB try: - fcntl.flock(f, - fcntl.LOCK_SH if not exclusive - else fcntl.LOCK_EX if block - else fcntl.LOCK_EX | fcntl.LOCK_NB) + fcntl.flock(f, flags) except BlockingIOError: raise except OSError: # AOSP does not have flock() - fcntl.lockf(f, - fcntl.LOCK_SH if not exclusive - else fcntl.LOCK_EX if block - else fcntl.LOCK_EX | fcntl.LOCK_NB) + fcntl.lockf(f, flags) def _unlock_file(f): try: @@ -2200,60 +2170,80 @@ else: fcntl.lockf(f, fcntl.LOCK_UN) except ImportError: - UNSUPPORTED_MSG = 'file locking is not supported on this platform' def _lock_file(f, exclusive, block): - raise IOError(UNSUPPORTED_MSG) + raise LockingUnsupportedError() def _unlock_file(f): - raise IOError(UNSUPPORTED_MSG) + raise LockingUnsupportedError() -class locked_file(object): - _closed = False +class locked_file: + locked = False def __init__(self, filename, mode, block=True, encoding=None): - assert mode in ['r', 'rb', 'a', 'ab', 'w', 'wb'] - self.f = io.open(filename, mode, encoding=encoding) - self.mode = mode - self.block = block + if mode not in {'r', 'rb', 'a', 'ab', 'w', 'wb'}: + raise NotImplementedError(mode) + self.mode, self.block = mode, block + + writable = any(f in mode for f in 'wax+') + readable = any(f in mode for f in 'r+') + flags = functools.reduce(operator.ior, ( + getattr(os, 'O_CLOEXEC', 0), # UNIX only + getattr(os, 'O_BINARY', 0), # Windows only + getattr(os, 'O_NOINHERIT', 0), # Windows only + os.O_CREAT if writable else 0, # O_TRUNC only after locking + os.O_APPEND if 'a' in mode else 0, + os.O_EXCL if 'x' in mode else 0, + os.O_RDONLY if not writable else os.O_RDWR if readable else os.O_WRONLY, + )) + + 
self.f = os.fdopen(os.open(filename, flags, 0o666), mode, encoding=encoding) def __enter__(self): exclusive = 'r' not in self.mode try: _lock_file(self.f, exclusive, self.block) - except IOError: + self.locked = True + except OSError: self.f.close() raise + if 'w' in self.mode: + try: + self.f.truncate() + except OSError as e: + if e.errno not in ( + errno.ESPIPE, # Illegal seek - expected for FIFO + errno.EINVAL, # Invalid argument - expected for /dev/null + ): + raise return self - def __exit__(self, etype, value, traceback): + def unlock(self): + if not self.locked: + return try: - if not self._closed: - _unlock_file(self.f) + _unlock_file(self.f) finally: - self.f.close() - self._closed = True + self.locked = False - def __iter__(self): - return iter(self.f) - - def write(self, *args): - return self.f.write(*args) - - def read(self, *args): - return self.f.read(*args) + def __exit__(self, *_): + try: + self.unlock() + finally: + self.f.close() - def flush(self): - self.f.flush() + open = __enter__ + close = __exit__ - def open(self): - return self.__enter__() + def __getattr__(self, attr): + return getattr(self.f, attr) - def close(self, *args): - self.__exit__(self, *args, value=False, traceback=False) + def __iter__(self): + return iter(self.f) +@functools.cache def get_filesystem_encoding(): encoding = sys.getfilesystemencoding() return encoding if encoding is not None else 'utf-8' @@ -2275,7 +2265,7 @@ def smuggle_url(url, data): url, idata = unsmuggle_url(url, {}) data.update(idata) - sdata = compat_urllib_parse_urlencode( + sdata = urllib.parse.urlencode( {'__youtubedl_smuggle': json.dumps(data)}) return url + '#' + sdata @@ -2284,7 +2274,7 @@ def unsmuggle_url(smug_url, default=None): if '#__youtubedl_smuggle' not in smug_url: return smug_url, default url, _, sdata = smug_url.rpartition('#') - jsond = compat_parse_qs(sdata)['__youtubedl_smuggle'][0] + jsond = urllib.parse.parse_qs(sdata)['__youtubedl_smuggle'][0] data = json.loads(jsond) return url, data @@ -2307,15 +2297,24 @@ def format_bytes(bytes): return format_decimal_suffix(bytes, '%.2f%sB', factor=1024) or 'N/A' -def lookup_unit_table(unit_table, s): +def lookup_unit_table(unit_table, s, strict=False): + num_re = NUMBER_RE if strict else NUMBER_RE.replace(R'\.', '[,.]') units_re = '|'.join(re.escape(u) for u in unit_table) - m = re.match( - r'(?P<num>[0-9]+(?:[,.][0-9]*)?)\s*(?P<unit>%s)\b' % units_re, s) + m = (re.fullmatch if strict else re.match)( + rf'(?P<num>{num_re})\s*(?P<unit>{units_re})\b', s) if not m: return None - num_str = m.group('num').replace(',', '.') + + num = float(m.group('num').replace(',', '.')) mult = unit_table[m.group('unit')] - return int(float(num_str) * mult) + return round(num * mult) + + +def parse_bytes(s): + """Parse a string indicating a byte quantity into an integer""" + return lookup_unit_table( + {u: 1024**i for i, u in enumerate(['', *'KMGTPEZY'])}, + s.upper(), strict=True) def parse_filesize(s): @@ -2444,7 +2443,7 @@ def parse_resolution(s, *, lenient=False): def parse_bitrate(s): - if not isinstance(s, compat_str): + if not isinstance(s, str): return mobj = re.search(r'\b(\d+)\s*kbps', s) if mobj: @@ -2481,11 +2480,12 @@ def fix_xml_ampersands(xml_str): def setproctitle(title): - assert isinstance(title, compat_str) + assert isinstance(title, str) - # ctypes in Jython is not complete - # http://bugs.jython.org/issue2148 - if sys.platform.startswith('java'): + # Workaround for https://github.com/hypervideo/hypervideo/issues/4541 + try: + import ctypes + except ImportError: return 
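The smuggle/unsmuggle helpers above are inverses; an illustrative round trip (URL and payload are hypothetical):

url = smuggle_url('https://example.com/video', {'referer': 'https://example.com/'})
# url is now 'https://example.com/video#__youtubedl_smuggle=...'
assert unsmuggle_url(url) == ('https://example.com/video', {'referer': 'https://example.com/'})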
try: @@ -2497,7 +2497,7 @@ def setproctitle(title): # a bytestring, but since unicode_literals turns # every string into a unicode string, it fails. return - title_bytes = title.encode('utf-8') + title_bytes = title.encode() buf = ctypes.create_string_buffer(len(title_bytes)) buf.value = title_bytes try: @@ -2524,40 +2524,43 @@ def remove_quotes(s): def get_domain(url): - domain = re.match(r'(?:https?:\/\/)?(?:www\.)?(?P<domain>[^\n\/]+\.[^\n\/]+)(?:\/(.*))?', url) - return domain.group('domain') if domain else None + """ + This implementation is inconsistent, but is kept for compatibility. + Use this only for "webpage_url_domain" + """ + return remove_start(urllib.parse.urlparse(url).netloc, 'www.') or None def url_basename(url): - path = compat_urlparse.urlparse(url).path + path = urllib.parse.urlparse(url).path return path.strip('/').split('/')[-1] def base_url(url): - return re.match(r'https?://[^?#&]+/', url).group() + return re.match(r'https?://[^?#]+/', url).group() def urljoin(base, path): if isinstance(path, bytes): - path = path.decode('utf-8') - if not isinstance(path, compat_str) or not path: + path = path.decode() + if not isinstance(path, str) or not path: return None if re.match(r'^(?:[a-zA-Z][a-zA-Z0-9+-.]*:)?//', path): return path if isinstance(base, bytes): - base = base.decode('utf-8') - if not isinstance(base, compat_str) or not re.match( + base = base.decode() + if not isinstance(base, str) or not re.match( r'^(?:https?:)?//', base): return None - return compat_urlparse.urljoin(base, path) + return urllib.parse.urljoin(base, path) -class HEADRequest(compat_urllib_request.Request): +class HEADRequest(urllib.request.Request): def get_method(self): return 'HEAD' -class PUTRequest(compat_urllib_request.Request): +class PUTRequest(urllib.request.Request): def get_method(self): return 'PUT' @@ -2572,14 +2575,14 @@ def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): def str_or_none(v, default=None): - return default if v is None else compat_str(v) + return default if v is None else str(v) def str_to_int(int_str): """ A more relaxed version of int_or_none """ - if isinstance(int_str, compat_integer_types): + if isinstance(int_str, int): return int_str - elif isinstance(int_str, compat_str): + elif isinstance(int_str, str): int_str = re.sub(r'[,\.\+]', '', int_str) return int_or_none(int_str) @@ -2598,18 +2601,18 @@ def bool_or_none(v, default=None): def strip_or_none(v, default=None): - return v.strip() if isinstance(v, compat_str) else default + return v.strip() if isinstance(v, str) else default def url_or_none(url): - if not url or not isinstance(url, compat_str): + if not url or not isinstance(url, str): return None url = url.strip() return url if re.match(r'^(?:(?:https?|rt(?:m(?:pt?[es]?|fp)|sp[su]?)|mms|ftps?):)?//', url) else None def request_to_url(req): - if isinstance(req, compat_urllib_request.Request): + if isinstance(req, urllib.request.Request): return req.get_full_url() else: return req @@ -2618,17 +2621,21 @@ def request_to_url(req): def strftime_or_none(timestamp, date_format, default=None): datetime_object = None try: - if isinstance(timestamp, compat_numeric_types): # unix timestamp - datetime_object = datetime.datetime.utcfromtimestamp(timestamp) - elif isinstance(timestamp, compat_str): # assume YYYYMMDD + if isinstance(timestamp, (int, float)): # unix timestamp + # Using naive datetime here can break timestamp() in Windows + # Ref: https://github.com/hypervideo/hypervideo/issues/5185, https://github.com/python/cpython/issues/94414 + 
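+ # (A naive datetime from utcfromtimestamp() is reinterpreted as *local*
+ # time by datetime.timestamp(), and the conversion can raise OSError on
+ # Windows for out-of-range values; an aware UTC datetime avoids both.)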
datetime_object = datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc) + elif isinstance(timestamp, str): # assume YYYYMMDD datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d') + date_format = re.sub( # Support %s on windows + r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format) return datetime_object.strftime(date_format) except (ValueError, TypeError, AttributeError): return default def parse_duration(s): - if not isinstance(s, compat_basestring): + if not isinstance(s, str): return None s = s.strip() if not s: @@ -2677,31 +2684,23 @@ def parse_duration(s): else: return None - duration = 0 - if secs: - duration += float(secs) - if mins: - duration += float(mins) * 60 - if hours: - duration += float(hours) * 60 * 60 - if days: - duration += float(days) * 24 * 60 * 60 if ms: - duration += float(ms.replace(':', '.')) - return duration + ms = ms.replace(':', '.') + return sum(float(part or 0) * mult for part, mult in ( + (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1))) def prepend_extension(filename, ext, expected_real_ext=None): name, real_ext = os.path.splitext(filename) return ( - '{0}.{1}{2}'.format(name, ext, real_ext) + f'{name}.{ext}{real_ext}' if not expected_real_ext or real_ext[1:] == expected_real_ext - else '{0}.{1}'.format(filename, ext)) + else f'{filename}.{ext}') def replace_extension(filename, ext, expected_real_ext=None): name, real_ext = os.path.splitext(filename) - return '{0}.{1}'.format( + return '{}.{}'.format( name if not expected_real_ext or real_ext[1:] == expected_real_ext else filename, ext) @@ -2710,31 +2709,26 @@ def check_executable(exe, args=[]): """ Checks if the given binary is installed somewhere in PATH, and returns its name. args can be a list of arguments for a short output (like -version) """ try: - Popen([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate_or_kill() + Popen.run([exe] + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) except OSError: return False return exe -def _get_exe_version_output(exe, args, *, to_screen=None): - if to_screen: - to_screen(f'Checking exe version: {shell_quote([exe] + args)}') +def _get_exe_version_output(exe, args): try: # STDIN should be redirected too. On UNIX-like systems, ffmpeg triggers # SIGTTOU if hypervideo is run in the background. 
# See https://github.com/ytdl-org/youtube-dl/issues/955#issuecomment-209789656 - out, _ = Popen( - [encodeArgument(exe)] + args, stdin=subprocess.PIPE, - stdout=subprocess.PIPE, stderr=subprocess.STDOUT).communicate_or_kill() + stdout, _, _ = Popen.run([encodeArgument(exe)] + args, text=True, + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) except OSError: return False - if isinstance(out, bytes): # Python 2.x - out = out.decode('ascii', 'ignore') - return out + return stdout def detect_exe_version(output, version_re=None, unrecognized='present'): - assert isinstance(output, compat_str) + assert isinstance(output, str) if version_re is None: version_re = r'version\s+([-0-9._a-zA-Z]+)' m = re.search(version_re, output) @@ -2752,50 +2746,59 @@ def get_exe_version(exe, args=['--version'], return detect_exe_version(out, version_re, unrecognized) if out else False +def frange(start=0, stop=None, step=1): + """Float range""" + if stop is None: + start, stop = 0, start + sign = [-1, 1][step > 0] if step else 0 + while sign * start < sign * stop: + yield start + start += step + + class LazyList(collections.abc.Sequence): - ''' Lazy immutable list from an iterable - Note that slices of a LazyList are lists and not LazyList''' + """Lazy immutable list from an iterable + Note that slices of a LazyList are lists and not LazyList""" class IndexError(IndexError): pass def __init__(self, iterable, *, reverse=False, _cache=None): - self.__iterable = iter(iterable) - self.__cache = [] if _cache is None else _cache - self.__reversed = reverse + self._iterable = iter(iterable) + self._cache = [] if _cache is None else _cache + self._reversed = reverse def __iter__(self): - if self.__reversed: + if self._reversed: # We need to consume the entire iterable to iterate in reverse yield from self.exhaust() return - yield from self.__cache - for item in self.__iterable: - self.__cache.append(item) + yield from self._cache + for item in self._iterable: + self._cache.append(item) yield item - def __exhaust(self): - self.__cache.extend(self.__iterable) - # Discard the emptied iterable to make it pickle-able - self.__iterable = [] - return self.__cache + def _exhaust(self): + self._cache.extend(self._iterable) + self._iterable = [] # Discard the emptied iterable to make it pickle-able + return self._cache def exhaust(self): - ''' Evaluate the entire iterable ''' - return self.__exhaust()[::-1 if self.__reversed else 1] + """Evaluate the entire iterable""" + return self._exhaust()[::-1 if self._reversed else 1] @staticmethod - def __reverse_index(x): - return None if x is None else -(x + 1) + def _reverse_index(x): + return None if x is None else ~x def __getitem__(self, idx): if isinstance(idx, slice): - if self.__reversed: - idx = slice(self.__reverse_index(idx.start), self.__reverse_index(idx.stop), -(idx.step or 1)) + if self._reversed: + idx = slice(self._reverse_index(idx.start), self._reverse_index(idx.stop), -(idx.step or 1)) start, stop, step = idx.start, idx.stop, idx.step or 1 elif isinstance(idx, int): - if self.__reversed: - idx = self.__reverse_index(idx) + if self._reversed: + idx = self._reverse_index(idx) start, stop, step = idx, idx, 0 else: raise TypeError('indices must be integers or slices') @@ -2804,35 +2807,35 @@ class LazyList(collections.abc.Sequence): or (stop is None and step > 0)): # We need to consume the entire iterable to be able to slice from the end # Obviously, never use this with infinite iterables - self.__exhaust() + self._exhaust() try: - return 
self.__cache[idx] + return self._cache[idx] except IndexError as e: raise self.IndexError(e) from e - n = max(start or 0, stop or 0) - len(self.__cache) + 1 + n = max(start or 0, stop or 0) - len(self._cache) + 1 if n > 0: - self.__cache.extend(itertools.islice(self.__iterable, n)) + self._cache.extend(itertools.islice(self._iterable, n)) try: - return self.__cache[idx] + return self._cache[idx] except IndexError as e: raise self.IndexError(e) from e def __bool__(self): try: - self[-1] if self.__reversed else self[0] + self[-1] if self._reversed else self[0] except self.IndexError: return False return True def __len__(self): - self.__exhaust() - return len(self.__cache) + self._exhaust() + return len(self._cache) def __reversed__(self): - return type(self)(self.__iterable, reverse=not self.__reversed, _cache=self.__cache) + return type(self)(self._iterable, reverse=not self._reversed, _cache=self._cache) def __copy__(self): - return type(self)(self.__iterable, reverse=self.__reversed, _cache=self.__cache) + return type(self)(self._iterable, reverse=self._reversed, _cache=self._cache) def __repr__(self): # repr and str should mimic a list. So we exhaust the iterable @@ -2884,6 +2887,7 @@ class PagedList: class OnDemandPagedList(PagedList): """Download pages until a page with less than maximum results""" + def _getslice(self, start, end): for pagenum in itertools.count(start // self._pagesize): firstid = pagenum * self._pagesize @@ -2924,6 +2928,7 @@ class OnDemandPagedList(PagedList): class InAdvancePagedList(PagedList): """PagedList with total number of pages known in advance""" + def __init__(self, pagefunc, pagecount, pagesize): PagedList.__init__(self, pagefunc, pagesize, True) self._pagecount = pagecount @@ -2947,6 +2952,140 @@ class InAdvancePagedList(PagedList): yield from page_results +class PlaylistEntries: + MissingEntry = object() + is_exhausted = False + + def __init__(self, ydl, info_dict): + self.ydl = ydl + + # _entries must be assigned now since infodict can change during iteration + entries = info_dict.get('entries') + if entries is None: + raise EntryNotInPlaylist('There are no entries') + elif isinstance(entries, list): + self.is_exhausted = True + + requested_entries = info_dict.get('requested_entries') + self.is_incomplete = requested_entries is not None + if self.is_incomplete: + assert self.is_exhausted + self._entries = [self.MissingEntry] * max(requested_entries or [0]) + for i, entry in zip(requested_entries, entries): + self._entries[i - 1] = entry + elif isinstance(entries, (list, PagedList, LazyList)): + self._entries = entries + else: + self._entries = LazyList(entries) + + PLAYLIST_ITEMS_RE = re.compile(r'''(?x) + (?P<start>[+-]?\d+)? + (?P<range>[:-] + (?P<end>[+-]?\d+|inf(?:inite)?)? + (?::(?P<step>[+-]?\d+))? 
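+ # comment lines like these are ignored in (?x) verbose patterns;
+ # accepted forms include '7', '2:5', '-10:' (from the end),
+ # '1:10:2' (stepped) and '::-1' (the whole playlist, reversed)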
+ )?''')
+
+ @classmethod
+ def parse_playlist_items(cls, string):
+ for segment in string.split(','):
+ if not segment:
+ raise ValueError('There are two or more consecutive commas')
+ mobj = cls.PLAYLIST_ITEMS_RE.fullmatch(segment)
+ if not mobj:
+ raise ValueError(f'{segment!r} is not a valid specification')
+ start, end, step, has_range = mobj.group('start', 'end', 'step', 'range')
+ if int_or_none(step) == 0:
+ raise ValueError(f'Step in {segment!r} cannot be zero')
+ yield slice(int_or_none(start), float_or_none(end), int_or_none(step)) if has_range else int(start)
+
+ def get_requested_items(self):
+ playlist_items = self.ydl.params.get('playlist_items')
+ playlist_start = self.ydl.params.get('playliststart', 1)
+ playlist_end = self.ydl.params.get('playlistend')
+ # For backwards compatibility, interpret -1 as whole list
+ if playlist_end in (-1, None):
+ playlist_end = ''
+ if not playlist_items:
+ playlist_items = f'{playlist_start}:{playlist_end}'
+ elif playlist_start != 1 or playlist_end:
+ self.ydl.report_warning('Ignoring playliststart and playlistend because playlistitems was given', only_once=True)
+
+ for index in self.parse_playlist_items(playlist_items):
+ for i, entry in self[index]:
+ yield i, entry
+ if not entry:
+ continue
+ try:
+ # TODO: Add auto-generated fields
+ self.ydl._match_entry(entry, incomplete=True, silent=True)
+ except (ExistingVideoReached, RejectedVideoReached):
+ return
+
+ def get_full_count(self):
+ if self.is_exhausted and not self.is_incomplete:
+ return len(self)
+ elif isinstance(self._entries, InAdvancePagedList):
+ if self._entries._pagesize == 1:
+ return self._entries._pagecount
+
+ @functools.cached_property
+ def _getter(self):
+ if isinstance(self._entries, list):
+ def get_entry(i):
+ try:
+ entry = self._entries[i]
+ except IndexError:
+ entry = self.MissingEntry
+ if not self.is_incomplete:
+ raise self.IndexError()
+ if entry is self.MissingEntry:
+ raise EntryNotInPlaylist(f'Entry {i + 1} cannot be found')
+ return entry
+ else:
+ def get_entry(i):
+ try:
+ return type(self.ydl)._handle_extraction_exceptions(lambda _, i: self._entries[i])(self.ydl, i)
+ except (LazyList.IndexError, PagedList.IndexError):
+ raise self.IndexError()
+ return get_entry
+
+ def __getitem__(self, idx):
+ if isinstance(idx, int):
+ idx = slice(idx, idx)
+
+ # NB: PlaylistEntries[1:10] => (0, 1, ... 
9) + step = 1 if idx.step is None else idx.step + if idx.start is None: + start = 0 if step > 0 else len(self) - 1 + else: + start = idx.start - 1 if idx.start >= 0 else len(self) + idx.start + + # NB: Do not call len(self) when idx == [:] + if idx.stop is None: + stop = 0 if step < 0 else float('inf') + else: + stop = idx.stop - 1 if idx.stop >= 0 else len(self) + idx.stop + stop += [-1, 1][step > 0] + + for i in frange(start, stop, step): + if i < 0: + continue + try: + entry = self._getter(i) + except self.IndexError: + self.is_exhausted = True + if step > 0: + break + continue + yield i + 1, entry + + def __len__(self): + return len(tuple(self[:])) + + class IndexError(IndexError): + pass + + def uppercase_escape(s): unicode_escape = codecs.getdecoder('unicode_escape') return re.sub( @@ -2965,14 +3104,12 @@ def lowercase_escape(s): def escape_rfc3986(s): """Escape non-ASCII characters as suggested by RFC 3986""" - if sys.version_info < (3, 0) and isinstance(s, compat_str): - s = s.encode('utf-8') - return compat_urllib_parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") + return urllib.parse.quote(s, b"%/;:@&=+$,!~*'()?#[]") def escape_url(url): """Escape URL as suggested by RFC 3986""" - url_parsed = compat_urllib_parse_urlparse(url) + url_parsed = urllib.parse.urlparse(url) return url_parsed._replace( netloc=url_parsed.netloc.encode('idna').decode('ascii'), path=escape_rfc3986(url_parsed.path), @@ -2982,13 +3119,13 @@ def escape_url(url): ).geturl() -def parse_qs(url): - return compat_parse_qs(compat_urllib_parse_urlparse(url).query) +def parse_qs(url, **kwargs): + return urllib.parse.parse_qs(urllib.parse.urlparse(url).query, **kwargs) def read_batch_urls(batch_fd): def fixup(url): - if not isinstance(url, compat_str): + if not isinstance(url, str): url = url.decode('utf-8', 'replace') BOM_UTF8 = ('\xef\xbb\xbf', '\ufeff') for bom in BOM_UTF8: @@ -2998,7 +3135,7 @@ def read_batch_urls(batch_fd): if not url or url.startswith(('#', ';', ']')): return False # "#" cannot be stripped out since it is part of the URI - # However, it can be safely stipped out if follwing a whitespace + # However, it can be safely stripped out if following a whitespace return re.split(r'\s#', url, 1)[0].rstrip() with contextlib.closing(batch_fd) as fd: @@ -3006,22 +3143,22 @@ def read_batch_urls(batch_fd): def urlencode_postdata(*args, **kargs): - return compat_urllib_parse_urlencode(*args, **kargs).encode('ascii') + return urllib.parse.urlencode(*args, **kargs).encode('ascii') def update_url_query(url, query): if not query: return url - parsed_url = compat_urlparse.urlparse(url) - qs = compat_parse_qs(parsed_url.query) + parsed_url = urllib.parse.urlparse(url) + qs = urllib.parse.parse_qs(parsed_url.query) qs.update(query) - return compat_urlparse.urlunparse(parsed_url._replace( - query=compat_urllib_parse_urlencode(qs, True))) + return urllib.parse.urlunparse(parsed_url._replace( + query=urllib.parse.urlencode(qs, True))) -def update_Request(req, url=None, data=None, headers={}, query={}): +def update_Request(req, url=None, data=None, headers=None, query=None): req_headers = req.headers.copy() - req_headers.update(headers) + req_headers.update(headers or {}) req_data = data or req.data req_url = update_url_query(url or req.get_full_url(), query) req_get_method = req.get_method() @@ -3030,7 +3167,7 @@ def update_Request(req, url=None, data=None, headers={}, query={}): elif req_get_method == 'PUT': req_type = PUTRequest else: - req_type = compat_urllib_request.Request + req_type = urllib.request.Request new_req = 
req_type( req_url, data=req_data, headers=req_headers, origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) @@ -3045,10 +3182,10 @@ def _multipart_encode_impl(data, boundary): out = b'' for k, v in data.items(): out += b'--' + boundary.encode('ascii') + b'\r\n' - if isinstance(k, compat_str): - k = k.encode('utf-8') - if isinstance(v, compat_str): - v = v.encode('utf-8') + if isinstance(k, str): + k = k.encode() + if isinstance(v, str): + v = v.encode() # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578 # suggests sending UTF-8 directly. Firefox sends UTF-8, too content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n' @@ -3091,6 +3228,10 @@ def multipart_encode(data, boundary=None): return out, content_type +def variadic(x, allowed_types=(str, bytes, dict)): + return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,) + + def dict_get(d, key_or_keys, default=None, skip_false_values=True): for val in map(d.get, variadic(key_or_keys)): if val is not None and (val or not skip_false_values): @@ -3102,7 +3243,7 @@ def try_call(*funcs, expected_type=None, args=[], kwargs={}): for f in funcs: try: val = f(*args, **kwargs) - except (AttributeError, KeyError, TypeError, IndexError, ZeroDivisionError): + except (AttributeError, KeyError, TypeError, IndexError, ValueError, ZeroDivisionError): pass else: if expected_type is None or isinstance(val, expected_type): @@ -3128,7 +3269,7 @@ def merge_dicts(*dicts): def encode_compat_str(string, encoding=preferredencoding(), errors='strict'): - return string if isinstance(string, compat_str) else compat_str(string, encoding, errors) + return string if isinstance(string, str) else str(string, encoding, errors) US_RATINGS = { @@ -3151,9 +3292,10 @@ TV_PARENTAL_GUIDELINES = { def parse_age_limit(s): - if type(s) == int: + # isinstance(False, int) is True. 
So type() must be used instead + if type(s) is int: # noqa: E721 return s if 0 <= s <= 21 else None - if not isinstance(s, compat_basestring): + elif not isinstance(s, str): return None m = re.match(r'^(?P<age>\d{1,2})\+?$', s) if m: @@ -3177,15 +3319,26 @@ def strip_jsonp(code): r'\g<callback_data>', code) -def js_to_json(code, vars={}): +def js_to_json(code, vars={}, *, strict=False): # vars is a dict of var, val pairs to substitute + STRING_QUOTES = '\'"' + STRING_RE = '|'.join(rf'{q}(?:\\.|[^\\{q}])*{q}' for q in STRING_QUOTES) COMMENT_RE = r'/\*(?:(?!\*/).)*?\*/|//[^\n]*\n' - SKIP_RE = r'\s*(?:{comment})?\s*'.format(comment=COMMENT_RE) + SKIP_RE = fr'\s*(?:{COMMENT_RE})?\s*' INTEGER_TABLE = ( - (r'(?s)^(0[xX][0-9a-fA-F]+){skip}:?$'.format(skip=SKIP_RE), 16), - (r'(?s)^(0+[0-7]+){skip}:?$'.format(skip=SKIP_RE), 8), + (fr'(?s)^(0[xX][0-9a-fA-F]+){SKIP_RE}:?$', 16), + (fr'(?s)^(0+[0-7]+){SKIP_RE}:?$', 8), ) + def process_escape(match): + JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu' + escape = match.group(1) or match.group(2) + + return (Rf'\{escape}' if escape in JSON_PASSTHROUGH_ESCAPES + else R'\u00' if escape == 'x' + else '' if escape == '\n' + else escape) + def fix_kv(m): v = m.group(0) if v in ('true', 'false', 'null'): @@ -3193,38 +3346,42 @@ def js_to_json(code, vars={}): elif v in ('undefined', 'void 0'): return 'null' elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',': - return "" - - if v[0] in ("'", '"'): - v = re.sub(r'(?s)\\.|"', lambda m: { - '"': '\\"', - "\\'": "'", - '\\\n': '', - '\\x': '\\u00', - }.get(m.group(0), m.group(0)), v[1:-1]) - else: - for regex, base in INTEGER_TABLE: - im = re.match(regex, v) - if im: - i = int(im.group(1), base) - return '"%d":' % i if v.endswith(':') else '%d' % i + return '' + + if v[0] in STRING_QUOTES: + escaped = re.sub(r'(?s)(")|\\(.)', process_escape, v[1:-1]) + return f'"{escaped}"' + + for regex, base in INTEGER_TABLE: + im = re.match(regex, v) + if im: + i = int(im.group(1), base) + return f'"{i}":' if v.endswith(':') else str(i) - if v in vars: - return vars[v] + if v in vars: + return json.dumps(vars[v]) - return '"%s"' % v + if not strict: + return f'"{v}"' - code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) + raise ValueError(f'Unknown value: {v}') - return re.sub(r'''(?sx) - "(?:[^"\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^"\\]*"| - '(?:[^'\\]*(?:\\\\|\\['"nurtbfx/\n]))*[^'\\]*'| - {comment}|,(?={skip}[\]}}])| + def create_map(mobj): + return json.dumps(dict(json.loads(js_to_json(mobj.group(1) or '[]', vars=vars)))) + + code = re.sub(r'new Map\((\[.*?\])?\)', create_map, code) + if not strict: + code = re.sub(r'new Date\((".+")\)', r'\g<1>', code) + code = re.sub(r'new \w+\((.*?)\)', lambda m: json.dumps(m.group(0)), code) + + return re.sub(rf'''(?sx) + {STRING_RE}| + {COMMENT_RE}|,(?={SKIP_RE}[\]}}])| void\s0|(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_$])[.a-zA-Z_$0-9]*| - \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?| - [0-9]+(?={skip}:)| + \b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{SKIP_RE}:)?| + [0-9]+(?={SKIP_RE}:)| !+ - '''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code) + ''', fix_kv, code) def qualities(quality_ids): @@ -3237,7 +3394,7 @@ def qualities(quality_ids): return q -POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'} +POSTPROCESS_WHEN = ('pre_process', 'after_filter', 'before_dl', 'post_process', 'after_move', 'after_video', 'playlist') DEFAULT_OUTTMPL = { @@ -3315,12 +3472,7 @@ def args_to_str(args): def error_to_compat_str(err): - 
err_str = str(err) - # On python 2 error byte string must be decoded with proper - # encoding rather than ascii - if sys.version_info[0] < 3: - err_str = err_str.decode(preferredencoding()) - return err_str + return str(err) def error_to_str(err): @@ -3405,34 +3557,33 @@ def parse_codecs(codecs_str): return {} split_codecs = list(filter(None, map( str.strip, codecs_str.strip().strip(',').split(',')))) - vcodec, acodec, tcodec, hdr = None, None, None, None + vcodec, acodec, scodec, hdr = None, None, None, None for full_codec in split_codecs: - parts = full_codec.split('.') - codec = parts[0].replace('0', '') - if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', - 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'): - if not vcodec: - vcodec = '.'.join(parts[:4]) if codec in ('vp9', 'av1', 'hvc1') else full_codec - if codec in ('dvh1', 'dvhe'): - hdr = 'DV' - elif codec == 'av1' and len(parts) > 3 and parts[3] == '10': - hdr = 'HDR10' - elif full_codec.replace('0', '').startswith('vp9.2'): - hdr = 'HDR10' - elif codec in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): - if not acodec: - acodec = full_codec - elif codec in ('stpp', 'wvtt',): - if not tcodec: - tcodec = full_codec + parts = re.sub(r'0+(?=\d)', '', full_codec).split('.') + if parts[0] in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', + 'h263', 'h264', 'mp4v', 'hvc1', 'av1', 'theora', 'dvh1', 'dvhe'): + if vcodec: + continue + vcodec = full_codec + if parts[0] in ('dvh1', 'dvhe'): + hdr = 'DV' + elif parts[0] == 'av1' and traverse_obj(parts, 3) == '10': + hdr = 'HDR10' + elif parts[:2] == ['vp9', '2']: + hdr = 'HDR10' + elif parts[0] in ('flac', 'mp4a', 'opus', 'vorbis', 'mp3', 'aac', + 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'): + acodec = acodec or full_codec + elif parts[0] in ('stpp', 'wvtt'): + scodec = scodec or full_codec else: - write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr) - if vcodec or acodec or tcodec: + write_string(f'WARNING: Unknown codec {full_codec}\n') + if vcodec or acodec or scodec: return { 'vcodec': vcodec or 'none', 'acodec': acodec or 'none', 'dynamic_range': hdr, - **({'tcodec': tcodec} if tcodec is not None else {}), + **({'scodec': scodec} if scodec is not None else {}), } elif len(split_codecs) == 2: return { @@ -3442,6 +3593,46 @@ def parse_codecs(codecs_str): return {} +def get_compatible_ext(*, vcodecs, acodecs, vexts, aexts, preferences=None): + assert len(vcodecs) == len(vexts) and len(acodecs) == len(aexts) + + allow_mkv = not preferences or 'mkv' in preferences + + if allow_mkv and max(len(acodecs), len(vcodecs)) > 1: + return 'mkv' # TODO: any other format allows this? 
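+ # (Matroska can mux nearly any codec combination this function sees,
+ # which is why it is the fallback whenever several audio or video
+ # streams must share one container; mp4/webm below are far stricter.)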
+ + # TODO: All codecs supported by parse_codecs isn't handled here + COMPATIBLE_CODECS = { + 'mp4': { + 'av1', 'hevc', 'avc1', 'mp4a', # fourcc (m3u8, mpd) + 'h264', 'aacl', 'ec-3', # Set in ISM + }, + 'webm': { + 'av1', 'vp9', 'vp8', 'opus', 'vrbs', + 'vp9x', 'vp8x', # in the webm spec + }, + } + + sanitize_codec = functools.partial(try_get, getter=lambda x: x[0].split('.')[0].replace('0', '')) + vcodec, acodec = sanitize_codec(vcodecs), sanitize_codec(acodecs) + + for ext in preferences or COMPATIBLE_CODECS.keys(): + codec_set = COMPATIBLE_CODECS.get(ext, set()) + if ext == 'mkv' or codec_set.issuperset((vcodec, acodec)): + return ext + + COMPATIBLE_EXTS = ( + {'mp3', 'mp4', 'm4a', 'm4p', 'm4b', 'm4r', 'm4v', 'ismv', 'isma', 'mov'}, + {'webm'}, + ) + for ext in preferences or vexts: + current_exts = {ext, *vexts, *aexts} + if ext == 'mkv' or current_exts == {ext} or any( + ext_sets.issuperset(current_exts) for ext_sets in COMPATIBLE_EXTS): + return ext + return 'mkv' if allow_mkv else preferences[-1] + + def urlhandle_detect_ext(url_handle): getheader = url_handle.headers.get @@ -3470,24 +3661,25 @@ def age_restricted(content_limit, age_limit): return age_limit < content_limit +# List of known byte-order-marks (BOM) +BOMS = [ + (b'\xef\xbb\xbf', 'utf-8'), + (b'\x00\x00\xfe\xff', 'utf-32-be'), + (b'\xff\xfe\x00\x00', 'utf-32-le'), + (b'\xff\xfe', 'utf-16-le'), + (b'\xfe\xff', 'utf-16-be'), +] + + def is_html(first_bytes): """ Detect whether a file contains HTML by examining its first bytes. """ - BOMS = [ - (b'\xef\xbb\xbf', 'utf-8'), - (b'\x00\x00\xfe\xff', 'utf-32-be'), - (b'\xff\xfe\x00\x00', 'utf-32-le'), - (b'\xff\xfe', 'utf-16-le'), - (b'\xfe\xff', 'utf-16-be'), - ] + encoding = 'utf-8' for bom, enc in BOMS: - if first_bytes.startswith(bom): - s = first_bytes[len(bom):].decode(enc, 'replace') - break - else: - s = first_bytes.decode('utf-8', 'replace') + while first_bytes.startswith(bom): + encoding, first_bytes = enc, first_bytes[len(bom):] - return re.match(r'^\s*<', s) + return re.match(r'^\s*<', first_bytes.decode(encoding, 'replace')) def determine_protocol(info_dict): @@ -3505,11 +3697,11 @@ def determine_protocol(info_dict): ext = determine_ext(url) if ext == 'm3u8': - return 'm3u8' + return 'm3u8' if info_dict.get('is_live') else 'm3u8_native' elif ext == 'f4m': return 'f4m' - return compat_urllib_parse_urlparse(url).scheme + return urllib.parse.urlparse(url).scheme def render_table(header_row, data, delim=False, extra_gap=0, hide_empty=False): @@ -3566,16 +3758,15 @@ def _match_one(filter_part, dct, incomplete): else: is_incomplete = lambda k: k in incomplete - operator_rex = re.compile(r'''(?x)\s* + operator_rex = re.compile(r'''(?x) (?P<key>[a-z_]+) \s*(?P<negation>!\s*)?(?P<op>%s)(?P<none_inclusive>\s*\?)?\s* (?: (?P<quote>["\'])(?P<quotedstrval>.+?)(?P=quote)| (?P<strval>.+?) 
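# quoted values may contain spaces and escaped quotes (unescaped later);
# bare values simply run to the end of the filter expression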
) - \s*$ ''' % '|'.join(map(re.escape, COMPARISON_OPERATORS.keys()))) - m = operator_rex.search(filter_part) + m = operator_rex.fullmatch(filter_part.strip()) if m: m = m.groupdict() unnegated_op = COMPARISON_OPERATORS[m['op']] @@ -3588,7 +3779,7 @@ def _match_one(filter_part, dct, incomplete): comparison_value = comparison_value.replace(r'\%s' % m['quote'], m['quote']) actual_value = dct.get(m['key']) numeric_comparison = None - if isinstance(actual_value, compat_numeric_types): + if isinstance(actual_value, (int, float)): # If the original field is a string and matching comparisonvalue is # a number we should respect the origin of the original field # and process comparison value as a string (see @@ -3611,11 +3802,10 @@ def _match_one(filter_part, dct, incomplete): '': lambda v: (v is True) if isinstance(v, bool) else (v is not None), '!': lambda v: (v is False) if isinstance(v, bool) else (v is None), } - operator_rex = re.compile(r'''(?x)\s* + operator_rex = re.compile(r'''(?x) (?P<op>%s)\s*(?P<key>[a-z_]+) - \s*$ ''' % '|'.join(map(re.escape, UNARY_OPERATORS.keys()))) - m = operator_rex.search(filter_part) + m = operator_rex.fullmatch(filter_part.strip()) if m: op = UNARY_OPERATORS[m.group('op')] actual_value = dct.get(m.group('key')) @@ -3641,23 +3831,52 @@ def match_str(filter_str, dct, incomplete=False): def match_filter_func(filters): if not filters: return None - filters = variadic(filters) + filters = set(variadic(filters)) - def _match_func(info_dict, *args, **kwargs): - if any(match_str(f, info_dict, *args, **kwargs) for f in filters): - return None + interactive = '-' in filters + if interactive: + filters.remove('-') + + def _match_func(info_dict, incomplete=False): + if not filters or any(match_str(f, info_dict, incomplete) for f in filters): + return NO_DEFAULT if interactive and not incomplete else None else: - video_title = info_dict.get('title') or info_dict.get('id') or 'video' + video_title = info_dict.get('title') or info_dict.get('id') or 'entry' filter_str = ') | ('.join(map(str.strip, filters)) return f'{video_title} does not pass filter ({filter_str}), skipping ..' 
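# A usage sketch (the filter string is a hypothetical example):
#   f = match_filter_func('duration > 60 & !is_live')
#   f(info_dict)  ->  None to accept, or a '... skipping ..' message;
#   an interactive '-' entry makes matches return NO_DEFAULT instead,
#   which the caller can treat as "ask the user".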
return _match_func +class download_range_func: + def __init__(self, chapters, ranges): + self.chapters, self.ranges = chapters, ranges + + def __call__(self, info_dict, ydl): + if not self.ranges and not self.chapters: + yield {} + + warning = ('There are no chapters matching the regex' if info_dict.get('chapters') + else 'Cannot match chapters since chapter information is unavailable') + for regex in self.chapters or []: + for i, chapter in enumerate(info_dict.get('chapters') or []): + if re.search(regex, chapter['title']): + warning = None + yield {**chapter, 'index': i} + if self.chapters and warning: + ydl.to_screen(f'[info] {info_dict["id"]}: {warning}') + + yield from ({'start_time': start, 'end_time': end} for start, end in self.ranges or []) + + def __eq__(self, other): + return (isinstance(other, download_range_func) + and self.chapters == other.chapters and self.ranges == other.ranges) + + def parse_dfxp_time_expr(time_expr): if not time_expr: return - mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr) + mobj = re.match(rf'^(?P<time_offset>{NUMBER_RE})s?$', time_expr) if mobj: return float(mobj.group('time_offset')) @@ -3709,7 +3928,7 @@ def dfxp2srt(dfxp_data): styles = {} default_style = {} - class TTMLPElementParser(object): + class TTMLPElementParser: _out = '' _unclosed_elements = [] _applied_styles = [] @@ -3839,26 +4058,21 @@ def dfxp2srt(dfxp_data): return ''.join(out) -def cli_option(params, command_option, param): +def cli_option(params, command_option, param, separator=None): param = params.get(param) - if param: - param = compat_str(param) - return [command_option, param] if param is not None else [] + return ([] if param is None + else [command_option, str(param)] if separator is None + else [f'{command_option}{separator}{param}']) def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None): param = params.get(param) - if param is None: - return [] - assert isinstance(param, bool) - if separator: - return [command_option + separator + (true_value if param else false_value)] - return [command_option, true_value if param else false_value] + assert param in (True, False, None) + return cli_option({True: true_value, False: false_value}, command_option, param, separator) def cli_valueless_option(params, command_option, param, expected_value=True): - param = params.get(param) - return [command_option] if param == expected_value else [] + return [command_option] if params.get(param) == expected_value else [] def cli_configuration_args(argdict, keys, default=[], use_compat=True): @@ -3894,7 +4108,7 @@ def _configuration_args(main_key, argdict, exe, keys=None, default=[], use_compa return cli_configuration_args(argdict, keys, default, use_compat) -class ISO639Utils(object): +class ISO639Utils: # See http://www.loc.gov/standards/iso639-2/ISO-639-2_utf-8.txt _lang_map = { 'aa': 'aar', @@ -4099,7 +4313,7 @@ class ISO639Utils(object): return short_name -class ISO3166Utils(object): +class ISO3166Utils: # From http://data.okfn.org/data/core/country-list _country_map = { 'AF': 'Afghanistan', @@ -4351,6 +4565,9 @@ class ISO3166Utils(object): 'YE': 'Yemen', 'ZM': 'Zambia', 'ZW': 'Zimbabwe', + # Not ISO 3166 codes, but used for IP blocks + 'AP': 'Asia/Pacific Region', + 'EU': 'Europe', } @classmethod @@ -4359,7 +4576,7 @@ class ISO3166Utils(object): return cls._country_map.get(code.upper()) -class GeoUtils(object): +class GeoUtils: # Major IPv4 address blocks per country _country_ip_map = { 'AD': '46.172.224.0/19', @@ 
-4613,20 +4830,20 @@ class GeoUtils(object): else: block = code_or_block addr, preflen = block.split('/') - addr_min = compat_struct_unpack('!L', socket.inet_aton(addr))[0] + addr_min = struct.unpack('!L', socket.inet_aton(addr))[0] addr_max = addr_min | (0xffffffff >> int(preflen)) - return compat_str(socket.inet_ntoa( - compat_struct_pack('!L', random.randint(addr_min, addr_max)))) + return str(socket.inet_ntoa( + struct.pack('!L', random.randint(addr_min, addr_max)))) -class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): +class PerRequestProxyHandler(urllib.request.ProxyHandler): def __init__(self, proxies=None): # Set default handlers for type in ('http', 'https'): setattr(self, '%s_open' % type, lambda r, proxy='__noproxy__', type=type, meth=self.proxy_open: meth(r, proxy, type)) - compat_urllib_request.ProxyHandler.__init__(self, proxies) + urllib.request.ProxyHandler.__init__(self, proxies) def proxy_open(self, req, proxy, type): req_proxy = req.headers.get('Ytdl-request-proxy') @@ -4636,11 +4853,11 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): if proxy == '__noproxy__': return None # No Proxy - if compat_urlparse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): + if urllib.parse.urlparse(proxy).scheme.lower() in ('socks', 'socks4', 'socks4a', 'socks5'): req.add_header('Ytdl-socks-proxy', proxy) # hypervideo's http/https handlers do wrapping the socket with socks return None - return compat_urllib_request.ProxyHandler.proxy_open( + return urllib.request.ProxyHandler.proxy_open( self, req, proxy, type) @@ -4660,7 +4877,7 @@ def long_to_bytes(n, blocksize=0): s = b'' n = int(n) while n > 0: - s = compat_struct_pack('>I', n & 0xffffffff) + s + s = struct.pack('>I', n & 0xffffffff) + s n = n >> 32 # strip off leading zeros for i in range(len(s)): @@ -4691,7 +4908,7 @@ def bytes_to_long(s): s = b'\000' * extra + s length = length + extra for i in range(0, length, 4): - acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0] + acc = (acc << 32) + struct.unpack('>I', s[i:i + 4])[0] return acc @@ -4727,22 +4944,42 @@ def pkcs1pad(data, length): return [0, 2] + pseudo_random + [0] + data -def encode_base_n(num, n, table=None): - FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' - if not table: - table = FULL_TABLE[:n] +def _base_n_table(n, table): + if not table and not n: + raise ValueError('Either table or n must be specified') + table = (table or '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ')[:n] + + if n and n != len(table): + raise ValueError(f'base {n} exceeds table length {len(table)}') + return table - if n > len(table): - raise ValueError('base %d exceeds table length %d' % (n, len(table))) - if num == 0: +def encode_base_n(num, n=None, table=None): + """Convert given int to a base-n string""" + table = _base_n_table(n, table) + if not num: return table[0] - ret = '' + result, base = '', len(table) while num: - ret = table[num % n] + ret - num = num // n - return ret + result = table[num % base] + result + num = num // base + return result + + +def decode_base_n(string, n=None, table=None): + """Convert given base-n string to int""" + table = {char: index for index, char in enumerate(_base_n_table(n, table))} + result, base = 0, len(table) + for char in string: + result = result * base + table[char] + return result + + +def decode_base(value, digits): + deprecation_warning(f'{__name__}.decode_base is deprecated and may be removed ' + f'in a future version. 
Use {__name__}.decode_base_n instead') + return decode_base_n(value, table=digits) def decode_packed_codes(code): @@ -4796,10 +5033,10 @@ def decode_png(png_data): header = png_data[8:] if png_data[:8] != b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': - raise IOError('Not a valid PNG file.') + raise OSError('Not a valid PNG file.') int_map = {1: '>B', 2: '>H', 4: '>I'} - unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0] + unpack_integer = lambda x: struct.unpack(int_map[len(x)], x)[0] chunks = [] @@ -4833,7 +5070,7 @@ def decode_png(png_data): idat += chunk['data'] if not idat: - raise IOError('Unable to read PNG data.') + raise OSError('Unable to read PNG data.') decompressed_data = bytearray(zlib.decompress(idat)) @@ -4897,87 +5134,54 @@ def decode_png(png_data): def write_xattr(path, key, value): - # This mess below finds the best xattr tool for the job - try: - # try the pyxattr module... - import xattr - - if hasattr(xattr, 'set'): # pyxattr - # Unicode arguments are not supported in python-pyxattr until - # version 0.5.0 - # See https://github.com/ytdl-org/youtube-dl/issues/5498 - pyxattr_required_version = '0.5.0' - if version_tuple(xattr.__version__) < version_tuple(pyxattr_required_version): - # TODO: fallback to CLI tools - raise XAttrUnavailableError( - 'python-pyxattr is detected but is too old. ' - 'hypervideo requires %s or above while your version is %s. ' - 'Falling back to other xattr implementations' % ( - pyxattr_required_version, xattr.__version__)) - - setxattr = xattr.set - else: # xattr - setxattr = xattr.setxattr + # Windows: Write xattrs to NTFS Alternate Data Streams: + # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 + if compat_os_name == 'nt': + assert ':' not in key + assert os.path.exists(path) try: - setxattr(path, key, value) - except EnvironmentError as e: + with open(f'{path}:{key}', 'wb') as f: + f.write(value) + except OSError as e: raise XAttrMetadataError(e.errno, e.strerror) + return - except ImportError: - if compat_os_name == 'nt': - # Write xattrs to NTFS Alternate Data Streams: - # http://en.wikipedia.org/wiki/NTFS#Alternate_data_streams_.28ADS.29 - assert ':' not in key - assert os.path.exists(path) - - ads_fn = path + ':' + key - try: - with open(ads_fn, 'wb') as f: - f.write(value) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - else: - user_has_setfattr = check_executable('setfattr', ['--version']) - user_has_xattr = check_executable('xattr', ['-h']) - - if user_has_setfattr or user_has_xattr: + # UNIX Method 1. 
Use xattrs/pyxattrs modules - value = value.decode('utf-8') - if user_has_setfattr: - executable = 'setfattr' - opts = ['-n', key, '-v', value] - elif user_has_xattr: - executable = 'xattr' - opts = ['-w', key, value] + setxattr = None + if getattr(xattr, '_hypervideo_dl__identifier', None) == 'pyxattr': + # Unicode arguments are not supported in pyxattr until version 0.5.0 + # See https://github.com/ytdl-org/youtube-dl/issues/5498 + if version_tuple(xattr.__version__) >= (0, 5, 0): + setxattr = xattr.set + elif xattr: + setxattr = xattr.setxattr - cmd = ([encodeFilename(executable, True)] - + [encodeArgument(o) for o in opts] - + [encodeFilename(path, True)]) + if setxattr: + try: + setxattr(path, key, value) + except OSError as e: + raise XAttrMetadataError(e.errno, e.strerror) + return - try: - p = Popen( - cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) - except EnvironmentError as e: - raise XAttrMetadataError(e.errno, e.strerror) - stdout, stderr = p.communicate_or_kill() - stderr = stderr.decode('utf-8', 'replace') - if p.returncode != 0: - raise XAttrMetadataError(p.returncode, stderr) + # UNIX Method 2. Use setfattr/xattr executables + exe = ('setfattr' if check_executable('setfattr', ['--version']) + else 'xattr' if check_executable('xattr', ['-h']) else None) + if not exe: + raise XAttrUnavailableError( + 'Couldn\'t find a tool to set the xattrs. Install either the python "xattr" or "pyxattr" modules or the ' + + ('"xattr" binary' if sys.platform != 'linux' else 'GNU "attr" package (which contains the "setfattr" tool)')) - else: - # On Unix, and can't find pyxattr, setfattr, or xattr. - if sys.platform.startswith('linux'): - raise XAttrUnavailableError( - "Couldn't find a tool to set the xattrs. " - "Install either the python 'pyxattr' or 'xattr' " - "modules, or the GNU 'attr' package " - "(which contains the 'setfattr' tool).") - else: - raise XAttrUnavailableError( - "Couldn't find a tool to set the xattrs. " - "Install either the python 'xattr' module, " - "or the 'xattr' binary.") + value = value.decode() + try: + _, stderr, returncode = Popen.run( + [exe, '-w', key, value, path] if exe == 'xattr' else [exe, '-n', key, '-v', value, path], + text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + except OSError as e: + raise XAttrMetadataError(e.errno, e.strerror) + if returncode: + raise XAttrMetadataError(returncode, stderr) def random_birthday(year_field, month_field, day_field): @@ -4993,12 +5197,12 @@ def random_birthday(year_field, month_field, day_field): # Templates for internet shortcut files, which are plain text files. -DOT_URL_LINK_TEMPLATE = ''' +DOT_URL_LINK_TEMPLATE = '''\ [InternetShortcut] URL=%(url)s -'''.lstrip() +''' -DOT_WEBLOC_LINK_TEMPLATE = ''' +DOT_WEBLOC_LINK_TEMPLATE = '''\ <?xml version="1.0" encoding="UTF-8"?> <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd"> <plist version="1.0"> @@ -5007,16 +5211,16 @@ DOT_WEBLOC_LINK_TEMPLATE = ''' \t<string>%(url)s</string> </dict> </plist> -'''.lstrip() +''' -DOT_DESKTOP_LINK_TEMPLATE = ''' +DOT_DESKTOP_LINK_TEMPLATE = '''\ [Desktop Entry] Encoding=UTF-8 Name=%(filename)s Type=Link URL=%(url)s Icon=text-html -'''.lstrip() +''' LINK_TEMPLATES = { 'url': DOT_URL_LINK_TEMPLATE, @@ -5032,7 +5236,7 @@ def iri_to_uri(iri): The function doesn't add an additional layer of escaping; e.g., it doesn't escape `%3C` as `%253C`. 
Instead, it percent-escapes characters with an underlying UTF-8 encoding *besides* those already escaped, leaving the URI intact. """ - iri_parts = compat_urllib_parse_urlparse(iri) + iri_parts = urllib.parse.urlparse(iri) if '[' in iri_parts.netloc: raise ValueError('IPv6 URIs are not, yet, supported.') @@ -5042,29 +5246,29 @@ def iri_to_uri(iri): net_location = '' if iri_parts.username: - net_location += compat_urllib_parse_quote(iri_parts.username, safe=r"!$%&'()*+,~") + net_location += urllib.parse.quote(iri_parts.username, safe=r"!$%&'()*+,~") if iri_parts.password is not None: - net_location += ':' + compat_urllib_parse_quote(iri_parts.password, safe=r"!$%&'()*+,~") + net_location += ':' + urllib.parse.quote(iri_parts.password, safe=r"!$%&'()*+,~") net_location += '@' - net_location += iri_parts.hostname.encode('idna').decode('utf-8') # Punycode for Unicode hostnames. + net_location += iri_parts.hostname.encode('idna').decode() # Punycode for Unicode hostnames. # The 'idna' encoding produces ASCII text. if iri_parts.port is not None and iri_parts.port != 80: net_location += ':' + str(iri_parts.port) - return compat_urllib_parse_urlunparse( + return urllib.parse.urlunparse( (iri_parts.scheme, net_location, - compat_urllib_parse_quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"), + urllib.parse.quote_plus(iri_parts.path, safe=r"!$%&'()*+,/:;=@|~"), # Unsure about the `safe` argument, since this is a legacy way of handling parameters. - compat_urllib_parse_quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"), + urllib.parse.quote_plus(iri_parts.params, safe=r"!$%&'()*+,/:;=@|~"), # Not totally sure about the `safe` argument, since the source does not explicitly mention the query URI component. - compat_urllib_parse_quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"), + urllib.parse.quote_plus(iri_parts.query, safe=r"!$%&'()*+,/:;=?@{|}~"), - compat_urllib_parse_quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~"))) + urllib.parse.quote_plus(iri_parts.fragment, safe=r"!#$%&'()*+,/:;=?@{|}~"))) # Source for `safe` arguments: https://url.spec.whatwg.org/#percent-encoded-bytes. @@ -5072,16 +5276,16 @@ def iri_to_uri(iri): def to_high_limit_path(path): if sys.platform in ['win32', 'cygwin']: # Work around MAX_PATH limitation on Windows. The maximum allowed length for the individual path segments may still be quite limited. 
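# e.g. C:\very\long\path  ->  \\?\C:\very\long\path (the extended-length
# form, which lifts the ~260-character MAX_PATH limit on Windows)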
- return r'\\?\ '.rstrip() + os.path.abspath(path) + return '\\\\?\\' + os.path.abspath(path) return path -def format_field(obj, field=None, template='%s', ignore=(None, ''), default='', func=None): +def format_field(obj, field=None, template='%s', ignore=NO_DEFAULT, default='', func=IDENTITY): val = traverse_obj(obj, *variadic(field)) - if val in ignore: + if (not val and val != 0) if ignore is NO_DEFAULT else val in variadic(ignore): return default - return template % (func(val) if func else val) + return template % func(val) def clean_podcast_url(url): @@ -5114,7 +5318,7 @@ def make_dir(path, to_screen=None): if dn and not os.path.exists(dn): os.makedirs(dn) return True - except (OSError, IOError) as err: + except OSError as err: if callable(to_screen) is not None: to_screen('unable to create directory ' + error_to_compat_str(err)) return False @@ -5133,7 +5337,7 @@ def get_executable_path(): def load_plugins(name, suffix, namespace): classes = {} - try: + with contextlib.suppress(FileNotFoundError): plugins_spec = importlib.util.spec_from_file_location( name, os.path.join(get_executable_path(), 'ytdlp_plugins', name, '__init__.py')) plugins = importlib.util.module_from_spec(plugins_spec) @@ -5146,133 +5350,186 @@ def load_plugins(name, suffix, namespace): continue klass = getattr(plugins, name) classes[name] = namespace[name] = klass - except FileNotFoundError: - pass return classes def traverse_obj( - obj, *path_list, default=None, expected_type=None, get_all=True, + obj, *paths, default=NO_DEFAULT, expected_type=None, get_all=True, casesense=True, is_user_input=False, traverse_string=False): - ''' Traverse nested list/dict/tuple - @param path_list A list of paths which are checked one by one. - Each path is a list of keys where each key is a string, - a function, a tuple of strings/None or "...". - When a fuction is given, it takes the key and value as arguments - and returns whether the key matches or not. When a tuple is given, - all the keys given in the tuple are traversed, and - "..." traverses all the keys in the object - "None" returns the object without traversal - @param default Default value to return - @param expected_type Only accept final value of this type (Can also be any callable) - @param get_all Return all the values obtained from a path or only the first one - @param casesense Whether to consider dictionary keys as case sensitive - @param is_user_input Whether the keys are generated from user input. If True, - strings are converted to int/slice if necessary - @param traverse_string Whether to traverse inside strings. If True, any - non-compatible object will also be converted into a string - # TODO: Write tests - ''' - if not casesense: - _lower = lambda k: (k.lower() if isinstance(k, str) else k) - path_list = (map(_lower, variadic(path)) for path in path_list) - - def _traverse_obj(obj, path, _current_depth=0): - nonlocal depth - path = tuple(variadic(path)) - for i, key in enumerate(path): - if None in (key, obj): - return obj - if isinstance(key, (list, tuple)): - obj = [_traverse_obj(obj, sub_key, _current_depth) for sub_key in key] - key = ... 
- if key is ...: - obj = (obj.values() if isinstance(obj, dict) - else obj if isinstance(obj, (list, tuple, LazyList)) - else str(obj) if traverse_string else []) - _current_depth += 1 - depth = max(depth, _current_depth) - return [_traverse_obj(inner_obj, path[i + 1:], _current_depth) for inner_obj in obj] - elif callable(key): - if isinstance(obj, (list, tuple, LazyList)): - obj = enumerate(obj) - elif isinstance(obj, dict): - obj = obj.items() - else: - if not traverse_string: - return None - obj = str(obj) - _current_depth += 1 - depth = max(depth, _current_depth) - return [_traverse_obj(v, path[i + 1:], _current_depth) for k, v in obj if try_call(key, args=(k, v))] - elif isinstance(obj, dict) and not (is_user_input and key == ':'): - obj = (obj.get(key) if casesense or (key in obj) - else next((v for k, v in obj.items() if _lower(k) == key), None)) - else: - if is_user_input: - key = (int_or_none(key) if ':' not in key - else slice(*map(int_or_none, key.split(':')))) - if key == slice(None): - return _traverse_obj(obj, (..., *path[i + 1:]), _current_depth) - if not isinstance(key, (int, slice)): - return None - if not isinstance(obj, (list, tuple, LazyList)): - if not traverse_string: - return None - obj = str(obj) - try: - obj = obj[key] - except IndexError: - return None - return obj + """ + Safely traverse nested `dict`s and `Sequence`s + + >>> obj = [{}, {"key": "value"}] + >>> traverse_obj(obj, (1, "key")) + "value" + + Each of the provided `paths` is tested and the first producing a valid result will be returned. + The next path will also be tested if the path branched but no results could be found. + Supported values for traversal are `Mapping`, `Sequence` and `re.Match`. + A value of None is treated as the absence of a value. + + The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. + + The keys in the path can be one of: + - `None`: Return the current object. + - `str`/`int`: Return `obj[key]`. For `re.Match, return `obj.group(key)`. + - `slice`: Branch out and return all values in `obj[key]`. + - `Ellipsis`: Branch out and return a list of all values. + - `tuple`/`list`: Branch out and return a list of all matching values. + Read as: `[traverse_obj(obj, branch) for branch in branches]`. + - `function`: Branch out and return values filtered by the function. + Read as: `[value for key, value in obj if function(key, value)]`. + For `Sequence`s, `key` is the index of the value. + - `dict` Transform the current object and return a matching dict. + Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. + + `tuple`, `list`, and `dict` all support nested paths and branches. + + @params paths Paths which to traverse by. + @param default Value to return if the paths do not match. + @param expected_type If a `type`, only accept final values of this type. + If any other callable, try to call the function on each result. + @param get_all If `False`, return the first matching result, otherwise all matching ones. + @param casesense If `False`, consider string dictionary keys as case insensitive. + + The following are only meant to be used by YoutubeDL.prepare_outtmpl and are not part of the API + + @param is_user_input Whether the keys are generated from user input. + If `True` strings get converted to `int`/`slice` if needed. + @param traverse_string Whether to traverse into objects as strings. + If `True`, any non-compatible object will first be + converted into a string and then traversed into. 
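+ For example, branching with `...` collects every matching value, and
+ a later path acts as a fallback when an earlier one resolves to None:
+ >>> traverse_obj({'a': [{'b': 1}, {'b': 2}]}, ('a', ..., 'b'))
+ [1, 2]
+ >>> traverse_obj({'a': {'b': None}}, ('a', 'b'), ('a',))
+ {'b': None}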
+ + + @returns The result of the object traversal. + If successful, `get_all=True`, and the path branches at least once, + then a list of results is returned instead. + A list is always returned if the last path branches and no `default` is given. + """ + is_sequence = lambda x: isinstance(x, collections.abc.Sequence) and not isinstance(x, (str, bytes)) + casefold = lambda k: k.casefold() if isinstance(k, str) else k if isinstance(expected_type, type): type_test = lambda val: val if isinstance(val, expected_type) else None - elif expected_type is not None: - type_test = expected_type else: - type_test = lambda val: val - - for path in path_list: - depth = 0 - val = _traverse_obj(obj, path) - if val is not None: - if depth: - for _ in range(depth - 1): - val = itertools.chain.from_iterable(v for v in val if v is not None) - val = [v for v in map(type_test, val) if v is not None] - if val: - return val if get_all else val[0] + type_test = lambda val: try_call(expected_type or IDENTITY, args=(val,)) + + def apply_key(key, obj): + if obj is None: + return + + elif key is None: + yield obj + + elif isinstance(key, (list, tuple)): + for branch in key: + _, result = apply_path(obj, branch) + yield from result + + elif key is ...: + if isinstance(obj, collections.abc.Mapping): + yield from obj.values() + elif is_sequence(obj): + yield from obj + elif isinstance(obj, re.Match): + yield from obj.groups() + elif traverse_string: + yield from str(obj) + + elif callable(key): + if is_sequence(obj): + iter_obj = enumerate(obj) + elif isinstance(obj, collections.abc.Mapping): + iter_obj = obj.items() + elif isinstance(obj, re.Match): + iter_obj = enumerate((obj.group(), *obj.groups())) + elif traverse_string: + iter_obj = enumerate(str(obj)) else: - val = type_test(val) - if val is not None: - return val - return default + return + yield from (v for k, v in iter_obj if try_call(key, args=(k, v))) + elif isinstance(key, dict): + iter_obj = ((k, _traverse_obj(obj, v)) for k, v in key.items()) + yield {k: v if v is not None else default for k, v in iter_obj + if v is not None or default is not NO_DEFAULT} -def traverse_dict(dictn, keys, casesense=True): - write_string('DeprecationWarning: hypervideo_dl.utils.traverse_dict is deprecated ' - 'and may be removed in a future version. 
Use hypervideo_dl.utils.traverse_obj instead') - return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True) + elif isinstance(obj, collections.abc.Mapping): + yield (obj.get(key) if casesense or (key in obj) + else next((v for k, v in obj.items() if casefold(k) == key), None)) + elif isinstance(obj, re.Match): + if isinstance(key, int) or casesense: + with contextlib.suppress(IndexError): + yield obj.group(key) + return -def get_first(obj, keys, **kwargs): - return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) + if not isinstance(key, str): + return + yield next((v for k, v in obj.groupdict().items() if casefold(k) == key), None) -def variadic(x, allowed_types=(str, bytes, dict)): - return x if isinstance(x, collections.abc.Iterable) and not isinstance(x, allowed_types) else (x,) + else: + if is_user_input: + key = (int_or_none(key) if ':' not in key + else slice(*map(int_or_none, key.split(':')))) + if not isinstance(key, (int, slice)): + return -def decode_base(value, digits): - # This will convert given base-x string to scalar (long or int) - table = {char: index for index, char in enumerate(digits)} - result = 0 - base = len(digits) - for chr in value: - result *= base - result += table[chr] - return result + if not is_sequence(obj): + if not traverse_string: + return + obj = str(obj) + + with contextlib.suppress(IndexError): + yield obj[key] + + def apply_path(start_obj, path): + objs = (start_obj,) + has_branched = False + + for key in variadic(path): + if is_user_input and key == ':': + key = ... + + if not casesense and isinstance(key, str): + key = key.casefold() + + if key is ... or isinstance(key, (list, tuple)) or callable(key): + has_branched = True + + key_func = functools.partial(apply_key, key) + objs = itertools.chain.from_iterable(map(key_func, objs)) + + return has_branched, objs + + def _traverse_obj(obj, path, use_list=True): + has_branched, results = apply_path(obj, path) + results = LazyList(x for x in map(type_test, results) if x is not None) + + if get_all and has_branched: + return results.exhaust() if results or use_list else None + + return results[0] if results else None + + for index, path in enumerate(paths, 1): + use_list = default is NO_DEFAULT and index == len(paths) + result = _traverse_obj(obj, path, use_list) + if result is not None: + return result + + return None if default is NO_DEFAULT else default + + +def traverse_dict(dictn, keys, casesense=True): + deprecation_warning(f'"{__name__}.traverse_dict" is deprecated and may be removed ' + f'in a future version. Use "{__name__}.traverse_obj" instead') + return traverse_obj(dictn, keys, casesense=casesense, is_user_input=True, traverse_string=True) + + +def get_first(obj, keys, **kwargs): + return traverse_obj(obj, (..., *variadic(keys)), **kwargs, get_all=False) def time_seconds(**kwargs): @@ -5291,9 +5548,9 @@ def jwt_encode_hs256(payload_data, key, headers={}): } if headers: header_data.update(headers) - header_b64 = base64.b64encode(json.dumps(header_data).encode('utf-8')) - payload_b64 = base64.b64encode(json.dumps(payload_data).encode('utf-8')) - h = hmac.new(key.encode('utf-8'), header_b64 + b'.' + payload_b64, hashlib.sha256) + header_b64 = base64.b64encode(json.dumps(header_data).encode()) + payload_b64 = base64.b64encode(json.dumps(payload_data).encode()) + h = hmac.new(key.encode(), header_b64 + b'.' + payload_b64, hashlib.sha256) signature_b64 = base64.b64encode(h.digest()) token = header_b64 + b'.' + payload_b64 + b'.' 
+ signature_b64 return token @@ -5302,14 +5559,18 @@ def jwt_encode_hs256(payload_data, key, headers={}): # can be extended in future to verify the signature and parse header and return the algorithm used if it's not HS256 def jwt_decode_hs256(jwt): header_b64, payload_b64, signature_b64 = jwt.split('.') - payload_data = json.loads(base64.urlsafe_b64decode(payload_b64)) + # add trailing ='s that may have been stripped, superfluous ='s are ignored + payload_data = json.loads(base64.urlsafe_b64decode(f'{payload_b64}===')) return payload_data +WINDOWS_VT_MODE = False if compat_os_name == 'nt' else None + + +@functools.cache def supports_terminal_sequences(stream): if compat_os_name == 'nt': - from .compat import WINDOWS_VT_MODE # Must be imported locally - if not WINDOWS_VT_MODE or get_windows_version() < (10, 0, 10586): + if not WINDOWS_VT_MODE: return False elif not os.getenv('TERM'): return False @@ -5319,6 +5580,19 @@ def supports_terminal_sequences(stream): return False +def windows_enable_vt_mode(): # TODO: Do this the proper way https://bugs.python.org/issue30075 + if get_windows_version() < (10, 0, 10586): + return + global WINDOWS_VT_MODE + try: + Popen.run('', shell=True) + except Exception: + return + + WINDOWS_VT_MODE = True + supports_terminal_sequences.cache_clear() + + _terminal_sequences_re = re.compile('\033\\[[^m]+m') @@ -5332,7 +5606,7 @@ def number_of_digits(number): def join_nonempty(*values, delim='-', from_dict=None): if from_dict is not None: - values = map(from_dict.get, values) + values = (traverse_obj(from_dict, variadic(v)) for v in values) return delim.join(map(str, filter(None, values))) @@ -5346,7 +5620,7 @@ def scale_thumbnails_to_max_format_width(formats, thumbnails, url_width_re): """ _keys = ('width', 'height') max_dimensions = max( - [tuple(format.get(k) or 0 for k in _keys) for format in formats], + (tuple(format.get(k) or 0 for k in _keys) for format in formats), default=(0, 0)) if not max_dimensions[0]: return thumbnails @@ -5368,33 +5642,69 @@ def parse_http_range(range): return int(crg.group(1)), int_or_none(crg.group(2)), int_or_none(crg.group(3)) +def read_stdin(what): + eof = 'Ctrl+Z' if compat_os_name == 'nt' else 'Ctrl+D' + write_string(f'Reading {what} from STDIN - EOF ({eof}) to end:\n') + return sys.stdin + + +def determine_file_encoding(data): + """ + Detect the text encoding used + @returns (encoding, bytes to skip) + """ + + # BOM marks are given priority over declarations + for bom, enc in BOMS: + if data.startswith(bom): + return enc, len(bom) + + # Strip off all null bytes to match even when UTF-16 or UTF-32 is used. 
+ # We ignore the endianness to get a good enough match + data = data.replace(b'\0', b'') + mobj = re.match(rb'(?m)^#\s*coding\s*:\s*(\S+)\s*$', data) + return mobj.group(1).decode() if mobj else None, 0 + + class Config: own_args = None + parsed_args = None filename = None __initialized = False def __init__(self, parser, label=None): - self._parser, self.label = parser, label + self.parser, self.label = parser, label self._loaded_paths, self.configs = set(), [] def init(self, args=None, filename=None): assert not self.__initialized + self.own_args, self.filename = args, filename + return self.load_configs() + + def load_configs(self): directory = '' - if filename: - location = os.path.realpath(filename) + if self.filename: + location = os.path.realpath(self.filename) directory = os.path.dirname(location) if location in self._loaded_paths: return False self._loaded_paths.add(location) self.__initialized = True - self.own_args, self.filename = args, filename - for location in self._parser.parse_args(args)[0].config_locations or []: + opts, _ = self.parser.parse_known_args(self.own_args) + self.parsed_args = self.own_args + for location in opts.config_locations or []: + if location == '-': + if location in self._loaded_paths: + continue + self._loaded_paths.add(location) + self.append_config(shlex.split(read_stdin('options'), comments=True), label='stdin') + continue location = os.path.join(directory, expand_path(location)) if os.path.isdir(location): location = os.path.join(location, 'hypervideo.conf') if not os.path.exists(location): - self._parser.error(f'config location {location} does not exist') + self.parser.error(f'config location {location} does not exist') self.append_config(self.read_file(location), location) return True @@ -5410,22 +5720,27 @@ class Config: @staticmethod def read_file(filename, default=[]): try: - optionf = open(filename) - except IOError: + optionf = open(filename, 'rb') + except OSError: return default # silently skip if file is not present try: + enc, skip = determine_file_encoding(optionf.read(512)) + optionf.seek(skip, io.SEEK_SET) + except OSError: + enc = None # silently skip read errors + try: # FIXME: https://github.com/ytdl-org/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 - contents = optionf.read() - if sys.version_info < (3,): - contents = contents.decode(preferredencoding()) - res = compat_shlex_split(contents, comments=True) + contents = optionf.read().decode(enc or preferredencoding()) + res = shlex.split(contents, comments=True) + except Exception as err: + raise ValueError(f'Unable to parse "{filename}": {err}') finally: optionf.close() return res @staticmethod def hide_login_info(opts): - PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username']) + PRIVATE_OPTS = {'-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'} eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') def _scrub_eq(o): @@ -5442,7 +5757,7 @@ class Config: return opts def append_config(self, *args, label=None): - config = type(self)(self._parser, label) + config = type(self)(self.parser, label) config._loaded_paths = self._loaded_paths if config.init(*args): self.configs.append(config) @@ -5451,18 +5766,23 @@ class Config: def all_args(self): for config in reversed(self.configs): yield from config.all_args - yield from self.own_args or [] + yield from self.parsed_args or [] + + def parse_known_args(self, **kwargs): + return 
self.parser.parse_known_args(self.all_args, **kwargs) def parse_args(self): - return self._parser.parse_args(list(self.all_args)) + return self.parser.parse_args(self.all_args) -class WebSocketsWrapper(): +class WebSocketsWrapper: """Wraps websockets module to use in non-async scopes""" + pool = None def __init__(self, url, headers=None, connect=True): - self.loop = asyncio.events.new_event_loop() - self.conn = compat_websockets.connect( + self.loop = asyncio.new_event_loop() + # XXX: "loop" is deprecated + self.conn = websockets.connect( url, extra_headers=headers, ping_interval=None, close_timeout=float('inf'), loop=self.loop, ping_timeout=float('inf')) if connect: @@ -5491,7 +5811,7 @@ class WebSocketsWrapper(): # for contributors: If there's any new library using asyncio needs to be run in non-async, move these function out of this class @staticmethod def run_with_loop(main, loop): - if not asyncio.coroutines.iscoroutine(main): + if not asyncio.iscoroutine(main): raise ValueError(f'a coroutine was expected, got {main!r}') try: @@ -5503,7 +5823,7 @@ class WebSocketsWrapper(): @staticmethod def _cancel_all_tasks(loop): - to_cancel = asyncio.tasks.all_tasks(loop) + to_cancel = asyncio.all_tasks(loop) if not to_cancel: return @@ -5511,8 +5831,9 @@ class WebSocketsWrapper(): for task in to_cancel: task.cancel() + # XXX: "loop" is removed in python 3.10+ loop.run_until_complete( - asyncio.tasks.gather(*to_cancel, loop=loop, return_exceptions=True)) + asyncio.gather(*to_cancel, loop=loop, return_exceptions=True)) for task in to_cancel: if task.cancelled(): @@ -5525,17 +5846,459 @@ class WebSocketsWrapper(): }) -has_websockets = bool(compat_websockets) - - def merge_headers(*dicts): """Merge dicts of http headers case insensitively, prioritizing the latter ones""" return {k.title(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))} +def cached_method(f): + """Cache a method""" + signature = inspect.signature(f) + + @functools.wraps(f) + def wrapper(self, *args, **kwargs): + bound_args = signature.bind(self, *args, **kwargs) + bound_args.apply_defaults() + key = tuple(bound_args.arguments.values())[1:] + + cache = vars(self).setdefault('_cached_method__cache', {}).setdefault(f.__name__, {}) + if key not in cache: + cache[key] = f(self, *args, **kwargs) + return cache[key] + return wrapper + + class classproperty: - def __init__(self, f): - self.f = f + """property access for class methods with optional caching""" + def __new__(cls, func=None, *args, **kwargs): + if not func: + return functools.partial(cls, *args, **kwargs) + return super().__new__(cls) + + def __init__(self, func, *, cache=False): + functools.update_wrapper(self, func) + self.func = func + self._cache = {} if cache else None def __get__(self, _, cls): - return self.f(cls) + if self._cache is None: + return self.func(cls) + elif cls not in self._cache: + self._cache[cls] = self.func(cls) + return self._cache[cls] + + +class Namespace(types.SimpleNamespace): + """Immutable namespace""" + + def __iter__(self): + return iter(self.__dict__.values()) + + @property + def items_(self): + return self.__dict__.items() + + +MEDIA_EXTENSIONS = Namespace( + common_video=('avi', 'flv', 'mkv', 'mov', 'mp4', 'webm'), + video=('3g2', '3gp', 'f4v', 'mk3d', 'divx', 'mpg', 'ogv', 'm4v', 'wmv'), + common_audio=('aiff', 'alac', 'flac', 'm4a', 'mka', 'mp3', 'ogg', 'opus', 'wav'), + audio=('aac', 'ape', 'asf', 'f4a', 'f4b', 'm4b', 'm4p', 'm4r', 'oga', 'ogx', 'spx', 'vorbis', 'wma'), + thumbnails=('jpg', 'png', 'webp'), + 
storyboards=('mhtml', ), + subtitles=('srt', 'vtt', 'ass', 'lrc'), + manifests=('f4f', 'f4m', 'm3u8', 'smil', 'mpd'), +) +MEDIA_EXTENSIONS.video += MEDIA_EXTENSIONS.common_video +MEDIA_EXTENSIONS.audio += MEDIA_EXTENSIONS.common_audio + +KNOWN_EXTENSIONS = (*MEDIA_EXTENSIONS.video, *MEDIA_EXTENSIONS.audio, *MEDIA_EXTENSIONS.manifests) + + +class RetryManager: + """Usage: + for retry in RetryManager(...): + try: + ... + except SomeException as err: + retry.error = err + continue + """ + attempt, _error = 0, None + + def __init__(self, _retries, _error_callback, **kwargs): + self.retries = _retries or 0 + self.error_callback = functools.partial(_error_callback, **kwargs) + + def _should_retry(self): + return self._error is not NO_DEFAULT and self.attempt <= self.retries + + @property + def error(self): + if self._error is NO_DEFAULT: + return None + return self._error + + @error.setter + def error(self, value): + self._error = value + + def __iter__(self): + while self._should_retry(): + self.error = NO_DEFAULT + self.attempt += 1 + yield self + if self.error: + self.error_callback(self.error, self.attempt, self.retries) + + @staticmethod + def report_retry(e, count, retries, *, sleep_func, info, warn, error=None, suffix=None): + """Utility function for reporting retries""" + if count > retries: + if error: + return error(f'{e}. Giving up after {count - 1} retries') if count > 1 else error(str(e)) + raise e + + if not count: + return warn(e) + elif isinstance(e, ExtractorError): + e = remove_end(str_or_none(e.cause) or e.orig_msg, '.') + warn(f'{e}. Retrying{format_field(suffix, None, " %s")} ({count}/{retries})...') + + delay = float_or_none(sleep_func(n=count - 1)) if callable(sleep_func) else sleep_func + if delay: + info(f'Sleeping {delay:.2f} seconds ...') + time.sleep(delay) + + +def make_archive_id(ie, video_id): + ie_key = ie if isinstance(ie, str) else ie.ie_key() + return f'{ie_key.lower()} {video_id}' + + +def truncate_string(s, left, right=0): + assert left > 3 and right >= 0 + if s is None or len(s) <= left + right: + return s + return f'{s[:left-3]}...{s[-right:]}' + + +def orderedSet_from_options(options, alias_dict, *, use_regex=False, start=None): + assert 'all' in alias_dict, '"all" alias is required' + requested = list(start or []) + for val in options: + discard = val.startswith('-') + if discard: + val = val[1:] + + if val in alias_dict: + val = alias_dict[val] if not discard else [ + i[1:] if i.startswith('-') else f'-{i}' for i in alias_dict[val]] + # NB: Do not allow regex in aliases for performance + requested = orderedSet_from_options(val, alias_dict, start=requested) + continue + + current = (filter(re.compile(val, re.I).fullmatch, alias_dict['all']) if use_regex + else [val] if val in alias_dict['all'] else None) + if current is None: + raise ValueError(val) + + if discard: + for item in current: + while item in requested: + requested.remove(item) + else: + requested.extend(current) + + return orderedSet(requested) + + +class FormatSorter: + regex = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? 
*$' + + default = ('hidden', 'aud_or_vid', 'hasvid', 'ie_pref', 'lang', 'quality', + 'res', 'fps', 'hdr:12', 'vcodec:vp9.2', 'channels', 'acodec', + 'size', 'br', 'asr', 'proto', 'ext', 'hasaud', 'source', 'id') # These must not be aliases + ytdl_default = ('hasaud', 'lang', 'quality', 'tbr', 'filesize', 'vbr', + 'height', 'width', 'proto', 'vext', 'abr', 'aext', + 'fps', 'fs_approx', 'source', 'id') + + settings = { + 'vcodec': {'type': 'ordered', 'regex': True, + 'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']}, + 'acodec': {'type': 'ordered', 'regex': True, + 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']}, + 'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range', + 'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]}, + 'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol', + 'order': ['(ht|f)tps', '(ht|f)tp$', 'm3u8.*', '.*dash', 'websocket_frag', 'rtmpe?', '', 'mms|rtsp', 'ws|websocket', 'f4']}, + 'vext': {'type': 'ordered', 'field': 'video_ext', + 'order': ('mp4', 'mov', 'webm', 'flv', '', 'none'), + 'order_free': ('webm', 'mp4', 'mov', 'flv', '', 'none')}, + 'aext': {'type': 'ordered', 'field': 'audio_ext', + 'order': ('m4a', 'aac', 'mp3', 'ogg', 'opus', 'webm', '', 'none'), + 'order_free': ('ogg', 'opus', 'webm', 'mp3', 'm4a', 'aac', '', 'none')}, + 'hidden': {'visible': False, 'forced': True, 'type': 'extractor', 'max': -1000}, + 'aud_or_vid': {'visible': False, 'forced': True, 'type': 'multiple', + 'field': ('vcodec', 'acodec'), + 'function': lambda it: int(any(v != 'none' for v in it))}, + 'ie_pref': {'priority': True, 'type': 'extractor'}, + 'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)}, + 'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)}, + 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1}, + 'quality': {'convert': 'float', 'default': -1}, + 'filesize': {'convert': 'bytes'}, + 'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'}, + 'id': {'convert': 'string', 'field': 'format_id'}, + 'height': {'convert': 'float_none'}, + 'width': {'convert': 'float_none'}, + 'fps': {'convert': 'float_none'}, + 'channels': {'convert': 'float_none', 'field': 'audio_channels'}, + 'tbr': {'convert': 'float_none'}, + 'vbr': {'convert': 'float_none'}, + 'abr': {'convert': 'float_none'}, + 'asr': {'convert': 'float_none'}, + 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1}, + + 'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')}, + 'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True}, + 'size': {'type': 'combined', 'same_limit': True, 'field': ('filesize', 'fs_approx')}, + 'ext': {'type': 'combined', 'field': ('vext', 'aext')}, + 'res': {'type': 'multiple', 'field': ('height', 'width'), + 'function': lambda it: (lambda l: min(l) if l else 0)(tuple(filter(None, it)))}, + + # Actual field names + 'format_id': {'type': 'alias', 'field': 'id'}, + 'preference': {'type': 'alias', 'field': 'ie_pref'}, + 'language_preference': {'type': 'alias', 'field': 'lang'}, + 'source_preference': {'type': 'alias', 'field': 'source'}, + 'protocol': {'type': 'alias', 'field': 'proto'}, + 'filesize_approx': {'type': 'alias', 'field': 'fs_approx'}, + 'audio_channels': {'type': 'alias', 'field': 'channels'}, + + # Deprecated + 'dimension': {'type': 'alias', 
'field': 'res', 'deprecated': True}, + 'resolution': {'type': 'alias', 'field': 'res', 'deprecated': True}, + 'extension': {'type': 'alias', 'field': 'ext', 'deprecated': True}, + 'bitrate': {'type': 'alias', 'field': 'br', 'deprecated': True}, + 'total_bitrate': {'type': 'alias', 'field': 'tbr', 'deprecated': True}, + 'video_bitrate': {'type': 'alias', 'field': 'vbr', 'deprecated': True}, + 'audio_bitrate': {'type': 'alias', 'field': 'abr', 'deprecated': True}, + 'framerate': {'type': 'alias', 'field': 'fps', 'deprecated': True}, + 'filesize_estimate': {'type': 'alias', 'field': 'size', 'deprecated': True}, + 'samplerate': {'type': 'alias', 'field': 'asr', 'deprecated': True}, + 'video_ext': {'type': 'alias', 'field': 'vext', 'deprecated': True}, + 'audio_ext': {'type': 'alias', 'field': 'aext', 'deprecated': True}, + 'video_codec': {'type': 'alias', 'field': 'vcodec', 'deprecated': True}, + 'audio_codec': {'type': 'alias', 'field': 'acodec', 'deprecated': True}, + 'video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, + 'has_video': {'type': 'alias', 'field': 'hasvid', 'deprecated': True}, + 'audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, + 'has_audio': {'type': 'alias', 'field': 'hasaud', 'deprecated': True}, + 'extractor': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, + 'extractor_preference': {'type': 'alias', 'field': 'ie_pref', 'deprecated': True}, + } + + def __init__(self, ydl, field_preference): + self.ydl = ydl + self._order = [] + self.evaluate_params(self.ydl.params, field_preference) + if ydl.params.get('verbose'): + self.print_verbose_info(self.ydl.write_debug) + + def _get_field_setting(self, field, key): + if field not in self.settings: + if key in ('forced', 'priority'): + return False + self.ydl.deprecated_feature(f'Using arbitrary fields ({field}) for format sorting is ' + 'deprecated and may be removed in a future version') + self.settings[field] = {} + propObj = self.settings[field] + if key not in propObj: + type = propObj.get('type') + if key == 'field': + default = 'preference' if type == 'extractor' else (field,) if type in ('combined', 'multiple') else field + elif key == 'convert': + default = 'order' if type == 'ordered' else 'float_string' if field else 'ignore' + else: + default = {'type': 'field', 'visible': True, 'order': [], 'not_in_list': (None,)}.get(key, None) + propObj[key] = default + return propObj[key] + + def _resolve_field_value(self, field, value, convertNone=False): + if value is None: + if not convertNone: + return None + else: + value = value.lower() + conversion = self._get_field_setting(field, 'convert') + if conversion == 'ignore': + return None + if conversion == 'string': + return value + elif conversion == 'float_none': + return float_or_none(value) + elif conversion == 'bytes': + return parse_bytes(value) + elif conversion == 'order': + order_list = (self._use_free_order and self._get_field_setting(field, 'order_free')) or self._get_field_setting(field, 'order') + use_regex = self._get_field_setting(field, 'regex') + list_length = len(order_list) + empty_pos = order_list.index('') if '' in order_list else list_length + 1 + if use_regex and value is not None: + for i, regex in enumerate(order_list): + if regex and re.match(regex, value): + return list_length - i + return list_length - empty_pos # not in list + else: # not regex or value = None + return list_length - (order_list.index(value) if value in order_list else empty_pos) + else: + if value.isnumeric(): + return float(value) + else: + 
self.settings[field]['convert'] = 'string' + return value + + def evaluate_params(self, params, sort_extractor): + self._use_free_order = params.get('prefer_free_formats', False) + self._sort_user = params.get('format_sort', []) + self._sort_extractor = sort_extractor + + def add_item(field, reverse, closest, limit_text): + field = field.lower() + if field in self._order: + return + self._order.append(field) + limit = self._resolve_field_value(field, limit_text) + data = { + 'reverse': reverse, + 'closest': False if limit is None else closest, + 'limit_text': limit_text, + 'limit': limit} + if field in self.settings: + self.settings[field].update(data) + else: + self.settings[field] = data + + sort_list = ( + tuple(field for field in self.default if self._get_field_setting(field, 'forced')) + + (tuple() if params.get('format_sort_force', False) + else tuple(field for field in self.default if self._get_field_setting(field, 'priority'))) + + tuple(self._sort_user) + tuple(sort_extractor) + self.default) + + for item in sort_list: + match = re.match(self.regex, item) + if match is None: + raise ExtractorError('Invalid format sort string "%s" given by extractor' % item) + field = match.group('field') + if field is None: + continue + if self._get_field_setting(field, 'type') == 'alias': + alias, field = field, self._get_field_setting(field, 'field') + if self._get_field_setting(alias, 'deprecated'): + self.ydl.deprecated_feature(f'Format sorting alias {alias} is deprecated and may ' + f'be removed in a future version. Please use {field} instead') + reverse = match.group('reverse') is not None + closest = match.group('separator') == '~' + limit_text = match.group('limit') + + has_limit = limit_text is not None + has_multiple_fields = self._get_field_setting(field, 'type') == 'combined' + has_multiple_limits = has_limit and has_multiple_fields and not self._get_field_setting(field, 'same_limit') + + fields = self._get_field_setting(field, 'field') if has_multiple_fields else (field,) + limits = limit_text.split(':') if has_multiple_limits else (limit_text,) if has_limit else tuple() + limit_count = len(limits) + for (i, f) in enumerate(fields): + add_item(f, reverse, closest, + limits[i] if i < limit_count + else limits[0] if has_limit and not has_multiple_limits + else None) + + def print_verbose_info(self, write_debug): + if self._sort_user: + write_debug('Sort order given by user: %s' % ', '.join(self._sort_user)) + if self._sort_extractor: + write_debug('Sort order given by extractor: %s' % ', '.join(self._sort_extractor)) + write_debug('Formats sorted by: %s' % ', '.join(['%s%s%s' % ( + '+' if self._get_field_setting(field, 'reverse') else '', field, + '%s%s(%s)' % ('~' if self._get_field_setting(field, 'closest') else ':', + self._get_field_setting(field, 'limit_text'), + self._get_field_setting(field, 'limit')) + if self._get_field_setting(field, 'limit_text') is not None else '') + for field in self._order if self._get_field_setting(field, 'visible')])) + + def _calculate_field_preference_from_value(self, format, field, type, value): + reverse = self._get_field_setting(field, 'reverse') + closest = self._get_field_setting(field, 'closest') + limit = self._get_field_setting(field, 'limit') + + if type == 'extractor': + maximum = self._get_field_setting(field, 'max') + if value is None or (maximum is not None and value >= maximum): + value = -1 + elif type == 'boolean': + in_list = self._get_field_setting(field, 'in_list') + not_in_list = self._get_field_setting(field, 'not_in_list') + 
value = 0 if ((in_list is None or value in in_list) and (not_in_list is None or value not in not_in_list)) else -1 + elif type == 'ordered': + value = self._resolve_field_value(field, value, True) + + # try to convert to number + val_num = float_or_none(value, default=self._get_field_setting(field, 'default')) + is_num = self._get_field_setting(field, 'convert') != 'string' and val_num is not None + if is_num: + value = val_num + + return ((-10, 0) if value is None + else (1, value, 0) if not is_num # if a field has mixed strings and numbers, strings are sorted higher + else (0, -abs(value - limit), value - limit if reverse else limit - value) if closest + else (0, value, 0) if not reverse and (limit is None or value <= limit) + else (0, -value, 0) if limit is None or (reverse and value == limit) or value > limit + else (-1, value, 0)) + + def _calculate_field_preference(self, format, field): + type = self._get_field_setting(field, 'type') # extractor, boolean, ordered, field, multiple + get_value = lambda f: format.get(self._get_field_setting(f, 'field')) + if type == 'multiple': + type = 'field' # Only 'field' is allowed in multiple for now + actual_fields = self._get_field_setting(field, 'field') + + value = self._get_field_setting(field, 'function')(get_value(f) for f in actual_fields) + else: + value = get_value(field) + return self._calculate_field_preference_from_value(format, field, type, value) + + def calculate_preference(self, format): + # Determine missing protocol + if not format.get('protocol'): + format['protocol'] = determine_protocol(format) + + # Determine missing ext + if not format.get('ext') and 'url' in format: + format['ext'] = determine_ext(format['url']) + if format.get('vcodec') == 'none': + format['audio_ext'] = format['ext'] if format.get('acodec') != 'none' else 'none' + format['video_ext'] = 'none' + else: + format['video_ext'] = format['ext'] + format['audio_ext'] = 'none' + # if format.get('preference') is None and format.get('ext') in ('f4f', 'f4m'): # Not supported? + # format['preference'] = -1000 + + # Determine missing bitrates + if format.get('tbr') is None: + if format.get('vbr') is not None and format.get('abr') is not None: + format['tbr'] = format.get('vbr', 0) + format.get('abr', 0) + else: + if format.get('vcodec') != 'none' and format.get('vbr') is None: + format['vbr'] = format.get('tbr') - format.get('abr', 0) + if format.get('acodec') != 'none' and format.get('abr') is None: + format['abr'] = format.get('tbr') - format.get('vbr', 0) + + return tuple(self._calculate_field_preference(format, field) for field in self._order) + + +# Deprecated +has_certifi = bool(certifi) +has_websockets = bool(websockets) diff --git a/hypervideo_dl/version.py b/hypervideo_dl/version.py index 107fefb..3b08699 100644 --- a/hypervideo_dl/version.py +++ b/hypervideo_dl/version.py @@ -2,4 +2,8 @@ __version__ = '1.1.13' -RELEASE_GIT_HEAD = 'c0c2c57d3' +RELEASE_GIT_HEAD = '8b644025b' + +VARIANT = None + +UPDATE_HINT = None diff --git a/hypervideo_dl/webvtt.py b/hypervideo_dl/webvtt.py index 0e602a7..e24dae3 100644 --- a/hypervideo_dl/webvtt.py +++ b/hypervideo_dl/webvtt.py @@ -1,6 +1,3 @@ -# coding: utf-8 -from __future__ import unicode_literals, print_function, division - """ A partial parser for WebVTT segments. 
Interprets enough of the WebVTT stream to be able to assemble a single stand-alone subtitle file, suitably adjusting @@ -11,17 +8,13 @@ Regular expressions based on the W3C WebVTT specification in RFC 8216 §3.5 <https://tools.ietf.org/html/rfc8216#section-3.5>. """ -import re import io +import re + from .utils import int_or_none, timetuple_from_msec -from .compat import ( - compat_str as str, - compat_Pattern, - compat_Match, -) -class _MatchParser(object): +class _MatchParser: """ An object that maintains the current parsing position and allows conveniently advancing it as syntax elements are successfully parsed. @@ -32,7 +25,7 @@ class _MatchParser(object): self._pos = 0 def match(self, r): - if isinstance(r, compat_Pattern): + if isinstance(r, re.Pattern): return r.match(self._data, self._pos) if isinstance(r, str): if self._data.startswith(r, self._pos): @@ -43,7 +36,7 @@ class _MatchParser(object): def advance(self, by): if by is None: amt = 0 - elif isinstance(by, compat_Match): + elif isinstance(by, re.Match): amt = len(by.group(0)) elif isinstance(by, str): amt = len(by) @@ -70,7 +63,7 @@ class _MatchChildParser(_MatchParser): """ def __init__(self, parent): - super(_MatchChildParser, self).__init__(parent._data) + super().__init__(parent._data) self.__parent = parent self._pos = parent._pos @@ -84,7 +77,7 @@ class _MatchChildParser(_MatchParser): class ParseError(Exception): def __init__(self, parser): - super(ParseError, self).__init__("Parse error at position %u (near %r)" % ( + super().__init__("Parse error at position %u (near %r)" % ( parser._pos, parser._data[parser._pos:parser._pos + 20] )) @@ -100,7 +93,7 @@ _REGEX_TS = re.compile(r'''(?x) ([0-9]{3})? ''') _REGEX_EOF = re.compile(r'\Z') -_REGEX_NL = re.compile(r'(?:\r\n|[\r\n])') +_REGEX_NL = re.compile(r'(?:\r\n|[\r\n]|$)') _REGEX_BLANK = re.compile(r'(?:\r\n|[\r\n])+') @@ -109,14 +102,8 @@ def _parse_ts(ts): Convert a parsed WebVTT timestamp (a re.Match obtained from _REGEX_TS) into an MPEG PES timestamp: a tick counter at 90 kHz resolution. """ - - h, min, s, ms = ts.groups() - return 90 * ( - int(h or 0) * 3600000 + # noqa: W504,E221,E222 - int(min) * 60000 + # noqa: W504,E221,E222 - int(s) * 1000 + # noqa: W504,E221,E222 - int(ms) # noqa: W504,E221,E222 - ) + return 90 * sum( + int(part or 0) * mult for part, mult in zip(ts.groups(), (3600_000, 60_000, 1000, 1))) def _format_ts(ts): @@ -127,7 +114,7 @@ def _format_ts(ts): return '%02u:%02u:%02u.%03u' % timetuple_from_msec(int((ts + 45) // 90)) -class Block(object): +class Block: """ An abstract WebVTT block. """ @@ -153,7 +140,6 @@ class HeaderBlock(Block): A WebVTT block that may only appear in the header part of the file, i.e. before any cue blocks. """ - pass @@ -174,6 +160,12 @@ class Magic(HeaderBlock): _REGEX_TSMAP_MPEGTS = re.compile(r'MPEGTS:([0-9]+)') _REGEX_TSMAP_SEP = re.compile(r'[ \t]*,[ \t]*') + # This was removed from the spec in the 2017 revision; + # the last spec draft to describe this syntax element is + # <https://www.w3.org/TR/2015/WD-webvtt1-20151208/#webvtt-metadata-header>. 
+    # Nevertheless, YouTube keeps serving those
+    _REGEX_META = re.compile(r'(?:(?!-->)[^\r\n])+:(?:(?!-->)[^\r\n])+(?:\r\n|[\r\n])')
+
     @classmethod
     def __parse_tsmap(cls, parser):
         parser = parser.child()
@@ -213,13 +205,18 @@ class Magic(HeaderBlock):
             raise ParseError(parser)
         extra = m.group(1)
-        local, mpegts = None, None
-        if parser.consume(cls._REGEX_TSMAP):
-            local, mpegts = cls.__parse_tsmap(parser)
-        if not parser.consume(_REGEX_NL):
+        local, mpegts, meta = None, None, ''
+        while not parser.consume(_REGEX_NL):
+            if parser.consume(cls._REGEX_TSMAP):
+                local, mpegts = cls.__parse_tsmap(parser)
+                continue
+            m = parser.consume(cls._REGEX_META)
+            if m:
+                meta += m.group(0)
+                continue
             raise ParseError(parser)
         parser.commit()
-        return cls(extra=extra, mpegts=mpegts, local=local)
+        return cls(extra=extra, mpegts=mpegts, local=local, meta=meta)

     def write_into(self, stream):
         stream.write('WEBVTT')
@@ -232,6 +229,8 @@
             stream.write(',MPEGTS:')
             stream.write(str(self.mpegts if self.mpegts is not None else 0))
             stream.write('\n')
+        if self.meta:
+            stream.write(self.meta)
         stream.write('\n')
@@ -359,7 +358,7 @@ def parse_fragment(frag_content):
     a bytes object containing the raw contents of a WebVTT file.
     """
-    parser = _MatchParser(frag_content.decode('utf-8'))
+    parser = _MatchParser(frag_content.decode())

     yield Magic.parse(parser)
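
A few usage sketches for the utilities this update touches follow; they are based only on the code visible in the hunks above, and all sample data, URLs and values are illustrative, not taken from the project. First, the reworked traverse_obj: a path is applied key by key, where `...` branches over every value, a list of keys branches over alternative paths, a callable keeps the (key, value) pairs it accepts, and get_all/default control the shape of the result.

    from hypervideo_dl.utils import traverse_obj

    data = {'formats': [  # hypothetical sample data
        {'url': 'https://a.example/v.mp4', 'height': 720},
        {'url': 'https://a.example/v.webm', 'height': None},
    ]}

    traverse_obj(data, ('formats', ..., 'height'))              # [720] - branching returns a list, None is dropped
    traverse_obj(data, ('formats', ..., 'url'), get_all=False)  # first matching URL only
    traverse_obj(data, ('formats', lambda _, v: v.get('height')))  # only formats with a truthy height
    traverse_obj(data, ('formats', 0, 'fps'), default=-1)       # -1 - path missing, default applies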
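
jwt_encode_hs256 and jwt_decode_hs256 round-trip as shown above; the decoder re-appends '===' before base64-decoding because tokens in the wild often arrive with their padding stripped, and superfluous '='s are ignored. A minimal sketch (key and claims are made up):

    from hypervideo_dl.utils import jwt_decode_hs256, jwt_encode_hs256

    token = jwt_encode_hs256({'sub': 'demo'}, 'secret-key')  # bytes: b'<header>.<payload>.<signature>'
    assert jwt_decode_hs256(token.decode()) == {'sub': 'demo'}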
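
merge_headers normalizes keys with str.title() so later dicts win case-insensitively, and join_nonempty's from_dict now resolves each name through traverse_obj, so falsy fields are dropped and nested paths work as selectors. Sketch with made-up values:

    from hypervideo_dl.utils import join_nonempty, merge_headers

    merge_headers({'user-agent': 'A'}, {'User-Agent': 'B'})  # {'User-Agent': 'B'}

    fmt = {'width': 1920, 'height': 1080, 'fps': None}
    join_nonempty('width', 'height', 'fps', from_dict=fmt)   # '1920-1080' - the None field is dropped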
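
determine_file_encoding checks the BOM table first and only then looks for a PEP 263-style '# coding: ...' line, stripping NUL bytes beforehand so the regex still matches UTF-16/UTF-32 text. A sketch; the exact codec names come from the BOMS table defined elsewhere in utils, so the 'utf-16-le' value below is an assumption:

    from hypervideo_dl.utils import determine_file_encoding

    determine_file_encoding(b'\xff\xfe-\x00v\x00')           # ('utf-16-le', 2) - decode after skipping the 2 BOM bytes
    determine_file_encoding(b'# coding: shift-jis\n--flag')  # ('shift-jis', 0) - declaration found
    determine_file_encoding(b'--no-mtime\n')                 # (None, 0) - caller falls back to a default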
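
cached_method memoizes per instance, keyed on the bound arguments after defaults are applied, and classproperty now optionally caches one value per class. A hypothetical class for illustration:

    from hypervideo_dl.utils import cached_method, classproperty

    class Page:
        def __init__(self, url):
            self.url = url

        @cached_method
        def fetch(self, timeout=10):
            print('fetching', self.url)   # runs once per distinct (timeout,) key
            return f'<body of {self.url}>'

        @classproperty(cache=True)
        def user_agent(cls):
            print('computed once per class')
            return 'hypervideo/1.1.13'

    p = Page('https://example.com')
    p.fetch()
    p.fetch()        # served from the per-instance cache
    Page.user_agent  # function body runs here...
    Page.user_agent  # ...but not here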
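
RetryManager (usage shown in its docstring above) yields once per attempt until the body completes without recording an error; setting retry.error re-arms it, and the callback fires after each failed attempt. A runnable sketch with a deliberately flaky function and a plain print/sleep callback:

    import time
    from hypervideo_dl.utils import RetryManager

    def flaky(attempt):
        if attempt < 3:
            raise OSError('temporary failure')
        return 'ok'

    def on_error(err, count, retries):
        print(f'attempt {count}/{retries} failed: {err}')
        time.sleep(0.1)  # fixed backoff, just for the sketch

    for retry in RetryManager(5, on_error):
        try:
            result = flaky(retry.attempt)
        except OSError as err:
            retry.error = err
            continue
    print(result)  # 'ok' after two retries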
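
orderedSet_from_options expands aliases recursively against the mandatory 'all' alias and supports '-item' negation, deduplicating while preserving order. Sketch with a made-up alias table:

    from hypervideo_dl.utils import orderedSet_from_options

    aliases = {'all': ['subs', 'thumbs', 'info'], 'media': ['subs', 'thumbs']}
    orderedSet_from_options(['media'], aliases)           # ['subs', 'thumbs']
    orderedSet_from_options(['all', '-thumbs'], aliases)  # ['subs', 'info']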
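
FormatSorter parses each sort key with its `regex` attribute shown above: an optional '+' reverses the field, and '~' or ':' introduce a closest-match or limit value. Exercising just that grammar, with the pattern copied from the class for a standalone check:

    import re

    SORT_KEY_RE = r' *((?P<reverse>\+)?(?P<field>[a-zA-Z0-9_]+)((?P<separator>[~:])(?P<limit>.*?))?)? *$'

    m = re.match(SORT_KEY_RE, 'res:1080')
    assert (m['field'], m['separator'], m['limit']) == ('res', ':', '1080')

    m = re.match(SORT_KEY_RE, '+size~100M')
    assert m['reverse'] == '+' and m['limit'] == '100M'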
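
Finally, the compacted _parse_ts in webvtt.py converts an HH:MM:SS.mmm timestamp into a 90 kHz MPEG PES tick count, and _format_ts inverts it with (ts + 45) // 90. Worked arithmetic for 00:01:02.500:

    ms = 0 * 3600_000 + 1 * 60_000 + 2 * 1000 + 500  # 00:01:02.500 -> 62_500 ms
    ticks = 90 * ms                                  # 5_625_000 ticks at 90 kHz
    assert (ticks + 45) // 90 == ms                  # _format_ts rounds back to milliseconds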