about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- yt_dlp/YoutubeDL.py 33
-rw-r--r-- yt_dlp/__init__.py 15
-rw-r--r-- yt_dlp/extractor/bandcamp.py 1
-rw-r--r-- yt_dlp/extractor/common.py 7
-rw-r--r-- yt_dlp/extractor/crunchyroll.py 2
-rw-r--r-- yt_dlp/extractor/daystar.py 48
-rw-r--r-- yt_dlp/extractor/extractors.py 6
-rw-r--r-- yt_dlp/extractor/gettr.py 82
-rw-r--r-- yt_dlp/extractor/instagram.py 3
-rw-r--r-- yt_dlp/extractor/lbry.py 9
-rw-r--r-- yt_dlp/extractor/mildom.py 3
-rw-r--r-- yt_dlp/extractor/openload.py 3
-rw-r--r-- yt_dlp/extractor/rtve.py 3
-rw-r--r-- yt_dlp/extractor/telegram.py 37
-rw-r--r-- yt_dlp/extractor/twitch.py 2
-rw-r--r-- yt_dlp/extractor/videocampus_sachsen.py 96
-rw-r--r-- yt_dlp/extractor/vimeo.py 3
-rw-r--r-- yt_dlp/extractor/youtube.py 10
-rw-r--r-- yt_dlp/extractor/zingmp3.py 135
-rw-r--r-- yt_dlp/options.py 7
-rw-r--r-- yt_dlp/utils.py 13
21 files changed, 393 insertions, 125 deletions
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index 5db4cc483..99c280ffc 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -83,6 +83,7 @@ from .utils import (
make_dir,
make_HTTPS_handler,
MaxDownloadsReached,
+ merge_headers,
network_exceptions,
number_of_digits,
orderedSet,
@@ -331,6 +332,7 @@ class YoutubeDL(object):
nocheckcertificate: Do not verify SSL certificates
prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
At the moment, this is only supported by YouTube.
+ http_headers: A dictionary of custom headers to be used for all requests
proxy: URL of the proxy server to use
geo_verification_proxy: URL of the proxy to use for IP address verification
on geo-restricted sites.
@@ -646,6 +648,9 @@ class YoutubeDL(object):
else self.params['format'] if callable(self.params['format'])
else self.build_format_selector(self.params['format']))
+ # Set http_headers defaults according to std_headers
+ self.params['http_headers'] = merge_headers(std_headers, self.params.get('http_headers', {}))
+
self._setup_opener()
if auto_init:
@@ -953,13 +958,13 @@ class YoutubeDL(object):
except UnicodeEncodeError:
self.to_screen('Deleting existing file')
- def raise_no_formats(self, info, forced=False):
+ def raise_no_formats(self, info, forced=False, *, msg=None):
has_drm = info.get('__has_drm')
- msg = 'This video is DRM protected' if has_drm else 'No video formats found!'
- expected = self.params.get('ignore_no_formats_error')
- if forced or not expected:
+ ignored, expected = self.params.get('ignore_no_formats_error'), bool(msg)
+ msg = msg or has_drm and 'This video is DRM protected' or 'No video formats found!'
+ if forced or not ignored:
raise ExtractorError(msg, video_id=info['id'], ie=info['extractor'],
- expected=has_drm or expected)
+ expected=has_drm or ignored or expected)
else:
self.report_warning(msg)
@@ -2249,8 +2254,7 @@ class YoutubeDL(object):
return _build_selector_function(parsed_selector)
def _calc_headers(self, info_dict):
- res = std_headers.copy()
- res.update(info_dict.get('http_headers') or {})
+ res = merge_headers(self.params['http_headers'], info_dict.get('http_headers') or {})
cookies = self._calc_cookies(info_dict)
if cookies:
@@ -2391,6 +2395,8 @@ class YoutubeDL(object):
sanitize_string_field(info_dict, 'id')
sanitize_numeric_fields(info_dict)
+ if (info_dict.get('duration') or 0) <= 0 and info_dict.pop('duration', None):
+ self.report_warning('"duration" field is negative, there is an error in extractor')
if 'playlist' not in info_dict:
# It isn't part of a playlist
@@ -2437,11 +2443,14 @@ class YoutubeDL(object):
if not self.params.get('allow_unplayable_formats'):
formats = [f for f in formats if not f.get('has_drm')]
- if info_dict.get('is_live'):
- get_from_start = bool(self.params.get('live_from_start'))
+ get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start'))
+ if not get_from_start:
+ info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
+ if info_dict.get('is_live') and formats:
formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start]
- if not get_from_start:
- info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
+ if get_from_start and not formats:
+ self.raise_no_formats(info_dict, msg='--live-from-start is passed, but there are no formats that can be downloaded from the start. '
+ 'If you want to download from the current time, pass --no-live-from-start')
if not formats:
self.raise_no_formats(info_dict)
@@ -3582,7 +3591,7 @@ class YoutubeDL(object):
return
def get_encoding(stream):
- ret = getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__)
+ ret = str(getattr(stream, 'encoding', 'missing (%s)' % type(stream).__name__))
if not supports_terminal_sequences(stream):
from .compat import WINDOWS_VT_MODE
ret += ' (No VT)' if WINDOWS_VT_MODE is False else ' (No ANSI)'
diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py
index f308f6a89..524130807 100644
--- a/yt_dlp/__init__.py
+++ b/yt_dlp/__init__.py
@@ -39,6 +39,7 @@ from .utils import (
SameFileError,
setproctitle,
std_headers,
+ traverse_obj,
write_string,
)
from .downloader import (
@@ -72,20 +73,15 @@ def _real_main(argv=None):
parser, opts, args = parseOpts(argv)
warnings, deprecation_warnings = [], []
- # Set user agent
if opts.user_agent is not None:
- std_headers['User-Agent'] = opts.user_agent
-
- # Set referer
+ opts.headers.setdefault('User-Agent', opts.user_agent)
if opts.referer is not None:
- std_headers['Referer'] = opts.referer
-
- # Custom HTTP headers
- std_headers.update(opts.headers)
+ opts.headers.setdefault('Referer', opts.referer)
# Dump user agent
if opts.dump_user_agent:
- write_string(std_headers['User-Agent'] + '\n', out=sys.stdout)
+ ua = traverse_obj(opts.headers, 'User-Agent', casesense=False, default=std_headers['User-Agent'])
+ write_string(f'{ua}\n', out=sys.stdout)
sys.exit(0)
# Batch file verification
@@ -764,6 +760,7 @@ def _real_main(argv=None):
'legacyserverconnect': opts.legacy_server_connect,
'nocheckcertificate': opts.no_check_certificate,
'prefer_insecure': opts.prefer_insecure,
+ 'http_headers': opts.headers,
'proxy': opts.proxy,
'socket_timeout': opts.socket_timeout,
'bidi_workaround': opts.bidi_workaround,
diff --git a/yt_dlp/extractor/bandcamp.py b/yt_dlp/extractor/bandcamp.py
index 42223dab7..745055e2d 100644
--- a/yt_dlp/extractor/bandcamp.py
+++ b/yt_dlp/extractor/bandcamp.py
@@ -183,6 +183,7 @@ class BandcampIE(InfoExtractor):
'format_note': f.get('description'),
'filesize': parse_filesize(f.get('size_mb')),
'vcodec': 'none',
+ 'acodec': format_id.split('-')[0],
})
self._sort_formats(formats)
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index d8bb21137..dbf5ef8d4 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -639,7 +639,7 @@ class InfoExtractor(object):
}
if hasattr(e, 'countries'):
kwargs['countries'] = e.countries
- raise type(e)(e.msg, **kwargs)
+ raise type(e)(e.orig_msg, **kwargs)
except compat_http_client.IncompleteRead as e:
raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
except (KeyError, StopIteration) as e:
@@ -1101,6 +1101,7 @@ class InfoExtractor(object):
if metadata_available and (
self.get_param('ignore_no_formats_error') or self.get_param('wait_for_video')):
self.report_warning(msg)
+ return
if method is not None:
msg = '%s. %s' % (msg, self._LOGIN_HINTS[method])
raise ExtractorError(msg, expected=True)
@@ -1617,7 +1618,7 @@ class InfoExtractor(object):
'vcodec': {'type': 'ordered', 'regex': True,
'order': ['av0?1', 'vp0?9.2', 'vp0?9', '[hx]265|he?vc?', '[hx]264|avc', 'vp0?8', 'mp4v|h263', 'theora', '', None, 'none']},
'acodec': {'type': 'ordered', 'regex': True,
- 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
+ 'order': ['[af]lac', 'wav|aiff', 'opus', 'vorbis|ogg', 'aac', 'mp?4a?', 'mp3', 'e-?a?c-?3', 'ac-?3', 'dts', '', None, 'none']},
'hdr': {'type': 'ordered', 'regex': True, 'field': 'dynamic_range',
'order': ['dv', '(hdr)?12', r'(hdr)?10\+', '(hdr)?10', 'hlg', '', 'sdr', None]},
'proto': {'type': 'ordered', 'regex': True, 'field': 'protocol',
@@ -3678,7 +3679,7 @@ class InfoExtractor(object):
def mark_watched(self, *args, **kwargs):
if not self.get_param('mark_watched', False):
return
- if (self._get_login_info()[0] is not None
+ if (hasattr(self, '_NETRC_MACHINE') and self._get_login_info()[0] is not None
or self.get_param('cookiefile')
or self.get_param('cookiesfrombrowser')):
self._mark_watched(*args, **kwargs)
diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py
index ffe291098..b6ba5ef56 100644
--- a/yt_dlp/extractor/crunchyroll.py
+++ b/yt_dlp/extractor/crunchyroll.py
@@ -85,7 +85,7 @@ class CrunchyrollBaseIE(InfoExtractor):
'session_id': session_id
}).encode('ascii'))
if login_response['code'] != 'ok':
- raise ExtractorError('Login failed. Bad username or password?', expected=True)
+ raise ExtractorError('Login failed. Server message: %s' % login_response['message'], expected=True)
if not self._get_cookies(self._LOGIN_URL).get('etp_rt'):
raise ExtractorError('Login succeeded but did not set etp_rt cookie')
diff --git a/yt_dlp/extractor/daystar.py b/yt_dlp/extractor/daystar.py
new file mode 100644
index 000000000..4f59d904f
--- /dev/null
+++ b/yt_dlp/extractor/daystar.py
@@ -0,0 +1,48 @@
+from .common import InfoExtractor
+from ..utils import js_to_json, urljoin
+
+
+class DaystarClipIE(InfoExtractor):  # Extracts a single clip from a player.daystar.tv page
+    IE_NAME = 'daystar:clip'
+    _VALID_URL = r'https?://player\.daystar\.tv/(?P<id>\w+)'
+    _TESTS = [{
+        'url': 'https://player.daystar.tv/0MTO2ITM',
+        'info_dict': {
+            'id': '0MTO2ITM',
+            'ext': 'mp4',
+            'title': 'The Dark World of COVID Pt. 1 | Aaron Siri',
+            'description': 'a420d320dda734e5f29458df3606c5f4',
+            'thumbnail': r're:^https?://.+\.jpg',
+        },
+    }]
+
+    def _real_extract(self, url):  # Returns the info dict (id, title, formats, subtitles, ...) for the clip at `url`
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        src_iframe = self._search_regex(r'\<iframe[^>]+src="([^"]+)"', webpage, 'src iframe')
+        webpage_iframe = self._download_webpage(  # the config page is the iframe URL with player.php swapped for config2.php
+            src_iframe.replace('player.php', 'config2.php'), video_id, headers={'Referer': src_iframe})
+
+        sources = self._parse_json(self._search_regex(  # the matched array is passed through js_to_json before JSON parsing
+            r'sources\:\s*(\[.*?\])', webpage_iframe, 'm3u8 source'), video_id, transform_source=js_to_json)
+
+        formats, subtitles = [], {}
+        for source in sources:
+            file = source.get('file')
+            if file and source.get('type') == 'm3u8':  # entries of any other type are ignored
+                fmts, subs = self._extract_m3u8_formats_and_subtitles(
+                    urljoin('https://www.lightcast.com/embed/', file),
+                    video_id, 'mp4', fatal=False, headers={'Referer': src_iframe})
+                formats.extend(fmts)
+                subtitles = self._merge_subtitles(subtitles, subs)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True),  # mandatory field: fail here, at the lookup
+            'description': self._html_search_meta(['og:description', 'twitter:description'], webpage),
+            'thumbnail': self._search_regex(r'image:\s*"([^"]+)', webpage_iframe, 'thumbnail', fatal=False),  # optional field: a miss must not abort extraction
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index 0f26dc24f..5ef1901e4 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -348,6 +348,7 @@ from .daum import (
DaumPlaylistIE,
DaumUserIE,
)
+from .daystar import DaystarClipIE
from .dbtv import DBTVIE
from .dctp import DctpTvIE
from .deezer import (
@@ -1593,6 +1594,7 @@ from .tele13 import Tele13IE
from .telebruxelles import TeleBruxellesIE
from .telecinco import TelecincoIE
from .telegraaf import TelegraafIE
+from .telegram import TelegramEmbedIE
from .telemb import TeleMBIE
from .telemundo import TelemundoIE
from .telequebec import (
@@ -1833,6 +1835,10 @@ from .vice import (
from .vidbit import VidbitIE
from .viddler import ViddlerIE
from .videa import VideaIE
+from .videocampus_sachsen import (
+ VideocampusSachsenIE,
+ VideocampusSachsenEmbedIE,
+)
from .videodetective import VideoDetectiveIE
from .videofyme import VideofyMeIE
from .videomore import (
diff --git a/yt_dlp/extractor/gettr.py b/yt_dlp/extractor/gettr.py
index 9842edd81..327a4d0b8 100644
--- a/yt_dlp/extractor/gettr.py
+++ b/yt_dlp/extractor/gettr.py
@@ -8,8 +8,8 @@ from ..utils import (
dict_get,
float_or_none,
int_or_none,
- remove_end,
str_or_none,
+ traverse_obj,
try_get,
url_or_none,
urljoin,
@@ -36,8 +36,9 @@ class GettrIE(GettrBaseIE):
'ext': 'mp4',
'uploader': 'EpochTV',
'uploader_id': 'epochtv',
+ 'upload_date': '20210927',
'thumbnail': r're:^https?://.+/out\.jpg',
- 'timestamp': 1632782451058,
+ 'timestamp': 1632782451.058,
'duration': 58.5585,
'tags': ['hornofafrica', 'explorations'],
}
@@ -50,43 +51,69 @@ class GettrIE(GettrBaseIE):
'ext': 'mp4',
'uploader': 'Neues Forum Freiheit',
'uploader_id': 'nf_freiheit',
+ 'upload_date': '20210718',
'thumbnail': r're:^https?://.+/out\.jpg',
- 'timestamp': 1626594455017,
+ 'timestamp': 1626594455.017,
'duration': 23,
'tags': 'count:12',
}
+ }, {
+ # quote post
+ 'url': 'https://gettr.com/post/pxn5b743a9',
+ 'only_matching': True,
+ }, {
+ # quote with video
+ 'url': 'https://gettr.com/post/pxtiiz5ca2',
+ 'only_matching': True,
+ }, {
+ # streaming embed
+ 'url': 'https://gettr.com/post/pxlu8p3b13',
+ 'only_matching': True,
+ }, {
+ # youtube embed
+ 'url': 'https://gettr.com/post/pv6wp9e24c',
+ 'only_matching': True,
+ 'add_ie': ['Youtube'],
}]
def _real_extract(self, url):
post_id = self._match_id(url)
webpage = self._download_webpage(url, post_id)
-
api_data = self._call_api('post/%s?incl="poststats|userinfo"' % post_id, post_id)
post_data = api_data.get('data')
- user_data = try_get(api_data, lambda x: x['aux']['uinf'][post_data['uid']]) or {}
+ user_data = try_get(api_data, lambda x: x['aux']['uinf'][post_data['uid']], dict) or {}
- if post_data.get('nfound'):
- raise ExtractorError(post_data.get('txt'), expected=True)
+ vid = post_data.get('vid')
+ ovid = post_data.get('ovid')
+
+ if post_data.get('p_type') == 'stream':
+ return self.url_result(f'https://gettr.com/streaming/{post_id}', ie='GettrStreaming', video_id=post_id)
+
+ if not (ovid or vid):
+ embed_url = url_or_none(post_data.get('prevsrc'))
+ shared_post_id = traverse_obj(api_data, ('aux', 'shrdpst', '_id'), ('data', 'rpstIds', 0), expected_type=str)
+
+ if embed_url:
+ return self.url_result(embed_url)
+ elif shared_post_id:
+ return self.url_result(f'https://gettr.com/post/{shared_post_id}', ie='Gettr', video_id=shared_post_id)
+ else:
+ raise ExtractorError('There\'s no video in this post.')
title = description = str_or_none(
post_data.get('txt') or self._og_search_description(webpage))
uploader = str_or_none(
user_data.get('nickname')
- or remove_end(self._og_search_title(webpage), ' on GETTR'))
+ or self._search_regex(r'^(.+?) on GETTR', self._og_search_title(webpage, default=''), 'uploader', fatal=False))
+
if uploader:
title = '%s - %s' % (uploader, title)
- if not dict_get(post_data, ['vid', 'ovid']):
- raise ExtractorError('There\'s no video in this post.')
-
- vid = post_data.get('vid')
- ovid = post_data.get('ovid')
-
- formats = self._extract_m3u8_formats(
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
urljoin(self._MEDIA_BASE_URL, vid), post_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if vid else []
+ entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) if vid else ([], {})
if ovid:
formats.append({
@@ -103,15 +130,16 @@ class GettrIE(GettrBaseIE):
'id': post_id,
'title': title,
'description': description,
- 'thumbnail': url_or_none(
- urljoin(self._MEDIA_BASE_URL, post_data.get('main'))
- or self._og_search_thumbnail(webpage)),
- 'timestamp': int_or_none(post_data.get('cdate')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'uploader': uploader,
'uploader_id': str_or_none(
dict_get(user_data, ['_id', 'username'])
or post_data.get('uid')),
- 'uploader': uploader,
- 'formats': formats,
+ 'thumbnail': url_or_none(
+ urljoin(self._MEDIA_BASE_URL, post_data.get('main'))
+ or self._html_search_meta(['og:image', 'image'], webpage, 'thumbnail', fatal=False)),
+ 'timestamp': float_or_none(dict_get(post_data, ['cdate', 'udate']), scale=1000),
'duration': float_or_none(post_data.get('vid_dur')),
'tags': post_data.get('htgs'),
}
@@ -165,19 +193,19 @@ class GettrStreamingIE(GettrBaseIE):
thumbnails = [{
'url': urljoin(self._MEDIA_BASE_URL, thumbnail),
- } for thumbnail in try_get(video_info, lambda x: x['postData']['imgs']) or []]
+ } for thumbnail in try_get(video_info, lambda x: x['postData']['imgs'], list) or []]
self._sort_formats(formats)
return {
'id': video_id,
- 'title': try_get(video_info, lambda x: x['postData']['ttl']),
- 'description': try_get(video_info, lambda x: x['postData']['dsc']),
+ 'title': try_get(video_info, lambda x: x['postData']['ttl'], str),
+ 'description': try_get(video_info, lambda x: x['postData']['dsc'], str),
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
- 'uploader': try_get(video_info, lambda x: x['liveHostInfo']['nickname']),
- 'uploader_id': try_get(video_info, lambda x: x['liveHostInfo']['_id']),
+ 'uploader': try_get(video_info, lambda x: x['liveHostInfo']['nickname'], str),
+ 'uploader_id': try_get(video_info, lambda x: x['liveHostInfo']['_id'], str),
'view_count': int_or_none(live_info.get('viewsCount')),
'timestamp': float_or_none(live_info.get('startAt'), scale=1000),
'duration': float_or_none(live_info.get('duration'), scale=1000),
diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py
index a2cc9f748..3bb786d6a 100644
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@@ -17,7 +17,6 @@ from ..utils import (
get_element_by_attribute,
int_or_none,
lowercase_escape,
- std_headers,
str_or_none,
str_to_int,
traverse_obj,
@@ -503,7 +502,7 @@ class InstagramPlaylistBaseIE(InstagramBaseIE):
'%s' % rhx_gis,
'',
'%s:%s' % (rhx_gis, csrf_token),
- '%s:%s:%s' % (rhx_gis, csrf_token, std_headers['User-Agent']),
+ '%s:%s:%s' % (rhx_gis, csrf_token, self.get_param('http_headers')['User-Agent']),
]
# try all of the ways to generate a GIS query, and not only use the
diff --git a/yt_dlp/extractor/lbry.py b/yt_dlp/extractor/lbry.py
index 1405ce0c7..5d5457c53 100644
--- a/yt_dlp/extractor/lbry.py
+++ b/yt_dlp/extractor/lbry.py
@@ -17,6 +17,7 @@ from ..utils import (
parse_qs,
OnDemandPagedList,
try_get,
+ UnsupportedError,
urljoin,
)
@@ -196,11 +197,11 @@ class LBRYIE(LBRYBaseIE):
live_data = self._download_json(
f'https://api.live.odysee.com/v1/odysee/live/{claim_id}', claim_id,
note='Downloading livestream JSON metadata')['data']
- if not live_data['live']:
- raise ExtractorError('This stream is not live', expected=True)
- streaming_url = final_url = live_data['url']
+ streaming_url = final_url = live_data.get('url')
+ if not final_url and not live_data.get('live'):
+ self.raise_no_formats('This stream is not live', True, claim_id)
else:
- raise ExtractorError('Unsupported URL', expected=True)
+ raise UnsupportedError(url)
info = self._parse_stream(result, url)
if determine_ext(final_url) == 'm3u8':
diff --git a/yt_dlp/extractor/mildom.py b/yt_dlp/extractor/mildom.py
index ddeaa7021..b5a2e17f2 100644
--- a/yt_dlp/extractor/mildom.py
+++ b/yt_dlp/extractor/mildom.py
@@ -8,7 +8,6 @@ import json
from .common import InfoExtractor
from ..utils import (
- std_headers,
update_url_query,
random_uuidv4,
try_get,
@@ -70,7 +69,7 @@ class MildomBaseIE(InfoExtractor):
'clu': '',
'wh': '1919*810',
'rtm': self.iso_timestamp(),
- 'ua': std_headers['User-Agent'],
+ 'ua': self.get_param('http_headers')['User-Agent'],
}).encode('utf8')).decode('utf8').replace('\n', ''),
}).encode('utf8'))
self._DISPATCHER_CONFIG = self._parse_json(base64.b64decode(tmp['data']), 'initialization')
diff --git a/yt_dlp/extractor/openload.py b/yt_dlp/extractor/openload.py
index 6ec54509b..36927009d 100644
--- a/yt_dlp/extractor/openload.py
+++ b/yt_dlp/extractor/openload.py
@@ -16,7 +16,6 @@ from ..utils import (
ExtractorError,
get_exe_version,
is_outdated_version,
- std_headers,
Popen,
)
@@ -208,7 +207,7 @@ class PhantomJSwrapper(object):
replaces = self.options
replaces['url'] = url
- user_agent = headers.get('User-Agent') or std_headers['User-Agent']
+ user_agent = headers.get('User-Agent') or self.get_param('http_headers')['User-Agent']
replaces['ua'] = user_agent.replace('"', '\\"')
replaces['jscode'] = jscode
diff --git a/yt_dlp/extractor/rtve.py b/yt_dlp/extractor/rtve.py
index af1bb943d..7a1dc6f32 100644
--- a/yt_dlp/extractor/rtve.py
+++ b/yt_dlp/extractor/rtve.py
@@ -17,7 +17,6 @@ from ..utils import (
qualities,
remove_end,
remove_start,
- std_headers,
try_get,
)
@@ -71,7 +70,7 @@ class RTVEALaCartaIE(InfoExtractor):
}]
def _real_initialize(self):
- user_agent_b64 = base64.b64encode(std_headers['User-Agent'].encode('utf-8')).decode('utf-8')
+ user_agent_b64 = base64.b64encode(self.get_param('http_headers')['User-Agent'].encode('utf-8')).decode('utf-8')
self._manager = self._download_json(
'http://www.rtve.es/odin/loki/' + user_agent_b64,
None, 'Fetching manager info')['manager']
diff --git a/yt_dlp/extractor/telegram.py b/yt_dlp/extractor/telegram.py
new file mode 100644
index 000000000..2dfa261e9
--- /dev/null
+++ b/yt_dlp/extractor/telegram.py
@@ -0,0 +1,37 @@
+from .common import InfoExtractor
+
+
+class TelegramEmbedIE(InfoExtractor):  # Extracts the video attached to a single t.me/<channel>/<message id> post
+    IE_NAME = 'telegram:embed'
+    _VALID_URL = r'https?://t\.me/(?P<channel_name>[^/]+)/(?P<id>\d+)'
+    _TESTS = [{
+        'url': 'https://t.me/europa_press/613',
+        'info_dict': {
+            'id': '613',
+            'ext': 'mp4',
+            'title': 'Europa Press',
+            'description': '6ce2d7e8d56eda16d80607b23db7b252',
+            'thumbnail': r're:^https?:\/\/cdn.*?telesco\.pe\/file\/\w+',
+        },
+    }]
+
+    def _real_extract(self, url):  # Returns the info dict for the post; metadata from the post page, media URL from the embed page
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        webpage_embed = self._download_webpage(url, video_id, query={'embed': 1}, note='Downloading embed frame')  # query= avoids a second "?" when url already has a query string
+
+        formats = [{
+            'url': self._proto_relative_url(self._search_regex(  # fatal on purpose: without a <video src> there is nothing to download
+                '<video[^>]+src="([^"]+)"', webpage_embed, 'source')),
+            'ext': 'mp4',
+        }]
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True),
+            'description': self._html_search_meta(['og:description', 'twitter:description'], webpage),  # optional field: non-fatal
+            'thumbnail': self._search_regex(r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
+                                            webpage_embed, 'thumbnail', fatal=False),  # optional field: non-fatal
+            'formats': formats,
+        }
diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py
index 54e500edd..bee26c3a3 100644
--- a/yt_dlp/extractor/twitch.py
+++ b/yt_dlp/extractor/twitch.py
@@ -1048,7 +1048,7 @@ class TwitchClipsIE(TwitchBaseIE):
'title': clip.get('title') or video_id,
'formats': formats,
'duration': int_or_none(clip.get('durationSeconds')),
- 'views': int_or_none(clip.get('viewCount')),
+ 'view_count': int_or_none(clip.get('viewCount')),
'timestamp': unified_timestamp(clip.get('createdAt')),
'thumbnails': thumbnails,
'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str),
diff --git a/yt_dlp/extractor/videocampus_sachsen.py b/yt_dlp/extractor/videocampus_sachsen.py
new file mode 100644
index 000000000..96e98573f
--- /dev/null
+++ b/yt_dlp/extractor/videocampus_sachsen.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from .common import InfoExtractor
+
+
+class VideocampusSachsenIE(InfoExtractor):  # Handles videocampus.sachsen.de "m/<token>" links and "[category/]video/<slug>/<id>" pages
+    _VALID_URL = r'''(?x)https?://videocampus\.sachsen\.de/(?:
+        m/(?P<tmp_id>[0-9a-f]+)|
+        (?:category/)?video/(?P<display_id>[\w-]+)/(?P<id>[0-9a-f]{32})
+    )'''
+
+    _TESTS = [
+        {
+            'url': 'https://videocampus.sachsen.de/m/e0d6c8ce6e394c188f1342f1ab7c50ed6fc4490b808699801def5cb2e46d76ca7367f622a9f516c542ffb805b24d6b643bd7c81f385acaac4c59081b87a2767b',
+            'info_dict': {
+                'id': 'e6b9349905c1628631f175712250f2a1',
+                'title': 'Konstruktiver Entwicklungsprozess Vorlesung 7',
+                'ext': 'mp4',
+            },
+        },
+        {
+            'url': 'https://videocampus.sachsen.de/video/Was-ist-selbstgesteuertes-Lernen/fc99c527e4205b121cb7c74433469262',
+            'info_dict': {
+                'id': 'fc99c527e4205b121cb7c74433469262',
+                'title': 'Was ist selbstgesteuertes Lernen?',
+                'display_id': 'Was-ist-selbstgesteuertes-Lernen',
+                'ext': 'mp4',
+            },
+        },
+        {
+            'url': 'https://videocampus.sachsen.de/category/video/Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht/09d4ed029002eb1bdda610f1103dd54c/100',
+            'info_dict': {
+                'id': '09d4ed029002eb1bdda610f1103dd54c',
+                'title': 'Tutorial zur Nutzung von Adobe Connect aus Veranstalter-Sicht',
+                'display_id': 'Tutorial-zur-Nutzung-von-Adobe-Connect-aus-Veranstalter-Sicht',
+                'ext': 'mp4',
+            },
+        },
+    ]
+
+    def _real_extract(self, url):  # Returns the info dict (id, title, display_id, formats, subtitles) for `url`
+        video_id, tmp_id, display_id = self._match_valid_url(url).group('id', 'tmp_id', 'display_id')  # only one _VALID_URL alternative matches, so either tmp_id or id/display_id is None
+        webpage = self._download_webpage(url, video_id or tmp_id, fatal=False) or ''
+
+        if not video_id:  # "m/<token>" links carry no id in the URL; read the key from the embed src on the page
+            video_id = self._html_search_regex(
+                r'src="https?://videocampus\.sachsen\.de/media/embed\?key=([0-9a-f]+)&',
+                webpage, 'video_id')
+
+        title = self._html_search_regex(
+            (r'<h1>(?P<content>[^<]+)</h1>', *self._meta_regex('title')),
+            webpage, 'title', group='content', fatal=False)
+
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(  # the manifest URL is built directly from video_id
+            f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8',
+            video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'display_id': display_id,
+            'formats': formats,
+            'subtitles': subtitles
+        }
+
+
+class VideocampusSachsenEmbedIE(InfoExtractor):  # Handles videocampus.sachsen.de/media/embed?key=<key> player URLs
+    _VALID_URL = r'https?://videocampus\.sachsen\.de/media/embed\?key=(?P<id>[0-9a-f]+)'  # dots escaped so only the literal host matches
+
+    _TESTS = [
+        {
+            'url': 'https://videocampus.sachsen.de/media/embed?key=fc99c527e4205b121cb7c74433469262',
+            'info_dict': {
+                'id': 'fc99c527e4205b121cb7c74433469262',
+                'title': 'Was ist selbstgesteuertes Lernen?',
+                'ext': 'mp4',
+            },
+        }
+    ]
+
+    def _real_extract(self, url):  # Returns the info dict (id, title, formats, subtitles) for the embed at `url`
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        title = self._html_search_regex(r'<img[^>]*title="([^"<]+)"', webpage, 'title', fatal=False)  # title comes from the title attribute of an <img> tag
+        formats, subtitles = self._extract_m3u8_formats_and_subtitles(  # the manifest URL is built directly from the key in the URL
+            f'https://videocampus.sachsen.de/media/hlsMedium/key/{video_id}/format/auto/ext/mp4/learning/0/path/m3u8',
+            video_id, 'mp4', 'm3u8_native', m3u8_id='hls')
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py
index 1a9fd00e4..77ffb4bfb 100644
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -28,7 +28,6 @@ from ..utils import (
parse_qs,
sanitized_Request,
smuggle_url,
- std_headers,
str_or_none,
try_get,
unified_timestamp,
@@ -758,7 +757,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
def _real_extract(self, url):
url, data = unsmuggle_url(url, {})
- headers = std_headers.copy()
+ headers = self.get_param('http_headers').copy()
if 'http_headers' in data:
headers.update(data['http_headers'])
if 'Referer' not in headers:
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 602d48e3c..4e812af99 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -240,13 +240,15 @@ def build_innertube_clients():
base_client, *variant = client.split('_')
ytcfg['priority'] = 10 * priority(base_client)
- if variant == ['embedded']:
- ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
- INNERTUBE_CLIENTS[f'{base_client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg)
+ if not variant:
+ INNERTUBE_CLIENTS[f'{client}_agegate'] = agegate_ytcfg = copy.deepcopy(ytcfg)
agegate_ytcfg['INNERTUBE_CONTEXT']['client']['clientScreen'] = 'EMBED'
+ agegate_ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
agegate_ytcfg['priority'] -= 1
+ elif variant == ['embedded']:
+ ytcfg['INNERTUBE_CONTEXT']['thirdParty'] = THIRD_PARTY
ytcfg['priority'] -= 2
- elif variant:
+ else:
ytcfg['priority'] -= 3
diff --git a/yt_dlp/extractor/zingmp3.py b/yt_dlp/extractor/zingmp3.py
index a3edc158f..f84ba5cff 100644
--- a/yt_dlp/extractor/zingmp3.py
+++ b/yt_dlp/extractor/zingmp3.py
@@ -1,22 +1,47 @@
# coding: utf-8
from __future__ import unicode_literals
+import hashlib
+import hmac
+import urllib.parse
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ traverse_obj,
+ HEADRequest,
)
class ZingMp3BaseIE(InfoExtractor):
- _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?:%s)/[^/]+/(?P<id>\w+)\.html'
+ _VALID_URL_TMPL = r'https?://(?:mp3\.zing|zingmp3)\.vn/(?P<type>(?:%s))/[^/]+/(?P<id>\w+)(?:\.html|\?)'
_GEO_COUNTRIES = ['VN']
+ _DOMAIN = 'https://zingmp3.vn'
+ _SLUG_API = {
+ 'bai-hat': '/api/v2/page/get/song',
+ 'embed': '/api/v2/page/get/song',
+ 'video-clip': '/api/v2/page/get/video',
+ 'playlist': '/api/v2/page/get/playlist',
+ 'album': '/api/v2/page/get/playlist',
+ 'lyric': '/api/v2/lyric/get/lyric',
+ 'song_streaming': '/api/v2/song/get/streaming',
+ }
+
+ _API_KEY = '88265e23d4284f25963e6eedac8fbfa3'
+ _SECRET_KEY = b'2aa2d1c561e809b267f3638c4a307aab'
- def _extract_item(self, item, fatal):
- item_id = item['id']
- title = item.get('name') or item['title']
+ def _extract_item(self, item, song_id, type_url, fatal):
+ item_id = item.get('encodeId') or song_id
+ title = item.get('title') or item.get('alias')
+
+ if type_url == 'video-clip':
+ source = item.get('streaming')
+ else:
+ api = self.get_api_with_signature(name_api=self._SLUG_API.get('song_streaming'), param={'id': item_id})
+ source = self._download_json(api, video_id=item_id).get('data')
formats = []
- for k, v in (item.get('source') or {}).items():
+ for k, v in (source or {}).items():
if not v:
continue
if k in ('mp4', 'hls'):
@@ -34,31 +59,35 @@ class ZingMp3BaseIE(InfoExtractor):
'height': int_or_none(self._search_regex(
r'^(\d+)p', res, 'resolution', default=None)),
})
- else:
- formats.append({
- 'ext': 'mp3',
- 'format_id': k,
- 'tbr': int_or_none(k),
- 'url': self._proto_relative_url(v),
- 'vcodec': 'none',
- })
+ continue
+ elif v == 'VIP':
+ continue
+ formats.append({
+ 'ext': 'mp3',
+ 'format_id': k,
+ 'tbr': int_or_none(k),
+ 'url': self._proto_relative_url(v),
+ 'vcodec': 'none',
+ })
if not formats:
if not fatal:
return
- msg = item['msg']
+ msg = item.get('msg')
if msg == 'Sorry, this content is not available in your country.':
self.raise_geo_restricted(countries=self._GEO_COUNTRIES, metadata_available=True)
self.raise_no_formats(msg, expected=True)
self._sort_formats(formats)
- subtitles = None
lyric = item.get('lyric')
- if lyric:
- subtitles = {
- 'origin': [{
- 'url': lyric,
- }],
- }
+ if not lyric:
+ api = self.get_api_with_signature(name_api=self._SLUG_API.get("lyric"), param={'id': item_id})
+ info_lyric = self._download_json(api, video_id=item_id)
+ lyric = traverse_obj(info_lyric, ('data', 'file'))
+ subtitles = {
+ 'origin': [{
+ 'url': lyric,
+ }],
+ } if lyric else None
album = item.get('album') or {}
@@ -66,30 +95,41 @@ class ZingMp3BaseIE(InfoExtractor):
'id': item_id,
'title': title,
'formats': formats,
- 'thumbnail': item.get('thumbnail'),
+ 'thumbnail': traverse_obj(item, 'thumbnail', 'thumbnailM'),
'subtitles': subtitles,
'duration': int_or_none(item.get('duration')),
'track': title,
- 'artist': item.get('artists_names'),
- 'album': album.get('name') or album.get('title'),
- 'album_artist': album.get('artists_names'),
+ 'artist': traverse_obj(item, 'artistsNames', 'artists_names'),
+ 'album': traverse_obj(album, 'name', 'title'),
+ 'album_artist': traverse_obj(album, 'artistsNames', 'artists_names'),
}
+    def _real_initialize(self):  # Issue a HEAD request to self._DOMAIN unless the user supplied cookies
+        if not (self.get_param('cookiefile') or self.get_param('cookiesfrombrowser')):
+            self._request_webpage(HEADRequest(self._DOMAIN), None, note='Updating cookies')
+
def _real_extract(self, url):
- page_id = self._match_id(url)
- webpage = self._download_webpage(
- url.replace('://zingmp3.vn/', '://mp3.zing.vn/'),
- page_id, query={'play_song': 1})
- data_path = self._search_regex(
- r'data-xml="([^"]+)', webpage, 'data path')
- return self._process_data(self._download_json(
- 'https://mp3.zing.vn/xhr' + data_path, page_id)['data'])
+ song_id, type_url = self._match_valid_url(url).group('id', 'type')
+
+ api = self.get_api_with_signature(name_api=self._SLUG_API[type_url], param={'id': song_id})
+
+ return self._process_data(self._download_json(api, song_id)['data'], song_id, type_url)
+
+ def get_api_with_signature(self, name_api, param):
+ sha256 = hashlib.sha256(''.join(f'{k}={v}' for k, v in param.items()).encode('utf-8')).hexdigest()
+
+ data = {
+ 'apiKey': self._API_KEY,
+ 'sig': hmac.new(self._SECRET_KEY, f'{name_api}{sha256}'.encode('utf-8'), hashlib.sha512).hexdigest(),
+ **param,
+ }
+ return f'{self._DOMAIN}{name_api}?{urllib.parse.urlencode(data)}'
class ZingMp3IE(ZingMp3BaseIE):
- _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip'
+ _VALID_URL = ZingMp3BaseIE._VALID_URL_TMPL % 'bai-hat|video-clip|embed'
_TESTS = [{
- 'url': 'http://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
+ 'url': 'https://mp3.zing.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
'md5': 'ead7ae13693b3205cbc89536a077daed',
'info_dict': {
'id': 'ZWZB9WAB',
@@ -108,7 +148,7 @@ class ZingMp3IE(ZingMp3BaseIE):
'album_artist': 'Bảo Thy',
},
}, {
- 'url': 'https://mp3.zing.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html',
+ 'url': 'https://zingmp3.vn/video-clip/Suong-Hoa-Dua-Loi-K-ICM-RYO/ZO8ZF7C7.html',
'md5': 'e9c972b693aa88301ef981c8151c4343',
'info_dict': {
'id': 'ZO8ZF7C7',
@@ -120,14 +160,17 @@ class ZingMp3IE(ZingMp3BaseIE):
'artist': 'K-ICM, RYO',
},
}, {
+ 'url': 'https://zingmp3.vn/embed/song/ZWZEI76B?start=false',
+ 'only_matching': True,
+ }, {
'url': 'https://zingmp3.vn/bai-hat/Xa-Mai-Xa-Bao-Thy/ZWZB9WAB.html',
'only_matching': True,
}]
IE_NAME = 'zingmp3'
- IE_DESC = 'mp3.zing.vn'
+ IE_DESC = 'zingmp3.vn'
- def _process_data(self, data):
- return self._extract_item(data, True)
+ def _process_data(self, data, song_id, type_url):
+ return self._extract_item(data, song_id, type_url, True)
class ZingMp3AlbumIE(ZingMp3BaseIE):
@@ -139,7 +182,7 @@ class ZingMp3AlbumIE(ZingMp3BaseIE):
'id': 'ZWZBWDAF',
'title': 'Lâu Đài Tình Ái',
},
- 'playlist_count': 10,
+ 'playlist_count': 9,
}, {
'url': 'http://mp3.zing.vn/playlist/Duong-Hong-Loan-apollobee/IWCAACCB.html',
'only_matching': True,
@@ -149,12 +192,12 @@ class ZingMp3AlbumIE(ZingMp3BaseIE):
}]
IE_NAME = 'zingmp3:album'
- def _process_data(self, data):
+ def _process_data(self, data, song_id, type_url):
def entries():
- for item in (data.get('items') or []):
- entry = self._extract_item(item, False)
+ for item in traverse_obj(data, ('song', 'items')) or []:
+ entry = self._extract_item(item, song_id, type_url, False)
if entry:
yield entry
- info = data.get('info') or {}
- return self.playlist_result(
- entries(), info.get('id'), info.get('name') or info.get('title'))
+
+ return self.playlist_result(entries(), traverse_obj(data, 'id', 'encodeId'),
+ traverse_obj(data, 'name', 'title'))
diff --git a/yt_dlp/options.py b/yt_dlp/options.py
index bee8e3637..9f1f31974 100644
--- a/yt_dlp/options.py
+++ b/yt_dlp/options.py
@@ -856,17 +856,16 @@ def create_parser():
workarounds.add_option(
'--user-agent',
metavar='UA', dest='user_agent',
- help='Specify a custom user agent')
+ help=optparse.SUPPRESS_HELP)
workarounds.add_option(
'--referer',
metavar='URL', dest='referer', default=None,
- help='Specify a custom referer, use if the video access is restricted to one domain',
- )
+ help=optparse.SUPPRESS_HELP)
workarounds.add_option(
'--add-header',
metavar='FIELD:VALUE', dest='headers', default={}, type='str',
action='callback', callback=_dict_from_options_callback,
- callback_kwargs={'multiple_keys': False, 'process_key': None},
+ callback_kwargs={'multiple_keys': False},
help='Specify a custom HTTP header and its value, separated by a colon ":". You can use this option multiple times',
)
workarounds.add_option(
diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py
index 5eb049ab7..be0c69d8f 100644
--- a/yt_dlp/utils.py
+++ b/yt_dlp/utils.py
@@ -1060,7 +1060,7 @@ class ExtractorError(YoutubeDLError):
if sys.exc_info()[0] in network_exceptions:
expected = True
- self.msg = str(msg)
+ self.orig_msg = str(msg)
self.traceback = tb
self.expected = expected
self.cause = cause
@@ -1071,7 +1071,7 @@ class ExtractorError(YoutubeDLError):
super(ExtractorError, self).__init__(''.join((
format_field(ie, template='[%s] '),
format_field(video_id, template='%s: '),
- self.msg,
+ msg,
format_field(cause, template=' (caused by %r)'),
'' if expected else bug_reports_message())))
@@ -1372,7 +1372,7 @@ class YoutubeDLHandler(compat_urllib_request.HTTPHandler):
if url != url_escaped:
req = update_Request(req, url=url_escaped)
- for h, v in std_headers.items():
+ for h, v in self._params.get('http_headers', std_headers).items():
# Capitalize is needed because of Python bug 2275: http://bugs.python.org/issue2275
# The dict keys are capitalized because of this bug by urllib
if h.capitalize() not in req.headers:
@@ -2257,7 +2257,7 @@ def unsmuggle_url(smug_url, default=None):
def format_decimal_suffix(num, fmt='%d%s', *, factor=1000):
""" Formats numbers with decimal sufixes like K, M, etc """
num, factor = float_or_none(num), float(factor)
- if num is None:
+ if num is None or num < 0:
return None
exponent = 0 if num == 0 else int(math.log(num, factor))
suffix = ['', *'kMGTPEZY'][exponent]
@@ -5436,3 +5436,8 @@ class WebSocketsWrapper():
has_websockets = bool(compat_websockets)
+
+
+def merge_headers(*dicts):
+ """Merge dicts of network headers case insensitively, prioritizing the latter ones"""
+ return {k.capitalize(): v for k, v in itertools.chain.from_iterable(map(dict.items, dicts))}