path: root/yt_dlp/extractor
author     Jesús <heckyel@hyperbola.info>  2021-10-31 11:36:52 -0500
committer  Jesús <heckyel@hyperbola.info>  2021-10-31 11:36:52 -0500
commit     5bb25093eb718346ab8a723d2c04f0066fc3958a (patch)
tree       8a7fa5611895a933eaf1ef1623f7b9e1a1c36157 /yt_dlp/extractor
parent     c7afb25e19a91493db6069d1db9f7d1bc8491dc1 (diff)
parent     652fb0d446524af4b783276babd55f5fc6a3afeb (diff)
updated from upstream | 31/10/2021 at 11:36
Diffstat (limited to 'yt_dlp/extractor')
-rw-r--r--  yt_dlp/extractor/__init__.py           21
-rw-r--r--  yt_dlp/extractor/adobepass.py           4
-rw-r--r--  yt_dlp/extractor/bilibili.py            8
-rw-r--r--  yt_dlp/extractor/cbc.py               134
-rw-r--r--  yt_dlp/extractor/common.py             34
-rw-r--r--  yt_dlp/extractor/coub.py                3
-rw-r--r--  yt_dlp/extractor/dplay.py               5
-rw-r--r--  yt_dlp/extractor/extractors.py         17
-rw-r--r--  yt_dlp/extractor/generic.py            25
-rw-r--r--  yt_dlp/extractor/instagram.py          46
-rw-r--r--  yt_dlp/extractor/itv.py                34
-rw-r--r--  yt_dlp/extractor/mediaset.py           82
-rw-r--r--  yt_dlp/extractor/microsoftstream.py   125
-rw-r--r--  yt_dlp/extractor/mlssoccer.py         118
-rw-r--r--  yt_dlp/extractor/mtv.py                12
-rw-r--r--  yt_dlp/extractor/naver.py               7
-rw-r--r--  yt_dlp/extractor/niconico.py            1
-rw-r--r--  yt_dlp/extractor/nrk.py                 4
-rw-r--r--  yt_dlp/extractor/patreon.py             2
-rw-r--r--  yt_dlp/extractor/sky.py                28
-rw-r--r--  yt_dlp/extractor/soundcloud.py          3
-rw-r--r--  yt_dlp/extractor/tagesschau.py        279
-rw-r--r--  yt_dlp/extractor/threespeak.py         97
-rw-r--r--  yt_dlp/extractor/trovo.py               4
-rw-r--r--  yt_dlp/extractor/twitter.py             2
-rw-r--r--  yt_dlp/extractor/viewlift.py          192
-rw-r--r--  yt_dlp/extractor/vimeo.py             230
-rw-r--r--  yt_dlp/extractor/vlive.py             230
-rw-r--r--  yt_dlp/extractor/wakanim.py            25
-rw-r--r--  yt_dlp/extractor/youtube.py            39

30 files changed, 1172 insertions(+), 639 deletions(-)
diff --git a/yt_dlp/extractor/__init__.py b/yt_dlp/extractor/__init__.py
index 198c4ae17..b35484246 100644
--- a/yt_dlp/extractor/__init__.py
+++ b/yt_dlp/extractor/__init__.py
@@ -1,14 +1,15 @@
-from __future__ import unicode_literals
+import os
from ..utils import load_plugins
-try:
- from .lazy_extractors import *
- from .lazy_extractors import _ALL_CLASSES
- _LAZY_LOADER = True
- _PLUGIN_CLASSES = {}
-except ImportError:
- _LAZY_LOADER = False
+_LAZY_LOADER = False
+if not os.environ.get('YTDLP_NO_LAZY_EXTRACTORS'):
+ try:
+ from .lazy_extractors import *
+ from .lazy_extractors import _ALL_CLASSES
+ _LAZY_LOADER = True
+ except ImportError:
+ pass
if not _LAZY_LOADER:
from .extractors import *
@@ -19,8 +20,8 @@ if not _LAZY_LOADER:
]
_ALL_CLASSES.append(GenericIE)
- _PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
- _ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
+_PLUGIN_CLASSES = load_plugins('extractor', 'IE', globals())
+_ALL_CLASSES = list(_PLUGIN_CLASSES.values()) + _ALL_CLASSES
def gen_extractor_classes():
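
[Editor's note] The hunk above moves plugin loading outside the lazy-loader fallback, so plugin extractors take precedence over built-ins whether or not the pre-generated lazy_extractors module is importable, and the YTDLP_NO_LAZY_EXTRACTORS environment variable gives a runtime opt-out. A minimal sketch of forcing the non-lazy path (assumes a yt-dlp checkout on the import path):

import os

# must be set before yt_dlp.extractor is imported, since the check
# runs at module import time
os.environ['YTDLP_NO_LAZY_EXTRACTORS'] = '1'

from yt_dlp.extractor import gen_extractor_classes
print(sum(1 for _ in gen_extractor_classes()))  # all extractors, plugin classes first
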
diff --git a/yt_dlp/extractor/adobepass.py b/yt_dlp/extractor/adobepass.py
index 9378c33cd..bebcafa6b 100644
--- a/yt_dlp/extractor/adobepass.py
+++ b/yt_dlp/extractor/adobepass.py
@@ -39,8 +39,8 @@ MSO_INFO = {
},
'RCN': {
'name': 'RCN',
- 'username_field': 'UserName',
- 'password_field': 'UserPassword',
+ 'username_field': 'username',
+ 'password_field': 'password',
},
'Rogers': {
'name': 'Rogers',
diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py
index d6c77e418..483f93d67 100644
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -376,8 +376,10 @@ class BiliBiliIE(InfoExtractor):
replies = traverse_obj(
self._download_json(
f'https://api.bilibili.com/x/v2/reply?pn={idx}&oid={video_id}&type=1&jsonp=jsonp&sort=2&_=1567227301685',
- video_id, note=f'Extracting comments from page {idx}'),
- ('data', 'replies')) or []
+ video_id, note=f'Extracting comments from page {idx}', fatal=False),
+ ('data', 'replies'))
+ if not replies:
+ return
for children in map(self._get_all_children, replies):
yield from children
@@ -566,7 +568,7 @@ class BilibiliCategoryIE(InfoExtractor):
class BiliBiliSearchIE(SearchInfoExtractor):
- IE_DESC = 'Bilibili video search, "bilisearch" keyword'
+ IE_DESC = 'Bilibili video search'
_MAX_RESULTS = 100000
_SEARCH_KEY = 'bilisearch'
diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py
index 5e4526c53..4fcf2a9c1 100644
--- a/yt_dlp/extractor/cbc.py
+++ b/yt_dlp/extractor/cbc.py
@@ -2,6 +2,9 @@
from __future__ import unicode_literals
import re
+import json
+import base64
+import time
from .common import InfoExtractor
from ..compat import (
@@ -244,37 +247,96 @@ class CBCGemIE(InfoExtractor):
'params': {'format': 'bv'},
'skip': 'Geo-restricted to Canada',
}]
- _API_BASE = 'https://services.radio-canada.ca/ott/cbc-api/v2/assets/'
+
+ _GEO_COUNTRIES = ['CA']
+ _TOKEN_API_KEY = '3f4beddd-2061-49b0-ae80-6f1f2ed65b37'
+ _NETRC_MACHINE = 'cbcgem'
+ _claims_token = None
+
+ def _new_claims_token(self, email, password):
+ data = json.dumps({
+ 'email': email,
+ 'password': password,
+ }).encode()
+ headers = {'content-type': 'application/json'}
+ query = {'apikey': self._TOKEN_API_KEY}
+ resp = self._download_json('https://api.loginradius.com/identity/v2/auth/login',
+ None, data=data, headers=headers, query=query)
+ access_token = resp['access_token']
+
+ query = {
+ 'access_token': access_token,
+ 'apikey': self._TOKEN_API_KEY,
+ 'jwtapp': 'jwt',
+ }
+ resp = self._download_json('https://cloud-api.loginradius.com/sso/jwt/api/token',
+ None, headers=headers, query=query)
+ sig = resp['signature']
+
+ data = json.dumps({'jwt': sig}).encode()
+ headers = {'content-type': 'application/json', 'ott-device-type': 'web'}
+ resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/token',
+ None, data=data, headers=headers)
+ cbc_access_token = resp['accessToken']
+
+ headers = {'content-type': 'application/json', 'ott-device-type': 'web', 'ott-access-token': cbc_access_token}
+ resp = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/profile',
+ None, headers=headers)
+ return resp['claimsToken']
+
+ def _get_claims_token_expiry(self):
+ # Token is a JWT
+ # JWT is decoded here and 'exp' field is extracted
+ # It is a Unix timestamp for when the token expires
+ b64_data = self._claims_token.split('.')[1]
+ data = base64.urlsafe_b64decode(b64_data + "==")
+ return json.loads(data)['exp']
+
+ def claims_token_expired(self):
+ exp = self._get_claims_token_expiry()
+ if exp - time.time() < 10:
+ # It will expire in less than 10 seconds, or has already expired
+ return True
+ return False
+
+ def claims_token_valid(self):
+ return self._claims_token is not None and not self.claims_token_expired()
+
+ def _get_claims_token(self, email, password):
+ if not self.claims_token_valid():
+ self._claims_token = self._new_claims_token(email, password)
+ self._downloader.cache.store(self._NETRC_MACHINE, 'claims_token', self._claims_token)
+ return self._claims_token
+
+ def _real_initialize(self):
+ if self.claims_token_valid():
+ return
+ self._claims_token = self._downloader.cache.load(self._NETRC_MACHINE, 'claims_token')
def _real_extract(self, url):
video_id = self._match_id(url)
- video_info = self._download_json(self._API_BASE + video_id, video_id)
-
- last_error = None
- attempt = -1
- retries = self.get_param('extractor_retries', 15)
- while attempt < retries:
- attempt += 1
- if last_error:
- self.report_warning('%s. Retrying ...' % last_error)
- m3u8_info = self._download_json(
- video_info['playSession']['url'], video_id,
- note='Downloading JSON metadata%s' % f' (attempt {attempt})')
- m3u8_url = m3u8_info.get('url')
- if m3u8_url:
- break
- elif m3u8_info.get('errorCode') == 1:
- self.raise_geo_restricted(countries=['CA'])
- else:
- last_error = f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}'
- # 35 means media unavailable, but retries work
- if m3u8_info.get('errorCode') != 35 or attempt >= retries:
- raise ExtractorError(last_error)
+ video_info = self._download_json('https://services.radio-canada.ca/ott/cbc-api/v2/assets/' + video_id, video_id)
+
+ email, password = self._get_login_info()
+ if email and password:
+ claims_token = self._get_claims_token(email, password)
+ headers = {'x-claims-token': claims_token}
+ else:
+ headers = {}
+ m3u8_info = self._download_json(video_info['playSession']['url'], video_id, headers=headers)
+ m3u8_url = m3u8_info.get('url')
+
+ if m3u8_info.get('errorCode') == 1:
+ self.raise_geo_restricted(countries=['CA'])
+ elif m3u8_info.get('errorCode') == 35:
+ self.raise_login_required(method='password')
+ elif m3u8_info.get('errorCode') != 0:
+ raise ExtractorError(f'{self.IE_NAME} said: {m3u8_info.get("errorCode")} - {m3u8_info.get("message")}')
formats = self._extract_m3u8_formats(m3u8_url, video_id, m3u8_id='hls')
self._remove_duplicate_formats(formats)
- for i, format in enumerate(formats):
+ for format in formats:
if format.get('vcodec') == 'none':
if format.get('ext') is None:
format['ext'] = 'm4a'
@@ -377,7 +439,7 @@ class CBCGemPlaylistIE(InfoExtractor):
class CBCGemLiveIE(InfoExtractor):
IE_NAME = 'gem.cbc.ca:live'
- _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>[0-9]{12})'
+ _VALID_URL = r'https?://gem\.cbc\.ca/live/(?P<id>\d+)'
_TEST = {
'url': 'https://gem.cbc.ca/live/920604739687',
'info_dict': {
@@ -396,21 +458,21 @@ class CBCGemLiveIE(InfoExtractor):
# It's unclear where the chars at the end come from, but they appear to be
# constant. Might need updating in the future.
- _API = 'https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT'
+ # There are two URLs, some livestreams are in one, and some
+ # in the other. The JSON schema is the same for both.
+ _API_URLS = ['https://tpfeed.cbc.ca/f/ExhSPC/t_t3UKJR6MAT', 'https://tpfeed.cbc.ca/f/ExhSPC/FNiv9xQx_BnT']
def _real_extract(self, url):
video_id = self._match_id(url)
- live_info = self._download_json(self._API, video_id)['entries']
- video_info = None
- for stream in live_info:
- if stream.get('guid') == video_id:
- video_info = stream
-
- if video_info is None:
- raise ExtractorError(
- 'Couldn\'t find video metadata, maybe this livestream is now offline',
- expected=True)
+ for api_url in self._API_URLS:
+ video_info = next((
+ stream for stream in self._download_json(api_url, video_id)['entries']
+ if stream.get('guid') == video_id), None)
+ if video_info:
+ break
+ else:
+ raise ExtractorError('Couldn\'t find video metadata, maybe this livestream is now offline', expected=True)
return {
'_type': 'url_transparent',
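
[Editor's note] An aside on the claims-token expiry check above: a JWT is three base64url segments joined by dots, and the payload segment is JSON carrying 'exp' as a Unix timestamp. A self-contained sketch of that decoding step; the padding arithmetic is the general form of the "==" trick used in _get_claims_token_expiry:

import base64
import json
import time

def jwt_expires_soon(token, threshold=10):
    payload_b64 = token.split('.')[1]            # header.payload.signature
    # base64 requires the length to be a multiple of 4
    payload_b64 += '=' * (-len(payload_b64) % 4)
    claims = json.loads(base64.urlsafe_b64decode(payload_b64))
    return claims['exp'] - time.time() < threshold
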
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index e00d8c42b..aa98c0cc9 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -74,6 +74,7 @@ from ..utils import (
strip_or_none,
traverse_obj,
unescapeHTML,
+ UnsupportedError,
unified_strdate,
unified_timestamp,
update_Request,
@@ -448,7 +449,9 @@ class InfoExtractor(object):
}
def __init__(self, downloader=None):
- """Constructor. Receives an optional downloader."""
+ """Constructor. Receives an optional downloader (a YoutubeDL instance).
+ If a downloader is not passed during initialization,
+ it must be set using "set_downloader()" before "extract()" is called"""
self._ready = False
self._x_forwarded_for_ip = None
self._printed_messages = set()
@@ -602,10 +605,19 @@ class InfoExtractor(object):
if self.__maybe_fake_ip_and_retry(e.countries):
continue
raise
+ except UnsupportedError:
+ raise
except ExtractorError as e:
- video_id = e.video_id or self.get_temp_id(url)
- raise ExtractorError(
- e.msg, video_id=video_id, ie=self.IE_NAME, tb=e.traceback, expected=e.expected, cause=e.cause)
+ kwargs = {
+ 'video_id': e.video_id or self.get_temp_id(url),
+ 'ie': self.IE_NAME,
+ 'tb': e.traceback,
+ 'expected': e.expected,
+ 'cause': e.cause
+ }
+ if hasattr(e, 'countries'):
+ kwargs['countries'] = e.countries
+ raise type(e)(e.msg, **kwargs)
except compat_http_client.IncompleteRead as e:
raise ExtractorError('A network error has occurred.', cause=e, expected=True, video_id=self.get_temp_id(url))
except (KeyError, StopIteration) as e:
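
[Editor's note] A side note on the re-raise in the hunk above: reconstructing via type(e)(...) preserves the concrete exception subclass, and the hasattr check carries a GeoRestrictedError's extra 'countries' attribute through the rewrap, which the old plain ExtractorError(...) rewrap dropped. A generic sketch of the pattern (class names here are illustrative, not yt-dlp's):

class AppError(Exception):
    def __init__(self, msg, video_id=None):
        super().__init__(msg)
        self.msg, self.video_id = msg, video_id

class GeoError(AppError):
    def __init__(self, msg, video_id=None, countries=None):
        super().__init__(msg, video_id)
        self.countries = countries

def rewrap(e, video_id):
    kwargs = {'video_id': video_id}
    if hasattr(e, 'countries'):         # subclass-only data survives the rewrap
        kwargs['countries'] = e.countries
    raise type(e)(e.msg, **kwargs)      # same subclass as the original error
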
@@ -664,7 +676,7 @@ class InfoExtractor(object):
See _download_webpage docstring for arguments specification.
"""
if not self._downloader._first_webpage_request:
- sleep_interval = float_or_none(self.get_param('sleep_interval_requests')) or 0
+ sleep_interval = self.get_param('sleep_interval_requests') or 0
if sleep_interval > 0:
self.to_screen('Sleeping %s seconds ...' % sleep_interval)
time.sleep(sleep_interval)
@@ -1137,7 +1149,7 @@ class InfoExtractor(object):
if mobj:
break
- _name = self._downloader._color_text(name, 'blue')
+ _name = self._downloader._format_err(name, self._downloader.Styles.EMPHASIS)
if mobj:
if group is None:
@@ -1537,8 +1549,8 @@ class InfoExtractor(object):
'ie_pref': {'priority': True, 'type': 'extractor'},
'hasvid': {'priority': True, 'field': 'vcodec', 'type': 'boolean', 'not_in_list': ('none',)},
'hasaud': {'field': 'acodec', 'type': 'boolean', 'not_in_list': ('none',)},
- 'lang': {'convert': 'ignore', 'field': 'language_preference'},
- 'quality': {'convert': 'float_none', 'default': -1},
+ 'lang': {'convert': 'float', 'field': 'language_preference', 'default': -1},
+ 'quality': {'convert': 'float', 'default': -1},
'filesize': {'convert': 'bytes'},
'fs_approx': {'convert': 'bytes', 'field': 'filesize_approx'},
'id': {'convert': 'string', 'field': 'format_id'},
@@ -1549,7 +1561,7 @@ class InfoExtractor(object):
'vbr': {'convert': 'float_none'},
'abr': {'convert': 'float_none'},
'asr': {'convert': 'float_none'},
- 'source': {'convert': 'ignore', 'field': 'source_preference'},
+ 'source': {'convert': 'float', 'field': 'source_preference', 'default': -1},
'codec': {'type': 'combined', 'field': ('vcodec', 'acodec')},
'br': {'type': 'combined', 'field': ('tbr', 'vbr', 'abr'), 'same_limit': True},
@@ -3618,9 +3630,11 @@ class SearchInfoExtractor(InfoExtractor):
"""
Base class for paged search queries extractors.
They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
- Instances should define _SEARCH_KEY and _MAX_RESULTS.
+ Instances should define _SEARCH_KEY and optionally _MAX_RESULTS
"""
+ _MAX_RESULTS = float('inf')
+
@classmethod
def _make_valid_url(cls):
return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
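
[Editor's note] For reference, the URL convention in the SearchInfoExtractor docstring above expands to the pattern returned by _make_valid_url(); a quick demonstration using the 'scsearch' key defined for SoundCloud later in this diff:

import re

_SEARCH_KEY = 'scsearch'
pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % _SEARCH_KEY

m = re.match(pattern, 'scsearch15:post-avant jazzcore')
assert m.group('prefix') == '15'                  # fetch the first 15 results
assert m.group('query') == 'post-avant jazzcore'
# an empty prefix means one result; 'all' means up to _MAX_RESULTS,
# which now defaults to float('inf') unless the subclass overrides it
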
diff --git a/yt_dlp/extractor/coub.py b/yt_dlp/extractor/coub.py
index eba6b73ba..e90aa1954 100644
--- a/yt_dlp/extractor/coub.py
+++ b/yt_dlp/extractor/coub.py
@@ -57,7 +57,7 @@ class CoubIE(InfoExtractor):
file_versions = coub['file_versions']
- QUALITIES = ('low', 'med', 'high')
+ QUALITIES = ('low', 'med', 'high', 'higher')
MOBILE = 'mobile'
IPHONE = 'iphone'
@@ -86,6 +86,7 @@ class CoubIE(InfoExtractor):
'format_id': '%s-%s-%s' % (HTML5, kind, quality),
'filesize': int_or_none(item.get('size')),
'vcodec': 'none' if kind == 'audio' else None,
+ 'acodec': 'none' if kind == 'video' else None,
'quality': quality_key(quality),
'source_preference': preference_key(HTML5),
})
diff --git a/yt_dlp/extractor/dplay.py b/yt_dlp/extractor/dplay.py
index e0e446b87..d62480810 100644
--- a/yt_dlp/extractor/dplay.py
+++ b/yt_dlp/extractor/dplay.py
@@ -325,7 +325,7 @@ class HGTVDeIE(DPlayIE):
class DiscoveryPlusIE(DPlayIE):
- _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/video' + DPlayIE._PATH_REGEX
+ _VALID_URL = r'https?://(?:www\.)?discoveryplus\.com/(?:\w{2}/)?video' + DPlayIE._PATH_REGEX
_TESTS = [{
'url': 'https://www.discoveryplus.com/video/property-brothers-forever-home/food-and-family',
'info_dict': {
@@ -343,6 +343,9 @@ class DiscoveryPlusIE(DPlayIE):
'episode_number': 1,
},
'skip': 'Available for Premium users',
+ }, {
+ 'url': 'https://discoveryplus.com/ca/video/bering-sea-gold-discovery-ca/goldslingers',
+ 'only_matching': True,
}]
_PRODUCT = 'dplus_us'
diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py
index f4f817fcb..9d963ee46 100644
--- a/yt_dlp/extractor/extractors.py
+++ b/yt_dlp/extractor/extractors.py
@@ -744,7 +744,10 @@ from .mdr import MDRIE
from .medaltv import MedalTVIE
from .mediaite import MediaiteIE
from .mediaklikk import MediaKlikkIE
-from .mediaset import MediasetIE
+from .mediaset import (
+ MediasetIE,
+ MediasetShowIE,
+)
from .mediasite import (
MediasiteIE,
MediasiteCatalogIE,
@@ -760,6 +763,7 @@ from .metacritic import MetacriticIE
from .mgoon import MgoonIE
from .mgtv import MGTVIE
from .miaopai import MiaoPaiIE
+from .microsoftstream import MicrosoftStreamIE
from .microsoftvirtualacademy import (
MicrosoftVirtualAcademyIE,
MicrosoftVirtualAcademyCourseIE,
@@ -792,6 +796,7 @@ from .mlb import (
MLBIE,
MLBVideoIE,
)
+from .mlssoccer import MLSSoccerIE
from .mnet import MnetIE
from .moevideo import MoeVideoIE
from .mofosex import (
@@ -1288,6 +1293,7 @@ from .skynewsarabia import (
from .skynewsau import SkyNewsAUIE
from .sky import (
SkyNewsIE,
+ SkyNewsStoryIE,
SkySportsIE,
SkySportsNewsIE,
)
@@ -1387,10 +1393,7 @@ from .svt import (
from .swrmediathek import SWRMediathekIE
from .syfy import SyfyIE
from .sztvhu import SztvHuIE
-from .tagesschau import (
- TagesschauPlayerIE,
- TagesschauIE,
-)
+from .tagesschau import TagesschauIE
from .tass import TassIE
from .tbs import TBSIE
from .tdslifeway import TDSLifewayIE
@@ -1444,6 +1447,10 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
+from .threespeak import (
+ ThreeSpeakIE,
+ ThreeSpeakUserIE,
+)
from .threeqsdn import ThreeQSDNIE
from .tiktok import (
TikTokIE,
diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py
index 5918c8c56..0d279016b 100644
--- a/yt_dlp/extractor/generic.py
+++ b/yt_dlp/extractor/generic.py
@@ -1188,6 +1188,21 @@ class GenericIE(InfoExtractor):
},
'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/',
},
+ # jwplayer with only the json URL
+ {
+ 'url': 'https://www.hollywoodreporter.com/news/general-news/dunkirk-team-reveals-what-christopher-nolan-said-oscar-win-meet-your-oscar-winner-1092454',
+ 'info_dict': {
+ 'id': 'TljWkvWH',
+ 'ext': 'mp4',
+ 'upload_date': '20180306',
+ 'title': 'md5:91eb1862f6526415214f62c00b453936',
+ 'description': 'md5:73048ae50ae953da10549d1d2fe9b3aa',
+ 'timestamp': 1520367225,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# Complex jwplayer
{
'url': 'http://www.indiedb.com/games/king-machine/videos',
@@ -3503,6 +3518,13 @@ class GenericIE(InfoExtractor):
jwplayer_data = self._find_jwplayer_data(
webpage, video_id, transform_source=js_to_json)
if jwplayer_data:
+ if isinstance(jwplayer_data.get('playlist'), str):
+ return {
+ **info_dict,
+ '_type': 'url',
+ 'ie_key': JWPlatformIE.ie_key(),
+ 'url': jwplayer_data['playlist'],
+ }
try:
info = self._parse_jwplayer_data(
jwplayer_data, video_id, require_title=False, base_url=url)
@@ -3561,8 +3583,7 @@ class GenericIE(InfoExtractor):
return info_dict
# Looking for http://schema.org/VideoObject
- json_ld = self._search_json_ld(
- webpage, video_id, default={}, expected_type='VideoObject')
+ json_ld = self._search_json_ld(webpage, video_id, default={})
if json_ld.get('url'):
return merge_dicts(json_ld, info_dict)
diff --git a/yt_dlp/extractor/instagram.py b/yt_dlp/extractor/instagram.py
index 3801c7af9..ccfcddd5b 100644
--- a/yt_dlp/extractor/instagram.py
+++ b/yt_dlp/extractor/instagram.py
@@ -4,6 +4,7 @@ import itertools
import hashlib
import json
import re
+import time
from .common import InfoExtractor
from ..compat import (
@@ -20,11 +21,13 @@ from ..utils import (
try_get,
url_or_none,
variadic,
+ urlencode_postdata,
)
class InstagramIE(InfoExtractor):
_VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/(?:p|tv|reel)/(?P<id>[^/?#&]+))'
+ _NETRC_MACHINE = 'instagram'
_TESTS = [{
'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc',
'md5': '0d2da106a9d2631273e192b372806516',
@@ -140,6 +143,47 @@ class InstagramIE(InfoExtractor):
if mobj:
return mobj.group('link')
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ login_webpage = self._download_webpage(
+ 'https://www.instagram.com/accounts/login/', None,
+ note='Downloading login webpage', errnote='Failed to download login webpage')
+
+ shared_data = self._parse_json(
+ self._search_regex(
+ r'window\._sharedData\s*=\s*({.+?});',
+ login_webpage, 'shared data', default='{}'),
+ None)
+
+ login = self._download_json('https://www.instagram.com/accounts/login/ajax/', None, note='Logging in', headers={
+ 'Accept': '*/*',
+ 'X-IG-App-ID': '936619743392459',
+ 'X-ASBD-ID': '198387',
+ 'X-IG-WWW-Claim': '0',
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'X-CSRFToken': shared_data['config']['csrf_token'],
+ 'X-Instagram-AJAX': shared_data['rollout_hash'],
+ 'Referer': 'https://www.instagram.com/',
+ }, data=urlencode_postdata({
+ 'enc_password': f'#PWD_INSTAGRAM_BROWSER:0:{int(time.time())}:{password}',
+ 'username': username,
+ 'queryParams': '{}',
+ 'optIntoOneTap': 'false',
+ 'stopDeletionNonce': '',
+ 'trustedDeviceRecords': '{}',
+ }))
+
+ if not login.get('authenticated'):
+ if login.get('message'):
+ raise ExtractorError(f'Unable to login: {login["message"]}')
+ raise ExtractorError('Unable to login')
+
+ def _real_initialize(self):
+ self._login()
+
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id')
@@ -147,7 +191,7 @@ class InstagramIE(InfoExtractor):
webpage, urlh = self._download_webpage_handle(url, video_id)
if 'www.instagram.com/accounts/login' in urlh.geturl().rstrip('/'):
- self.raise_login_required('You need to log in to access this content', method='cookies')
+ self.raise_login_required('You need to log in to access this content')
(media, video_url, description, thumbnail, timestamp, uploader,
uploader_id, like_count, comment_count, comments, height,
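
[Editor's note] On the login payload above: enc_password uses Instagram's browser envelope, and version 0 (the digit after the first tag) is the variant that sends the password in plain text, so the field is just a tagged join of the current timestamp and the raw password. Building it in isolation (the credential is a placeholder):

import time

password = 'hunter2'  # placeholder
enc_password = '#PWD_INSTAGRAM_BROWSER:0:%d:%s' % (int(time.time()), password)
# e.g. '#PWD_INSTAGRAM_BROWSER:0:1635700000:hunter2'
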
diff --git a/yt_dlp/extractor/itv.py b/yt_dlp/extractor/itv.py
index d69782b78..6e6a3673c 100644
--- a/yt_dlp/extractor/itv.py
+++ b/yt_dlp/extractor/itv.py
@@ -220,16 +220,23 @@ class ITVIE(InfoExtractor):
class ITVBTCCIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?itv\.com/(?:news|btcc)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _TESTS = [{
'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action',
'info_dict': {
'id': 'btcc-2019-brands-hatch-gp-race-action',
'title': 'BTCC 2019: Brands Hatch GP race action',
},
'playlist_count': 12,
- }
- BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'
+ }, {
+ 'url': 'https://www.itv.com/news/2021-10-27/i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike',
+ 'info_dict': {
+ 'id': 'i-have-to-protect-the-country-says-rishi-sunak-as-uk-faces-interest-rate-hike',
+ 'title': 'md5:6ef054dd9f069330db3dcc66cb772d32'
+ },
+ 'playlist_count': 4
+ }]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_default/index.html?videoId=%s'
def _real_extract(self, url):
playlist_id = self._match_id(url)
@@ -240,15 +247,15 @@ class ITVBTCCIE(InfoExtractor):
'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id),
lambda x: x['props']['pageProps']['article']['body']['content']) or []
- # Discard empty objects
- video_ids = []
+ entries = []
for video in json_map:
- if video['data'].get('id'):
- video_ids.append(video['data']['id'])
-
- entries = [
- self.url_result(
- smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
+ if not any(video['data'].get(attr) == 'Brightcove' for attr in ('name', 'type')):
+ continue
+ video_id = video['data']['id']
+ account_id = video['data']['accountId']
+ player_id = video['data']['playerId']
+ entries.append(self.url_result(
+ smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % (account_id, player_id, video_id), {
# ITV does not like some GB IP ranges, so here are some
# IP blocks it accepts
'geo_ip_blocks': [
@@ -256,8 +263,7 @@ class ITVBTCCIE(InfoExtractor):
],
'referrer': url,
}),
- ie=BrightcoveNewIE.ie_key(), video_id=video_id)
- for video_id in video_ids]
+ ie=BrightcoveNewIE.ie_key(), video_id=video_id))
title = self._og_search_title(webpage, fatal=False)
diff --git a/yt_dlp/extractor/mediaset.py b/yt_dlp/extractor/mediaset.py
index 26e7abc49..119b39997 100644
--- a/yt_dlp/extractor/mediaset.py
+++ b/yt_dlp/extractor/mediaset.py
@@ -1,13 +1,17 @@
# coding: utf-8
from __future__ import unicode_literals
+import functools
import re
from .theplatform import ThePlatformBaseIE
from ..utils import (
ExtractorError,
int_or_none,
+ OnDemandPagedList,
parse_qs,
+ try_get,
+ urljoin,
update_url_query,
)
@@ -212,3 +216,81 @@ class MediasetIE(ThePlatformBaseIE):
'subtitles': subtitles,
})
return info
+
+
+class MediasetShowIE(MediasetIE):
+ _VALID_URL = r'''(?x)
+ (?:
+ https?://
+ (?:(?:www|static3)\.)?mediasetplay\.mediaset\.it/
+ (?:
+ (?:fiction|programmi-tv|serie-tv)/(?:.+?/)?
+ (?:[a-z]+)_SE(?P<id>\d{12})
+ (?:,ST(?P<st>\d{12}))?
+ (?:,sb(?P<sb>\d{9}))?$
+ )
+ )
+ '''
+ _TESTS = [{
+ # TV Show webpage (with a single playlist)
+ 'url': 'https://www.mediasetplay.mediaset.it/serie-tv/fireforce/episodi_SE000000001556',
+ 'info_dict': {
+ 'id': '000000001556',
+ 'title': 'Fire Force',
+ },
+ 'playlist_count': 1,
+ }, {
+ # TV Show webpage (with multiple playlists)
+ 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/leiene_SE000000000061,ST000000002763',
+ 'info_dict': {
+ 'id': '000000002763',
+ 'title': 'Le Iene',
+ },
+ 'playlist_count': 7,
+ }, {
+ # TV Show specific playlist (single page)
+ 'url': 'https://www.mediasetplay.mediaset.it/serie-tv/fireforce/episodi_SE000000001556,ST000000002738,sb100013107',
+ 'info_dict': {
+ 'id': '100013107',
+ 'title': 'Episodi',
+ },
+ 'playlist_count': 4,
+ }, {
+ # TV Show specific playlist (with multiple pages)
+ 'url': 'https://www.mediasetplay.mediaset.it/programmi-tv/leiene/iservizi_SE000000000061,ST000000002763,sb100013375',
+ 'info_dict': {
+ 'id': '100013375',
+ 'title': 'I servizi',
+ },
+ 'playlist_count': 53,
+ }]
+
+ _BY_SUBBRAND = 'https://feed.entertainment.tv.theplatform.eu/f/PR1GhC/mediaset-prod-all-programs-v2?byCustomValue={subBrandId}{%s}&sort=:publishInfo_lastPublished|desc,tvSeasonEpisodeNumber|desc&range=%d-%d'
+ _PAGE_SIZE = 25
+
+ def _fetch_page(self, sb, page):
+ lower_limit = page * self._PAGE_SIZE + 1
+ upper_limit = lower_limit + self._PAGE_SIZE - 1
+ content = self._download_json(
+ self._BY_SUBBRAND % (sb, lower_limit, upper_limit), sb)
+ for entry in content.get('entries') or []:
+ yield self.url_result(
+ 'mediaset:' + entry['guid'],
+ playlist_title=entry['mediasetprogram$subBrandDescription'])
+
+ def _real_extract(self, url):
+ playlist_id, st, sb = self._match_valid_url(url).group('id', 'st', 'sb')
+ if not sb:
+ page = self._download_webpage(url, playlist_id)
+ entries = [self.url_result(urljoin('https://www.mediasetplay.mediaset.it', url))
+ for url in re.findall(r'href="([^<>=]+SE\d{12},ST\d{12},sb\d{9})">[^<]+<', page)]
+ title = (self._html_search_regex(r'(?s)<h1[^>]*>(.+?)</h1>', page, 'title', default=None)
+ or self._og_search_title(page))
+ return self.playlist_result(entries, st or playlist_id, title)
+
+ entries = OnDemandPagedList(
+ functools.partial(self._fetch_page, sb),
+ self._PAGE_SIZE)
+ title = try_get(entries, lambda x: x[0]['playlist_title'])
+
+ return self.playlist_result(entries, sb, title)
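
[Editor's note] On the paging above: OnDemandPagedList wraps a 0-based page fetcher, while the feed's range parameter is 1-based and inclusive, which is what the arithmetic in _fetch_page converts between. The index math in isolation:

PAGE_SIZE = 25

def feed_range(page):
    lower = page * PAGE_SIZE + 1     # page 0 -> 1, page 1 -> 26, ...
    upper = lower + PAGE_SIZE - 1    # inclusive upper bound
    return lower, upper

assert feed_range(0) == (1, 25)
assert feed_range(2) == (51, 75)
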
diff --git a/yt_dlp/extractor/microsoftstream.py b/yt_dlp/extractor/microsoftstream.py
new file mode 100644
index 000000000..4d5a9df1f
--- /dev/null
+++ b/yt_dlp/extractor/microsoftstream.py
@@ -0,0 +1,125 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from base64 import b64decode
+
+from .common import InfoExtractor
+from ..utils import (
+ merge_dicts,
+ parse_iso8601,
+ parse_duration,
+ parse_resolution,
+ try_get,
+ url_basename,
+)
+
+
+class MicrosoftStreamIE(InfoExtractor):
+ IE_NAME = 'microsoftstream'
+ IE_DESC = 'Microsoft Stream'
+ _VALID_URL = r'https?://(?:web|www|msit)\.microsoftstream\.com/video/(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+
+ _TESTS = [{
+ 'url': 'https://web.microsoftstream.com/video/6e51d928-4f46-4f1c-b141-369925e37b62?list=user&userId=f5491e02-e8fe-4e34-b67c-ec2e79a6ecc0',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://msit.microsoftstream.com/video/b60f5987-aabd-4e1c-a42f-c559d138f2ca',
+ 'only_matching': True,
+ }]
+
+ def _get_all_subtitles(self, api_url, video_id, headers):
+ subtitles = {}
+ automatic_captions = {}
+ text_tracks = self._download_json(
+ f'{api_url}/videos/{video_id}/texttracks', video_id,
+ note='Downloading subtitles JSON', fatal=False, headers=headers,
+ query={'api-version': '1.4-private'}).get('value') or []
+ for track in text_tracks:
+ if not track.get('language') or not track.get('url'):
+ continue
+ sub_dict = automatic_captions if track.get('autoGenerated') else subtitles
+ sub_dict.setdefault(track['language'], []).append({
+ 'ext': 'vtt',
+ 'url': track.get('url')
+ })
+ return {
+ 'subtitles': subtitles,
+ 'automatic_captions': automatic_captions
+ }
+
+ def extract_all_subtitles(self, *args, **kwargs):
+ if (self.get_param('writesubtitles', False)
+ or self.get_param('writeautomaticsub', False)
+ or self.get_param('listsubtitles')):
+ return self._get_all_subtitles(*args, **kwargs)
+ return {}
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ if '<title>Microsoft Stream</title>' not in webpage:
+ self.raise_login_required(method='cookies')
+
+ access_token = self._html_search_regex(r'"AccessToken":"(.+?)"', webpage, 'access token')
+ api_url = self._html_search_regex(r'"ApiGatewayUri":"(.+?)"', webpage, 'api url')
+
+ headers = {'Authorization': f'Bearer {access_token}'}
+
+ video_data = self._download_json(
+ f'{api_url}/videos/{video_id}', video_id,
+ headers=headers, query={
+ '$expand': 'creator,tokens,status,liveEvent,extensions',
+ 'api-version': '1.4-private'
+ })
+ video_id = video_data.get('id') or video_id
+ language = video_data.get('language')
+
+ thumbnails = []
+ for thumbnail_id in ('extraSmall', 'small', 'medium', 'large'):
+ thumbnail_url = try_get(video_data, lambda x: x['posterImage'][thumbnail_id]['url'], str)
+ if not thumbnail_url:
+ continue
+ thumb = {
+ 'id': thumbnail_id,
+ 'url': thumbnail_url,
+ }
+ thumb_name = url_basename(thumbnail_url)
+ thumb_name = str(b64decode(thumb_name + '=' * (-len(thumb_name) % 4)))
+ thumb.update(parse_resolution(thumb_name))
+ thumbnails.append(thumb)
+
+ formats = []
+ for playlist in video_data['playbackUrls']:
+ if playlist['mimeType'] == 'application/vnd.apple.mpegurl':
+ formats.extend(self._extract_m3u8_formats(
+ playlist['playbackUrl'], video_id,
+ ext='mp4', entry_protocol='m3u8_native', m3u8_id='hls',
+ fatal=False, headers=headers))
+ elif playlist['mimeType'] == 'application/dash+xml':
+ formats.extend(self._extract_mpd_formats(
+ playlist['playbackUrl'], video_id, mpd_id='dash',
+ fatal=False, headers=headers))
+ elif playlist['mimeType'] == 'application/vnd.ms-sstr+xml':
+ formats.extend(self._extract_ism_formats(
+ playlist['playbackUrl'], video_id, ism_id='mss',
+ fatal=False, headers=headers))
+ formats = [merge_dicts(f, {'language': language}) for f in formats]
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': video_data['name'],
+ 'description': video_data.get('description'),
+ 'uploader': try_get(video_data, lambda x: x['creator']['name'], str),
+ 'uploader_id': try_get(video_data, (lambda x: x['creator']['mail'],
+ lambda x: x['creator']['id']), str),
+ 'thumbnails': thumbnails,
+ **self.extract_all_subtitles(api_url, video_id, headers),
+ 'timestamp': parse_iso8601(video_data.get('created')),
+ 'duration': parse_duration(try_get(video_data, lambda x: x['media']['duration'])),
+ 'webpage_url': f'https://web.microsoftstream.com/video/{video_id}',
+ 'view_count': try_get(video_data, lambda x: x['metrics']['views'], int),
+ 'like_count': try_get(video_data, lambda x: x['metrics']['likes'], int),
+ 'comment_count': try_get(video_data, lambda x: x['metrics']['comments'], int),
+ 'formats': formats,
+ }
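
[Editor's note] On the thumbnail loop above: Stream's poster-image URLs end in a base64-encoded basename (with padding stripped) that contains a WIDTHxHEIGHT token, which is why the code re-pads, decodes, and runs parse_resolution() over the result. A self-checking sketch of the round trip (the filename is invented):

from base64 import b64decode, b64encode
import re

name = b64encode(b'thumb_850x478.jpg').decode().rstrip('=')  # CDN-style basename
decoded = b64decode(name + '=' * (-len(name) % 4)).decode()
assert decoded == 'thumb_850x478.jpg'

m = re.search(r'(?P<w>\d+)[xX](?P<h>\d+)', decoded)  # roughly what parse_resolution() does
assert (int(m['w']), int(m['h'])) == (850, 478)
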
diff --git a/yt_dlp/extractor/mlssoccer.py b/yt_dlp/extractor/mlssoccer.py
new file mode 100644
index 000000000..2d65787e2
--- /dev/null
+++ b/yt_dlp/extractor/mlssoccer.py
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class MLSSoccerIE(InfoExtractor):
+ _VALID_DOMAINS = r'(?:(?:cfmontreal|intermiamicf|lagalaxy|lafc|houstondynamofc|dcunited|atlutd|mlssoccer|fcdallas|columbuscrew|coloradorapids|fccincinnati|chicagofirefc|austinfc|nashvillesc|whitecapsfc|sportingkc|soundersfc|sjearthquakes|rsl|timbers|philadelphiaunion|orlandocitysc|newyorkredbulls|nycfc)\.com|(?:torontofc)\.ca|(?:revolutionsoccer)\.net)'
+ _VALID_URL = r'(?:https?://)(?:www\.)?%s/video/#?(?P<id>[^/&$#?]+)' % _VALID_DOMAINS
+
+ _TESTS = [{
+ 'url': 'https://www.mlssoccer.com/video/the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986#the-octagon-can-alphonso-davies-lead-canada-to-first-world-cup-since-1986',
+ 'info_dict': {
+ 'id': '6276033198001',
+ 'ext': 'mp4',
+ 'title': 'The Octagon | Can Alphonso Davies lead Canada to first World Cup since 1986?',
+ 'description': 'md5:f0a883ee33592a0221798f451a98be8f',
+ 'thumbnail': 'https://cf-images.us-east-1.prod.boltdns.net/v1/static/5530036772001/1bbc44f6-c63c-4981-82fa-46b0c1f891e0/5c1ca44a-a033-4e98-b531-ff24c4947608/160x90/match/image.jpg',
+ 'duration': 350.165,
+ 'timestamp': 1633627291,
+ 'uploader_id': '5530036772001',
+ 'tags': ['club/canada'],
+ 'is_live': False,
+ 'duration_string': '5:50',
+ 'upload_date': '20211007',
+ 'filesize_approx': 255193528.83200002
+ },
+ 'params': {'skip_download': True}
+ }, {
+ 'url': 'https://www.whitecapsfc.com/video/highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021#highlights-san-jose-earthquakes-vs-vancouver-whitecaps-fc-october-23-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.torontofc.ca/video/highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733#highlights-toronto-fc-vs-cf-montreal-october-23-2021-x6733',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.sportingkc.com/video/post-match-press-conference-john-pulskamp-oct-27-2021#post-match-press-conference-john-pulskamp-oct-27-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.soundersfc.com/video/highlights-seattle-sounders-fc-vs-sporting-kansas-city-october-23-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.sjearthquakes.com/video/#highlights-austin-fc-vs-san-jose-earthquakes-june-19-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.rsl.com/video/2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21#2021-u-of-u-health-mic-d-up-vs-colorado-10-16-21',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.timbers.com/video/highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose#highlights-d-chara-asprilla-with-goals-in-portland-timbers-2-0-win-over-san-jose',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.philadelphiaunion.com/video/highlights-torvphi',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.orlandocitysc.com/video/highlight-columbus-crew-vs-orlando-city-sc',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.newyorkredbulls.com/video/all-access-matchday-double-derby-week#all-access-matchday-double-derby-week',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.nycfc.com/video/highlights-nycfc-1-0-chicago-fire-fc#highlights-nycfc-1-0-chicago-fire-fc',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.revolutionsoccer.net/video/two-minute-highlights-revs-1-rapids-0-october-27-2021#two-minute-highlights-revs-1-rapids-0-october-27-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.nashvillesc.com/video/goal-c-j-sapong-nashville-sc-92nd-minute',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.cfmontreal.com/video/faits-saillants-tor-v-mtl#faits-saillants-orl-v-mtl-x5645',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.intermiamicf.com/video/all-access-victory-vs-nashville-sc-by-ukg#all-access-victory-vs-nashville-sc-by-ukg',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.lagalaxy.com/video/#moment-of-the-month-presented-by-san-manuel-casino-rayan-raveloson-scores-his-se',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.lafc.com/video/breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season#breaking-down-lafc-s-final-6-matches-of-the-2021-mls-regular-season',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.houstondynamofc.com/video/postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660#postgame-press-conference-michael-nelson-presented-by-coushatta-casino-res-x9660',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.dcunited.com/video/tony-alfaro-my-family-pushed-me-to-believe-everything-was-possible',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.fcdallas.com/video/highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021#highlights-fc-dallas-vs-minnesota-united-fc-october-02-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.columbuscrew.com/video/match-rewind-columbus-crew-vs-new-york-red-bulls-october-23-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.coloradorapids.com/video/postgame-reaction-robin-fraser-october-27#postgame-reaction-robin-fraser-october-27',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.fccincinnati.com/video/#keeping-cincy-chill-presented-by-coors-lite',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.chicagofirefc.com/video/all-access-fire-score-dramatic-road-win-in-cincy#all-access-fire-score-dramatic-road-win-in-cincy',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.austinfc.com/video/highlights-colorado-rapids-vs-austin-fc-september-29-2021#highlights-colorado-rapids-vs-austin-fc-september-29-2021',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.atlutd.com/video/goal-josef-martinez-scores-in-the-73rd-minute#goal-josef-martinez-scores-in-the-73rd-minute',
+ 'only_matching': True
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ data_json = self._parse_json(self._html_search_regex(r'data-options\=\"([^\"]+)\"', webpage, 'json'), id)['videoList'][0]
+ return {
+ 'id': id,
+ '_type': 'url',
+ 'url': 'https://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (data_json['accountId'], data_json['videoId']),
+ 'ie_key': 'BrightcoveNew',
+ }
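
[Editor's note] The extractor above never touches media itself: it reads the embedded player's data-options JSON and defers to BrightcoveNew via the standard player URL. The delegation step in isolation (IDs taken from the mlssoccer.com test above):

account_id, video_id = '5530036772001', '6276033198001'
brightcove_url = ('https://players.brightcove.net/%s/default_default/index.html?videoId=%s'
                  % (account_id, video_id))
# handed to BrightcoveNew as a url_result, which does the actual extraction
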
diff --git a/yt_dlp/extractor/mtv.py b/yt_dlp/extractor/mtv.py
index e0608845d..141dd7deb 100644
--- a/yt_dlp/extractor/mtv.py
+++ b/yt_dlp/extractor/mtv.py
@@ -306,6 +306,14 @@ class MTVServicesInfoExtractor(InfoExtractor):
mgid = self._extract_triforce_mgid(webpage)
if not mgid:
+ mgid = self._search_regex(
+ r'"videoConfig":{"videoId":"(mgid:.*?)"', webpage, 'mgid', default=None)
+
+ if not mgid:
+ mgid = self._search_regex(
+ r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None)
+
+ if not mgid:
data = self._parse_json(self._search_regex(
r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
main_container = self._extract_child_with_type(data, 'MainContainer')
@@ -313,10 +321,6 @@ class MTVServicesInfoExtractor(InfoExtractor):
video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer')
mgid = video_player['props']['media']['video']['config']['uri']
- if not mgid:
- mgid = self._search_regex(
- r'"media":{"video":{"config":{"uri":"(mgid:.*?)"', webpage, 'mgid', default=None)
-
return mgid
def _real_extract(self, url):
diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py
index acf53c1ff..a6821ba86 100644
--- a/yt_dlp/extractor/naver.py
+++ b/yt_dlp/extractor/naver.py
@@ -40,6 +40,7 @@ class NaverBaseIE(InfoExtractor):
formats.append({
'format_id': '%s_%s' % (stream.get('type') or stream_type, dict_get(encoding_option, ('name', 'id'))),
'url': stream_url,
+ 'ext': 'mp4',
'width': int_or_none(encoding_option.get('width')),
'height': int_or_none(encoding_option.get('height')),
'vbr': int_or_none(bitrate.get('video')),
@@ -174,7 +175,7 @@ class NaverLiveIE(InfoExtractor):
'url': 'https://tv.naver.com/l/52010',
'info_dict': {
'id': '52010',
- 'ext': 'm3u8',
+ 'ext': 'mp4',
'title': '[LIVE] 뉴스특보 : "수도권 거리두기, 2주간 2단계로 조정"',
'description': 'md5:df7f0c237a5ed5e786ce5c91efbeaab3',
'channel_id': 'NTV-ytnnews24-0',
@@ -184,7 +185,7 @@ class NaverLiveIE(InfoExtractor):
'url': 'https://tv.naver.com/l/51549',
'info_dict': {
'id': '51549',
- 'ext': 'm3u8',
+ 'ext': 'mp4',
'title': '연합뉴스TV - 코로나19 뉴스특보',
'description': 'md5:c655e82091bc21e413f549c0eaccc481',
'channel_id': 'NTV-yonhapnewstv-0',
@@ -233,7 +234,7 @@ class NaverLiveIE(InfoExtractor):
continue
formats.extend(self._extract_m3u8_formats(
- quality.get('url'), video_id, 'm3u8',
+ quality.get('url'), video_id, 'mp4',
m3u8_id=quality.get('qualityId'), live=True
))
self._sort_formats(formats)
diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py
index 76f087057..4bcea33d5 100644
--- a/yt_dlp/extractor/niconico.py
+++ b/yt_dlp/extractor/niconico.py
@@ -704,7 +704,6 @@ class NicovideoSearchURLIE(InfoExtractor):
class NicovideoSearchIE(SearchInfoExtractor, NicovideoSearchURLIE):
IE_DESC = 'Nico video searches'
- _MAX_RESULTS = float('inf')
IE_NAME = NicovideoSearchIE_NAME
_SEARCH_KEY = 'nicosearch'
_TESTS = []
diff --git a/yt_dlp/extractor/nrk.py b/yt_dlp/extractor/nrk.py
index b556bc6aa..49d58a685 100644
--- a/yt_dlp/extractor/nrk.py
+++ b/yt_dlp/extractor/nrk.py
@@ -147,7 +147,7 @@ class NRKIE(NRKBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url).split('/')[-1]
- path_templ = 'playback/%s/' + video_id
+ path_templ = 'playback/%s/program/' + video_id
def call_playback_api(item, query=None):
return self._call_api(path_templ % item, video_id, item, query=query)
@@ -188,7 +188,7 @@ class NRKIE(NRKBaseIE):
title = titles['title']
alt_title = titles.get('subtitle')
- description = preplay.get('description')
+ description = try_get(preplay, lambda x: x['description'].replace('\r', '\n'))
duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration'))
thumbnails = []
diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py
index a189c0237..c7d316efc 100644
--- a/yt_dlp/extractor/patreon.py
+++ b/yt_dlp/extractor/patreon.py
@@ -161,7 +161,7 @@ class PatreonIE(InfoExtractor):
if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo':
embed_html = try_get(attributes, lambda x: x['embed']['html'])
v_url = url_or_none(compat_urllib_parse_unquote(
- self._search_regex(r'src=(https%3A%2F%2Fplayer\.vimeo\.com.+)%3F', embed_html, 'vimeo url', fatal=False)))
+ self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False)))
if v_url:
info.update({
'_type': 'url_transparent',
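
[Editor's note] The widened pattern above matches the Vimeo player URL whether it appears percent-encoded inside the embed HTML or already decoded, with compat_urllib_parse_unquote normalising both. A quick check of the two forms (embed snippets invented):

import re
from urllib.parse import unquote

pattern = r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)'
snippets = [
    'src="https%3A%2F%2Fplayer.vimeo.com%2Fvideo%2F123%3Fapp_id%3D195476"',
    'src="https://player.vimeo.com/video/123?app_id=195476"',
]
for html in snippets:
    v_url = unquote(re.search(pattern, html).group(1))
    assert v_url == 'https://player.vimeo.com/video/123?app_id=195476'
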
diff --git a/yt_dlp/extractor/sky.py b/yt_dlp/extractor/sky.py
index ff2c977a0..ad1e62d88 100644
--- a/yt_dlp/extractor/sky.py
+++ b/yt_dlp/extractor/sky.py
@@ -105,6 +105,34 @@ class SkyNewsIE(SkyBaseIE):
}
+class SkyNewsStoryIE(SkyBaseIE):
+ IE_NAME = 'sky:news:story'
+ _VALID_URL = r'https?://news\.sky\.com/story/[0-9a-z-]+-(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'https://news.sky.com/story/budget-2021-chancellor-rishi-sunak-vows-address-will-deliver-strong-economy-fit-for-a-new-age-of-optimism-12445425',
+ 'info_dict': {
+ 'id': 'ref:0714acb9-123d-42c8-91b8-5c1bc6c73f20',
+ 'title': 'md5:e408dd7aad63f31a1817bbe40c7d276f',
+ 'description': 'md5:a881e12f49212f92be2befe4a09d288a',
+ 'ext': 'mp4',
+ 'upload_date': '20211027',
+ 'timestamp': 1635317494,
+ 'uploader_id': '6058004172001',
+ }
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+
+ entries = [self._process_ooyala_element(webpage, sdc_el, url)
+ for sdc_el in re.findall(self._SDC_EL_REGEX, webpage)]
+
+ return self.playlist_result(
+ entries, article_id, self._og_search_title(webpage),
+ self._html_search_meta(['og:description', 'description'], webpage))
+
+
class SkySportsNewsIE(SkyBaseIE):
IE_NAME = 'sky:sports:news'
_VALID_URL = r'https?://(?:www\.)?skysports\.com/([^/]+/)*news/\d+/(?P<id>\d+)'
diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py
index e89383ff1..824528474 100644
--- a/yt_dlp/extractor/soundcloud.py
+++ b/yt_dlp/extractor/soundcloud.py
@@ -856,7 +856,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
IE_NAME = 'soundcloud:search'
IE_DESC = 'Soundcloud search'
- _MAX_RESULTS = float('inf')
+ _SEARCH_KEY = 'scsearch'
_TESTS = [{
'url': 'scsearch15:post-avant jazzcore',
'info_dict': {
@@ -865,7 +865,6 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
'playlist_count': 15,
}]
- _SEARCH_KEY = 'scsearch'
_MAX_RESULTS_PER_PAGE = 200
_DEFAULT_RESULTS_PER_PAGE = 50
diff --git a/yt_dlp/extractor/tagesschau.py b/yt_dlp/extractor/tagesschau.py
index 25c200455..6e03d0a7d 100644
--- a/yt_dlp/extractor/tagesschau.py
+++ b/yt_dlp/extractor/tagesschau.py
@@ -5,177 +5,63 @@ import re
from .common import InfoExtractor
from ..utils import (
- determine_ext,
js_to_json,
- parse_iso8601,
- parse_filesize,
+ extract_attributes,
+ try_get,
+ int_or_none,
)
-class TagesschauPlayerIE(InfoExtractor):
- IE_NAME = 'tagesschau:player'
- _VALID_URL = r'https?://(?:www\.)?tagesschau\.de/multimedia/(?P<kind>audio|video)/(?P=kind)-(?P<id>\d+)~player(?:_[^/?#&]+)?\.html'
-
- _TESTS = [{
- 'url': 'http://www.tagesschau.de/multimedia/video/video-179517~player.html',
- 'md5': '8d09548d5c15debad38bee3a4d15ca21',
- 'info_dict': {
- 'id': '179517',
- 'ext': 'mp4',
- 'title': 'Marie Kristin Boese, ARD Berlin, über den zukünftigen Kurs der AfD',
- 'thumbnail': r're:^https?:.*\.jpg$',
- 'formats': 'mincount:6',
- },
- }, {
- 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
- 'md5': '76e6eec6ebd40740671cf0a2c88617e5',
- 'info_dict': {
- 'id': '29417',
- 'ext': 'mp3',
- 'title': 'Trabi - Bye, bye Rennpappe',
- 'thumbnail': r're:^https?:.*\.jpg$',
- 'formats': 'mincount:2',
- },
- }, {
- 'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417~player_autoplay-true.html',
- 'only_matching': True,
- }]
-
- _FORMATS = {
- 'xs': {'quality': 0},
- 's': {'width': 320, 'height': 180, 'quality': 1},
- 'm': {'width': 512, 'height': 288, 'quality': 2},
- 'l': {'width': 960, 'height': 540, 'quality': 3},
- 'xl': {'width': 1280, 'height': 720, 'quality': 4},
- 'xxl': {'quality': 5},
- }
-
- def _extract_via_api(self, kind, video_id):
- info = self._download_json(
- 'https://www.tagesschau.de/api/multimedia/{0}/{0}-{1}.json'.format(kind, video_id),
- video_id)
- title = info['headline']
- formats = []
- for media in info['mediadata']:
- for format_id, format_url in media.items():
- if determine_ext(format_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url, video_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls'))
- else:
- formats.append({
- 'url': format_url,
- 'format_id': format_id,
- 'vcodec': 'none' if kind == 'audio' else None,
- })
- self._sort_formats(formats)
- timestamp = parse_iso8601(info.get('date'))
- return {
- 'id': video_id,
- 'title': title,
- 'timestamp': timestamp,
- 'formats': formats,
- }
-
- def _real_extract(self, url):
- mobj = self._match_valid_url(url)
- video_id = mobj.group('id')
-
- # kind = mobj.group('kind').lower()
- # if kind == 'video':
- # return self._extract_via_api(kind, video_id)
-
- # JSON api does not provide some audio formats (e.g. ogg) thus
- # extracting audio via webpage
-
- webpage = self._download_webpage(url, video_id)
-
- title = self._og_search_title(webpage).strip()
- formats = []
-
- for media_json in re.findall(r'({src\s*:\s*["\']http[^}]+type\s*:[^}]+})', webpage):
- media = self._parse_json(js_to_json(media_json), video_id, fatal=False)
- if not media:
- continue
- src = media.get('src')
- if not src:
- return
- quality = media.get('quality')
- kind = media.get('type', '').split('/')[0]
- ext = determine_ext(src)
- f = {
- 'url': src,
- 'format_id': '%s_%s' % (quality, ext) if quality else ext,
- 'ext': ext,
- 'vcodec': 'none' if kind == 'audio' else None,
- }
- f.update(self._FORMATS.get(quality, {}))
- formats.append(f)
-
- self._sort_formats(formats)
-
- thumbnail = self._og_search_thumbnail(webpage)
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'formats': formats,
- }
-
-
class TagesschauIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?tagesschau\.de/(?P<path>[^/]+/(?:[^/]+/)*?(?P<id>[^/#?]+?(?:-?[0-9]+)?))(?:~_?[^/#?]+?)?\.html'
_TESTS = [{
'url': 'http://www.tagesschau.de/multimedia/video/video-102143.html',
- 'md5': 'f7c27a0eff3bfe8c7727e65f8fe1b1e6',
+ 'md5': '7a7287612fa881a1ae1d087df45c2fd6',
'info_dict': {
- 'id': 'video-102143',
+ 'id': 'video-102143-1',
'ext': 'mp4',
'title': 'Regierungsumbildung in Athen: Neue Minister in Griechenland vereidigt',
- 'description': '18.07.2015 20:10 Uhr',
- 'thumbnail': r're:^https?:.*\.jpg$',
},
}, {
'url': 'http://www.tagesschau.de/multimedia/sendung/ts-5727.html',
'md5': '3c54c1f6243d279b706bde660ceec633',
'info_dict': {
- 'id': 'ts-5727',
+ 'id': 'ts-5727-1',
'ext': 'mp4',
- 'title': 'Sendung: tagesschau \t04.12.2014 20:00 Uhr',
- 'description': 'md5:695c01bfd98b7e313c501386327aea59',
- 'thumbnail': r're:^https?:.*\.jpg$',
+ 'title': 'Ganze Sendung',
},
}, {
# exclusive audio
'url': 'http://www.tagesschau.de/multimedia/audio/audio-29417.html',
- 'md5': '76e6eec6ebd40740671cf0a2c88617e5',
+ 'md5': '4cf22023c285f35e99c24d290ba58cc9',
'info_dict': {
- 'id': 'audio-29417',
+ 'id': 'audio-29417-1',
'ext': 'mp3',
- 'title': 'Trabi - Bye, bye Rennpappe',
- 'description': 'md5:8687dda862cbbe2cfb2df09b56341317',
- 'thumbnail': r're:^https?:.*\.jpg$',
+ 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
},
}, {
- # audio in article
'url': 'http://www.tagesschau.de/inland/bnd-303.html',
- 'md5': 'e0916c623e85fc1d2b26b78f299d3958',
+ 'md5': '12cfb212d9325b5ba0d52b625f1aa61c',
'info_dict': {
- 'id': 'bnd-303',
- 'ext': 'mp3',
- 'title': 'Viele Baustellen für neuen BND-Chef',
- 'description': 'md5:1e69a54be3e1255b2b07cdbce5bcd8b4',
- 'thumbnail': r're:^https?:.*\.jpg$',
+ 'id': 'bnd-303-1',
+ 'ext': 'mp4',
+ 'title': 'SPD-Gruppenbild mit Bärbel Bas nach der Fraktionssitzung | dpa',
},
}, {
'url': 'http://www.tagesschau.de/inland/afd-parteitag-135.html',
'info_dict': {
'id': 'afd-parteitag-135',
- 'title': 'Möchtegern-Underdog mit Machtanspruch',
+ 'title': 'AfD',
+ },
+ 'playlist_count': 20,
+ }, {
+ 'url': 'https://www.tagesschau.de/multimedia/audio/audio-29417~player.html',
+ 'info_dict': {
+ 'id': 'audio-29417-1',
+ 'ext': 'mp3',
+ 'title': 'Brasilianischer Präsident Bolsonaro unter Druck: Corona-Bericht wird vorgestellt',
},
- 'playlist_count': 2,
}, {
'url': 'http://www.tagesschau.de/multimedia/sendung/tsg-3771.html',
'only_matching': True,
@@ -206,62 +92,6 @@ class TagesschauIE(InfoExtractor):
'only_matching': True,
}]
- @classmethod
- def suitable(cls, url):
- return False if TagesschauPlayerIE.suitable(url) else super(TagesschauIE, cls).suitable(url)
-
- def _extract_formats(self, download_text, media_kind):
- links = re.finditer(
- r'<div class="button" title="(?P<title>[^"]*)"><a href="(?P<url>[^"]+)">(?P<name>.+?)</a></div>',
- download_text)
- formats = []
- for l in links:
- link_url = l.group('url')
- if not link_url:
- continue
- format_id = self._search_regex(
- r'.*/[^/.]+\.([^/]+)\.[^/.]+$', link_url, 'format ID',
- default=determine_ext(link_url))
- format = {
- 'format_id': format_id,
- 'url': l.group('url'),
- 'format_name': l.group('name'),
- }
- title = l.group('title')
- if title:
- if media_kind.lower() == 'video':
- m = re.match(
- r'''(?x)
- Video:\s*(?P<vcodec>[a-zA-Z0-9/._-]+)\s*&\#10;
- (?P<width>[0-9]+)x(?P<height>[0-9]+)px&\#10;
- (?P<vbr>[0-9]+)kbps&\#10;
- Audio:\s*(?P<abr>[0-9]+)kbps,\s*(?P<audio_desc>[A-Za-z\.0-9]+)&\#10;
- Gr&ouml;&szlig;e:\s*(?P<filesize_approx>[0-9.,]+\s+[a-zA-Z]*B)''',
- title)
- if m:
- format.update({
- 'format_note': m.group('audio_desc'),
- 'vcodec': m.group('vcodec'),
- 'width': int(m.group('width')),
- 'height': int(m.group('height')),
- 'abr': int(m.group('abr')),
- 'vbr': int(m.group('vbr')),
- 'filesize_approx': parse_filesize(m.group('filesize_approx')),
- })
- else:
- m = re.match(
- r'(?P<format>.+?)-Format\s*:\s*(?P<abr>\d+)kbps\s*,\s*(?P<note>.+)',
- title)
- if m:
- format.update({
- 'format_note': '%s, %s' % (m.group('format'), m.group('note')),
- 'vcodec': 'none',
- 'abr': int(m.group('abr')),
- })
- formats.append(format)
- self._sort_formats(formats)
- return formats
-
def _real_extract(self, url):
mobj = self._match_valid_url(url)
video_id = mobj.group('id') or mobj.group('path')
@@ -271,34 +101,46 @@ class TagesschauIE(InfoExtractor):
title = self._html_search_regex(
r'<span[^>]*class="headline"[^>]*>(.+?)</span>',
- webpage, 'title', default=None) or self._og_search_title(webpage)
-
- DOWNLOAD_REGEX = r'(?s)<p>Wir bieten dieses (?P<kind>Video|Audio) in folgenden Formaten zum Download an:</p>\s*<div class="controls">(?P<links>.*?)</div>\s*<p>'
-
- webpage_type = self._og_search_property('type', webpage, default=None)
- if webpage_type == 'website': # Article
- entries = []
- for num, (entry_title, media_kind, download_text) in enumerate(re.findall(
- r'(?s)<p[^>]+class="infotext"[^>]*>\s*(?:<a[^>]+>)?\s*<strong>(.+?)</strong>.*?</p>.*?%s' % DOWNLOAD_REGEX,
- webpage), 1):
+ webpage, 'title', default=None) or self._og_search_title(webpage, fatal=False)
+
+ entries = []
+ videos = re.findall(r'<div[^>]+>', webpage)
+ num = 0
+ for video in videos:
+ video = extract_attributes(video).get('data-config')
+ if not video:
+ continue
+ video = self._parse_json(video, video_id, transform_source=js_to_json, fatal=False)
+ video_formats = try_get(video, lambda x: x['mc']['_mediaArray'][0]['_mediaStreamArray'])
+ if not video_formats:
+ continue
+ num += 1
+ for video_format in video_formats:
+ media_url = video_format.get('_stream') or ''
+ formats = []
+ if media_url.endswith('master.m3u8'):
+ formats = self._extract_m3u8_formats(media_url, video_id, 'mp4', m3u8_id='hls')
+ elif media_url.endswith('.hi.mp3') and media_url.startswith('https://download'):
+ formats = [{
+ 'url': media_url,
+ 'vcodec': 'none',
+ }]
+ if not formats:
+ continue
entries.append({
'id': '%s-%d' % (display_id, num),
- 'title': '%s' % entry_title,
- 'formats': self._extract_formats(download_text, media_kind),
+ 'title': try_get(video, lambda x: x['mc']['_title']),
+ 'duration': int_or_none(try_get(video, lambda x: x['mc']['_duration'])),
+ 'formats': formats
})
- if len(entries) > 1:
- return self.playlist_result(entries, display_id, title)
- formats = entries[0]['formats']
- else: # Assume single video
- download_text = self._search_regex(
- DOWNLOAD_REGEX, webpage, 'download links', group='links')
- media_kind = self._search_regex(
- DOWNLOAD_REGEX, webpage, 'media kind', default='Video', group='kind')
- formats = self._extract_formats(download_text, media_kind)
- thumbnail = self._og_search_thumbnail(webpage)
- description = self._html_search_regex(
- r'(?s)<p class="teasertext">(.*?)</p>',
- webpage, 'description', default=None)
+ if len(entries) > 1:
+ return self.playlist_result(entries, display_id, title)
+ formats = entries[0]['formats']
+ video_info = self._search_json_ld(webpage, video_id)
+ description = video_info.get('description')
+ thumbnail = self._og_search_thumbnail(webpage) or video_info.get('thumbnail')
+ timestamp = video_info.get('timestamp')
+ title = title or video_info.get('description')
self._sort_formats(formats)
@@ -307,5 +149,6 @@ class TagesschauIE(InfoExtractor):
'title': title,
'thumbnail': thumbnail,
'formats': formats,
+ 'timestamp': timestamp,
'description': description,
}
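
[Editor's note] On the rewritten extraction above: each tagesschau.de player <div> carries a data-config attribute holding JS-flavoured JSON, and mc._mediaArray[0]._mediaStreamArray lists the stream variants keyed by _stream. A toy version of that walk on an invented snippet (the real code additionally runs extract_attributes and js_to_json, since the page's config is not always strict JSON):

import json
import re

webpage = ('<div class="ts-mediaplayer" data-config=\'{"mc": {"_title": "Beispiel",'
           ' "_duration": 90, "_mediaArray": [{"_mediaStreamArray":'
           ' [{"_stream": "https://example.com/master.m3u8"}]}]}}\'></div>')

for tag in re.findall(r'<div[^>]+>', webpage):
    m = re.search(r"data-config='([^']+)'", tag)
    if not m:
        continue
    config = json.loads(m.group(1))
    for stream in config['mc']['_mediaArray'][0]['_mediaStreamArray']:
        print(config['mc']['_title'], stream['_stream'])
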
diff --git a/yt_dlp/extractor/threespeak.py b/yt_dlp/extractor/threespeak.py
new file mode 100644
index 000000000..60e84529d
--- /dev/null
+++ b/yt_dlp/extractor/threespeak.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ try_get,
+ unified_strdate,
+)
+
+
+class ThreeSpeakIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?3speak\.tv/watch\?v\=[^/]+/(?P<id>[^/$&#?]+)'
+
+ _TESTS = [{
+ 'url': 'https://3speak.tv/watch?v=dannyshine/wjgoxyfy',
+ 'info_dict': {
+ 'id': 'wjgoxyfy',
+ 'ext': 'mp4',
+ 'title': 'Can People who took the Vax think Critically',
+ 'uploader': 'dannyshine',
+ 'description': 'md5:181aa7ccb304afafa089b5af3bca7a10',
+ 'tags': ['sex', 'covid', 'antinatalism', 'comedy', 'vaccines'],
+ 'thumbnail': 'https://img.3speakcontent.co/wjgoxyfy/thumbnails/default.png',
+ 'upload_date': '20211021',
+ 'duration': 2703.867833,
+ 'filesize': 1620054781,
+ },
+ 'params': {'skip_download': True}
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ json_str = self._html_search_regex(r'JSON\.parse\(\'([^\']+)\'\)', webpage, 'json')
+ # The json string itself is escaped. Hence the double parsing
+ data_json = self._parse_json(self._parse_json(f'"{json_str}"', id), id)
+ video_json = self._parse_json(data_json['json_metadata'], id)
+ formats, subtitles = [], {}
+ og_m3u8 = self._html_search_regex(r'<meta\s?property=\"ogvideo\"\s?content=\"([^\"]+)\">', webpage, 'og m3u8', fatal=False)
+ if og_m3u8:
+ https_frmts, https_subs = self._extract_m3u8_formats_and_subtitles(og_m3u8, id, fatal=False, m3u8_id='https')
+ formats.extend(https_frmts)
+ subtitles = self._merge_subtitles(subtitles, https_subs)
+ ipfs_m3u8 = try_get(video_json, lambda x: x['video']['info']['ipfs'])
+ if ipfs_m3u8:
+ ipfs_frmts, ipfs_subs = self._extract_m3u8_formats_and_subtitles(f'https://ipfs.3speak.tv/ipfs/{ipfs_m3u8}',
+ id, fatal=False, m3u8_id='ipfs')
+ formats.extend(ipfs_frmts)
+ subtitles = self._merge_subtitles(subtitles, ipfs_subs)
+ mp4_file = try_get(video_json, lambda x: x['video']['info']['file'])
+ if mp4_file:
+ formats.append({
+ 'url': f'https://threespeakvideo.b-cdn.net/{id}/{mp4_file}',
+ 'ext': 'mp4',
+ 'format_id': 'https-mp4',
+ 'duration': try_get(video_json, lambda x: x['video']['info']['duration']),
+ 'filesize': try_get(video_json, lambda x: x['video']['info']['filesize']),
+ 'quality': 11,
+ 'format_note': 'Original file',
+ })
+ self._sort_formats(formats)
+ return {
+ 'id': id,
+ 'title': data_json.get('title') or data_json.get('root_title'),
+ 'uploader': data_json.get('author'),
+ 'description': try_get(video_json, lambda x: x['video']['content']['description']),
+ 'tags': try_get(video_json, lambda x: x['video']['content']['tags']),
+ 'thumbnail': try_get(video_json, lambda x: x['image'][0]),
+ 'upload_date': unified_strdate(data_json.get('created')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
+
+
+class ThreeSpeakUserIE(InfoExtractor):
+ _VALID_URL = r'(?:https?://)(?:www\.)?3speak\.tv/user/(?P<id>[^/$&?#]+)'
+
+ _TESTS = [{
+ 'url': 'https://3speak.tv/user/theycallmedan',
+ 'info_dict': {
+ 'id': 'theycallmedan',
+ },
+ 'playlist_mincount': 115,
+ }]
+
+ def _real_extract(self, url):
+ id = self._match_id(url)
+ webpage = self._download_webpage(url, id)
+ entries = [
+ self.url_result(
+ 'https://3speak.tv/watch?v=%s' % video,
+ ie=ThreeSpeakIE.ie_key())
+ for video in re.findall(r'data-payout\s?\=\s?\"([^\"]+)\"', webpage) if video
+ ]
+ return self.playlist_result(entries, id)
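
As the comment in ThreeSpeakIE._real_extract notes, the JSON handed to JSON.parse() is itself an escaped string literal, hence the double parse. A self-contained illustration with an invented payload:

    import json

    # Invented payload: what JSON.parse('...') receives is an escaped string literal.
    raw = r'{\"title\": \"Example\", \"author\": \"dannyshine\"}'

    # First parse: wrap in quotes so the escaped blob is read as one JSON string.
    unescaped = json.loads(f'"{raw}"')
    # Second parse: the unescaped text is the actual JSON document.
    data = json.loads(unescaped)
    print(data['title'])  # -> Example
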
diff --git a/yt_dlp/extractor/trovo.py b/yt_dlp/extractor/trovo.py
index ec55f41f2..a0f0cc31c 100644
--- a/yt_dlp/extractor/trovo.py
+++ b/yt_dlp/extractor/trovo.py
@@ -223,7 +223,7 @@ class TrovoChannelBaseIE(InfoExtractor):
class TrovoChannelVodIE(TrovoChannelBaseIE):
_VALID_URL = r'trovovod:(?P<id>[^\s]+)'
- IE_DESC = 'All VODs of a trovo.live channel, "trovovod" keyword'
+ IE_DESC = 'All VODs of a trovo.live channel; "trovovod:" prefix'
_TESTS = [{
'url': 'trovovod:OneTappedYou',
@@ -244,7 +244,7 @@ class TrovoChannelVodIE(TrovoChannelBaseIE):
class TrovoChannelClipIE(TrovoChannelBaseIE):
_VALID_URL = r'trovoclip:(?P<id>[^\s]+)'
- IE_DESC = 'All Clips of a trovo.live channel, "trovoclip" keyword'
+ IE_DESC = 'All Clips of a trovo.live channel; "trovoclip:" prefix'
_TESTS = [{
'url': 'trovoclip:OneTappedYou',
diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py
index 485b781ca..0749263d9 100644
--- a/yt_dlp/extractor/twitter.py
+++ b/yt_dlp/extractor/twitter.py
@@ -485,7 +485,7 @@ class TwitterIE(TwitterBaseIE):
fmts, subs = self._extract_variant_formats(variant, twid)
subtitles = self._merge_subtitles(subtitles, subs)
formats.extend(fmts)
- self._sort_formats(formats)
+ self._sort_formats(formats, ('res', 'br', 'size', 'proto')) # The codecs of http formats are unknown
thumbnails = []
media_url = media.get('media_url_https') or media.get('media_url')
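
Because the codecs of Twitter's http formats are unknown, the new sort order falls back to resolution, bitrate, size and protocol. A rough stand-in for what such a field preference does, using invented formats (this is not yt-dlp's actual sorting code):

    # Sort best-first by (height, bitrate) as a crude analogue of the
    # ('res', 'br', 'size', 'proto') preference.
    formats = [
        {'format_id': 'http-950', 'height': 480, 'tbr': 950},
        {'format_id': 'hls-1280', 'height': 720, 'tbr': 1280},
    ]
    formats.sort(key=lambda f: (f['height'], f['tbr']), reverse=True)
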
diff --git a/yt_dlp/extractor/viewlift.py b/yt_dlp/extractor/viewlift.py
index c3b2e863d..5b558d890 100644
--- a/yt_dlp/extractor/viewlift.py
+++ b/yt_dlp/extractor/viewlift.py
@@ -9,6 +9,7 @@ from ..utils import (
ExtractorError,
int_or_none,
parse_age_limit,
+ traverse_obj,
)
@@ -32,26 +33,36 @@ class ViewLiftBaseIE(InfoExtractor):
}
_TOKENS = {}
- def _call_api(self, site, path, video_id, query):
- token = self._TOKENS.get(site)
- if not token:
- token_query = {'site': site}
- email, password = self._get_login_info(netrc_machine=site)
- if email:
- resp = self._download_json(
- self._API_BASE + 'identity/signin', video_id,
- 'Logging in', query=token_query, data=json.dumps({
- 'email': email,
- 'password': password,
- }).encode())
- else:
- resp = self._download_json(
- self._API_BASE + 'identity/anonymous-token', video_id,
- 'Downloading authorization token', query=token_query)
- self._TOKENS[site] = token = resp['authorizationToken']
- return self._download_json(
- self._API_BASE + path, video_id,
- headers={'Authorization': token}, query=query)
+ def _fetch_token(self, site, url):
+ if self._TOKENS.get(site):
+ return
+ email, password = self._get_login_info(netrc_machine=site)
+ if email:
+ self.report_warning('Logging in using username and password is broken. %s' % self._LOGIN_HINTS['cookies'])
+
+ cookies = self._get_cookies(url)
+ if cookies and cookies.get('token'):
+ self._TOKENS[site] = self._search_regex(r'22authorizationToken\%22:\%22([^\%]+)\%22', cookies['token'].value, 'token')
+ if not self._TOKENS.get(site):
+ self.raise_login_required('Cookies (not necessarily logged in) are needed to download from this website', method='cookies')
+
+ def _call_api(self, site, path, video_id, url, query):
+ self._fetch_token(site, url)
+ try:
+ return self._download_json(
+ self._API_BASE + path, video_id, headers={'Authorization': self._TOKENS.get(site)}, query=query)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ webpage = e.cause.read().decode()
+ try:
+ error_message = traverse_obj(json.loads(webpage), 'errorMessage', 'message')
+ except json.JSONDecodeError:
+ raise ExtractorError(f'{site} said: {webpage}', cause=e.cause)
+ if error_message:
+ if 'has not purchased' in error_message:
+ self.raise_login_required(method='cookies')
+ raise ExtractorError(error_message, expected=True)
+ raise
class ViewLiftEmbedIE(ViewLiftBaseIE):
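
The new _fetch_token pulls the Authorization token straight out of the URL-encoded 'token' cookie instead of logging in. A sketch with a fabricated cookie value showing what the regex matches:

    import re

    # Fabricated cookie: URL-encoded JSON in which %22 is the encoded double quote.
    cookie_value = '%7B%22authorizationToken%22:%22eyJhbGciOi.example.jwt%22%7D'
    token = re.search(
        r'22authorizationToken\%22:\%22([^\%]+)\%22', cookie_value).group(1)
    print(token)  # -> eyJhbGciOi.example.jwt
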
@@ -96,27 +107,24 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
site = domain.split('.')[-2]
if site in self._SITE_MAP:
site = self._SITE_MAP[site]
- try:
- content_data = self._call_api(
- site, 'entitlement/video/status', film_id, {
- 'id': film_id
- })['video']
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- error_message = self._parse_json(e.cause.read().decode(), film_id).get('errorMessage')
- if error_message == 'User does not have a valid subscription or has not purchased this content.':
- self.raise_login_required()
- raise ExtractorError(error_message, expected=True)
- raise
+
+ content_data = self._call_api(
+ site, 'entitlement/video/status', film_id, url, {
+ 'id': film_id
+ })['video']
gist = content_data['gist']
title = gist['title']
video_assets = content_data['streamingInfo']['videoAssets']
- formats = []
- mpeg_video_assets = video_assets.get('mpeg') or []
- for video_asset in mpeg_video_assets:
+ hls_url = video_assets.get('hls')
+ formats, subtitles = [], {}
+ if hls_url:
+ formats, subtitles = self._extract_m3u8_formats_and_subtitles(
+ hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+
+ for video_asset in video_assets.get('mpeg') or []:
video_asset_url = video_asset.get('url')
- if not video_asset:
+ if not video_asset_url:
continue
bitrate = int_or_none(video_asset.get('bitrate'))
height = int_or_none(self._search_regex(
@@ -130,13 +138,17 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
'vcodec': video_asset.get('codec'),
})
- hls_url = video_assets.get('hls')
- if hls_url:
- formats.extend(self._extract_m3u8_formats(
- hls_url, film_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
- self._sort_formats(formats)
+ subs = {}
+ for sub in traverse_obj(content_data, ('contentDetails', 'closedCaptions')) or []:
+ sub_url = sub.get('url')
+ if not sub_url:
+ continue
+ subs.setdefault(sub.get('language', 'English'), []).append({
+ 'url': sub_url,
+ })
- info = {
+ self._sort_formats(formats)
+ return {
'id': film_id,
'title': title,
'description': gist.get('description'),
@@ -145,14 +157,15 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
'age_limit': parse_age_limit(content_data.get('parentalRating')),
'timestamp': int_or_none(gist.get('publishDate'), 1000),
'formats': formats,
+ 'subtitles': self._merge_subtitles(subs, subtitles),
+ 'categories': traverse_obj(content_data, ('categories', ..., 'title')),
+ 'tags': traverse_obj(content_data, ('tags', ..., 'title')),
}
- for k in ('categories', 'tags'):
- info[k] = [v['title'] for v in content_data.get(k, []) if v.get('title')]
- return info
class ViewLiftIE(ViewLiftBaseIE):
IE_NAME = 'viewlift'
+ _API_BASE = 'https://prod-api-cached-2.viewlift.com/'
_VALID_URL = r'https?://(?:www\.)?(?P<domain>%s)(?P<path>(?:/(?:films/title|show|(?:news/)?videos?|watch))?/(?P<id>[^?#]+))' % ViewLiftBaseIE._DOMAINS_REGEX
_TESTS = [{
'url': 'http://www.snagfilms.com/films/title/lost_for_life',
@@ -222,24 +235,111 @@ class ViewLiftIE(ViewLiftBaseIE):
}, {
'url': 'https://www.marquee.tv/watch/sadlerswells-sacredmonsters',
'only_matching': True,
+ }, { # Free film with language code
+ 'url': 'https://www.hoichoi.tv/bn/films/title/shuyopoka',
+ 'info_dict': {
+ 'id': '7a7a9d33-1f4c-4771-9173-ee4fb6dbf196',
+ 'ext': 'mp4',
+ 'title': 'Shuyopoka',
+ 'description': 'md5:e28f2fb8680096a69c944d37c1fa5ffc',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20211006',
+ 'series': None
+ },
+ 'params': {'skip_download': True},
+ }, { # Free film
+ 'url': 'https://www.hoichoi.tv/films/title/dadu-no1',
+ 'info_dict': {
+ 'id': '0000015b-b009-d126-a1db-b81ff3780000',
+ 'ext': 'mp4',
+ 'title': 'Dadu No.1',
+ 'description': 'md5:605cba408e51a79dafcb824bdeded51e',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20210827',
+ 'series': None
+ },
+ 'params': {'skip_download': True},
+ }, { # Free episode
+ 'url': 'https://www.hoichoi.tv/webseries/case-jaundice-s01-e01',
+ 'info_dict': {
+ 'id': 'f779e07c-30c8-459c-8612-5a834ab5e5ba',
+ 'ext': 'mp4',
+ 'title': 'Humans Vs. Corona',
+ 'description': 'md5:ca30a682b4528d02a3eb6d0427dd0f87',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20210830',
+ 'series': 'Case Jaundice'
+ },
+ 'params': {'skip_download': True},
+ }, { # Free video
+ 'url': 'https://www.hoichoi.tv/videos/1549072415320-six-episode-02-hindi',
+ 'info_dict': {
+ 'id': 'b41fa1ce-aca6-47b6-b208-283ff0a2de30',
+ 'ext': 'mp4',
+ 'title': 'Woman in red - Hindi',
+ 'description': 'md5:9d21edc1827d32f8633eb67c2054fc31',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20211006',
+ 'series': 'Six (Hindi)'
+ },
+ 'params': {'skip_download': True},
+ }, { # Free episode
+ 'url': 'https://www.hoichoi.tv/shows/watch-asian-paints-moner-thikana-online-season-1-episode-1',
+ 'info_dict': {
+ 'id': '1f45d185-8500-455c-b88d-13252307c3eb',
+ 'ext': 'mp4',
+ 'title': 'Jisshu Sengupta',
+ 'description': 'md5:ef6ffae01a3d83438597367400f824ed',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20211004',
+ 'series': 'Asian Paints Moner Thikana'
+ },
+ 'params': {'skip_download': True},
+ }, { # Free series
+ 'url': 'https://www.hoichoi.tv/shows/watch-moner-thikana-bengali-web-series-online',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'watch-moner-thikana-bengali-web-series-online',
+ },
+ }, { # Premium series
+ 'url': 'https://www.hoichoi.tv/shows/watch-byomkesh-bengali-web-series-online',
+ 'playlist_mincount': 14,
+ 'info_dict': {
+ 'id': 'watch-byomkesh-bengali-web-series-online',
+ },
+ }, { # Premium movie
+ 'url': 'https://www.hoichoi.tv/movies/detective-2020',
+ 'only_matching': True
}]
@classmethod
def suitable(cls, url):
return False if ViewLiftEmbedIE.suitable(url) else super(ViewLiftIE, cls).suitable(url)
+ def _show_entries(self, domain, seasons):
+ for season in seasons:
+ for episode in season.get('episodes') or []:
+ path = traverse_obj(episode, ('gist', 'permalink'))
+ if path:
+ yield self.url_result(f'https://www.{domain}{path}', ie=self.ie_key())
+
def _real_extract(self, url):
domain, path, display_id = self._match_valid_url(url).groups()
site = domain.split('.')[-2]
if site in self._SITE_MAP:
site = self._SITE_MAP[site]
modules = self._call_api(
- site, 'content/pages', display_id, {
+ site, 'content/pages', display_id, url, {
'includeContent': 'true',
'moduleOffset': 1,
'path': path,
'site': site,
})['modules']
+
+ seasons = next((m['contentData'][0]['seasons'] for m in modules if m.get('moduleType') == 'ShowDetailModule'), None)
+ if seasons:
+ return self.playlist_result(self._show_entries(domain, seasons), display_id)
+
film_id = next(m['contentData'][0]['gist']['id'] for m in modules if m.get('moduleType') == 'VideoDetailModule')
return {
'_type': 'url_transparent',
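
For show pages, ViewLiftIE now scans the returned modules for a ShowDetailModule and yields one entry per episode permalink. A minimal sketch of that scan over an invented modules payload (the domain is an example):

    # Invented 'modules' payload mirroring the shapes used in the diff.
    modules = [
        {'moduleType': 'ShowDetailModule', 'contentData': [{'seasons': [
            {'episodes': [{'gist': {'permalink': '/shows/example-episode-1'}}]},
        ]}]},
    ]
    seasons = next(
        (m['contentData'][0]['seasons'] for m in modules
         if m.get('moduleType') == 'ShowDetailModule'), None)
    episode_urls = [
        'https://www.hoichoi.tv' + ep['gist']['permalink']
        for season in seasons for ep in season.get('episodes') or []
    ]
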
diff --git a/yt_dlp/extractor/vimeo.py b/yt_dlp/extractor/vimeo.py
index 8b367a4e6..04c504934 100644
--- a/yt_dlp/extractor/vimeo.py
+++ b/yt_dlp/extractor/vimeo.py
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
import base64
import functools
-import json
import re
import itertools
@@ -17,8 +16,8 @@ from ..compat import (
from ..utils import (
clean_html,
determine_ext,
- dict_get,
ExtractorError,
+ get_element_by_class,
js_to_json,
int_or_none,
merge_dicts,
@@ -26,7 +25,6 @@ from ..utils import (
parse_filesize,
parse_iso8601,
parse_qs,
- RegexNotFoundError,
sanitized_Request,
smuggle_url,
std_headers,
@@ -129,10 +127,11 @@ class VimeoBaseInfoExtractor(InfoExtractor):
video_title = video_data['title']
live_event = video_data.get('live_event') or {}
is_live = live_event.get('status') == 'started'
+ request = config.get('request') or {}
formats = []
- config_files = video_data.get('files') or config['request'].get('files', {})
- for f in config_files.get('progressive', []):
+ config_files = video_data.get('files') or request.get('files') or {}
+ for f in (config_files.get('progressive') or []):
video_url = f.get('url')
if not video_url:
continue
@@ -148,7 +147,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
# TODO: fix handling of 308 status code returned for live archive manifest requests
sep_pattern = r'/sep/video/'
for files_type in ('hls', 'dash'):
- for cdn_name, cdn_data in config_files.get(files_type, {}).get('cdns', {}).items():
+ for cdn_name, cdn_data in (try_get(config_files, lambda x: x[files_type]['cdns']) or {}).items():
manifest_url = cdn_data.get('url')
if not manifest_url:
continue
@@ -188,17 +187,15 @@ class VimeoBaseInfoExtractor(InfoExtractor):
})
subtitles = {}
- text_tracks = config['request'].get('text_tracks')
- if text_tracks:
- for tt in text_tracks:
- subtitles[tt['lang']] = [{
- 'ext': 'vtt',
- 'url': urljoin('https://vimeo.com', tt['url']),
- }]
+ for tt in (request.get('text_tracks') or []):
+ subtitles[tt['lang']] = [{
+ 'ext': 'vtt',
+ 'url': urljoin('https://vimeo.com', tt['url']),
+ }]
thumbnails = []
if not is_live:
- for key, thumb in video_data.get('thumbs', {}).items():
+ for key, thumb in (video_data.get('thumbs') or {}).items():
thumbnails.append({
'id': key,
'width': int_or_none(key),
@@ -342,6 +339,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'duration': 1595,
'upload_date': '20130610',
'timestamp': 1370893156,
+ 'license': 'by',
},
'params': {
'format': 'best[protocol=https]',
@@ -420,6 +418,12 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'staff',
'uploader': 'Vimeo Staff',
'duration': 62,
+ 'subtitles': {
+ 'de': [{'ext': 'vtt'}],
+ 'en': [{'ext': 'vtt'}],
+ 'es': [{'ext': 'vtt'}],
+ 'fr': [{'ext': 'vtt'}],
+ },
}
},
{
@@ -626,6 +630,37 @@ class VimeoIE(VimeoBaseInfoExtractor):
def _real_initialize(self):
self._login()
+ def _extract_from_api(self, video_id, unlisted_hash=None):
+ token = self._download_json(
+ 'https://vimeo.com/_rv/jwt', video_id, headers={
+ 'X-Requested-With': 'XMLHttpRequest'
+ })['token']
+ api_url = 'https://api.vimeo.com/videos/' + video_id
+ if unlisted_hash:
+ api_url += ':' + unlisted_hash
+ video = self._download_json(
+ api_url, video_id, headers={
+ 'Authorization': 'jwt ' + token,
+ }, query={
+ 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
+ })
+ info = self._parse_config(self._download_json(
+ video['config_url'], video_id), video_id)
+ self._vimeo_sort_formats(info['formats'])
+ get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
+ info.update({
+ 'description': video.get('description'),
+ 'license': video.get('license'),
+ 'release_timestamp': get_timestamp('release'),
+ 'timestamp': get_timestamp('created'),
+ 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])),
+ })
+ connections = try_get(
+ video, lambda x: x['metadata']['connections'], dict) or {}
+ for k in ('comment', 'like'):
+ info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total']))
+ return info
+
def _try_album_password(self, url):
album_id = self._search_regex(
r'vimeo\.com/(?:album|showcase)/([^/]+)', url, 'album id', default=None)
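
_extract_from_api first fetches a JWT from vimeo.com/_rv/jwt, then queries the public API with it. A bare-bones sketch of the two-step flow; urllib and the video id here are illustrative choices, not part of the diff:

    import json
    from urllib.request import Request, urlopen

    token = json.load(urlopen(Request(
        'https://vimeo.com/_rv/jwt',
        headers={'X-Requested-With': 'XMLHttpRequest'})))['token']
    video = json.load(urlopen(Request(
        'https://api.vimeo.com/videos/76979871?fields=config_url,description,license',
        headers={'Authorization': 'jwt ' + token})))
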
@@ -675,45 +710,16 @@ class VimeoIE(VimeoBaseInfoExtractor):
# Extract ID from URL
video_id, unlisted_hash = self._match_valid_url(url).groups()
if unlisted_hash:
- token = self._download_json(
- 'https://vimeo.com/_rv/jwt', video_id, headers={
- 'X-Requested-With': 'XMLHttpRequest'
- })['token']
- video = self._download_json(
- 'https://api.vimeo.com/videos/%s:%s' % (video_id, unlisted_hash),
- video_id, headers={
- 'Authorization': 'jwt ' + token,
- }, query={
- 'fields': 'config_url,created_time,description,license,metadata.connections.comments.total,metadata.connections.likes.total,release_time,stats.plays',
- })
- info = self._parse_config(self._download_json(
- video['config_url'], video_id), video_id)
- self._vimeo_sort_formats(info['formats'])
- get_timestamp = lambda x: parse_iso8601(video.get(x + '_time'))
- info.update({
- 'description': video.get('description'),
- 'license': video.get('license'),
- 'release_timestamp': get_timestamp('release'),
- 'timestamp': get_timestamp('created'),
- 'view_count': int_or_none(try_get(video, lambda x: x['stats']['plays'])),
- })
- connections = try_get(
- video, lambda x: x['metadata']['connections'], dict) or {}
- for k in ('comment', 'like'):
- info[k + '_count'] = int_or_none(try_get(connections, lambda x: x[k + 's']['total']))
- return info
+ return self._extract_from_api(video_id, unlisted_hash)
orig_url = url
is_pro = 'vimeopro.com/' in url
- is_player = '://player.vimeo.com/video/' in url
if is_pro:
# some videos require portfolio_id to be present in player url
# https://github.com/ytdl-org/youtube-dl/issues/20070
url = self._extract_url(url, self._download_webpage(url, video_id))
if not url:
url = 'https://vimeo.com/' + video_id
- elif is_player:
- url = 'https://player.vimeo.com/video/' + video_id
elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
url = 'https://vimeo.com/' + video_id
@@ -734,14 +740,25 @@ class VimeoIE(VimeoBaseInfoExtractor):
expected=True)
raise
- # Now we begin extracting as much information as we can from what we
- # retrieved. First we extract the information common to all extractors,
- # and latter we extract those that are Vimeo specific.
- self.report_extraction(video_id)
+ if '://player.vimeo.com/video/' in url:
+ config = self._parse_json(self._search_regex(
+ r'\bconfig\s*=\s*({.+?})\s*;', webpage, 'info section'), video_id)
+ if config.get('view') == 4:
+ config = self._verify_player_video_password(
+ redirect_url, video_id, headers)
+ info = self._parse_config(config, video_id)
+ self._vimeo_sort_formats(info['formats'])
+ return info
+
+ if re.search(r'<form[^>]+?id="pw_form"', webpage):
+ video_password = self._get_video_password()
+ token, vuid = self._extract_xsrft_and_vuid(webpage)
+ webpage = self._verify_video_password(
+ redirect_url, video_id, video_password, token, vuid)
vimeo_config = self._extract_vimeo_config(webpage, video_id, default=None)
if vimeo_config:
- seed_status = vimeo_config.get('seed_status', {})
+ seed_status = vimeo_config.get('seed_status') or {}
if seed_status.get('state') == 'failed':
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, seed_status['title']),
@@ -750,70 +767,40 @@ class VimeoIE(VimeoBaseInfoExtractor):
cc_license = None
timestamp = None
video_description = None
+ info_dict = {}
- # Extract the config JSON
- try:
- try:
- config_url = self._html_search_regex(
- r' data-config-url="(.+?)"', webpage,
- 'config URL', default=None)
- if not config_url:
- # Sometimes new react-based page is served instead of old one that require
- # different config URL extraction approach (see
- # https://github.com/ytdl-org/youtube-dl/pull/7209)
- page_config = self._parse_json(self._search_regex(
- r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
- webpage, 'page config'), video_id)
- config_url = page_config['player']['config_url']
- cc_license = page_config.get('cc_license')
- timestamp = try_get(
- page_config, lambda x: x['clip']['uploaded_on'],
- compat_str)
- video_description = clean_html(dict_get(
- page_config, ('description', 'description_html_escaped')))
- config = self._download_json(config_url, video_id)
- except RegexNotFoundError:
- # For pro videos or player.vimeo.com urls
- # We try to find out to which variable is assigned the config dic
- m_variable_name = re.search(r'(\w)\.video\.id', webpage)
- if m_variable_name is not None:
- config_re = [r'%s=({[^}].+?});' % re.escape(m_variable_name.group(1))]
- else:
- config_re = [r' = {config:({.+?}),assets:', r'(?:[abc])=({.+?});']
- config_re.append(r'\bvar\s+r\s*=\s*({.+?})\s*;')
- config_re.append(r'\bconfig\s*=\s*({.+?})\s*;')
- config = self._search_regex(config_re, webpage, 'info section',
- flags=re.DOTALL)
- config = json.loads(config)
- except Exception as e:
- if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
- raise ExtractorError('The author has restricted the access to this video, try with the "--referer" option')
-
- if re.search(r'<form[^>]+?id="pw_form"', webpage) is not None:
- if '_video_password_verified' in data:
- raise ExtractorError('video password verification failed!')
- video_password = self._get_video_password()
- token, vuid = self._extract_xsrft_and_vuid(webpage)
- self._verify_video_password(
- redirect_url, video_id, video_password, token, vuid)
- return self._real_extract(
- smuggle_url(redirect_url, {'_video_password_verified': 'verified'}))
- else:
- raise ExtractorError('Unable to extract info section',
- cause=e)
+ channel_id = self._search_regex(
+ r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
+ if channel_id:
+ config_url = self._html_search_regex(
+ r'\bdata-config-url="([^"]+)"', webpage, 'config URL')
+ video_description = clean_html(get_element_by_class('description', webpage))
+ info_dict.update({
+ 'channel_id': channel_id,
+ 'channel_url': 'https://vimeo.com/channels/' + channel_id,
+ })
else:
- if config.get('view') == 4:
- config = self._verify_player_video_password(redirect_url, video_id, headers)
-
+ page_config = self._parse_json(self._search_regex(
+ r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
+ webpage, 'page config', default='{}'), video_id, fatal=False)
+ if not page_config:
+ return self._extract_from_api(video_id)
+ config_url = page_config['player']['config_url']
+ cc_license = page_config.get('cc_license')
+ clip = page_config.get('clip') or {}
+ timestamp = clip.get('uploaded_on')
+ video_description = clean_html(
+ clip.get('description') or page_config.get('description_html_escaped'))
+ config = self._download_json(config_url, video_id)
video = config.get('video') or {}
vod = video.get('vod') or {}
def is_rented():
if '>You rented this title.<' in webpage:
return True
- if config.get('user', {}).get('purchased'):
+ if try_get(config, lambda x: x['user']['purchased']):
return True
- for purchase_option in vod.get('purchase_options', []):
+ for purchase_option in (vod.get('purchase_options') or []):
if purchase_option.get('purchased'):
return True
label = purchase_option.get('label_string')
@@ -828,14 +815,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
'https://player.vimeo.com/player/%s' % feature_id,
{'force_feature_id': True}), 'Vimeo')
- # Extract video description
if not video_description:
video_description = self._html_search_regex(
r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
webpage, 'description', default=None)
if not video_description:
video_description = self._html_search_meta(
- 'description', webpage, default=None)
+ ['description', 'og:description', 'twitter:description'],
+ webpage, default=None)
if not video_description and is_pro:
orig_webpage = self._download_webpage(
orig_url, video_id,
@@ -844,24 +831,17 @@ class VimeoIE(VimeoBaseInfoExtractor):
if orig_webpage:
video_description = self._html_search_meta(
'description', orig_webpage, default=None)
- if not video_description and not is_player:
+ if not video_description:
self.report_warning('Cannot find video description')
- # Extract upload date
if not timestamp:
timestamp = self._search_regex(
r'<time[^>]+datetime="([^"]+)"', webpage,
'timestamp', default=None)
- try:
- view_count = int(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count'))
- like_count = int(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count'))
- comment_count = int(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count'))
- except RegexNotFoundError:
- # This info is only available in vimeo.com/{id} urls
- view_count = None
- like_count = None
- comment_count = None
+ view_count = int_or_none(self._search_regex(r'UserPlays:(\d+)', webpage, 'view count', default=None))
+ like_count = int_or_none(self._search_regex(r'UserLikes:(\d+)', webpage, 'like count', default=None))
+ comment_count = int_or_none(self._search_regex(r'UserComments:(\d+)', webpage, 'comment count', default=None))
formats = []
@@ -881,11 +861,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
r'<link[^>]+rel=["\']license["\'][^>]+href=(["\'])(?P<license>(?:(?!\1).)+)\1',
webpage, 'license', default=None, group='license')
- channel_id = self._search_regex(
- r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
- channel_url = 'https://vimeo.com/channels/%s' % channel_id if channel_id else None
-
- info_dict = {
+ info_dict.update({
'formats': formats,
'timestamp': unified_timestamp(timestamp),
'description': video_description,
@@ -894,18 +870,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
'like_count': like_count,
'comment_count': comment_count,
'license': cc_license,
- 'channel_id': channel_id,
- 'channel_url': channel_url,
- }
-
- info_dict = merge_dicts(info_dict, info_dict_config, json_ld)
+ })
- return info_dict
+ return merge_dicts(info_dict, info_dict_config, json_ld)
class VimeoOndemandIE(VimeoIE):
IE_NAME = 'vimeo:ondemand'
- _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P<id>[^/?#&]+)'
_TESTS = [{
# ondemand video not available via https://vimeo.com/id
'url': 'https://vimeo.com/ondemand/20704',
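
The VimeoOndemandIE pattern change turns the optional path segment into a non-capturing group, so only the named id group is captured. A quick check with a hypothetical URL:

    import re

    pattern = r'https?://(?:www\.)?vimeo\.com/ondemand/(?:[^/]+/)?(?P<id>[^/?#&]+)'
    m = re.match(pattern, 'https://vimeo.com/ondemand/someshow/123456789')
    print(m.group('id'), m.groups())  # -> 123456789 ('123456789',)
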
diff --git a/yt_dlp/extractor/vlive.py b/yt_dlp/extractor/vlive.py
index 84f51a544..4340b1d4c 100644
--- a/yt_dlp/extractor/vlive.py
+++ b/yt_dlp/extractor/vlive.py
@@ -17,17 +17,65 @@ from ..utils import (
strip_or_none,
try_get,
urlencode_postdata,
+ url_or_none,
)
class VLiveBaseIE(NaverBaseIE):
- _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+ _NETRC_MACHINE = 'vlive'
+ _logged_in = False
+
+ def _real_initialize(self):
+ if not self._logged_in:
+ VLiveBaseIE._logged_in = self._login()
+
+ def _login(self):
+ email, password = self._get_login_info()
+ if email is None:
+ return False
+
+ LOGIN_URL = 'https://www.vlive.tv/auth/email/login'
+ self._request_webpage(
+ LOGIN_URL, None, note='Downloading login cookies')
+
+ self._download_webpage(
+ LOGIN_URL, None, note='Logging in',
+ data=urlencode_postdata({'email': email, 'pwd': password}),
+ headers={
+ 'Referer': LOGIN_URL,
+ 'Content-Type': 'application/x-www-form-urlencoded'
+ })
+
+ login_info = self._download_json(
+ 'https://www.vlive.tv/auth/loginInfo', None,
+ note='Checking login status',
+ headers={'Referer': 'https://www.vlive.tv/home'})
+
+ if not try_get(login_info, lambda x: x['message']['login'], bool):
+ raise ExtractorError('Unable to log in', expected=True)
+ return True
+
+ def _call_api(self, path_template, video_id, fields=None, query_add={}, note=None):
+ if note is None:
+ note = 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0]
+ query = {'appId': '8c6cc7b45d2568fb668be6e05b6e5a3b', 'gcc': 'KR', 'platformType': 'PC'}
+ if fields:
+ query['fields'] = fields
+ if query_add:
+ query.update(query_add)
+ try:
+ return self._download_json(
+ 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
+ note, headers={'Referer': 'https://www.vlive.tv/'}, query=query)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message'])
+ raise
class VLiveIE(VLiveBaseIE):
IE_NAME = 'vlive'
_VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)'
- _NETRC_MACHINE = 'vlive'
_TESTS = [{
'url': 'http://www.vlive.tv/video/1326',
'md5': 'cc7314812855ce56de70a06a27314983',
@@ -38,6 +86,12 @@ class VLiveIE(VLiveBaseIE):
'creator': "Girl's Day",
'view_count': int,
'uploader_id': 'muploader_a',
+ 'upload_date': '20150817',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1439816449,
+ },
+ 'params': {
+ 'skip_download': True,
},
}, {
'url': 'http://www.vlive.tv/video/16937',
@@ -49,6 +103,9 @@ class VLiveIE(VLiveBaseIE):
'view_count': int,
'subtitles': 'mincount:12',
'uploader_id': 'muploader_j',
+ 'upload_date': '20161112',
+ 'thumbnail': r're:^https?://.*\.(?:jpg|png)$',
+ 'timestamp': 1478923074,
},
'params': {
'skip_download': True,
@@ -81,53 +138,6 @@ class VLiveIE(VLiveBaseIE):
'playlist_mincount': 120
}]
- def _real_initialize(self):
- self._login()
-
- def _login(self):
- email, password = self._get_login_info()
- if None in (email, password):
- return
-
- def is_logged_in():
- login_info = self._download_json(
- 'https://www.vlive.tv/auth/loginInfo', None,
- note='Downloading login info',
- headers={'Referer': 'https://www.vlive.tv/home'})
- return try_get(
- login_info, lambda x: x['message']['login'], bool) or False
-
- LOGIN_URL = 'https://www.vlive.tv/auth/email/login'
- self._request_webpage(
- LOGIN_URL, None, note='Downloading login cookies')
-
- self._download_webpage(
- LOGIN_URL, None, note='Logging in',
- data=urlencode_postdata({'email': email, 'pwd': password}),
- headers={
- 'Referer': LOGIN_URL,
- 'Content-Type': 'application/x-www-form-urlencoded'
- })
-
- if not is_logged_in():
- raise ExtractorError('Unable to log in', expected=True)
-
- def _call_api(self, path_template, video_id, fields=None, limit=None):
- query = {'appId': self._APP_ID, 'gcc': 'KR', 'platformType': 'PC'}
- if fields:
- query['fields'] = fields
- if limit:
- query['limit'] = limit
- try:
- return self._download_json(
- 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
- 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0],
- headers={'Referer': 'https://www.vlive.tv/'}, query=query)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- self.raise_login_required(json.loads(e.cause.read().decode('utf-8'))['message'])
- raise
-
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -150,7 +160,7 @@ class VLiveIE(VLiveBaseIE):
playlist_count = str_or_none(playlist.get('totalCount'))
playlist = self._call_api(
- 'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', limit=playlist_count)
+ 'playlist/v1.0/playlist-%s/posts', playlist_id, 'data', {'limit': playlist_count})
entries = []
for video_data in playlist['data']:
@@ -172,6 +182,8 @@ class VLiveIE(VLiveBaseIE):
'view_count': int_or_none(video.get('playCount')),
'like_count': int_or_none(video.get('likeCount')),
'comment_count': int_or_none(video.get('commentCount')),
+ 'timestamp': int_or_none(video.get('createdAt'), scale=1000),
+ 'thumbnail': video.get('thumb'),
}
video_type = video.get('type')
@@ -216,7 +228,7 @@ class VLiveIE(VLiveBaseIE):
raise ExtractorError('Unknown status ' + status)
-class VLivePostIE(VLiveIE):
+class VLivePostIE(VLiveBaseIE):
IE_NAME = 'vlive:post'
_VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/post/(?P<id>\d-\d+)'
_TESTS = [{
@@ -238,8 +250,6 @@ class VLivePostIE(VLiveIE):
'playlist_count': 1,
}]
_FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s'
- _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo'
- _INKEY_TMPL = _FVIDEO_TMPL % 'inKey'
def _real_extract(self, url):
post_id = self._match_id(url)
@@ -266,7 +276,7 @@ class VLivePostIE(VLiveIE):
entry = None
if upload_type == 'SOS':
download = self._call_api(
- self._SOS_TMPL, video_id)['videoUrl']['download']
+ self._FVIDEO_TMPL % 'sosPlayInfo', video_id)['videoUrl']['download']
formats = []
for f_id, f_url in download.items():
formats.append({
@@ -284,7 +294,7 @@ class VLivePostIE(VLiveIE):
vod_id = upload_info.get('videoId')
if not vod_id:
continue
- inkey = self._call_api(self._INKEY_TMPL, video_id)['inKey']
+ inkey = self._call_api(self._FVIDEO_TMPL % 'inKey', video_id)['inKey']
entry = self._extract_video_info(video_id, vod_id, inkey)
if entry:
entry['title'] = '%s_part%s' % (title, idx)
@@ -295,7 +305,7 @@ class VLivePostIE(VLiveIE):
class VLiveChannelIE(VLiveBaseIE):
IE_NAME = 'vlive:channel'
- _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)'
+ _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<channel_id>[0-9A-Z]+)(?:/board/(?P<posts_id>\d+))?'
_TESTS = [{
'url': 'http://channels.vlive.tv/FCD4B',
'info_dict': {
@@ -306,78 +316,58 @@ class VLiveChannelIE(VLiveBaseIE):
}, {
'url': 'https://www.vlive.tv/channel/FCD4B',
'only_matching': True,
+ }, {
+ 'url': 'https://www.vlive.tv/channel/FCD4B/board/3546',
+ 'info_dict': {
+ 'id': 'FCD4B-3546',
+ 'title': 'MAMAMOO - Star Board',
+ },
+ 'playlist_mincount': 880
}]
- def _call_api(self, path, channel_key_suffix, channel_value, note, query):
- q = {
- 'app_id': self._APP_ID,
- 'channel' + channel_key_suffix: channel_value,
- }
- q.update(query)
- return self._download_json(
- 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path,
- channel_value, note='Downloading ' + note, query=q)['result']
-
- def _real_extract(self, url):
- channel_code = self._match_id(url)
-
- channel_seq = self._call_api(
- 'decodeChannelCode', 'Code', channel_code,
- 'decode channel code', {})['channelSeq']
-
- channel_name = None
- entries = []
+ def _entries(self, posts_id, board_name):
+ if board_name:
+ posts_path = 'post/v1.0/board-%s/posts'
+ query_add = {'limit': 100, 'sortType': 'LATEST'}
+ else:
+ posts_path = 'post/v1.0/channel-%s/starPosts'
+ query_add = {'limit': 100}
for page_num in itertools.count(1):
video_list = self._call_api(
- 'getChannelVideoList', 'Seq', channel_seq,
- 'channel list page #%d' % page_num, {
- # Large values of maxNumOfRows (~300 or above) may cause
- # empty responses (see [1]), e.g. this happens for [2] that
- # has more than 300 videos.
- # 1. https://github.com/ytdl-org/youtube-dl/issues/13830
- # 2. http://channels.vlive.tv/EDBF.
- 'maxNumOfRows': 100,
- 'pageNo': page_num
- }
- )
-
- if not channel_name:
- channel_name = try_get(
- video_list,
- lambda x: x['channelInfo']['channelName'],
- compat_str)
+ posts_path, posts_id, 'channel{channelName},contentType,postId,title,url', query_add,
+ note=f'Downloading playlist page {page_num}')
+
+ for video in try_get(video_list, lambda x: x['data'], list) or []:
+ video_id = str(video.get('postId'))
+ video_title = str_or_none(video.get('title'))
+ video_url = url_or_none(video.get('url'))
+ if not all((video_id, video_title, video_url)) or video.get('contentType') != 'VIDEO':
+ continue
+ channel_name = try_get(video, lambda x: x['channel']['channelName'], compat_str)
+ yield self.url_result(video_url, VLivePostIE.ie_key(), video_id, video_title, channel=channel_name)
- videos = try_get(
- video_list, lambda x: x['videoList'], list)
- if not videos:
+ after = try_get(video_list, lambda x: x['paging']['nextParams']['after'], compat_str)
+ if not after:
break
+ query_add['after'] = after
- for video in videos:
- video_id = video.get('videoSeq')
- video_type = video.get('videoType')
+ def _real_extract(self, url):
+ channel_id, posts_id = self._match_valid_url(url).groups()
- if not video_id or not video_type:
- continue
- video_id = compat_str(video_id)
-
- if video_type in ('PLAYLIST'):
- first_video_id = try_get(
- video,
- lambda x: x['videoPlaylist']['videoList'][0]['videoSeq'], int)
-
- if not first_video_id:
- continue
-
- entries.append(
- self.url_result(
- 'http://www.vlive.tv/video/%s' % first_video_id,
- ie=VLiveIE.ie_key(), video_id=first_video_id))
- else:
- entries.append(
- self.url_result(
- 'http://www.vlive.tv/video/%s' % video_id,
- ie=VLiveIE.ie_key(), video_id=video_id))
+ board_name = None
+ if posts_id:
+ board = self._call_api(
+ 'board/v1.0/board-%s', posts_id, 'title,boardType')
+ board_name = board.get('title') or 'Unknown'
+ if board.get('boardType') not in ('STAR', 'VLIVE_PLUS'):
+ raise ExtractorError(f'Board {board_name!r} is not supported', expected=True)
+
+ entries = self._entries(posts_id or channel_id, board_name)
+ first_video = next(entries)
+ channel_name = first_video['channel']
return self.playlist_result(
- entries, channel_code, channel_name)
+ itertools.chain([first_video], entries),
+ f'{channel_id}-{posts_id}' if posts_id else channel_id,
+ f'{channel_name} - {board_name}' if channel_name and board_name else channel_name)
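
VLiveChannelIE._entries replaces page-number paging with cursor paging: each response's paging.nextParams.after value is fed back as the 'after' query parameter until it is absent. A schematic of that loop over a stubbed two-page API (data invented):

    import itertools

    # Stub API: two pages linked by an 'after' cursor.
    PAGES = {
        None: {'data': ['post-1', 'post-2'], 'paging': {'nextParams': {'after': 'c1'}}},
        'c1': {'data': ['post-3'], 'paging': {}},
    }

    query = {'limit': 100}
    for page_num in itertools.count(1):
        page = PAGES[query.get('after')]
        for post in page['data']:
            print(page_num, post)
        after = (page['paging'].get('nextParams') or {}).get('after')
        if not after:
            break
        query['after'] = after
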
diff --git a/yt_dlp/extractor/wakanim.py b/yt_dlp/extractor/wakanim.py
index c956d616e..a61a630e2 100644
--- a/yt_dlp/extractor/wakanim.py
+++ b/yt_dlp/extractor/wakanim.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+from urllib.parse import unquote
+
from .common import InfoExtractor
from ..utils import (
merge_dicts,
@@ -31,26 +33,37 @@ class WakanimIE(InfoExtractor):
'url': 'https://www.wakanim.tv/de/v2/catalogue/episode/7843/sword-art-online-alicization-omu-arc-2-folge-15-omu',
'only_matching': True,
}]
+ _GEO_BYPASS = False
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- m3u8_url = urljoin(url, self._search_regex(
- r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'm3u8 url',
+ if 'Geoblocking' in webpage:
+ if '/de/' in url:
+ self.raise_geo_restricted(countries=['DE', 'AT', 'CH'])
+ else:
+ self.raise_geo_restricted(countries=['RU'])
+
+ manifest_url = urljoin(url, self._search_regex(
+ r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, 'manifest url',
group='url'))
if not self.get_param('allow_unplayable_formats'):
# https://docs.microsoft.com/en-us/azure/media-services/previous/media-services-content-protection-overview#streaming-urls
encryption = self._search_regex(
r'encryption%3D(c(?:enc|bc(?:s-aapl)?))',
- m3u8_url, 'encryption', default=None)
+ manifest_url, 'encryption', default=None)
if encryption in ('cenc', 'cbcs-aapl'):
self.report_drm(video_id)
- formats = self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls')
+ if 'format=mpd-time-cmaf' in unquote(manifest_url):
+ formats = self._extract_mpd_formats(
+ manifest_url, video_id, mpd_id='dash')
+ else:
+ formats = self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
info = self._search_json_ld(webpage, video_id, default={})
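
Wakanim carries the manifest's format selector percent-encoded inside the URL, so the extractor unquotes it before choosing between DASH and HLS. The branch in isolation, with an invented URL:

    from urllib.parse import unquote

    manifest_url = ('https://example.streaming.mediaservices.windows.net/'
                    'manifest%28format%3Dmpd-time-cmaf%29')  # invented
    if 'format=mpd-time-cmaf' in unquote(manifest_url):
        protocol = 'dash'
    else:
        protocol = 'hls'
    print(protocol)  # -> dash
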
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index aa58a22bf..658b45fe1 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -695,7 +695,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
class YoutubeIE(YoutubeBaseInfoExtractor):
- IE_DESC = 'YouTube.com'
+ IE_DESC = 'YouTube'
_INVIDIOUS_SITES = (
# invidious-redirect websites
r'(?:www\.)?redirect\.invidious\.io',
@@ -2696,6 +2696,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
thumbnails.append({
'url': thumbnail_url,
})
+ original_thumbnails = thumbnails.copy()
+
# The best resolution thumbnails sometimes does not appear in the webpage
# See: https://github.com/ytdl-org/youtube-dl/issues/29049, https://github.com/yt-dlp/yt-dlp/issues/340
# List of possible thumbnails - Ref: <https://stackoverflow.com/a/20542029>
@@ -2706,7 +2708,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'default', '1', '2', '3'
]
n_thumbnail_names = len(thumbnail_names)
-
thumbnails.extend({
'url': 'https://i.ytimg.com/vi{webp}/{video_id}/{name}{live}.{ext}'.format(
video_id=video_id, name=name, ext=ext,
@@ -2716,6 +2717,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
i = next((i for i, t in enumerate(thumbnail_names) if f'/{video_id}/{t}' in thumb['url']), n_thumbnail_names)
thumb['preference'] = (0 if '.webp' in thumb['url'] else -1) - (2 * i)
self._remove_duplicate_formats(thumbnails)
+ self._downloader._sort_thumbnails(original_thumbnails)
category = get_first(microformats, 'category') or search_meta('genre')
channel_id = str_or_none(
@@ -2745,6 +2747,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'title': self._live_title(video_title) if is_live else video_title,
'formats': formats,
'thumbnails': thumbnails,
+ # The best thumbnail that we are sure exists. Prevents unnecessary
+ # URL checking if the user doesn't care about getting the best possible thumbnail
+ 'thumbnail': traverse_obj(original_thumbnails, (-1, 'url')),
'description': video_description,
'upload_date': unified_strdate(
get_first(microformats, 'uploadDate')
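
The guessed i.ytimg.com thumbnail URLs may 404, so a copy of the page-listed thumbnails taken before guessing is sorted separately, and its best entry becomes the single 'thumbnail' field, a URL known to exist. A compact rendering of the idea with placeholder data:

    video_id = 'dQw4w9WgXcQ'  # example id
    listed = [{'url': f'https://i.ytimg.com/vi/{video_id}/hqdefault.jpg'}]
    original_thumbnails = listed.copy()  # guaranteed to exist
    guessed = [{'url': f'https://i.ytimg.com/vi/{video_id}/maxresdefault.jpg'}]
    thumbnails = listed + guessed        # may contain dead URLs

    info = {
        'thumbnails': thumbnails,
        # best thumbnail that is certain to exist (sorted ascending, last is best)
        'thumbnail': original_thumbnails[-1]['url'],
    }
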
@@ -3010,7 +3015,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
class YoutubeTabIE(YoutubeBaseInfoExtractor):
- IE_DESC = 'YouTube.com tab'
+ IE_DESC = 'YouTube Tabs'
_VALID_URL = r'''(?x)
https?://
(?:\w+\.)?
@@ -4238,7 +4243,7 @@ class YoutubeTabIE(YoutubeBaseInfoExtractor):
class YoutubePlaylistIE(InfoExtractor):
- IE_DESC = 'YouTube.com playlists'
+ IE_DESC = 'YouTube playlists'
_VALID_URL = r'''(?x)(?:
(?:https?://)?
(?:\w+\.)?
@@ -4304,9 +4309,7 @@ class YoutubePlaylistIE(InfoExtractor):
def suitable(cls, url):
if YoutubeTabIE.suitable(url):
return False
- # Hack for lazy extractors until more generic solution is implemented
- # (see #28780)
- from .youtube import parse_qs
+ from ..utils import parse_qs
qs = parse_qs(url)
if qs.get('v', [None])[0]:
return False
@@ -4364,7 +4367,7 @@ class YoutubeYtBeIE(InfoExtractor):
class YoutubeYtUserIE(InfoExtractor):
- IE_DESC = 'YouTube.com user videos, URL or "ytuser" keyword'
+ IE_DESC = 'YouTube user videos; "ytuser:" prefix'
_VALID_URL = r'ytuser:(?P<id>.+)'
_TESTS = [{
'url': 'ytuser:phihag',
@@ -4380,7 +4383,7 @@ class YoutubeYtUserIE(InfoExtractor):
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
IE_NAME = 'youtube:favorites'
- IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
+ IE_DESC = 'YouTube liked videos; ":ytfav" keyword (requires cookies)'
_VALID_URL = r':ytfav(?:ou?rite)?s?'
_LOGIN_REQUIRED = True
_TESTS = [{
@@ -4398,10 +4401,7 @@ class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
- IE_DESC = 'YouTube.com searches, "ytsearch" keyword'
- # there doesn't appear to be a real limit, for example if you search for
- # 'python' you get more than 8.000.000 results
- _MAX_RESULTS = float('inf')
+ IE_DESC = 'YouTube searches'
IE_NAME = 'youtube:search'
_SEARCH_KEY = 'ytsearch'
_SEARCH_PARAMS = None
@@ -4461,13 +4461,14 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeTabIE):
class YoutubeSearchDateIE(YoutubeSearchIE):
IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
_SEARCH_KEY = 'ytsearchdate'
- IE_DESC = 'YouTube.com searches, newest videos first, "ytsearchdate" keyword'
+ IE_DESC = 'YouTube searches, newest videos first'
_SEARCH_PARAMS = 'CAI%3D'
class YoutubeSearchURLIE(YoutubeSearchIE):
- IE_DESC = 'YouTube.com search URLs'
+ IE_DESC = 'YouTube search URLs with sorting and filter support'
IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
+ _SEARCH_KEY = None
_VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
# _MAX_RESULTS = 100
_TESTS = [{
@@ -4513,7 +4514,7 @@ class YoutubeFeedsInfoExtractor(YoutubeTabIE):
class YoutubeWatchLaterIE(InfoExtractor):
IE_NAME = 'youtube:watchlater'
- IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
+ IE_DESC = 'Youtube watch later list; ":ytwatchlater" keyword (requires cookies)'
_VALID_URL = r':ytwatchlater'
_TESTS = [{
'url': ':ytwatchlater',
@@ -4526,7 +4527,7 @@ class YoutubeWatchLaterIE(InfoExtractor):
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
+ IE_DESC = 'YouTube recommended videos; ":ytrec" keyword'
_VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
_FEED_NAME = 'recommended'
_LOGIN_REQUIRED = False
@@ -4543,7 +4544,7 @@ class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
+ IE_DESC = 'YouTube subscriptions feed; ":ytsubs" keyword (requires cookies)'
_VALID_URL = r':ytsub(?:scription)?s?'
_FEED_NAME = 'subscriptions'
_TESTS = [{
@@ -4556,7 +4557,7 @@ class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'Youtube watch history, ":ythis" for short (requires authentication)'
+ IE_DESC = 'Youtube watch history; ":ythis" keyword (requires cookies)'
_VALID_URL = r':ythis(?:tory)?'
_FEED_NAME = 'history'
_TESTS = [{