aboutsummaryrefslogtreecommitdiffstats
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/biqle.py20
-rw-r--r--youtube_dl/extractor/doodstream.py71
-rw-r--r--youtube_dl/extractor/extractors.py7
-rw-r--r--youtube_dl/extractor/francetv.py12
-rw-r--r--youtube_dl/extractor/hrfensehen.py102
-rw-r--r--youtube_dl/extractor/soundcloud.py97
-rw-r--r--youtube_dl/extractor/storyfire.py255
-rw-r--r--youtube_dl/extractor/twitch.py20
-rw-r--r--youtube_dl/extractor/videa.py62
-rw-r--r--youtube_dl/extractor/viki.py4
-rw-r--r--youtube_dl/extractor/xhamster.py17
-rw-r--r--youtube_dl/extractor/youtube.py14
12 files changed, 633 insertions, 48 deletions
diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py
index af21e3ee5..17ebbb257 100644
--- a/youtube_dl/extractor/biqle.py
+++ b/youtube_dl/extractor/biqle.py
@@ -3,10 +3,11 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from .vk import VKIE
-from ..utils import (
- HEADRequest,
- int_or_none,
+from ..compat import (
+ compat_b64decode,
+ compat_urllib_parse_unquote,
)
+from ..utils import int_or_none
class BIQLEIE(InfoExtractor):
@@ -47,9 +48,16 @@ class BIQLEIE(InfoExtractor):
if VKIE.suitable(embed_url):
return self.url_result(embed_url, VKIE.ie_key(), video_id)
- self._request_webpage(
- HEADRequest(embed_url), video_id, headers={'Referer': url})
- video_id, sig, _, access_token = self._get_cookies(embed_url)['video_ext'].value.split('%3A')
+ embed_page = self._download_webpage(
+ embed_url, video_id, headers={'Referer': url})
+ video_ext = self._get_cookies(embed_url).get('video_ext')
+ if video_ext:
+ video_ext = compat_urllib_parse_unquote(video_ext.value)
+ if not video_ext:
+ video_ext = compat_b64decode(self._search_regex(
+ r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)',
+ embed_page, 'video_ext')).decode()
+ video_id, sig, _, access_token = video_ext.split(':')
item = self._download_json(
'https://api.vk.com/method/video.get', video_id,
headers={'User-Agent': 'okhttp/3.4.1'}, query={
diff --git a/youtube_dl/extractor/doodstream.py b/youtube_dl/extractor/doodstream.py
new file mode 100644
index 000000000..2c9ea6898
--- /dev/null
+++ b/youtube_dl/extractor/doodstream.py
@@ -0,0 +1,71 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import string
+import random
+import time
+
+from .common import InfoExtractor
+
+
+class DoodStreamIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch)/[ed]/(?P<id>[a-z0-9]+)'
+ _TESTS = [{
+ 'url': 'http://dood.to/e/5s1wmbdacezb',
+ 'md5': '4568b83b31e13242b3f1ff96c55f0595',
+ 'info_dict': {
+ 'id': '5s1wmbdacezb',
+ 'ext': 'mp4',
+ 'title': 'Kat Wonders - Monthly May 2020',
+ 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com',
+ 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg',
+ }
+ }, {
+ 'url': 'https://dood.to/d/jzrxn12t2s7n',
+ 'md5': '3207e199426eca7c2aa23c2872e6728a',
+ 'info_dict': {
+ 'id': 'jzrxn12t2s7n',
+ 'ext': 'mp4',
+ 'title': 'Stacy Cruz Cute ALLWAYSWELL',
+ 'description': 'Stacy Cruz Cute ALLWAYSWELL | DoodStream.com',
+ 'thumbnail': 'https://img.doodcdn.com/snaps/8edqd5nppkac3x8u.jpg',
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ if '/d/' in url:
+ url = "https://dood.to" + self._html_search_regex(
+ r'<iframe src="(/e/[a-z0-9]+)"', webpage, 'embed')
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_meta(['og:title', 'twitter:title'],
+ webpage, default=None)
+ thumb = self._html_search_meta(['og:image', 'twitter:image'],
+ webpage, default=None)
+ token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token')
+ description = self._html_search_meta(
+ ['og:description', 'description', 'twitter:description'],
+ webpage, default=None)
+ auth_url = 'https://dood.to' + self._html_search_regex(
+ r'(/pass_md5.*?)\'', webpage, 'pass_md5')
+ headers = {
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0',
+ 'referer': url
+ }
+
+ webpage = self._download_webpage(auth_url, video_id, headers=headers)
+ final_url = webpage + ''.join([random.choice(string.ascii_letters + string.digits) for _ in range(10)]) + "?token=" + token + "&expiry=" + str(int(time.time() * 1000))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': final_url,
+ 'http_headers': headers,
+ 'ext': 'mp4',
+ 'description': description,
+ 'thumbnail': thumb,
+ }
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 4b3092028..e213b1bea 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -293,6 +293,7 @@ from .discoverynetworks import DiscoveryNetworksDeIE
from .discoveryvr import DiscoveryVRIE
from .disney import DisneyIE
from .dispeak import DigitallySpeakingIE
+from .doodstream import DoodStreamIE
from .dropbox import DropboxIE
from .dw import (
DWIE,
@@ -440,6 +441,7 @@ from .hotstar import (
)
from .howcast import HowcastIE
from .howstuffworks import HowStuffWorksIE
+from .hrfensehen import HRFernsehenIE
from .hrti import (
HRTiIE,
HRTiPlaylistIE,
@@ -1057,6 +1059,11 @@ from .spike import (
BellatorIE,
ParamountNetworkIE,
)
+from .storyfire import (
+ StoryFireIE,
+ StoryFireUserIE,
+ StoryFireSeriesIE,
+)
from .stitcher import StitcherIE
from .sport5 import Sport5IE
from .sportbox import SportBoxIE
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 81b468c7d..e340cddba 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -316,13 +316,14 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
_VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)'
_TESTS = [{
- 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html',
+ 'url': 'https://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-jeudi-22-aout-2019_3561461.html',
'info_dict': {
- 'id': '84981923',
+ 'id': 'd12458ee-5062-48fe-bfdd-a30d6a01b793',
'ext': 'mp4',
'title': 'Soir 3',
- 'upload_date': '20130826',
- 'timestamp': 1377548400,
+ 'upload_date': '20190822',
+ 'timestamp': 1566510900,
+ 'description': 'md5:72d167097237701d6e8452ff03b83c00',
'subtitles': {
'fr': 'mincount:2',
},
@@ -374,7 +375,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor):
video_id = self._search_regex(
(r'player\.load[^;]+src:\s*["\']([^"\']+)',
r'id-video=([^@]+@[^"]+)',
- r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'),
+ r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"',
+ r'data-id="([^"]+)"'),
webpage, 'video id')
return self._make_url_result(video_id)
diff --git a/youtube_dl/extractor/hrfensehen.py b/youtube_dl/extractor/hrfensehen.py
new file mode 100644
index 000000000..2beadef2c
--- /dev/null
+++ b/youtube_dl/extractor/hrfensehen.py
@@ -0,0 +1,102 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from ..utils import int_or_none, unified_timestamp, unescapeHTML
+from .common import InfoExtractor
+
+
+class HRFernsehenIE(InfoExtractor):
+ IE_NAME = 'hrfernsehen'
+ _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html'
+
+ _TESTS = [{
+ 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html',
+ 'md5': '5c4e0ba94677c516a2f65a84110fc536',
+ 'info_dict': {
+ 'id': '130546',
+ 'ext': 'mp4',
+ 'description': 'Sturmtief Kirsten fegt über Hessen / Die Corona-Pandemie – eine Chronologie / '
+ 'Sterbehilfe: Die Lage in Hessen / Miss Hessen leitet zwei eigene Unternehmen / '
+ 'Pop-Up Museum zeigt Schwarze Unterhaltung und Black Music',
+ 'subtitles': {'de': [{
+ 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt'
+ }]},
+ 'timestamp': 1598470200,
+ 'upload_date': '20200826',
+ 'thumbnails': [{
+ 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg',
+ 'id': '0'
+ }, {
+ 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg',
+ 'id': '1'
+ }],
+ 'title': 'hessenschau vom 26.08.2020'
+ }
+ }, {
+ 'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html',
+ 'only_matching': True
+ }]
+
+ _GEO_COUNTRIES = ['DE']
+
+ def extract_airdate(self, loader_data):
+ airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate')
+
+ if airdate_str is None:
+ return None
+
+ return unified_timestamp(airdate_str)
+
+ def extract_formats(self, loader_data):
+ stream_formats = []
+ for stream_obj in loader_data["videoResolutionLevels"]:
+ stream_format = {
+ 'format_id': str(stream_obj['verticalResolution']) + "p",
+ 'height': stream_obj['verticalResolution'],
+ 'url': stream_obj['url'],
+ }
+
+ quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit',
+ stream_obj['url'])
+ if quality_information:
+ stream_format['width'] = int_or_none(quality_information.group(1))
+ stream_format['height'] = int_or_none(quality_information.group(2))
+ stream_format['fps'] = int_or_none(quality_information.group(3))
+ stream_format['tbr'] = int_or_none(quality_information.group(4))
+
+ stream_formats.append(stream_format)
+
+ self._sort_formats(stream_formats)
+ return stream_formats
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_meta(
+ ['og:title', 'twitter:title', 'name'], webpage)
+ description = self._html_search_meta(
+ ['description'], webpage)
+
+ loader_str = unescapeHTML(self._search_regex(r"data-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader"))
+ loader_data = json.loads(loader_str)
+
+ info = {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'formats': self.extract_formats(loader_data),
+ 'timestamp': self.extract_airdate(loader_data)
+ }
+
+ if "subtitle" in loader_data:
+ info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]}
+
+ thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()]))
+ if len(thumbnails) > 0:
+ info["thumbnails"] = [{"url": t} for t in thumbnails]
+
+ return info
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index d37c52543..ac09cb5e6 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
import itertools
import re
+import json
+import random
from .common import (
InfoExtractor,
@@ -28,6 +30,7 @@ from ..utils import (
update_url_query,
url_or_none,
urlhandle_detect_ext,
+ sanitized_Request,
)
@@ -309,7 +312,81 @@ class SoundcloudIE(InfoExtractor):
raise
def _real_initialize(self):
- self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk'
+ self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or "T5R4kgWS2PRf6lzLyIravUMnKlbIxQag" # 'EXLwg5lHTO2dslU5EePe3xkw0m1h86Cd' # 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk'
+ self._login()
+
+ _USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36"
+ _API_AUTH_QUERY_TEMPLATE = '?client_id=%s'
+ _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s'
+ _access_token = None
+ _HEADERS = {}
+ _NETRC_MACHINE = 'soundcloud'
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ def genDevId():
+ def genNumBlock():
+ return ''.join([str(random.randrange(10)) for i in range(6)])
+ return '-'.join([genNumBlock() for i in range(4)])
+
+ payload = {
+ 'client_id': self._CLIENT_ID,
+ 'recaptcha_pubkey': 'null',
+ 'recaptcha_response': 'null',
+ 'credentials': {
+ 'identifier': username,
+ 'password': password
+ },
+ 'signature': self.sign(username, password, self._CLIENT_ID),
+ 'device_id': genDevId(),
+ 'user_agent': self._USER_AGENT
+ }
+
+ query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID
+ login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8'))
+ response = self._download_json(login, None)
+ self._access_token = response.get('session').get('access_token')
+ if not self._access_token:
+            self.report_warning('Unable to get access token, login may have failed')
+ else:
+ self._HEADERS = {'Authorization': 'OAuth ' + self._access_token}
+
+ # signature generation
+ def sign(self, user, pw, clid):
+ a = 33
+ i = 1
+ s = 440123
+ w = 117
+ u = 1800000
+ l = 1042
+ b = 37
+ k = 37
+ c = 5
+ n = "0763ed7314c69015fd4a0dc16bbf4b90" # _KEY
+ y = "8" # _REV
+ r = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36" # _USER_AGENT
+ e = user # _USERNAME
+ t = clid # _CLIENT_ID
+
+ d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]])
+ p = n + y + d + r + e + t + d + n
+ h = p
+
+ m = 8011470
+ f = 0
+
+ for f in range(f, len(h)):
+ m = (m >> 1) + ((1 & m) << 23)
+ m += ord(h[f])
+ m &= 16777215
+
+ # c is not even needed
+ out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c)
+
+ return out
@classmethod
def _resolv_url(cls, url):
@@ -389,7 +466,7 @@ class SoundcloudIE(InfoExtractor):
if not format_url:
continue
stream = self._download_json(
- format_url, track_id, query=query, fatal=False)
+ format_url, track_id, query=query, fatal=False, headers=self._HEADERS)
if not isinstance(stream, dict):
continue
stream_url = url_or_none(stream.get('url'))
@@ -487,7 +564,7 @@ class SoundcloudIE(InfoExtractor):
info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
info = self._download_json(
- info_json_url, full_title, 'Downloading info JSON', query=query)
+ info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS)
return self._extract_info_dict(info, full_title, token)
@@ -503,7 +580,7 @@ class SoundcloudPlaylistBaseIE(SoundcloudIE):
'ids': ','.join([compat_str(t['id']) for t in tracks]),
'playlistId': playlist_id,
'playlistSecretToken': token,
- })
+ }, headers=self._HEADERS)
entries = []
for track in tracks:
track_id = str_or_none(track.get('id'))
@@ -547,7 +624,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
full_title += '/' + token
info = self._download_json(self._resolv_url(
- self._BASE_URL + full_title), full_title)
+ self._BASE_URL + full_title), full_title, headers=self._HEADERS)
if 'errors' in info:
msgs = (compat_str(err['error_message']) for err in info['errors'])
@@ -572,7 +649,7 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudIE):
for i in itertools.count():
response = self._download_json(
next_href, playlist_id,
- 'Downloading track page %s' % (i + 1), query=query)
+ 'Downloading track page %s' % (i + 1), query=query, headers=self._HEADERS)
collection = response['collection']
@@ -694,7 +771,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
user = self._download_json(
self._resolv_url(self._BASE_URL + uploader),
- uploader, 'Downloading user info')
+ uploader, 'Downloading user info', headers=self._HEADERS)
resource = mobj.group('rsrc') or 'all'
@@ -719,7 +796,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
def _real_extract(self, url):
track_name = self._match_id(url)
- track = self._download_json(self._resolv_url(url), track_name)
+ track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS)
track_id = self._search_regex(
r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
@@ -752,7 +829,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
data = self._download_json(
self._API_V2_BASE + 'playlists/' + playlist_id,
- playlist_id, 'Downloading playlist', query=query)
+ playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS)
return self._extract_set(data, token)
@@ -789,7 +866,7 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
for i in itertools.count(1):
response = self._download_json(
next_url, collection_id, 'Downloading page {0}'.format(i),
- 'Unable to download API page')
+ 'Unable to download API page', headers=self._HEADERS)
collection = response.get('collection', [])
if not collection:
diff --git a/youtube_dl/extractor/storyfire.py b/youtube_dl/extractor/storyfire.py
new file mode 100644
index 000000000..67457cc94
--- /dev/null
+++ b/youtube_dl/extractor/storyfire.py
@@ -0,0 +1,255 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+from .common import InfoExtractor
+
+
+class StoryFireIE(InfoExtractor):
+ _VALID_URL = r'(?:(?:https?://(?:www\.)?storyfire\.com/video-details)|(?:https://storyfire.app.link))/(?P<id>[^/\s]+)'
+ _TESTS = [{
+ 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181',
+ 'md5': '560953bfca81a69003cfa5e53ac8a920',
+ 'info_dict': {
+ 'id': '5df1d132b6378700117f9181',
+ 'ext': 'mp4',
+ 'title': 'Buzzfeed Teaches You About Memes',
+ 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
+ 'timestamp': 1576129028,
+ 'description': 'Mocking Buzzfeed\'s meme lesson. Reuploaded from YouTube because of their new policies',
+ 'uploader': 'whang!',
+ 'upload_date': '20191212',
+ },
+ 'params': {'format': 'bestvideo'} # There are no merged formats in the playlist.
+ }, {
+ 'url': 'https://storyfire.app.link/5GxAvWOQr8', # Alternate URL format, with unrelated short ID
+ 'md5': '7a2dc6d60c4889edfed459c620fe690d',
+ 'info_dict': {
+ 'id': '5f1e11ecd78a57b6c702001d',
+ 'ext': 'm4a',
+ 'title': 'Weird Nintendo Prototype Leaks',
+ 'description': 'A stream taking a look at some weird Nintendo Prototypes with Luigi in Mario 64 and weird Yoshis',
+ 'timestamp': 1595808576,
+ 'upload_date': '20200727',
+ 'uploader': 'whang!',
+ 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
+ },
+ 'params': {'format': 'bestaudio'} # Verifying audio extraction
+
+ }]
+
+ _aformats = {
+ 'audio-medium-audio': {'acodec': 'aac', 'abr': 125, 'preference': -10},
+ 'audio-high-audio': {'acodec': 'aac', 'abr': 254, 'preference': -1},
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ # Extracting the json blob is mandatory to proceed with extraction.
+ jsontext = self._html_search_regex(
+ r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
+ webpage, 'json_data')
+
+ json = self._parse_json(jsontext, video_id)
+
+ # The currentVideo field in the json is mandatory
+ # because it contains the only link to the m3u playlist
+ video = json['props']['initialState']['video']['currentVideo']
+ videourl = video['vimeoVideoURL'] # Video URL is mandatory
+
+ # Extract other fields from the json in an error tolerant fashion
+ # ID may be incorrect (on short URL format), correct it.
+ parsed_id = video.get('_id')
+ if parsed_id:
+ video_id = parsed_id
+
+ title = video.get('title')
+ description = video.get('description')
+
+ thumbnail = video.get('storyImage')
+ views = video.get('views')
+ likes = video.get('likesCount')
+ comments = video.get('commentsCount')
+ duration = video.get('videoDuration')
+ publishdate = video.get('publishDate') # Apparently epoch time, day only
+
+ uploader = video.get('username')
+ uploader_id = video.get('hostID')
+ # Construct an uploader URL
+ uploader_url = None
+ if uploader_id:
+ uploader_url = "https://storyfire.com/user/%s/video" % uploader_id
+
+ # Collect root playlist to determine formats
+ formats = self._extract_m3u8_formats(
+ videourl, video_id, 'mp4', 'm3u8_native')
+
+ # Modify formats to fill in missing information about audio codecs
+ for format in formats:
+ aformat = self._aformats.get(format['format_id'])
+ if aformat:
+ format['acodec'] = aformat['acodec']
+ format['abr'] = aformat['abr']
+ format['preference'] = aformat['preference']
+ format['ext'] = 'm4a'
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'ext': "mp4",
+ 'url': videourl,
+ 'formats': formats,
+
+ 'thumbnail': thumbnail,
+ 'view_count': views,
+ 'like_count': likes,
+ 'comment_count': comments,
+ 'duration': duration,
+ 'timestamp': publishdate,
+
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'uploader_url': uploader_url,
+
+ }
+
+
+class StoryFireUserIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?storyfire\.com/user/(?P<id>[^/\s]+)/video'
+ _TESTS = [{
+ 'url': 'https://storyfire.com/user/ntZAJFECERSgqHSxzonV5K2E89s1/video',
+ 'info_dict': {
+ 'id': 'ntZAJFECERSgqHSxzonV5K2E89s1',
+ 'title': 'whang!',
+ },
+ 'playlist_mincount': 18
+ }, {
+ 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video',
+ 'info_dict': {
+ 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2',
+ 'title': 'McJuggerNuggets',
+ },
+ 'playlist_mincount': 143
+
+ }]
+
+ # Generator for fetching playlist items
+ def _enum_videos(self, baseurl, user_id, firstjson):
+ totalVideos = int(firstjson['videosCount'])
+ haveVideos = 0
+ json = firstjson
+
+ for page in itertools.count(1):
+ for video in json['videos']:
+ id = video['_id']
+ url = "https://storyfire.com/video-details/%s" % id
+ haveVideos += 1
+ yield {
+ '_type': 'url',
+ 'id': id,
+ 'url': url,
+ 'ie_key': 'StoryFire',
+
+ 'title': video.get('title'),
+ 'description': video.get('description'),
+ 'view_count': video.get('views'),
+ 'comment_count': video.get('commentsCount'),
+ 'duration': video.get('videoDuration'),
+ 'timestamp': video.get('publishDate'),
+ }
+ # Are there more pages we could fetch?
+ if haveVideos < totalVideos:
+ pageurl = baseurl + ("%i" % haveVideos)
+ json = self._download_json(pageurl, user_id,
+ note='Downloading page %s' % page)
+
+ # Are there any videos in the new json?
+ videos = json.get('videos')
+ if not videos or len(videos) == 0:
+ break # no videos
+
+ else:
+ break # We have fetched all the videos, stop
+
+ def _real_extract(self, url):
+ user_id = self._match_id(url)
+
+ baseurl = "https://storyfire.com/app/publicVideos/%s?skip=" % user_id
+
+ # Download first page to ensure it can be downloaded, and get user information if available.
+ firstpage = baseurl + "0"
+ firstjson = self._download_json(firstpage, user_id)
+
+ title = None
+ videos = firstjson.get('videos')
+ if videos and len(videos):
+            title = videos[0].get('username')
+
+ return {
+ '_type': 'playlist',
+ 'entries': self._enum_videos(baseurl, user_id, firstjson),
+ 'id': user_id,
+ 'title': title,
+ }
+
+
+class StoryFireSeriesIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?storyfire\.com/write/series/stories/(?P<id>[^/\s]+)'
+ _TESTS = [{
+ 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/',
+ 'info_dict': {
+ 'id': '-Lq6MsuIHLODO6d2dDkr',
+ },
+ 'playlist_mincount': 13
+ }, {
+ 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/',
+ 'info_dict': {
+ 'id': 'the_mortal_one',
+ },
+ 'playlist_count': 0 # This playlist has entries, but no videos.
+ }, {
+ 'url': 'https://storyfire.com/write/series/stories/story_time',
+ 'info_dict': {
+ 'id': 'story_time',
+ },
+ 'playlist_mincount': 10
+ }]
+
+ # Generator for returning playlist items
+ # This object is substantially different than the one in the user videos page above
+ def _enum_videos(self, jsonlist):
+ for video in jsonlist:
+ id = video['_id']
+ if video.get('hasVideo'): # Boolean element
+ url = "https://storyfire.com/video-details/%s" % id
+ yield {
+ '_type': 'url',
+ 'id': id,
+ 'url': url,
+ 'ie_key': 'StoryFire',
+
+ 'title': video.get('title'),
+ 'description': video.get('description'),
+ 'view_count': video.get('views'),
+ 'likes_count': video.get('likesCount'),
+ 'comment_count': video.get('commentsCount'),
+ 'duration': video.get('videoDuration'),
+ 'timestamp': video.get('publishDate'),
+ }
+
+ def _real_extract(self, url):
+ list_id = self._match_id(url)
+
+ listurl = "https://storyfire.com/app/seriesStories/%s/list" % list_id
+ json = self._download_json(listurl, list_id)
+
+ return {
+ '_type': 'playlist',
+ 'entries': self._enum_videos(json),
+ 'id': list_id
+ }
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index e211cd4c8..3f0f7e277 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -380,11 +380,13 @@ class TwitchPlaylistBaseIE(TwitchBaseIE):
_PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d'
_PAGE_LIMIT = 100
- def _extract_playlist(self, channel_id):
+ def _extract_playlist(self, channel_name):
info = self._call_api(
- 'kraken/channels/%s' % channel_id,
- channel_id, 'Downloading channel info JSON')
- channel_name = info.get('display_name') or info.get('name')
+ 'kraken/users?login=%s' % channel_name,
+ channel_name, 'Downloading channel info JSON')
+ info = info['users'][0]
+ channel_id = info['_id']
+ channel_name = info.get('display_name') or info.get('name') or channel_name
entries = []
offset = 0
limit = self._PAGE_LIMIT
@@ -444,7 +446,7 @@ class TwitchProfileIE(TwitchPlaylistBaseIE):
_TESTS = [{
'url': 'http://www.twitch.tv/vanillatv/profile',
'info_dict': {
- 'id': 'vanillatv',
+ 'id': '22744919',
'title': 'VanillaTV',
},
'playlist_mincount': 412,
@@ -468,7 +470,7 @@ class TwitchAllVideosIE(TwitchVideosBaseIE):
_TESTS = [{
'url': 'https://www.twitch.tv/spamfish/videos/all',
'info_dict': {
- 'id': 'spamfish',
+ 'id': '497952',
'title': 'Spamfish',
},
'playlist_mincount': 869,
@@ -487,7 +489,7 @@ class TwitchUploadsIE(TwitchVideosBaseIE):
_TESTS = [{
'url': 'https://www.twitch.tv/spamfish/videos/uploads',
'info_dict': {
- 'id': 'spamfish',
+ 'id': '497952',
'title': 'Spamfish',
},
'playlist_mincount': 0,
@@ -506,7 +508,7 @@ class TwitchPastBroadcastsIE(TwitchVideosBaseIE):
_TESTS = [{
'url': 'https://www.twitch.tv/spamfish/videos/past-broadcasts',
'info_dict': {
- 'id': 'spamfish',
+ 'id': '497952',
'title': 'Spamfish',
},
'playlist_mincount': 0,
@@ -525,7 +527,7 @@ class TwitchHighlightsIE(TwitchVideosBaseIE):
_TESTS = [{
'url': 'https://www.twitch.tv/spamfish/videos/highlights',
'info_dict': {
- 'id': 'spamfish',
+ 'id': '497952',
'title': 'Spamfish',
},
'playlist_mincount': 805,
diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py
index d0e34c819..a03614cc1 100644
--- a/youtube_dl/extractor/videa.py
+++ b/youtube_dl/extractor/videa.py
@@ -2,15 +2,24 @@
from __future__ import unicode_literals
import re
+import random
+import string
+import struct
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
int_or_none,
mimetype2ext,
parse_codecs,
xpath_element,
xpath_text,
)
+from ..compat import (
+ compat_b64decode,
+ compat_ord,
+ compat_parse_qs,
+)
class VideaIE(InfoExtractor):
@@ -60,15 +69,63 @@ class VideaIE(InfoExtractor):
r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1',
webpage)]
+ def rc4(self, ciphertext, key):
+ res = b''
+
+ keyLen = len(key)
+ S = list(range(256))
+
+ j = 0
+ for i in range(256):
+ j = (j + S[i] + ord(key[i % keyLen])) % 256
+ S[i], S[j] = S[j], S[i]
+
+ i = 0
+ j = 0
+ for m in range(len(ciphertext)):
+ i = (i + 1) % 256
+ j = (j + S[i]) % 256
+ S[i], S[j] = S[j], S[i]
+ k = S[(S[i] + S[j]) % 256]
+ res += struct.pack("B", k ^ compat_ord(ciphertext[m]))
+
+ return res
+
def _real_extract(self, url):
video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id, fatal=True)
+ error = self._search_regex(r'<p class="error-text">([^<]+)</p>', webpage, 'error', default=None)
+ if error:
+ raise ExtractorError(error, expected=True)
+
+ video_src_params_raw = self._search_regex(r'<iframe[^>]+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"', webpage, 'video_src_params')
+ video_src_params = compat_parse_qs(video_src_params_raw)
+ player_page = self._download_webpage("https://videa.hu/videojs_player?%s" % video_src_params_raw, video_id, fatal=True)
+ nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce')
+ random_seed = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(8))
+ static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p'
+ l = nonce[:32]
+ s = nonce[32:]
+ result = ''
+ for i in range(0, 32):
+ result += s[i - (static_secret.index(l[i]) - 31)]
- info = self._download_xml(
+ video_src_params['_s'] = random_seed
+ video_src_params['_t'] = result[:16]
+ encryption_key_stem = result[16:] + random_seed
+
+ [b64_info, handle] = self._download_webpage_handle(
'http://videa.hu/videaplayer_get_xml.php', video_id,
- query={'v': video_id})
+ query=video_src_params, fatal=True)
+
+ encrypted_info = compat_b64decode(b64_info)
+ key = encryption_key_stem + handle.info()['x-videa-xs']
+ info_str = self.rc4(encrypted_info, key).decode('utf8')
+ info = self._parse_xml(info_str, video_id)
video = xpath_element(info, './/video', 'video', fatal=True)
sources = xpath_element(info, './/video_sources', 'sources', fatal=True)
+ hash_values = xpath_element(info, './/hash_values', 'hash_values', fatal=True)
title = xpath_text(video, './title', fatal=True)
@@ -77,6 +134,7 @@ class VideaIE(InfoExtractor):
source_url = source.text
if not source_url:
continue
+ source_url += '?md5=%s&expires=%s' % (hash_values.find('hash_value_%s' % source.get('name')).text, source.get('exp'))
f = parse_codecs(source.get('codecs'))
f.update({
'url': source_url,
diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py
index b0dcdc0e6..9e4171237 100644
--- a/youtube_dl/extractor/viki.py
+++ b/youtube_dl/extractor/viki.py
@@ -56,14 +56,14 @@ class VikiBaseIE(InfoExtractor):
def _call_api(self, path, video_id, note, timestamp=None, post_data=None):
resp = self._download_json(
- self._prepare_call(path, timestamp, post_data), video_id, note)
+ self._prepare_call(path, timestamp, post_data), video_id, note, headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404])
error = resp.get('error')
if error:
if error == 'invalid timestamp':
resp = self._download_json(
self._prepare_call(path, int(resp['current_timestamp']), post_data),
- video_id, '%s (retry)' % note)
+ video_id, '%s (retry)' % note, headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404])
error = resp.get('error')
if error:
self._raise_error(resp['error'])
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 0f7be6a7d..902a3ed33 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -20,13 +20,13 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
- _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster[27]\.com)'
+ _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com)'
_VALID_URL = r'''(?x)
https?://
(?:.+?\.)?%s/
(?:
- movies/(?P<id>\d+)/(?P<display_id>[^/]*)\.html|
- videos/(?P<display_id_2>[^/]*)-(?P<id_2>\d+)
+ movies/(?P<id>[\dA-Za-z]+)/(?P<display_id>[^/]*)\.html|
+ videos/(?P<display_id_2>[^/]*)-(?P<id_2>[\dA-Za-z]+)
)
''' % _DOMAINS
_TESTS = [{
@@ -100,11 +100,20 @@ class XHamsterIE(InfoExtractor):
'url': 'https://xhamster2.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
'only_matching': True,
}, {
+ 'url': 'https://xhamster11.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://xhamster26.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445',
+ 'only_matching': True,
+ }, {
'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
'only_matching': True,
}, {
'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
'only_matching': True,
+ }, {
+ 'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -129,7 +138,7 @@ class XHamsterIE(InfoExtractor):
initials = self._parse_json(
self._search_regex(
- r'window\.initials\s*=\s*({.+?})\s*;\s*\n', webpage, 'initials',
+ r'window\.initials\s*=\s*({.+?})\s*;', webpage, 'initials',
default='{}'),
video_id, fatal=False)
if initials:
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 9fff8bdf4..6e0bb6a12 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -1678,21 +1678,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_chapters_from_json(self, webpage, video_id, duration):
if not webpage:
return
- player = self._parse_json(
+ initial_data = self._parse_json(
self._search_regex(
- r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage,
+ r'window\["ytInitialData"\] = (.+);\n', webpage,
'player args', default='{}'),
video_id, fatal=False)
- if not player or not isinstance(player, dict):
- return
- watch_next_response = player.get('watch_next_response')
- if not isinstance(watch_next_response, compat_str):
- return
- response = self._parse_json(watch_next_response, video_id, fatal=False)
- if not response or not isinstance(response, dict):
+ if not initial_data or not isinstance(initial_data, dict):
return
chapters_list = try_get(
- response,
+ initial_data,
lambda x: x['playerOverlays']
['playerOverlayRenderer']
['decoratedPlayerBarRenderer']