diff options
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r-- | youtube_dl/extractor/biqle.py | 20 | ||||
-rw-r--r-- | youtube_dl/extractor/doodstream.py | 71 | ||||
-rw-r--r-- | youtube_dl/extractor/extractors.py | 7 | ||||
-rw-r--r-- | youtube_dl/extractor/francetv.py | 12 | ||||
-rw-r--r-- | youtube_dl/extractor/hrfensehen.py | 102 | ||||
-rw-r--r-- | youtube_dl/extractor/soundcloud.py | 97 | ||||
-rw-r--r-- | youtube_dl/extractor/storyfire.py | 255 | ||||
-rw-r--r-- | youtube_dl/extractor/twitch.py | 20 | ||||
-rw-r--r-- | youtube_dl/extractor/videa.py | 62 | ||||
-rw-r--r-- | youtube_dl/extractor/viki.py | 4 | ||||
-rw-r--r-- | youtube_dl/extractor/xhamster.py | 17 | ||||
-rw-r--r-- | youtube_dl/extractor/youtube.py | 14 |
12 files changed, 633 insertions, 48 deletions
diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py index af21e3ee5..17ebbb257 100644 --- a/youtube_dl/extractor/biqle.py +++ b/youtube_dl/extractor/biqle.py @@ -3,10 +3,11 @@ from __future__ import unicode_literals from .common import InfoExtractor from .vk import VKIE -from ..utils import ( - HEADRequest, - int_or_none, +from ..compat import ( + compat_b64decode, + compat_urllib_parse_unquote, ) +from ..utils import int_or_none class BIQLEIE(InfoExtractor): @@ -47,9 +48,16 @@ class BIQLEIE(InfoExtractor): if VKIE.suitable(embed_url): return self.url_result(embed_url, VKIE.ie_key(), video_id) - self._request_webpage( - HEADRequest(embed_url), video_id, headers={'Referer': url}) - video_id, sig, _, access_token = self._get_cookies(embed_url)['video_ext'].value.split('%3A') + embed_page = self._download_webpage( + embed_url, video_id, headers={'Referer': url}) + video_ext = self._get_cookies(embed_url).get('video_ext') + if video_ext: + video_ext = compat_urllib_parse_unquote(video_ext.value) + if not video_ext: + video_ext = compat_b64decode(self._search_regex( + r'video_ext\s*:\s*[\'"]([A-Za-z0-9+/=]+)', + embed_page, 'video_ext')).decode() + video_id, sig, _, access_token = video_ext.split(':') item = self._download_json( 'https://api.vk.com/method/video.get', video_id, headers={'User-Agent': 'okhttp/3.4.1'}, query={ diff --git a/youtube_dl/extractor/doodstream.py b/youtube_dl/extractor/doodstream.py new file mode 100644 index 000000000..2c9ea6898 --- /dev/null +++ b/youtube_dl/extractor/doodstream.py @@ -0,0 +1,71 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import string +import random +import time + +from .common import InfoExtractor + + +class DoodStreamIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?dood\.(?:to|watch)/[ed]/(?P<id>[a-z0-9]+)' + _TESTS = [{ + 'url': 'http://dood.to/e/5s1wmbdacezb', + 'md5': '4568b83b31e13242b3f1ff96c55f0595', + 'info_dict': { + 'id': '5s1wmbdacezb', + 'ext': 'mp4', + 'title': 'Kat Wonders - Monthly May 2020', + 'description': 'Kat Wonders - Monthly May 2020 | DoodStream.com', + 'thumbnail': 'https://img.doodcdn.com/snaps/flyus84qgl2fsk4g.jpg', + } + }, { + 'url': 'https://dood.to/d/jzrxn12t2s7n', + 'md5': '3207e199426eca7c2aa23c2872e6728a', + 'info_dict': { + 'id': 'jzrxn12t2s7n', + 'ext': 'mp4', + 'title': 'Stacy Cruz Cute ALLWAYSWELL', + 'description': 'Stacy Cruz Cute ALLWAYSWELL | DoodStream.com', + 'thumbnail': 'https://img.doodcdn.com/snaps/8edqd5nppkac3x8u.jpg', + } + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + if '/d/' in url: + url = "https://dood.to" + self._html_search_regex( + r'<iframe src="(/e/[a-z0-9]+)"', webpage, 'embed') + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta(['og:title', 'twitter:title'], + webpage, default=None) + thumb = self._html_search_meta(['og:image', 'twitter:image'], + webpage, default=None) + token = self._html_search_regex(r'[?&]token=([a-z0-9]+)[&\']', webpage, 'token') + description = self._html_search_meta( + ['og:description', 'description', 'twitter:description'], + webpage, default=None) + auth_url = 'https://dood.to' + self._html_search_regex( + r'(/pass_md5.*?)\'', webpage, 'pass_md5') + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/66.0', + 'referer': url + } + + webpage = self._download_webpage(auth_url, video_id, headers=headers) + final_url = webpage + ''.join([random.choice(string.ascii_letters + string.digits) for _ in range(10)]) + "?token=" + token + "&expiry=" + str(int(time.time() * 1000)) + + return { + 'id': video_id, + 'title': title, + 'url': final_url, + 'http_headers': headers, + 'ext': 'mp4', + 'description': description, + 'thumbnail': thumb, + } diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 4b3092028..e213b1bea 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -293,6 +293,7 @@ from .discoverynetworks import DiscoveryNetworksDeIE from .discoveryvr import DiscoveryVRIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE +from .doodstream import DoodStreamIE from .dropbox import DropboxIE from .dw import ( DWIE, @@ -440,6 +441,7 @@ from .hotstar import ( ) from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrfensehen import HRFernsehenIE from .hrti import ( HRTiIE, HRTiPlaylistIE, @@ -1057,6 +1059,11 @@ from .spike import ( BellatorIE, ParamountNetworkIE, ) +from .storyfire import ( + StoryFireIE, + StoryFireUserIE, + StoryFireSeriesIE, +) from .stitcher import StitcherIE from .sport5 import Sport5IE from .sportbox import SportBoxIE diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 81b468c7d..e340cddba 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -316,13 +316,14 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)' _TESTS = [{ - 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', + 'url': 'https://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-jeudi-22-aout-2019_3561461.html', 'info_dict': { - 'id': '84981923', + 'id': 'd12458ee-5062-48fe-bfdd-a30d6a01b793', 'ext': 'mp4', 'title': 'Soir 3', - 'upload_date': '20130826', - 'timestamp': 1377548400, + 'upload_date': '20190822', + 'timestamp': 1566510900, + 'description': 'md5:72d167097237701d6e8452ff03b83c00', 'subtitles': { 'fr': 'mincount:2', }, @@ -374,7 +375,8 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): video_id = self._search_regex( (r'player\.load[^;]+src:\s*["\']([^"\']+)', r'id-video=([^@]+@[^"]+)', - r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), + r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"', + r'data-id="([^"]+)"'), webpage, 'video id') return self._make_url_result(video_id) diff --git a/youtube_dl/extractor/hrfensehen.py b/youtube_dl/extractor/hrfensehen.py new file mode 100644 index 000000000..2beadef2c --- /dev/null +++ b/youtube_dl/extractor/hrfensehen.py @@ -0,0 +1,102 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from youtube_dl.utils import int_or_none, unified_timestamp, unescapeHTML +from .common import InfoExtractor + + +class HRFernsehenIE(InfoExtractor): + IE_NAME = 'hrfernsehen' + _VALID_URL = r'^https?://www\.(?:hr-fernsehen|hessenschau)\.de/.*,video-(?P<id>[0-9]{6})\.html' + + _TESTS = [{ + 'url': 'https://www.hessenschau.de/tv-sendung/hessenschau-vom-26082020,video-130546.html', + 'md5': '5c4e0ba94677c516a2f65a84110fc536', + 'info_dict': { + 'id': '130546', + 'ext': 'mp4', + 'description': 'Sturmtief Kirsten fegt über Hessen / Die Corona-Pandemie – eine Chronologie / ' + 'Sterbehilfe: Die Lage in Hessen / Miss Hessen leitet zwei eigene Unternehmen / ' + 'Pop-Up Museum zeigt Schwarze Unterhaltung und Black Music', + 'subtitles': {'de': [{ + 'url': 'https://hr-a.akamaihd.net/video/as/hessenschau/2020_08/hrLogo_200826200407_L385592_512x288-25p-500kbit.vtt' + }]}, + 'timestamp': 1598470200, + 'upload_date': '20200826', + 'thumbnails': [{ + 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9.jpg', + 'id': '0' + }, { + 'url': 'https://www.hessenschau.de/tv-sendung/hs_ganz-1554~_t-1598465545029_v-16to9__medium.jpg', + 'id': '1' + }], + 'title': 'hessenschau vom 26.08.2020' + } + }, { + 'url': 'https://www.hr-fernsehen.de/sendungen-a-z/mex/sendungen/fair-und-gut---was-hinter-aldis-eigenem-guetesiegel-steckt,video-130544.html', + 'only_matching': True + }] + + _GEO_COUNTRIES = ['DE'] + + def extract_airdate(self, loader_data): + airdate_str = loader_data.get('mediaMetadata', {}).get('agf', {}).get('airdate') + + if airdate_str is None: + return None + + return unified_timestamp(airdate_str) + + def extract_formats(self, loader_data): + stream_formats = [] + for stream_obj in loader_data["videoResolutionLevels"]: + stream_format = { + 'format_id': str(stream_obj['verticalResolution']) + "p", + 'height': stream_obj['verticalResolution'], + 'url': stream_obj['url'], + } + + quality_information = re.search(r'([0-9]{3,4})x([0-9]{3,4})-([0-9]{2})p-([0-9]{3,4})kbit', + stream_obj['url']) + if quality_information: + stream_format['width'] = int_or_none(quality_information.group(1)) + stream_format['height'] = int_or_none(quality_information.group(2)) + stream_format['fps'] = int_or_none(quality_information.group(3)) + stream_format['tbr'] = int_or_none(quality_information.group(4)) + + stream_formats.append(stream_format) + + self._sort_formats(stream_formats) + return stream_formats + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + ['og:title', 'twitter:title', 'name'], webpage) + description = self._html_search_meta( + ['description'], webpage) + + loader_str = unescapeHTML(self._search_regex(r"data-hr-mediaplayer-loader='([^']*)'", webpage, "ardloader")) + loader_data = json.loads(loader_str) + + info = { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': self.extract_formats(loader_data), + 'timestamp': self.extract_airdate(loader_data) + } + + if "subtitle" in loader_data: + info["subtitles"] = {"de": [{"url": loader_data["subtitle"]}]} + + thumbnails = list(set([t for t in loader_data.get("previewImageUrl", {}).values()])) + if len(thumbnails) > 0: + info["thumbnails"] = [{"url": t} for t in thumbnails] + + return info diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index d37c52543..ac09cb5e6 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals import itertools import re +import json +import random from .common import ( InfoExtractor, @@ -28,6 +30,7 @@ from ..utils import ( update_url_query, url_or_none, urlhandle_detect_ext, + sanitized_Request, ) @@ -309,7 +312,81 @@ class SoundcloudIE(InfoExtractor): raise def _real_initialize(self): - self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk' + self._CLIENT_ID = self._downloader.cache.load('soundcloud', 'client_id') or "T5R4kgWS2PRf6lzLyIravUMnKlbIxQag" # 'EXLwg5lHTO2dslU5EePe3xkw0m1h86Cd' # 'YUKXoArFcqrlQn9tfNHvvyfnDISj04zk' + self._login() + + _USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36" + _API_AUTH_QUERY_TEMPLATE = '?client_id=%s' + _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s' + _access_token = None + _HEADERS = {} + _NETRC_MACHINE = 'soundcloud' + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + def genDevId(): + def genNumBlock(): + return ''.join([str(random.randrange(10)) for i in range(6)]) + return '-'.join([genNumBlock() for i in range(4)]) + + payload = { + 'client_id': self._CLIENT_ID, + 'recaptcha_pubkey': 'null', + 'recaptcha_response': 'null', + 'credentials': { + 'identifier': username, + 'password': password + }, + 'signature': self.sign(username, password, self._CLIENT_ID), + 'device_id': genDevId(), + 'user_agent': self._USER_AGENT + } + + query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID + login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8')) + response = self._download_json(login, None) + self._access_token = response.get('session').get('access_token') + if not self._access_token: + self.report_warning('Unable to get access token, login may has failed') + else: + self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} + + # signature generation + def sign(self, user, pw, clid): + a = 33 + i = 1 + s = 440123 + w = 117 + u = 1800000 + l = 1042 + b = 37 + k = 37 + c = 5 + n = "0763ed7314c69015fd4a0dc16bbf4b90" # _KEY + y = "8" # _REV + r = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36" # _USER_AGENT + e = user # _USERNAME + t = clid # _CLIENT_ID + + d = '-'.join([str(mInt) for mInt in [a, i, s, w, u, l, b, k]]) + p = n + y + d + r + e + t + d + n + h = p + + m = 8011470 + f = 0 + + for f in range(f, len(h)): + m = (m >> 1) + ((1 & m) << 23) + m += ord(h[f]) + m &= 16777215 + + # c is not even needed + out = str(y) + ':' + str(d) + ':' + format(m, 'x') + ':' + str(c) + + return out @classmethod def _resolv_url(cls, url): @@ -389,7 +466,7 @@ class SoundcloudIE(InfoExtractor): if not format_url: continue stream = self._download_json( - format_url, track_id, query=query, fatal=False) + format_url, track_id, query=query, fatal=False, headers=self._HEADERS) if not isinstance(stream, dict): continue stream_url = url_or_none(stream.get('url')) @@ -487,7 +564,7 @@ class SoundcloudIE(InfoExtractor): info_json_url = self._resolv_url(self._BASE_URL + resolve_title) info = self._download_json( - info_json_url, full_title, 'Downloading info JSON', query=query) + info_json_url, full_title, 'Downloading info JSON', query=query, headers=self._HEADERS) return self._extract_info_dict(info, full_title, token) @@ -503,7 +580,7 @@ class SoundcloudPlaylistBaseIE(SoundcloudIE): 'ids': ','.join([compat_str(t['id']) for t in tracks]), 'playlistId': playlist_id, 'playlistSecretToken': token, - }) + }, headers=self._HEADERS) entries = [] for track in tracks: track_id = str_or_none(track.get('id')) @@ -547,7 +624,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE): full_title += '/' + token info = self._download_json(self._resolv_url( - self._BASE_URL + full_title), full_title) + self._BASE_URL + full_title), full_title, headers=self._HEADERS) if 'errors' in info: msgs = (compat_str(err['error_message']) for err in info['errors']) @@ -572,7 +649,7 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudIE): for i in itertools.count(): response = self._download_json( next_href, playlist_id, - 'Downloading track page %s' % (i + 1), query=query) + 'Downloading track page %s' % (i + 1), query=query, headers=self._HEADERS) collection = response['collection'] @@ -694,7 +771,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE): user = self._download_json( self._resolv_url(self._BASE_URL + uploader), - uploader, 'Downloading user info') + uploader, 'Downloading user info', headers=self._HEADERS) resource = mobj.group('rsrc') or 'all' @@ -719,7 +796,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE): def _real_extract(self, url): track_name = self._match_id(url) - track = self._download_json(self._resolv_url(url), track_name) + track = self._download_json(self._resolv_url(url), track_name, headers=self._HEADERS) track_id = self._search_regex( r'soundcloud:track-stations:(\d+)', track['id'], 'track id') @@ -752,7 +829,7 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE): data = self._download_json( self._API_V2_BASE + 'playlists/' + playlist_id, - playlist_id, 'Downloading playlist', query=query) + playlist_id, 'Downloading playlist', query=query, headers=self._HEADERS) return self._extract_set(data, token) @@ -789,7 +866,7 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE): for i in itertools.count(1): response = self._download_json( next_url, collection_id, 'Downloading page {0}'.format(i), - 'Unable to download API page') + 'Unable to download API page', headers=self._HEADERS) collection = response.get('collection', []) if not collection: diff --git a/youtube_dl/extractor/storyfire.py b/youtube_dl/extractor/storyfire.py new file mode 100644 index 000000000..67457cc94 --- /dev/null +++ b/youtube_dl/extractor/storyfire.py @@ -0,0 +1,255 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools +from .common import InfoExtractor + + +class StoryFireIE(InfoExtractor): + _VALID_URL = r'(?:(?:https?://(?:www\.)?storyfire\.com/video-details)|(?:https://storyfire.app.link))/(?P<id>[^/\s]+)' + _TESTS = [{ + 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181', + 'md5': '560953bfca81a69003cfa5e53ac8a920', + 'info_dict': { + 'id': '5df1d132b6378700117f9181', + 'ext': 'mp4', + 'title': 'Buzzfeed Teaches You About Memes', + 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'timestamp': 1576129028, + 'description': 'Mocking Buzzfeed\'s meme lesson. Reuploaded from YouTube because of their new policies', + 'uploader': 'whang!', + 'upload_date': '20191212', + }, + 'params': {'format': 'bestvideo'} # There are no merged formats in the playlist. + }, { + 'url': 'https://storyfire.app.link/5GxAvWOQr8', # Alternate URL format, with unrelated short ID + 'md5': '7a2dc6d60c4889edfed459c620fe690d', + 'info_dict': { + 'id': '5f1e11ecd78a57b6c702001d', + 'ext': 'm4a', + 'title': 'Weird Nintendo Prototype Leaks', + 'description': 'A stream taking a look at some weird Nintendo Prototypes with Luigi in Mario 64 and weird Yoshis', + 'timestamp': 1595808576, + 'upload_date': '20200727', + 'uploader': 'whang!', + 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + }, + 'params': {'format': 'bestaudio'} # Verifying audio extraction + + }] + + _aformats = { + 'audio-medium-audio': {'acodec': 'aac', 'abr': 125, 'preference': -10}, + 'audio-high-audio': {'acodec': 'aac', 'abr': 254, 'preference': -1}, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + # Extracting the json blob is mandatory to proceed with extraction. + jsontext = self._html_search_regex( + r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', + webpage, 'json_data') + + json = self._parse_json(jsontext, video_id) + + # The currentVideo field in the json is mandatory + # because it contains the only link to the m3u playlist + video = json['props']['initialState']['video']['currentVideo'] + videourl = video['vimeoVideoURL'] # Video URL is mandatory + + # Extract other fields from the json in an error tolerant fashion + # ID may be incorrect (on short URL format), correct it. + parsed_id = video.get('_id') + if parsed_id: + video_id = parsed_id + + title = video.get('title') + description = video.get('description') + + thumbnail = video.get('storyImage') + views = video.get('views') + likes = video.get('likesCount') + comments = video.get('commentsCount') + duration = video.get('videoDuration') + publishdate = video.get('publishDate') # Apparently epoch time, day only + + uploader = video.get('username') + uploader_id = video.get('hostID') + # Construct an uploader URL + uploader_url = None + if uploader_id: + uploader_url = "https://storyfire.com/user/%s/video" % uploader_id + + # Collect root playlist to determine formats + formats = self._extract_m3u8_formats( + videourl, video_id, 'mp4', 'm3u8_native') + + # Modify formats to fill in missing information about audio codecs + for format in formats: + aformat = self._aformats.get(format['format_id']) + if aformat: + format['acodec'] = aformat['acodec'] + format['abr'] = aformat['abr'] + format['preference'] = aformat['preference'] + format['ext'] = 'm4a' + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'ext': "mp4", + 'url': videourl, + 'formats': formats, + + 'thumbnail': thumbnail, + 'view_count': views, + 'like_count': likes, + 'comment_count': comments, + 'duration': duration, + 'timestamp': publishdate, + + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + + } + + +class StoryFireUserIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?storyfire\.com/user/(?P<id>[^/\s]+)/video' + _TESTS = [{ + 'url': 'https://storyfire.com/user/ntZAJFECERSgqHSxzonV5K2E89s1/video', + 'info_dict': { + 'id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'title': 'whang!', + }, + 'playlist_mincount': 18 + }, { + 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', + 'info_dict': { + 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', + 'title': 'McJuggerNuggets', + }, + 'playlist_mincount': 143 + + }] + + # Generator for fetching playlist items + def _enum_videos(self, baseurl, user_id, firstjson): + totalVideos = int(firstjson['videosCount']) + haveVideos = 0 + json = firstjson + + for page in itertools.count(1): + for video in json['videos']: + id = video['_id'] + url = "https://storyfire.com/video-details/%s" % id + haveVideos += 1 + yield { + '_type': 'url', + 'id': id, + 'url': url, + 'ie_key': 'StoryFire', + + 'title': video.get('title'), + 'description': video.get('description'), + 'view_count': video.get('views'), + 'comment_count': video.get('commentsCount'), + 'duration': video.get('videoDuration'), + 'timestamp': video.get('publishDate'), + } + # Are there more pages we could fetch? + if haveVideos < totalVideos: + pageurl = baseurl + ("%i" % haveVideos) + json = self._download_json(pageurl, user_id, + note='Downloading page %s' % page) + + # Are there any videos in the new json? + videos = json.get('videos') + if not videos or len(videos) == 0: + break # no videos + + else: + break # We have fetched all the videos, stop + + def _real_extract(self, url): + user_id = self._match_id(url) + + baseurl = "https://storyfire.com/app/publicVideos/%s?skip=" % user_id + + # Download first page to ensure it can be downloaded, and get user information if available. + firstpage = baseurl + "0" + firstjson = self._download_json(firstpage, user_id) + + title = None + videos = firstjson.get('videos') + if videos and len(videos): + title = videos[1].get('username') + + return { + '_type': 'playlist', + 'entries': self._enum_videos(baseurl, user_id, firstjson), + 'id': user_id, + 'title': title, + } + + +class StoryFireSeriesIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?storyfire\.com/write/series/stories/(?P<id>[^/\s]+)' + _TESTS = [{ + 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', + 'info_dict': { + 'id': '-Lq6MsuIHLODO6d2dDkr', + }, + 'playlist_mincount': 13 + }, { + 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', + 'info_dict': { + 'id': 'the_mortal_one', + }, + 'playlist_count': 0 # This playlist has entries, but no videos. + }, { + 'url': 'https://storyfire.com/write/series/stories/story_time', + 'info_dict': { + 'id': 'story_time', + }, + 'playlist_mincount': 10 + }] + + # Generator for returning playlist items + # This object is substantially different than the one in the user videos page above + def _enum_videos(self, jsonlist): + for video in jsonlist: + id = video['_id'] + if video.get('hasVideo'): # Boolean element + url = "https://storyfire.com/video-details/%s" % id + yield { + '_type': 'url', + 'id': id, + 'url': url, + 'ie_key': 'StoryFire', + + 'title': video.get('title'), + 'description': video.get('description'), + 'view_count': video.get('views'), + 'likes_count': video.get('likesCount'), + 'comment_count': video.get('commentsCount'), + 'duration': video.get('videoDuration'), + 'timestamp': video.get('publishDate'), + } + + def _real_extract(self, url): + list_id = self._match_id(url) + + listurl = "https://storyfire.com/app/seriesStories/%s/list" % list_id + json = self._download_json(listurl, list_id) + + return { + '_type': 'playlist', + 'entries': self._enum_videos(json), + 'id': list_id + } diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index e211cd4c8..3f0f7e277 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -380,11 +380,13 @@ class TwitchPlaylistBaseIE(TwitchBaseIE): _PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d' _PAGE_LIMIT = 100 - def _extract_playlist(self, channel_id): + def _extract_playlist(self, channel_name): info = self._call_api( - 'kraken/channels/%s' % channel_id, - channel_id, 'Downloading channel info JSON') - channel_name = info.get('display_name') or info.get('name') + 'kraken/users?login=%s' % channel_name, + channel_name, 'Downloading channel info JSON') + info = info['users'][0] + channel_id = info['_id'] + channel_name = info.get('display_name') or info.get('name') or channel_name entries = [] offset = 0 limit = self._PAGE_LIMIT @@ -444,7 +446,7 @@ class TwitchProfileIE(TwitchPlaylistBaseIE): _TESTS = [{ 'url': 'http://www.twitch.tv/vanillatv/profile', 'info_dict': { - 'id': 'vanillatv', + 'id': '22744919', 'title': 'VanillaTV', }, 'playlist_mincount': 412, @@ -468,7 +470,7 @@ class TwitchAllVideosIE(TwitchVideosBaseIE): _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/all', 'info_dict': { - 'id': 'spamfish', + 'id': '497952', 'title': 'Spamfish', }, 'playlist_mincount': 869, @@ -487,7 +489,7 @@ class TwitchUploadsIE(TwitchVideosBaseIE): _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/uploads', 'info_dict': { - 'id': 'spamfish', + 'id': '497952', 'title': 'Spamfish', }, 'playlist_mincount': 0, @@ -506,7 +508,7 @@ class TwitchPastBroadcastsIE(TwitchVideosBaseIE): _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/past-broadcasts', 'info_dict': { - 'id': 'spamfish', + 'id': '497952', 'title': 'Spamfish', }, 'playlist_mincount': 0, @@ -525,7 +527,7 @@ class TwitchHighlightsIE(TwitchVideosBaseIE): _TESTS = [{ 'url': 'https://www.twitch.tv/spamfish/videos/highlights', 'info_dict': { - 'id': 'spamfish', + 'id': '497952', 'title': 'Spamfish', }, 'playlist_mincount': 805, diff --git a/youtube_dl/extractor/videa.py b/youtube_dl/extractor/videa.py index d0e34c819..a03614cc1 100644 --- a/youtube_dl/extractor/videa.py +++ b/youtube_dl/extractor/videa.py @@ -2,15 +2,24 @@ from __future__ import unicode_literals import re +import random +import string +import struct from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, mimetype2ext, parse_codecs, xpath_element, xpath_text, ) +from ..compat import ( + compat_b64decode, + compat_ord, + compat_parse_qs, +) class VideaIE(InfoExtractor): @@ -60,15 +69,63 @@ class VideaIE(InfoExtractor): r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//videa\.hu/player\?.*?\bv=.+?)\1', webpage)] + def rc4(self, ciphertext, key): + res = b'' + + keyLen = len(key) + S = list(range(256)) + + j = 0 + for i in range(256): + j = (j + S[i] + ord(key[i % keyLen])) % 256 + S[i], S[j] = S[j], S[i] + + i = 0 + j = 0 + for m in range(len(ciphertext)): + i = (i + 1) % 256 + j = (j + S[i]) % 256 + S[i], S[j] = S[j], S[i] + k = S[(S[i] + S[j]) % 256] + res += struct.pack("B", k ^ compat_ord(ciphertext[m])) + + return res + def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id, fatal=True) + error = self._search_regex(r'<p class="error-text">([^<]+)</p>', webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + + video_src_params_raw = self._search_regex(r'<iframe[^>]+id="videa_player_iframe"[^>]+src="/player\?([^"]+)"', webpage, 'video_src_params') + video_src_params = compat_parse_qs(video_src_params_raw) + player_page = self._download_webpage("https://videa.hu/videojs_player?%s" % video_src_params_raw, video_id, fatal=True) + nonce = self._search_regex(r'_xt\s*=\s*"([^"]+)"', player_page, 'nonce') + random_seed = ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(8)) + static_secret = 'xHb0ZvME5q8CBcoQi6AngerDu3FGO9fkUlwPmLVY_RTzj2hJIS4NasXWKy1td7p' + l = nonce[:32] + s = nonce[32:] + result = '' + for i in range(0, 32): + result += s[i - (static_secret.index(l[i]) - 31)] - info = self._download_xml( + video_src_params['_s'] = random_seed + video_src_params['_t'] = result[:16] + encryption_key_stem = result[16:] + random_seed + + [b64_info, handle] = self._download_webpage_handle( 'http://videa.hu/videaplayer_get_xml.php', video_id, - query={'v': video_id}) + query=video_src_params, fatal=True) + + encrypted_info = compat_b64decode(b64_info) + key = encryption_key_stem + handle.info()['x-videa-xs'] + info_str = self.rc4(encrypted_info, key).decode('utf8') + info = self._parse_xml(info_str, video_id) video = xpath_element(info, './/video', 'video', fatal=True) sources = xpath_element(info, './/video_sources', 'sources', fatal=True) + hash_values = xpath_element(info, './/hash_values', 'hash_values', fatal=True) title = xpath_text(video, './title', fatal=True) @@ -77,6 +134,7 @@ class VideaIE(InfoExtractor): source_url = source.text if not source_url: continue + source_url += '?md5=%s&expires=%s' % (hash_values.find('hash_value_%s' % source.get('name')).text, source.get('exp')) f = parse_codecs(source.get('codecs')) f.update({ 'url': source_url, diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index b0dcdc0e6..9e4171237 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -56,14 +56,14 @@ class VikiBaseIE(InfoExtractor): def _call_api(self, path, video_id, note, timestamp=None, post_data=None): resp = self._download_json( - self._prepare_call(path, timestamp, post_data), video_id, note) + self._prepare_call(path, timestamp, post_data), video_id, note, headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404]) error = resp.get('error') if error: if error == 'invalid timestamp': resp = self._download_json( self._prepare_call(path, int(resp['current_timestamp']), post_data), - video_id, '%s (retry)' % note) + video_id, '%s (retry)' % note, headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404]) error = resp.get('error') if error: self._raise_error(resp['error']) diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 0f7be6a7d..902a3ed33 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -20,13 +20,13 @@ from ..utils import ( class XHamsterIE(InfoExtractor): - _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster[27]\.com)' + _DOMAINS = r'(?:xhamster\.(?:com|one|desi)|xhms\.pro|xhamster\d+\.com)' _VALID_URL = r'''(?x) https?:// (?:.+?\.)?%s/ (?: - movies/(?P<id>\d+)/(?P<display_id>[^/]*)\.html| - videos/(?P<display_id_2>[^/]*)-(?P<id_2>\d+) + movies/(?P<id>[\dA-Za-z]+)/(?P<display_id>[^/]*)\.html| + videos/(?P<display_id_2>[^/]*)-(?P<id_2>[\dA-Za-z]+) ) ''' % _DOMAINS _TESTS = [{ @@ -100,11 +100,20 @@ class XHamsterIE(InfoExtractor): 'url': 'https://xhamster2.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', 'only_matching': True, }, { + 'url': 'https://xhamster11.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'only_matching': True, + }, { + 'url': 'https://xhamster26.com/videos/femaleagent-shy-beauty-takes-the-bait-1509445', + 'only_matching': True, + }, { 'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html', 'only_matching': True, }, { 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', 'only_matching': True, + }, { + 'url': 'http://de.xhamster.com/videos/skinny-girl-fucks-herself-hard-in-the-forest-xhnBJZx', + 'only_matching': True, }] def _real_extract(self, url): @@ -129,7 +138,7 @@ class XHamsterIE(InfoExtractor): initials = self._parse_json( self._search_regex( - r'window\.initials\s*=\s*({.+?})\s*;\s*\n', webpage, 'initials', + r'window\.initials\s*=\s*({.+?})\s*;', webpage, 'initials', default='{}'), video_id, fatal=False) if initials: diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9fff8bdf4..6e0bb6a12 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1678,21 +1678,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_chapters_from_json(self, webpage, video_id, duration): if not webpage: return - player = self._parse_json( + initial_data = self._parse_json( self._search_regex( - r'RELATED_PLAYER_ARGS["\']\s*:\s*({.+})\s*,?\s*\n', webpage, + r'window\["ytInitialData"\] = (.+);\n', webpage, 'player args', default='{}'), video_id, fatal=False) - if not player or not isinstance(player, dict): - return - watch_next_response = player.get('watch_next_response') - if not isinstance(watch_next_response, compat_str): - return - response = self._parse_json(watch_next_response, video_id, fatal=False) - if not response or not isinstance(response, dict): + if not initial_data or not isinstance(initial_data, dict): return chapters_list = try_get( - response, + initial_data, lambda x: x['playerOverlays'] ['playerOverlayRenderer'] ['decoratedPlayerBarRenderer'] |