diff options
Diffstat (limited to 'youtube_dlc')
-rw-r--r-- | youtube_dlc/YoutubeDL.py | 18 | ||||
-rw-r--r-- | youtube_dlc/downloader/common.py | 24 | ||||
-rw-r--r-- | youtube_dlc/extractor/adobepass.py | 7 | ||||
-rw-r--r-- | youtube_dlc/extractor/bandcamp.py | 180 | ||||
-rw-r--r-- | youtube_dlc/extractor/brightcove.py | 13 | ||||
-rw-r--r-- | youtube_dlc/extractor/extractors.py | 5 | ||||
-rw-r--r-- | youtube_dlc/extractor/mtv.py | 7 | ||||
-rw-r--r-- | youtube_dlc/extractor/netzkino.py | 47 | ||||
-rw-r--r-- | youtube_dlc/extractor/newgrounds.py | 107 | ||||
-rw-r--r-- | youtube_dlc/extractor/nitter.py | 167 | ||||
-rw-r--r-- | youtube_dlc/extractor/skyitalia.py | 119 | ||||
-rw-r--r-- | youtube_dlc/extractor/tvland.py | 2 | ||||
-rw-r--r-- | youtube_dlc/extractor/urplay.py | 49 | ||||
-rw-r--r-- | youtube_dlc/extractor/xtube.py | 47 | ||||
-rw-r--r-- | youtube_dlc/extractor/youtube.py | 48 | ||||
-rw-r--r-- | youtube_dlc/options.py | 2 | ||||
-rw-r--r-- | youtube_dlc/update.py | 20 | ||||
-rw-r--r-- | youtube_dlc/utils.py | 4 |
18 files changed, 651 insertions, 215 deletions
diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index fc351db0d..dd55ba0f2 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -801,7 +801,7 @@ class YoutubeDL(object): for key, value in extra_info.items(): info_dict.setdefault(key, value) - def extract_info(self, url, download=True, ie_key=None, extra_info={}, + def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_info={}, process=True, force_generic_extractor=False): ''' Returns a list with a dictionary for each video we find. @@ -836,6 +836,11 @@ class YoutubeDL(object): '_type': 'compat_list', 'entries': ie_result, } + if info_dict: + if info_dict.get('id'): + ie_result['id'] = info_dict['id'] + if info_dict.get('title'): + ie_result['title'] = info_dict['title'] self.add_default_extra_info(ie_result, ie, url) if process: return self.process_ie_result(ie_result, download, extra_info) @@ -898,7 +903,7 @@ class YoutubeDL(object): # We have to add extra_info to the results because it may be # contained in a playlist return self.extract_info(ie_result['url'], - download, + download, info_dict=ie_result, ie_key=ie_result.get('ie_key'), extra_info=extra_info) elif result_type == 'url_transparent': @@ -1852,13 +1857,13 @@ class YoutubeDL(object): self.report_error('Cannot write annotations file: ' + annofn) return - def dl(name, info): + def dl(name, info, subtitle=False): fd = get_suitable_downloader(info, self.params)(self, self.params) for ph in self._progress_hooks: fd.add_progress_hook(ph) if self.params.get('verbose'): self.to_stdout('[debug] Invoking downloader on %r' % info.get('url')) - return fd.download(name, info) + return fd.download(name, info, subtitle) subtitles_are_requested = any([self.params.get('writesubtitles', False), self.params.get('writeautomaticsub')]) @@ -1867,7 +1872,7 @@ class YoutubeDL(object): # subtitles download errors are already managed as troubles in relevant IE # that way it will silently go on when used with unsupporting IE subtitles = info_dict['requested_subtitles'] - ie = self.get_info_extractor(info_dict['extractor_key']) + # ie = self.get_info_extractor(info_dict['extractor_key']) for sub_lang, sub_info in subtitles.items(): sub_format = sub_info['ext'] sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext')) @@ -1886,6 +1891,8 @@ class YoutubeDL(object): return else: try: + dl(sub_filename, sub_info, subtitle=True) + ''' if self.params.get('sleep_interval_subtitles', False): dl(sub_filename, sub_info) else: @@ -1893,6 +1900,7 @@ class YoutubeDL(object): sub_info['url'], info_dict['id'], note=False).read() with io.open(encodeFilename(sub_filename), 'wb') as subfile: subfile.write(sub_data) + ''' except (ExtractorError, IOError, OSError, ValueError, compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self.report_warning('Unable to download subtitle for "%s": %s' % (sub_lang, error_to_compat_str(err))) diff --git a/youtube_dlc/downloader/common.py b/youtube_dlc/downloader/common.py index 31c286458..460364a0b 100644 --- a/youtube_dlc/downloader/common.py +++ b/youtube_dlc/downloader/common.py @@ -326,7 +326,7 @@ class FileDownloader(object): """Report it was impossible to resume download.""" self.to_screen('[download] Unable to resume') - def download(self, filename, info_dict): + def download(self, filename, info_dict, subtitle=False): """Download to a filename using the info from info_dict Return True on success and False otherwise """ @@ -353,16 +353,22 @@ class FileDownloader(object): }) return True - min_sleep_interval = self.params.get('sleep_interval') - if min_sleep_interval: - max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) - sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) + if subtitle is False: + min_sleep_interval = self.params.get('sleep_interval') + if min_sleep_interval: + max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) + sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) + self.to_screen( + '[download] Sleeping %s seconds...' % ( + int(sleep_interval) if sleep_interval.is_integer() + else '%.2f' % sleep_interval)) + time.sleep(sleep_interval) + else: + sleep_interval_sub = self.params.get('sleep_interval_subtitles') self.to_screen( '[download] Sleeping %s seconds...' % ( - int(sleep_interval) if sleep_interval.is_integer() - else '%.2f' % sleep_interval)) - time.sleep(sleep_interval) - + int(sleep_interval_sub))) + time.sleep(sleep_interval_sub) return self.real_download(filename, info_dict) def real_download(self, filename, info_dict): diff --git a/youtube_dlc/extractor/adobepass.py b/youtube_dlc/extractor/adobepass.py index 38dca1b0a..649f9940f 100644 --- a/youtube_dlc/extractor/adobepass.py +++ b/youtube_dlc/extractor/adobepass.py @@ -1438,6 +1438,13 @@ class AdobePassIE(InfoExtractor): provider_redirect_page, 'oauth redirect') self._download_webpage( oauth_redirect_url, video_id, 'Confirming auto login') + elif 'automatically signed in with' in provider_redirect_page: + # Seems like comcast is rolling up new way of automatically signing customers + oauth_redirect_url = self._html_search_regex( + r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page, + 'oauth redirect (signed)') + # Just need to process the request. No useful data comes back + self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login') else: if '<form name="signin"' in provider_redirect_page: provider_login_page_res = provider_redirect_page_res diff --git a/youtube_dlc/extractor/bandcamp.py b/youtube_dlc/extractor/bandcamp.py index 9dbafe86d..0e7492764 100644 --- a/youtube_dlc/extractor/bandcamp.py +++ b/youtube_dlc/extractor/bandcamp.py @@ -25,10 +25,48 @@ from ..utils import ( ) -class BandcampIE(InfoExtractor): +class BandcampBaseIE(InfoExtractor): + """Provide base functions for Bandcamp extractors""" + + def _extract_json_from_html_data_attribute(self, webpage, suffix, video_id): + json_string = self._html_search_regex( + r' data-%s="([^"]*)' % suffix, + webpage, '%s json' % suffix, default='{}') + + return self._parse_json(json_string, video_id) + + def _parse_json_track(self, json): + formats = [] + file_ = json.get('file') + if isinstance(file_, dict): + for format_id, format_url in file_.items(): + if not url_or_none(format_url): + continue + ext, abr_str = format_id.split('-', 1) + formats.append({ + 'format_id': format_id, + 'url': self._proto_relative_url(format_url, 'http:'), + 'ext': ext, + 'vcodec': 'none', + 'acodec': ext, + 'abr': int_or_none(abr_str), + }) + + return { + 'duration': float_or_none(json.get('duration')), + 'id': str_or_none(json.get('track_id') or json.get('id')), + 'title': json.get('title'), + 'title_link': json.get('title_link'), + 'number': int_or_none(json.get('track_num')), + 'formats': formats + } + + +class BandcampIE(BandcampBaseIE): + IE_NAME = "Bandcamp:track" _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)' _TESTS = [{ - 'url': 'http://youtube-dlc.bandcamp.com/track/youtube-dlc-test-song', + 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song', 'md5': 'c557841d5e50261777a6585648adf439', 'info_dict': { 'id': '1812978515', @@ -85,52 +123,32 @@ class BandcampIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') + url_track_title = title webpage = self._download_webpage(url, title) thumbnail = self._html_search_meta('og:image', webpage, default=None) - track_id = None - track = None - track_number = None - duration = None + json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", url_track_title) + json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", url_track_title) - formats = [] - trackinfo_block = self._html_search_regex( - r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', - webpage, 'track info', default='{}') - - track_info = self._parse_json(trackinfo_block, title) - if track_info: - file_ = track_info.get('file') - if isinstance(file_, dict): - for format_id, format_url in file_.items(): - if not url_or_none(format_url): - continue - ext, abr_str = format_id.split('-', 1) - formats.append({ - 'format_id': format_id, - 'url': self._proto_relative_url(format_url, 'http:'), - 'ext': ext, - 'vcodec': 'none', - 'acodec': ext, - 'abr': int_or_none(abr_str), - }) - - track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) - track_number = int_or_none(track_info.get('track_num')) - duration = float_or_none(track_info.get('duration')) - - def extract(key): - data = self._html_search_regex( - r',(["\']|")%s\1:\1(?P<value>(?:\\\1|((?!\1).))+)\1' % key, - webpage, key, default=None, group='value') - return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data - - track = extract('title') - artist = extract('artist') - album = extract('album_title') - timestamp = unified_timestamp( - extract('publish_date') or extract('album_publish_date')) - release_date = unified_strdate(extract('album_release_date')) + json_tracks = json_tralbum.get('trackinfo') + if not json_tracks: + raise ExtractorError('Could not extract track') + + track = self._parse_json_track(json_tracks[0]) + artist = json_tralbum.get('artist') + album_title = json_embed.get('album_title') + + json_album = json_tralbum.get('packages') + if json_album: + json_album = json_album[0] + album_publish_date = json_album.get('album_publish_date') + album_release_date = json_album.get('album_release_date') + else: + album_publish_date = None + album_release_date = json_tralbum.get('album_release_date') + + timestamp = unified_timestamp(json_tralbum.get('current', {}).get('publish_date') or album_publish_date) + release_date = unified_strdate(album_release_date) download_link = self._search_regex( r'freeDownloadPage(?:["\']|"):\s*(["\']|")(?P<url>(?:(?!\1).)+)\1', webpage, @@ -155,8 +173,6 @@ class BandcampIE(InfoExtractor): if info: downloads = info.get('downloads') if isinstance(downloads, dict): - if not track: - track = info.get('title') if not artist: artist = info.get('artist') if not thumbnail: @@ -190,7 +206,7 @@ class BandcampIE(InfoExtractor): retry_url = url_or_none(stat.get('retry_url')) if not retry_url: continue - formats.append({ + track['formats'].append({ 'url': self._proto_relative_url(retry_url, 'http:'), 'ext': download_formats.get(format_id), 'format_id': format_id, @@ -199,32 +215,28 @@ class BandcampIE(InfoExtractor): 'vcodec': 'none', }) - self._sort_formats(formats) + self._sort_formats(track['formats']) - title = '%s - %s' % (artist, track) if artist else track - - if not duration: - duration = float_or_none(self._html_search_meta( - 'duration', webpage, default=None)) + title = '%s - %s' % (artist, track.get('title')) if artist else track.get('title') return { - 'id': track_id, - 'title': title, + 'album': album_title, + 'artist': artist, + 'duration': track['duration'], + 'formats': track['formats'], + 'id': track['id'], + 'release_date': release_date, 'thumbnail': thumbnail, - 'uploader': artist, 'timestamp': timestamp, - 'release_date': release_date, - 'duration': duration, - 'track': track, - 'track_number': track_number, - 'track_id': track_id, - 'artist': artist, - 'album': album, - 'formats': formats, + 'title': title, + 'track': track['title'], + 'track_id': track['id'], + 'track_number': track['number'], + 'uploader': artist } -class BandcampAlbumIE(InfoExtractor): +class BandcampAlbumIE(BandcampBaseIE): IE_NAME = 'Bandcamp:album' _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?' @@ -305,34 +317,32 @@ class BandcampAlbumIE(InfoExtractor): album_id = mobj.group('album_id') playlist_id = album_id or uploader_id webpage = self._download_webpage(url, playlist_id) - track_elements = re.findall( - r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage) - if not track_elements: - raise ExtractorError('The page doesn\'t contain any tracks') + + json_tralbum = self._extract_json_from_html_data_attribute(webpage, "tralbum", playlist_id) + json_embed = self._extract_json_from_html_data_attribute(webpage, "embed", playlist_id) + + json_tracks = json_tralbum.get('trackinfo') + if not json_tracks: + raise ExtractorError('Could not extract album tracks') + + album_title = json_embed.get('album_title') + # Only tracks with duration info have songs + tracks = [self._parse_json_track(track) for track in json_tracks] entries = [ self.url_result( - compat_urlparse.urljoin(url, t_path), - ie=BandcampIE.ie_key(), - video_title=self._search_regex( - r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)', - elem_content, 'track title', fatal=False)) - for elem_content, t_path in track_elements - if self._html_search_meta('duration', elem_content, default=None)] - - title = self._html_search_regex( - r'album_title\s*(?:"|["\']):\s*("|["\'])(?P<album>(?:\\\1|((?!\1).))+)\1', - webpage, 'title', fatal=False, group='album') - - if title: - title = title.replace(r'\"', '"') + compat_urlparse.urljoin(url, track['title_link']), + ie=BandcampIE.ie_key(), video_id=track['id'], + video_title=track['title']) + for track in tracks + if track.get('duration')] return { '_type': 'playlist', 'uploader_id': uploader_id, 'id': playlist_id, - 'title': title, - 'entries': entries, + 'title': album_title, + 'entries': entries } diff --git a/youtube_dlc/extractor/brightcove.py b/youtube_dlc/extractor/brightcove.py index 2aa9f4782..638673c31 100644 --- a/youtube_dlc/extractor/brightcove.py +++ b/youtube_dlc/extractor/brightcove.py @@ -471,12 +471,17 @@ class BrightcoveNewIE(AdobePassIE): title = json_data['name'].strip() formats = [] + sources_num = len(json_data.get('sources')) + key_systems_present = 0 for source in json_data.get('sources', []): container = source.get('container') ext = mimetype2ext(source.get('type')) src = source.get('src') - # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object - if ext == 'ism' or container == 'WVM' or source.get('key_systems'): + # https://apis.support.brightcove.com/playback/references/playback-api-video-fields-reference.html + if source.get('key_systems'): + key_systems_present += 1 + continue + elif ext == 'ism' or container == 'WVM': continue elif ext == 'm3u8' or container == 'M2TS': if not src: @@ -533,6 +538,10 @@ class BrightcoveNewIE(AdobePassIE): 'format_id': build_format_id('rtmp'), }) formats.append(f) + + if sources_num == key_systems_present: + raise ExtractorError('This video is DRM protected', expected=True) + if not formats: # for sonyliv.com DRM protected videos s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl') diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index d31edd7c8..666134d86 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -751,6 +751,7 @@ from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE +from .nitter import NitterIE from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .noco import NocoIE @@ -1037,6 +1038,10 @@ from .sky import ( SkyNewsIE, SkySportsIE, ) +from .skyitalia import ( + SkyArteItaliaIE, + SkyItaliaIE, +) from .slideshare import SlideshareIE from .slideslive import SlidesLiveIE from .slutload import SlutloadIE diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py index 6b3658397..04cc95b6a 100644 --- a/youtube_dlc/extractor/mtv.py +++ b/youtube_dlc/extractor/mtv.py @@ -289,7 +289,7 @@ class MTVServicesInfoExtractor(InfoExtractor): return mgid - def _extract_mgid(self, webpage, url, data_zone=None): + def _extract_mgid(self, webpage, url, title=None, data_zone=None): try: # the url can be http://media.mtvnservices.com/fb/{mgid}.swf # or http://media.mtvnservices.com/{mgid} @@ -300,7 +300,8 @@ class MTVServicesInfoExtractor(InfoExtractor): except RegexNotFoundError: mgid = None - title = self._match_id(url) + if not title: + title = url_basename(url) try: window_data = self._parse_json(self._search_regex( @@ -336,7 +337,7 @@ class MTVServicesInfoExtractor(InfoExtractor): def _real_extract(self, url): title = url_basename(url) webpage = self._download_webpage(url, title) - mgid = self._extract_mgid(webpage, url) + mgid = self._extract_mgid(webpage, url, title=title) videos_info = self._get_videos_info(mgid, url=url) return videos_info diff --git a/youtube_dlc/extractor/netzkino.py b/youtube_dlc/extractor/netzkino.py index aec3026b1..3d1a06d0b 100644 --- a/youtube_dlc/extractor/netzkino.py +++ b/youtube_dlc/extractor/netzkino.py @@ -13,17 +13,16 @@ from ..utils import ( class NetzkinoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)' + _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/[^/]+/(?P<id>[^/]+)' - _TEST = { - 'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond', + _TESTS = [{ + 'url': 'https://www.netzkino.de/#!/scifikino/rakete-zum-mond', 'md5': '92a3f8b76f8d7220acce5377ea5d4873', 'info_dict': { 'id': 'rakete-zum-mond', 'ext': 'mp4', - 'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)', - 'comments': 'mincount:3', - 'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28', + 'title': 'Rakete zum Mond \u2013 Jules Verne', + 'description': 'md5:f0a8024479618ddbfa450ff48ffa6c60', 'upload_date': '20120813', 'thumbnail': r're:https?://.*\.jpg$', 'timestamp': 1344858571, @@ -32,17 +31,30 @@ class NetzkinoIE(InfoExtractor): 'params': { 'skip_download': 'Download only works from Germany', } - } + }, { + 'url': 'https://www.netzkino.de/#!/filme/dr-jekyll-mrs-hyde-2', + 'md5': 'c7728b2dadd04ff6727814847a51ef03', + 'info_dict': { + 'id': 'dr-jekyll-mrs-hyde-2', + 'ext': 'mp4', + 'title': 'Dr. Jekyll & Mrs. Hyde 2', + 'description': 'md5:c2e9626ebd02de0a794b95407045d186', + 'upload_date': '20190130', + 'thumbnail': r're:https?://.*\.jpg$', + 'timestamp': 1548849437, + 'age_limit': 18, + }, + 'params': { + 'skip_download': 'Download only works from Germany', + } + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - category_id = mobj.group('category') video_id = mobj.group('id') - api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id - api_info = self._download_json(api_url, video_id) - info = next( - p for p in api_info['posts'] if p['slug'] == video_id) + api_url = 'https://api.netzkino.de.simplecache.net/capi-2.0a/movies/%s.json?d=www' % video_id + info = self._download_json(api_url, video_id) custom_fields = info['custom_fields'] production_js = self._download_webpage( @@ -67,23 +79,12 @@ class NetzkinoIE(InfoExtractor): } for key, tpl in templates.items()] self._sort_formats(formats) - comments = [{ - 'timestamp': parse_iso8601(c.get('date'), delimiter=' '), - 'id': c['id'], - 'author': c['name'], - 'html': c['content'], - 'parent': 'root' if c.get('parent', 0) == 0 else c['parent'], - } for c in info.get('comments', [])] - return { 'id': video_id, 'formats': formats, - 'comments': comments, 'title': info['title'], 'age_limit': int_or_none(custom_fields.get('FSK')[0]), 'timestamp': parse_iso8601(info.get('date'), delimiter=' '), 'description': clean_html(info.get('content')), 'thumbnail': info.get('thumbnail'), - 'playlist_title': api_info.get('title'), - 'playlist_id': category_id, } diff --git a/youtube_dlc/extractor/newgrounds.py b/youtube_dlc/extractor/newgrounds.py index 82e7cf522..b9f01235f 100644 --- a/youtube_dlc/extractor/newgrounds.py +++ b/youtube_dlc/extractor/newgrounds.py @@ -4,6 +4,7 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, extract_attributes, int_or_none, parse_duration, @@ -20,22 +21,22 @@ class NewgroundsIE(InfoExtractor): 'info_dict': { 'id': '549479', 'ext': 'mp3', - 'title': 'B7 - BusMode', + 'title': 'Burn7 - B7 - BusMode', 'uploader': 'Burn7', 'timestamp': 1378878540, 'upload_date': '20130911', 'duration': 143, }, }, { - 'url': 'https://www.newgrounds.com/portal/view/673111', - 'md5': '3394735822aab2478c31b1004fe5e5bc', + 'url': 'https://www.newgrounds.com/portal/view/1', + 'md5': 'fbfb40e2dc765a7e830cb251d370d981', 'info_dict': { - 'id': '673111', + 'id': '1', 'ext': 'mp4', - 'title': 'Dancin', - 'uploader': 'Squirrelman82', - 'timestamp': 1460256780, - 'upload_date': '20160410', + 'title': 'Brian-Beaton - Scrotum 1', + 'uploader': 'Brian-Beaton', + 'timestamp': 955064100, + 'upload_date': '20000406', }, }, { # source format unavailable, additional mp4 formats @@ -43,7 +44,7 @@ class NewgroundsIE(InfoExtractor): 'info_dict': { 'id': '689400', 'ext': 'mp4', - 'title': 'ZTV News Episode 8', + 'title': 'Bennettthesage - ZTV News Episode 8', 'uploader': 'BennettTheSage', 'timestamp': 1487965140, 'upload_date': '20170224', @@ -55,42 +56,73 @@ class NewgroundsIE(InfoExtractor): def _real_extract(self, url): media_id = self._match_id(url) - + formats = [] + uploader = None webpage = self._download_webpage(url, media_id) title = self._html_search_regex( r'<title>([^>]+)</title>', webpage, 'title') - media_url = self._parse_json(self._search_regex( - r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id) - - formats = [{ - 'url': media_url, - 'format_id': 'source', - 'quality': 1, - }] - - max_resolution = int_or_none(self._search_regex( - r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution', - default=None)) - if max_resolution: - url_base = media_url.rpartition('.')[0] - for resolution in (360, 720, 1080): - if resolution > max_resolution: - break - formats.append({ - 'url': '%s.%dp.mp4' % (url_base, resolution), - 'format_id': '%dp' % resolution, - 'height': resolution, - }) + media_url_string = self._search_regex( + r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None, fatal=False) + + if media_url_string: + media_url = self._parse_json(media_url_string, media_id) + formats = [{ + 'url': media_url, + 'format_id': 'source', + 'quality': 1, + }] + + max_resolution = int_or_none(self._search_regex( + r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution', + default=None)) + if max_resolution: + url_base = media_url.rpartition('.')[0] + for resolution in (360, 720, 1080): + if resolution > max_resolution: + break + formats.append({ + 'url': '%s.%dp.mp4' % (url_base, resolution), + 'format_id': '%dp' % resolution, + 'height': resolution, + }) + else: + video_id = int_or_none(self._search_regex( + r'data-movie-id=\\"([0-9]+)\\"', webpage, '')) + if not video_id: + raise ExtractorError('Could not extract media data') + + url_video_data = 'https://www.newgrounds.com/portal/video/%s' % video_id + headers = { + 'Accept': 'application/json', + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest' + } + json_video = self._download_json(url_video_data, video_id, headers=headers, fatal=False) + if not json_video: + raise ExtractorError('Could not fetch media data') + + uploader = json_video.get('author') + title = json_video.get('title') + media_formats = json_video.get('sources', []) + for media_format in media_formats: + media_sources = media_formats[media_format] + for source in media_sources: + formats.append({ + 'format_id': media_format, + 'quality': int_or_none(media_format[:-1]), + 'url': source.get('src') + }) self._check_formats(formats, media_id) self._sort_formats(formats) - uploader = self._html_search_regex( - (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*Author\s*</em>', - r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader', - fatal=False) + if not uploader: + uploader = self._html_search_regex( + (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*(?:Author|Artist)\s*</em>', + r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader', + fatal=False) timestamp = unified_timestamp(self._html_search_regex( (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)', @@ -109,6 +141,9 @@ class NewgroundsIE(InfoExtractor): if '<dd>Song' in webpage: formats[0]['vcodec'] = 'none' + if uploader: + title = "%s - %s" % (uploader, title) + return { 'id': media_id, 'title': title, diff --git a/youtube_dlc/extractor/nitter.py b/youtube_dlc/extractor/nitter.py new file mode 100644 index 000000000..3191543ed --- /dev/null +++ b/youtube_dlc/extractor/nitter.py @@ -0,0 +1,167 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + parse_count, + unified_strdate, + unified_timestamp, + remove_end, + determine_ext, +) +import re + + +class NitterIE(InfoExtractor): + # Taken from https://github.com/zedeus/nitter/wiki/Instances + INSTANCES = ('nitter.net', + 'nitter.snopyta.org', + 'nitter.42l.fr', + 'nitter.nixnet.services', + 'nitter.13ad.de', + 'nitter.pussthecat.org', + 'nitter.mastodont.cat', + 'nitter.dark.fail', + 'nitter.tedomum.net', + 'nitter.cattube.org', + 'nitter.fdn.fr', + 'nitter.1d4.us', + 'nitter.kavin.rocks', + 'tweet.lambda.dance', + 'nitter.cc', + 'nitter.weaponizedhumiliation.com', + '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion', + 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion', + 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion') + + _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')' + _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE} + current_instance = INSTANCES[0] # the test and official instance + _TESTS = [ + { + # GIF (wrapped in mp4) + 'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m', + 'info_dict': { + 'id': '1314279897502629888', + 'ext': 'mp4', + 'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet', + 'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Firefox 🔥', + 'uploader_id': 'firefox', + 'uploader_url': 'https://' + current_instance + '/firefox', + 'upload_date': '20201008', + 'timestamp': 1602183720, + }, + }, { # normal video + 'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m', + 'info_dict': { + 'id': '1299715685392756737', + 'ext': 'mp4', + 'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...', + 'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Le Doc', + 'uploader_id': 'Le___Doc', + 'uploader_url': 'https://' + current_instance + '/Le___Doc', + 'upload_date': '20200829', + 'timestamp': 1598711341, + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + }, + }, { # video embed in a "Streaming Political Ads" box + 'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m', + 'info_dict': { + 'id': '1321147074491092994', + 'ext': 'mp4', + 'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds", + 'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds", + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Mozilla', + 'uploader_id': 'mozilla', + 'uploader_url': 'https://' + current_instance + '/mozilla', + 'upload_date': '20201027', + 'timestamp': 1603820982 + }, + }, + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + parsed_url = compat_urlparse.urlparse(url) + base_url = parsed_url.scheme + '://' + parsed_url.netloc + + self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on') + webpage = self._download_webpage(url, video_id) + + video_url = base_url + self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url') + ext = determine_ext(video_url) + + if ext == 'unknown_video': + formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4') + else: + formats = [{ + 'url': video_url, + 'ext': ext + }] + + title = ( + self._og_search_description(webpage).replace('\n', ' ') + or self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title')) + description = title + + mobj = re.match(self._VALID_URL, url) + uploader_id = ( + mobj.group('uploader_id') + or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)) + + if uploader_id: + uploader_url = base_url + '/' + uploader_id + + uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False) + + if uploader: + title = uploader + ' - ' + title + + view_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)</div>', webpage, 'view count', fatal=False)) + like_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)</div>', webpage, 'like count', fatal=False)) + repost_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False)) + comment_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False)) + + thumbnail = base_url + (self._html_search_meta('og:image', webpage, 'thumbnail url') + or self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False)) + + thumbnail = remove_end(thumbnail, '%3Asmall') # if parsed with regex, it should contain this + + thumbnails = [] + thumbnail_ids = ('thumb', 'small', 'large', 'medium', 'orig') + for id in thumbnail_ids: + thumbnails.append({ + 'id': id, + 'url': thumbnail + '%3A' + id, + }) + + date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False) + upload_date = unified_strdate(date) + timestamp = unified_timestamp(date) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'timestamp': timestamp, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'view_count': view_count, + 'like_count': like_count, + 'repost_count': repost_count, + 'comment_count': comment_count, + 'formats': formats, + 'thumbnails': thumbnails, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + } diff --git a/youtube_dlc/extractor/skyitalia.py b/youtube_dlc/extractor/skyitalia.py new file mode 100644 index 000000000..3c7bd465d --- /dev/null +++ b/youtube_dlc/extractor/skyitalia.py @@ -0,0 +1,119 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ExtractorError + + +class SkyItaliaBaseIE(InfoExtractor): + _GET_VIDEO_DATA = 'https://apid.sky.it/vdp/v1/getVideoData?token={token}&caller=sky&rendition=web&id={id}' + _RES = { + 'low': [426, 240], + 'med': [640, 360], + 'high': [854, 480], + 'hd': [1280, 720] + } + + def _extract_video_id(self, url): + webpage = self._download_webpage(url, 'skyitalia') + video_id = self._html_search_regex( + [r'data-videoid=\"(\d+)\"', + r'http://player\.sky\.it/social\?id=(\d+)\&'], + webpage, 'video_id') + if video_id: + return video_id + raise ExtractorError('Video ID not found.') + + def _get_formats(self, video_id, token): + data_url = self._GET_VIDEO_DATA.replace('{id}', video_id) + data_url = data_url.replace('{token}', token) + video_data = self._parse_json( + self._download_webpage(data_url, video_id), + video_id) + + formats = [] + for q, r in self._RES.items(): + key = 'web_%s_url' % q + if key not in video_data: + continue + formats.append({ + 'url': video_data.get(key), + 'format_id': q, + 'width': r[0], + 'height': r[1] + }) + + self._sort_formats(formats) + title = video_data.get('title') + thumb = video_data.get('thumb') + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumb, + 'formats': formats + } + + def _real_extract(self, url): + video_id = self._match_id(url) + if video_id == 'None': + video_id = self._extract_video_id(url) + return self._get_formats(video_id, self._TOKEN) + + +class SkyItaliaIE(SkyItaliaBaseIE): + IE_NAME = 'sky.it' + _VALID_URL = r'''(?x)https?:// + (?P<ie>sport|tg24|video) + \.sky\.it/(?:.+?) + (?P<id>[0-9]{6})? + (?:$|\?)''' + + _TESTS = [{ + 'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162', + 'md5': '9c03b590b06e5952d8051f0e02b0feca', + 'info_dict': { + 'id': '616162', + 'ext': 'mp4', + 'title': 'MotoGP, GP Emilia Romagna: gli highlights delle prove libere', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/18/1600441214452_hl-libere-motogp-misano2_5602634_thumbnail_1.jpg', + } + }, { + 'url': 'https://sport.sky.it/motogp/2020/09/18/motogp-gp-emilia-romagna-misano-2020-prove-libere-diretta', + 'md5': '9c03b590b06e5952d8051f0e02b0feca', + 'info_dict': { + 'id': '616162', + 'ext': 'mp4', + 'title': 'MotoGP, GP Emilia Romagna: gli highlights delle prove libere', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/18/1600441214452_hl-libere-motogp-misano2_5602634_thumbnail_1.jpg', + } + }, { + 'url': 'https://tg24.sky.it/salute-e-benessere/2020/09/18/coronavirus-vaccino-ue-sanofi', + 'md5': 'caa25e62dadb529bc5e0b078da99f854', + 'info_dict': { + 'id': '615904', + 'ext': 'mp4', + 'title': 'Covid-19, al Buzzi di Milano tamponi drive-in per studenti', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/17/1600351405841_error-coronavirus-al-buzzi-di-milano-tamponi_thumbnail_1.jpg', + } + }, { + 'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162?itm_source=parsely-api', + 'only_matching': True, + }] + _TOKEN = 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk' + + +class SkyArteItaliaIE(SkyItaliaBaseIE): + IE_NAME = 'arte.sky.it' + _VALID_URL = r'https?://arte\.sky\.it/video/.+?(?P<id>[0-9]{6})?$' + _TEST = { + 'url': 'https://arte.sky.it/video/federico-fellini-maestri-cinema/', + 'md5': '2f22513a89f45142f2746f878d690647', + 'info_dict': { + 'id': '612888', + 'ext': 'mp4', + 'title': 'I maestri del cinema Federico Felini', + 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/03/1599146747305_i-maestri-del-cinema-federico-felini_thumbnail_1.jpg', + } + } + _TOKEN = 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd' diff --git a/youtube_dlc/extractor/tvland.py b/youtube_dlc/extractor/tvland.py index 791144128..225b6b078 100644 --- a/youtube_dlc/extractor/tvland.py +++ b/youtube_dlc/extractor/tvland.py @@ -3,6 +3,8 @@ from __future__ import unicode_literals from .spike import ParamountNetworkIE +# TODO: Remove - Reason not used anymore - Service moved to youtube + class TVLandIE(ParamountNetworkIE): IE_NAME = 'tvland.com' diff --git a/youtube_dlc/extractor/urplay.py b/youtube_dlc/extractor/urplay.py index 6030b7cb5..4bc2b78fb 100644 --- a/youtube_dlc/extractor/urplay.py +++ b/youtube_dlc/extractor/urplay.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..utils import unified_timestamp +import re class URPlayIE(InfoExtractor): @@ -13,10 +14,10 @@ class URPlayIE(InfoExtractor): 'info_dict': { 'id': '203704', 'ext': 'mp4', - 'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd', + 'title': 'Om vetenskap, kritiskt tänkande och motstånd', 'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a', - 'timestamp': 1513512768, - 'upload_date': '20171217', + 'timestamp': 1513292400, + 'upload_date': '20171214', }, }, { 'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde', @@ -37,35 +38,41 @@ class URPlayIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - urplayer_data = self._parse_json(self._search_regex( - r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) - host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + urplayer_data = re.sub(""", "\"", self._search_regex( + r'components\/Player\/Player\" data-react-props=\"({.+?})\"', + webpage, 'urplayer data')) + urplayer_data = self._parse_json(urplayer_data, video_id) + for i in range(len(urplayer_data['accessibleEpisodes'])): + if urplayer_data.get('accessibleEpisodes', {})[i].get('id') == int(video_id): + urplayer_data = urplayer_data['accessibleEpisodes'][i] + break + host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] formats = [] - for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): - file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) - if file_http: + urplayer_streams = urplayer_data.get("streamingInfo") + for quality in ('sd'), ('hd'): + location = (urplayer_streams.get("raw", {}).get(quality, {}).get("location") + or urplayer_streams.get("sweComplete", {}).get(quality, {}).get("location")) + if location: formats.extend(self._extract_wowza_formats( - 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp'])) + 'http://%s/%s/playlist.m3u8' % (host, location), video_id, + skip_protocols=['f4m', 'rtmp', 'rtsp'])) self._sort_formats(formats) - subtitles = {} - for subtitle in urplayer_data.get('subtitles', []): - subtitle_url = subtitle.get('file') - kind = subtitle.get('kind') - if not subtitle_url or (kind and kind != 'captions'): - continue - subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ - 'url': subtitle_url, + subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location") + if subs: + subtitles.setdefault('Svenska', []).append({ + 'url': subs, }) return { 'id': video_id, 'title': urplayer_data['title'], 'description': self._og_search_description(webpage), - 'thumbnail': urplayer_data.get('image'), - 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')), - 'series': urplayer_data.get('series_title'), + 'thumbnail': urplayer_data.get('image', {}).get('1280x720'), + 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), + webpage, 'timestamp')), + 'series': urplayer_data.get('seriesTitle'), 'subtitles': subtitles, 'formats': formats, } diff --git a/youtube_dlc/extractor/xtube.py b/youtube_dlc/extractor/xtube.py index 01b253dcb..081c5e2e7 100644 --- a/youtube_dlc/extractor/xtube.py +++ b/youtube_dlc/extractor/xtube.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, js_to_json, orderedSet, @@ -33,28 +34,12 @@ class XTubeIE(InfoExtractor): 'title': 'strange erotica', 'description': 'contains:an ET kind of thing', 'uploader': 'greenshowers', - 'duration': 450, + 'duration': 449, 'view_count': int, 'comment_count': int, 'age_limit': 18, } }, { - # FLV videos with duplicated formats - 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', - 'md5': 'a406963eb349dd43692ec54631efd88b', - 'info_dict': { - 'id': '9299752', - 'display_id': 'A-Super-Run-Part-1-YT', - 'ext': 'flv', - 'title': 'A Super Run - Part 1 (YT)', - 'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616', - 'uploader': 'tshirtguy59', - 'duration': 579, - 'view_count': int, - 'comment_count': int, - 'age_limit': 18, - }, - }, { # new URL schema 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', 'only_matching': True, @@ -89,16 +74,24 @@ class XTubeIE(InfoExtractor): title, thumbnail, duration = [None] * 3 - config = self._parse_json(self._search_regex( - r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config', - default='{}'), video_id, transform_source=js_to_json, fatal=False) - if config: - config = config.get('mainRoll') - if isinstance(config, dict): - title = config.get('title') - thumbnail = config.get('poster') - duration = int_or_none(config.get('duration')) - sources = config.get('sources') or config.get('format') + json_config_string = self._search_regex( + r'playerConf=({.+?}),loaderConf', + webpage, 'config', default=None) + if not json_config_string: + raise ExtractorError("Could not extract video player data") + + json_config_string = json_config_string.replace("!0", "true").replace("!1", "false") + + config = self._parse_json(json_config_string, video_id, transform_source=js_to_json, fatal=False) + if not config: + raise ExtractorError("Could not extract video player data") + + config = config.get('mainRoll') + if isinstance(config, dict): + title = config.get('title') + thumbnail = config.get('poster') + duration = int_or_none(config.get('duration')) + sources = config.get('sources') or config.get('format') if not isinstance(sources, dict): sources = self._parse_json(self._search_regex( diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py index 4fb49b864..d605f1e74 100644 --- a/youtube_dlc/extractor/youtube.py +++ b/youtube_dlc/extractor/youtube.py @@ -1406,6 +1406,44 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return self._parse_json( uppercase_escape(config), video_id, fatal=False) + def _get_music_metadata_from_yt_initial(self, yt_initial): + music_metadata = [] + key_map = { + 'Album': 'album', + 'Artist': 'artist', + 'Song': 'track' + } + contents = try_get(yt_initial, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents']) + if type(contents) is list: + for content in contents: + music_track = {} + if type(content) is not dict: + continue + videoSecondaryInfoRenderer = try_get(content, lambda x: x['videoSecondaryInfoRenderer']) + if type(videoSecondaryInfoRenderer) is not dict: + continue + rows = try_get(videoSecondaryInfoRenderer, lambda x: x['metadataRowContainer']['metadataRowContainerRenderer']['rows']) + if type(rows) is not list: + continue + for row in rows: + metadataRowRenderer = try_get(row, lambda x: x['metadataRowRenderer']) + if type(metadataRowRenderer) is not dict: + continue + key = try_get(metadataRowRenderer, lambda x: x['title']['simpleText']) + value = try_get(metadataRowRenderer, lambda x: x['contents'][0]['simpleText']) or \ + try_get(metadataRowRenderer, lambda x: x['contents'][0]['runs'][0]['text']) + if type(key) is not str or type(value) is not str: + continue + if key in key_map: + if key_map[key] in music_track: + # we've started on a new track + music_metadata.append(music_track) + music_track = {} + music_track[key_map[key]] = value + if len(music_track.keys()): + music_metadata.append(music_track) + return music_metadata + def _get_automatic_captions(self, video_id, webpage): """We need the webpage for getting the captions url, pass it as an argument to speed up the process.""" @@ -2051,7 +2089,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if cipher: if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True): - ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")' + ASSETS_RE = r'(?:"assets":.+?"js":\s*("[^"]+"))|(?:"jsUrl":\s*("[^"]+"))' jsplayer_url_json = self._search_regex( ASSETS_RE, embed_webpage if age_gate else video_webpage, @@ -2328,6 +2366,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if release_year: release_year = int(release_year) + yt_initial = self._get_yt_initial_data(video_id, video_webpage) + if yt_initial: + music_metadata = self._get_music_metadata_from_yt_initial(yt_initial) + if len(music_metadata): + album = music_metadata[0].get('album') + artist = music_metadata[0].get('artist') + track = music_metadata[0].get('track') + m_episode = re.search( r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', video_webpage) diff --git a/youtube_dlc/options.py b/youtube_dlc/options.py index 1d7a7fed2..66b45220c 100644 --- a/youtube_dlc/options.py +++ b/youtube_dlc/options.py @@ -582,7 +582,7 @@ def parseOpts(overrideArguments=None): 'along with --min-sleep-interval.')) workarounds.add_option( '--sleep-subtitles', - dest='sleep_interval_subtitles', action='store_true', default=False, + dest='sleep_interval_subtitles', action='store_true', default=0, help='Enforce sleep interval on subtitles as well') verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') diff --git a/youtube_dlc/update.py b/youtube_dlc/update.py index e49e09c17..b358e902b 100644 --- a/youtube_dlc/update.py +++ b/youtube_dlc/update.py @@ -37,10 +37,26 @@ def update_self(to_screen, verbose, opener): JSON_URL = UPDATE_URL + 'versions.json' UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537) + def sha256sum(): + h = hashlib.sha256() + b = bytearray(128 * 1024) + mv = memoryview(b) + with open(os.path.realpath(sys.executable), 'rb', buffering=0) as f: + for n in iter(lambda: f.readinto(mv), 0): + h.update(mv[:n]) + return h.hexdigest() + + to_screen('Current Build Hash %s' % sha256sum()) + if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, 'frozen'): to_screen('It looks like you installed youtube-dlc with a package manager, pip, setup.py or a tarball. Please use that to update.') return + # compiled file.exe can find itself by + # to_screen(os.path.basename(sys.executable)) + # and path to py or exe + # to_screen(os.path.realpath(sys.executable)) + # Check if there is a new version try: newversion = opener.open(VERSION_URL).read().decode('utf-8').strip() @@ -48,6 +64,7 @@ def update_self(to_screen, verbose, opener): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: can\'t find the current version. Please try again later.') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return if newversion == __version__: to_screen('youtube-dlc is up-to-date (' + __version__ + ')') @@ -61,6 +78,7 @@ def update_self(to_screen, verbose, opener): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: can\'t obtain versions info. Please try again later.') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return if 'signature' not in versions_info: to_screen('ERROR: the versions file is not signed or corrupted. Aborting.') @@ -109,6 +127,7 @@ def update_self(to_screen, verbose, opener): if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to download latest version') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return newcontent_hash = hashlib.sha256(newcontent).hexdigest() @@ -155,6 +174,7 @@ start /b "" cmd /c del "%%~f0"&exit /b" if verbose: to_screen(encode_compat_str(traceback.format_exc())) to_screen('ERROR: unable to download latest version') + to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest') return newcontent_hash = hashlib.sha256(newcontent).hexdigest() diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py index 54a4ea2aa..f5dc1bdaf 100644 --- a/youtube_dlc/utils.py +++ b/youtube_dlc/utils.py @@ -2320,8 +2320,8 @@ def bug_reports_message(): if ytdl_is_updateable(): update_cmd = 'type youtube-dlc -U to update' else: - update_cmd = 'see https://yt-dl.org/update on how to update' - msg = '; please report this issue on https://yt-dl.org/bug .' + update_cmd = 'see https://github.com/blackjack4494/yt-dlc on how to update' + msg = '; please report this issue on https://github.com/blackjack4494/yt-dlc .' msg += ' Make sure you are using the latest version; %s.' % update_cmd msg += ' Be sure to call youtube-dlc with the --verbose flag and include its complete output.' return msg |