31 files changed, 1311 insertions(+), 536 deletions(-)
diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md index f2260db46..f05aa66e6 100644 --- a/.github/ISSUE_TEMPLATE/1_broken_site.md +++ b/.github/ISSUE_TEMPLATE/1_broken_site.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.09.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support -- [ ] I've verified that I'm running youtube-dl version **2020.07.28** +- [ ] I've verified that I'm running youtube-dl version **2020.09.06** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar issues including closed ones @@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.07.28 + [debug] youtube-dl version 2020.09.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md index 8bc05c4ba..29beaf437 100644 --- a/.github/ISSUE_TEMPLATE/2_site_support_request.md +++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md @@ -19,7 +19,7 @@ labels: 'site-support-request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.09.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. 
In order for site support request to be accepted all provided example URLs should not violate any copyrights. - Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates. @@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a new site support request -- [ ] I've verified that I'm running youtube-dl version **2020.07.28** +- [ ] I've verified that I'm running youtube-dl version **2020.09.06** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that none of provided URLs violate any copyrights - [ ] I've searched the bugtracker for similar site support requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md index 98348e0cd..f96b8d2bb 100644 --- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md +++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md @@ -18,13 +18,13 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.09.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a site feature request -- [ ] I've verified that I'm running youtube-dl version **2020.07.28** +- [ ] I've verified that I'm running youtube-dl version **2020.09.06** - [ ] I've searched the bugtracker for similar site feature requests including closed ones diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md index 86706f528..3a175aa4d 100644 --- a/.github/ISSUE_TEMPLATE/4_bug_report.md +++ b/.github/ISSUE_TEMPLATE/4_bug_report.md @@ -18,7 +18,7 @@ title: '' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.09.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser. - Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape. - Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates. 
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com --> - [ ] I'm reporting a broken site support issue -- [ ] I've verified that I'm running youtube-dl version **2020.07.28** +- [ ] I've verified that I'm running youtube-dl version **2020.09.06** - [ ] I've checked that all provided URLs are alive and playable in a browser - [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped - [ ] I've searched the bugtracker for similar bug reports including closed ones @@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v < [debug] User config: [] [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj'] [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251 - [debug] youtube-dl version 2020.07.28 + [debug] youtube-dl version 2020.09.06 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4 [debug] Proxy map: {} diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md index 52c2709f9..4977079de 100644 --- a/.github/ISSUE_TEMPLATE/5_feature_request.md +++ b/.github/ISSUE_TEMPLATE/5_feature_request.md @@ -19,13 +19,13 @@ labels: 'request' <!-- Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl: -- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.07.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. +- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2020.09.06. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED. - Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates. - Finally, put x into all relevant boxes (like this [x]) --> - [ ] I'm reporting a feature request -- [ ] I've verified that I'm running youtube-dl version **2020.07.28** +- [ ] I've verified that I'm running youtube-dl version **2020.09.06** - [ ] I've searched the bugtracker for similar feature requests including closed ones @@ -1,3 +1,24 @@ +version 2020.09.06 + +Core ++ [utils] Recognize wav mimetype (#26463) + +Extractors +* [nrktv:episode] Improve video id extraction (#25594, #26369, #26409) +* [youtube] Fix age gate content detection (#26100, #26152, #26311, #26384) +* [youtube:user] Extend URL regular expression (#26443) +* [xhamster] Improve initials regular expression (#26526, #26353) +* [svtplay] Fix video id extraction (#26425, #26428, #26438) +* [twitch] Rework extractors (#12297, #20414, #20604, #21811, #21812, #22979, + #24263, #25010, #25553, #25606) + * Switch to GraphQL + + Add support for collections + + Add support for clips and collections playlists +* [biqle] Improve video ext extraction +* [xhamster] Fix extraction (#26157, #26254) +* [xhamster] Extend URL regular expression (#25789, #25804, #25927)) + + version 2020.07.28 Extractors diff --git a/docs/faq.md b/docs/faq.md new file mode 100644 index 000000000..b0f8cad14 --- /dev/null +++ b/docs/faq.md @@ -0,0 +1,31 @@ +- Q: How to redirect to another extractor? + - A: + - Most simple using only `url_result` + ``` + # get proper url first if needed. 
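+ # for example (hypothetical pattern, adjust to the embedding site): + # webpage = self._download_webpage(url, video_id) + # url = self._search_regex(r'<iframe[^>]+src="([^"]+)"', webpage, 'embed url')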
+ return self.url_result(url) + ``` + - Using `_request_webpage` and `to_screen` in addition + ``` + urlh = self._request_webpage( + url, id, note='Downloading redirect page') + url = urlh.geturl() + self.to_screen('Following redirect: %s' % url) + return self.url_result(url) + ``` + - Using `return` construction + ``` + return { + '_type': 'url_transparent', + 'url': url, + 'ie_key': ExampleIE.ie_key(), + 'id': id, + } + # Alternative if extractor supports internal uri like kaltura + return { + '_type': 'url_transparent', + 'url': 'kaltura:%s:%s' % (partner_id, kaltura_id), + 'ie_key': KalturaIE.ie_key(), + 'id': id, + } + ``` diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 32c452267..87c3d2232 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -40,6 +40,8 @@ - **AlJazeera** - **Allocine** - **AlphaPorno** + - **Alura** + - **AluraCourse** - **AMCNetworks** - **AmericasTestKitchen** - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl @@ -208,6 +210,7 @@ - **daum.net:user** - **DBTV** - **DctpTv** + - **DeezerAlbum** - **DeezerPlaylist** - **defense.gouv.fr** - **democracynow** @@ -234,6 +237,8 @@ - **drtv** - **drtv:live** - **DTube** + - **duboku**: www.duboku.co + - **duboku:list**: www.duboku.co entire series - **Dumpert** - **dvtv**: http://video.aktualne.cz/ - **dw** @@ -455,6 +460,7 @@ - **lynda**: lynda.com videos - **lynda:course**: lynda.com online courses - **m6** + - **MagentaMusik360** - **mailru**: Видео@Mail.Ru - **mailru:music**: Музыка@Mail.Ru - **mailru:music:search**: Музыка@Mail.Ru @@ -525,6 +531,7 @@ - **MySpace:album** - **MySpass** - **Myvi** + - **MyVideoGe** - **MyVidster** - **MyviEmbed** - **MyVisionTV** @@ -900,7 +907,6 @@ - **ThisAV** - **ThisOldHouse** - **TikTok** - - **TikTokUser** - **tinypic**: tinypic.com videos - **TMZ** - **TMZArticle** @@ -945,6 +951,7 @@ - **TVNoe** - **TVNow** - **TVNowAnnual** + - **TVNowFilm** - **TVNowNew** - **TVNowSeason** - **TVNowShow** @@ -955,16 +962,13 @@ - **TVPlayHome** - **Tweakers** - **TwitCasting** - - **twitch:chapter** - **twitch:clips** - - **twitch:profile** - **twitch:stream** - - **twitch:video** - - **twitch:videos:all** - - **twitch:videos:highlights** - - **twitch:videos:past-broadcasts** - - **twitch:videos:uploads** - **twitch:vod** + - **TwitchCollection** + - **TwitchVideos** + - **TwitchVideosClips** + - **TwitchVideosCollections** - **twitter** - **twitter:amplify** - **twitter:broadcast** diff --git a/make_win.bat b/make_win.bat index a63130f1e..891d517b3 100644 --- a/make_win.bat +++ b/make_win.bat @@ -1 +1 @@ -pyinstaller.exe youtube_dlc\__main__.py --onefile --name youtube-dlc --version-file win\ver.txt --icon win\icon\cloud.ico
\ No newline at end of file +py -m PyInstaller youtube_dlc\__main__.py --onefile --name youtube-dlc --version-file win\ver.txt --icon win\icon\cloud.ico
\ No newline at end of file diff --git a/test/test_utils.py b/test/test_utils.py index 5914d4fd6..95231200b 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -803,6 +803,8 @@ class TestUtil(unittest.TestCase): self.assertEqual(mimetype2ext('text/vtt'), 'vtt') self.assertEqual(mimetype2ext('text/vtt;charset=utf-8'), 'vtt') self.assertEqual(mimetype2ext('text/html; charset=utf-8'), 'html') + self.assertEqual(mimetype2ext('audio/x-wav'), 'wav') + self.assertEqual(mimetype2ext('audio/x-wav;codec=pcm'), 'wav') def test_month_by_name(self): self.assertEqual(month_by_name(None), None) diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py index f79d31deb..4cec2298c 100644 --- a/youtube_dlc/YoutubeDL.py +++ b/youtube_dlc/YoutubeDL.py @@ -105,6 +105,7 @@ from .postprocessor import ( FFmpegFixupStretchedPP, FFmpegMergerPP, FFmpegPostProcessor, + FFmpegSubtitlesConvertorPP, get_postprocessor, ) from .version import __version__ @@ -1846,6 +1847,29 @@ class YoutubeDL(object): (sub_lang, error_to_compat_str(err))) continue + if self.params.get('skip_download', False): + if self.params.get('convertsubtitles', False): + subconv = FFmpegSubtitlesConvertorPP(self, format=self.params.get('convertsubtitles')) + filename_real_ext = os.path.splitext(filename)[1][1:] + filename_wo_ext = ( + os.path.splitext(filename)[0] + if filename_real_ext == info_dict['ext'] + else filename) + afilename = '%s.%s' % (filename_wo_ext, self.params.get('convertsubtitles')) + if subconv.available: + info_dict.setdefault('__postprocessors', []) + # info_dict['__postprocessors'].append(subconv) + if os.path.exists(encodeFilename(afilename)): + self.to_screen( + '[download] %s has already been downloaded and ' + 'converted' % afilename) + else: + try: + self.post_process(filename, info_dict) + except (PostProcessingError) as err: + self.report_error('postprocessing: %s' % str(err)) + return + if self.params.get('writeinfojson', False): infofn = replace_extension(filename, 'info.json', info_dict.get('ext')) if self.params.get('nooverwrites', False) and os.path.exists(encodeFilename(infofn)): diff --git a/youtube_dlc/__init__.py b/youtube_dlc/__init__.py index a663417da..fc11642b9 100644 --- a/youtube_dlc/__init__.py +++ b/youtube_dlc/__init__.py @@ -315,6 +315,7 @@ def _real_main(argv=None): else match_filter_func(opts.match_filter)) ydl_opts = { + 'convertsubtitles': opts.convertsubtitles, 'usenetrc': opts.usenetrc, 'username': opts.username, 'password': opts.password, diff --git a/youtube_dlc/extractor/alura.py b/youtube_dlc/extractor/alura.py new file mode 100644 index 000000000..36b4d95b3 --- /dev/null +++ b/youtube_dlc/extractor/alura.py @@ -0,0 +1,180 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + +from ..compat import ( + compat_urlparse, +) + +from ..utils import ( + urlencode_postdata, + urljoin, + int_or_none, + clean_html, + ExtractorError +) + + +class AluraIE(InfoExtractor): + _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<course_name>[^/]+)/task/(?P<id>\d+)' + _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' + _VIDEO_URL = 'https://cursos.alura.com.br/course/%s/task/%s/video' + _NETRC_MACHINE = 'alura' + _TESTS = [{ + 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60095', + 'info_dict': { + 'id': '60095', + 'ext': 'mp4', + 'title': 'Referências, ref-set e alter' + }, + 'skip': 'Requires alura account credentials'}, + { + # URL without video + 'url': 
'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs/task/60098', + 'only_matching': True}, + { + 'url': 'https://cursos.alura.com.br/course/fundamentos-market-digital/task/55219', + 'only_matching': True} + ] + + def _real_extract(self, url): + + video_id = self._match_id(url) + course = self._search_regex(self._VALID_URL, url, 'post url', group='course_name') + video_url = self._VIDEO_URL % (course, video_id) + + video_dict = self._download_json(video_url, video_id, 'Searching for videos') + + if video_dict: + webpage = self._download_webpage(url, video_id) + video_title = clean_html(self._search_regex( + r'<span[^>]+class=(["\'])task-body-header-title-text\1[^>]*>(?P<title>[^<]+)', + webpage, 'title', group='title')) + + formats = [] + for video_obj in video_dict: + video_url_m3u8 = video_obj.get('link') + video_format = self._extract_m3u8_formats( + video_url_m3u8, None, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False) + for f in video_format: + m = re.search(r'^[\w \W]*-(?P<res>\w*).mp4[\W \w]*', f['url']) + if m: + if not f.get('height'): + f['height'] = int('720' if m.group('res') == 'hd' else '480') + formats.extend(video_format) + + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) + + return { + 'id': video_id, + 'title': video_title, + "formats": formats + } + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + pass + + login_page = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login popup') + + def is_logged(webpage): + return any(re.search(p, webpage) for p in ( + r'href=[\"|\']?/signout[\"|\']', + r'>Logout<')) + + # already logged in + if is_logged(login_page): + return + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + post_url = self._search_regex( + r'<form[^>]+class=["|\']signin-form["|\'] action=["|\'](?P<url>.+?)["|\']', login_page, + 'post url', default=self._LOGIN_URL, group='url') + + if not post_url.startswith('http'): + post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + if not is_logged(response): + error = self._html_search_regex( + r'(?s)<p[^>]+class="alert-message[^"]*">(.+?)</p>', + response, 'error message', default=None) + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + +class AluraCourseIE(AluraIE): + + _VALID_URL = r'https?://(?:cursos\.)?alura\.com\.br/course/(?P<id>[^/]+)' + _LOGIN_URL = 'https://cursos.alura.com.br/loginForm?urlAfterLogin=/loginForm' + _NETRC_MACHINE = 'aluracourse' + _TESTS = [{ + 'url': 'https://cursos.alura.com.br/course/clojure-mutabilidade-com-atoms-e-refs', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if AluraIE.suitable(url) else super(AluraCourseIE, cls).suitable(url) + + def _real_extract(self, url): + + course_path = self._match_id(url) + webpage = self._download_webpage(url, course_path) + + course_title = self._search_regex( + r'<h1.*?>(.*?)<strong>(?P<course_title>.*?)</strong></h[0-9]>', webpage, + 'course title', default=course_path, group='course_title') + + entries = [] + if webpage: + for path in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" 
])courseSectionList-section[" ])(?=[^>]* href="([^"]*))', webpage): + page_url = urljoin(url, path) + section_path = self._download_webpage(page_url, course_path) + for path_video in re.findall(r'<a\b(?=[^>]* class="[^"]*(?<=[" ])task-menu-nav-item-link-VIDEO[" ])(?=[^>]* href="([^"]*))', section_path): + chapter = clean_html( + self._search_regex( + r'<h3[^>]+class=(["\'])task-menu-section-title-text\1[^>]*>(?P<chapter>[^<]+)', + section_path, + 'chapter', + group='chapter')) + + chapter_number = int_or_none( + self._search_regex( + r'<span[^>]+class=(["\'])task-menu-section-title-number[^>]*>(.*?)<strong>(?P<chapter_number>[^<]+)</strong>', + section_path, + 'chapter number', + group='chapter_number')) + video_url = urljoin(url, path_video) + + entry = { + '_type': 'url_transparent', + 'id': self._match_id(video_url), + 'url': video_url, + 'id_key': self.ie_key(), + 'chapter': chapter, + 'chapter_number': chapter_number + } + entries.append(entry) + return self.playlist_result(entries, course_path, course_title) diff --git a/youtube_dlc/extractor/bitchute.py b/youtube_dlc/extractor/bitchute.py index 0c773e66e..92fc70b5a 100644 --- a/youtube_dlc/extractor/bitchute.py +++ b/youtube_dlc/extractor/bitchute.py @@ -6,6 +6,8 @@ import re from .common import InfoExtractor from ..utils import ( + ExtractorError, + GeoRestrictedError, orderedSet, unified_strdate, urlencode_postdata, @@ -59,8 +61,14 @@ class BitChuteIE(InfoExtractor): for format_url in orderedSet(format_urls)] if not formats: - formats = self._parse_html5_media_entries( - url, webpage, video_id)[0]['formats'] + entries = self._parse_html5_media_entries( + url, webpage, video_id) + if not entries: + error = self._html_search_regex(r'<h1 class="page-title">([^<]+)</h1>', webpage, 'error', default='Cannot find video') + if error == 'Video Unavailable': + raise GeoRestrictedError(error) + raise ExtractorError(error) + formats = entries[0]['formats'] self._check_formats(formats, video_id) self._sort_formats(formats) diff --git a/youtube_dlc/extractor/duboku.py b/youtube_dlc/extractor/duboku.py new file mode 100644 index 000000000..fdc695bf4 --- /dev/null +++ b/youtube_dlc/extractor/duboku.py @@ -0,0 +1,242 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + clean_html, + extract_attributes, + ExtractorError, + get_elements_by_class, + int_or_none, + js_to_json, + smuggle_url, + unescapeHTML, +) + + +def _get_elements_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True): + """Return the content of the tag with the specified attribute in the passed HTML document""" + + if tag is None: + tag = '[a-zA-Z0-9:._-]+' + if attribute is None: + attribute = '' + else: + attribute = r'\s+(?P<attribute>%s)' % re.escape(attribute) + if value is None: + value = '' + else: + value = re.escape(value) if escape_value else value + value = '=[\'"]?(?P<value>%s)[\'"]?' % value + + retlist = [] + for m in re.finditer(r'''(?xs) + <(?P<tag>%s) + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + %s%s + (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*? + \s*> + (?P<content>.*?) 
+ </\1> + ''' % (tag, attribute, value), html): + retlist.append(m) + + return retlist + + +def _get_element_by_tag_and_attrib(html, tag=None, attribute=None, value=None, escape_value=True): + retval = _get_elements_by_tag_and_attrib(html, tag, attribute, value, escape_value) + return retval[0] if retval else None + + +class DubokuIE(InfoExtractor): + IE_NAME = 'duboku' + IE_DESC = 'www.duboku.co' + + _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/vodplay/)(?P<id>[0-9]+-[0-9-]+)\.html.*' + _TESTS = [{ + 'url': 'https://www.duboku.co/vodplay/1575-1-1.html', + 'info_dict': { + 'id': '1575-1-1', + 'ext': 'ts', + 'series': '白色月光', + 'title': 'contains:白色月光', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }, { + 'url': 'https://www.duboku.co/vodplay/1588-1-1.html', + 'info_dict': { + 'id': '1588-1-1', + 'ext': 'ts', + 'series': '亲爱的自己', + 'title': 'contains:预告片', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': 'm3u8 download', + }, + }] + + _PLAYER_DATA_PATTERN = r'player_data\s*=\s*(\{\s*(.*)})\s*;?\s*</script' + + def _real_extract(self, url): + video_id = self._match_id(url) + temp = video_id.split('-') + series_id = temp[0] + season_id = temp[1] + episode_id = temp[2] + + webpage_url = 'https://www.duboku.co/vodplay/%s.html' % video_id + webpage_html = self._download_webpage(webpage_url, video_id) + + # extract video url + + player_data = self._search_regex( + self._PLAYER_DATA_PATTERN, webpage_html, 'player_data') + player_data = self._parse_json(player_data, video_id, js_to_json) + + # extract title + + temp = get_elements_by_class('title', webpage_html) + series_title = None + title = None + for html in temp: + mobj = re.search(r'<a\s+.*>(.*)</a>', html) + if mobj: + href = extract_attributes(mobj.group(0)).get('href') + if href: + mobj1 = re.search(r'/(\d+)\.html', href) + if mobj1 and mobj1.group(1) == series_id: + series_title = clean_html(mobj.group(0)) + series_title = re.sub(r'[\s\r\n\t]+', ' ', series_title) + title = clean_html(html) + title = re.sub(r'[\s\r\n\t]+', ' ', title) + break + + data_url = player_data.get('url') + if not data_url: + raise ExtractorError('Cannot find url in player_data') + data_from = player_data.get('from') + + # if it is an embedded iframe, maybe it's an external source + if data_from == 'iframe': + # use _type url_transparent to retain the meaningful details + # of the video. 
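+ # (extraction is delegated to whichever extractor matches data_url; + # any non-None fields below override what that extractor returns)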
+ return { + '_type': 'url_transparent', + 'url': smuggle_url(data_url, {'http_headers': {'Referer': webpage_url}}), + 'id': video_id, + 'title': title, + 'series': series_title, + 'season_number': int_or_none(season_id), + 'season_id': season_id, + 'episode_number': int_or_none(episode_id), + 'episode_id': episode_id, + } + + formats = self._extract_m3u8_formats(data_url, video_id, 'mp4') + + return { + 'id': video_id, + 'title': title, + 'series': series_title, + 'season_number': int_or_none(season_id), + 'season_id': season_id, + 'episode_number': int_or_none(episode_id), + 'episode_id': episode_id, + 'formats': formats, + 'http_headers': {'Referer': 'https://www.duboku.co/static/player/videojs.html'} + } + + +class DubokuPlaylistIE(InfoExtractor): + IE_NAME = 'duboku:list' + IE_DESC = 'www.duboku.co entire series' + + _VALID_URL = r'(?:https?://[^/]+\.duboku\.co/voddetail/)(?P<id>[0-9]+)\.html.*' + _TESTS = [{ + 'url': 'https://www.duboku.co/voddetail/1575.html', + 'info_dict': { + 'id': 'startswith:1575', + 'title': '白色月光', + }, + 'playlist_count': 12, + }, { + 'url': 'https://www.duboku.co/voddetail/1554.html', + 'info_dict': { + 'id': 'startswith:1554', + 'title': '以家人之名', + }, + 'playlist_mincount': 30, + }, { + 'url': 'https://www.duboku.co/voddetail/1554.html#playlist2', + 'info_dict': { + 'id': '1554#playlist2', + 'title': '以家人之名', + }, + 'playlist_mincount': 27, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError('Invalid URL: %s' % url) + series_id = mobj.group('id') + fragment = compat_urlparse.urlparse(url).fragment + + webpage_url = 'https://www.duboku.co/voddetail/%s.html' % series_id + webpage_html = self._download_webpage(webpage_url, series_id) + + # extract title + + title = _get_element_by_tag_and_attrib(webpage_html, 'h1', 'class', 'title') + title = unescapeHTML(title.group('content')) if title else None + if not title: + title = self._html_search_meta('keywords', webpage_html) + if not title: + title = _get_element_by_tag_and_attrib(webpage_html, 'title') + title = unescapeHTML(title.group('content')) if title else None + + # extract playlists + + playlists = {} + for div in _get_elements_by_tag_and_attrib( + webpage_html, attribute='id', value='playlist\\d+', escape_value=False): + playlist_id = div.group('value') + playlist = [] + for a in _get_elements_by_tag_and_attrib( + div.group('content'), 'a', 'href', value='[^\'"]+?', escape_value=False): + playlist.append({ + 'href': unescapeHTML(a.group('value')), + 'title': unescapeHTML(a.group('content')) + }) + playlists[playlist_id] = playlist + + # select the specified playlist if url fragment exists + playlist = None + playlist_id = None + if fragment: + playlist = playlists.get(fragment) + playlist_id = fragment + else: + first = next(iter(playlists.items()), None) + if first: + (playlist_id, playlist) = first + if not playlist: + raise ExtractorError( + 'Cannot find %s' % fragment if fragment else 'Cannot extract playlist') + + # return url results + return self.playlist_result([ + self.url_result( + compat_urlparse.urljoin('https://www.duboku.co', x['href']), + ie=DubokuIE.ie_key(), video_title=x.get('title')) + for x in playlist], series_id + '#' + playlist_id, title) diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py index 4534effa2..af1bc6e31 100644 --- a/youtube_dlc/extractor/extractors.py +++ b/youtube_dlc/extractor/extractors.py @@ -36,6 +36,10 @@ from .afreecatv import AfreecaTVIE from .airmozilla 
import AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .alura import ( + AluraIE, + AluraCourseIE +) from .amcnetworks import AMCNetworksIE from .americastestkitchen import AmericasTestKitchenIE from .animeondemand import AnimeOnDemandIE @@ -284,6 +288,10 @@ from .drtv import ( ) from .dtube import DTubeIE from .dvtv import DVTVIE +from .duboku import ( + DubokuIE, + DubokuPlaylistIE +) from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE @@ -672,6 +680,7 @@ from .myvi import ( MyviIE, MyviEmbedIE, ) +from .myvideoge import MyVideoGeIE from .myvidster import MyVidsterIE from .nationalgeographic import ( NationalGeographicVideoIE, @@ -1156,10 +1165,7 @@ from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE from .thisoldhouse import ThisOldHouseIE from .threeqsdn import ThreeQSDNIE -from .tiktok import ( - TikTokIE, - TikTokUserIE, -) +from .tiktok import TikTokIE from .tinypic import TinyPicIE from .tmz import ( TMZIE, @@ -1243,14 +1249,11 @@ from .twentymin import TwentyMinutenIE from .twentythreevideo import TwentyThreeVideoIE from .twitcasting import TwitCastingIE from .twitch import ( - TwitchVideoIE, - TwitchChapterIE, TwitchVodIE, - TwitchProfileIE, - TwitchAllVideosIE, - TwitchUploadsIE, - TwitchPastBroadcastsIE, - TwitchHighlightsIE, + TwitchCollectionIE, + TwitchVideosIE, + TwitchVideosClipsIE, + TwitchVideosCollectionsIE, TwitchStreamIE, TwitchClipsIE, ) diff --git a/youtube_dlc/extractor/googledrive.py b/youtube_dlc/extractor/googledrive.py index 589e4d5c3..886fdd532 100644 --- a/youtube_dlc/extractor/googledrive.py +++ b/youtube_dlc/extractor/googledrive.py @@ -265,6 +265,8 @@ class GoogleDriveIE(InfoExtractor): subtitles_id = ttsurl.encode('utf-8').decode( 'unicode_escape').split('=')[-1] + self._downloader.cookiejar.clear(domain='.google.com', path='/', name='NID') + return { 'id': video_id, 'title': title, diff --git a/youtube_dlc/extractor/mailru.py b/youtube_dlc/extractor/mailru.py index 65cc474db..6fdf70aa6 100644 --- a/youtube_dlc/extractor/mailru.py +++ b/youtube_dlc/extractor/mailru.py @@ -20,10 +20,10 @@ class MailRuIE(InfoExtractor): IE_DESC = 'Видео@Mail.Ru' _VALID_URL = r'''(?x) https?:// - (?:(?:www|m)\.)?my\.mail\.ru/+ + (?:(?:www|m|videoapi)\.)?my\.mail\.ru/+ (?: video/.*\#video=/?(?P<idv1>(?:[^/]+/){3}\d+)| - (?:(?P<idv2prefix>(?:[^/]+/+){2})video/(?P<idv2suffix>[^/]+/\d+))\.html| + (?:videos/embed/)?(?:(?P<idv2prefix>(?:[^/]+/+){2})(?:video/(?:embed/)?)?(?P<idv2suffix>[^/]+/\d+))(?:\.html)?| (?:video/embed|\+/video/meta)/(?P<metaid>\d+) ) ''' @@ -108,15 +108,21 @@ class MailRuIE(InfoExtractor): if not video_id: video_id = mobj.group('idv2prefix') + mobj.group('idv2suffix') webpage = self._download_webpage(url, video_id) - page_config = self._parse_json(self._search_regex( + page_config = self._parse_json(self._search_regex([ r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>', + r'(?s)"video":\s*(\{.+?\}),'], webpage, 'page config', default='{}'), video_id, fatal=False) if page_config: - meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') + meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') or page_config.get('metadataUrl') else: meta_url = None video_data = None + + # fix meta_url if missing the host address + if re.match(r'^\/\+\/', meta_url): + meta_url = 'https://my.mail.ru' + meta_url + if meta_url: video_data = self._download_json( meta_url, video_id or 
meta_id, 'Downloading video meta JSON', diff --git a/youtube_dlc/extractor/myvideoge.py b/youtube_dlc/extractor/myvideoge.py new file mode 100644 index 000000000..0a1d7d0cb --- /dev/null +++ b/youtube_dlc/extractor/myvideoge.py @@ -0,0 +1,56 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import js_to_json + + +class MyVideoGeIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?myvideo\.ge/v/(?P<id>[0-9]+)' + _TEST = { + 'url': 'https://www.myvideo.ge/v/3941048', + 'md5': '8c192a7d2b15454ba4f29dc9c9a52ea9', + 'info_dict': { + 'id': '3941048', + 'ext': 'mp4', + 'title': 'The best prikol', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'md5:d72addd357b0dd914e704781f7f777d8', + 'description': 'md5:5c0371f540f5888d603ebfedd46b6df3' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex(r'<h1[^>]*>([^<]+)</h1>', webpage, 'title') + description = self._og_search_description(webpage) + thumbnail = self._html_search_meta(['og:image'], webpage) + uploader = self._search_regex(r'<a[^>]+class="mv_user_name"[^>]*>([^<]+)<', webpage, 'uploader', fatal=False) + + jwplayer_sources = self._parse_json( + self._search_regex( + r"(?s)jwplayer\(\"mvplayer\"\).setup\(.*?sources: (.*?])", webpage, 'jwplayer sources'), + video_id, transform_source=js_to_json) + + def _formats_key(f): + if f['label'] == 'SD': + return -1 + elif f['label'] == 'HD': + return 1 + else: + return 0 + + jwplayer_sources = sorted(jwplayer_sources, key=_formats_key) + + formats = self._parse_jwplayer_formats(jwplayer_sources, video_id) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'uploader': uploader, + 'formats': formats, + 'thumbnail': thumbnail + } diff --git a/youtube_dlc/extractor/nrk.py b/youtube_dlc/extractor/nrk.py index 94115534b..84aacbcda 100644 --- a/youtube_dlc/extractor/nrk.py +++ b/youtube_dlc/extractor/nrk.py @@ -11,7 +11,6 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, - JSON_LD_RE, js_to_json, NO_DEFAULT, parse_age_limit, @@ -425,13 +424,20 @@ class NRKTVEpisodeIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - nrk_id = self._parse_json( - self._search_regex(JSON_LD_RE, webpage, 'JSON-LD', group='json_ld'), - display_id)['@id'] - + info = self._search_json_ld(webpage, display_id, default={}) + nrk_id = info.get('@id') or self._html_search_meta( + 'nrk:program-id', webpage, default=None) or self._search_regex( + r'data-program-id=["\'](%s)' % NRKTVIE._EPISODE_RE, webpage, + 'nrk id') assert re.match(NRKTVIE._EPISODE_RE, nrk_id) - return self.url_result( - 'nrk:%s' % nrk_id, ie=NRKIE.ie_key(), video_id=nrk_id) + + info.update({ + '_type': 'url_transparent', + 'id': nrk_id, + 'url': 'nrk:%s' % nrk_id, + 'ie_key': NRKIE.ie_key(), + }) + return info class NRKTVSerieBaseIE(InfoExtractor): diff --git a/youtube_dlc/extractor/rai.py b/youtube_dlc/extractor/rai.py index 207a6c247..51a310f5c 100644 --- a/youtube_dlc/extractor/rai.py +++ b/youtube_dlc/extractor/rai.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -17,7 +18,6 @@ from ..utils import ( parse_duration, strip_or_none, try_get, - unescapeHTML, unified_strdate, unified_timestamp, update_url_query, @@ -30,6 +30,7 @@ class RaiBaseIE(InfoExtractor): _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _GEO_COUNTRIES = ['IT'] _GEO_BYPASS = False + _BASE_URL = 
'https://www.raiplay.it' def _extract_relinker_info(self, relinker_url, video_id): if not re.match(r'https?://', relinker_url): @@ -122,41 +123,19 @@ class RaiBaseIE(InfoExtractor): class RaiPlayIE(RaiBaseIE): - _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE + _VALID_URL = r'(?P<url>(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)(?P<ext>\.(?:html|json)))' % RaiBaseIE._UUID_RE _TESTS = [{ - 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', - 'md5': '340aa3b7afb54bfd14a8c11786450d76', - 'info_dict': { - 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', - 'ext': 'mp4', - 'title': 'La Casa Bianca', - 'alt_title': 'S2016 - Puntata del 23/10/2016', - 'description': 'md5:a09d45890850458077d1f68bb036e0a5', - 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 3', - 'creator': 'Rai 3', - 'duration': 3278, - 'timestamp': 1477764300, - 'upload_date': '20161029', - 'series': 'La Casa Bianca', - 'season': '2016', - }, - }, { 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', 'ext': 'mp4', 'title': 'Report del 07/04/2014', - 'alt_title': 'S2013/14 - Puntata del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014 ', + 'description': 'md5:d730c168a58f4bb35600fc2f881ec04e', 'thumbnail': r're:^https?://.*\.jpg$', - 'uploader': 'Rai 5', - 'creator': 'Rai 5', + 'uploader': 'Rai Gulp', 'duration': 6160, - 'series': 'Report', - 'season_number': 5, - 'season': '2013/14', }, 'params': { 'skip_download': True, @@ -168,16 +147,15 @@ class RaiPlayIE(RaiBaseIE): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - url, video_id = mobj.group('url', 'id') + url, base, video_id, ext = mobj.group('url', 'base', 'id', 'ext') media = self._download_json( - '%s?json' % url, video_id, 'Downloading video JSON') + '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON') title = media['name'] - video = media['video'] - relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + relinker_info = self._extract_relinker_info(video['content_url'], video_id) self._sort_formats(relinker_info['formats']) thumbnails = [] @@ -185,7 +163,7 @@ class RaiPlayIE(RaiBaseIE): for _, value in media.get('images').items(): if value: thumbnails.append({ - 'url': value.replace('[RESOLUTION]', '600x400') + 'url': urljoin(RaiBaseIE._BASE_URL, value.replace('[RESOLUTION]', '600x400')) }) timestamp = unified_timestamp(try_get( @@ -225,7 +203,7 @@ class RaiPlayLiveIE(RaiBaseIE): 'display_id': 'rainews24', 'ext': 'mp4', 'title': 're:^Diretta di Rai News 24 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'md5:6eca31500550f9376819f174e5644754', + 'description': 'md5:4d00bcf6dc98b27c6ec480de329d1497', 'uploader': 'Rai News 24', 'creator': 'Rai News 24', 'is_live': True, @@ -238,20 +216,32 @@ class RaiPlayLiveIE(RaiBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + media = self._download_json( + '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'dirette/' + display_id), + display_id, 'Downloading channel JSON') + + title = media['name'] + video = media['video'] + video_id = media['id'].replace('ContentItem-', '') - video_id = self._search_regex( - 
r'data-uniquename=["\']ContentItem-(%s)' % RaiBaseIE._UUID_RE, - webpage, 'content id') + relinker_info = self._extract_relinker_info(video['content_url'], video_id) + self._sort_formats(relinker_info['formats']) - return { - '_type': 'url_transparent', - 'ie_key': RaiPlayIE.ie_key(), - 'url': 'http://www.raiplay.it/dirette/ContentItem-%s.html' % video_id, + info = { 'id': video_id, 'display_id': display_id, + 'title': self._live_title(title) if relinker_info.get( + 'is_live') else title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description'), + 'uploader': strip_or_none(media.get('channel')), + 'creator': strip_or_none(media.get('editor')), + 'duration': parse_duration(video.get('duration')), } + info.update(relinker_info) + return info + class RaiPlayPlaylistIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)' @@ -260,7 +250,7 @@ class RaiPlayPlaylistIE(InfoExtractor): 'info_dict': { 'id': 'nondirloalmiocapo', 'title': 'Non dirlo al mio capo', - 'description': 'md5:9f3d603b2947c1c7abb098f3b14fac86', + 'description': 'md5:98ab6b98f7f44c2843fd7d6f045f153b', }, 'playlist_mincount': 12, }] @@ -268,21 +258,25 @@ class RaiPlayPlaylistIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) - webpage = self._download_webpage(url, playlist_id) + media = self._download_json( + '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), + playlist_id, 'Downloading program JSON') - title = self._html_search_meta( - ('programma', 'nomeProgramma'), webpage, 'title') - description = unescapeHTML(self._html_search_meta( - ('description', 'og:description'), webpage, 'description')) + title = media['name'] + description = media['program_info']['description'] + + content_sets = [s['id'] for b in media['blocks'] for s in b['sets']] entries = [] - for mobj in re.finditer( - r'<a\b[^>]+\bhref=(["\'])(?P<path>/raiplay/video/.+?)\1', - webpage): - video_url = urljoin(url, mobj.group('path')) - entries.append(self.url_result( - video_url, ie=RaiPlayIE.ie_key(), - video_id=RaiPlayIE._match_id(video_url))) + for cs in content_sets: + medias = self._download_json( + '%s/%s.json' % (urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), cs), + cs, 'Downloading content set JSON') + for m in medias['items']: + video_url = urljoin(url, m['path_id']) + entries.append(self.url_result( + video_url, ie=RaiPlayIE.ie_key(), + video_id=RaiPlayIE._match_id(video_url))) return self.playlist_result(entries, playlist_id, title, description) @@ -316,7 +310,7 @@ class RaiIE(RaiBaseIE): }, { # with ContentItem in og:url 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', - 'md5': '11959b4e44fa74de47011b5799490adf', + 'md5': '6865dd00cf0bbf5772fdd89d59bd768a', 'info_dict': { 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', 'ext': 'mp4', @@ -327,18 +321,6 @@ class RaiIE(RaiBaseIE): 'upload_date': '20161103', } }, { - # drawMediaRaiTV(...) - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20141221', - }, - }, { # initEdizione('ContentItem-...' 
'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', 'info_dict': { @@ -350,17 +332,6 @@ class RaiIE(RaiBaseIE): }, 'skip': 'Changes daily', }, { - # HDS live stream with only relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'params': { - 'skip_download': True, - }, - }, { # HLS live stream with ContentItem in og:url 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', 'info_dict': { diff --git a/youtube_dlc/extractor/rtlnl.py b/youtube_dlc/extractor/rtlnl.py index fadca8c17..8be5ca236 100644 --- a/youtube_dlc/extractor/rtlnl.py +++ b/youtube_dlc/extractor/rtlnl.py @@ -15,11 +15,25 @@ class RtlNlIE(InfoExtractor): https?://(?:(?:www|static)\.)? (?: rtlxl\.nl/[^\#]*\#!/[^/]+/| + rtlxl\.nl/programma/[^/]+/| rtl\.nl/(?:(?:system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html|embed)\b.+?\buuid=|video/) ) (?P<id>[0-9a-f-]+)''' _TESTS = [{ + 'url': 'https://www.rtlxl.nl/programma/rtl-nieuws/0bd1384d-d970-3086-98bb-5c104e10c26f', + 'md5': '490428f1187b60d714f34e1f2e3af0b6', + 'info_dict': { + 'id': '0bd1384d-d970-3086-98bb-5c104e10c26f', + 'ext': 'mp4', + 'title': 'RTL Nieuws', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'timestamp': 1593293400, + 'upload_date': '20200627', + 'duration': 661.08, + }, + }, { + # old url pattern. Tests does not pass 'url': 'http://www.rtlxl.nl/#!/rtl-nieuws-132237/82b1aad1-4a14-3d7b-b554-b0aed1b2c416', 'md5': '473d1946c1fdd050b2c0161a4b13c373', 'info_dict': { diff --git a/youtube_dlc/extractor/svt.py b/youtube_dlc/extractor/svt.py index e12389cad..8e9ec2ca3 100644 --- a/youtube_dlc/extractor/svt.py +++ b/youtube_dlc/extractor/svt.py @@ -224,9 +224,15 @@ class SVTPlayIE(SVTPlayBaseIE): self._adjust_title(info_dict) return info_dict - svt_id = self._search_regex( - r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', - webpage, 'video id') + svt_id = try_get( + data, lambda x: x['statistics']['dataLake']['content']['id'], + compat_str) + + if not svt_id: + svt_id = self._search_regex( + (r'<video[^>]+data-video-id=["\']([\da-zA-Z-]+)', + r'"content"\s*:\s*{.*?"id"\s*:\s*"([\da-zA-Z-]+)"'), + webpage, 'video id') return self._extract_by_video_id(svt_id, webpage) diff --git a/youtube_dlc/extractor/tiktok.py b/youtube_dlc/extractor/tiktok.py index 66088b9ab..075a2cdf9 100644 --- a/youtube_dlc/extractor/tiktok.py +++ b/youtube_dlc/extractor/tiktok.py @@ -1,138 +1,136 @@ # coding: utf-8 from __future__ import unicode_literals +from datetime import datetime from .common import InfoExtractor from ..utils import ( - compat_str, ExtractorError, int_or_none, str_or_none, - try_get, - url_or_none, + try_get ) class TikTokBaseIE(InfoExtractor): - def _extract_aweme(self, data): - video = data['video'] - description = str_or_none(try_get(data, lambda x: x['desc'])) - width = int_or_none(try_get(data, lambda x: video['width'])) - height = int_or_none(try_get(data, lambda x: video['height'])) + def _extract_aweme(self, video_data, webpage): + video_info = try_get( + video_data, lambda x: x['videoData']['itemInfos'], dict) + author_info = try_get( + video_data, lambda x: x['videoData']['authorInfos'], dict) + share_info = try_get(video_data, lambda x: x['shareMeta'], dict) - format_urls = set() - formats = [] - for format_id in ( - 'play_addr_lowbr', 
'play_addr', 'play_addr_h264', - 'download_addr'): - for format in try_get( - video, lambda x: x[format_id]['url_list'], list) or []: - format_url = url_or_none(format) - if not format_url: - continue - if format_url in format_urls: - continue - format_urls.add(format_url) - formats.append({ - 'url': format_url, - 'ext': 'mp4', - 'height': height, - 'width': width, - }) - self._sort_formats(formats) + unique_id = str_or_none(author_info.get('uniqueId')) + timestamp = try_get(video_info, lambda x: int(x['createTime']), int) + date = datetime.fromtimestamp(timestamp).strftime('%Y%m%d') - thumbnail = url_or_none(try_get( - video, lambda x: x['cover']['url_list'][0], compat_str)) - uploader = try_get(data, lambda x: x['author']['nickname'], compat_str) - timestamp = int_or_none(data.get('create_time')) - comment_count = int_or_none(data.get('comment_count')) or int_or_none( - try_get(data, lambda x: x['statistics']['comment_count'])) - repost_count = int_or_none(try_get( - data, lambda x: x['statistics']['share_count'])) + height = try_get(video_info, lambda x: x['video']['videoMeta']['height'], int) + width = try_get(video_info, lambda x: x['video']['videoMeta']['width'], int) + thumbnails = [] + thumbnails.append({ + 'url': video_info.get('thumbnail') or self._og_search_thumbnail(webpage), + 'width': width, + 'height': height + }) - aweme_id = data['aweme_id'] + formats = [] + formats.append({ + 'url': try_get(video_info, lambda x: x['video']['urls'][0]), + 'ext': 'mp4', + 'height': height, + 'width': width + }) return { - 'id': aweme_id, - 'title': uploader or aweme_id, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, + 'comment_count': int_or_none(video_info.get('commentCount')), + 'duration': try_get(video_info, lambda x: x['video']['videoMeta']['duration'], int), + 'height': height, + 'id': str_or_none(video_info.get('id')), + 'like_count': int_or_none(video_info.get('diggCount')), + 'repost_count': int_or_none(video_info.get('shareCount')), + 'thumbnail': try_get(video_info, lambda x: x['covers'][0]), 'timestamp': timestamp, - 'comment_count': comment_count, - 'repost_count': repost_count, - 'formats': formats, + 'width': width, + 'title': str_or_none(share_info.get('title')) or self._og_search_title(webpage), + 'creator': str_or_none(author_info.get('nickName')), + 'uploader': unique_id, + 'uploader_id': str_or_none(author_info.get('userId')), + 'uploader_url': 'https://www.tiktok.com/@' + unique_id, + 'thumbnails': thumbnails, + 'upload_date': date, + 'webpage_url': self._og_search_url(webpage), + 'description': str_or_none(video_info.get('text')) or str_or_none(share_info.get('desc')), + 'ext': 'mp4', + 'formats': formats } class TikTokIE(TikTokBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:m\.)?tiktok\.com/v| - (?:www\.)?tiktok\.com/share/video - ) - /(?P<id>\d+) - ''' + _VALID_URL = r'https?://www\.tiktok\.com/@[\w\._]+/video/(?P<id>\d+)' + _TESTS = [{ - 'url': 'https://m.tiktok.com/v/6606727368545406213.html', - 'md5': 'd584b572e92fcd48888051f238022420', + 'url': 'https://www.tiktok.com/@leenabhushan/video/6748451240264420610', + 'md5': '34a7543afd5a151b0840ba6736fb633b', 'info_dict': { - 'id': '6606727368545406213', - 'ext': 'mp4', - 'title': 'Zureeal', - 'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay', - 'thumbnail': r're:^https?://.*~noop.image', - 'uploader': 'Zureeal', - 'timestamp': 1538248586, - 'upload_date': '20180929', 'comment_count': int, + 'creator': 'facestoriesbyleenabh', + 'description': 
'md5:a9f6c0c44a1ff2249cae610372d0ae95', + 'duration': 13, + 'ext': 'mp4', + 'formats': list, + 'height': 1280, + 'id': '6748451240264420610', + 'like_count': int, 'repost_count': int, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'thumbnails': list, + 'timestamp': 1571246252, + 'title': 'facestoriesbyleenabh on TikTok', + 'upload_date': '20191016', + 'uploader': 'leenabhushan', + 'uploader_id': '6691488002098119685', + 'uploader_url': r're:https://www.tiktok.com/@leenabhushan', + 'webpage_url': r're:https://www.tiktok.com/@leenabhushan/(video/)?6748451240264420610', + 'width': 720, } }, { - 'url': 'https://www.tiktok.com/share/video/6606727368545406213', - 'only_matching': True, + 'url': 'https://www.tiktok.com/@patroxofficial/video/6742501081818877190?langCountry=en', + 'md5': '06b9800d47d5fe51a19e322dd86e61c9', + 'info_dict': { + 'comment_count': int, + 'creator': 'patroX', + 'description': 'md5:5e2a23877420bb85ce6521dbee39ba94', + 'duration': 27, + 'ext': 'mp4', + 'formats': list, + 'height': 960, + 'id': '6742501081818877190', + 'like_count': int, + 'repost_count': int, + 'thumbnail': r're:^https?://[\w\/\.\-]+(~[\w\-]+\.image)?', + 'thumbnails': list, + 'timestamp': 1569860870, + 'title': 'patroX on TikTok', + 'upload_date': '20190930', + 'uploader': 'patroxofficial', + 'uploader_id': '18702747', + 'uploader_url': r're:https://www.tiktok.com/@patroxofficial', + 'webpage_url': r're:https://www.tiktok.com/@patroxofficial/(video/)?6742501081818877190', + 'width': 540, + } }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'https://m.tiktok.com/v/%s.html' % video_id, video_id) - data = self._parse_json(self._search_regex( - r'\bdata\s*=\s*({.+?})\s*;', webpage, 'data'), video_id) - return self._extract_aweme(data) + webpage = self._download_webpage(url, video_id, note='Downloading video webpage') + json_string = self._search_regex( + r'id=\"__NEXT_DATA__\"\s+type=\"application\/json\"\s*[^>]+>\s*(?P<json_string_ld>[^<]+)', + webpage, 'json_string', group='json_string_ld') + json_data = self._parse_json(json_string, video_id) + video_data = try_get(json_data, lambda x: x['props']['pageProps'], expected_type=dict) -class TikTokUserIE(TikTokBaseIE): - _VALID_URL = r'''(?x) - https?:// - (?: - (?:m\.)?tiktok\.com/h5/share/usr| - (?:www\.)?tiktok\.com/share/user - ) - /(?P<id>\d+) - ''' - _TESTS = [{ - 'url': 'https://m.tiktok.com/h5/share/usr/188294915489964032.html', - 'info_dict': { - 'id': '188294915489964032', - }, - 'playlist_mincount': 24, - }, { - 'url': 'https://www.tiktok.com/share/user/188294915489964032', - 'only_matching': True, - }] + # Chech statusCode for success + if video_data.get('statusCode') == 0: + return self._extract_aweme(video_data, webpage) - def _real_extract(self, url): - user_id = self._match_id(url) - data = self._download_json( - 'https://m.tiktok.com/h5/share/usr/list/%s/' % user_id, user_id, - query={'_signature': '_'}) - entries = [] - for aweme in data['aweme_list']: - try: - entry = self._extract_aweme(aweme) - except ExtractorError: - continue - entry['extractor_key'] = TikTokIE.ie_key() - entries.append(entry) - return self.playlist_result(entries, user_id) + raise ExtractorError('Video not available', video_id=video_id) diff --git a/youtube_dlc/extractor/twitch.py b/youtube_dlc/extractor/twitch.py index 35e4dda37..eadc48c6d 100644 --- a/youtube_dlc/extractor/twitch.py +++ b/youtube_dlc/extractor/twitch.py @@ -1,24 +1,26 @@ # coding: utf-8 from __future__ import unicode_literals +import 
collections import itertools -import re -import random import json +import random +import re from .common import InfoExtractor from ..compat import ( compat_kwargs, compat_parse_qs, compat_str, + compat_urlparse, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, ) from ..utils import ( clean_html, ExtractorError, + float_or_none, int_or_none, - orderedSet, parse_duration, parse_iso8601, qualities, @@ -150,120 +152,16 @@ class TwitchBaseIE(InfoExtractor): }) self._sort_formats(formats) + def _download_access_token(self, channel_name): + return self._call_api( + 'api/channels/%s/access_token' % channel_name, channel_name, + 'Downloading access token JSON') -class TwitchItemBaseIE(TwitchBaseIE): - def _download_info(self, item, item_id): - return self._extract_info(self._call_api( - 'kraken/videos/%s%s' % (item, item_id), item_id, - 'Downloading %s info JSON' % self._ITEM_TYPE)) - - def _extract_media(self, item_id): - info = self._download_info(self._ITEM_SHORTCUT, item_id) - response = self._call_api( - 'api/videos/%s%s' % (self._ITEM_SHORTCUT, item_id), item_id, - 'Downloading %s playlist JSON' % self._ITEM_TYPE) - entries = [] - chunks = response['chunks'] - qualities = list(chunks.keys()) - for num, fragment in enumerate(zip(*chunks.values()), start=1): - formats = [] - for fmt_num, fragment_fmt in enumerate(fragment): - format_id = qualities[fmt_num] - fmt = { - 'url': fragment_fmt['url'], - 'format_id': format_id, - 'quality': 1 if format_id == 'live' else 0, - } - m = re.search(r'^(?P<height>\d+)[Pp]', format_id) - if m: - fmt['height'] = int(m.group('height')) - formats.append(fmt) - self._sort_formats(formats) - entry = dict(info) - entry['id'] = '%s_%d' % (entry['id'], num) - entry['title'] = '%s part %d' % (entry['title'], num) - entry['formats'] = formats - entries.append(entry) - return self.playlist_result(entries, info['id'], info['title']) - - def _extract_info(self, info): - status = info.get('status') - if status == 'recording': - is_live = True - elif status == 'recorded': - is_live = False - else: - is_live = None - _QUALITIES = ('small', 'medium', 'large') - quality_key = qualities(_QUALITIES) - thumbnails = [] - preview = info.get('preview') - if isinstance(preview, dict): - for thumbnail_id, thumbnail_url in preview.items(): - thumbnail_url = url_or_none(thumbnail_url) - if not thumbnail_url: - continue - if thumbnail_id not in _QUALITIES: - continue - thumbnails.append({ - 'url': thumbnail_url, - 'preference': quality_key(thumbnail_id), - }) - return { - 'id': info['_id'], - 'title': info.get('title') or 'Untitled Broadcast', - 'description': info.get('description'), - 'duration': int_or_none(info.get('length')), - 'thumbnails': thumbnails, - 'uploader': info.get('channel', {}).get('display_name'), - 'uploader_id': info.get('channel', {}).get('name'), - 'timestamp': parse_iso8601(info.get('recorded_at')), - 'view_count': int_or_none(info.get('views')), - 'is_live': is_live, - } - - def _real_extract(self, url): - return self._extract_media(self._match_id(url)) - - -class TwitchVideoIE(TwitchItemBaseIE): - IE_NAME = 'twitch:video' - _VALID_URL = r'%s/[^/]+/b/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE - _ITEM_TYPE = 'video' - _ITEM_SHORTCUT = 'a' - - _TEST = { - 'url': 'http://www.twitch.tv/riotgames/b/577357806', - 'info_dict': { - 'id': 'a577357806', - 'title': 'Worlds Semifinals - Star Horn Royal Club vs. 
-        },
-        'playlist_mincount': 12,
-        'skip': 'HTTP Error 404: Not Found',
-    }
-
-
-class TwitchChapterIE(TwitchItemBaseIE):
-    IE_NAME = 'twitch:chapter'
-    _VALID_URL = r'%s/[^/]+/c/(?P<id>\d+)' % TwitchBaseIE._VALID_URL_BASE
-    _ITEM_TYPE = 'chapter'
-    _ITEM_SHORTCUT = 'c'
+    def _extract_channel_id(self, token, channel_name):
+        return compat_str(self._parse_json(token, channel_name)['channel_id'])

-    _TESTS = [{
-        'url': 'http://www.twitch.tv/acracingleague/c/5285812',
-        'info_dict': {
-            'id': 'c5285812',
-            'title': 'ACRL Off Season - Sports Cars @ Nordschleife',
-        },
-        'playlist_mincount': 3,
-        'skip': 'HTTP Error 404: Not Found',
-    }, {
-        'url': 'http://www.twitch.tv/tsm_theoddone/c/2349361',
-        'only_matching': True,
-    }]
-
-class TwitchVodIE(TwitchItemBaseIE):
+
+class TwitchVodIE(TwitchBaseIE):
     IE_NAME = 'twitch:vod'
     _VALID_URL = r'''(?x)
                     https?://
@@ -332,17 +230,60 @@ class TwitchVodIE(TwitchItemBaseIE):
         'only_matching': True,
     }]

+    def _download_info(self, item_id):
+        return self._extract_info(
+            self._call_api(
+                'kraken/videos/%s' % item_id, item_id,
+                'Downloading video info JSON'))
+
+    @staticmethod
+    def _extract_info(info):
+        status = info.get('status')
+        if status == 'recording':
+            is_live = True
+        elif status == 'recorded':
+            is_live = False
+        else:
+            is_live = None
+        _QUALITIES = ('small', 'medium', 'large')
+        quality_key = qualities(_QUALITIES)
+        thumbnails = []
+        preview = info.get('preview')
+        if isinstance(preview, dict):
+            for thumbnail_id, thumbnail_url in preview.items():
+                thumbnail_url = url_or_none(thumbnail_url)
+                if not thumbnail_url:
+                    continue
+                if thumbnail_id not in _QUALITIES:
+                    continue
+                thumbnails.append({
+                    'url': thumbnail_url,
+                    'preference': quality_key(thumbnail_id),
+                })
+        return {
+            'id': info['_id'],
+            'title': info.get('title') or 'Untitled Broadcast',
+            'description': info.get('description'),
+            'duration': int_or_none(info.get('length')),
+            'thumbnails': thumbnails,
+            'uploader': info.get('channel', {}).get('display_name'),
+            'uploader_id': info.get('channel', {}).get('name'),
+            'timestamp': parse_iso8601(info.get('recorded_at')),
+            'view_count': int_or_none(info.get('views')),
+            'is_live': is_live,
+        }
+
     def _real_extract(self, url):
-        item_id = self._match_id(url)
+        vod_id = self._match_id(url)

-        info = self._download_info(self._ITEM_SHORTCUT, item_id)
+        info = self._download_info(vod_id)
         access_token = self._call_api(
-            'api/vods/%s/access_token' % item_id, item_id,
+            'api/vods/%s/access_token' % vod_id, vod_id,
             'Downloading %s access token' % self._ITEM_TYPE)

         formats = self._extract_m3u8_formats(
             '%s/vod/%s.m3u8?%s' % (
-                self._USHER_BASE, item_id,
+                self._USHER_BASE, vod_id,
                 compat_urllib_parse_urlencode({
                     'allow_source': 'true',
                     'allow_audio_only': 'true',
@@ -352,7 +293,7 @@ class TwitchVodIE(TwitchItemBaseIE):
                     'nauth': access_token['token'],
                     'nauthsig': access_token['sig'],
                 })),
-            item_id, 'mp4', entry_protocol='m3u8_native')
+            vod_id, 'mp4', entry_protocol='m3u8_native')

         self._prefer_source(formats)
         info['formats'] = formats
@@ -366,7 +307,7 @@ class TwitchVodIE(TwitchItemBaseIE):
             info['subtitles'] = {
                 'rechat': [{
                     'url': update_url_query(
-                        'https://api.twitch.tv/v5/videos/%s/comments' % item_id, {
+                        'https://api.twitch.tv/v5/videos/%s/comments' % vod_id, {
                             'client_id': self._CLIENT_ID,
                         }),
                     'ext': 'json',
@@ -376,170 +317,405 @@
         return info


-class TwitchPlaylistBaseIE(TwitchBaseIE):
-    _PLAYLIST_PATH = 'kraken/channels/%s/videos/?offset=%d&limit=%d'
+def _make_video_result(node):
+    assert isinstance(node, dict)
+    video_id = node.get('id')
+    if not video_id:
+        return
+    return {
+        '_type': 'url_transparent',
+        'ie_key': TwitchVodIE.ie_key(),
+        'id': video_id,
+        'url': 'https://www.twitch.tv/videos/%s' % video_id,
+        'title': node.get('title'),
+        'thumbnail': node.get('previewThumbnailURL'),
+        'duration': float_or_none(node.get('lengthSeconds')),
+        'view_count': int_or_none(node.get('viewCount')),
+    }
+
+
+class TwitchGraphQLBaseIE(TwitchBaseIE):
     _PAGE_LIMIT = 100

-    def _extract_playlist(self, channel_name):
-        info = self._call_api(
-            'kraken/users?login=%s' % channel_name,
-            channel_name, 'Downloading channel info JSON')
-        info = info['users'][0]
-        channel_id = info['_id']
-        channel_name = info.get('display_name') or info.get('name') or channel_name
+    def _download_gql(self, video_id, op, variables, sha256_hash, note, fatal=True):
+        return self._download_json(
+            'https://gql.twitch.tv/gql', video_id, note,
+            data=json.dumps({
+                'operationName': op,
+                'variables': variables,
+                'extensions': {
+                    'persistedQuery': {
+                        'version': 1,
+                        'sha256Hash': sha256_hash,
+                    }
+                }
+            }).encode(),
+            headers={
+                'Content-Type': 'text/plain;charset=UTF-8',
+                'Client-ID': self._CLIENT_ID,
+            }, fatal=fatal)
+
+
+class TwitchCollectionIE(TwitchGraphQLBaseIE):
+    _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/collections/(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'https://www.twitch.tv/collections/wlDCoH0zEBZZbQ',
+        'info_dict': {
+            'id': 'wlDCoH0zEBZZbQ',
+            'title': 'Overthrow Nook, capitalism for children',
+        },
+        'playlist_mincount': 13,
+    }]
+
+    _OPERATION_NAME = 'CollectionSideBar'
+    _SHA256_HASH = '27111f1b382effad0b6def325caef1909c733fe6a4fbabf54f8d491ef2cf2f14'
+
+    def _real_extract(self, url):
+        collection_id = self._match_id(url)
+        collection = self._download_gql(
+            collection_id, self._OPERATION_NAME,
+            {'collectionID': collection_id}, self._SHA256_HASH,
+            'Downloading collection GraphQL')['data']['collection']
+        title = collection.get('title')
         entries = []
+        for edge in collection['items']['edges']:
+            if not isinstance(edge, dict):
+                continue
+            node = edge.get('node')
+            if not isinstance(node, dict):
+                continue
+            video = _make_video_result(node)
+            if video:
+                entries.append(video)
+        return self.playlist_result(
+            entries, playlist_id=collection_id, playlist_title=title)
+
+
+class TwitchPlaylistBaseIE(TwitchGraphQLBaseIE):
+    def _entries(self, channel_name, *args):
+        cursor = None
+        variables_common = self._make_variables(channel_name, *args)
+        entries_key = '%ss' % self._ENTRY_KIND
+        for page_num in itertools.count(1):
+            variables = variables_common.copy()
+            variables['limit'] = self._PAGE_LIMIT
+            if cursor:
+                variables['cursor'] = cursor
+            page = self._download_gql(
+                channel_name, self._OPERATION_NAME, variables,
+                self._SHA256_HASH,
+                'Downloading %ss GraphQL page %s' % (self._NODE_KIND, page_num),
+                fatal=False)
+            if not page:
+                break
+            edges = try_get(
+                page, lambda x: x['data']['user'][entries_key]['edges'], list)
+            if not edges:
+                break
+            for edge in edges:
+                if not isinstance(edge, dict):
+                    continue
+                if edge.get('__typename') != self._EDGE_KIND:
+                    continue
+                node = edge.get('node')
+                if not isinstance(node, dict):
+                    continue
+                if node.get('__typename') != self._NODE_KIND:
+                    continue
+                entry = self._extract_entry(node)
+                if entry:
+                    cursor = edge.get('cursor')
+                    yield entry
+            if not cursor or not isinstance(cursor, compat_str):
+                break
+
+    # Deprecated kraken v5 API
+    def _entries_kraken(self, channel_name, broadcast_type, sort):
+        access_token = self._download_access_token(channel_name)
+        channel_id = self._extract_channel_id(access_token['token'], channel_name)
         offset = 0
-        limit = self._PAGE_LIMIT
-        broken_paging_detected = False
         counter_override = None
         for counter in itertools.count(1):
             response = self._call_api(
-                self._PLAYLIST_PATH % (channel_id, offset, limit),
+                'kraken/channels/%s/videos/' % channel_id,
                 channel_id,
-                'Downloading %s JSON page %s'
-                % (self._PLAYLIST_TYPE, counter_override or counter))
-            page_entries = self._extract_playlist_page(response)
-            if not page_entries:
+                'Downloading video JSON page %s' % (counter_override or counter),
+                query={
+                    'offset': offset,
+                    'limit': self._PAGE_LIMIT,
+                    'broadcast_type': broadcast_type,
+                    'sort': sort,
+                })
+            videos = response.get('videos')
+            if not isinstance(videos, list):
                 break
+            for video in videos:
+                if not isinstance(video, dict):
+                    continue
+                video_url = url_or_none(video.get('url'))
+                if not video_url:
+                    continue
+                yield {
+                    '_type': 'url_transparent',
+                    'ie_key': TwitchVodIE.ie_key(),
+                    'id': video.get('_id'),
+                    'url': video_url,
+                    'title': video.get('title'),
+                    'description': video.get('description'),
+                    'timestamp': unified_timestamp(video.get('published_at')),
+                    'duration': float_or_none(video.get('length')),
+                    'view_count': int_or_none(video.get('views')),
+                    'language': video.get('language'),
+                }
+            offset += self._PAGE_LIMIT
             total = int_or_none(response.get('_total'))
-            # Since the beginning of March 2016 twitch's paging mechanism
-            # is completely broken on the twitch side. It simply ignores
-            # a limit and returns the whole offset number of videos.
-            # Working around by just requesting all videos at once.
-            # Upd: pagination bug was fixed by twitch on 15.03.2016.
-            if not broken_paging_detected and total and len(page_entries) > limit:
-                self.report_warning(
-                    'Twitch pagination is broken on twitch side, requesting all videos at once',
-                    channel_id)
-                broken_paging_detected = True
-                offset = total
-                counter_override = '(all at once)'
-                continue
-            entries.extend(page_entries)
-            if broken_paging_detected or total and len(page_entries) >= total:
+            if total and offset >= total:
                 break
-            offset += limit
-        return self.playlist_result(
-            [self._make_url_result(entry) for entry in orderedSet(entries)],
-            channel_id, channel_name)
-
-    def _make_url_result(self, url):
-        try:
-            video_id = 'v%s' % TwitchVodIE._match_id(url)
-            return self.url_result(url, TwitchVodIE.ie_key(), video_id=video_id)
-        except AssertionError:
-            return self.url_result(url)
-
-    def _extract_playlist_page(self, response):
-        videos = response.get('videos')
-        return [video['url'] for video in videos] if videos else []
-
-    def _real_extract(self, url):
-        return self._extract_playlist(self._match_id(url))


-class TwitchProfileIE(TwitchPlaylistBaseIE):
-    IE_NAME = 'twitch:profile'
-    _VALID_URL = r'%s/(?P<id>[^/]+)/profile/?(?:\#.*)?$' % TwitchBaseIE._VALID_URL_BASE
-    _PLAYLIST_TYPE = 'profile'
+class TwitchVideosIE(TwitchPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/(?:videos|profile)'

     _TESTS = [{
-        'url': 'http://www.twitch.tv/vanillatv/profile',
+        # All Videos sorted by Date
+        'url': 'https://www.twitch.tv/spamfish/videos?filter=all',
+        'info_dict': {
+            'id': 'spamfish',
+            'title': 'spamfish - All Videos sorted by Date',
+        },
+        'playlist_mincount': 924,
+    }, {
+        # All Videos sorted by Popular
+        'url': 'https://www.twitch.tv/spamfish/videos?filter=all&sort=views',
+        'info_dict': {
+            'id': 'spamfish',
+            'title': 'spamfish - All Videos sorted by Popular',
+        },
+        'playlist_mincount': 931,
+    }, {
+        # Past Broadcasts sorted by Date
+        'url': 'https://www.twitch.tv/spamfish/videos?filter=archives',
         'info_dict': {
-            'id': '22744919',
-            'title': 'VanillaTV',
+            'id': 'spamfish',
+            'title': 'spamfish - Past Broadcasts sorted by Date',
         },
-        'playlist_mincount': 412,
+        'playlist_mincount': 27,
+    }, {
+        # Highlights sorted by Date
+        'url': 'https://www.twitch.tv/spamfish/videos?filter=highlights',
+        'info_dict': {
+            'id': 'spamfish',
+            'title': 'spamfish - Highlights sorted by Date',
+        },
+        'playlist_mincount': 901,
+    }, {
+        # Uploads sorted by Date
+        'url': 'https://www.twitch.tv/esl_csgo/videos?filter=uploads&sort=time',
+        'info_dict': {
+            'id': 'esl_csgo',
+            'title': 'esl_csgo - Uploads sorted by Date',
+        },
+        'playlist_mincount': 5,
+    }, {
+        # Past Premieres sorted by Date
+        'url': 'https://www.twitch.tv/spamfish/videos?filter=past_premieres',
+        'info_dict': {
+            'id': 'spamfish',
+            'title': 'spamfish - Past Premieres sorted by Date',
+        },
+        'playlist_mincount': 1,
+    }, {
+        'url': 'https://www.twitch.tv/spamfish/videos/all',
+        'only_matching': True,
+    }, {
+        'url': 'https://m.twitch.tv/spamfish/videos/all',
+        'only_matching': True,
     }, {
-        'url': 'http://m.twitch.tv/vanillatv/profile',
+        'url': 'https://www.twitch.tv/spamfish/videos',
         'only_matching': True,
     }]

+    Broadcast = collections.namedtuple('Broadcast', ['type', 'label'])

-class TwitchVideosBaseIE(TwitchPlaylistBaseIE):
-    _VALID_URL_VIDEOS_BASE = r'%s/(?P<id>[^/]+)/videos' % TwitchBaseIE._VALID_URL_BASE
-    _VALID_URL_VIDEOS_FILTERS = r'\?(?:.*?[&;])??filter=%s'
-    _PLAYLIST_PATH = TwitchPlaylistBaseIE._PLAYLIST_PATH + '&broadcast_type='
+    _DEFAULT_BROADCAST = Broadcast(None, 'All Videos')
+    _BROADCASTS = {
+        'archives': Broadcast('ARCHIVE', 'Past Broadcasts'),
+        'highlights': Broadcast('HIGHLIGHT', 'Highlights'),
+        'uploads': Broadcast('UPLOAD', 'Uploads'),
+        'past_premieres': Broadcast('PAST_PREMIERE', 'Past Premieres'),
+        'all': _DEFAULT_BROADCAST,
+    }
+
+    _DEFAULT_SORTED_BY = 'Date'
+    _SORTED_BY = {
+        'time': _DEFAULT_SORTED_BY,
+        'views': 'Popular',
+    }
+
+    _SHA256_HASH = 'a937f1d22e269e39a03b509f65a7490f9fc247d7f83d6ac1421523e3b68042cb'
+    _OPERATION_NAME = 'FilterableVideoTower_Videos'
+    _ENTRY_KIND = 'video'
+    _EDGE_KIND = 'VideoEdge'
+    _NODE_KIND = 'Video'
+
+    @classmethod
+    def suitable(cls, url):
+        return (False
+                if any(ie.suitable(url) for ie in (
+                    TwitchVideosClipsIE,
+                    TwitchVideosCollectionsIE))
+                else super(TwitchVideosIE, cls).suitable(url))
+
+    @staticmethod
+    def _make_variables(channel_name, broadcast_type, sort):
+        return {
+            'channelOwnerLogin': channel_name,
+            'broadcastType': broadcast_type,
+            'videoSort': sort.upper(),
+        }
+
+    @staticmethod
+    def _extract_entry(node):
+        return _make_video_result(node)
+
+    def _real_extract(self, url):
+        channel_name = self._match_id(url)
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        filter = qs.get('filter', ['all'])[0]
+        sort = qs.get('sort', ['time'])[0]
+        broadcast = self._BROADCASTS.get(filter, self._DEFAULT_BROADCAST)
+        return self.playlist_result(
+            self._entries(channel_name, broadcast.type, sort),
+            playlist_id=channel_name,
+            playlist_title='%s - %s sorted by %s'
+            % (channel_name, broadcast.label,
+               self._SORTED_BY.get(sort, self._DEFAULT_SORTED_BY)))


-class TwitchAllVideosIE(TwitchVideosBaseIE):
-    IE_NAME = 'twitch:videos:all'
-    _VALID_URL = '%s/?(?:(?:%s)|$)' % (
-        TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE,
-        TwitchVideosBaseIE._VALID_URL_VIDEOS_FILTERS % 'all'
-    )
-    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive,upload,highlight'
-    _PLAYLIST_TYPE = 'all videos'
+class TwitchVideosClipsIE(TwitchPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/(?:clips|videos/*?\?.*?\bfilter=clips)'

     _TESTS = [{
-        'url': 'https://www.twitch.tv/spamfish/videos?filter=all&sort=time',
+        # Clips
+        'url': 'https://www.twitch.tv/vanillatv/clips?filter=clips&range=all',
         'info_dict': {
-            'id': '497952',
-            'title': 'Spamfish',
+            'id': 'vanillatv',
+            'title': 'vanillatv - Clips Top All',
         },
-        'playlist_mincount': 869,
+        'playlist_mincount': 1,
     }, {
-        'url': 'https://m.twitch.tv/spamfish/videos/',
+        'url': 'https://www.twitch.tv/dota2ruhub/videos?filter=clips&range=7d',
         'only_matching': True,
     }]

+    Clip = collections.namedtuple('Clip', ['filter', 'label'])

-class TwitchUploadsIE(TwitchVideosBaseIE):
-    IE_NAME = 'twitch:videos:uploads'
-    _VALID_URL = '%s/?(?:%s)' % (
-        TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE,
-        TwitchVideosBaseIE._VALID_URL_VIDEOS_FILTERS % 'uploads'
-    )
-    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'upload'
-    _PLAYLIST_TYPE = 'uploads'
+    _DEFAULT_CLIP = Clip('LAST_WEEK', 'Top 7D')
+    _RANGE = {
+        '24hr': Clip('LAST_DAY', 'Top 24H'),
+        '7d': _DEFAULT_CLIP,
+        '30d': Clip('LAST_MONTH', 'Top 30D'),
+        'all': Clip('ALL_TIME', 'Top All'),
+    }

-    _TESTS = [{
-        'url': 'https://www.twitch.tv/spamfish/videos?filter=uploads&sort=time',
-        'info_dict': {
-            'id': '497952',
-            'title': 'Spamfish',
-        },
-        'playlist_mincount': 0,
-    }]
+    # NB: values other than 20 result in skipped videos
+    _PAGE_LIMIT = 20

+    _SHA256_HASH = 'b73ad2bfaecfd30a9e6c28fada15bd97032c83ec77a0440766a56fe0bd632777'
+    _OPERATION_NAME = 'ClipsCards__User'
+    _ENTRY_KIND = 'clip'
+    _EDGE_KIND = 'ClipEdge'
+    _NODE_KIND = 'Clip'

-class TwitchPastBroadcastsIE(TwitchVideosBaseIE):
-    IE_NAME = 'twitch:videos:past-broadcasts'
-    _VALID_URL = '%s/?(?:%s)' % (
-        TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE,
-        TwitchVideosBaseIE._VALID_URL_VIDEOS_FILTERS % 'archives'
-    )
-    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'archive'
-    _PLAYLIST_TYPE = 'past broadcasts'
+    @staticmethod
+    def _make_variables(channel_name, filter):
+        return {
+            'login': channel_name,
+            'criteria': {
+                'filter': filter,
+            },
+        }

-    _TESTS = [{
-        'url': 'https://www.twitch.tv/spamfish/videos?filter=archives&sort=time',
-        'info_dict': {
-            'id': '497952',
-            'title': 'Spamfish',
-        },
-        'playlist_mincount': 0,
-    }]
+    @staticmethod
+    def _extract_entry(node):
+        assert isinstance(node, dict)
+        clip_url = url_or_none(node.get('url'))
+        if not clip_url:
+            return
+        return {
+            '_type': 'url_transparent',
+            'ie_key': TwitchClipsIE.ie_key(),
+            'id': node.get('id'),
+            'url': clip_url,
+            'title': node.get('title'),
+            'thumbnail': node.get('thumbnailURL'),
+            'duration': float_or_none(node.get('durationSeconds')),
+            'timestamp': unified_timestamp(node.get('createdAt')),
+            'view_count': int_or_none(node.get('viewCount')),
+            'language': node.get('language'),
+        }
+
+    def _real_extract(self, url):
+        channel_name = self._match_id(url)
+        qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        range = qs.get('range', ['7d'])[0]
+        clip = self._RANGE.get(range, self._DEFAULT_CLIP)
+        return self.playlist_result(
+            self._entries(channel_name, clip.filter),
+            playlist_id=channel_name,
+            playlist_title='%s - Clips %s' % (channel_name, clip.label))


-class TwitchHighlightsIE(TwitchVideosBaseIE):
-    IE_NAME = 'twitch:videos:highlights'
-    _VALID_URL = '%s/?(?:%s)' % (
-        TwitchVideosBaseIE._VALID_URL_VIDEOS_BASE,
-        TwitchVideosBaseIE._VALID_URL_VIDEOS_FILTERS % 'highlights'
-    )
-    _PLAYLIST_PATH = TwitchVideosBaseIE._PLAYLIST_PATH + 'highlight'
-    _PLAYLIST_TYPE = 'highlights'
+class TwitchVideosCollectionsIE(TwitchPlaylistBaseIE):
+    _VALID_URL = r'https?://(?:(?:www|go|m)\.)?twitch\.tv/(?P<id>[^/]+)/videos/*?\?.*?\bfilter=collections'

     _TESTS = [{
-        'url': 'https://www.twitch.tv/spamfish/videos?filter=highlights&sort=views',
+        # Collections
+        'url': 'https://www.twitch.tv/spamfish/videos?filter=collections',
         'info_dict': {
-            'id': '497952',
-            'title': 'Spamfish',
+            'id': 'spamfish',
+            'title': 'spamfish - Collections',
         },
-        'playlist_mincount': 805,
+        'playlist_mincount': 3,
     }]

+    _SHA256_HASH = '07e3691a1bad77a36aba590c351180439a40baefc1c275356f40fc7082419a84'
+    _OPERATION_NAME = 'ChannelCollectionsContent'
+    _ENTRY_KIND = 'collection'
+    _EDGE_KIND = 'CollectionsItemEdge'
+    _NODE_KIND = 'Collection'
+
+    @staticmethod
+    def _make_variables(channel_name):
+        return {
+            'ownerLogin': channel_name,
+        }
+
+    @staticmethod
+    def _extract_entry(node):
+        assert isinstance(node, dict)
+        collection_id = node.get('id')
+        if not collection_id:
+            return
+        return {
+            '_type': 'url_transparent',
+            'ie_key': TwitchCollectionIE.ie_key(),
+            'id': collection_id,
+            'url': 'https://www.twitch.tv/collections/%s' % collection_id,
+            'title': node.get('title'),
+            'thumbnail': node.get('thumbnailURL'),
+            'duration': float_or_none(node.get('lengthSeconds')),
+            'timestamp': unified_timestamp(node.get('updatedAt')),
+            'view_count': int_or_none(node.get('viewCount')),
+        }
+
+    def _real_extract(self, url):
+        channel_name = self._match_id(url)
+        return self.playlist_result(
+            self._entries(channel_name), playlist_id=channel_name,
+            playlist_title='%s - Collections' % channel_name)
+

 class TwitchStreamIE(TwitchBaseIE):
     IE_NAME = 'twitch:stream'
@@ -589,27 +765,21 @@ class TwitchStreamIE(TwitchBaseIE):
     def suitable(cls, url):
         return (False
                 if any(ie.suitable(url) for ie in (
-                    TwitchVideoIE,
-                    TwitchChapterIE,
                     TwitchVodIE,
-                    TwitchProfileIE,
-                    TwitchAllVideosIE,
-                    TwitchUploadsIE,
-                    TwitchPastBroadcastsIE,
-                    TwitchHighlightsIE,
+                    TwitchCollectionIE,
+                    TwitchVideosIE,
+                    TwitchVideosClipsIE,
+                    TwitchVideosCollectionsIE,
                     TwitchClipsIE))
                 else super(TwitchStreamIE, cls).suitable(url))

     def _real_extract(self, url):
         channel_name = self._match_id(url)

-        access_token = self._call_api(
-            'api/channels/%s/access_token' % channel_name, channel_name,
-            'Downloading access token JSON')
+        access_token = self._download_access_token(channel_name)

         token = access_token['token']
-        channel_id = compat_str(self._parse_json(
-            token, channel_name)['channel_id'])
+        channel_id = self._extract_channel_id(token, channel_name)

         stream = self._call_api(
             'kraken/streams/%s?stream_type=all' % channel_id,
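The reworked Twitch extractors above all flow through `_download_gql`, which speaks Twitch's persisted-query GraphQL protocol: no query text is sent, only an operation name, variables, and the SHA-256 hash of a query stored server-side. `TwitchPlaylistBaseIE._entries` then pages through results by resuming from the last edge's cursor. A standalone sketch of both pieces, assuming Python 3 stdlib; `CLIENT_ID` and the hash values are placeholders for the extractor's real constants:

```python
# Sketch of the persisted-query POST built by _download_gql, plus the
# cursor paging used by _entries. CLIENT_ID is a placeholder.
import itertools
import json
import urllib.request

GQL_URL = 'https://gql.twitch.tv/gql'
CLIENT_ID = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'  # placeholder


def download_gql(op, variables, sha256_hash):
    # sha256Hash names a query Twitch already knows ("persisted query"),
    # so the request body carries no GraphQL text at all.
    body = json.dumps({
        'operationName': op,
        'variables': variables,
        'extensions': {
            'persistedQuery': {
                'version': 1,
                'sha256Hash': sha256_hash,
            },
        },
    }).encode()
    req = urllib.request.Request(GQL_URL, data=body, headers={
        'Content-Type': 'text/plain;charset=UTF-8',
        'Client-ID': CLIENT_ID,
    })
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode('utf-8'))


def paginate(op, variables, sha256_hash, edges_of, limit=100):
    # Each edge carries a cursor; resume from the last one seen and stop
    # when a page returns no edges or no cursor, as _entries does.
    cursor = None
    for _ in itertools.count(1):
        page_vars = dict(variables, limit=limit)
        if cursor:
            page_vars['cursor'] = cursor
        edges = edges_of(download_gql(op, page_vars, sha256_hash)) or []
        if not edges:
            break
        for edge in edges:
            cursor = edge.get('cursor')
            yield edge.get('node')
        if not cursor:
            break
```

The concrete subclasses above differ only in operation name, hash, how the variables are shaped, and where the edges live in the response, which is why the base class can own the whole loop.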
diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py
index 9e4171237..f8e360338 100644
--- a/youtube_dlc/extractor/viki.py
+++ b/youtube_dlc/extractor/viki.py
@@ -12,6 +12,7 @@ from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
+    HEADRequest,
     int_or_none,
     parse_age_limit,
     parse_iso8601,
     sanitized_Request,
@@ -220,6 +221,66 @@ class VikiIE(VikiBaseIE):
         video = self._call_api(
             'videos/%s.json' % video_id, video_id, 'Downloading video JSON')

+        streams = self._call_api(
+            'videos/%s/streams.json' % video_id, video_id,
+            'Downloading video streams JSON')
+
+        formats = []
+        for format_id, stream_dict in streams.items():
+            height = int_or_none(self._search_regex(
+                r'^(\d+)[pP]$', format_id, 'height', default=None))
+            for protocol, format_dict in stream_dict.items():
+                # rtmps URLs do not seem to work
+                if protocol == 'rtmps':
+                    continue
+                format_url = format_dict.get('url')
+                format_drms = format_dict.get('drms')
+                format_stream_id = format_dict.get('id')
+                if format_id == 'm3u8':
+                    m3u8_formats = self._extract_m3u8_formats(
+                        format_url, video_id, 'mp4',
+                        entry_protocol='m3u8_native',
+                        m3u8_id='m3u8-%s' % protocol, fatal=False)
+                    # Despite CODECS metadata in m3u8 all video-only formats
+                    # are actually video+audio
+                    for f in m3u8_formats:
+                        if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
+                            f['acodec'] = None
+                    formats.extend(m3u8_formats)
+                elif format_id == 'mpd':
+                    mpd_formats = self._extract_mpd_formats(
+                        format_url, video_id,
+                        mpd_id='mpd-%s' % protocol, fatal=False)
+                    formats.extend(mpd_formats)
+                elif format_url.startswith('rtmp'):
+                    mobj = re.search(
+                        r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
+                        format_url)
+                    if not mobj:
+                        continue
+                    formats.append({
+                        'format_id': 'rtmp-%s' % format_id,
+                        'ext': 'flv',
+                        'url': mobj.group('url'),
+                        'play_path': mobj.group('playpath'),
+                        'app': mobj.group('app'),
+                        'page_url': url,
+                        'drms': format_drms,
+                        'stream_id': format_stream_id,
+                    })
+                else:
+                    urlh = self._request_webpage(
+                        HEADRequest(format_url), video_id, 'Checking file size', fatal=False)
+                    formats.append({
+                        'url': format_url,
+                        'format_id': '%s-%s' % (format_id, protocol),
+                        'height': height,
+                        'drms': format_drms,
+                        'stream_id': format_stream_id,
+                        'filesize': int_or_none(urlh.headers.get('Content-Length')),
+                    })
+        self._sort_formats(formats)
+
         self._check_errors(video)

         title = self.dict_selection(video.get('titles', {}), 'en', allow_fallback=False)
@@ -244,12 +305,18 @@ class VikiIE(VikiBaseIE):
             'url': thumbnail.get('url'),
         })

+        stream_ids = []
+        for f in formats:
+            s_id = f.get('stream_id')
+            if s_id is not None:
+                stream_ids.append(s_id)
+
         subtitles = {}
         for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
             subtitles[subtitle_lang] = [{
                 'ext': subtitles_format,
                 'url': self._prepare_call(
-                    'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+                    'videos/%s/subtitles/%s.%s?stream_id=%s' % (video_id, subtitle_lang, subtitles_format, stream_ids[0])),
             } for subtitles_format in ('srt', 'vtt')]

         result = {
@@ -265,10 +332,6 @@ class VikiIE(VikiBaseIE):
             'subtitles': subtitles,
         }

-        streams = self._call_api(
-            'videos/%s/streams.json' % video_id, video_id,
-            'Downloading video streams JSON')
-
         if 'external' in streams:
             result.update({
                 '_type': 'url_transparent',
@@ -276,48 +339,6 @@ class VikiIE(VikiBaseIE):
             })
             return result

-        formats = []
-        for format_id, stream_dict in streams.items():
-            height = int_or_none(self._search_regex(
-                r'^(\d+)[pP]$', format_id, 'height', default=None))
-            for protocol, format_dict in stream_dict.items():
-                # rtmps URLs does not seem to work
-                if protocol == 'rtmps':
-                    continue
-                format_url = format_dict['url']
-                if format_id == 'm3u8':
-                    m3u8_formats = self._extract_m3u8_formats(
-                        format_url, video_id, 'mp4',
-                        entry_protocol='m3u8_native',
-                        m3u8_id='m3u8-%s' % protocol, fatal=False)
-                    # Despite CODECS metadata in m3u8 all video-only formats
-                    # are actually video+audio
-                    for f in m3u8_formats:
-                        if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
-                            f['acodec'] = None
-                    formats.extend(m3u8_formats)
-                elif format_url.startswith('rtmp'):
-                    mobj = re.search(
-                        r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
-                        format_url)
-                    if not mobj:
-                        continue
-                    formats.append({
-                        'format_id': 'rtmp-%s' % format_id,
-                        'ext': 'flv',
-                        'url': mobj.group('url'),
-                        'play_path': mobj.group('playpath'),
-                        'app': mobj.group('app'),
-                        'page_url': url,
-                    })
-                else:
-                    formats.append({
-                        'url': format_url,
-                        'format_id': '%s-%s' % (format_id, protocol),
-                        'height': height,
-                    })
-        self._sort_formats(formats)
-
-        result['formats'] = formats
         return result
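The viki format loop above fills `filesize` by probing each direct URL with a HEAD request, so the size comes from the `Content-Length` header without downloading any media. A rough standalone equivalent of that probe, assuming Python 3 stdlib as a stand-in for youtube-dlc's `HEADRequest` plus `_request_webpage(..., fatal=False)`:

```python
# Rough stand-in for the Content-Length probe in the hunk above: fetch
# headers only, and treat any failure as "size unknown" rather than an
# error, mirroring fatal=False.
import urllib.error
import urllib.request


def probe_filesize(url):
    req = urllib.request.Request(url, method='HEAD')
    try:
        with urllib.request.urlopen(req) as resp:
            length = resp.headers.get('Content-Length')
            return int(length) if length and length.isdigit() else None
    except urllib.error.URLError:
        return None
```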
diff --git a/youtube_dlc/extractor/vrt.py b/youtube_dlc/extractor/vrt.py
index 422025267..2b65d2e5f 100644
--- a/youtube_dlc/extractor/vrt.py
+++ b/youtube_dlc/extractor/vrt.py
@@ -55,13 +55,13 @@ class VRTIE(InfoExtractor):
         site, display_id = re.match(self._VALID_URL, url).groups()
         webpage = self._download_webpage(url, display_id)
         attrs = extract_attributes(self._search_regex(
-            r'(<[^>]+class="vrtvideo"[^>]*>)', webpage, 'vrt video'))
+            r'(<[^>]+class="vrtvideo( [^"]*)?"[^>]*>)', webpage, 'vrt video'))

-        asset_id = attrs['data-videoid']
-        publication_id = attrs.get('data-publicationid')
+        asset_id = attrs['data-video-id']
+        publication_id = attrs.get('data-publication-id')
         if publication_id:
             asset_id = publication_id + '$' + asset_id
-        client = attrs.get('data-client') or self._CLIENT_MAP[site]
+        client = attrs.get('data-client-code') or self._CLIENT_MAP[site]

         title = strip_or_none(get_element_by_class(
             'vrt-title', webpage) or self._html_search_meta(
diff --git a/youtube_dlc/extractor/xhamster.py b/youtube_dlc/extractor/xhamster.py
index 902a3ed33..76aeaf9a4 100644
--- a/youtube_dlc/extractor/xhamster.py
+++ b/youtube_dlc/extractor/xhamster.py
@@ -138,7 +138,8 @@ class XHamsterIE(InfoExtractor):

         initials = self._parse_json(
             self._search_regex(
-                r'window\.initials\s*=\s*({.+?})\s*;', webpage, 'initials',
+                (r'window\.initials\s*=\s*({.+?})\s*;\s*</script>',
+                 r'window\.initials\s*=\s*({.+?})\s*;'), webpage, 'initials',
                 default='{}'),
             video_id, fatal=False)
         if initials:
diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 1cbbf2085..a97921060 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -549,7 +549,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         '396': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
         '397': {'acodec': 'none', 'vcodec': 'av01.0.05M.08'},
     }
-    _SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt', 'json3')
+    _SUBTITLE_FORMATS = ('json3', 'srv1', 'srv2', 'srv3', 'ttml', 'vtt')

     _GEO_BYPASS = False

@@ -1843,7 +1843,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         # Get video info
         video_info = {}
         embed_webpage = None
-        if self._html_search_meta('og:restrictions:age', video_webpage, default=None) == "18+":
+        if (self._og_search_property('restrictions:age', video_webpage, default=None) == '18+'
+                or re.search(r'player-age-gate-content">', video_webpage) is not None):
             age_gate = True
             # We simulate the access to the video from www.youtube.com/v/{video_id}
             # this can be viewed without login into Youtube
@@ -3036,7 +3037,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):

 class YoutubeUserIE(YoutubeChannelIE):
     IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
-    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)'
+    _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
     _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
     IE_NAME = 'youtube:user'
@@ -3067,6 +3068,9 @@ class YoutubeUserIE(YoutubeChannelIE):
         'url': 'https://www.youtube.com/c/gametrailers',
         'only_matching': True,
     }, {
+        'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
+        'only_matching': True,
+    }, {
         'url': 'https://www.youtube.com/gametrailers',
         'only_matching': True,
     }, {
diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py
index 7dafacac2..32b179c6f 100644
--- a/youtube_dlc/utils.py
+++ b/youtube_dlc/utils.py
@@ -4199,6 +4199,7 @@ def mimetype2ext(mt):
         'vnd.ms-sstr+xml': 'ism',
         'quicktime': 'mov',
         'mp2t': 'ts',
+        'x-wav': 'wav',
     }.get(res, res)

diff --git a/youtube_dlc/version.py b/youtube_dlc/version.py
index 45b4d3291..9dd9adf08 100644
--- a/youtube_dlc/version.py
+++ b/youtube_dlc/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals

-__version__ = '2020.09.06'
+__version__ = '2020.09.12'
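The `YoutubeUserIE._VALID_URL` change earlier in this diff only widens two character classes with `%`, which is what lets the new percent-encoded test URL match. A reduced demonstration of the effect; the pattern below is simplified and omits the real `_VALID_URL`'s negative lookaheads:

```python
# Reduced demonstration of the '%' addition to the id character class.
# Simplified pattern only; not the full _VALID_URL.
import re

PATTERN = r'https?://(?:\w+\.)?youtube\.com/(?:(?:user|c)/)?(?P<id>[A-Za-z0-9_%-]+)'

for test_url in (
        'https://www.youtube.com/c/gametrailers',
        'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
):
    m = re.match(PATTERN, test_url)
    print(m.group('id') if m else 'no match')
# -> gametrailers
# -> Pawe%C5%82Zadro%C5%BCniak
```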