author    Tom-Oliver Heidel <github@tom-oliver.eu>  2020-11-30 02:51:41 +0100
committer GitHub <noreply@github.com>  2020-11-30 02:51:41 +0100
commit    b662fc8d2033c615ffbdd3b51123b446c03255e8 (patch)
tree      3d32dc25a663f4e61fc28860acda67c7a425cb7a
parent    8924ddc3eec4c03c6776673d0d5e823dc5445549 (diff)
parent    929576bb9e4aa31f0516f1437d2ae762afdd9f2c (diff)
Merge branch 'master' into gedi
-rw-r--r--  .github/ISSUE_TEMPLATE/1_broken_site.md | 10
-rw-r--r--  .github/ISSUE_TEMPLATE/2_site_support_request.md | 8
-rw-r--r--  .github/ISSUE_TEMPLATE/3_site_feature_request.md | 6
-rw-r--r--  .github/ISSUE_TEMPLATE/4_bug_report.md | 12
-rw-r--r--  .github/ISSUE_TEMPLATE/5_feature_request.md | 6
-rw-r--r--  .github/ISSUE_TEMPLATE/6_question.md | 6
-rw-r--r--  .github/ISSUE_TEMPLATE_tmpl/1_broken_site.md | 6
-rw-r--r--  .github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md | 6
-rw-r--r--  .github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md | 4
-rw-r--r--  .github/ISSUE_TEMPLATE_tmpl/4_bug_report.md | 8
-rw-r--r--  .github/ISSUE_TEMPLATE_tmpl/5_feature_request.md | 4
-rw-r--r--  .github/workflows/build.yml | 42
-rw-r--r--  README.md | 14
-rw-r--r--  devscripts/make_lazy_extractors.py | 2
-rw-r--r--  docs/supportedsites.md | 40
-rw-r--r--  make_win.bat | 2
-rw-r--r--  setup.py | 2
-rw-r--r--  test/parameters.json | 2
-rw-r--r--  test/test_YoutubeDL.py | 70
-rw-r--r--  test/test_all_urls.py | 40
-rw-r--r--  test/test_utils.py | 28
-rw-r--r--  youtube_dlc/YoutubeDL.py | 93
-rw-r--r--  youtube_dlc/__init__.py | 1
-rw-r--r--  youtube_dlc/compat.py | 2
-rw-r--r--  youtube_dlc/downloader/common.py | 31
-rw-r--r--  youtube_dlc/downloader/external.py | 29
-rw-r--r--  youtube_dlc/downloader/fragment.py | 14
-rw-r--r--  youtube_dlc/downloader/http.py | 4
-rw-r--r--  youtube_dlc/downloader/youtube_live_chat.py | 5
-rw-r--r--  youtube_dlc/extractor/adobepass.py | 7
-rw-r--r--  youtube_dlc/extractor/afreecatv.py | 2
-rw-r--r--  youtube_dlc/extractor/amara.py | 103
-rw-r--r--  youtube_dlc/extractor/arte.py | 167
-rw-r--r--  youtube_dlc/extractor/bandcamp.py | 162
-rw-r--r--  youtube_dlc/extractor/bbc.py | 57
-rw-r--r--  youtube_dlc/extractor/bitchute.py | 8
-rw-r--r--  youtube_dlc/extractor/bitwave.py | 61
-rw-r--r--  youtube_dlc/extractor/box.py | 98
-rw-r--r--  youtube_dlc/extractor/brightcove.py | 21
-rw-r--r--  youtube_dlc/extractor/cda.py | 35
-rw-r--r--  youtube_dlc/extractor/cnbc.py | 19
-rw-r--r--  youtube_dlc/extractor/common.py | 34
-rw-r--r--  youtube_dlc/extractor/condenast.py | 27
-rw-r--r--  youtube_dlc/extractor/discoverynetworks.py | 5
-rw-r--r--  youtube_dlc/extractor/europa.py | 4
-rw-r--r--  youtube_dlc/extractor/extractors.py | 46
-rw-r--r--  youtube_dlc/extractor/franceinter.py | 3
-rw-r--r--  youtube_dlc/extractor/francetv.py | 47
-rw-r--r--  youtube_dlc/extractor/generic.py | 23
-rw-r--r--  youtube_dlc/extractor/googledrive.py | 58
-rw-r--r--  youtube_dlc/extractor/ina.py | 5
-rw-r--r--  youtube_dlc/extractor/infoq.py | 7
-rw-r--r--  youtube_dlc/extractor/instagram.py | 23
-rw-r--r--  youtube_dlc/extractor/iqiyi.py | 2
-rw-r--r--  youtube_dlc/extractor/itv.py | 21
-rw-r--r--  youtube_dlc/extractor/kusi.py | 4
-rw-r--r--  youtube_dlc/extractor/la7.py | 3
-rw-r--r--  youtube_dlc/extractor/lbry.py | 91
-rw-r--r--  youtube_dlc/extractor/lrt.py | 91
-rw-r--r--  youtube_dlc/extractor/mailru.py | 13
-rw-r--r--  youtube_dlc/extractor/malltv.py | 60
-rw-r--r--  youtube_dlc/extractor/medaltv.py | 131
-rw-r--r--  youtube_dlc/extractor/mgtv.py | 10
-rw-r--r--  youtube_dlc/extractor/mtv.py | 19
-rw-r--r--  youtube_dlc/extractor/nbc.py | 5
-rw-r--r--  youtube_dlc/extractor/ndr.py | 38
-rw-r--r--  youtube_dlc/extractor/netzkino.py | 47
-rw-r--r--  youtube_dlc/extractor/newgrounds.py | 107
-rw-r--r--  youtube_dlc/extractor/nitter.py | 167
-rw-r--r--  youtube_dlc/extractor/npr.py | 2
-rw-r--r--  youtube_dlc/extractor/nrk.py | 424
-rw-r--r--  youtube_dlc/extractor/nytimes.py | 38
-rw-r--r--  youtube_dlc/extractor/pbs.py | 2
-rw-r--r--  youtube_dlc/extractor/pinterest.py | 201
-rw-r--r--  youtube_dlc/extractor/rai.py | 145
-rw-r--r--  youtube_dlc/extractor/rcs.py | 413
-rw-r--r--  youtube_dlc/extractor/rumble.py | 67
-rw-r--r--  youtube_dlc/extractor/servus.py | 111
-rw-r--r--  youtube_dlc/extractor/skyitalia.py | 123
-rw-r--r--  youtube_dlc/extractor/soundcloud.py | 2
-rw-r--r--  youtube_dlc/extractor/southpark.py | 2
-rw-r--r--  youtube_dlc/extractor/spiegel.py | 153
-rw-r--r--  youtube_dlc/extractor/spreaker.py | 176
-rw-r--r--  youtube_dlc/extractor/svt.py | 48
-rw-r--r--  youtube_dlc/extractor/tagesschau.py | 2
-rw-r--r--  youtube_dlc/extractor/theplatform.py | 2
-rw-r--r--  youtube_dlc/extractor/thisvid.py | 97
-rw-r--r--  youtube_dlc/extractor/turner.py | 6
-rw-r--r--  youtube_dlc/extractor/tvland.py | 2
-rw-r--r--  youtube_dlc/extractor/twentythreevideo.py | 11
-rw-r--r--  youtube_dlc/extractor/urplay.py | 78
-rw-r--r--  youtube_dlc/extractor/usanetwork.py | 82
-rw-r--r--  youtube_dlc/extractor/ustream.py | 7
-rw-r--r--  youtube_dlc/extractor/viki.py | 205
-rw-r--r--  youtube_dlc/extractor/vimeo.py | 15
-rw-r--r--  youtube_dlc/extractor/vlive.py | 451
-rw-r--r--  youtube_dlc/extractor/xiami.py | 8
-rw-r--r--  youtube_dlc/extractor/xtube.py | 18
-rw-r--r--  youtube_dlc/extractor/youporn.py | 7
-rw-r--r--  youtube_dlc/extractor/youtube.py | 1926
-rw-r--r--  youtube_dlc/extractor/zoom.py | 82
-rw-r--r--  youtube_dlc/options.py | 6
-rw-r--r--  youtube_dlc/postprocessor/embedthumbnail.py | 11
-rw-r--r--  youtube_dlc/postprocessor/ffmpeg.py | 22
-rw-r--r--  youtube_dlc/update.py | 20
-rw-r--r--  youtube_dlc/utils.py | 37
-rw-r--r--  youtube_dlc/version.py | 2
107 files changed, 4989 insertions(+), 2343 deletions(-)
diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md
index bf4251004..32c14aa85 100644
--- a/.github/ISSUE_TEMPLATE/1_broken_site.md
+++ b/.github/ISSUE_TEMPLATE/1_broken_site.md
@@ -21,15 +21,15 @@ assignees: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dlc:
-- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is 2020.10.26. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is 2020.10.31. If it's not, see https://github.com/blackjack4494/yt-dlc on how to update. Issues with outdated version will be REJECTED.
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser.
-- Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape.
-- Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates.
+- Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in https://github.com/blackjack4494/yt-dlc.
+- Search the bugtracker for similar issues: https://github.com/blackjack4494/yt-dlc. DO NOT post duplicates.
- Finally, put x into all relevant boxes (like this [x])
-->
- [ ] I'm reporting a broken site support
-- [ ] I've verified that I'm running youtube-dlc version **2020.10.26**
+- [ ] I've verified that I'm running youtube-dlc version **2020.10.31**
- [ ] I've checked that all provided URLs are alive and playable in a browser
- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped
- [ ] I've searched the bugtracker for similar issues including closed ones
@@ -44,7 +44,7 @@ Add the `-v` flag to your command line you run youtube-dlc with (`youtube-dlc -v
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
- [debug] youtube-dlc version 2020.10.26
+ [debug] youtube-dlc version 2020.10.31
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}
diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md
index 889005097..fe1aade05 100644
--- a/.github/ISSUE_TEMPLATE/2_site_support_request.md
+++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md
@@ -21,15 +21,15 @@ assignees: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dlc:
-- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is 2020.10.26. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is 2020.10.31. If it's not, see https://github.com/blackjack4494/yt-dlc on how to update. Issues with outdated version will be REJECTED.
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser.
-- Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dlc does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights.
-- Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates.
+- Make sure that site you are requesting is not dedicated to copyright infringement, see https://github.com/blackjack4494/yt-dlc. youtube-dlc does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights.
+- Search the bugtracker for similar site support requests: https://github.com/blackjack4494/yt-dlc. DO NOT post duplicates.
- Finally, put x into all relevant boxes (like this [x])
-->
- [ ] I'm reporting a new site support request
-- [ ] I've verified that I'm running youtube-dlcc version **2020.10.26**
+- [ ] I've verified that I'm running youtube-dlcc version **2020.10.31**
- [ ] I've checked that all provided URLs are alive and playable in a browser
- [ ] I've checked that none of provided URLs violate any copyrights
- [ ] I've searched the bugtracker for similar site support requests including closed ones
diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md
index e5d714388..cddb81dda 100644
--- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md
+++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md
@@ -21,13 +21,13 @@ assignees: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dlc:
-- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is 2020.10.26. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
-- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates.
+- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is 2020.10.31. If it's not, see https://github.com/blackjack4494/yt-dlc on how to update. Issues with outdated version will be REJECTED.
+- Search the bugtracker for similar site feature requests: https://github.com/blackjack4494/yt-dlc. DO NOT post duplicates.
- Finally, put x into all relevant boxes (like this [x])
-->
- [ ] I'm reporting a site feature request
-- [ ] I've verified that I'm running youtube-dlc version **2020.10.26**
+- [ ] I've verified that I'm running youtube-dlc version **2020.10.31**
- [ ] I've searched the bugtracker for similar site feature requests including closed ones
diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md
index 9de52f98c..920ae8dbc 100644
--- a/.github/ISSUE_TEMPLATE/4_bug_report.md
+++ b/.github/ISSUE_TEMPLATE/4_bug_report.md
@@ -21,16 +21,16 @@ assignees: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dlc:
-- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is 2020.10.26. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is 2020.10.31. If it's not, see https://github.com/blackjack4494/yt-dlc on how to update. Issues with outdated version will be REJECTED.
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser.
-- Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape.
-- Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates.
-- Read bugs section in FAQ: http://yt-dl.org/reporting
+- Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in https://github.com/blackjack4494/yt-dlc.
+- Search the bugtracker for similar issues: https://github.com/blackjack4494/yt-dlc. DO NOT post duplicates.
+- Read bugs section in FAQ: https://github.com/blackjack4494/yt-dlc
- Finally, put x into all relevant boxes (like this [x])
-->
- [ ] I'm reporting a broken site support issue
-- [ ] I've verified that I'm running youtube-dlc version **2020.10.26**
+- [ ] I've verified that I'm running youtube-dlc version **2020.10.31**
- [ ] I've checked that all provided URLs are alive and playable in a browser
- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped
- [ ] I've searched the bugtracker for similar bug reports including closed ones
@@ -46,7 +46,7 @@ Add the `-v` flag to your command line you run youtube-dlc with (`youtube-dlc -v
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
- [debug] youtube-dlc version 2020.10.26
+ [debug] youtube-dlc version 2020.10.31
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}
diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md
index 86fac96dd..7cc390f58 100644
--- a/.github/ISSUE_TEMPLATE/5_feature_request.md
+++ b/.github/ISSUE_TEMPLATE/5_feature_request.md
@@ -21,13 +21,13 @@ assignees: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dlc:
-- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is 2020.10.26. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
-- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates.
+- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is 2020.10.31. If it's not, see https://github.com/blackjack4494/yt-dlc on how to update. Issues with outdated version will be REJECTED.
+- Search the bugtracker for similar feature requests: https://github.com/blackjack4494/yt-dlc. DO NOT post duplicates.
- Finally, put x into all relevant boxes (like this [x])
-->
- [ ] I'm reporting a feature request
-- [ ] I've verified that I'm running youtube-dlc version **2020.10.26**
+- [ ] I've verified that I'm running youtube-dlc version **2020.10.31**
- [ ] I've searched the bugtracker for similar feature requests including closed ones
diff --git a/.github/ISSUE_TEMPLATE/6_question.md b/.github/ISSUE_TEMPLATE/6_question.md
index 034a9c5ac..3c3ae0f3b 100644
--- a/.github/ISSUE_TEMPLATE/6_question.md
+++ b/.github/ISSUE_TEMPLATE/6_question.md
@@ -21,8 +21,8 @@ assignees: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl:
-- Look through the README (http://yt-dl.org/readme) and FAQ (http://yt-dl.org/faq) for similar questions
-- Search the bugtracker for similar questions: http://yt-dl.org/search-issues
+- Look through the README (https://github.com/blackjack4494/yt-dlc) and FAQ (https://github.com/blackjack4494/yt-dlc) for similar questions
+- Search the bugtracker for similar questions: https://github.com/blackjack4494/yt-dlc
- Finally, put x into all relevant boxes (like this [x])
-->
@@ -34,7 +34,7 @@ Carefully read and work through this check list in order to prevent the most com
## Question
<!--
-Ask your question in an arbitrary form. Please make sure it's worded well enough to be understood, see https://github.com/ytdl-org/youtube-dl#is-the-description-of-the-issue-itself-sufficient.
+Ask your question in an arbitrary form. Please make sure it's worded well enough to be understood, see https://github.com/blackjack4494/yt-dlc.
-->
WRITE QUESTION HERE
diff --git a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md
index 8f9bb2c33..3fe4d6968 100644
--- a/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md
+++ b/.github/ISSUE_TEMPLATE_tmpl/1_broken_site.md
@@ -18,10 +18,10 @@ title: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dlc:
-- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is %(version)s. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is %(version)s. If it's not, see https://github.com/blackjack4494/yt-dlc on how to update. Issues with outdated version will be REJECTED.
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser.
-- Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape.
-- Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates.
+- Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in https://github.com/blackjack4494/yt-dlc.
+- Search the bugtracker for similar issues: https://github.com/blackjack4494/yt-dlc. DO NOT post duplicates.
- Finally, put x into all relevant boxes (like this [x])
-->
diff --git a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md
index 9748afd4d..aad8fa054 100644
--- a/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md
+++ b/.github/ISSUE_TEMPLATE_tmpl/2_site_support_request.md
@@ -19,10 +19,10 @@ labels: 'site-support-request'
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dlc:
-- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is %(version)s. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is %(version)s. If it's not, see https://github.com/blackjack4494/yt-dlc on how to update. Issues with outdated version will be REJECTED.
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser.
-- Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dlc does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights.
-- Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates.
+- Make sure that site you are requesting is not dedicated to copyright infringement, see https://github.com/blackjack4494/yt-dlc. youtube-dlc does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights.
+- Search the bugtracker for similar site support requests: https://github.com/blackjack4494/yt-dlc. DO NOT post duplicates.
- Finally, put x into all relevant boxes (like this [x])
-->
diff --git a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md
index f274e8aeb..2fb82f828 100644
--- a/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md
+++ b/.github/ISSUE_TEMPLATE_tmpl/3_site_feature_request.md
@@ -18,8 +18,8 @@ title: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dlc:
-- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is %(version)s. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
-- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates.
+- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is %(version)s. If it's not, see https://github.com/blackjack4494/yt-dlc on how to update. Issues with outdated version will be REJECTED.
+- Search the bugtracker for similar site feature requests: https://github.com/blackjack4494/yt-dlc. DO NOT post duplicates.
- Finally, put x into all relevant boxes (like this [x])
-->
diff --git a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md
index 788f1c9a1..b7bebf8ab 100644
--- a/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md
+++ b/.github/ISSUE_TEMPLATE_tmpl/4_bug_report.md
@@ -18,11 +18,11 @@ title: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dlc:
-- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is %(version)s. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is %(version)s. If it's not, see https://github.com/blackjack4494/yt-dlc on how to update. Issues with outdated version will be REJECTED.
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser.
-- Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape.
-- Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates.
-- Read bugs section in FAQ: http://yt-dl.org/reporting
+- Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in https://github.com/blackjack4494/yt-dlc.
+- Search the bugtracker for similar issues: https://github.com/blackjack4494/yt-dlc. DO NOT post duplicates.
+- Read bugs section in FAQ: https://github.com/blackjack4494/yt-dlc
- Finally, put x into all relevant boxes (like this [x])
-->
diff --git a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md
index 9b3b8c3bf..99592f79d 100644
--- a/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md
+++ b/.github/ISSUE_TEMPLATE_tmpl/5_feature_request.md
@@ -19,8 +19,8 @@ labels: 'request'
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dlc:
-- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is %(version)s. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
-- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates.
+- First of, make sure you are using the latest version of youtube-dlc. Run `youtube-dlc --version` and ensure your version is %(version)s. If it's not, see https://github.com/blackjack4494/yt-dlc on how to update. Issues with outdated version will be REJECTED.
+- Search the bugtracker for similar feature requests: https://github.com/blackjack4494/yt-dlc. DO NOT post duplicates.
- Finally, put x into all relevant boxes (like this [x])
-->
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 8db7e92f2..dd6a95256 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -20,7 +20,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
- python-version: '3.x'
+ python-version: '3.8'
- name: Install packages
run: sudo apt-get -y install zip pandoc man
- name: Bump version
@@ -57,7 +57,7 @@ jobs:
id: sha2_file
env:
SHA2: ${{ hashFiles('youtube-dlc') }}
- run: echo "::set-output name=sha2_unix::${env:SHA2}"
+ run: echo "::set-output name=sha2_unix::$SHA2"
- name: Install dependencies for pypi
run: |
python -m pip install --upgrade pip
@@ -82,7 +82,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
- python-version: '3.x'
+ python-version: '3.8'
- name: Install Requirements
run: pip install pyinstaller
- name: Bump version
@@ -98,25 +98,25 @@ jobs:
upload_url: ${{ needs.build_unix.outputs.upload_url }}
asset_path: ./dist/youtube-dlc.exe
asset_name: youtube-dlc.exe
- asset_content_type: application/octet-stream
+ asset_content_type: application/vnd.microsoft.portable-executable
- name: Get SHA2-256SUMS for youtube-dlc.exe
id: sha2_file_win
env:
- SHA2: ${{ hashFiles('dist/youtube-dlc.exe') }}
- run: echo "::set-output name=sha2_windows::${env:SHA2}"
+ SHA2_win: ${{ hashFiles('dist/youtube-dlc.exe') }}
+ run: echo "::set-output name=sha2_windows::$SHA2_win"
build_windows32:
runs-on: windows-latest
- needs: build_unix
+ needs: [build_unix, build_windows]
steps:
- uses: actions/checkout@v2
- - name: Set up Python 3.5.4 32-Bit
+ - name: Set up Python 3.4.4 32-Bit
uses: actions/setup-python@v2
with:
- python-version: '3.5.4'
+ python-version: '3.4.4'
architecture: 'x86'
- name: Install Requirements for 32 Bit
run: pip install pyinstaller==3.5
@@ -133,12 +133,12 @@ jobs:
upload_url: ${{ needs.build_unix.outputs.upload_url }}
asset_path: ./dist/youtube-dlc_x86.exe
asset_name: youtube-dlc_x86.exe
- asset_content_type: application/octet-stream
+ asset_content_type: application/vnd.microsoft.portable-executable
- name: Get SHA2-256SUMS for youtube-dlc_x86.exe
id: sha2_file_win32
env:
- SHA2: ${{ hashFiles('dist/youtube-dlc_x86.exe') }}
- run: echo "::set-output name=sha2_windows32::${env:SHA2}"
+ SHA2_win32: ${{ hashFiles('dist/youtube-dlc_x86.exe') }}
+ run: echo "::set-output name=sha2_windows32::$SHA2_win32"
- name: Make SHA2-256SUMS file
env:
SHA2_WINDOWS: ${{ needs.build_windows.outputs.sha2_windows }}
@@ -146,6 +146,18 @@ jobs:
SHA2_UNIX: ${{ needs.build_unix.outputs.sha2_unix }}
YTDLC_VERSION: ${{ needs.build_unix.outputs.ytdlc_version }}
run: |
- echo "$SHA2_WINDOWS youtube-dlc.exe" > SHA2-256SUMS
- echo "$SHA2_WINDOWS32 youtube-dlc32.exe" > SHA2-256SUMS
- echo "$SHA2_UNIX youtube-dlc" >> SHA2-256SUMS
+ echo "version:${env:YTDLC_VERSION}" >> SHA2-256SUMS
+ echo "youtube-dlc.exe:${env:SHA2_WINDOWS}" >> SHA2-256SUMS
+ echo "youtube-dlc_x86.exe:${env:SHA2_WINDOWS32}" >> SHA2-256SUMS
+ echo "youtube-dlc:${env:SHA2_UNIX}" >> SHA2-256SUMS
+
+ - name: Upload 256SUMS file
+ id: upload-sums
+ uses: actions/upload-release-asset@v1
+ env:
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ with:
+ upload_url: ${{ needs.build_unix.outputs.upload_url }}
+ asset_path: ./SHA2-256SUMS
+ asset_name: SHA2-256SUMS
+ asset_content_type: text/plain
diff --git a/README.md b/README.md
index 9d40d2631..170c85c48 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,15 @@
-[![Build Status](https://travis-ci.com/blackjack4494/youtube-dlc.svg?branch=master)](https://travis-ci.com/blackjack4494/youtube-dlc)
+[![Build Status](https://travis-ci.com/blackjack4494/yt-dlc.svg?branch=master)](https://travis-ci.com/blackjack4494/yt-dlc)
[![PyPi](https://img.shields.io/pypi/v/youtube-dlc.svg)](https://pypi.org/project/youtube-dlc)
-[![Downloads](https://pepy.tech/badge/youtube-dlc)](https://pepy.tech/project/youtube-dlc)
[![Gitter chat](https://img.shields.io/gitter/room/youtube-dlc/community)](https://gitter.im/youtube-dlc)
-[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/blackjack4494/youtube-dlc/blob/master/LICENSE)
+[![License: Unlicense](https://img.shields.io/badge/license-Unlicense-blue.svg)](https://github.com/blackjack4494/yt-dlc/blob/master/LICENSE)
youtube-dlc - download videos from youtube.com or other video platforms.
-youtube-dlc is a fork of youtube-dl with the intention of getting features tested by the community merged in the tool faster, since youtube-dl's development seems to be slowing down. (https://github.com/ytdl-org/youtube-dl/issues/26462)
+youtube-dlc is a fork of youtube-dl with the intention of getting features tested by the community merged in the tool faster, since youtube-dl's development seems to be slowing down. (https://web.archive.org/web/20201014194602/https://github.com/ytdl-org/youtube-dl/issues/26462)
- [INSTALLATION](#installation)
+- [UPDATE](#update)
- [DESCRIPTION](#description)
- [OPTIONS](#options)
- [Network Options:](#network-options)
@@ -44,6 +44,10 @@ You may want to use `python3` instead of `python`
python -m pip install --upgrade youtube-dlc
+If you want to install the current master branch
+
+ python -m pip install git+https://github.com/blackjack4494/yt-dlc
+
**UNIX** (Linux, macOS, etc.)
Using wget:
@@ -213,6 +217,8 @@ I will add some memorable short links to the binaries so you can download them e
--download-archive FILE Download only videos not listed in the
archive file. Record the IDs of all
downloaded videos in it.
+ --break-on-existing Stop the download process after attempting
+ to download a file that's in the archive.
--include-ads Download advertisements as well
(experimental)
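Editor's note: the `--break-on-existing` flag documented above maps to a `break_on_existing` key in the options dict (wired up in the `youtube_dlc/__init__.py` and `YoutubeDL.py` hunks further down). A minimal sketch of driving it from the embedding API; the archive filename is a placeholder:

    import youtube_dlc

    ydl_opts = {
        'download_archive': 'archive.txt',  # placeholder path; records downloaded IDs
        'break_on_existing': True,          # stop at the first already-archived entry
    }
    with youtube_dlc.YoutubeDL(ydl_opts) as ydl:
        ydl.download(['https://www.youtube.com/playlist?list=PL4023E734DA416012'])
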
diff --git a/devscripts/make_lazy_extractors.py b/devscripts/make_lazy_extractors.py
index e6de72b33..c27ef9781 100644
--- a/devscripts/make_lazy_extractors.py
+++ b/devscripts/make_lazy_extractors.py
@@ -61,7 +61,7 @@ def build_lazy_ie(ie, name):
return s
-# find the correct sorting and add the required base classes so that sublcasses
+# find the correct sorting and add the required base classes so that subclasses
# can be correctly created
classes = _ALL_CLASSES[:-1]
ordered_cls = []
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index c46d122ff..0b183b272 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -59,9 +59,9 @@
- **ARD:mediathek**
- **ARDBetaMediathek**
- **Arkena**
- - **arte.tv:+7**
- - **arte.tv:embed**
- - **arte.tv:playlist**
+ - **ArteTV**
+ - **ArteTVEmbed**
+ - **ArteTVPlaylist**
- **AsianCrush**
- **AsianCrushPlaylist**
- **AtresPlayer**
@@ -104,12 +104,14 @@
- **BIQLE**
- **BitChute**
- **BitChuteChannel**
+ - **bitwave.tv**
- **BleacherReport**
- **BleacherReportCMS**
- **blinkx**
- **Bloomberg**
- **BokeCC**
- **BostonGlobe**
+ - **Box**
- **Bpb**: Bundeszentrale für politische Bildung
- **BR**: Bayerischer Rundfunk
- **BravoTV**
@@ -157,6 +159,7 @@
- **Chilloutzone**
- **chirbit**
- **chirbit:profile**
+ - **cielotv.it**
- **Cinchcast**
- **Cinemax**
- **CiscoLiveSearch**
@@ -424,6 +427,7 @@
- **la7.it**
- **laola1tv**
- **laola1tv:embed**
+ - **lbry.tv**
- **LCI**
- **Lcp**
- **LcpPlay**
@@ -474,6 +478,7 @@
- **massengeschmack.tv**
- **MatchTV**
- **MDR**: MDR.DE and KiKA
+ - **MedalTV**
- **media.ccc.de**
- **media.ccc.de:lists**
- **Medialaan**
@@ -582,6 +587,7 @@
- **niconico**: ニコニコ動画
- **NiconicoPlaylist**
- **Nintendo**
+ - **Nitter**
- **njoy**: N-JOY
- **njoy:embed**
- **NJPWWorld**: 新日本プロレスワールド
@@ -616,6 +622,7 @@
- **Nuvid**
- **NYTimes**
- **NYTimesArticle**
+ - **NYTimesCooking**
- **NZZ**
- **ocw.mit.edu**
- **OdaTV**
@@ -668,6 +675,8 @@
- **PicartoVod**
- **Piksel**
- **Pinkbike**
+ - **Pinterest**
+ - **PinterestCollection**
- **Pladform**
- **Platzi**
- **PlatziCourse**
@@ -764,6 +773,7 @@
- **RTVNH**
- **RTVS**
- **RUHD**
+ - **RumbleEmbed**
- **rutube**: Rutube videos
- **rutube:channel**: Rutube channels
- **rutube:embed**: Rutube embedded videos
@@ -834,12 +844,14 @@
- **SpankBangPlaylist**
- **Spankwire**
- **Spiegel**
- - **Spiegel:Article**: Articles on spiegel.de
- - **Spiegeltv**
- **sport.francetvinfo.fr**
- **Sport5**
- **SportBox**
- **SportDeutschland**
+ - **Spreaker**
+ - **SpreakerPage**
+ - **SpreakerShow**
+ - **SpreakerShowPage**
- **SpringboardPlatform**
- **Sprout**
- **sr:mediathek**: Saarländischer Rundfunk
@@ -943,6 +955,7 @@
- **TV2DKBornholmPlay**
- **TV4**: tv4.se and tv4play.se
- **TV5MondePlus**: TV5MONDE+
+ - **tv8.it**
- **TVA**
- **TVANouvelles**
- **TVANouvellesArticle**
@@ -1057,7 +1070,7 @@
- **vk:wallpost**
- **vlive**
- **vlive:channel**
- - **vlive:playlist**
+ - **vlive:post**
- **Vodlocker**
- **VODPl**
- **VODPlatform**
@@ -1146,20 +1159,17 @@
- **YourPorn**
- **YourUpload**
- **youtube**: YouTube.com
- - **youtube:channel**: YouTube.com channels
- - **youtube:favorites**: YouTube.com favourite videos, ":ytfav" for short (requires authentication)
+ - **youtube:favorites**: YouTube.com liked videos, ":ytfav" for short (requires authentication)
- **youtube:history**: Youtube watch history, ":ythistory" for short (requires authentication)
- - **youtube:live**: YouTube.com live streams
- **youtube:playlist**: YouTube.com playlists
- - **youtube:playlists**: YouTube.com user/channel playlists
- **youtube:recommended**: YouTube.com recommended videos, ":ytrec" for short (requires authentication)
- - **youtube:search**: YouTube.com searches
- - **youtube:search:date**: YouTube.com searches, newest videos first
+ - **youtube:search**: YouTube.com searches, "ytsearch" keyword
+ - **youtube:search:date**: YouTube.com searches, newest videos first, "ytsearchdate" keyword
- **youtube:search_url**: YouTube.com search URLs
- - **youtube:show**: YouTube.com (multi-season) shows
- - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
- - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
+ - **youtube:subscriptions**: YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)
+ - **youtube:tab**: YouTube.com tab
- **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
+ - **YoutubeYtUser**: YouTube.com user videos, URL or "ytuser" keyword
- **Zapiks**
- **Zaq1**
- **Zattoo**
diff --git a/make_win.bat b/make_win.bat
index 891d517b3..c35d9937e 100644
--- a/make_win.bat
+++ b/make_win.bat
@@ -1 +1 @@
-py -m PyInstaller youtube_dlc\__main__.py --onefile --name youtube-dlc --version-file win\ver.txt --icon win\icon\cloud.ico \ No newline at end of file
+py -m PyInstaller youtube_dlc\__main__.py --onefile --name youtube-dlc --version-file win\ver.txt --icon win\icon\cloud.ico --upx-exclude=vcruntime140.dll \ No newline at end of file
diff --git a/setup.py b/setup.py
index a10ef0a77..6908f2404 100644
--- a/setup.py
+++ b/setup.py
@@ -66,7 +66,7 @@ setup(
description=DESCRIPTION,
long_description=LONG_DESCRIPTION,
# long_description_content_type="text/markdown",
- url="https://github.com/blackjack4494/youtube-dlc",
+ url="https://github.com/blackjack4494/yt-dlc",
packages=find_packages(exclude=("youtube_dl","test",)),
#packages=[
# 'youtube_dlc',
diff --git a/test/parameters.json b/test/parameters.json
index 7bf59c25f..65fd54428 100644
--- a/test/parameters.json
+++ b/test/parameters.json
@@ -37,7 +37,7 @@
"writeinfojson": true,
"writesubtitles": false,
"allsubtitles": false,
- "listssubtitles": false,
+ "listsubtitles": false,
"socket_timeout": 20,
"fixup": "never"
}
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 6d02c2a54..a9e649191 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -919,6 +919,76 @@ class TestYoutubeDL(unittest.TestCase):
self.assertEqual(downloaded['extractor'], 'testex')
self.assertEqual(downloaded['extractor_key'], 'TestEx')
+ # Test case for https://github.com/ytdl-org/youtube-dl/issues/27064
+ def test_ignoreerrors_for_playlist_with_url_transparent_iterable_entries(self):
+
+ class _YDL(YDL):
+ def __init__(self, *args, **kwargs):
+ super(_YDL, self).__init__(*args, **kwargs)
+
+ def trouble(self, s, tb=None):
+ pass
+
+ ydl = _YDL({
+ 'format': 'extra',
+ 'ignoreerrors': True,
+ })
+
+ class VideoIE(InfoExtractor):
+ _VALID_URL = r'video:(?P<id>\d+)'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ formats = [{
+ 'format_id': 'default',
+ 'url': 'url:',
+ }]
+ if video_id == '0':
+ raise ExtractorError('foo')
+ if video_id == '2':
+ formats.append({
+ 'format_id': 'extra',
+ 'url': TEST_URL,
+ })
+ return {
+ 'id': video_id,
+ 'title': 'Video %s' % video_id,
+ 'formats': formats,
+ }
+
+ class PlaylistIE(InfoExtractor):
+ _VALID_URL = r'playlist:'
+
+ def _entries(self):
+ for n in range(3):
+ video_id = compat_str(n)
+ yield {
+ '_type': 'url_transparent',
+ 'ie_key': VideoIE.ie_key(),
+ 'id': video_id,
+ 'url': 'video:%s' % video_id,
+ 'title': 'Video Transparent %s' % video_id,
+ }
+
+ def _real_extract(self, url):
+ return self.playlist_result(self._entries())
+
+ ydl.add_info_extractor(VideoIE(ydl))
+ ydl.add_info_extractor(PlaylistIE(ydl))
+ info = ydl.extract_info('playlist:')
+ entries = info['entries']
+ self.assertEqual(len(entries), 3)
+ self.assertTrue(entries[0] is None)
+ self.assertTrue(entries[1] is None)
+ self.assertEqual(len(ydl.downloaded_info_dicts), 1)
+ downloaded = ydl.downloaded_info_dicts[0]
+ self.assertEqual(entries[2], downloaded)
+ self.assertEqual(downloaded['url'], TEST_URL)
+ self.assertEqual(downloaded['title'], 'Video Transparent 2')
+ self.assertEqual(downloaded['id'], '2')
+ self.assertEqual(downloaded['extractor'], 'Video')
+ self.assertEqual(downloaded['extractor_key'], 'Video')
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index 548bc6750..8dcdc4e58 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -31,45 +31,47 @@ class TestAllURLsMatching(unittest.TestCase):
def test_youtube_playlist_matching(self):
assertPlaylist = lambda url: self.assertMatch(url, ['youtube:playlist'])
+ assertTab = lambda url: self.assertMatch(url, ['youtube:tab'])
assertPlaylist('ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
assertPlaylist('UUBABnxM4Ar9ten8Mdjj1j0Q') # 585
- assertPlaylist('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
+ assertPlaylist('PL63F0C78739B09958')
+ assertTab('https://www.youtube.com/AsapSCIENCE')
+ assertTab('https://www.youtube.com/embedded')
+ assertTab('https://www.youtube.com/feed') # Own channel's home page
+ assertTab('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')
assertPlaylist('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')
- assertPlaylist('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
- assertPlaylist('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
+ assertTab('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')
+ assertTab('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012') # 668
self.assertFalse('youtube:playlist' in self.matching_ies('PLtS2H6bU1M'))
# Top tracks
- assertPlaylist('https://www.youtube.com/playlist?list=MCUS.20142101')
+ assertTab('https://www.youtube.com/playlist?list=MCUS.20142101')
def test_youtube_matching(self):
self.assertTrue(YoutubeIE.suitable('PLtS2H6bU1M'))
self.assertFalse(YoutubeIE.suitable('https://www.youtube.com/watch?v=AV6J6_AeFEQ&playnext=1&list=PL4023E734DA416012')) # 668
self.assertMatch('http://youtu.be/BaW_jenozKc', ['youtube'])
- self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube'])
+ # self.assertMatch('http://www.youtube.com/v/BaW_jenozKc', ['youtube']) # /v/ is no longer valid
self.assertMatch('https://youtube.googleapis.com/v/BaW_jenozKc', ['youtube'])
self.assertMatch('http://www.cleanvideosearch.com/media/action/yt/watch?videoId=8v_4O44sfjM', ['youtube'])
def test_youtube_channel_matching(self):
- assertChannel = lambda url: self.assertMatch(url, ['youtube:channel'])
+ assertChannel = lambda url: self.assertMatch(url, ['youtube:tab'])
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM?feature=gb_ch_rec')
assertChannel('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')
- def test_youtube_user_matching(self):
- self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
+ # def test_youtube_user_matching(self):
+ # self.assertMatch('http://www.youtube.com/NASAgovVideo/videos', ['youtube:tab'])
def test_youtube_feeds(self):
- self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
- self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
- self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
- self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
-
- def test_youtube_show_matching(self):
- self.assertMatch('http://www.youtube.com/show/airdisasters', ['youtube:show'])
-
- def test_youtube_search_matching(self):
- self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
- self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
+ self.assertMatch('https://www.youtube.com/feed/library', ['youtube:tab'])
+ self.assertMatch('https://www.youtube.com/feed/history', ['youtube:tab'])
+ self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:tab'])
+ self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:tab'])
+
+ # def test_youtube_search_matching(self):
+ # self.assertMatch('http://www.youtube.com/results?search_query=making+mustard', ['youtube:search_url'])
+ # self.assertMatch('https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', ['youtube:search_url'])
def test_youtube_extract(self):
assertExtractId = lambda url, id: self.assertEqual(YoutubeIE.extract_id(url), id)
diff --git a/test/test_utils.py b/test/test_utils.py
index 95231200b..16ad40831 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -937,6 +937,28 @@ class TestUtil(unittest.TestCase):
self.assertEqual(d['x'], 1)
self.assertEqual(d['y'], 'a')
+ # Just drop ! prefix for now though this results in a wrong value
+ on = js_to_json('''{
+ a: !0,
+ b: !1,
+ c: !!0,
+ d: !!42.42,
+ e: !!![],
+ f: !"abc",
+ g: !"",
+ !42: 42
+ }''')
+ self.assertEqual(json.loads(on), {
+ 'a': 0,
+ 'b': 1,
+ 'c': 0,
+ 'd': 42.42,
+ 'e': [],
+ 'f': "abc",
+ 'g': "",
+ '42': 42
+ })
+
on = js_to_json('["abc", "def",]')
self.assertEqual(json.loads(on), ['abc', 'def'])
@@ -994,6 +1016,12 @@ class TestUtil(unittest.TestCase):
on = js_to_json('{42:4.2e1}')
self.assertEqual(json.loads(on), {'42': 42.0})
+ on = js_to_json('{ "0x40": "0x40" }')
+ self.assertEqual(json.loads(on), {'0x40': '0x40'})
+
+ on = js_to_json('{ "040": "040" }')
+ self.assertEqual(json.loads(on), {'040': '040'})
+
def test_js_to_json_malformed(self):
self.assertEqual(js_to_json('42a1'), '42"a1"')
self.assertEqual(js_to_json('42a-1'), '42"a"-1')
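Editor's note: as the new test's own comment concedes, `js_to_json` simply drops the `!` prefix rather than evaluating it, so `!0` parses as `0` instead of `true`. A quick way to observe the deliberately lossy behaviour, assuming `youtube_dlc` is importable:

    import json
    from youtube_dlc.utils import js_to_json

    # '!' is stripped, not negated: !0 -> 0, !1 -> 1, !"abc" -> "abc"
    print(json.loads(js_to_json('{a: !0, b: !1, f: !"abc"}')))
    # {'a': 0, 'b': 1, 'f': 'abc'}
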
diff --git a/youtube_dlc/YoutubeDL.py b/youtube_dlc/YoutubeDL.py
index fc351db0d..ef6fe0a78 100644
--- a/youtube_dlc/YoutubeDL.py
+++ b/youtube_dlc/YoutubeDL.py
@@ -210,6 +210,8 @@ class YoutubeDL(object):
download_archive: File name of a file where all downloads are recorded.
Videos already present in the file are not downloaded
again.
+ break_on_existing: Stop the download process after attempting to download a file that's
+ in the archive.
cookiefile: File name where cookies should be read from and dumped to.
nocheckcertificate:Do not verify SSL certificates
prefer_insecure: Use HTTP instead of HTTPS to retrieve information.
@@ -801,7 +803,7 @@ class YoutubeDL(object):
for key, value in extra_info.items():
info_dict.setdefault(key, value)
- def extract_info(self, url, download=True, ie_key=None, extra_info={},
+ def extract_info(self, url, download=True, ie_key=None, info_dict=None, extra_info={},
process=True, force_generic_extractor=False):
'''
Returns a list with a dictionary for each video we find.
@@ -821,26 +823,30 @@ class YoutubeDL(object):
if not ie.suitable(url):
continue
- ie = self.get_info_extractor(ie.ie_key())
+ ie_key = ie.ie_key()
+ ie = self.get_info_extractor(ie_key)
if not ie.working():
self.report_warning('The program functionality for this site has been marked as broken, '
'and will probably not work.')
try:
- ie_result = ie.extract(url)
- if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
- break
- if isinstance(ie_result, list):
- # Backwards compatibility: old IE result format
- ie_result = {
- '_type': 'compat_list',
- 'entries': ie_result,
- }
- self.add_default_extra_info(ie_result, ie, url)
- if process:
- return self.process_ie_result(ie_result, download, extra_info)
- else:
- return ie_result
+ temp_id = ie.extract_id(url) if callable(getattr(ie, 'extract_id', None)) else ie._match_id(url)
+ except (AssertionError, IndexError, AttributeError):
+ temp_id = None
+ if temp_id is not None and self.in_download_archive({'id': temp_id, 'ie_key': ie_key}):
+ self.to_screen("[%s] %s: has already been recorded in archive" % (
+ ie_key, temp_id))
+ break
+
+ return self.__extract_info(url, ie, download, extra_info, process, info_dict)
+
+ else:
+ self.report_error('no suitable InfoExtractor for URL %s' % url)
+
+ def __handle_extraction_exceptions(func):
+ def wrapper(self, *args, **kwargs):
+ try:
+ return func(self, *args, **kwargs)
except GeoRestrictedError as e:
msg = e.msg
if e.countries:
@@ -848,20 +854,38 @@ class YoutubeDL(object):
map(ISO3166Utils.short2full, e.countries))
msg += '\nYou might want to use a VPN or a proxy server (with --proxy) to workaround.'
self.report_error(msg)
- break
except ExtractorError as e: # An error we somewhat expected
self.report_error(compat_str(e), e.format_traceback())
- break
except MaxDownloadsReached:
raise
except Exception as e:
if self.params.get('ignoreerrors', False):
self.report_error(error_to_compat_str(e), tb=encode_compat_str(traceback.format_exc()))
- break
else:
raise
+ return wrapper
+
+ @__handle_extraction_exceptions
+ def __extract_info(self, url, ie, download, extra_info, process, info_dict):
+ ie_result = ie.extract(url)
+ if ie_result is None: # Finished already (backwards compatibility; listformats and friends should be moved here)
+ return
+ if isinstance(ie_result, list):
+ # Backwards compatibility: old IE result format
+ ie_result = {
+ '_type': 'compat_list',
+ 'entries': ie_result,
+ }
+ if info_dict:
+ if info_dict.get('id'):
+ ie_result['id'] = info_dict['id']
+ if info_dict.get('title'):
+ ie_result['title'] = info_dict['title']
+ self.add_default_extra_info(ie_result, ie, url)
+ if process:
+ return self.process_ie_result(ie_result, download, extra_info)
else:
- self.report_error('no suitable InfoExtractor for URL %s' % url)
+ return ie_result
def add_default_extra_info(self, ie_result, ie, url):
self.add_extra_info(ie_result, {
@@ -898,7 +922,7 @@ class YoutubeDL(object):
# We have to add extra_info to the results because it may be
# contained in a playlist
return self.extract_info(ie_result['url'],
- download,
+ download, info_dict=ie_result,
ie_key=ie_result.get('ie_key'),
extra_info=extra_info)
elif result_type == 'url_transparent':
@@ -1033,12 +1057,15 @@ class YoutubeDL(object):
reason = self._match_entry(entry, incomplete=True)
if reason is not None:
- self.to_screen('[download] ' + reason)
- continue
+ if reason.endswith('has already been recorded in the archive') and self.params.get('break_on_existing'):
+ print('[download] tried downloading a file that\'s already in the archive, stopping since --break-on-existing is set.')
+ break
+ else:
+ self.to_screen('[download] ' + reason)
+ continue
- entry_result = self.process_ie_result(entry,
- download=download,
- extra_info=extra)
+ entry_result = self.__process_iterable_entry(entry, download, extra)
+ # TODO: skip failed (empty) entries?
playlist_results.append(entry_result)
ie_result['entries'] = playlist_results
self.to_screen('[download] Finished downloading playlist: %s' % playlist)
@@ -1067,6 +1094,11 @@ class YoutubeDL(object):
else:
raise Exception('Invalid result type: %s' % result_type)
+ @__handle_extraction_exceptions
+ def __process_iterable_entry(self, entry, download, extra_info):
+ return self.process_ie_result(
+ entry, download=download, extra_info=extra_info)
+
def _build_format_filter(self, filter_spec):
" Returns a function to filter the formats according to the filter_spec "
@@ -1852,13 +1884,13 @@ class YoutubeDL(object):
self.report_error('Cannot write annotations file: ' + annofn)
return
- def dl(name, info):
+ def dl(name, info, subtitle=False):
fd = get_suitable_downloader(info, self.params)(self, self.params)
for ph in self._progress_hooks:
fd.add_progress_hook(ph)
if self.params.get('verbose'):
self.to_stdout('[debug] Invoking downloader on %r' % info.get('url'))
- return fd.download(name, info)
+ return fd.download(name, info, subtitle)
subtitles_are_requested = any([self.params.get('writesubtitles', False),
self.params.get('writeautomaticsub')])
@@ -1867,7 +1899,7 @@ class YoutubeDL(object):
# subtitles download errors are already managed as troubles in relevant IE
# that way it will silently go on when used with unsupporting IE
subtitles = info_dict['requested_subtitles']
- ie = self.get_info_extractor(info_dict['extractor_key'])
+ # ie = self.get_info_extractor(info_dict['extractor_key'])
for sub_lang, sub_info in subtitles.items():
sub_format = sub_info['ext']
sub_filename = subtitles_filename(filename, sub_lang, sub_format, info_dict.get('ext'))
@@ -1886,6 +1918,8 @@ class YoutubeDL(object):
return
else:
try:
+ dl(sub_filename, sub_info, subtitle=True)
+ '''
if self.params.get('sleep_interval_subtitles', False):
dl(sub_filename, sub_info)
else:
@@ -1893,6 +1927,7 @@ class YoutubeDL(object):
sub_info['url'], info_dict['id'], note=False).read()
with io.open(encodeFilename(sub_filename), 'wb') as subfile:
subfile.write(sub_data)
+ '''
except (ExtractorError, IOError, OSError, ValueError, compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self.report_warning('Unable to download subtitle for "%s": %s' %
(sub_lang, error_to_compat_str(err)))
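Editor's note: the refactor above moves the old per-exception `break` handling into a `__handle_extraction_exceptions` decorator so that `__extract_info` and `__process_iterable_entry` share one error policy. A simplified, self-contained sketch of the pattern (names shortened, reporting stubbed with `print`):

    class MiniYDL(object):
        def __init__(self, params=None):
            self.params = params or {}

        def _handle_extraction_exceptions(func):
            # Same idea as __handle_extraction_exceptions: expected failures
            # are reported and swallowed when 'ignoreerrors' is set.
            def wrapper(self, *args, **kwargs):
                try:
                    return func(self, *args, **kwargs)
                except Exception as e:
                    if self.params.get('ignoreerrors', False):
                        print('ERROR: %s' % e)
                    else:
                        raise
            return wrapper

        @_handle_extraction_exceptions
        def extract_info(self, url):
            raise ValueError('extraction failed for %s' % url)

    MiniYDL({'ignoreerrors': True}).extract_info('video:0')  # reports, does not raise
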
diff --git a/youtube_dlc/__init__.py b/youtube_dlc/__init__.py
index 105786bc0..7d72ab985 100644
--- a/youtube_dlc/__init__.py
+++ b/youtube_dlc/__init__.py
@@ -405,6 +405,7 @@ def _real_main(argv=None):
'youtube_print_sig_code': opts.youtube_print_sig_code,
'age_limit': opts.age_limit,
'download_archive': download_archive_fn,
+ 'break_on_existing': opts.break_on_existing,
'cookiefile': opts.cookiefile,
'nocheckcertificate': opts.no_check_certificate,
'prefer_insecure': opts.prefer_insecure,
diff --git a/youtube_dlc/compat.py b/youtube_dlc/compat.py
index 1cf7efed6..ac889ddd7 100644
--- a/youtube_dlc/compat.py
+++ b/youtube_dlc/compat.py
@@ -2345,7 +2345,7 @@ except ImportError: # Python <3.4
# HTMLParseError has been deprecated in Python 3.3 and removed in
# Python 3.5. Introducing dummy exception for Python >3.5 for compatible
- # and uniform cross-version exceptiong handling
+ # and uniform cross-version exception handling
class compat_HTMLParseError(Exception):
pass
diff --git a/youtube_dlc/downloader/common.py b/youtube_dlc/downloader/common.py
index 31c286458..7d303be1c 100644
--- a/youtube_dlc/downloader/common.py
+++ b/youtube_dlc/downloader/common.py
@@ -326,7 +326,7 @@ class FileDownloader(object):
"""Report it was impossible to resume download."""
self.to_screen('[download] Unable to resume')
- def download(self, filename, info_dict):
+ def download(self, filename, info_dict, subtitle=False):
"""Download to a filename using the info from info_dict
Return True on success and False otherwise
"""
@@ -353,16 +353,25 @@ class FileDownloader(object):
})
return True
- min_sleep_interval = self.params.get('sleep_interval')
- if min_sleep_interval:
- max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval)
- sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval)
- self.to_screen(
- '[download] Sleeping %s seconds...' % (
- int(sleep_interval) if sleep_interval.is_integer()
- else '%.2f' % sleep_interval))
- time.sleep(sleep_interval)
-
+ if subtitle is False:
+ min_sleep_interval = self.params.get('sleep_interval')
+ if min_sleep_interval:
+ max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval)
+ sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval)
+ self.to_screen(
+ '[download] Sleeping %s seconds...' % (
+ int(sleep_interval) if sleep_interval.is_integer()
+ else '%.2f' % sleep_interval))
+ time.sleep(sleep_interval)
+ else:
+ sleep_interval_sub = 0
+ if type(self.params.get('sleep_interval_subtitles')) is int:
+ sleep_interval_sub = self.params.get('sleep_interval_subtitles')
+ if sleep_interval_sub > 0:
+ self.to_screen(
+ '[download] Sleeping %s seconds...' % (
+ sleep_interval_sub))
+ time.sleep(sleep_interval_sub)
return self.real_download(filename, info_dict)
def real_download(self, filename, info_dict):
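Editor's note: `FileDownloader.download()` now takes a `subtitle` flag and picks the sleep source accordingly: the random `sleep_interval`/`max_sleep_interval` pair for media, a fixed `sleep_interval_subtitles` for subtitle files. Roughly, under the same option names:

    import random
    import time

    def pick_sleep(params, subtitle=False):
        # media: random delay in [sleep_interval, max_sleep_interval];
        # subtitles: fixed sleep_interval_subtitles (0 means no sleep)
        if not subtitle:
            lo = params.get('sleep_interval')
            if not lo:
                return 0
            return random.uniform(lo, params.get('max_sleep_interval', lo))
        sub = params.get('sleep_interval_subtitles')
        return sub if isinstance(sub, int) and sub > 0 else 0

    time.sleep(pick_sleep({'sleep_interval': 1, 'max_sleep_interval': 3}))
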
diff --git a/youtube_dlc/downloader/external.py b/youtube_dlc/downloader/external.py
index c31f8910a..d2f8f271d 100644
--- a/youtube_dlc/downloader/external.py
+++ b/youtube_dlc/downloader/external.py
@@ -115,8 +115,10 @@ class CurlFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '--location', '-o', tmpfilename]
- for key, val in info_dict['http_headers'].items():
- cmd += ['--header', '%s: %s' % (key, val)]
+ if info_dict.get('http_headers') is not None:
+ for key, val in info_dict['http_headers'].items():
+ cmd += ['--header', '%s: %s' % (key, val)]
+
cmd += self._bool_option('--continue-at', 'continuedl', '-', '0')
cmd += self._valueless_option('--silent', 'noprogress')
cmd += self._valueless_option('--verbose', 'verbose')
@@ -150,8 +152,9 @@ class AxelFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-o', tmpfilename]
- for key, val in info_dict['http_headers'].items():
- cmd += ['-H', '%s: %s' % (key, val)]
+ if info_dict.get('http_headers') is not None:
+ for key, val in info_dict['http_headers'].items():
+ cmd += ['-H', '%s: %s' % (key, val)]
cmd += self._configuration_args()
cmd += ['--', info_dict['url']]
return cmd
@@ -162,8 +165,9 @@ class WgetFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict):
cmd = [self.exe, '-O', tmpfilename, '-nv', '--no-cookies']
- for key, val in info_dict['http_headers'].items():
- cmd += ['--header', '%s: %s' % (key, val)]
+ if info_dict.get('http_headers') is not None:
+ for key, val in info_dict['http_headers'].items():
+ cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._option('--limit-rate', 'ratelimit')
retry = self._option('--tries', 'retries')
if len(retry) == 2:
@@ -189,8 +193,9 @@ class Aria2cFD(ExternalFD):
if dn:
cmd += ['--dir', dn]
cmd += ['--out', os.path.basename(tmpfilename)]
- for key, val in info_dict['http_headers'].items():
- cmd += ['--header', '%s: %s' % (key, val)]
+ if info_dict.get('http_headers') is not None:
+ for key, val in info_dict['http_headers'].items():
+ cmd += ['--header', '%s: %s' % (key, val)]
cmd += self._option('--interface', 'source_address')
cmd += self._option('--all-proxy', 'proxy')
cmd += self._bool_option('--check-certificate', 'nocheckcertificate', 'false', 'true', '=')
@@ -206,8 +211,10 @@ class HttpieFD(ExternalFD):
def _make_cmd(self, tmpfilename, info_dict):
cmd = ['http', '--download', '--output', tmpfilename, info_dict['url']]
- for key, val in info_dict['http_headers'].items():
- cmd += ['%s:%s' % (key, val)]
+
+ if info_dict.get('http_headers') is not None:
+ for key, val in info_dict['http_headers'].items():
+ cmd += ['%s:%s' % (key, val)]
return cmd
@@ -253,7 +260,7 @@ class FFmpegFD(ExternalFD):
# if end_time:
# args += ['-t', compat_str(end_time - start_time)]
- if info_dict['http_headers'] and re.match(r'^https?://', url):
+ if info_dict.get('http_headers') is not None and re.match(r'^https?://', url):
# Trailing \r\n after each HTTP header is important to prevent warning from ffmpeg/avconv:
# [http @ 00000000003d2fa0] No trailing CRLF found in HTTP header.
headers = handle_youtubedl_headers(info_dict['http_headers'])
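
Every command builder above now checks info_dict.get('http_headers') before emitting per-header flags, since an info_dict may arrive without that key. The repeated guard could be factored into one helper; a sketch under that assumption (build_header_args is hypothetical, not a function in the codebase):

    def build_header_args(info_dict, flag='--header'):
        """Return flag/value pairs for each HTTP header, or an empty
        list when info_dict carries no http_headers at all."""
        headers = info_dict.get('http_headers')
        if not headers:
            return []
        args = []
        for key, val in headers.items():
            args += [flag, '%s: %s' % (key, val)]
        return args

    print(build_header_args({'http_headers': {'User-Agent': 'test'}}))
    # -> ['--header', 'User-Agent: test']
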
diff --git a/youtube_dlc/downloader/fragment.py b/youtube_dlc/downloader/fragment.py
index 9339b3a62..cf4fd41da 100644
--- a/youtube_dlc/downloader/fragment.py
+++ b/youtube_dlc/downloader/fragment.py
@@ -97,12 +97,15 @@ class FragmentFD(FileDownloader):
def _download_fragment(self, ctx, frag_url, info_dict, headers=None):
fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index'])
- success = ctx['dl'].download(fragment_filename, {
+ fragment_info_dict = {
'url': frag_url,
'http_headers': headers or info_dict.get('http_headers'),
- })
+ }
+ success = ctx['dl'].download(fragment_filename, fragment_info_dict)
if not success:
return False, None
+ if fragment_info_dict.get('filetime'):
+ ctx['fragment_filetime'] = fragment_info_dict.get('filetime')
down, frag_sanitized = sanitize_open(fragment_filename, 'rb')
ctx['fragment_filename_sanitized'] = frag_sanitized
frag_content = down.read()
@@ -258,6 +261,13 @@ class FragmentFD(FileDownloader):
downloaded_bytes = ctx['complete_frags_downloaded_bytes']
else:
self.try_rename(ctx['tmpfilename'], ctx['filename'])
+ if self.params.get('updatetime', True):
+ filetime = ctx.get('fragment_filetime')
+ if filetime:
+ try:
+ os.utime(ctx['filename'], (time.time(), filetime))
+ except Exception:
+ pass
downloaded_bytes = os.path.getsize(encodeFilename(ctx['filename']))
self._hook_progress({
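
The fragment downloader now records the filetime reported for a downloaded fragment and, when the updatetime option is enabled, stamps the assembled file with it after the final rename. A self-contained sketch of that last step:

    import os
    import tempfile
    import time

    def apply_filetime(filename, filetime, updatetime=True):
        """Set mtime to the server-reported timestamp, keeping the current
        time as atime; failures are ignored, matching the hunk above."""
        if not (updatetime and filetime):
            return
        try:
            os.utime(filename, (time.time(), filetime))
        except Exception:
            pass

    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.close()
    apply_filetime(tmp.name, 1606154898)
    print(os.path.getmtime(tmp.name))  # -> 1606154898.0
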
diff --git a/youtube_dlc/downloader/http.py b/youtube_dlc/downloader/http.py
index 96379caf1..d8ac41dcc 100644
--- a/youtube_dlc/downloader/http.py
+++ b/youtube_dlc/downloader/http.py
@@ -109,7 +109,9 @@ class HttpFD(FileDownloader):
try:
ctx.data = self.ydl.urlopen(request)
except (compat_urllib_error.URLError, ) as err:
- if isinstance(err.reason, socket.timeout):
+ # reason may not be available, e.g. for urllib2.HTTPError on python 2.6
+ reason = getattr(err, 'reason', None)
+ if isinstance(reason, socket.timeout):
raise RetryDownload(err)
raise err
# When trying to resume, Content-Range HTTP header of response has to be checked
diff --git a/youtube_dlc/downloader/youtube_live_chat.py b/youtube_dlc/downloader/youtube_live_chat.py
index 4932dd9c5..b333afa5b 100644
--- a/youtube_dlc/downloader/youtube_live_chat.py
+++ b/youtube_dlc/downloader/youtube_live_chat.py
@@ -82,7 +82,10 @@ class YoutubeLiveChatReplayFD(FragmentFD):
offset = int(replay_chat_item_action['videoOffsetTimeMsec'])
processed_fragment.extend(
json.dumps(action, ensure_ascii=False).encode('utf-8') + b'\n')
- continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
+ try:
+ continuation_id = live_chat_continuation['continuations'][0]['liveChatReplayContinuationData']['continuation']
+ except KeyError:
+ continuation_id = None
self._append_fragment(ctx, processed_fragment)
diff --git a/youtube_dlc/extractor/adobepass.py b/youtube_dlc/extractor/adobepass.py
index 38dca1b0a..649f9940f 100644
--- a/youtube_dlc/extractor/adobepass.py
+++ b/youtube_dlc/extractor/adobepass.py
@@ -1438,6 +1438,13 @@ class AdobePassIE(InfoExtractor):
provider_redirect_page, 'oauth redirect')
self._download_webpage(
oauth_redirect_url, video_id, 'Confirming auto login')
+ elif 'automatically signed in with' in provider_redirect_page:
+ # Seems like Comcast is rolling out a new way of automatically signing in customers
+ oauth_redirect_url = self._html_search_regex(
+ r'continue:\s*"(https://oauth.xfinity.com/oauth/authorize\?.+)"', provider_redirect_page,
+ 'oauth redirect (signed)')
+ # Just need to process the request. No useful data comes back
+ self._download_webpage(oauth_redirect_url, video_id, 'Confirming auto login')
else:
if '<form name="signin"' in provider_redirect_page:
provider_login_page_res = provider_redirect_page_res
diff --git a/youtube_dlc/extractor/afreecatv.py b/youtube_dlc/extractor/afreecatv.py
index 6275e5209..b56abb1e6 100644
--- a/youtube_dlc/extractor/afreecatv.py
+++ b/youtube_dlc/extractor/afreecatv.py
@@ -275,7 +275,7 @@ class AfreecaTVIE(InfoExtractor):
video_element = video_xml.findall(compat_xpath('./track/video'))[-1]
if video_element is None or video_element.text is None:
raise ExtractorError(
- 'Video %s video does not exist' % video_id, expected=True)
+ 'Video %s does not exist' % video_id, expected=True)
video_url = video_element.text.strip()
diff --git a/youtube_dlc/extractor/amara.py b/youtube_dlc/extractor/amara.py
new file mode 100644
index 000000000..61d469574
--- /dev/null
+++ b/youtube_dlc/extractor/amara.py
@@ -0,0 +1,103 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .youtube import YoutubeIE
+from .vimeo import VimeoIE
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ update_url_query,
+)
+
+
+class AmaraIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?amara\.org/(?:\w+/)?videos/(?P<id>\w+)'
+ _TESTS = [{
+ # Youtube
+ 'url': 'https://amara.org/en/videos/jVx79ZKGK1ky/info/why-jury-trials-are-becoming-less-common/?tab=video',
+ 'md5': 'ea10daf2b6154b8c1ecf9922aca5e8ae',
+ 'info_dict': {
+ 'id': 'h6ZuVdvYnfE',
+ 'ext': 'mp4',
+ 'title': 'Why jury trials are becoming less common',
+ 'description': 'md5:a61811c319943960b6ab1c23e0cbc2c1',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'subtitles': dict,
+ 'upload_date': '20160813',
+ 'uploader': 'PBS NewsHour',
+ 'uploader_id': 'PBSNewsHour',
+ 'timestamp': 1549639570,
+ }
+ }, {
+ # Vimeo
+ 'url': 'https://amara.org/en/videos/kYkK1VUTWW5I/info/vimeo-at-ces-2011',
+ 'md5': '99392c75fa05d432a8f11df03612195e',
+ 'info_dict': {
+ 'id': '18622084',
+ 'ext': 'mov',
+ 'title': 'Vimeo at CES 2011!',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'subtitles': dict,
+ 'timestamp': 1294763658,
+ 'upload_date': '20110111',
+ 'uploader': 'Sam Morrill',
+ 'uploader_id': 'sammorrill'
+ }
+ }, {
+ # Direct Link
+ 'url': 'https://amara.org/en/videos/s8KL7I3jLmh6/info/the-danger-of-a-single-story/',
+ 'md5': 'd3970f08512738ee60c5807311ff5d3f',
+ 'info_dict': {
+ 'id': 's8KL7I3jLmh6',
+ 'ext': 'mp4',
+ 'title': 'The danger of a single story',
+ 'description': 'md5:d769b31139c3b8bb5be9177f62ea3f23',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'subtitles': dict,
+ 'upload_date': '20091007',
+ 'timestamp': 1254942511,
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ meta = self._download_json(
+ 'https://amara.org/api/videos/%s/' % video_id,
+ video_id, query={'format': 'json'})
+ title = meta['title']
+ video_url = meta['all_urls'][0]
+
+ subtitles = {}
+ for language in (meta.get('languages') or []):
+ subtitles_uri = language.get('subtitles_uri')
+ if not (subtitles_uri and language.get('published')):
+ continue
+ subtitle = subtitles.setdefault(language.get('code') or 'en', [])
+ for f in ('json', 'srt', 'vtt'):
+ subtitle.append({
+ 'ext': f,
+ 'url': update_url_query(subtitles_uri, {'format': f}),
+ })
+
+ info = {
+ 'url': video_url,
+ 'id': video_id,
+ 'subtitles': subtitles,
+ 'title': title,
+ 'description': meta.get('description'),
+ 'thumbnail': meta.get('thumbnail'),
+ 'duration': int_or_none(meta.get('duration')),
+ 'timestamp': parse_iso8601(meta.get('created')),
+ }
+
+ for ie in (YoutubeIE, VimeoIE):
+ if ie.suitable(video_url):
+ info.update({
+ '_type': 'url_transparent',
+ 'ie_key': ie.ie_key(),
+ })
+ break
+
+ return info
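
The new Amara extractor keeps its own metadata (title, subtitles, timestamps) but lets YoutubeIE or VimeoIE resolve the actual media through a url_transparent result. A reduced sketch of that dispatch, with a dummy extractor standing in for YoutubeIE:

    import re

    class DummyYoutubeIE(object):
        """Stand-in exposing only the two hooks the dispatch needs."""
        _VALID_URL = r'https?://(?:www\.)?youtube\.com/watch\?v=[\w-]+'

        @classmethod
        def suitable(cls, url):
            return re.match(cls._VALID_URL, url) is not None

        @classmethod
        def ie_key(cls):
            return 'Youtube'

    def build_result(video_url, metadata):
        # url_transparent delegates format extraction to the matched
        # extractor while our metadata overlays its fields
        info = dict(metadata, url=video_url)
        for ie in (DummyYoutubeIE,):
            if ie.suitable(video_url):
                info.update({'_type': 'url_transparent', 'ie_key': ie.ie_key()})
                break
        return info

    print(build_result('https://www.youtube.com/watch?v=h6ZuVdvYnfE',
                       {'id': 'jVx79ZKGK1ky', 'subtitles': {}}))
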
diff --git a/youtube_dlc/extractor/arte.py b/youtube_dlc/extractor/arte.py
index 2bd3bfe8a..03abdbfaf 100644
--- a/youtube_dlc/extractor/arte.py
+++ b/youtube_dlc/extractor/arte.py
@@ -4,23 +4,57 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_str
+from ..compat import (
+ compat_str,
+ compat_urlparse,
+)
from ..utils import (
ExtractorError,
int_or_none,
qualities,
try_get,
unified_strdate,
+ url_or_none,
)
-# There are different sources of video in arte.tv, the extraction process
-# is different for each one. The videos usually expire in 7 days, so we can't
-# add tests.
-
class ArteTVBaseIE(InfoExtractor):
- def _extract_from_json_url(self, json_url, video_id, lang, title=None):
- info = self._download_json(json_url, video_id)
+ _ARTE_LANGUAGES = 'fr|de|en|es|it|pl'
+ _API_BASE = 'https://api.arte.tv/api/player/v1'
+
+
+class ArteTVIE(ArteTVBaseIE):
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:www\.)?arte\.tv/(?P<lang>%(langs)s)/videos|
+ api\.arte\.tv/api/player/v\d+/config/(?P<lang_2>%(langs)s)
+ )
+ /(?P<id>\d{6}-\d{3}-[AF])
+ ''' % {'langs': ArteTVBaseIE._ARTE_LANGUAGES}
+ _TESTS = [{
+ 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+ 'info_dict': {
+ 'id': '088501-000-A',
+ 'ext': 'mp4',
+ 'title': 'Mexico: Stealing Petrol to Survive',
+ 'upload_date': '20190628',
+ },
+ }, {
+ 'url': 'https://www.arte.tv/pl/videos/100103-000-A/usa-dyskryminacja-na-porodowce/',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ lang = mobj.group('lang') or mobj.group('lang_2')
+
+ info = self._download_json(
+ '%s/config/%s/%s' % (self._API_BASE, lang, video_id), video_id)
player_info = info['videoJsonPlayer']
vsr = try_get(player_info, lambda x: x['VSR'], dict)
@@ -37,18 +71,11 @@ class ArteTVBaseIE(InfoExtractor):
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
- title = (player_info.get('VTI') or title or player_info['VID']).strip()
+ title = (player_info.get('VTI') or player_info['VID']).strip()
subtitle = player_info.get('VSU', '').strip()
if subtitle:
title += ' - %s' % subtitle
- info_dict = {
- 'id': player_info['VID'],
- 'title': title,
- 'description': player_info.get('VDE'),
- 'upload_date': unified_strdate(upload_date_str),
- 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
- }
qfunc = qualities(['MQ', 'HQ', 'EQ', 'SQ'])
LANGS = {
@@ -65,6 +92,10 @@ class ArteTVBaseIE(InfoExtractor):
formats = []
for format_id, format_dict in vsr.items():
f = dict(format_dict)
+ format_url = url_or_none(f.get('url'))
+ streamer = f.get('streamer')
+ if not format_url and not streamer:
+ continue
versionCode = f.get('versionCode')
l = re.escape(langcode)
@@ -107,6 +138,16 @@ class ArteTVBaseIE(InfoExtractor):
else:
lang_pref = -1
+ media_type = f.get('mediaType')
+ if media_type == 'hls':
+ m3u8_formats = self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id=format_id, fatal=False)
+ for m3u8_format in m3u8_formats:
+ m3u8_format['language_preference'] = lang_pref
+ formats.extend(m3u8_formats)
+ continue
+
format = {
'format_id': format_id,
'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
@@ -118,7 +159,7 @@ class ArteTVBaseIE(InfoExtractor):
'quality': qfunc(f.get('quality')),
}
- if f.get('mediaType') == 'rtmp':
+ if media_type == 'rtmp':
format['url'] = f['streamer']
format['play_path'] = 'mp4:' + f['url']
format['ext'] = 'flv'
@@ -127,56 +168,50 @@ class ArteTVBaseIE(InfoExtractor):
formats.append(format)
- self._check_formats(formats, video_id)
self._sort_formats(formats)
- info_dict['formats'] = formats
- return info_dict
-
+ return {
+ 'id': player_info.get('VID') or video_id,
+ 'title': title,
+ 'description': player_info.get('VDE'),
+ 'upload_date': unified_strdate(upload_date_str),
+ 'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
+ 'formats': formats,
+ }
-class ArteTVPlus7IE(ArteTVBaseIE):
- IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>\d{6}-\d{3}-[AF])'
+class ArteTVEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+'
_TESTS = [{
- 'url': 'https://www.arte.tv/en/videos/088501-000-A/mexico-stealing-petrol-to-survive/',
+ 'url': 'https://www.arte.tv/player/v5/index.php?json_url=https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig%2Fde%2F100605-013-A&lang=de&autoplay=true&mute=0100605-013-A',
'info_dict': {
- 'id': '088501-000-A',
+ 'id': '100605-013-A',
'ext': 'mp4',
- 'title': 'Mexico: Stealing Petrol to Survive',
- 'upload_date': '20190628',
+ 'title': 'United we Stream November Lockdown Edition #13',
+ 'description': 'md5:be40b667f45189632b78c1425c7c2ce1',
+ 'upload_date': '20201116',
},
+ }, {
+ 'url': 'https://www.arte.tv/player/v3/index.php?json_url=https://api.arte.tv/api/player/v2/config/de/100605-013-A',
+ 'only_matching': True,
}]
- def _real_extract(self, url):
- lang, video_id = re.match(self._VALID_URL, url).groups()
- return self._extract_from_json_url(
- 'https://api.arte.tv/api/player/v1/config/%s/%s' % (lang, video_id),
- video_id, lang)
-
-
-class ArteTVEmbedIE(ArteTVPlus7IE):
- IE_NAME = 'arte.tv:embed'
- _VALID_URL = r'''(?x)
- https://www\.arte\.tv
- /player/v3/index\.php\?json_url=
- (?P<json_url>
- https?://api\.arte\.tv/api/player/v1/config/
- (?P<lang>[^/]+)/(?P<id>\d{6}-\d{3}-[AF])
- )
- '''
-
- _TESTS = []
+ @staticmethod
+ def _extract_urls(webpage):
+ return [url for _, url in re.findall(
+ r'<(?:iframe|script)[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?arte\.tv/player/v\d+/index\.php\?.*?\bjson_url=.+?)\1',
+ webpage)]
def _real_extract(self, url):
- json_url, lang, video_id = re.match(self._VALID_URL, url).groups()
- return self._extract_from_json_url(json_url, video_id, lang)
+ qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ json_url = qs['json_url'][0]
+ video_id = ArteTVIE._match_id(json_url)
+ return self.url_result(
+ json_url, ie=ArteTVIE.ie_key(), video_id=video_id)
class ArteTVPlaylistIE(ArteTVBaseIE):
- IE_NAME = 'arte.tv:playlist'
- _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>fr|de|en|es|it|pl)/videos/(?P<id>RC-\d{6})'
-
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/(?P<lang>%s)/videos/(?P<id>RC-\d{6})' % ArteTVBaseIE._ARTE_LANGUAGES
_TESTS = [{
'url': 'https://www.arte.tv/en/videos/RC-016954/earn-a-living/',
'info_dict': {
@@ -185,17 +220,35 @@ class ArteTVPlaylistIE(ArteTVBaseIE):
'description': 'md5:d322c55011514b3a7241f7fb80d494c2',
},
'playlist_mincount': 6,
+ }, {
+ 'url': 'https://www.arte.tv/pl/videos/RC-014123/arte-reportage/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
lang, playlist_id = re.match(self._VALID_URL, url).groups()
collection = self._download_json(
- 'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
- % (lang, playlist_id), playlist_id)
+ '%s/collectionData/%s/%s?source=videos'
+ % (self._API_BASE, lang, playlist_id), playlist_id)
+ entries = []
+ for video in collection['videos']:
+ if not isinstance(video, dict):
+ continue
+ video_url = url_or_none(video.get('url')) or url_or_none(video.get('jsonUrl'))
+ if not video_url:
+ continue
+ video_id = video.get('programId')
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': video_url,
+ 'id': video_id,
+ 'title': video.get('title'),
+ 'alt_title': video.get('subtitle'),
+ 'thumbnail': url_or_none(try_get(video, lambda x: x['mainImage']['url'], compat_str)),
+ 'duration': int_or_none(video.get('durationSeconds')),
+ 'view_count': int_or_none(video.get('views')),
+ 'ie_key': ArteTVIE.ie_key(),
+ })
title = collection.get('title')
description = collection.get('shortDescription') or collection.get('teaserText')
- entries = [
- self._extract_from_json_url(
- video['jsonUrl'], video.get('programId') or playlist_id, lang)
- for video in collection['videos'] if video.get('jsonUrl')]
return self.playlist_result(entries, playlist_id, title, description)
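
ArteTVEmbedIE no longer captures the config URL inside _VALID_URL; it pulls json_url out of the player URL's query string and hands it to ArteTVIE. A sketch of that step, using the Python 3 standard library in place of the compat shims:

    import re
    from urllib.parse import parse_qs, urlparse

    ARTE_ID_RE = r'\d{6}-\d{3}-[AF]'

    def resolve_embed(url):
        """Extract json_url and derive the video id the way
        ArteTVIE._match_id would."""
        qs = parse_qs(urlparse(url).query)
        json_url = qs['json_url'][0]  # parse_qs percent-decodes the value
        video_id = re.search(ARTE_ID_RE, json_url).group(0)
        return json_url, video_id

    embed = ('https://www.arte.tv/player/v5/index.php?json_url='
             'https%3A%2F%2Fapi.arte.tv%2Fapi%2Fplayer%2Fv2%2Fconfig'
             '%2Fde%2F100605-013-A&lang=de')
    print(resolve_embed(embed))
    # -> ('https://api.arte.tv/api/player/v2/config/de/100605-013-A', '100605-013-A')
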
diff --git a/youtube_dlc/extractor/bandcamp.py b/youtube_dlc/extractor/bandcamp.py
index 9dbafe86d..69e673a26 100644
--- a/youtube_dlc/extractor/bandcamp.py
+++ b/youtube_dlc/extractor/bandcamp.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import random
@@ -5,10 +6,7 @@ import re
import time
from .common import InfoExtractor
-from ..compat import (
- compat_str,
- compat_urlparse,
-)
+from ..compat import compat_str
from ..utils import (
ExtractorError,
float_or_none,
@@ -17,33 +15,32 @@ from ..utils import (
parse_filesize,
str_or_none,
try_get,
- unescapeHTML,
update_url_query,
unified_strdate,
unified_timestamp,
url_or_none,
+ urljoin,
)
class BandcampIE(InfoExtractor):
- _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<title>[^/?#&]+)'
+ _VALID_URL = r'https?://[^/]+\.bandcamp\.com/track/(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'http://youtube-dlc.bandcamp.com/track/youtube-dlc-test-song',
+ 'url': 'http://youtube-dl.bandcamp.com/track/youtube-dl-test-song',
'md5': 'c557841d5e50261777a6585648adf439',
'info_dict': {
'id': '1812978515',
'ext': 'mp3',
- 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad",
+ 'title': "youtube-dl \"'/\\ä↭ - youtube-dl \"'/\\ä↭ - youtube-dl test song \"'/\\ä↭",
'duration': 9.8485,
- 'uploader': "youtube-dl \"'/\\\u00e4\u21ad",
- 'timestamp': 1354224127,
+ 'uploader': 'youtube-dl "\'/\\ä↭',
'upload_date': '20121129',
+ 'timestamp': 1354224127,
},
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
# free download
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '5d92af55811e47f38962a54c30b07ef0',
'info_dict': {
'id': '2650410135',
'ext': 'aiff',
@@ -82,11 +79,16 @@ class BandcampIE(InfoExtractor):
},
}]
+ def _extract_data_attr(self, webpage, video_id, attr='tralbum', fatal=True):
+ return self._parse_json(self._html_search_regex(
+ r'data-%s=(["\'])({.+?})\1' % attr, webpage,
+ attr + ' data', group=2), video_id, fatal=fatal)
+
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- title = mobj.group('title')
+ title = self._match_id(url)
webpage = self._download_webpage(url, title)
- thumbnail = self._html_search_meta('og:image', webpage, default=None)
+ tralbum = self._extract_data_attr(webpage, title)
+ thumbnail = self._og_search_thumbnail(webpage)
track_id = None
track = None
@@ -94,11 +96,7 @@ class BandcampIE(InfoExtractor):
duration = None
formats = []
- trackinfo_block = self._html_search_regex(
- r'trackinfo(?:["\']|&quot;):\[\s*({.+?})\s*\],(?:["\']|&quot;)',
- webpage, 'track info', default='{}')
-
- track_info = self._parse_json(trackinfo_block, title)
+ track_info = try_get(tralbum, lambda x: x['trackinfo'][0], dict)
if track_info:
file_ = track_info.get('file')
if isinstance(file_, dict):
@@ -114,40 +112,26 @@ class BandcampIE(InfoExtractor):
'acodec': ext,
'abr': int_or_none(abr_str),
})
-
- track_id = str_or_none(track_info.get('track_id') or track_info.get('id'))
+ track = track_info.get('title')
+ track_id = str_or_none(
+ track_info.get('track_id') or track_info.get('id'))
track_number = int_or_none(track_info.get('track_num'))
duration = float_or_none(track_info.get('duration'))
- def extract(key):
- data = self._html_search_regex(
- r',(["\']|&quot;)%s\1:\1(?P<value>(?:\\\1|((?!\1).))+)\1' % key,
- webpage, key, default=None, group='value')
- return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data
-
- track = extract('title')
- artist = extract('artist')
- album = extract('album_title')
+ embed = self._extract_data_attr(webpage, title, 'embed', False)
+ current = tralbum.get('current') or {}
+ artist = embed.get('artist') or current.get('artist') or tralbum.get('artist')
timestamp = unified_timestamp(
- extract('publish_date') or extract('album_publish_date'))
- release_date = unified_strdate(extract('album_release_date'))
+ current.get('publish_date') or tralbum.get('album_publish_date'))
- download_link = self._search_regex(
- r'freeDownloadPage(?:["\']|&quot;):\s*(["\']|&quot;)(?P<url>(?:(?!\1).)+)\1', webpage,
- 'download link', default=None, group='url')
+ download_link = tralbum.get('freeDownloadPage')
if download_link:
- track_id = self._search_regex(
- r'\?id=(?P<id>\d+)&',
- download_link, 'track id')
+ track_id = compat_str(tralbum['id'])
download_webpage = self._download_webpage(
download_link, track_id, 'Downloading free downloads page')
- blob = self._parse_json(
- self._search_regex(
- r'data-blob=(["\'])(?P<blob>{.+?})\1', download_webpage,
- 'blob', group='blob'),
- track_id, transform_source=unescapeHTML)
+ blob = self._extract_data_attr(download_webpage, track_id, 'blob')
info = try_get(
blob, (lambda x: x['digital_items'][0],
@@ -213,20 +197,20 @@ class BandcampIE(InfoExtractor):
'thumbnail': thumbnail,
'uploader': artist,
'timestamp': timestamp,
- 'release_date': release_date,
+ 'release_date': unified_strdate(tralbum.get('album_release_date')),
'duration': duration,
'track': track,
'track_number': track_number,
'track_id': track_id,
'artist': artist,
- 'album': album,
+ 'album': embed.get('album_title'),
'formats': formats,
}
-class BandcampAlbumIE(InfoExtractor):
+class BandcampAlbumIE(BandcampIE):
IE_NAME = 'Bandcamp:album'
- _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<album_id>[^/?#&]+))?'
+ _VALID_URL = r'https?://(?:(?P<subdomain>[^.]+)\.)?bandcamp\.com(?:/album/(?P<id>[^/?#&]+))?'
_TESTS = [{
'url': 'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1',
@@ -236,7 +220,10 @@ class BandcampAlbumIE(InfoExtractor):
'info_dict': {
'id': '1353101989',
'ext': 'mp3',
- 'title': 'Intro',
+ 'title': 'Blazo - Intro',
+ 'timestamp': 1311756226,
+ 'upload_date': '20110727',
+ 'uploader': 'Blazo',
}
},
{
@@ -244,7 +231,10 @@ class BandcampAlbumIE(InfoExtractor):
'info_dict': {
'id': '38097443',
'ext': 'mp3',
- 'title': 'Kero One - Keep It Alive (Blazo remix)',
+ 'title': 'Blazo - Kero One - Keep It Alive (Blazo remix)',
+ 'timestamp': 1311757238,
+ 'upload_date': '20110727',
+ 'uploader': 'Blazo',
}
},
],
@@ -280,6 +270,7 @@ class BandcampAlbumIE(InfoExtractor):
'title': '"Entropy" EP',
'uploader_id': 'jstrecords',
'id': 'entropy-ep',
+ 'description': 'md5:0ff22959c943622972596062f2f366a5',
},
'playlist_mincount': 3,
}, {
@@ -289,6 +280,7 @@ class BandcampAlbumIE(InfoExtractor):
'id': 'we-are-the-plague',
'title': 'WE ARE THE PLAGUE',
'uploader_id': 'insulters',
+ 'description': 'md5:b3cf845ee41b2b1141dc7bde9237255f',
},
'playlist_count': 2,
}]
@@ -300,43 +292,34 @@ class BandcampAlbumIE(InfoExtractor):
else super(BandcampAlbumIE, cls).suitable(url))
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- uploader_id = mobj.group('subdomain')
- album_id = mobj.group('album_id')
+ uploader_id, album_id = re.match(self._VALID_URL, url).groups()
playlist_id = album_id or uploader_id
webpage = self._download_webpage(url, playlist_id)
- track_elements = re.findall(
- r'(?s)<div[^>]*>(.*?<a[^>]+href="([^"]+?)"[^>]+itemprop="url"[^>]*>.*?)</div>', webpage)
- if not track_elements:
+ tralbum = self._extract_data_attr(webpage, playlist_id)
+ track_info = tralbum.get('trackinfo')
+ if not track_info:
raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
entries = [
self.url_result(
- compat_urlparse.urljoin(url, t_path),
- ie=BandcampIE.ie_key(),
- video_title=self._search_regex(
- r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
- elem_content, 'track title', fatal=False))
- for elem_content, t_path in track_elements
- if self._html_search_meta('duration', elem_content, default=None)]
-
- title = self._html_search_regex(
- r'album_title\s*(?:&quot;|["\']):\s*(&quot;|["\'])(?P<album>(?:\\\1|((?!\1).))+)\1',
- webpage, 'title', fatal=False, group='album')
+ urljoin(url, t['title_link']), BandcampIE.ie_key(),
+ str_or_none(t.get('track_id') or t.get('id')), t.get('title'))
+ for t in track_info
+ if t.get('duration')]
- if title:
- title = title.replace(r'\"', '"')
+ current = tralbum.get('current') or {}
return {
'_type': 'playlist',
'uploader_id': uploader_id,
'id': playlist_id,
- 'title': title,
+ 'title': current.get('title'),
+ 'description': current.get('about'),
'entries': entries,
}
-class BandcampWeeklyIE(InfoExtractor):
+class BandcampWeeklyIE(BandcampIE):
IE_NAME = 'Bandcamp:weekly'
_VALID_URL = r'https?://(?:www\.)?bandcamp\.com/?\?(?:.*?&)?show=(?P<id>\d+)'
_TESTS = [{
@@ -351,29 +334,23 @@ class BandcampWeeklyIE(InfoExtractor):
'release_date': '20170404',
'series': 'Bandcamp Weekly',
'episode': 'Magic Moments',
- 'episode_number': 208,
'episode_id': '224',
- }
+ },
+ 'params': {
+ 'format': 'opus-lo',
+ },
}, {
'url': 'https://bandcamp.com/?blah/blah@&show=228',
'only_matching': True
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- blob = self._parse_json(
- self._search_regex(
- r'data-blob=(["\'])(?P<blob>{.+?})\1', webpage,
- 'blob', group='blob'),
- video_id, transform_source=unescapeHTML)
+ show_id = self._match_id(url)
+ webpage = self._download_webpage(url, show_id)
- show = blob['bcw_show']
+ blob = self._extract_data_attr(webpage, show_id, 'blob')
- # This is desired because any invalid show id redirects to `bandcamp.com`
- # which happens to expose the latest Bandcamp Weekly episode.
- show_id = int_or_none(show.get('show_id')) or int_or_none(video_id)
+ show = blob['bcw_data'][show_id]
formats = []
for format_id, format_url in show['audio_stream'].items():
@@ -398,20 +375,8 @@ class BandcampWeeklyIE(InfoExtractor):
if subtitle:
title += ' - %s' % subtitle
- episode_number = None
- seq = blob.get('bcw_seq')
-
- if seq and isinstance(seq, list):
- try:
- episode_number = next(
- int_or_none(e.get('episode_number'))
- for e in seq
- if isinstance(e, dict) and int_or_none(e.get('id')) == show_id)
- except StopIteration:
- pass
-
return {
- 'id': video_id,
+ 'id': show_id,
'title': title,
'description': show.get('desc') or show.get('short_desc'),
'duration': float_or_none(show.get('audio_duration')),
@@ -419,7 +384,6 @@ class BandcampWeeklyIE(InfoExtractor):
'release_date': unified_strdate(show.get('published_date')),
'series': 'Bandcamp Weekly',
'episode': show.get('subtitle'),
- 'episode_number': episode_number,
- 'episode_id': compat_str(video_id),
+ 'episode_id': show_id,
'formats': formats
}
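
The rewritten Bandcamp extractor reads everything from the data-tralbum, data-embed and data-blob attributes that Bandcamp now embeds as JSON, replacing several brittle per-field regexes. A standalone sketch of that parsing pattern, assuming html.unescape covers the entity-escaping the real _html_search_regex helper handles:

    import json
    import re
    from html import unescape

    def extract_data_attr(webpage, attr='tralbum'):
        """Locate data-<attr>="{...}" and decode the JSON payload."""
        m = re.search(r'data-%s=(["\'])({.+?})\1' % attr, webpage)
        if not m:
            return None
        return json.loads(unescape(m.group(2)))

    page = '<script data-tralbum="{&quot;id&quot;: 1812978515}"></script>'
    print(extract_data_attr(page)['id'])  # -> 1812978515
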
diff --git a/youtube_dlc/extractor/bbc.py b/youtube_dlc/extractor/bbc.py
index 002c39c39..54cbcdc8e 100644
--- a/youtube_dlc/extractor/bbc.py
+++ b/youtube_dlc/extractor/bbc.py
@@ -981,7 +981,7 @@ class BBCIE(BBCCoUkIE):
group_id = self._search_regex(
r'<div[^>]+\bclass=["\']video["\'][^>]+\bdata-pid=["\'](%s)' % self._ID_REGEX,
webpage, 'group id', default=None)
- if playlist_id:
+ if group_id:
return self.url_result(
'https://www.bbc.co.uk/programmes/%s' % group_id,
ie=BBCCoUkIE.ie_key())
@@ -1092,10 +1092,26 @@ class BBCIE(BBCCoUkIE):
self._search_regex(
r'(?s)bbcthreeConfig\s*=\s*({.+?})\s*;\s*<', webpage,
'bbcthree config', default='{}'),
- playlist_id, transform_source=js_to_json, fatal=False)
- if bbc3_config:
+ playlist_id, transform_source=js_to_json, fatal=False) or {}
+ payload = bbc3_config.get('payload') or {}
+ if payload:
+ clip = payload.get('currentClip') or {}
+ clip_vpid = clip.get('vpid')
+ clip_title = clip.get('title')
+ if clip_vpid and clip_title:
+ formats, subtitles = self._download_media_selector(clip_vpid)
+ self._sort_formats(formats)
+ return {
+ 'id': clip_vpid,
+ 'title': clip_title,
+ 'thumbnail': dict_get(clip, ('poster', 'imageUrl')),
+ 'description': clip.get('description'),
+ 'duration': parse_duration(clip.get('duration')),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ }
bbc3_playlist = try_get(
- bbc3_config, lambda x: x['payload']['content']['bbcMedia']['playlist'],
+ payload, lambda x: x['content']['bbcMedia']['playlist'],
dict)
if bbc3_playlist:
playlist_title = bbc3_playlist.get('title') or playlist_title
@@ -1118,6 +1134,39 @@ class BBCIE(BBCCoUkIE):
return self.playlist_result(
entries, playlist_id, playlist_title, playlist_description)
+ initial_data = self._parse_json(self._search_regex(
+ r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
+ 'preload state', default='{}'), playlist_id, fatal=False)
+ if initial_data:
+ def parse_media(media):
+ if not media:
+ return
+ for item in (try_get(media, lambda x: x['media']['items'], list) or []):
+ item_id = item.get('id')
+ item_title = item.get('title')
+ if not (item_id and item_title):
+ continue
+ formats, subtitles = self._download_media_selector(item_id)
+ self._sort_formats(formats)
+ entries.append({
+ 'id': item_id,
+ 'title': item_title,
+ 'thumbnail': item.get('holdingImageUrl'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ })
+ for resp in (initial_data.get('data') or {}).values():
+ name = resp.get('name')
+ if name == 'media-experience':
+ parse_media(try_get(resp, lambda x: x['data']['initialItem']['mediaItem'], dict))
+ elif name == 'article':
+ for block in (try_get(resp, lambda x: x['data']['blocks'], list) or []):
+ if block.get('type') != 'media':
+ continue
+ parse_media(block.get('model'))
+ return self.playlist_result(
+ entries, playlist_id, playlist_title, playlist_description)
+
def extract_all(pattern):
return list(filter(None, map(
lambda s: self._parse_json(s, playlist_id, fatal=False),
diff --git a/youtube_dlc/extractor/bitchute.py b/youtube_dlc/extractor/bitchute.py
index 92fc70b5a..94219a138 100644
--- a/youtube_dlc/extractor/bitchute.py
+++ b/youtube_dlc/extractor/bitchute.py
@@ -36,6 +36,14 @@ class BitChuteIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<(?:script|iframe)[^>]+\bsrc=(["\'])(?P<url>%s)' % BitChuteIE._VALID_URL,
+ webpage)]
+
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dlc/extractor/bitwave.py b/youtube_dlc/extractor/bitwave.py
new file mode 100644
index 000000000..eb16c469d
--- /dev/null
+++ b/youtube_dlc/extractor/bitwave.py
@@ -0,0 +1,61 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class BitwaveReplayIE(InfoExtractor):
+ IE_NAME = 'bitwave:replay'
+ _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<user>\w+)/replay/(?P<id>\w+)/?$'
+ _TEST = {
+ 'url': 'https://bitwave.tv/RhythmicCarnage/replay/z4P6eq5L7WDrM85UCrVr',
+ 'only_matching': True
+ }
+
+ def _real_extract(self, url):
+ replay_id = self._match_id(url)
+ replay = self._download_json(
+ 'https://api.bitwave.tv/v1/replays/' + replay_id,
+ replay_id
+ )
+
+ return {
+ 'id': replay_id,
+ 'title': replay['data']['title'],
+ 'uploader': replay['data']['name'],
+ 'uploader_id': replay['data']['name'],
+ 'url': replay['data']['url'],
+ 'thumbnails': [
+ {'url': x} for x in replay['data']['thumbnails']
+ ],
+ }
+
+
+class BitwaveStreamIE(InfoExtractor):
+ IE_NAME = 'bitwave:stream'
+ _VALID_URL = r'https?://(?:www\.)?bitwave\.tv/(?P<id>\w+)/?$'
+ _TEST = {
+ 'url': 'https://bitwave.tv/doomtube',
+ 'only_matching': True
+ }
+
+ def _real_extract(self, url):
+ username = self._match_id(url)
+ channel = self._download_json(
+ 'https://api.bitwave.tv/v1/channels/' + username,
+ username)
+
+ formats = self._extract_m3u8_formats(
+ channel['data']['url'], username,
+ 'mp4')
+ self._sort_formats(formats)
+
+ return {
+ 'id': username,
+ 'title': self._live_title(channel['data']['title']),
+ 'uploader': username,
+ 'uploader_id': username,
+ 'formats': formats,
+ 'thumbnail': channel['data']['thumbnail'],
+ 'is_live': True,
+ 'view_count': channel['data']['viewCount']
+ }
diff --git a/youtube_dlc/extractor/box.py b/youtube_dlc/extractor/box.py
new file mode 100644
index 000000000..aae82d1af
--- /dev/null
+++ b/youtube_dlc/extractor/box.py
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ parse_iso8601,
+ # try_get,
+ update_url_query,
+)
+
+
+class BoxIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[^.]+\.)?app\.box\.com/s/(?P<shared_name>[^/]+)/file/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://mlssoccer.app.box.com/s/0evd2o3e08l60lr4ygukepvnkord1o1x/file/510727257538',
+ 'md5': '1f81b2fd3960f38a40a3b8823e5fcd43',
+ 'info_dict': {
+ 'id': '510727257538',
+ 'ext': 'mp4',
+ 'title': 'Garber St. Louis will be 28th MLS team +scarving.mp4',
+ 'uploader': 'MLS Video',
+ 'timestamp': 1566320259,
+ 'upload_date': '20190820',
+ 'uploader_id': '235196876',
+ }
+ }
+
+ def _real_extract(self, url):
+ shared_name, file_id = re.match(self._VALID_URL, url).groups()
+ webpage = self._download_webpage(url, file_id)
+ request_token = self._parse_json(self._search_regex(
+ r'Box\.config\s*=\s*({.+?});', webpage,
+ 'Box config'), file_id)['requestToken']
+ access_token = self._download_json(
+ 'https://app.box.com/app-api/enduserapp/elements/tokens', file_id,
+ 'Downloading token JSON metadata',
+ data=json.dumps({'fileIDs': [file_id]}).encode(), headers={
+ 'Content-Type': 'application/json',
+ 'X-Request-Token': request_token,
+ 'X-Box-EndUser-API': 'sharedName=' + shared_name,
+ })[file_id]['read']
+ shared_link = 'https://app.box.com/s/' + shared_name
+ f = self._download_json(
+ 'https://api.box.com/2.0/files/' + file_id, file_id,
+ 'Downloading file JSON metadata', headers={
+ 'Authorization': 'Bearer ' + access_token,
+ 'BoxApi': 'shared_link=' + shared_link,
+ 'X-Rep-Hints': '[dash]', # TODO: extract `hls` formats
+ }, query={
+ 'fields': 'authenticated_download_url,created_at,created_by,description,extension,is_download_available,name,representations,size'
+ })
+ title = f['name']
+
+ query = {
+ 'access_token': access_token,
+ 'shared_link': shared_link
+ }
+
+ formats = []
+
+ # for entry in (try_get(f, lambda x: x['representations']['entries'], list) or []):
+ # entry_url_template = try_get(
+ # entry, lambda x: x['content']['url_template'])
+ # if not entry_url_template:
+ # continue
+ # representation = entry.get('representation')
+ # if representation == 'dash':
+ # TODO: append query to every fragment URL
+ # formats.extend(self._extract_mpd_formats(
+ # entry_url_template.replace('{+asset_path}', 'manifest.mpd'),
+ # file_id, query=query))
+
+ authenticated_download_url = f.get('authenticated_download_url')
+ if authenticated_download_url and f.get('is_download_available'):
+ formats.append({
+ 'ext': f.get('extension') or determine_ext(title),
+ 'filesize': f.get('size'),
+ 'format_id': 'download',
+ 'url': update_url_query(authenticated_download_url, query),
+ })
+
+ self._sort_formats(formats)
+
+ creator = f.get('created_by') or {}
+
+ return {
+ 'id': file_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': f.get('description') or None,
+ 'uploader': creator.get('name'),
+ 'timestamp': parse_iso8601(f.get('created_at')),
+ 'uploader_id': creator.get('id'),
+ }
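
The new Box extractor performs a three-step handshake: scrape requestToken from the page's Box.config, trade it for a per-file read token, then query the Files API with that bearer token plus the shared-link header. A hedged sketch of the same flow using requests (ids and the shared name are placeholders; error handling omitted):

    import json
    import re

    import requests

    def fetch_box_file_metadata(page_url, shared_name, file_id):
        page = requests.get(page_url).text
        request_token = json.loads(re.search(
            r'Box\.config\s*=\s*({.+?});', page).group(1))['requestToken']
        # Exchange the page token for a scoped read token
        access_token = requests.post(
            'https://app.box.com/app-api/enduserapp/elements/tokens',
            data=json.dumps({'fileIDs': [file_id]}),
            headers={
                'Content-Type': 'application/json',
                'X-Request-Token': request_token,
                'X-Box-EndUser-API': 'sharedName=' + shared_name,
            }).json()[file_id]['read']
        # Query file metadata through the shared link
        return requests.get(
            'https://api.box.com/2.0/files/' + file_id,
            headers={
                'Authorization': 'Bearer ' + access_token,
                'BoxApi': 'shared_link=https://app.box.com/s/' + shared_name,
            }).json()

    # fetch_box_file_metadata(
    #     'https://example.app.box.com/s/SHARED_NAME/file/FILE_ID',
    #     'SHARED_NAME', 'FILE_ID')
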
diff --git a/youtube_dlc/extractor/brightcove.py b/youtube_dlc/extractor/brightcove.py
index 2aa9f4782..c6ca939dd 100644
--- a/youtube_dlc/extractor/brightcove.py
+++ b/youtube_dlc/extractor/brightcove.py
@@ -147,7 +147,7 @@ class BrightcoveLegacyIE(InfoExtractor):
]
@classmethod
- def _build_brighcove_url(cls, object_str):
+ def _build_brightcove_url(cls, object_str):
"""
Build a Brightcove url from a xml string containing
<object class="BrightcoveExperience">{params}</object>
@@ -217,7 +217,7 @@ class BrightcoveLegacyIE(InfoExtractor):
return cls._make_brightcove_url(params)
@classmethod
- def _build_brighcove_url_from_js(cls, object_js):
+ def _build_brightcove_url_from_js(cls, object_js):
# The layout of JS is as follows:
# customBC.createVideo = function (width, height, playerID, playerKey, videoPlayer, VideoRandomID) {
# // build Brightcove <object /> XML
@@ -272,12 +272,12 @@ class BrightcoveLegacyIE(InfoExtractor):
).+?>\s*</object>''',
webpage)
if matches:
- return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
+ return list(filter(None, [cls._build_brightcove_url(m) for m in matches]))
matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
if matches:
return list(filter(None, [
- cls._build_brighcove_url_from_js(custom_bc)
+ cls._build_brightcove_url_from_js(custom_bc)
for custom_bc in matches]))
return [src for _, src in re.findall(
r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]
@@ -471,12 +471,17 @@ class BrightcoveNewIE(AdobePassIE):
title = json_data['name'].strip()
formats = []
+ sources_num = len(json_data.get('sources'))
+ key_systems_present = 0
for source in json_data.get('sources', []):
container = source.get('container')
ext = mimetype2ext(source.get('type'))
src = source.get('src')
- # https://support.brightcove.com/playback-api-video-fields-reference#key_systems_object
- if ext == 'ism' or container == 'WVM' or source.get('key_systems'):
+ # https://apis.support.brightcove.com/playback/references/playback-api-video-fields-reference.html
+ if source.get('key_systems'):
+ key_systems_present += 1
+ continue
+ elif ext == 'ism' or container == 'WVM':
continue
elif ext == 'm3u8' or container == 'M2TS':
if not src:
@@ -533,6 +538,10 @@ class BrightcoveNewIE(AdobePassIE):
'format_id': build_format_id('rtmp'),
})
formats.append(f)
+
+ if sources_num == key_systems_present:
+ raise ExtractorError('This video is DRM protected', expected=True)
+
if not formats:
# for sonyliv.com DRM protected videos
s3_source_url = json_data.get('custom_fields', {}).get('s3sourceurl')
diff --git a/youtube_dlc/extractor/cda.py b/youtube_dlc/extractor/cda.py
index 0c3af23d5..d67900e62 100644
--- a/youtube_dlc/extractor/cda.py
+++ b/youtube_dlc/extractor/cda.py
@@ -5,10 +5,16 @@ import codecs
import re
from .common import InfoExtractor
+from ..compat import (
+ compat_chr,
+ compat_ord,
+ compat_urllib_parse_unquote,
+)
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ merge_dicts,
multipart_encode,
parse_duration,
random_birthday,
@@ -107,8 +113,9 @@ class CDAIE(InfoExtractor):
r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
'view_count', default=None)
average_rating = self._search_regex(
- r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
- webpage, 'rating', fatal=False, group='rating_value')
+ (r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
+ r'<span[^>]+\bclass=["\']rating["\'][^>]*>(?P<rating_value>[0-9.]+)'), webpage, 'rating', fatal=False,
+ group='rating_value')
info_dict = {
'id': video_id,
@@ -123,6 +130,24 @@ class CDAIE(InfoExtractor):
'age_limit': 18 if need_confirm_age else 0,
}
+ # Source: https://www.cda.pl/js/player.js?t=1606154898
+ def decrypt_file(a):
+ for p in ('_XDDD', '_CDA', '_ADC', '_CXD', '_QWE', '_Q5', '_IKSDE'):
+ a = a.replace(p, '')
+ a = compat_urllib_parse_unquote(a)
+ b = []
+ for c in a:
+ f = compat_ord(c)
+ b.append(compat_chr(33 + (f + 14) % 94) if 33 <= f and 126 >= f else compat_chr(f))
+ a = ''.join(b)
+ a = a.replace('.cda.mp4', '')
+ for p in ('.2cda.pl', '.3cda.pl'):
+ a = a.replace(p, '.cda.pl')
+ if '/upstream' in a:
+ a = a.replace('/upstream', '.mp4/upstream')
+ return 'https://' + a
+ return 'https://' + a + '.mp4'
+
def extract_format(page, version):
json_str = self._html_search_regex(
r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
@@ -141,6 +166,8 @@ class CDAIE(InfoExtractor):
video['file'] = codecs.decode(video['file'], 'rot_13')
if video['file'].endswith('adc.mp4'):
video['file'] = video['file'].replace('adc.mp4', '.mp4')
+ elif not video['file'].startswith('http'):
+ video['file'] = decrypt_file(video['file'])
f = {
'url': video['file'],
}
@@ -179,4 +206,6 @@ class CDAIE(InfoExtractor):
self._sort_formats(formats)
- return info_dict
+ info = self._search_json_ld(webpage, video_id, default={})
+
+ return merge_dicts(info_dict, info)
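
The character transform in decrypt_file, chr(33 + (ord(c) + 14) % 94), is equivalent to ROT47: a 47-position rotation of the printable ASCII range 33-126, since (c + 14) and (c - 33) + 47 agree modulo 94. For example, 'a' (code 97) maps to chr(33 + (97 + 14) % 94) = chr(50) = '2'. A minimal sketch of just that step:

    def shift_char(c):
        """CDA's obfuscation step: rotate codes 33-126 within a
        94-symbol alphabet; other characters pass through unchanged."""
        f = ord(c)
        if 33 <= f <= 126:
            return chr(33 + (f + 14) % 94)
        return c

    print(''.join(shift_char(c) for c in 'abc'))  # -> '234'
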
diff --git a/youtube_dlc/extractor/cnbc.py b/youtube_dlc/extractor/cnbc.py
index 6889b0f40..7b9f4536a 100644
--- a/youtube_dlc/extractor/cnbc.py
+++ b/youtube_dlc/extractor/cnbc.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
from .common import InfoExtractor
from ..utils import smuggle_url
@@ -38,7 +39,7 @@ class CNBCIE(InfoExtractor):
class CNBCVideoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?cnbc\.com/video/(?:[^/]+/)+(?P<id>[^./?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?cnbc\.com(?P<path>/video/(?:[^/]+/)+(?P<id>[^./?#&]+)\.html)'
_TEST = {
'url': 'https://www.cnbc.com/video/2018/07/19/trump-i-dont-necessarily-agree-with-raising-rates.html',
'info_dict': {
@@ -56,11 +57,15 @@ class CNBCVideoIE(InfoExtractor):
}
def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
- video_id = self._search_regex(
- r'content_id["\']\s*:\s*["\'](\d+)', webpage, display_id,
- 'video id')
+ path, display_id = re.match(self._VALID_URL, url).groups()
+ video_id = self._download_json(
+ 'https://webql-redesign.cnbcfm.com/graphql', display_id, query={
+ 'query': '''{
+ page(path: "%s") {
+ vcpsId
+ }
+}''' % path,
+ })['data']['page']['vcpsId']
return self.url_result(
- 'http://video.cnbc.com/gallery/?video=%s' % video_id,
+ 'http://video.cnbc.com/gallery/?video=%d' % video_id,
CNBCIE.ie_key())
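
Rather than scraping content_id out of the page, the extractor now asks CNBC's GraphQL endpoint for the page's vcpsId. A sketch of that single query with requests (the endpoint and query shape are taken from the diff; the rest is illustrative):

    import requests

    def cnbc_video_id(path):
        """Resolve a /video/....html path to its numeric vcpsId."""
        query = '{ page(path: "%s") { vcpsId } }' % path
        resp = requests.get(
            'https://webql-redesign.cnbcfm.com/graphql',
            params={'query': query})
        return resp.json()['data']['page']['vcpsId']

    # path = ('/video/2018/07/19/'
    #         'trump-i-dont-necessarily-agree-with-raising-rates.html')
    # print(cnbc_video_id(path))
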
diff --git a/youtube_dlc/extractor/common.py b/youtube_dlc/extractor/common.py
index 4b42d699f..aacdf06fe 100644
--- a/youtube_dlc/extractor/common.py
+++ b/youtube_dlc/extractor/common.py
@@ -1456,9 +1456,10 @@ class InfoExtractor(object):
try:
self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
return True
- except ExtractorError:
+ except ExtractorError as e:
self.to_screen(
- '%s: %s URL is invalid, skipping' % (video_id, item))
+ '%s: %s URL is invalid, skipping: %s'
+ % (video_id, item, error_to_compat_str(e.cause)))
return False
def http_scheme(self):
@@ -1663,7 +1664,7 @@ class InfoExtractor(object):
# just the media without qualities renditions.
# Fortunately, master playlist can be easily distinguished from media
# playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
- # master playlist tags MUST NOT appear in a media playist and vice versa.
+ # master playlist tags MUST NOT appear in a media playlist and vice versa.
# As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
# media playlist and MUST NOT appear in master playlist thus we can
# clearly detect media playlist with this criterion.
@@ -2596,6 +2597,7 @@ class InfoExtractor(object):
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
formats = []
+
hdcore_sign = 'hdcore=3.7.0'
f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
hds_host = hosts.get('hds')
@@ -2608,6 +2610,7 @@ class InfoExtractor(object):
for entry in f4m_formats:
entry.update({'extra_param_to_segment_url': hdcore_sign})
formats.extend(f4m_formats)
+
m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
hls_host = hosts.get('hls')
if hls_host:
@@ -2615,6 +2618,31 @@ class InfoExtractor(object):
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False))
+
+ http_host = hosts.get('http')
+ if http_host and 'hdnea=' not in manifest_url:
+ REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'
+ qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
+ qualities_length = len(qualities)
+ if len(formats) in (qualities_length + 1, qualities_length * 2 + 1):
+ i = 0
+ http_formats = []
+ for f in formats:
+ if f['protocol'] == 'm3u8_native' and f['vcodec'] != 'none':
+ for protocol in ('http', 'https'):
+ http_f = f.copy()
+ del http_f['manifest_url']
+ http_url = re.sub(
+ REPL_REGEX, protocol + r'://%s/\1%s\3' % (http_host, qualities[i]), f['url'])
+ http_f.update({
+ 'format_id': http_f['format_id'].replace('hls-', protocol + '-'),
+ 'url': http_url,
+ 'protocol': protocol,
+ })
+ http_formats.append(http_f)
+ i += 1
+ formats.extend(http_formats)
+
return formats
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
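
The new http branch of _extract_akamai_formats derives progressive URLs by splitting the quality list out of the ,q1,q2,...csmil master URL and splicing each quality back into a single-rendition path. A sketch of just the URL rewrite, with a made-up host and URL for illustration (\g<1> is used in the replacement because the quality token starts with a digit):

    import re

    REPL_REGEX = r'https://[^/]+/i/([^,]+),([^/]+),([^/]+).csmil/.+'

    def http_urls_from_m3u8(m3u8_url, http_host):
        """Expand an Akamai .csmil HLS master URL into per-quality
        progressive URLs on the given HTTP host."""
        qualities = re.match(REPL_REGEX, m3u8_url).group(2).split(',')
        urls = []
        for quality in qualities:
            for protocol in ('http', 'https'):
                urls.append(re.sub(
                    REPL_REGEX,
                    protocol + r'://%s/\g<1>%s\g<3>' % (http_host, quality),
                    m3u8_url))
        return urls

    m3u8 = ('https://example-hls.akamaized.net/i/clip_,360,720,p.mp4'
            '.csmil/master.m3u8')
    print(http_urls_from_m3u8(m3u8, 'vod.example.com'))
    # -> ['http://vod.example.com/clip_360p.mp4', ...]
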
diff --git a/youtube_dlc/extractor/condenast.py b/youtube_dlc/extractor/condenast.py
index ed278fefc..d5e77af32 100644
--- a/youtube_dlc/extractor/condenast.py
+++ b/youtube_dlc/extractor/condenast.py
@@ -16,6 +16,8 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_iso8601,
+ strip_or_none,
+ try_get,
)
@@ -82,6 +84,7 @@ class CondeNastIE(InfoExtractor):
'uploader': 'gq',
'upload_date': '20170321',
'timestamp': 1490126427,
+ 'description': 'How much grimmer would things be if these people were competent?',
},
}, {
# JS embed
@@ -93,7 +96,7 @@ class CondeNastIE(InfoExtractor):
'title': '3D printed TSA Travel Sentry keys really do open TSA locks',
'uploader': 'arstechnica',
'upload_date': '20150916',
- 'timestamp': 1442434955,
+ 'timestamp': 1442434920,
}
}, {
'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
@@ -196,6 +199,13 @@ class CondeNastIE(InfoExtractor):
})
self._sort_formats(formats)
+ subtitles = {}
+ for t, caption in video_info.get('captions', {}).items():
+ caption_url = caption.get('src')
+ if not (t in ('vtt', 'srt', 'tml') and caption_url):
+ continue
+ subtitles.setdefault('en', []).append({'url': caption_url})
+
return {
'id': video_id,
'formats': formats,
@@ -208,6 +218,7 @@ class CondeNastIE(InfoExtractor):
'season': video_info.get('season_title'),
'timestamp': parse_iso8601(video_info.get('premiere_date')),
'categories': video_info.get('categories'),
+ 'subtitles': subtitles,
}
def _real_extract(self, url):
@@ -225,8 +236,16 @@ class CondeNastIE(InfoExtractor):
if url_type == 'series':
return self._extract_series(url, webpage)
else:
- params = self._extract_video_params(webpage, display_id)
- info = self._search_json_ld(
- webpage, display_id, fatal=False)
+ video = try_get(self._parse_json(self._search_regex(
+ r'__PRELOADED_STATE__\s*=\s*({.+?});', webpage,
+ 'preload state', '{}'), display_id),
+ lambda x: x['transformed']['video'])
+ if video:
+ params = {'videoId': video['id']}
+ info = {'description': strip_or_none(video.get('description'))}
+ else:
+ params = self._extract_video_params(webpage, display_id)
+ info = self._search_json_ld(
+ webpage, display_id, fatal=False)
info.update(self._extract_video(params))
return info
diff --git a/youtube_dlc/extractor/discoverynetworks.py b/youtube_dlc/extractor/discoverynetworks.py
index 607a54948..c512b95d0 100644
--- a/youtube_dlc/extractor/discoverynetworks.py
+++ b/youtube_dlc/extractor/discoverynetworks.py
@@ -7,7 +7,7 @@ from .dplay import DPlayIE
class DiscoveryNetworksDeIE(DPlayIE):
- _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show|sendungen)/(?P<programme>[^/]+)/(?:video/)?(?P<alternate_id>[^/]+)'
_TESTS = [{
'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100',
@@ -29,6 +29,9 @@ class DiscoveryNetworksDeIE(DPlayIE):
}, {
'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B',
'only_matching': True,
+ }, {
+ 'url': 'https://tlc.de/sendungen/breaking-amish/die-welt-da-drauen/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dlc/extractor/europa.py b/youtube_dlc/extractor/europa.py
index 1efc0b2ec..2c1c747a1 100644
--- a/youtube_dlc/extractor/europa.py
+++ b/youtube_dlc/extractor/europa.py
@@ -60,7 +60,7 @@ class EuropaIE(InfoExtractor):
title = get_item('title', preferred_langs) or video_id
description = get_item('description', preferred_langs)
- thumbnmail = xpath_text(playlist, './info/thumburl', 'thumbnail')
+ thumbnail = xpath_text(playlist, './info/thumburl', 'thumbnail')
upload_date = unified_strdate(xpath_text(playlist, './info/date', 'upload date'))
duration = parse_duration(xpath_text(playlist, './info/duration', 'duration'))
view_count = int_or_none(xpath_text(playlist, './info/views', 'views'))
@@ -85,7 +85,7 @@ class EuropaIE(InfoExtractor):
'id': video_id,
'title': title,
'description': description,
- 'thumbnail': thumbnmail,
+ 'thumbnail': thumbnail,
'upload_date': upload_date,
'duration': duration,
'view_count': view_count,
diff --git a/youtube_dlc/extractor/extractors.py b/youtube_dlc/extractor/extractors.py
index 01f69c006..f5894504e 100644
--- a/youtube_dlc/extractor/extractors.py
+++ b/youtube_dlc/extractor/extractors.py
@@ -36,6 +36,7 @@ from .afreecatv import AfreecaTVIE
from .airmozilla import AirMozillaIE
from .aljazeera import AlJazeeraIE
from .alphaporno import AlphaPornoIE
+from .amara import AmaraIE
from .alura import (
AluraIE,
AluraCourseIE
@@ -62,7 +63,7 @@ from .ard import (
ARDMediathekIE,
)
from .arte import (
- ArteTVPlus7IE,
+ ArteTVIE,
ArteTVEmbedIE,
ArteTVPlaylistIE,
)
@@ -116,6 +117,10 @@ from .bitchute import (
BitChuteIE,
BitChuteChannelIE,
)
+from .bitwave import (
+ BitwaveReplayIE,
+ BitwaveStreamIE,
+)
from .biqle import BIQLEIE
from .bleacherreport import (
BleacherReportIE,
@@ -125,6 +130,7 @@ from .blinkx import BlinkxIE
from .bloomberg import BloombergIE
from .bokecc import BokeCCIE
from .bostonglobe import BostonGlobeIE
+from .box import BoxIE
from .bpb import BpbIE
from .br import (
BRIE,
@@ -546,6 +552,7 @@ from .laola1tv import (
EHFTVIE,
ITTFIE,
)
+from .lbry import LBRYIE
from .lci import LCIIE
from .lcp import (
LcpPlayIE,
@@ -621,6 +628,7 @@ from .markiza import (
from .massengeschmacktv import MassengeschmackTVIE
from .matchtv import MatchTVIE
from .mdr import MDRIE
+from .medaltv import MedalTVIE
from .mediaset import MediasetIE
from .mediasite import (
MediasiteIE,
@@ -755,6 +763,7 @@ from .ninecninemedia import NineCNineMediaIE
from .ninegag import NineGagIE
from .ninenow import NineNowIE
from .nintendo import NintendoIE
+from .nitter import NitterIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .noco import NocoIE
@@ -802,6 +811,7 @@ from .ntvru import NTVRuIE
from .nytimes import (
NYTimesIE,
NYTimesArticleIE,
+ NYTimesCookingIE,
)
from .nuvid import NuvidIE
from .nzz import NZZIE
@@ -864,6 +874,10 @@ from .picarto import (
)
from .piksel import PikselIE
from .pinkbike import PinkbikeIE
+from .pinterest import (
+ PinterestIE,
+ PinterestCollectionIE,
+)
from .pladform import PladformIE
from .platzi import (
PlatziIE,
@@ -940,6 +954,11 @@ from .raywenderlich import (
RayWenderlichCourseIE,
)
from .rbmaradio import RBMARadioIE
+from .rcs import (
+ RCSIE,
+ RCSEmbedsIE,
+ RCSVariousIE,
+)
from .rds import RDSIE
from .redbulltv import (
RedBullTVIE,
@@ -982,6 +1001,7 @@ from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETe
from .rtvnh import RTVNHIE
from .rtvs import RTVSIE
from .ruhd import RUHDIE
+from .rumble import RumbleEmbedIE
from .rutube import (
RutubeIE,
RutubeChannelIE,
@@ -1041,6 +1061,10 @@ from .sky import (
SkyNewsIE,
SkySportsIE,
)
+from .skyitalia import (
+ SkyArteItaliaIE,
+ SkyItaliaIE,
+)
from .slideshare import SlideshareIE
from .slideslive import SlidesLiveIE
from .slutload import SlutloadIE
@@ -1078,8 +1102,7 @@ from .spankbang import (
SpankBangPlaylistIE,
)
from .spankwire import SpankwireIE
-from .spiegel import SpiegelIE, SpiegelArticleIE
-from .spiegeltv import SpiegeltvIE
+from .spiegel import SpiegelIE
from .spike import (
BellatorIE,
ParamountNetworkIE,
@@ -1093,6 +1116,12 @@ from .stitcher import StitcherIE
from .sport5 import Sport5IE
from .sportbox import SportBoxIE
from .sportdeutschland import SportDeutschlandIE
+from .spreaker import (
+ SpreakerIE,
+ SpreakerPageIE,
+ SpreakerShowIE,
+ SpreakerShowPageIE,
+)
from .springboardplatform import SpringboardPlatformIE
from .sprout import SproutIE
from .srgssr import (
@@ -1174,6 +1203,7 @@ from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
from .thisoldhouse import ThisOldHouseIE
+from .thisvid import ThisVidIE
from .threeqsdn import ThreeQSDNIE
from .tiktok import TikTokIE
from .tinypic import TinyPicIE
@@ -1385,8 +1415,8 @@ from .vk import (
)
from .vlive import (
VLiveIE,
+ VLivePostIE,
VLiveChannelIE,
- VLivePlaylistIE
)
from .vodlocker import VodlockerIE
from .vodpl import VODPlIE
@@ -1503,21 +1533,18 @@ from .yourporn import YourPornIE
from .yourupload import YourUploadIE
from .youtube import (
YoutubeIE,
- YoutubeChannelIE,
YoutubeFavouritesIE,
YoutubeHistoryIE,
- YoutubeLiveIE,
+ YoutubeTabIE,
YoutubePlaylistIE,
- YoutubePlaylistsIE,
YoutubeRecommendedIE,
YoutubeSearchDateIE,
YoutubeSearchIE,
YoutubeSearchURLIE,
- YoutubeShowIE,
YoutubeSubscriptionsIE,
YoutubeTruncatedIDIE,
YoutubeTruncatedURLIE,
- YoutubeUserIE,
+ YoutubeYtUserIE,
YoutubeWatchLaterIE,
)
from .zapiks import ZapiksIE
@@ -1543,4 +1570,5 @@ from .zattoo import (
)
from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import ZingMp3IE
+from .zoom import ZoomIE
from .zype import ZypeIE
diff --git a/youtube_dlc/extractor/franceinter.py b/youtube_dlc/extractor/franceinter.py
index 05806895c..ae822a50e 100644
--- a/youtube_dlc/extractor/franceinter.py
+++ b/youtube_dlc/extractor/franceinter.py
@@ -16,6 +16,7 @@ class FranceInterIE(InfoExtractor):
'ext': 'mp3',
'title': 'Affaire Cahuzac : le contentieux du compte en Suisse',
'description': 'md5:401969c5d318c061f86bda1fa359292b',
+ 'thumbnail': r're:^https?://.*\.jpg',
'upload_date': '20160907',
},
}
@@ -31,6 +32,7 @@ class FranceInterIE(InfoExtractor):
title = self._og_search_title(webpage)
description = self._og_search_description(webpage)
+ thumbnail = self._html_search_meta(['og:image', 'twitter:image'], webpage)
upload_date_str = self._search_regex(
r'class=["\']\s*cover-emission-period\s*["\'][^>]*>[^<]+\s+(\d{1,2}\s+[^\s]+\s+\d{4})<',
@@ -48,6 +50,7 @@ class FranceInterIE(InfoExtractor):
'id': video_id,
'title': title,
'description': description,
+ 'thumbnail': thumbnail,
'upload_date': upload_date,
'formats': [{
'url': video_url,
diff --git a/youtube_dlc/extractor/francetv.py b/youtube_dlc/extractor/francetv.py
index e340cddba..ab0df1bed 100644
--- a/youtube_dlc/extractor/francetv.py
+++ b/youtube_dlc/extractor/francetv.py
@@ -17,6 +17,7 @@ from ..utils import (
parse_duration,
try_get,
url_or_none,
+ urljoin,
)
from .dailymotion import DailymotionIE
@@ -128,18 +129,38 @@ class FranceTVIE(InfoExtractor):
is_live = None
- formats = []
- for video in info['videos']:
- if video['statut'] != 'ONLINE':
+ videos = []
+
+ for video in (info.get('videos') or []):
+ if video.get('statut') != 'ONLINE':
continue
- video_url = video['url']
+ if not video.get('url'):
+ continue
+ videos.append(video)
+
+ if not videos:
+ for device_type in ['desktop', 'mobile']:
+ fallback_info = self._download_json(
+ 'https://player.webservices.francetelevisions.fr/v1/videos/%s' % video_id,
+ video_id, 'Downloading fallback %s video JSON' % device_type, query={
+ 'device_type': device_type,
+ 'browser': 'chrome',
+ }, fatal=False)
+
+ if fallback_info and fallback_info.get('video'):
+ videos.append(fallback_info['video'])
+
+ formats = []
+ for video in videos:
+ video_url = video.get('url')
if not video_url:
continue
if is_live is None:
is_live = (try_get(
- video, lambda x: x['plages_ouverture'][0]['direct'],
- bool) is True) or '/live.francetv.fr/' in video_url
- format_id = video['format']
+ video, lambda x: x['plages_ouverture'][0]['direct'], bool) is True
+ or video.get('is_live') is True
+ or '/live.francetv.fr/' in video_url)
+ format_id = video.get('format')
ext = determine_ext(video_url)
if ext == 'f4m':
if georestricted:
@@ -154,6 +175,9 @@ class FranceTVIE(InfoExtractor):
sign(video_url, format_id), video_id, 'mp4',
entry_protocol='m3u8_native', m3u8_id=format_id,
fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ sign(video_url, format_id), video_id, mpd_id=format_id, fatal=False))
elif video_url.startswith('rtmp'):
formats.append({
'url': video_url,
@@ -166,6 +190,7 @@ class FranceTVIE(InfoExtractor):
'url': video_url,
'format_id': format_id,
})
+
self._sort_formats(formats)
title = info['titre']
@@ -185,10 +210,10 @@ class FranceTVIE(InfoExtractor):
return {
'id': video_id,
'title': self._live_title(title) if is_live else title,
- 'description': clean_html(info['synopsis']),
- 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']),
- 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']),
- 'timestamp': int_or_none(info['diffusion']['timestamp']),
+ 'description': clean_html(info.get('synopsis')),
+ 'thumbnail': urljoin('https://sivideo.webservices.francetelevisions.fr', info.get('image')),
+ 'duration': int_or_none(info.get('real_duration')) or parse_duration(info.get('duree')),
+ 'timestamp': int_or_none(try_get(info, lambda x: x['diffusion']['timestamp'])),
'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
diff --git a/youtube_dlc/extractor/generic.py b/youtube_dlc/extractor/generic.py
index 3fab929a8..e5d29f316 100644
--- a/youtube_dlc/extractor/generic.py
+++ b/youtube_dlc/extractor/generic.py
@@ -91,6 +91,7 @@ from .piksel import PikselIE
from .videa import VideaIE
from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE
+from .arte import ArteTVEmbedIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .limelight import LimelightBaseIE
@@ -120,6 +121,8 @@ from .zype import ZypeIE
from .odnoklassniki import OdnoklassnikiIE
from .kinja import KinjaEmbedIE
from .gedi import GediEmbedsIE
+from .rcs import RCSEmbedsIE
+from .bitchute import BitChuteIE
class GenericIE(InfoExtractor):
@@ -842,7 +845,7 @@ class GenericIE(InfoExtractor):
'skip_download': True,
}
},
- # MTVSercices embed
+ # MTVServices embed
{
'url': 'http://www.vulture.com/2016/06/new-key-peele-sketches-released.html',
'md5': 'ca1aef97695ef2c1d6973256a57e5252',
@@ -2761,11 +2764,9 @@ class GenericIE(InfoExtractor):
return self.url_result(ustream_url, UstreamIE.ie_key())
# Look for embedded arte.tv player
- mobj = re.search(
- r'<(?:script|iframe) [^>]*?src="(?P<url>http://www\.arte\.tv/(?:playerv2/embed|arte_vp/index)[^"]+)"',
- webpage)
- if mobj is not None:
- return self.url_result(mobj.group('url'), 'ArteTVEmbed')
+ arte_urls = ArteTVEmbedIE._extract_urls(webpage)
+ if arte_urls:
+ return self.playlist_from_matches(arte_urls, video_id, video_title)
# Look for embedded francetv player
mobj = re.search(
@@ -3220,6 +3221,16 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
gedi_urls, video_id, video_title, ie=GediEmbedsIE.ie_key())
+ rcs_urls = RCSEmbedsIE._extract_urls(webpage)
+ if rcs_urls:
+ return self.playlist_from_matches(
+ rcs_urls, video_id, video_title, ie=RCSEmbedsIE.ie_key())
+
+ bitchute_urls = BitChuteIE._extract_urls(webpage)
+ if bitchute_urls:
+ return self.playlist_from_matches(
+ bitchute_urls, video_id, video_title, ie=BitChuteIE.ie_key())
+
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
diff --git a/youtube_dlc/extractor/googledrive.py b/youtube_dlc/extractor/googledrive.py
index ec0d58a57..fdb15795a 100644
--- a/youtube_dlc/extractor/googledrive.py
+++ b/youtube_dlc/extractor/googledrive.py
@@ -3,11 +3,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_parse_qs
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
lowercase_escape,
+ try_get,
update_url_query,
)
@@ -38,21 +40,10 @@ class GoogleDriveIE(InfoExtractor):
# video can't be watched anonymously due to view count limit reached,
# but can be downloaded (see https://github.com/ytdl-org/youtube-dl/issues/14046)
'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
- 'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
- 'info_dict': {
- 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
- 'ext': 'mp4',
- 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
- }
+ 'only_matching': True,
}, {
# video id is longer than 28 characters
'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
- 'info_dict': {
- 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
- 'ext': 'mp4',
- 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
- 'duration': 189,
- },
'only_matching': True,
}, {
'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
@@ -171,23 +162,21 @@ class GoogleDriveIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(
- 'http://docs.google.com/file/d/%s' % video_id, video_id)
+ video_info = compat_parse_qs(self._download_webpage(
+ 'https://drive.google.com/get_video_info',
+ video_id, query={'docid': video_id}))
+
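+ # get_video_info returns urlencoded key=value pairs; after parsing,
+ # every value is a single-item list, hence the [0] below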
+ def get_value(key):
+ return try_get(video_info, lambda x: x[key][0])
- title = self._search_regex(
- r'"title"\s*,\s*"([^"]+)', webpage, 'title',
- default=None) or self._og_search_title(webpage)
- duration = int_or_none(self._search_regex(
- r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
- default=None))
+ reason = get_value('reason')
+ title = get_value('title')
+ if not title and reason:
+ raise ExtractorError(reason, expected=True)
formats = []
- fmt_stream_map = self._search_regex(
- r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
- 'fmt stream map', default='').split(',')
- fmt_list = self._search_regex(
- r'"fmt_list"\s*,\s*"([^"]+)', webpage,
- 'fmt_list', default='').split(',')
+ fmt_stream_map = (get_value('fmt_stream_map') or '').split(',')
+ fmt_list = (get_value('fmt_list') or '').split(',')
if fmt_stream_map and fmt_list:
resolutions = {}
for fmt in fmt_list:
@@ -257,19 +246,14 @@ class GoogleDriveIE(InfoExtractor):
if urlh and urlh.headers.get('Content-Disposition'):
add_source_format(urlh)
- if not formats:
- reason = self._search_regex(
- r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
- if reason:
- raise ExtractorError(reason, expected=True)
+ if not formats and reason:
+ raise ExtractorError(reason, expected=True)
self._sort_formats(formats)
- hl = self._search_regex(
- r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
+ hl = get_value('hl')
subtitles_id = None
- ttsurl = self._search_regex(
- r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
+ ttsurl = get_value('ttsurl')
if ttsurl:
# the video Id for subtitles will be the last value in the ttsurl
# query string
@@ -281,8 +265,8 @@ class GoogleDriveIE(InfoExtractor):
return {
'id': video_id,
'title': title,
- 'thumbnail': self._og_search_thumbnail(webpage, default=None),
- 'duration': duration,
+ 'thumbnail': 'https://drive.google.com/thumbnail?id=' + video_id,
+ 'duration': int_or_none(get_value('length_seconds')),
'formats': formats,
'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
'automatic_captions': self.extract_automatic_captions(
diff --git a/youtube_dlc/extractor/ina.py b/youtube_dlc/extractor/ina.py
index 12695af27..b3b2683cb 100644
--- a/youtube_dlc/extractor/ina.py
+++ b/youtube_dlc/extractor/ina.py
@@ -12,7 +12,7 @@ from ..utils import (
class InaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?ina\.fr/(?:video|audio)/(?P<id>[A-Z0-9_]+)'
_TESTS = [{
'url': 'http://www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
'md5': 'a667021bf2b41f8dc6049479d9bb38a3',
@@ -31,6 +31,9 @@ class InaIE(InfoExtractor):
}, {
'url': 'https://www.ina.fr/video/P16173408-video.html',
'only_matching': True,
+ }, {
+ 'url': 'http://m.ina.fr/video/I12055569',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dlc/extractor/infoq.py b/youtube_dlc/extractor/infoq.py
index 18249cf9b..0a70a1fb4 100644
--- a/youtube_dlc/extractor/infoq.py
+++ b/youtube_dlc/extractor/infoq.py
@@ -54,7 +54,7 @@ class InfoQIE(BokeCCBaseIE):
def _extract_rtmp_video(self, webpage):
# The server URL is hardcoded
- video_url = 'rtmpe://video.infoq.com/cfx/st/'
+ video_url = 'rtmpe://videof.infoq.com/cfx/st/'
# Extract video URL
encoded_id = self._search_regex(
@@ -86,17 +86,18 @@ class InfoQIE(BokeCCBaseIE):
return [{
'format_id': 'http_video',
'url': http_video_url,
+ 'http_headers': {'Referer': 'https://www.infoq.com/'},
}]
def _extract_http_audio(self, webpage, video_id):
- fields = self._hidden_inputs(webpage)
+ fields = self._form_hidden_inputs('mp3Form', webpage)
http_audio_url = fields.get('filename')
if not http_audio_url:
return []
# base URL is found in the Location header in the response returned by
# GET https://www.infoq.com/mp3download.action?filename=... when logged in.
- http_audio_url = compat_urlparse.urljoin('http://res.infoq.com/downloads/mp3downloads/', http_audio_url)
+ http_audio_url = compat_urlparse.urljoin('http://ress.infoq.com/downloads/mp3downloads/', http_audio_url)
http_audio_url = update_url_query(http_audio_url, self._extract_cf_auth(webpage))
# the audio file sometimes seems to be missing even if there is a download link
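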
diff --git a/youtube_dlc/extractor/instagram.py b/youtube_dlc/extractor/instagram.py
index b061850a1..c3eba0114 100644
--- a/youtube_dlc/extractor/instagram.py
+++ b/youtube_dlc/extractor/instagram.py
@@ -126,16 +126,23 @@ class InstagramIE(InfoExtractor):
uploader_id, like_count, comment_count, comments, height,
width) = [None] * 11
- shared_data = self._parse_json(
- self._search_regex(
- r'window\._sharedData\s*=\s*({.+?});',
- webpage, 'shared data', default='{}'),
- video_id, fatal=False)
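+ # newer pages deliver post data via window.__additionalDataLoaded();
+ # fall back to the legacy window._sharedData blob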
+ shared_data = try_get(webpage,
+ (lambda x: self._parse_json(
+ self._search_regex(
+ r'window\.__additionalDataLoaded\(\'/(?:p|tv)/(?:[^/?#&]+)/\',({.+?})\);',
+ x, 'additional data', default='{}'),
+ video_id, fatal=False),
+ lambda x: self._parse_json(
+ self._search_regex(
+ r'window\._sharedData\s*=\s*({.+?});',
+ x, 'shared data', default='{}'),
+ video_id, fatal=False)['entry_data']['PostPage'][0]),
+ None)
if shared_data:
media = try_get(
shared_data,
- (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
- lambda x: x['entry_data']['PostPage'][0]['media']),
+ (lambda x: x['graphql']['shortcode_media'],
+ lambda x: x['media']),
dict)
if media:
video_url = media.get('video_url')
@@ -144,7 +151,7 @@ class InstagramIE(InfoExtractor):
description = try_get(
media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
compat_str) or media.get('caption')
- thumbnail = media.get('display_src')
+ thumbnail = media.get('display_src') or media.get('thumbnail_src')
timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
uploader = media.get('owner', {}).get('full_name')
uploader_id = media.get('owner', {}).get('username')
diff --git a/youtube_dlc/extractor/iqiyi.py b/youtube_dlc/extractor/iqiyi.py
index cd11aa70f..5df674daf 100644
--- a/youtube_dlc/extractor/iqiyi.py
+++ b/youtube_dlc/extractor/iqiyi.py
@@ -150,7 +150,7 @@ class IqiyiSDKInterpreter(object):
elif function in other_functions:
other_functions[function]()
else:
- raise ExtractorError('Unknown funcion %s' % function)
+ raise ExtractorError('Unknown function %s' % function)
return sdk.target
diff --git a/youtube_dlc/extractor/itv.py b/youtube_dlc/extractor/itv.py
index ad2f4eca5..20144cd82 100644
--- a/youtube_dlc/extractor/itv.py
+++ b/youtube_dlc/extractor/itv.py
@@ -20,6 +20,7 @@ from ..utils import (
merge_dicts,
parse_duration,
smuggle_url,
+ try_get,
url_or_none,
xpath_with_ns,
xpath_element,
@@ -280,12 +281,12 @@ class ITVIE(InfoExtractor):
class ITVBTCCIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?itv\.com/btcc/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TEST = {
- 'url': 'http://www.itv.com/btcc/races/btcc-2018-all-the-action-from-brands-hatch',
+ 'url': 'https://www.itv.com/btcc/articles/btcc-2019-brands-hatch-gp-race-action',
'info_dict': {
- 'id': 'btcc-2018-all-the-action-from-brands-hatch',
- 'title': 'BTCC 2018: All the action from Brands Hatch',
+ 'id': 'btcc-2019-brands-hatch-gp-race-action',
+ 'title': 'BTCC 2019: Brands Hatch GP race action',
},
- 'playlist_mincount': 9,
+ 'playlist_count': 12,
}
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1582188683001/HkiHLnNRx_default/index.html?videoId=%s'
@@ -294,6 +295,16 @@ class ITVBTCCIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
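+ # playlist entries now come from the Next.js __NEXT_DATA__ payload
+ # rather than data-video-id attributes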
+ json_map = try_get(self._parse_json(self._html_search_regex(
+ '(?s)<script[^>]+id=[\'"]__NEXT_DATA__[^>]*>([^<]+)</script>', webpage, 'json_map'), playlist_id),
+ lambda x: x['props']['pageProps']['article']['body']['content']) or []
+
+ # keep only entries whose data object carries a video id
+ video_ids = []
+ for video in json_map:
+ if video['data'].get('id'):
+ video_ids.append(video['data']['id'])
+
entries = [
self.url_result(
smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % video_id, {
@@ -305,7 +316,7 @@ class ITVBTCCIE(InfoExtractor):
'referrer': url,
}),
ie=BrightcoveNewIE.ie_key(), video_id=video_id)
- for video_id in re.findall(r'data-video-id=["\'](\d+)', webpage)]
+ for video_id in video_ids]
title = self._og_search_title(webpage, fatal=False)
diff --git a/youtube_dlc/extractor/kusi.py b/youtube_dlc/extractor/kusi.py
index 6a7e3baa7..9833d35eb 100644
--- a/youtube_dlc/extractor/kusi.py
+++ b/youtube_dlc/extractor/kusi.py
@@ -64,7 +64,7 @@ class KUSIIE(InfoExtractor):
duration = float_or_none(xpath_text(doc, 'DURATION'), scale=1000)
description = xpath_text(doc, 'ABSTRACT')
thumbnail = xpath_text(doc, './THUMBNAILIMAGE/FILENAME')
- createtion_time = timeconvert(xpath_text(doc, 'rfc822creationdate'))
+ creation_time = timeconvert(xpath_text(doc, 'rfc822creationdate'))
quality_options = doc.find('{http://search.yahoo.com/mrss/}group').findall('{http://search.yahoo.com/mrss/}content')
formats = []
@@ -84,5 +84,5 @@ class KUSIIE(InfoExtractor):
'duration': duration,
'formats': formats,
'thumbnail': thumbnail,
- 'timestamp': createtion_time,
+ 'timestamp': creation_time,
}
diff --git a/youtube_dlc/extractor/la7.py b/youtube_dlc/extractor/la7.py
index f5d4564fa..74b006fb5 100644
--- a/youtube_dlc/extractor/la7.py
+++ b/youtube_dlc/extractor/la7.py
@@ -36,6 +36,9 @@ class LA7IE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ if not url.startswith('http'):
+ url = '%s//%s' % (self.http_scheme(), url)
+
webpage = self._download_webpage(url, video_id)
player_data = self._search_regex(
diff --git a/youtube_dlc/extractor/lbry.py b/youtube_dlc/extractor/lbry.py
new file mode 100644
index 000000000..6177297ab
--- /dev/null
+++ b/youtube_dlc/extractor/lbry.py
@@ -0,0 +1,91 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ ExtractorError,
+ int_or_none,
+ mimetype2ext,
+ try_get,
+)
+
+
+class LBRYIE(InfoExtractor):
+ IE_NAME = 'lbry.tv'
+ _VALID_URL = r'https?://(?:www\.)?(?:lbry\.tv|odysee\.com)/(?P<id>@[^:]+:[0-9a-z]+/[^:]+:[0-9a-z])'
+ _TESTS = [{
+ # Video
+ 'url': 'https://lbry.tv/@Mantega:1/First-day-LBRY:1',
+ 'md5': '65bd7ec1f6744ada55da8e4c48a2edf9',
+ 'info_dict': {
+ 'id': '17f983b61f53091fb8ea58a9c56804e4ff8cff4d',
+ 'ext': 'mp4',
+ 'title': 'First day in LBRY? Start HERE!',
+ 'description': 'md5:f6cb5c704b332d37f5119313c2c98f51',
+ 'timestamp': 1595694354,
+ 'upload_date': '20200725',
+ }
+ }, {
+ # Audio
+ 'url': 'https://lbry.tv/@LBRYFoundation:0/Episode-1:e',
+ 'md5': 'c94017d3eba9b49ce085a8fad6b98d00',
+ 'info_dict': {
+ 'id': 'e7d93d772bd87e2b62d5ab993c1c3ced86ebb396',
+ 'ext': 'mp3',
+ 'title': 'The LBRY Foundation Community Podcast Episode 1 - Introduction, Streaming on LBRY, Transcoding',
+ 'description': 'md5:661ac4f1db09f31728931d7b88807a61',
+ 'timestamp': 1591312601,
+ 'upload_date': '20200604',
+ }
+ }, {
+ 'url': 'https://odysee.com/@BrodieRobertson:5/apple-is-tracking-everything-you-do-on:e',
+ 'only_matching': True,
+ }, {
+ 'url': "https://odysee.com/@ScammerRevolts:b0/I-SYSKEY'D-THE-SAME-SCAMMERS-3-TIMES!:b",
+ 'only_matching': True,
+ }]
+
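+ # the lbry.tv API proxy is JSON-RPC: 'resolve' maps lbry:// URIs to claims,
+ # 'get' returns a streaming URL for a claim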
+ def _call_api_proxy(self, method, display_id, params):
+ return self._download_json(
+ 'https://api.lbry.tv/api/v1/proxy', display_id,
+ headers={'Content-Type': 'application/json-rpc'},
+ data=json.dumps({
+ 'method': method,
+ 'params': params,
+ }).encode())['result']
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url).replace(':', '#')
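+ # web URLs separate names from claim ids with ':', canonical lbry:// URIs use '#'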
+ uri = 'lbry://' + display_id
+ result = self._call_api_proxy(
+ 'resolve', display_id, {'urls': [uri]})[uri]
+ result_value = result['value']
+ if result_value.get('stream_type') not in ('video', 'audio'):
+ raise ExtractorError('Unsupported URL', expected=True)
+ streaming_url = self._call_api_proxy(
+ 'get', display_id, {'uri': uri})['streaming_url']
+ source = result_value.get('source') or {}
+ media = result_value.get('video') or result_value.get('audio') or {}
+ signing_channel = result_value.get('signing_channel') or {}
+
+ return {
+ 'id': result['claim_id'],
+ 'title': result_value['title'],
+ 'thumbnail': try_get(result_value, lambda x: x['thumbnail']['url'], compat_str),
+ 'description': result_value.get('description'),
+ 'license': result_value.get('license'),
+ 'timestamp': int_or_none(result.get('timestamp')),
+ 'tags': result_value.get('tags'),
+ 'width': int_or_none(media.get('width')),
+ 'height': int_or_none(media.get('height')),
+ 'duration': int_or_none(media.get('duration')),
+ 'channel': signing_channel.get('name'),
+ 'channel_id': signing_channel.get('claim_id'),
+ 'ext': determine_ext(source.get('name')) or mimetype2ext(source.get('media_type')),
+ 'filesize': int_or_none(source.get('size')),
+ 'url': streaming_url,
+ }
diff --git a/youtube_dlc/extractor/lrt.py b/youtube_dlc/extractor/lrt.py
index f5c997ef4..89d549858 100644
--- a/youtube_dlc/extractor/lrt.py
+++ b/youtube_dlc/extractor/lrt.py
@@ -5,28 +5,26 @@ import re
from .common import InfoExtractor
from ..utils import (
- determine_ext,
- int_or_none,
- parse_duration,
- remove_end,
+ clean_html,
+ merge_dicts,
)
class LRTIE(InfoExtractor):
IE_NAME = 'lrt.lt'
- _VALID_URL = r'https?://(?:www\.)?lrt\.lt/mediateka/irasas/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?lrt\.lt(?P<path>/mediateka/irasas/(?P<id>[0-9]+))'
_TESTS = [{
# m3u8 download
- 'url': 'http://www.lrt.lt/mediateka/irasas/54391/',
- 'md5': 'fe44cf7e4ab3198055f2c598fc175cb0',
+ 'url': 'https://www.lrt.lt/mediateka/irasas/2000127261/greita-ir-gardu-sicilijos-ikvepta-klasikiniu-makaronu-su-baklazanais-vakariene',
+ 'md5': '85cb2bb530f31d91a9c65b479516ade4',
'info_dict': {
- 'id': '54391',
+ 'id': '2000127261',
'ext': 'mp4',
- 'title': 'Septynios Kauno dienos',
- 'description': 'md5:24d84534c7dc76581e59f5689462411a',
- 'duration': 1783,
- 'view_count': int,
- 'like_count': int,
+ 'title': 'Greita ir gardu: Sicilijos įkvėpta klasikinių makaronų su baklažanais vakarienė',
+ 'description': 'md5:ad7d985f51b0dc1489ba2d76d7ed47fa',
+ 'duration': 3035,
+ 'timestamp': 1604079000,
+ 'upload_date': '20201030',
},
}, {
# direct mp3 download
@@ -43,52 +41,35 @@ class LRTIE(InfoExtractor):
},
}]
+ def _extract_js_var(self, webpage, var_name, default):
+ return self._search_regex(
+ r'%s\s*=\s*(["\'])((?:(?!\1).)+)\1' % var_name,
+ webpage, var_name.replace('_', ' '), default, group=2)
+
def _real_extract(self, url):
- video_id = self._match_id(url)
+ path, video_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, video_id)
- title = remove_end(self._og_search_title(webpage), ' - LRT')
-
- formats = []
- for _, file_url in re.findall(
- r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage):
- ext = determine_ext(file_url)
- if ext not in ('m3u8', 'mp3'):
- continue
- # mp3 served as m3u8 produces stuttered media file
- if ext == 'm3u8' and '.mp3' in file_url:
- continue
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- file_url, video_id, 'mp4', entry_protocol='m3u8_native',
- fatal=False))
- elif ext == 'mp3':
- formats.append({
- 'url': file_url,
- 'vcodec': 'none',
- })
- self._sort_formats(formats)
+ media_url = self._extract_js_var(webpage, 'main_url', path)
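+ # main_url defaults to the page path, which the media_info service
+ # also accepts as its url parameter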
+ media = self._download_json(self._extract_js_var(
+ webpage, 'media_info_url',
+ 'https://www.lrt.lt/servisai/stream_url/vod/media_info/'),
+ video_id, query={'url': media_url})
+ jw_data = self._parse_jwplayer_data(
+ media['playlist_item'], video_id, base_url=url)
- thumbnail = self._og_search_thumbnail(webpage)
- description = self._og_search_description(webpage)
- duration = parse_duration(self._search_regex(
- r'var\s+record_len\s*=\s*(["\'])(?P<duration>[0-9]+:[0-9]+:[0-9]+)\1',
- webpage, 'duration', default=None, group='duration'))
+ json_ld_data = self._search_json_ld(webpage, video_id)
- view_count = int_or_none(self._html_search_regex(
- r'<div[^>]+class=(["\']).*?record-desc-seen.*?\1[^>]*>(?P<count>.+?)</div>',
- webpage, 'view count', fatal=False, group='count'))
- like_count = int_or_none(self._search_regex(
- r'<span[^>]+id=(["\'])flikesCount.*?\1>(?P<count>\d+)<',
- webpage, 'like count', fatal=False, group='count'))
+ tags = []
+ for tag in (media.get('tags') or []):
+ tag_name = tag.get('name')
+ if not tag_name:
+ continue
+ tags.append(tag_name)
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': thumbnail,
- 'description': description,
- 'duration': duration,
- 'view_count': view_count,
- 'like_count': like_count,
+ clean_info = {
+ 'description': clean_html(media.get('content')),
+ 'tags': tags,
}
+
+ return merge_dicts(clean_info, jw_data, json_ld_data)
diff --git a/youtube_dlc/extractor/mailru.py b/youtube_dlc/extractor/mailru.py
index 6fdf70aa6..5bfe40649 100644
--- a/youtube_dlc/extractor/mailru.py
+++ b/youtube_dlc/extractor/mailru.py
@@ -12,6 +12,7 @@ from ..utils import (
parse_duration,
remove_end,
try_get,
+ urljoin,
)
@@ -93,6 +94,14 @@ class MailRuIE(InfoExtractor):
{
'url': 'https://my.mail.ru//list//sinyutin10/video/_myvideo/4.html',
'only_matching': True,
+ },
+ {
+ 'url': 'https://my.mail.ru/mail/cloud-strife/video/embed/Games/2009',
+ 'only_matching': True,
+ },
+ {
+ 'url': 'https://videoapi.my.mail.ru/videos/embed/mail/cloud-strife/Games/2009.html',
+ 'only_matching': True,
}
]
@@ -110,7 +119,7 @@ class MailRuIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
page_config = self._parse_json(self._search_regex([
r'(?s)<script[^>]+class="sp-video__page-config"[^>]*>(.+?)</script>',
- r'(?s)"video":\s*(\{.+?\}),'],
+ r'(?s)"video":\s*({.+?}),'],
webpage, 'page config', default='{}'), video_id, fatal=False)
if page_config:
meta_url = page_config.get('metaUrl') or page_config.get('video', {}).get('metaUrl') or page_config.get('metadataUrl')
@@ -121,7 +130,7 @@ class MailRuIE(InfoExtractor):
# fix meta_url if missing the host address
if re.match(r'^\/\+\/', meta_url):
- meta_url = 'https://my.mail.ru' + meta_url
+ meta_url = urljoin('https://my.mail.ru', meta_url)
if meta_url:
video_data = self._download_json(
diff --git a/youtube_dlc/extractor/malltv.py b/youtube_dlc/extractor/malltv.py
index 6f4fd927f..fadfd9338 100644
--- a/youtube_dlc/extractor/malltv.py
+++ b/youtube_dlc/extractor/malltv.py
@@ -1,10 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from ..utils import merge_dicts
+from ..utils import (
+ clean_html,
+ dict_get,
+ float_or_none,
+ int_or_none,
+ merge_dicts,
+ parse_duration,
+ try_get,
+)
class MallTVIE(InfoExtractor):
@@ -17,7 +23,7 @@ class MallTVIE(InfoExtractor):
'display_id': '18-miliard-pro-neziskovky-opravdu-jsou-sportovci-nebo-clovek-v-tisni-pijavice',
'ext': 'mp4',
'title': '18 miliard pro neziskovky. Opravdu jsou sportovci nebo Člověk v tísni pijavice?',
- 'description': 'md5:25fc0ec42a72ba602b602c683fa29deb',
+ 'description': 'md5:db7d5744a4bd4043d9d98324aa72ab35',
'duration': 216,
'timestamp': 1538870400,
'upload_date': '20181007',
@@ -37,20 +43,46 @@ class MallTVIE(InfoExtractor):
webpage = self._download_webpage(
url, display_id, headers=self.geo_verification_headers())
- SOURCE_RE = r'(<source[^>]+\bsrc=(?:(["\'])(?:(?!\2).)+|[^\s]+)/(?P<id>[\da-z]+)/index)\b'
+ video = self._parse_json(self._search_regex(
+ r'videoObject\s*=\s*JSON\.parse\(JSON\.stringify\(({.+?})\)\);',
+ webpage, 'video object'), display_id)
+ video_source = video['VideoSource']
video_id = self._search_regex(
- SOURCE_RE, webpage, 'video id', group='id')
+ r'/([\da-z]+)/index\b', video_source, 'video id')
+
+ formats = self._extract_m3u8_formats(
+ video_source + '.m3u8', video_id, 'mp4', 'm3u8_native')
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for s in (video.get('Subtitles') or []):
+ s_url = s.get('Url')
+ if not s_url:
+ continue
+ subtitles.setdefault(s.get('Language') or 'cz', []).append({
+ 'url': s_url,
+ })
+
+ entity_counts = video.get('EntityCounts') or {}
- media = self._parse_html5_media_entries(
- url, re.sub(SOURCE_RE, r'\1.m3u8', webpage), video_id,
- m3u8_id='hls', m3u8_entry_protocol='m3u8_native')[0]
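+ # counters sit under pluralized keys ('Views', 'Likes', ...), each holding
+ # a numeric 'Count' or a string 'StrCount'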
+ def get_count(k):
+ v = entity_counts.get(k + 's') or {}
+ return int_or_none(dict_get(v, ('Count', 'StrCount')))
info = self._search_json_ld(webpage, video_id, default={})
- return merge_dicts(media, info, {
+ return merge_dicts({
'id': video_id,
'display_id': display_id,
- 'title': self._og_search_title(webpage, default=None) or display_id,
- 'description': self._og_search_description(webpage, default=None),
- 'thumbnail': self._og_search_thumbnail(webpage, default=None),
- })
+ 'title': video.get('Title'),
+ 'description': clean_html(video.get('Description')),
+ 'thumbnail': video.get('ThumbnailUrl'),
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'duration': int_or_none(video.get('DurationSeconds')) or parse_duration(video.get('Duration')),
+ 'view_count': get_count('View'),
+ 'like_count': get_count('Like'),
+ 'dislike_count': get_count('Dislike'),
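+ # 'AvarageRate' below matches the API's own (misspelled) key name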
+ 'average_rating': float_or_none(try_get(video, lambda x: x['EntityRating']['AvarageRate'])),
+ 'comment_count': get_count('Comment'),
+ }, info)
diff --git a/youtube_dlc/extractor/medaltv.py b/youtube_dlc/extractor/medaltv.py
new file mode 100644
index 000000000..1603b55f6
--- /dev/null
+++ b/youtube_dlc/extractor/medaltv.py
@@ -0,0 +1,131 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ str_or_none,
+ try_get,
+)
+
+
+class MedalTVIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?medal\.tv/clips/(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'https://medal.tv/clips/34934644/3Is9zyGMoBMr',
+ 'md5': '7b07b064331b1cf9e8e5c52a06ae68fa',
+ 'info_dict': {
+ 'id': '34934644',
+ 'ext': 'mp4',
+ 'title': 'Quad Cold',
+ 'description': 'Medal,https://medal.tv/desktop/',
+ 'uploader': 'MowgliSB',
+ 'timestamp': 1603165266,
+ 'upload_date': '20201020',
+ 'uploader_id': 10619174,
+ }
+ }, {
+ 'url': 'https://medal.tv/clips/36787208',
+ 'md5': 'b6dc76b78195fff0b4f8bf4a33ec2148',
+ 'info_dict': {
+ 'id': '36787208',
+ 'ext': 'mp4',
+ 'title': 'u tk me i tk u bigger',
+ 'description': 'Medal,https://medal.tv/desktop/',
+ 'uploader': 'Mimicc',
+ 'timestamp': 1605580939,
+ 'upload_date': '20201117',
+ 'uploader_id': 5156321,
+ }
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ hydration_data = self._parse_json(self._search_regex(
+ r'<script[^>]*>\s*(?:var\s*)?hydrationData\s*=\s*({.+?})\s*</script>',
+ webpage, 'hydration data', default='{}'), video_id)
+
+ clip = try_get(
+ hydration_data, lambda x: x['clips'][video_id], dict) or {}
+ if not clip:
+ raise ExtractorError(
+ 'Could not find video information.', video_id=video_id)
+
+ title = clip['contentTitle']
+
+ source_width = int_or_none(clip.get('sourceWidth'))
+ source_height = int_or_none(clip.get('sourceHeight'))
+
+ aspect_ratio = source_width / source_height if source_width and source_height else 16 / 9
+
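+ # clip entries are keyed contentUrl<height>p / thumbnail<height>p;
+ # width is reconstructed from the source aspect ratio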
+ def add_item(container, item_url, height, id_key='format_id', item_id=None):
+ item_id = item_id or '%dp' % height
+ if item_id not in item_url:
+ return
+ width = int(round(aspect_ratio * height))
+ container.append({
+ 'url': item_url,
+ id_key: item_id,
+ 'width': width,
+ 'height': height
+ })
+
+ formats = []
+ thumbnails = []
+ for k, v in clip.items():
+ if not (v and isinstance(v, compat_str)):
+ continue
+ mobj = re.match(r'(contentUrl|thumbnail)(?:(\d+)p)?$', k)
+ if not mobj:
+ continue
+ prefix = mobj.group(1)
+ height = int_or_none(mobj.group(2))
+ if prefix == 'contentUrl':
+ add_item(
+ formats, v, height or source_height,
+ item_id=None if height else 'source')
+ elif prefix == 'thumbnail':
+ add_item(thumbnails, v, height, 'id')
+
+ error = clip.get('error')
+ if not formats and error:
+ if error == 404:
+ raise ExtractorError(
+ 'That clip does not exist.',
+ expected=True, video_id=video_id)
+ else:
+ raise ExtractorError(
+ 'An unknown error occurred ({0}).'.format(error),
+ video_id=video_id)
+
+ self._sort_formats(formats)
+
+ # The author id is not known in advance, so take the first profile entry.
+ # A missing profile is fine: the uploader fields below are optional.
+ author = try_get(
+ hydration_data, lambda x: list(x['profiles'].values())[0], dict) or {}
+ author_id = str_or_none(author.get('id'))
+ author_url = 'https://medal.tv/users/{0}'.format(author_id) if author_id else None
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'description': clip.get('contentDescription'),
+ 'uploader': author.get('displayName'),
+ 'timestamp': float_or_none(clip.get('created'), 1000),
+ 'uploader_id': author_id,
+ 'uploader_url': author_url,
+ 'duration': int_or_none(clip.get('videoLengthSeconds')),
+ 'view_count': int_or_none(clip.get('views')),
+ 'like_count': int_or_none(clip.get('likes')),
+ 'comment_count': int_or_none(clip.get('comments')),
+ }
diff --git a/youtube_dlc/extractor/mgtv.py b/youtube_dlc/extractor/mgtv.py
index 71fc3ec56..cab3aa045 100644
--- a/youtube_dlc/extractor/mgtv.py
+++ b/youtube_dlc/extractor/mgtv.py
@@ -17,9 +17,8 @@ from ..utils import (
class MGTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
+ _VALID_URL = r'https?://(?:w(?:ww)?\.)?mgtv\.com/(v|b)/(?:[^/]+/)*(?P<id>\d+)\.html'
IE_DESC = '芒果TV'
- _GEO_COUNTRIES = ['CN']
_TESTS = [{
'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html',
@@ -34,14 +33,18 @@ class MGTVIE(InfoExtractor):
}, {
'url': 'http://www.mgtv.com/b/301817/3826653.html',
'only_matching': True,
+ }, {
+ 'url': 'https://w.mgtv.com/b/301817/3826653.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
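+ # build tk2 once so the identical token goes to both the player/video
+ # and player/getSource requests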
+ tk2 = base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1]
try:
api_data = self._download_json(
'https://pcweb.api.mgtv.com/player/video', video_id, query={
- 'tk2': base64.urlsafe_b64encode(b'did=%s|pno=1030|ver=0.3.0301|clit=%d' % (compat_str(uuid.uuid4()).encode(), time.time()))[::-1],
+ 'tk2': tk2,
'video_id': video_id,
}, headers=self.geo_verification_headers())['data']
except ExtractorError as e:
@@ -56,6 +59,7 @@ class MGTVIE(InfoExtractor):
stream_data = self._download_json(
'https://pcweb.api.mgtv.com/player/getSource', video_id, query={
'pm2': api_data['atc']['pm2'],
+ 'tk2': tk2,
'video_id': video_id,
}, headers=self.geo_verification_headers())['data']
stream_domain = stream_data['stream_domain'][0]
diff --git a/youtube_dlc/extractor/mtv.py b/youtube_dlc/extractor/mtv.py
index 6b3658397..d31f53137 100644
--- a/youtube_dlc/extractor/mtv.py
+++ b/youtube_dlc/extractor/mtv.py
@@ -289,7 +289,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
return mgid
- def _extract_mgid(self, webpage, url, data_zone=None):
+ def _extract_mgid(self, webpage, url, title=None, data_zone=None):
try:
# the url can be http://media.mtvnservices.com/fb/{mgid}.swf
# or http://media.mtvnservices.com/{mgid}
@@ -300,7 +300,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
except RegexNotFoundError:
mgid = None
- title = self._match_id(url)
+ if not title:
+ title = url_basename(url)
try:
window_data = self._parse_json(self._search_regex(
@@ -336,7 +337,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
def _real_extract(self, url):
title = url_basename(url)
webpage = self._download_webpage(url, title)
- mgid = self._extract_mgid(webpage, url)
+ mgid = self._extract_mgid(webpage, url, title=title)
videos_info = self._get_videos_info(mgid, url=url)
return videos_info
@@ -402,6 +403,18 @@ class MTVIE(MTVServicesInfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def extract_child_with_type(parent, t):
+ children = parent['children']
+ return next(c for c in children if c.get('type') == t)
+
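+ # newer MTV pages embed the mgid in a __DATA__ component tree:
+ # MainContainer -> VideoPlayer -> props.media.video.config.uri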
+ def _extract_mgid(self, webpage):
+ data = self._parse_json(self._search_regex(
+ r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
+ main_container = self.extract_child_with_type(data, 'MainContainer')
+ video_player = self.extract_child_with_type(main_container, 'VideoPlayer')
+ return video_player['props']['media']['video']['config']['uri']
+
class MTVJapanIE(MTVServicesInfoExtractor):
IE_NAME = 'mtvjapan'
diff --git a/youtube_dlc/extractor/nbc.py b/youtube_dlc/extractor/nbc.py
index 6f3cb3003..ea5f5a315 100644
--- a/youtube_dlc/extractor/nbc.py
+++ b/youtube_dlc/extractor/nbc.py
@@ -10,7 +10,6 @@ from .adobepass import AdobePassIE
from ..compat import compat_urllib_parse_unquote
from ..utils import (
int_or_none,
- js_to_json,
parse_duration,
smuggle_url,
try_get,
@@ -394,8 +393,8 @@ class NBCNewsIE(ThePlatformIE):
webpage = self._download_webpage(url, video_id)
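+ # page state moved from window.__data to the Next.js __NEXT_DATA__ script tag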
data = self._parse_json(self._search_regex(
- r'window\.__data\s*=\s*({.+});', webpage,
- 'bootstrap json'), video_id, js_to_json)
+ r'<script[^>]+id="__NEXT_DATA__"[^>]*>({.+?})</script>',
+ webpage, 'bootstrap json'), video_id)['props']['initialState']
video_data = try_get(data, lambda x: x['video']['current'], dict)
if not video_data:
video_data = data['article']['content'][0]['primaryMedia']['video']
diff --git a/youtube_dlc/extractor/ndr.py b/youtube_dlc/extractor/ndr.py
index f3897c71b..81abb3120 100644
--- a/youtube_dlc/extractor/ndr.py
+++ b/youtube_dlc/extractor/ndr.py
@@ -83,6 +83,29 @@ class NDRIE(NDRBaseIE):
'skip_download': True,
},
}, {
+ # with subtitles
+ 'url': 'https://www.ndr.de/fernsehen/sendungen/extra_3/extra-3-Satiremagazin-mit-Christian-Ehring,sendung1091858.html',
+ 'info_dict': {
+ 'id': 'extra18674',
+ 'display_id': 'extra-3-Satiremagazin-mit-Christian-Ehring',
+ 'ext': 'mp4',
+ 'title': 'Extra 3 vom 11.11.2020 mit Christian Ehring',
+ 'description': 'md5:42ee53990a715eaaf4dc7f13a3bd56c6',
+ 'uploader': 'ndrtv',
+ 'upload_date': '20201113',
+ 'duration': 1749,
+ 'subtitles': {
+ 'de': [{
+ 'ext': 'ttml',
+ 'url': r're:^https://www\.ndr\.de.+',
+ }],
+ },
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Unable to download f4m manifest'],
+ }, {
'url': 'https://www.ndr.de/Fettes-Brot-Ferris-MC-und-Thees-Uhlmann-live-on-stage,festivalsommer116.html',
'only_matching': True,
}]
@@ -242,6 +265,20 @@ class NDREmbedBaseIE(InfoExtractor):
'preference': quality_key(thumbnail.get('quality')),
})
+ subtitles = {}
+ tracks = config.get('tracks')
+ if tracks and isinstance(tracks, list):
+ for track in tracks:
+ if not isinstance(track, dict):
+ continue
+ track_url = urljoin(url, track.get('src'))
+ if not track_url:
+ continue
+ subtitles.setdefault(track.get('srclang') or 'de', []).append({
+ 'url': track_url,
+ 'ext': 'ttml',
+ })
+
return {
'id': video_id,
'title': title,
@@ -251,6 +288,7 @@ class NDREmbedBaseIE(InfoExtractor):
'duration': duration,
'thumbnails': thumbnails,
'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dlc/extractor/netzkino.py b/youtube_dlc/extractor/netzkino.py
index aec3026b1..3d1a06d0b 100644
--- a/youtube_dlc/extractor/netzkino.py
+++ b/youtube_dlc/extractor/netzkino.py
@@ -13,17 +13,16 @@ from ..utils import (
class NetzkinoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/(?P<category>[^/]+)/(?P<id>[^/]+)'
+ _VALID_URL = r'https?://(?:www\.)?netzkino\.de/\#!/[^/]+/(?P<id>[^/]+)'
- _TEST = {
- 'url': 'http://www.netzkino.de/#!/scifikino/rakete-zum-mond',
+ _TESTS = [{
+ 'url': 'https://www.netzkino.de/#!/scifikino/rakete-zum-mond',
'md5': '92a3f8b76f8d7220acce5377ea5d4873',
'info_dict': {
'id': 'rakete-zum-mond',
'ext': 'mp4',
- 'title': 'Rakete zum Mond (Endstation Mond, Destination Moon)',
- 'comments': 'mincount:3',
- 'description': 'md5:1eddeacc7e62d5a25a2d1a7290c64a28',
+ 'title': 'Rakete zum Mond \u2013 Jules Verne',
+ 'description': 'md5:f0a8024479618ddbfa450ff48ffa6c60',
'upload_date': '20120813',
'thumbnail': r're:https?://.*\.jpg$',
'timestamp': 1344858571,
@@ -32,17 +31,30 @@ class NetzkinoIE(InfoExtractor):
'params': {
'skip_download': 'Download only works from Germany',
}
- }
+ }, {
+ 'url': 'https://www.netzkino.de/#!/filme/dr-jekyll-mrs-hyde-2',
+ 'md5': 'c7728b2dadd04ff6727814847a51ef03',
+ 'info_dict': {
+ 'id': 'dr-jekyll-mrs-hyde-2',
+ 'ext': 'mp4',
+ 'title': 'Dr. Jekyll & Mrs. Hyde 2',
+ 'description': 'md5:c2e9626ebd02de0a794b95407045d186',
+ 'upload_date': '20190130',
+ 'thumbnail': r're:https?://.*\.jpg$',
+ 'timestamp': 1548849437,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': 'Download only works from Germany',
+ }
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- category_id = mobj.group('category')
video_id = mobj.group('id')
- api_url = 'http://api.netzkino.de.simplecache.net/capi-2.0a/categories/%s.json?d=www' % category_id
- api_info = self._download_json(api_url, video_id)
- info = next(
- p for p in api_info['posts'] if p['slug'] == video_id)
+ api_url = 'https://api.netzkino.de.simplecache.net/capi-2.0a/movies/%s.json?d=www' % video_id
+ info = self._download_json(api_url, video_id)
custom_fields = info['custom_fields']
production_js = self._download_webpage(
@@ -67,23 +79,12 @@ class NetzkinoIE(InfoExtractor):
} for key, tpl in templates.items()]
self._sort_formats(formats)
- comments = [{
- 'timestamp': parse_iso8601(c.get('date'), delimiter=' '),
- 'id': c['id'],
- 'author': c['name'],
- 'html': c['content'],
- 'parent': 'root' if c.get('parent', 0) == 0 else c['parent'],
- } for c in info.get('comments', [])]
-
return {
'id': video_id,
'formats': formats,
- 'comments': comments,
'title': info['title'],
'age_limit': int_or_none(custom_fields.get('FSK')[0]),
'timestamp': parse_iso8601(info.get('date'), delimiter=' '),
'description': clean_html(info.get('content')),
'thumbnail': info.get('thumbnail'),
- 'playlist_title': api_info.get('title'),
- 'playlist_id': category_id,
}
diff --git a/youtube_dlc/extractor/newgrounds.py b/youtube_dlc/extractor/newgrounds.py
index 82e7cf522..b9f01235f 100644
--- a/youtube_dlc/extractor/newgrounds.py
+++ b/youtube_dlc/extractor/newgrounds.py
@@ -4,6 +4,7 @@ import re
from .common import InfoExtractor
from ..utils import (
+ ExtractorError,
extract_attributes,
int_or_none,
parse_duration,
@@ -20,22 +21,22 @@ class NewgroundsIE(InfoExtractor):
'info_dict': {
'id': '549479',
'ext': 'mp3',
- 'title': 'B7 - BusMode',
+ 'title': 'Burn7 - B7 - BusMode',
'uploader': 'Burn7',
'timestamp': 1378878540,
'upload_date': '20130911',
'duration': 143,
},
}, {
- 'url': 'https://www.newgrounds.com/portal/view/673111',
- 'md5': '3394735822aab2478c31b1004fe5e5bc',
+ 'url': 'https://www.newgrounds.com/portal/view/1',
+ 'md5': 'fbfb40e2dc765a7e830cb251d370d981',
'info_dict': {
- 'id': '673111',
+ 'id': '1',
'ext': 'mp4',
- 'title': 'Dancin',
- 'uploader': 'Squirrelman82',
- 'timestamp': 1460256780,
- 'upload_date': '20160410',
+ 'title': 'Brian-Beaton - Scrotum 1',
+ 'uploader': 'Brian-Beaton',
+ 'timestamp': 955064100,
+ 'upload_date': '20000406',
},
}, {
# source format unavailable, additional mp4 formats
@@ -43,7 +44,7 @@ class NewgroundsIE(InfoExtractor):
'info_dict': {
'id': '689400',
'ext': 'mp4',
- 'title': 'ZTV News Episode 8',
+ 'title': 'Bennettthesage - ZTV News Episode 8',
'uploader': 'BennettTheSage',
'timestamp': 1487965140,
'upload_date': '20170224',
@@ -55,42 +56,73 @@ class NewgroundsIE(InfoExtractor):
def _real_extract(self, url):
media_id = self._match_id(url)
-
+ formats = []
+ uploader = None
webpage = self._download_webpage(url, media_id)
title = self._html_search_regex(
r'<title>([^>]+)</title>', webpage, 'title')
- media_url = self._parse_json(self._search_regex(
- r'"url"\s*:\s*("[^"]+"),', webpage, ''), media_id)
-
- formats = [{
- 'url': media_url,
- 'format_id': 'source',
- 'quality': 1,
- }]
-
- max_resolution = int_or_none(self._search_regex(
- r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
- default=None))
- if max_resolution:
- url_base = media_url.rpartition('.')[0]
- for resolution in (360, 720, 1080):
- if resolution > max_resolution:
- break
- formats.append({
- 'url': '%s.%dp.mp4' % (url_base, resolution),
- 'format_id': '%dp' % resolution,
- 'height': resolution,
- })
+ media_url_string = self._search_regex(
+ r'"url"\s*:\s*("[^"]+"),', webpage, 'media url', default=None, fatal=False)
+
+ if media_url_string:
+ media_url = self._parse_json(media_url_string, media_id)
+ formats = [{
+ 'url': media_url,
+ 'format_id': 'source',
+ 'quality': 1,
+ }]
+
+ max_resolution = int_or_none(self._search_regex(
+ r'max_resolution["\']\s*:\s*(\d+)', webpage, 'max resolution',
+ default=None))
+ if max_resolution:
+ url_base = media_url.rpartition('.')[0]
+ for resolution in (360, 720, 1080):
+ if resolution > max_resolution:
+ break
+ formats.append({
+ 'url': '%s.%dp.mp4' % (url_base, resolution),
+ 'format_id': '%dp' % resolution,
+ 'height': resolution,
+ })
+ else:
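+ # no "url" JSON in the page: fall back to the JSON endpoint
+ # used by the HTML5 player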
+ video_id = int_or_none(self._search_regex(
+ r'data-movie-id=\\"([0-9]+)\\"', webpage, 'movie id'))
+ if not video_id:
+ raise ExtractorError('Could not extract media data')
+
+ url_video_data = 'https://www.newgrounds.com/portal/video/%s' % video_id
+ headers = {
+ 'Accept': 'application/json',
+ 'Referer': url,
+ 'X-Requested-With': 'XMLHttpRequest'
+ }
+ json_video = self._download_json(url_video_data, video_id, headers=headers, fatal=False)
+ if not json_video:
+ raise ExtractorError('Could not fetch media data')
+
+ uploader = json_video.get('author')
+ title = json_video.get('title')
+ media_formats = json_video.get('sources', [])
+ for media_format in media_formats:
+ media_sources = media_formats[media_format]
+ for source in media_sources:
+ formats.append({
+ 'format_id': media_format,
+ 'quality': int_or_none(media_format[:-1]),
+ 'url': source.get('src')
+ })
self._check_formats(formats, media_id)
self._sort_formats(formats)
- uploader = self._html_search_regex(
- (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*Author\s*</em>',
- r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
- fatal=False)
+ if not uploader:
+ uploader = self._html_search_regex(
+ (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*(?:Author|Artist)\s*</em>',
+ r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader',
+ fatal=False)
timestamp = unified_timestamp(self._html_search_regex(
(r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)',
@@ -109,6 +141,9 @@ class NewgroundsIE(InfoExtractor):
if '<dd>Song' in webpage:
formats[0]['vcodec'] = 'none'
+ if uploader:
+ title = "%s - %s" % (uploader, title)
+
return {
'id': media_id,
'title': title,
diff --git a/youtube_dlc/extractor/nitter.py b/youtube_dlc/extractor/nitter.py
new file mode 100644
index 000000000..3191543ed
--- /dev/null
+++ b/youtube_dlc/extractor/nitter.py
@@ -0,0 +1,167 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+ parse_count,
+ unified_strdate,
+ unified_timestamp,
+ remove_end,
+ determine_ext,
+)
+import re
+
+
+class NitterIE(InfoExtractor):
+ # Taken from https://github.com/zedeus/nitter/wiki/Instances
+ INSTANCES = ('nitter.net',
+ 'nitter.snopyta.org',
+ 'nitter.42l.fr',
+ 'nitter.nixnet.services',
+ 'nitter.13ad.de',
+ 'nitter.pussthecat.org',
+ 'nitter.mastodont.cat',
+ 'nitter.dark.fail',
+ 'nitter.tedomum.net',
+ 'nitter.cattube.org',
+ 'nitter.fdn.fr',
+ 'nitter.1d4.us',
+ 'nitter.kavin.rocks',
+ 'tweet.lambda.dance',
+ 'nitter.cc',
+ 'nitter.weaponizedhumiliation.com',
+ '3nzoldnxplag42gqjs23xvghtzf6t6yzssrtytnntc6ppc7xxuoneoad.onion',
+ 'nitter.l4qlywnpwqsluw65ts7md3khrivpirse744un3x7mlskqauz5pyuzgqd.onion',
+ 'nitterlgj3n5fgwesu3vxc5h67ruku33nqaoeoocae2mvlzhsu6k7fqd.onion')
+
+ _INSTANCES_RE = '(?:' + '|'.join([re.escape(instance) for instance in INSTANCES]) + ')'
+ _VALID_URL = r'https?://%(instance)s/(?P<uploader_id>.+)/status/(?P<id>[0-9]+)(#.)?' % {'instance': _INSTANCES_RE}
+ current_instance = INSTANCES[0] # the test and official instance
+ _TESTS = [
+ {
+ # GIF (wrapped in mp4)
+ 'url': 'https://' + current_instance + '/firefox/status/1314279897502629888#m',
+ 'info_dict': {
+ 'id': '1314279897502629888',
+ 'ext': 'mp4',
+ 'title': 'Firefox 🔥 - You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
+ 'description': 'You know the old saying, if you see something say something. Now you actually can with the YouTube regrets extension. Report harmful YouTube recommendations so others can avoid watching them. ➡️ https://mzl.la/3iFIiyg #UnfckTheInternet',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Firefox 🔥',
+ 'uploader_id': 'firefox',
+ 'uploader_url': 'https://' + current_instance + '/firefox',
+ 'upload_date': '20201008',
+ 'timestamp': 1602183720,
+ },
+ }, { # normal video
+ 'url': 'https://' + current_instance + '/Le___Doc/status/1299715685392756737#m',
+ 'info_dict': {
+ 'id': '1299715685392756737',
+ 'ext': 'mp4',
+ 'title': 'Le Doc - "Je ne prédis jamais rien" D Raoult, Août 2020...',
+ 'description': '"Je ne prédis jamais rien" D Raoult, Août 2020...',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Le Doc',
+ 'uploader_id': 'Le___Doc',
+ 'uploader_url': 'https://' + current_instance + '/Le___Doc',
+ 'upload_date': '20200829',
+ 'timestamp': 1598711341,
+ 'view_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ },
+ }, { # video embed in a "Streaming Political Ads" box
+ 'url': 'https://' + current_instance + '/mozilla/status/1321147074491092994#m',
+ 'info_dict': {
+ 'id': '1321147074491092994',
+ 'ext': 'mp4',
+ 'title': "Mozilla - Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
+ 'description': "Are you being targeted with weird, ominous or just plain annoying political ads while streaming your favorite shows? This isn't a real political ad, but if you're watching streaming TV in the U.S., chances are you've seen quite a few. Learn more ➡️ https://mzl.la/StreamingAds",
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Mozilla',
+ 'uploader_id': 'mozilla',
+ 'uploader_url': 'https://' + current_instance + '/mozilla',
+ 'upload_date': '20201027',
+ 'timestamp': 1603820982
+ },
+ },
+ ]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ parsed_url = compat_urlparse.urlparse(url)
+ base_url = parsed_url.scheme + '://' + parsed_url.netloc
+
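+ # enable nitter's hlsPlayback preference so the page embeds the stream URL
+ # in <video>/<source> tags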
+ self._set_cookie(parsed_url.netloc, 'hlsPlayback', 'on')
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = base_url + self._html_search_regex(r'(?:<video[^>]+data-url|<source[^>]+src)="([^"]+)"', webpage, 'video url')
+ ext = determine_ext(video_url)
+
+ if ext == 'unknown_video':
+ formats = self._extract_m3u8_formats(video_url, video_id, ext='mp4')
+ else:
+ formats = [{
+ 'url': video_url,
+ 'ext': ext
+ }]
+
+ title = (
+ self._og_search_description(webpage).replace('\n', ' ')
+ or self._html_search_regex(r'<div class="tweet-content[^>]+>([^<]+)</div>', webpage, 'title'))
+ description = title
+
+ mobj = re.match(self._VALID_URL, url)
+ uploader_id = (
+ mobj.group('uploader_id')
+ or self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False))
+
+ if uploader_id:
+ uploader_url = base_url + '/' + uploader_id
+
+ uploader = self._html_search_regex(r'<a class="fullname"[^>]+title="([^"]+)"', webpage, 'uploader name', fatal=False)
+
+ if uploader:
+ title = uploader + ' - ' + title
+
+ view_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-play[^>]*></span>\s([^<]+)</div>', webpage, 'view count', fatal=False))
+ like_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-heart[^>]*></span>\s([^<]+)</div>', webpage, 'like count', fatal=False))
+ repost_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-retweet[^>]*></span>\s([^<]+)</div>', webpage, 'repost count', fatal=False))
+ comment_count = parse_count(self._html_search_regex(r'<span[^>]+class="icon-comment[^>]*></span>\s([^<]+)</div>', webpage, 'comment count', fatal=False))
+
+ thumbnail = base_url + (self._html_search_meta('og:image', webpage, 'thumbnail url')
+ or self._html_search_regex(r'<video[^>]+poster="([^"]+)"', webpage, 'thumbnail url', fatal=False))
+
+ thumbnail = remove_end(thumbnail, '%3Asmall') # the poster URL from the regex fallback ends in '%3Asmall'; strip it to get the base URL
+
+ thumbnails = []
+ thumbnail_ids = ('thumb', 'small', 'large', 'medium', 'orig')
+ for thumbnail_id in thumbnail_ids:
+ thumbnails.append({
+ 'id': thumbnail_id,
+ 'url': thumbnail + '%3A' + thumbnail_id,
+ })
+
+ date = self._html_search_regex(r'<span[^>]+class="tweet-date"[^>]*><a[^>]+title="([^"]+)"', webpage, 'upload date', fatal=False)
+ upload_date = unified_strdate(date)
+ timestamp = unified_timestamp(date)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'uploader': uploader,
+ 'timestamp': timestamp,
+ 'uploader_id': uploader_id,
+ 'uploader_url': uploader_url,
+ 'view_count': view_count,
+ 'like_count': like_count,
+ 'repost_count': repost_count,
+ 'comment_count': comment_count,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ }
diff --git a/youtube_dlc/extractor/npr.py b/youtube_dlc/extractor/npr.py
index 53acc6e57..9d1122f0c 100644
--- a/youtube_dlc/extractor/npr.py
+++ b/youtube_dlc/extractor/npr.py
@@ -33,7 +33,7 @@ class NprIE(InfoExtractor):
},
}],
}, {
- # mutlimedia, not media title
+ # multimedia, not media title
'url': 'https://www.npr.org/2017/06/19/533198237/tigers-jaw-tiny-desk-concert',
'info_dict': {
'id': '533198237',
diff --git a/youtube_dlc/extractor/nrk.py b/youtube_dlc/extractor/nrk.py
index 84aacbcda..4a395546f 100644
--- a/youtube_dlc/extractor/nrk.py
+++ b/youtube_dlc/extractor/nrk.py
@@ -9,6 +9,7 @@ from ..compat import (
compat_urllib_parse_unquote,
)
from ..utils import (
+ determine_ext,
ExtractorError,
int_or_none,
js_to_json,
@@ -16,185 +17,13 @@ from ..utils import (
parse_age_limit,
parse_duration,
try_get,
+ url_or_none,
)
class NRKBaseIE(InfoExtractor):
_GEO_COUNTRIES = ['NO']
- _api_host = None
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS
-
- for api_host in api_hosts:
- data = self._download_json(
- 'http://%s/mediaelement/%s' % (api_host, video_id),
- video_id, 'Downloading mediaelement JSON',
- fatal=api_host == api_hosts[-1])
- if not data:
- continue
- self._api_host = api_host
- break
-
- title = data.get('fullTitle') or data.get('mainTitle') or data['title']
- video_id = data.get('id') or video_id
-
- entries = []
-
- conviva = data.get('convivaStatistics') or {}
- live = (data.get('mediaElementType') == 'Live'
- or data.get('isLive') is True or conviva.get('isLive'))
-
- def make_title(t):
- return self._live_title(t) if live else t
-
- media_assets = data.get('mediaAssets')
- if media_assets and isinstance(media_assets, list):
- def video_id_and_title(idx):
- return ((video_id, title) if len(media_assets) == 1
- else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx)))
- for num, asset in enumerate(media_assets, 1):
- asset_url = asset.get('url')
- if not asset_url:
- continue
- formats = self._extract_akamai_formats(asset_url, video_id)
- if not formats:
- continue
- self._sort_formats(formats)
-
- # Some f4m streams may not work with hdcore in fragments' URLs
- for f in formats:
- extra_param = f.get('extra_param_to_segment_url')
- if extra_param and 'hdcore' in extra_param:
- del f['extra_param_to_segment_url']
-
- entry_id, entry_title = video_id_and_title(num)
- duration = parse_duration(asset.get('duration'))
- subtitles = {}
- for subtitle in ('webVtt', 'timedText'):
- subtitle_url = asset.get('%sSubtitlesUrl' % subtitle)
- if subtitle_url:
- subtitles.setdefault('no', []).append({
- 'url': compat_urllib_parse_unquote(subtitle_url)
- })
- entries.append({
- 'id': asset.get('carrierId') or entry_id,
- 'title': make_title(entry_title),
- 'duration': duration,
- 'subtitles': subtitles,
- 'formats': formats,
- })
-
- if not entries:
- media_url = data.get('mediaUrl')
- if media_url:
- formats = self._extract_akamai_formats(media_url, video_id)
- self._sort_formats(formats)
- duration = parse_duration(data.get('duration'))
- entries = [{
- 'id': video_id,
- 'title': make_title(title),
- 'duration': duration,
- 'formats': formats,
- }]
-
- if not entries:
- MESSAGES = {
- 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet',
- 'ProgramRightsHasExpired': 'Programmet har gått ut',
- 'NoProgramRights': 'Ikke tilgjengelig',
- 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
- }
- message_type = data.get('messageType', '')
- # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked*
- if 'IsGeoBlocked' in message_type:
- self.raise_geo_restricted(
- msg=MESSAGES.get('ProgramIsGeoBlocked'),
- countries=self._GEO_COUNTRIES)
- raise ExtractorError(
- '%s said: %s' % (self.IE_NAME, MESSAGES.get(
- message_type, message_type)),
- expected=True)
-
- series = conviva.get('seriesName') or data.get('seriesTitle')
- episode = conviva.get('episodeName') or data.get('episodeNumberOrDate')
-
- season_number = None
- episode_number = None
- if data.get('mediaElementType') == 'Episode':
- _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \
- data.get('relativeOriginUrl', '')
- EPISODENUM_RE = [
- r'/s(?P<season>\d{,2})e(?P<episode>\d{,2})\.',
- r'/sesong-(?P<season>\d{,2})/episode-(?P<episode>\d{,2})',
- ]
- season_number = int_or_none(self._search_regex(
- EPISODENUM_RE, _season_episode, 'season number',
- default=None, group='season'))
- episode_number = int_or_none(self._search_regex(
- EPISODENUM_RE, _season_episode, 'episode number',
- default=None, group='episode'))
-
- thumbnails = None
- images = data.get('images')
- if images and isinstance(images, dict):
- web_images = images.get('webImages')
- if isinstance(web_images, list):
- thumbnails = [{
- 'url': image['imageUrl'],
- 'width': int_or_none(image.get('width')),
- 'height': int_or_none(image.get('height')),
- } for image in web_images if image.get('imageUrl')]
-
- description = data.get('description')
- category = data.get('mediaAnalytics', {}).get('category')
-
- common_info = {
- 'description': description,
- 'series': series,
- 'episode': episode,
- 'season_number': season_number,
- 'episode_number': episode_number,
- 'categories': [category] if category else None,
- 'age_limit': parse_age_limit(data.get('legalAge')),
- 'thumbnails': thumbnails,
- }
-
- vcodec = 'none' if data.get('mediaType') == 'Audio' else None
-
- for entry in entries:
- entry.update(common_info)
- for f in entry['formats']:
- f['vcodec'] = vcodec
-
- points = data.get('shortIndexPoints')
- if isinstance(points, list):
- chapters = []
- for next_num, point in enumerate(points, start=1):
- if not isinstance(point, dict):
- continue
- start_time = parse_duration(point.get('startPoint'))
- if start_time is None:
- continue
- end_time = parse_duration(
- data.get('duration')
- if next_num == len(points)
- else points[next_num].get('startPoint'))
- if end_time is None:
- continue
- chapters.append({
- 'start_time': start_time,
- 'end_time': end_time,
- 'title': point.get('title'),
- })
- if chapters and len(entries) == 1:
- entries[0]['chapters'] = chapters
-
- return self.playlist_result(entries, video_id, title, description)
-
class NRKIE(NRKBaseIE):
_VALID_URL = r'''(?x)
@@ -202,13 +31,13 @@ class NRKIE(NRKBaseIE):
nrk:|
https?://
(?:
- (?:www\.)?nrk\.no/video/PS\*|
+ (?:www\.)?nrk\.no/video/(?:PS\*|[^_]+_)|
v8[-.]psapi\.nrk\.no/mediaelement/
)
)
- (?P<id>[^?#&]+)
+ (?P<id>[^?\#&]+)
'''
- _API_HOSTS = ('psapi.nrk.no', 'v8-psapi.nrk.no')
+
_TESTS = [{
# video
'url': 'http://www.nrk.no/video/PS*150533',
@@ -240,8 +69,76 @@ class NRKIE(NRKBaseIE):
}, {
'url': 'https://v8-psapi.nrk.no/mediaelement/ecc1b952-96dc-4a98-81b9-5296dc7a98d9',
'only_matching': True,
+ }, {
+ 'url': 'https://www.nrk.no/video/dompap-og-andre-fugler-i-piip-show_150533',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.nrk.no/video/humor/kommentatorboksen-reiser-til-sjos_d1fda11f-a4ad-437a-a374-0398bc84e999',
+ 'only_matching': True,
}]
+ def _extract_from_playback(self, video_id):
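+ # the new psapi playback API splits stream data (manifest) from
+ # presentation data (metadata)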
+ manifest = self._download_json(
+ 'http://psapi.nrk.no/playback/manifest/%s' % video_id,
+ video_id, 'Downloading manifest JSON')
+
+ playable = manifest['playable']
+
+ formats = []
+ for asset in playable['assets']:
+ if not isinstance(asset, dict):
+ continue
+ if asset.get('encrypted'):
+ continue
+ format_url = url_or_none(asset.get('url'))
+ if not format_url:
+ continue
+ if asset.get('format') == 'HLS' or determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ self._sort_formats(formats)
+
+ data = self._download_json(
+ 'http://psapi.nrk.no/playback/metadata/%s' % video_id,
+ video_id, 'Downloading metadata JSON')
+
+ preplay = data['preplay']
+ titles = preplay['titles']
+ title = titles['title']
+ alt_title = titles.get('subtitle')
+
+ description = preplay.get('description')
+ duration = parse_duration(playable.get('duration')) or parse_duration(data.get('duration'))
+
+ thumbnails = []
+ for image in try_get(
+ preplay, lambda x: x['poster']['images'], list) or []:
+ if not isinstance(image, dict):
+ continue
+ image_url = url_or_none(image.get('url'))
+ if not image_url:
+ continue
+ thumbnails.append({
+ 'url': image_url,
+ 'width': int_or_none(image.get('pixelWidth')),
+ 'height': int_or_none(image.get('pixelHeight')),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'alt_title': alt_title,
+ 'description': description,
+ 'duration': duration,
+ 'thumbnails': thumbnails,
+ 'formats': formats,
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self._extract_from_playback(video_id)
+
class NRKTVIE(NRKBaseIE):
IE_DESC = 'NRK TV and NRK Radio'
@@ -380,6 +277,181 @@ class NRKTVIE(NRKBaseIE):
'only_matching': True,
}]
+ _api_host = None
+
+ def _extract_from_mediaelement(self, video_id):
+ api_hosts = (self._api_host, ) if self._api_host else self._API_HOSTS
+
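+        # try each API host in turn; only the last attempt is fatal, and the first host that responds is cached for later calls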
+ for api_host in api_hosts:
+ data = self._download_json(
+ 'http://%s/mediaelement/%s' % (api_host, video_id),
+ video_id, 'Downloading mediaelement JSON',
+ fatal=api_host == api_hosts[-1])
+ if not data:
+ continue
+ self._api_host = api_host
+ break
+
+ title = data.get('fullTitle') or data.get('mainTitle') or data['title']
+ video_id = data.get('id') or video_id
+
+ entries = []
+
+ conviva = data.get('convivaStatistics') or {}
+ live = (data.get('mediaElementType') == 'Live'
+ or data.get('isLive') is True or conviva.get('isLive'))
+
+ def make_title(t):
+ return self._live_title(t) if live else t
+
+ media_assets = data.get('mediaAssets')
+ if media_assets and isinstance(media_assets, list):
+ def video_id_and_title(idx):
+ return ((video_id, title) if len(media_assets) == 1
+ else ('%s-%d' % (video_id, idx), '%s (Part %d)' % (title, idx)))
+ for num, asset in enumerate(media_assets, 1):
+ asset_url = asset.get('url')
+ if not asset_url:
+ continue
+ formats = self._extract_akamai_formats(asset_url, video_id)
+ if not formats:
+ continue
+ self._sort_formats(formats)
+
+ # Some f4m streams may not work with hdcore in fragments' URLs
+ for f in formats:
+ extra_param = f.get('extra_param_to_segment_url')
+ if extra_param and 'hdcore' in extra_param:
+ del f['extra_param_to_segment_url']
+
+ entry_id, entry_title = video_id_and_title(num)
+ duration = parse_duration(asset.get('duration'))
+ subtitles = {}
+ for subtitle in ('webVtt', 'timedText'):
+ subtitle_url = asset.get('%sSubtitlesUrl' % subtitle)
+ if subtitle_url:
+ subtitles.setdefault('no', []).append({
+ 'url': compat_urllib_parse_unquote(subtitle_url)
+ })
+ entries.append({
+ 'id': asset.get('carrierId') or entry_id,
+ 'title': make_title(entry_title),
+ 'duration': duration,
+ 'subtitles': subtitles,
+ 'formats': formats,
+ })
+
+ if not entries:
+ media_url = data.get('mediaUrl')
+ if media_url:
+ formats = self._extract_akamai_formats(media_url, video_id)
+ self._sort_formats(formats)
+ duration = parse_duration(data.get('duration'))
+ entries = [{
+ 'id': video_id,
+ 'title': make_title(title),
+ 'duration': duration,
+ 'formats': formats,
+ }]
+
+ if not entries:
+ MESSAGES = {
+ 'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet',
+ 'ProgramRightsHasExpired': 'Programmet har gått ut',
+ 'NoProgramRights': 'Ikke tilgjengelig',
+ 'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
+ }
+ message_type = data.get('messageType', '')
+ # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked*
+ if 'IsGeoBlocked' in message_type:
+ self.raise_geo_restricted(
+ msg=MESSAGES.get('ProgramIsGeoBlocked'),
+ countries=self._GEO_COUNTRIES)
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, MESSAGES.get(
+ message_type, message_type)),
+ expected=True)
+
+ series = conviva.get('seriesName') or data.get('seriesTitle')
+ episode = conviva.get('episodeName') or data.get('episodeNumberOrDate')
+
+ season_number = None
+ episode_number = None
+ if data.get('mediaElementType') == 'Episode':
+ _season_episode = data.get('scoresStatistics', {}).get('springStreamStream') or \
+ data.get('relativeOriginUrl', '')
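+            # season/episode numbers are not exposed as fields; parse them from stream/origin URL paths such as /s02e05. or /sesong-2/episode-5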
+ EPISODENUM_RE = [
+ r'/s(?P<season>\d{,2})e(?P<episode>\d{,2})\.',
+ r'/sesong-(?P<season>\d{,2})/episode-(?P<episode>\d{,2})',
+ ]
+ season_number = int_or_none(self._search_regex(
+ EPISODENUM_RE, _season_episode, 'season number',
+ default=None, group='season'))
+ episode_number = int_or_none(self._search_regex(
+ EPISODENUM_RE, _season_episode, 'episode number',
+ default=None, group='episode'))
+
+ thumbnails = None
+ images = data.get('images')
+ if images and isinstance(images, dict):
+ web_images = images.get('webImages')
+ if isinstance(web_images, list):
+ thumbnails = [{
+ 'url': image['imageUrl'],
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ } for image in web_images if image.get('imageUrl')]
+
+ description = data.get('description')
+ category = data.get('mediaAnalytics', {}).get('category')
+
+ common_info = {
+ 'description': description,
+ 'series': series,
+ 'episode': episode,
+ 'season_number': season_number,
+ 'episode_number': episode_number,
+ 'categories': [category] if category else None,
+ 'age_limit': parse_age_limit(data.get('legalAge')),
+ 'thumbnails': thumbnails,
+ }
+
+ vcodec = 'none' if data.get('mediaType') == 'Audio' else None
+
+ for entry in entries:
+ entry.update(common_info)
+ for f in entry['formats']:
+ f['vcodec'] = vcodec
+
+ points = data.get('shortIndexPoints')
+ if isinstance(points, list):
+ chapters = []
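+            # each index point marks a chapter start; its end is the next point's start, or the total duration for the last point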
+ for next_num, point in enumerate(points, start=1):
+ if not isinstance(point, dict):
+ continue
+ start_time = parse_duration(point.get('startPoint'))
+ if start_time is None:
+ continue
+ end_time = parse_duration(
+ data.get('duration')
+ if next_num == len(points)
+ else points[next_num].get('startPoint'))
+ if end_time is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'title': point.get('title'),
+ })
+ if chapters and len(entries) == 1:
+ entries[0]['chapters'] = chapters
+
+ return self.playlist_result(entries, video_id, title, description)
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self._extract_from_mediaelement(video_id)
+
class NRKTVEpisodeIE(InfoExtractor):
_VALID_URL = r'https?://tv\.nrk\.no/serie/(?P<id>[^/]+/sesong/\d+/episode/\d+)'
diff --git a/youtube_dlc/extractor/nytimes.py b/youtube_dlc/extractor/nytimes.py
index fc78ca56c..976b1c694 100644
--- a/youtube_dlc/extractor/nytimes.py
+++ b/youtube_dlc/extractor/nytimes.py
@@ -221,3 +221,41 @@ class NYTimesArticleIE(NYTimesBaseIE):
r'NYTD\.FlexTypes\.push\s*\(\s*({.+})\s*\)\s*;'),
webpage, 'podcast data')
return self._extract_podcast_from_json(podcast_data, page_id, webpage)
+
+
+class NYTimesCookingIE(NYTimesBaseIE):
+ _VALID_URL = r'https?://cooking\.nytimes\.com/(?:guid|recip)es/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://cooking.nytimes.com/recipes/1017817-cranberry-curd-tart',
+ 'md5': 'dab81fa2eaeb3f9ed47498bdcfcdc1d3',
+ 'info_dict': {
+ 'id': '100000004756089',
+ 'ext': 'mov',
+ 'timestamp': 1479383008,
+ 'uploader': 'By SHAW LASH, ADAM SAEWITZ and JAMES HERRON',
+ 'title': 'Cranberry Tart',
+ 'upload_date': '20161117',
+ 'description': 'If you are a fan of lemon curd or the classic French tarte au citron, you will love this cranberry version.',
+ },
+ }, {
+ 'url': 'https://cooking.nytimes.com/guides/13-how-to-cook-a-turkey',
+ 'md5': '4b2e8c70530a89b8d905a2b572316eb8',
+ 'info_dict': {
+ 'id': '100000003951728',
+ 'ext': 'mov',
+ 'timestamp': 1445509539,
+ 'description': 'Turkey guide',
+ 'upload_date': '20151022',
+ 'title': 'Turkey',
+ }
+ }]
+
+ def _real_extract(self, url):
+ page_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, page_id)
+
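+        # the page embeds the clip id in a data-video-id attribute; extraction is delegated to the base class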
+ video_id = self._search_regex(
+ r'data-video-id=["\'](\d+)', webpage, 'video id')
+
+ return self._extract_video_from_id(video_id)
diff --git a/youtube_dlc/extractor/pbs.py b/youtube_dlc/extractor/pbs.py
index 4dbe661be..d4baa16ee 100644
--- a/youtube_dlc/extractor/pbs.py
+++ b/youtube_dlc/extractor/pbs.py
@@ -477,7 +477,7 @@ class PBSIE(InfoExtractor):
if media_id:
return media_id, presumptive_id, upload_date, description
- # Fronline video embedded via flp
+ # Frontline video embedded via flp
video_id = self._search_regex(
r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None)
if video_id:
diff --git a/youtube_dlc/extractor/pinterest.py b/youtube_dlc/extractor/pinterest.py
new file mode 100644
index 000000000..b249c9eda
--- /dev/null
+++ b/youtube_dlc/extractor/pinterest.py
@@ -0,0 +1,201 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+ try_get,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+class PinterestBaseIE(InfoExtractor):
+ _VALID_URL_BASE = r'https?://(?:[^/]+\.)?pinterest\.(?:com|fr|de|ch|jp|cl|ca|it|co\.uk|nz|ru|com\.au|at|pt|co\.kr|es|com\.mx|dk|ph|th|com\.uy|co|nl|info|kr|ie|vn|com\.vn|ec|mx|in|pe|co\.at|hu|co\.in|co\.nz|id|com\.ec|com\.py|tw|be|uk|com\.bo|com\.pe)'
+
+ def _call_api(self, resource, video_id, options):
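+        # Pinterest's internal web API expects the request options serialized as JSON in the "data" query parameter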
+ return self._download_json(
+ 'https://www.pinterest.com/resource/%sResource/get/' % resource,
+            video_id, 'Downloading %s JSON metadata' % resource, query={
+ 'data': json.dumps({'options': options})
+ })['resource_response']
+
+ def _extract_video(self, data, extract_formats=True):
+ video_id = data['id']
+
+ title = (data.get('title') or data.get('grid_title') or video_id).strip()
+
+ formats = []
+ duration = None
+ if extract_formats:
+ for format_id, format_dict in data['videos']['video_list'].items():
+ if not isinstance(format_dict, dict):
+ continue
+ format_url = url_or_none(format_dict.get('url'))
+ if not format_url:
+ continue
+ duration = float_or_none(format_dict.get('duration'), scale=1000)
+ ext = determine_ext(format_url)
+ if 'hls' in format_id.lower() or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id=format_id, fatal=False))
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'width': int_or_none(format_dict.get('width')),
+ 'height': int_or_none(format_dict.get('height')),
+ 'duration': duration,
+ })
+ self._sort_formats(
+ formats, field_preference=('height', 'width', 'tbr', 'format_id'))
+
+ description = data.get('description') or data.get('description_html') or data.get('seo_description')
+ timestamp = unified_timestamp(data.get('created_at'))
+
+ def _u(field):
+ return try_get(data, lambda x: x['closeup_attribution'][field], compat_str)
+
+ uploader = _u('full_name')
+ uploader_id = _u('id')
+
+ repost_count = int_or_none(data.get('repin_count'))
+ comment_count = int_or_none(data.get('comment_count'))
+ categories = try_get(data, lambda x: x['pin_join']['visual_annotation'], list)
+ tags = data.get('hashtags')
+
+ thumbnails = []
+ images = data.get('images')
+ if isinstance(images, dict):
+ for thumbnail_id, thumbnail in images.items():
+ if not isinstance(thumbnail, dict):
+ continue
+ thumbnail_url = url_or_none(thumbnail.get('url'))
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int_or_none(thumbnail.get('width')),
+ 'height': int_or_none(thumbnail.get('height')),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'thumbnails': thumbnails,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'repost_count': repost_count,
+ 'comment_count': comment_count,
+ 'categories': categories,
+ 'tags': tags,
+ 'formats': formats,
+ 'extractor_key': PinterestIE.ie_key(),
+ }
+
+
+class PinterestIE(PinterestBaseIE):
+ _VALID_URL = r'%s/pin/(?P<id>\d+)' % PinterestBaseIE._VALID_URL_BASE
+ _TESTS = [{
+ 'url': 'https://www.pinterest.com/pin/664281013778109217/',
+ 'md5': '6550c2af85d6d9f3fe3b88954d1577fc',
+ 'info_dict': {
+ 'id': '664281013778109217',
+ 'ext': 'mp4',
+ 'title': 'Origami',
+ 'description': 'md5:b9d90ddf7848e897882de9e73344f7dd',
+ 'duration': 57.7,
+ 'timestamp': 1593073622,
+ 'upload_date': '20200625',
+ 'uploader': 'Love origami -I am Dafei',
+ 'uploader_id': '586523688879454212',
+ 'repost_count': 50,
+ 'comment_count': 0,
+ 'categories': list,
+ 'tags': list,
+ },
+ }, {
+ 'url': 'https://co.pinterest.com/pin/824721750502199491/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ data = self._call_api(
+ 'Pin', video_id, {
+ 'field_set_key': 'unauth_react_main_pin',
+ 'id': video_id,
+ })['data']
+ return self._extract_video(data)
+
+
+class PinterestCollectionIE(PinterestBaseIE):
+ _VALID_URL = r'%s/(?P<username>[^/]+)/(?P<id>[^/?#&]+)' % PinterestBaseIE._VALID_URL_BASE
+ _TESTS = [{
+ 'url': 'https://www.pinterest.ca/mashal0407/cool-diys/',
+ 'info_dict': {
+ 'id': '585890301462791043',
+ 'title': 'cool diys',
+ },
+ 'playlist_count': 8,
+ }, {
+ 'url': 'https://www.pinterest.ca/fudohub/videos/',
+ 'info_dict': {
+ 'id': '682858430939307450',
+ 'title': 'VIDEOS',
+ },
+ 'playlist_mincount': 365,
+ 'skip': 'Test with extract_formats=False',
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if PinterestIE.suitable(url) else super(
+ PinterestCollectionIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ username, slug = re.match(self._VALID_URL, url).groups()
+ board = self._call_api(
+ 'Board', slug, {
+ 'slug': slug,
+ 'username': username
+ })['data']
+ board_id = board['id']
+ options = {
+ 'board_id': board_id,
+ 'page_size': 250,
+ }
+ bookmark = None
+ entries = []
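+        # the board feed is cursor-paginated: each response carries a bookmark token that is sent back until none is returned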
+ while True:
+ if bookmark:
+ options['bookmarks'] = [bookmark]
+ board_feed = self._call_api('BoardFeed', board_id, options)
+ for item in (board_feed.get('data') or []):
+ if not isinstance(item, dict) or item.get('type') != 'pin':
+ continue
+ video_id = item.get('id')
+ if video_id:
+ # Some pins may not be available anonymously via pin URL
+ # video = self._extract_video(item, extract_formats=False)
+ # video.update({
+ # '_type': 'url_transparent',
+ # 'url': 'https://www.pinterest.com/pin/%s/' % video_id,
+ # })
+ # entries.append(video)
+ entries.append(self._extract_video(item))
+ bookmark = board_feed.get('bookmark')
+ if not bookmark:
+ break
+ return self.playlist_result(
+ entries, playlist_id=board_id, playlist_title=board.get('name'))
diff --git a/youtube_dlc/extractor/rai.py b/youtube_dlc/extractor/rai.py
index 51a310f5c..5eef7c633 100644
--- a/youtube_dlc/extractor/rai.py
+++ b/youtube_dlc/extractor/rai.py
@@ -16,6 +16,7 @@ from ..utils import (
GeoRestrictedError,
int_or_none,
parse_duration,
+ remove_start,
strip_or_none,
try_get,
unified_strdate,
@@ -30,7 +31,6 @@ class RaiBaseIE(InfoExtractor):
_UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
_GEO_COUNTRIES = ['IT']
_GEO_BYPASS = False
- _BASE_URL = 'https://www.raiplay.it'
def _extract_relinker_info(self, relinker_url, video_id):
if not re.match(r'https?://', relinker_url):
@@ -68,7 +68,7 @@ class RaiBaseIE(InfoExtractor):
# This does not imply geo restriction (e.g.
# http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html)
- if media_url == 'http://download.rai.it/video_no_available.mp4':
+ if '/video_no_available.mp4' in media_url:
continue
ext = determine_ext(media_url)
@@ -123,7 +123,7 @@ class RaiBaseIE(InfoExtractor):
class RaiPlayIE(RaiBaseIE):
- _VALID_URL = r'(?P<url>(?P<base>https?://(?:www\.)?raiplay\.it/.+?-)(?P<id>%s)(?P<ext>\.(?:html|json)))' % RaiBaseIE._UUID_RE
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s))\.(?:html|json)' % RaiBaseIE._UUID_RE
_TESTS = [{
'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
'md5': '8970abf8caf8aef4696e7b1f2adfc696',
@@ -131,11 +131,13 @@ class RaiPlayIE(RaiBaseIE):
'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
'ext': 'mp4',
'title': 'Report del 07/04/2014',
- 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014 ',
+ 'alt_title': 'St 2013/14 - Espresso nel caffè - 07/04/2014',
'description': 'md5:d730c168a58f4bb35600fc2f881ec04e',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Rai Gulp',
'duration': 6160,
+ 'series': 'Report',
+ 'season': '2013/14',
},
'params': {
'skip_download': True,
@@ -146,11 +148,10 @@ class RaiPlayIE(RaiBaseIE):
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- url, base, video_id, ext = mobj.group('url', 'base', 'id', 'ext')
+ base, video_id = re.match(self._VALID_URL, url).groups()
media = self._download_json(
- '%s%s.json' % (base, video_id), video_id, 'Downloading video JSON')
+ base + '.json', video_id, 'Downloading video JSON')
title = media['name']
video = media['video']
@@ -159,34 +160,39 @@ class RaiPlayIE(RaiBaseIE):
self._sort_formats(relinker_info['formats'])
thumbnails = []
- if 'images' in media:
- for _, value in media.get('images').items():
- if value:
- thumbnails.append({
- 'url': urljoin(RaiBaseIE._BASE_URL, value.replace('[RESOLUTION]', '600x400'))
- })
+ for _, value in media.get('images', {}).items():
+ if value:
+ thumbnails.append({
+ 'url': urljoin(url, value),
+ })
- timestamp = unified_timestamp(try_get(
- media, lambda x: x['availabilities'][0]['start'], compat_str))
+ date_published = media.get('date_published')
+ time_published = media.get('time_published')
+ if date_published and time_published:
+ date_published += ' ' + time_published
subtitles = self._extract_subtitles(url, video.get('subtitles'))
+ program_info = media.get('program_info') or {}
+ season = media.get('season')
+
info = {
- 'id': video_id,
+ 'id': remove_start(media.get('id'), 'ContentItem-') or video_id,
+ 'display_id': video_id,
'title': self._live_title(title) if relinker_info.get(
'is_live') else title,
- 'alt_title': media.get('subtitle'),
+ 'alt_title': strip_or_none(media.get('subtitle')),
'description': media.get('description'),
'uploader': strip_or_none(media.get('channel')),
- 'creator': strip_or_none(media.get('editor')),
+ 'creator': strip_or_none(media.get('editor') or None),
'duration': parse_duration(video.get('duration')),
- 'timestamp': timestamp,
+ 'timestamp': unified_timestamp(date_published),
'thumbnails': thumbnails,
- 'series': try_get(
- media, lambda x: x['isPartOf']['name'], compat_str),
- 'season_number': int_or_none(try_get(
- media, lambda x: x['isPartOf']['numeroStagioni'])),
- 'season': media.get('stagione') or None,
+ 'series': program_info.get('name'),
+ 'season_number': int_or_none(season),
+ 'season': season if (season and not season.isdigit()) else None,
+ 'episode': media.get('episode_title'),
+ 'episode_number': int_or_none(media.get('episode')),
'subtitles': subtitles,
}
@@ -194,9 +200,9 @@ class RaiPlayIE(RaiBaseIE):
return info
-class RaiPlayLiveIE(RaiBaseIE):
- _VALID_URL = r'https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+)'
- _TEST = {
+class RaiPlayLiveIE(RaiPlayIE):
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/dirette/(?P<id>[^/?#&]+))'
+ _TESTS = [{
'url': 'http://www.raiplay.it/dirette/rainews24',
'info_dict': {
'id': 'd784ad40-e0ae-4a69-aa76-37519d238a9c',
@@ -211,40 +217,11 @@ class RaiPlayLiveIE(RaiBaseIE):
'params': {
'skip_download': True,
},
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
-
- media = self._download_json(
- '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'dirette/' + display_id),
- display_id, 'Downloading channel JSON')
-
- title = media['name']
- video = media['video']
- video_id = media['id'].replace('ContentItem-', '')
-
- relinker_info = self._extract_relinker_info(video['content_url'], video_id)
- self._sort_formats(relinker_info['formats'])
-
- info = {
- 'id': video_id,
- 'display_id': display_id,
- 'title': self._live_title(title) if relinker_info.get(
- 'is_live') else title,
- 'alt_title': media.get('subtitle'),
- 'description': media.get('description'),
- 'uploader': strip_or_none(media.get('channel')),
- 'creator': strip_or_none(media.get('editor')),
- 'duration': parse_duration(video.get('duration')),
- }
-
- info.update(relinker_info)
- return info
+ }]
class RaiPlayPlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'(?P<base>https?://(?:www\.)?raiplay\.it/programmi/(?P<id>[^/?#&]+))'
_TESTS = [{
'url': 'http://www.raiplay.it/programmi/nondirloalmiocapo/',
'info_dict': {
@@ -256,29 +233,34 @@ class RaiPlayPlaylistIE(InfoExtractor):
}]
def _real_extract(self, url):
- playlist_id = self._match_id(url)
-
- media = self._download_json(
- '%s.json' % urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id),
- playlist_id, 'Downloading program JSON')
-
- title = media['name']
- description = media['program_info']['description']
+ base, playlist_id = re.match(self._VALID_URL, url).groups()
- content_sets = [s['id'] for b in media['blocks'] for s in b['sets']]
+ program = self._download_json(
+ base + '.json', playlist_id, 'Downloading program JSON')
entries = []
- for cs in content_sets:
- medias = self._download_json(
- '%s/%s.json' % (urljoin(RaiBaseIE._BASE_URL, 'programmi/' + playlist_id), cs),
- cs, 'Downloading content set JSON')
- for m in medias['items']:
- video_url = urljoin(url, m['path_id'])
- entries.append(self.url_result(
- video_url, ie=RaiPlayIE.ie_key(),
- video_id=RaiPlayIE._match_id(video_url)))
-
- return self.playlist_result(entries, playlist_id, title, description)
+ for b in (program.get('blocks') or []):
+ for s in (b.get('sets') or []):
+ s_id = s.get('id')
+ if not s_id:
+ continue
+ medias = self._download_json(
+ '%s/%s.json' % (base, s_id), s_id,
+ 'Downloading content set JSON', fatal=False)
+ if not medias:
+ continue
+ for m in (medias.get('items') or []):
+ path_id = m.get('path_id')
+ if not path_id:
+ continue
+ video_url = urljoin(url, path_id)
+ entries.append(self.url_result(
+ video_url, ie=RaiPlayIE.ie_key(),
+ video_id=RaiPlayIE._match_id(video_url)))
+
+ return self.playlist_result(
+ entries, playlist_id, program.get('name'),
+ try_get(program, lambda x: x['program_info']['description']))
class RaiIE(RaiBaseIE):
@@ -294,7 +276,8 @@ class RaiIE(RaiBaseIE):
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 1758,
'upload_date': '20140612',
- }
+ },
+ 'skip': 'This content is available only in Italy',
}, {
# with ContentItem in many metas
'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
@@ -440,7 +423,7 @@ class RaiIE(RaiBaseIE):
except ExtractorError:
pass
- relinker_url = self._search_regex(
+ relinker_url = self._proto_relative_url(self._search_regex(
r'''(?x)
(?:
var\s+videoURL|
@@ -452,7 +435,7 @@ class RaiIE(RaiBaseIE):
//mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\?
(?:(?!\1).)*\bcont=(?:(?!\1).)+)\1
''',
- webpage, 'relinker URL', group='url')
+ webpage, 'relinker URL', group='url'))
relinker_info = self._extract_relinker_info(
urljoin(url, relinker_url), video_id)
diff --git a/youtube_dlc/extractor/rcs.py b/youtube_dlc/extractor/rcs.py
new file mode 100644
index 000000000..830182c6d
--- /dev/null
+++ b/youtube_dlc/extractor/rcs.py
@@ -0,0 +1,413 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ js_to_json,
+ base_url,
+ url_basename,
+ urljoin,
+)
+
+
+class RCSBaseIE(InfoExtractor):
+ _ALL_REPLACE = {
+ 'media2vam.corriere.it.edgesuite.net':
+ 'media2vam-corriere-it.akamaized.net',
+ 'media.youreporter.it.edgesuite.net':
+ 'media-youreporter-it.akamaized.net',
+ 'corrierepmd.corriere.it.edgesuite.net':
+ 'corrierepmd-corriere-it.akamaized.net',
+ 'media2vam-corriere-it.akamaized.net/fcs.quotidiani/vr/videos/':
+ 'video.corriere.it/vr360/videos/',
+ '.net//': '.net/',
+ }
+ _MP4_REPLACE = {
+ 'media2vam.corbologna.corriere.it.edgesuite.net':
+ 'media2vam-bologna-corriere-it.akamaized.net',
+ 'media2vam.corfiorentino.corriere.it.edgesuite.net':
+ 'media2vam-fiorentino-corriere-it.akamaized.net',
+ 'media2vam.cormezzogiorno.corriere.it.edgesuite.net':
+ 'media2vam-mezzogiorno-corriere-it.akamaized.net',
+ 'media2vam.corveneto.corriere.it.edgesuite.net':
+ 'media2vam-veneto-corriere-it.akamaized.net',
+ 'media2.oggi.it.edgesuite.net':
+ 'media2-oggi-it.akamaized.net',
+ 'media2.quimamme.it.edgesuite.net':
+ 'media2-quimamme-it.akamaized.net',
+ 'media2.amica.it.edgesuite.net':
+ 'media2-amica-it.akamaized.net',
+ 'media2.living.corriere.it.edgesuite.net':
+ 'media2-living-corriere-it.akamaized.net',
+ 'media2.style.corriere.it.edgesuite.net':
+ 'media2-style-corriere-it.akamaized.net',
+ 'media2.iodonna.it.edgesuite.net':
+ 'media2-iodonna-it.akamaized.net',
+ 'media2.leitv.it.edgesuite.net':
+ 'media2-leitv-it.akamaized.net',
+ }
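+    # maps legacy Akamai hostname prefixes to the brand directory on the vod.rcsobjects.it CDN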
+ _MIGRATION_MAP = {
+ 'videoamica-vh.akamaihd': 'amica',
+ 'media2-amica-it.akamaized': 'amica',
+ 'corrierevam-vh.akamaihd': 'corriere',
+ 'media2vam-corriere-it.akamaized': 'corriere',
+ 'cormezzogiorno-vh.akamaihd': 'corrieredelmezzogiorno',
+ 'media2vam-mezzogiorno-corriere-it.akamaized': 'corrieredelmezzogiorno',
+ 'corveneto-vh.akamaihd': 'corrieredelveneto',
+ 'media2vam-veneto-corriere-it.akamaized': 'corrieredelveneto',
+ 'corbologna-vh.akamaihd': 'corrieredibologna',
+ 'media2vam-bologna-corriere-it.akamaized': 'corrieredibologna',
+ 'corfiorentino-vh.akamaihd': 'corrierefiorentino',
+ 'media2vam-fiorentino-corriere-it.akamaized': 'corrierefiorentino',
+ 'corinnovazione-vh.akamaihd': 'corriereinnovazione',
+ 'media2-gazzanet-gazzetta-it.akamaized': 'gazzanet',
+ 'videogazzanet-vh.akamaihd': 'gazzanet',
+ 'videogazzaworld-vh.akamaihd': 'gazzaworld',
+ 'gazzettavam-vh.akamaihd': 'gazzetta',
+ 'media2vam-gazzetta-it.akamaized': 'gazzetta',
+ 'videoiodonna-vh.akamaihd': 'iodonna',
+ 'media2-leitv-it.akamaized': 'leitv',
+ 'videoleitv-vh.akamaihd': 'leitv',
+ 'videoliving-vh.akamaihd': 'living',
+ 'media2-living-corriere-it.akamaized': 'living',
+ 'media2-oggi-it.akamaized': 'oggi',
+ 'videooggi-vh.akamaihd': 'oggi',
+ 'media2-quimamme-it.akamaized': 'quimamme',
+ 'quimamme-vh.akamaihd': 'quimamme',
+ 'videorunning-vh.akamaihd': 'running',
+ 'media2-style-corriere-it.akamaized': 'style',
+ 'style-vh.akamaihd': 'style',
+ 'videostyle-vh.akamaihd': 'style',
+ 'media2-stylepiccoli-it.akamaized': 'stylepiccoli',
+ 'stylepiccoli-vh.akamaihd': 'stylepiccoli',
+ 'doveviaggi-vh.akamaihd': 'viaggi',
+ 'media2-doveviaggi-it.akamaized': 'viaggi',
+ 'media2-vivimilano-corriere-it.akamaized': 'vivimilano',
+ 'vivimilano-vh.akamaihd': 'vivimilano',
+ 'media2-youreporter-it.akamaized': 'youreporter'
+ }
+ _MIGRATION_MEDIA = {
+ 'advrcs-vh.akamaihd': '',
+ 'corriere-f.akamaihd': '',
+ 'corrierepmd-corriere-it.akamaized': '',
+ 'corrprotetto-vh.akamaihd': '',
+ 'gazzetta-f.akamaihd': '',
+ 'gazzettapmd-gazzetta-it.akamaized': '',
+ 'gazzprotetto-vh.akamaihd': '',
+ 'periodici-f.akamaihd': '',
+ 'periodicisecure-vh.akamaihd': '',
+ 'videocoracademy-vh.akamaihd': ''
+ }
+
+ def _get_video_src(self, video):
+ mediaFiles = video.get('mediaProfile').get('mediaFile')
+ src = {}
+ # audio
+ if video.get('mediaType') == 'AUDIO':
+            for aud in mediaFiles:
+                # TODO: only the last audio file's URL is kept; check
+                # whether multiple renditions need separate formats
+                src['mp3'] = aud.get('value')
+ # video
+ else:
+ for vid in mediaFiles:
+ if vid.get('mimeType') == 'application/vnd.apple.mpegurl':
+ src['m3u8'] = vid.get('value')
+ if vid.get('mimeType') == 'video/mp4':
+ src['mp4'] = vid.get('value')
+
+ # replace host
+ for t in src:
+ for s, r in self._ALL_REPLACE.items():
+ src[t] = src[t].replace(s, r)
+ for s, r in self._MP4_REPLACE.items():
+ src[t] = src[t].replace(s, r)
+
+ # switch cdn
+ if 'mp4' in src and 'm3u8' in src:
+ if ('-lh.akamaihd' not in src.get('m3u8')
+ and 'akamai' in src.get('mp4')):
+ if 'm3u8' in src:
+ matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('m3u8'))
+ src['m3u8'] = 'https://vod.rcsobjects.it/hls/%s%s' % (
+ self._MIGRATION_MAP[matches.group('host')],
+ matches.group('path').replace(
+ '///', '/').replace(
+ '//', '/').replace(
+ '.csmil', '.urlset'
+ )
+ )
+ if 'mp4' in src:
+ matches = re.search(r'(?:https*:)?\/\/(?P<host>.*)\.net\/i(?P<path>.*)$', src.get('mp4'))
+ if matches:
+ if matches.group('host') in self._MIGRATION_MEDIA:
+ vh_stream = 'https://media2.corriereobjects.it'
+                            if 'fcs.quotidiani_!' in src.get('mp4'):
+ vh_stream = 'https://media2-it.corriereobjects.it'
+ src['mp4'] = '%s%s' % (
+ vh_stream,
+ matches.group('path').replace(
+ '///', '/').replace(
+ '//', '/').replace(
+ '/fcs.quotidiani/mediacenter', '').replace(
+ '/fcs.quotidiani_!/mediacenter', '').replace(
+ 'corriere/content/mediacenter/', '').replace(
+ 'gazzetta/content/mediacenter/', '')
+ )
+ else:
+ src['mp4'] = 'https://vod.rcsobjects.it/%s%s' % (
+ self._MIGRATION_MAP[matches.group('host')],
+ matches.group('path').replace('///', '/').replace('//', '/')
+ )
+
+ if 'mp3' in src:
+ src['mp3'] = src.get('mp3').replace(
+ 'media2vam-corriere-it.akamaized.net',
+ 'vod.rcsobjects.it/corriere')
+ if 'mp4' in src:
+            if 'fcs.quotidiani_!' in src.get('mp4'):
+ src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects')
+ if 'm3u8' in src:
+            if 'fcs.quotidiani_!' in src.get('m3u8'):
+ src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects')
+
+ if 'geoblocking' in video.get('mediaProfile'):
+ if 'm3u8' in src:
+ src['m3u8'] = src.get('m3u8').replace('vod.rcsobjects', 'vod-it.rcsobjects')
+ if 'mp4' in src:
+ src['mp4'] = src.get('mp4').replace('vod.rcsobjects', 'vod-it.rcsobjects')
+ if 'm3u8' in src:
+            if 'csmil' in src.get('m3u8') and 'vod' in src.get('m3u8'):
+ src['m3u8'] = src.get('m3u8').replace('.csmil', '.urlset')
+
+ return src
+
+ def _create_formats(self, urls, video_id):
+        formats = []
+        if urls.get('m3u8'):
+            formats = self._extract_m3u8_formats(
+                urls.get('m3u8'), video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False)
+
+ if not formats:
+ formats.append({
+ 'format_id': 'http-mp4',
+ 'url': urls.get('mp4')
+ })
+ self._sort_formats(formats)
+ return formats
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ mobj = re.search(self._VALID_URL, url)
+
+ if 'cdn' not in mobj.groupdict():
+ raise ExtractorError('CDN not found in url: %s' % url)
+
+        # for leitv/youreporter/viaggi, extract from the page itself rather than the embed page
+ if ((mobj.group('cdn') not in ['leitv.it', 'youreporter.it'])
+ and (mobj.group('vid') == 'video')):
+ url = 'https://video.%s/video-embed/%s' % (mobj.group('cdn'), video_id)
+
+ page = self._download_webpage(url, video_id)
+
+ video_data = None
+        # look for the URL of the JSON video data
+        json_url = self._search_regex(
+            r'''(?x)var url\s*=\s*["']((?:https?:)?
+                //video\.rcs\.it
+                /fragment-includes/video-includes/.+?\.json)["'];''',
+            page, video_id, default=None)
+        if json_url:
+            if json_url.startswith('//'):
+                json_url = 'https:%s' % json_url
+            video_data = self._download_json(json_url, video_id)
+
+        # if no JSON URL is found, look for the video data inlined in the page
+        else:
+            json_data = self._search_regex(
+                r'[\s;]video\s*=\s*({[\s\S]+?})(?:;|,playlist=)',
+                page, video_id, default=None)
+            if json_data:
+                video_data = self._parse_json(
+                    json_data, video_id, transform_source=js_to_json)
+ else:
+                # no inline video data either; fall back to embedded iframes
+ emb = RCSEmbedsIE._extract_url(page)
+ if emb:
+ return {
+ '_type': 'url_transparent',
+ 'url': emb,
+ 'ie_key': RCSEmbedsIE.ie_key()
+ }
+
+ if not video_data:
+ raise ExtractorError('Video data not found in the page')
+
+ formats = self._create_formats(
+ self._get_video_src(video_data), video_id)
+
+ description = (video_data.get('description')
+ or clean_html(video_data.get('htmlDescription')))
+ uploader = video_data.get('provider') or mobj.group('cdn')
+
+ return {
+ 'id': video_id,
+ 'title': video_data.get('title'),
+ 'description': description,
+ 'uploader': uploader,
+ 'formats': formats
+ }
+
+
+class RCSEmbedsIE(RCSBaseIE):
+ _VALID_URL = r'''(?x)
+ https?://(?P<vid>video)\.
+ (?P<cdn>
+ (?:
+ rcs|
+ (?:corriere\w+\.)?corriere|
+ (?:gazzanet\.)?gazzetta
+ )\.it)
+ /video-embed/(?P<id>[^/=&\?]+?)(?:$|\?)'''
+ _TESTS = [{
+ 'url': 'https://video.rcs.it/video-embed/iodonna-0001585037',
+ 'md5': '623ecc8ffe7299b2d0c1046d8331a9df',
+ 'info_dict': {
+ 'id': 'iodonna-0001585037',
+ 'ext': 'mp4',
+ 'title': 'Sky Arte racconta Madonna nella serie "Artist to icon"',
+ 'description': 'md5:65b09633df9ffee57f48b39e34c9e067',
+ 'uploader': 'rcs.it',
+ }
+ }, {
+ 'url': 'https://video.gazzanet.gazzetta.it/video-embed/gazzanet-mo05-0000260789',
+ 'md5': 'a043e3fecbe4d9ed7fc5d888652a5440',
+ 'info_dict': {
+ 'id': 'gazzanet-mo05-0000260789',
+ 'ext': 'mp4',
+ 'title': 'Valentino Rossi e papà Graziano si divertono col drifting',
+ 'description': 'md5:a8bf90d6adafd9815f70fc74c0fc370a',
+ 'uploader': 'rcd',
+ }
+ }, {
+ 'url': 'https://video.corriere.it/video-embed/b727632a-f9d0-11ea-91b0-38d50a849abb?player',
+ 'match_only': True
+ }, {
+ 'url': 'https://video.gazzetta.it/video-embed/49612410-00ca-11eb-bcd8-30d4253e0140',
+ 'match_only': True
+ }]
+
+ @staticmethod
+ def _sanitize_urls(urls):
+        # add the protocol if missing, then strip query strings from iframe URLs
+        for i, e in enumerate(urls):
+            if e.startswith('//'):
+                e = 'https:%s' % e
+            urls[i] = urljoin(base_url(e), url_basename(e))
+        return urls
+
+ @staticmethod
+ def _extract_urls(webpage):
+ entries = [
+ mobj.group('url')
+ for mobj in re.finditer(r'''(?x)
+ (?:
+ data-frame-src=|
+ <iframe[^\n]+src=
+ )
+ (["'])
+ (?P<url>(?:https?:)?//video\.
+ (?:
+ rcs|
+ (?:corriere\w+\.)?corriere|
+ (?:gazzanet\.)?gazzetta
+ )
+ \.it/video-embed/.+?)
+ \1''', webpage)]
+ return RCSEmbedsIE._sanitize_urls(entries)
+
+ @staticmethod
+ def _extract_url(webpage):
+ urls = RCSEmbedsIE._extract_urls(webpage)
+ return urls[0] if urls else None
+
+
+class RCSIE(RCSBaseIE):
+ _VALID_URL = r'''(?x)https?://(?P<vid>video|viaggi)\.
+ (?P<cdn>
+ (?:
+ corrieredelmezzogiorno\.
+ |corrieredelveneto\.
+ |corrieredibologna\.
+ |corrierefiorentino\.
+ )?corriere\.it
+ |(?:gazzanet\.)?gazzetta\.it)
+ /(?!video-embed/).+?/(?P<id>[^/\?]+)(?=\?|/$|$)'''
+ _TESTS = [{
+ 'url': 'https://video.corriere.it/sport/formula-1/vettel-guida-ferrari-sf90-mugello-suo-fianco-c-elecrerc-bendato-video-esilarante/b727632a-f9d0-11ea-91b0-38d50a849abb',
+ 'md5': '0f4ededc202b0f00b6e509d831e2dcda',
+ 'info_dict': {
+ 'id': 'b727632a-f9d0-11ea-91b0-38d50a849abb',
+ 'ext': 'mp4',
+ 'title': 'Vettel guida la Ferrari SF90 al Mugello e al suo fianco c\'è Leclerc (bendato): il video è esilarante',
+ 'description': 'md5:93b51c9161ac8a64fb2f997b054d0152',
+ 'uploader': 'Corriere Tv',
+ }
+ }, {
+ 'url': 'https://viaggi.corriere.it/video/norvegia-il-nuovo-ponte-spettacolare-sopra-la-cascata-di-voringsfossen/',
+ 'md5': 'da378e4918d2afbf7d61c35abb948d4c',
+ 'info_dict': {
+ 'id': '5b7cd134-e2c1-11ea-89b3-b56dd0df2aa2',
+ 'ext': 'mp4',
+ 'title': 'La nuova spettacolare attrazione in Norvegia: il ponte sopra Vøringsfossen',
+ 'description': 'md5:18b35a291f6746c0c8dacd16e5f5f4f8',
+ 'uploader': 'DOVE Viaggi',
+ }
+ }, {
+ 'url': 'https://video.gazzetta.it/video-motogp-catalogna-cadute-dovizioso-vale-rossi/49612410-00ca-11eb-bcd8-30d4253e0140?vclk=Videobar',
+ 'md5': 'eedc1b5defd18e67383afef51ff7bdf9',
+ 'info_dict': {
+ 'id': '49612410-00ca-11eb-bcd8-30d4253e0140',
+ 'ext': 'mp4',
+ 'title': 'Dovizioso, il contatto con Zarco e la caduta. E anche Vale finisce a terra',
+ 'description': 'md5:8c6e905dc3b9413218beca11ebd69778',
+ 'uploader': 'AMorici',
+ }
+ }, {
+ 'url': 'https://video.corriere.it/video-360/metro-copenaghen-tutta-italiana/a248a7f0-e2db-11e9-9830-af2de6b1f945',
+ 'match_only': True
+ }]
+
+
+class RCSVariousIE(RCSBaseIE):
+ _VALID_URL = r'''(?x)https?://www\.
+ (?P<cdn>
+ leitv\.it|
+ youreporter\.it
+ )/(?:video/)?(?P<id>[^/]+?)(?:$|\?|/)'''
+ _TESTS = [{
+ 'url': 'https://www.leitv.it/video/marmellata-di-ciliegie-fatta-in-casa/',
+ 'md5': '618aaabac32152199c1af86784d4d554',
+ 'info_dict': {
+ 'id': 'marmellata-di-ciliegie-fatta-in-casa',
+ 'ext': 'mp4',
+ 'title': 'Marmellata di ciliegie fatta in casa',
+ 'description': 'md5:89133864d6aad456dbcf6e7a29f86263',
+ 'uploader': 'leitv.it',
+ }
+ }, {
+ 'url': 'https://www.youreporter.it/fiume-sesia-3-ottobre-2020/',
+ 'md5': '8dccd436b47a830bab5b4a88232f391a',
+ 'info_dict': {
+ 'id': 'fiume-sesia-3-ottobre-2020',
+ 'ext': 'mp4',
+ 'title': 'Fiume Sesia 3 ottobre 2020',
+ 'description': 'md5:0070eef1cc884d13c970a4125063de55',
+ 'uploader': 'youreporter.it',
+ }
+ }]
diff --git a/youtube_dlc/extractor/rumble.py b/youtube_dlc/extractor/rumble.py
new file mode 100644
index 000000000..4a0225109
--- /dev/null
+++ b/youtube_dlc/extractor/rumble.py
@@ -0,0 +1,67 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+ try_get,
+)
+
+
+class RumbleEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?rumble\.com/embed/(?:[0-9a-z]+\.)?(?P<id>[0-9a-z]+)'
+ _TESTS = [{
+ 'url': 'https://rumble.com/embed/v5pv5f',
+ 'md5': '36a18a049856720189f30977ccbb2c34',
+ 'info_dict': {
+ 'id': 'v5pv5f',
+ 'ext': 'mp4',
+ 'title': 'WMAR 2 News Latest Headlines | October 20, 6pm',
+ 'timestamp': 1571611968,
+ 'upload_date': '20191020',
+ }
+ }, {
+ 'url': 'https://rumble.com/embed/ufe9n.v5pv5f',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video = self._download_json(
+ 'https://rumble.com/embedJS/', video_id,
+ query={'request': 'video', 'v': video_id})
+ title = video['title']
+
+ formats = []
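+        # each video['ua'] entry is keyed by height; list indices 0-1 are tried as format URLs and the dict at index i + 2 supplies the bitrate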
+ for height, ua in (video.get('ua') or {}).items():
+ for i in range(2):
+ f_url = try_get(ua, lambda x: x[i], compat_str)
+ if f_url:
+ ext = determine_ext(f_url)
+ f = {
+ 'ext': ext,
+ 'format_id': '%s-%sp' % (ext, height),
+ 'height': int_or_none(height),
+ 'url': f_url,
+ }
+ bitrate = try_get(ua, lambda x: x[i + 2]['bitrate'])
+ if bitrate:
+ f['tbr'] = int_or_none(bitrate)
+ formats.append(f)
+ self._sort_formats(formats)
+
+ author = video.get('author') or {}
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': video.get('i'),
+ 'timestamp': parse_iso8601(video.get('pubDate')),
+ 'channel': author.get('name'),
+ 'channel_url': author.get('url'),
+ 'duration': int_or_none(video.get('duration')),
+ }
diff --git a/youtube_dlc/extractor/servus.py b/youtube_dlc/extractor/servus.py
index 9401bf2cf..1610ddc2c 100644
--- a/youtube_dlc/extractor/servus.py
+++ b/youtube_dlc/extractor/servus.py
@@ -1,9 +1,15 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+ unified_timestamp,
+ urlencode_postdata,
+ url_or_none,
+)
class ServusIE(InfoExtractor):
@@ -12,20 +18,29 @@ class ServusIE(InfoExtractor):
(?:www\.)?
(?:
servus\.com/(?:(?:at|de)/p/[^/]+|tv/videos)|
- servustv\.com/videos
+ (?:servustv|pm-wissen)\.com/videos
)
/(?P<id>[aA]{2}-\w+|\d+-\d+)
'''
_TESTS = [{
# new URL schema
'url': 'https://www.servustv.com/videos/aa-1t6vbu5pw1w12/',
- 'md5': '3e1dd16775aa8d5cbef23628cfffc1f4',
+ 'md5': '60474d4c21f3eb148838f215c37f02b9',
'info_dict': {
'id': 'AA-1T6VBU5PW1W12',
'ext': 'mp4',
'title': 'Die Grünen aus Sicht des Volkes',
+ 'alt_title': 'Talk im Hangar-7 Voxpops Gruene',
'description': 'md5:1247204d85783afe3682644398ff2ec4',
'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 62.442,
+ 'timestamp': 1605193976,
+ 'upload_date': '20201112',
+ 'series': 'Talk im Hangar-7',
+ 'season': 'Season 9',
+ 'season_number': 9,
+ 'episode': 'Episode 31 - September 14',
+ 'episode_number': 31,
}
}, {
# old URL schema
@@ -40,30 +55,94 @@ class ServusIE(InfoExtractor):
}, {
'url': 'https://www.servus.com/tv/videos/1380889096408-1235196658/',
'only_matching': True,
+ }, {
+ 'url': 'https://www.pm-wissen.com/videos/aa-24mus4g2w2112/',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url).upper()
- webpage = self._download_webpage(url, video_id)
- title = self._search_regex(
- (r'videoLabel\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
- r'<h\d+[^>]+\bclass=["\']heading--(?:one|two)["\'][^>]*>(?P<title>[^<]+)'),
- webpage, 'title', default=None,
- group='title') or self._og_search_title(webpage)
- title = re.sub(r'\s*-\s*Servus TV\s*$', '', title)
- description = self._og_search_description(webpage)
- thumbnail = self._og_search_thumbnail(webpage)
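+        # obtain an OAuth bearer token (client_credentials grant) from the Red Bull Media House auth endpoint for the asset API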
+ token = self._download_json(
+ 'https://auth.redbullmediahouse.com/token', video_id,
+ 'Downloading token', data=urlencode_postdata({
+ 'grant_type': 'client_credentials',
+ }), headers={
+ 'Authorization': 'Basic SVgtMjJYNEhBNFdEM1cxMTpEdDRVSkFLd2ZOMG5IMjB1NGFBWTBmUFpDNlpoQ1EzNA==',
+ })
+ access_token = token['access_token']
+ token_type = token.get('token_type', 'Bearer')
- formats = self._extract_m3u8_formats(
- 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id,
- video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+ video = self._download_json(
+ 'https://sparkle-api.liiift.io/api/v1/stv/channels/international/assets/%s' % video_id,
+ video_id, 'Downloading video JSON', headers={
+ 'Authorization': '%s %s' % (token_type, access_token),
+ })
+
+ formats = []
+ thumbnail = None
+ for resource in video['resources']:
+ if not isinstance(resource, dict):
+ continue
+ format_url = url_or_none(resource.get('url'))
+ if not format_url:
+ continue
+ extension = resource.get('extension')
+ type_ = resource.get('type')
+ if extension == 'jpg' or type_ == 'reference_keyframe':
+ thumbnail = format_url
+ continue
+ ext = determine_ext(format_url)
+ if type_ == 'dash' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ format_url, video_id, mpd_id='dash', fatal=False))
+ elif type_ == 'hls' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif extension == 'mp4' or ext == 'mp4':
+ formats.append({
+ 'url': format_url,
+ 'format_id': type_,
+ 'width': int_or_none(resource.get('width')),
+ 'height': int_or_none(resource.get('height')),
+ })
self._sort_formats(formats)
+ attrs = {}
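+        # flatten the fieldKey/fieldValue attribute list into a plain dict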
+ for attribute in video['attributes']:
+ if not isinstance(attribute, dict):
+ continue
+ key = attribute.get('fieldKey')
+ value = attribute.get('fieldValue')
+ if not key or not value:
+ continue
+ attrs[key] = value
+
+ title = attrs.get('title_stv') or video_id
+ alt_title = attrs.get('title')
+ description = attrs.get('long_description') or attrs.get('short_description')
+ series = attrs.get('label')
+ season = attrs.get('season')
+ episode = attrs.get('chapter')
+ duration = float_or_none(attrs.get('duration'), scale=1000)
+ season_number = int_or_none(self._search_regex(
+ r'Season (\d+)', season or '', 'season number', default=None))
+ episode_number = int_or_none(self._search_regex(
+ r'Episode (\d+)', episode or '', 'episode number', default=None))
+
return {
'id': video_id,
'title': title,
+ 'alt_title': alt_title,
'description': description,
'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': unified_timestamp(video.get('lastPublished')),
+ 'series': series,
+ 'season': season,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
'formats': formats,
}
diff --git a/youtube_dlc/extractor/skyitalia.py b/youtube_dlc/extractor/skyitalia.py
new file mode 100644
index 000000000..22a6be2be
--- /dev/null
+++ b/youtube_dlc/extractor/skyitalia.py
@@ -0,0 +1,123 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import ExtractorError
+
+
+class SkyItaliaBaseIE(InfoExtractor):
+ _GET_VIDEO_DATA = 'https://apid.sky.it/vdp/v1/getVideoData?token={token}&caller=sky&rendition=web&id={id}'
+ _RES = {
+ 'low': [426, 240],
+ 'med': [640, 360],
+ 'high': [854, 480],
+ 'hd': [1280, 720]
+ }
+ _GEO_BYPASS = False
+
+ def _extract_video_id(self, url):
+ webpage = self._download_webpage(url, 'skyitalia')
+ video_id = self._html_search_regex(
+            [r'data-videoid="(\d+)"',
+             r'http://player\.sky\.it/social\?id=(\d+)&'],
+ webpage, 'video_id')
+ if video_id:
+ return video_id
+ raise ExtractorError('Video ID not found.')
+
+ def _get_formats(self, video_id, token):
+        video_data = self._download_json(
+            self._GET_VIDEO_DATA.format(token=token, id=video_id),
+            video_id)
+
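+        # the API exposes one progressive MP4 per quality tier; per-tier resolutions are fixed in _RES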
+ formats = []
+ for q, r in self._RES.items():
+ key = 'web_%s_url' % q
+ if key not in video_data:
+ continue
+ formats.append({
+ 'url': video_data.get(key),
+ 'format_id': q,
+ 'width': r[0],
+ 'height': r[1]
+ })
+
+ if not formats and video_data.get('geob') == 1:
+ self.raise_geo_restricted(countries=['IT'])
+
+ self._sort_formats(formats)
+ title = video_data.get('title')
+ thumb = video_data.get('thumb')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': thumb,
+ 'formats': formats
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
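+        # the <id> group in _VALID_URL is optional; _match_id stringifies a missing group as 'None'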
+ if video_id == 'None':
+ video_id = self._extract_video_id(url)
+ return self._get_formats(video_id, self._TOKEN)
+
+
+class SkyItaliaIE(SkyItaliaBaseIE):
+ IE_NAME = 'sky.it'
+ _VALID_URL = r'''(?x)https?://
+ (?P<ie>sport|tg24|video)
+ \.sky\.it/(?:.+?)
+ (?P<id>[0-9]{6})?
+ (?:$|\?)'''
+
+ _TESTS = [{
+ 'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162',
+ 'md5': '9c03b590b06e5952d8051f0e02b0feca',
+ 'info_dict': {
+ 'id': '616162',
+ 'ext': 'mp4',
+ 'title': 'MotoGP, GP Emilia Romagna: gli highlights delle prove libere',
+ 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/18/1600441214452_hl-libere-motogp-misano2_5602634_thumbnail_1.jpg',
+ }
+ }, {
+ 'url': 'https://sport.sky.it/motogp/2020/09/18/motogp-gp-emilia-romagna-misano-2020-prove-libere-diretta',
+ 'md5': '9c03b590b06e5952d8051f0e02b0feca',
+ 'info_dict': {
+ 'id': '616162',
+ 'ext': 'mp4',
+ 'title': 'MotoGP, GP Emilia Romagna: gli highlights delle prove libere',
+ 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/18/1600441214452_hl-libere-motogp-misano2_5602634_thumbnail_1.jpg',
+ }
+ }, {
+ 'url': 'https://tg24.sky.it/salute-e-benessere/2020/09/18/coronavirus-vaccino-ue-sanofi',
+ 'md5': 'caa25e62dadb529bc5e0b078da99f854',
+ 'info_dict': {
+ 'id': '615904',
+ 'ext': 'mp4',
+ 'title': 'Covid-19, al Buzzi di Milano tamponi drive-in per studenti',
+ 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/17/1600351405841_error-coronavirus-al-buzzi-di-milano-tamponi_thumbnail_1.jpg',
+ }
+ }, {
+ 'url': 'https://video.sky.it/sport/motogp/video/motogp-gp-emilia-romagna-highlights-prove-libere-616162?itm_source=parsely-api',
+ 'only_matching': True,
+ }]
+ _TOKEN = 'F96WlOd8yoFmLQgiqv6fNQRvHZcsWk5jDaYnDvhbiJk'
+
+
+class SkyArteItaliaIE(SkyItaliaBaseIE):
+ IE_NAME = 'arte.sky.it'
+ _VALID_URL = r'https?://arte\.sky\.it/video/.+?(?P<id>[0-9]{6})?$'
+ _TEST = {
+ 'url': 'https://arte.sky.it/video/federico-fellini-maestri-cinema/',
+ 'md5': '2f22513a89f45142f2746f878d690647',
+ 'info_dict': {
+ 'id': '612888',
+ 'ext': 'mp4',
+ 'title': 'I maestri del cinema Federico Felini',
+ 'thumbnail': 'https://videoplatform.sky.it/thumbnail/2020/09/03/1599146747305_i-maestri-del-cinema-federico-felini_thumbnail_1.jpg',
+ }
+ }
+ _TOKEN = 'LWk29hfiU39NNdq87ePeRach3nzTSV20o0lTv2001Cd'
diff --git a/youtube_dlc/extractor/soundcloud.py b/youtube_dlc/extractor/soundcloud.py
index ed70b7169..47f68bf19 100644
--- a/youtube_dlc/extractor/soundcloud.py
+++ b/youtube_dlc/extractor/soundcloud.py
@@ -649,7 +649,7 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
class SoundcloudPagedPlaylistBaseIE(SoundcloudIE):
def _extract_playlist(self, base_url, playlist_id, playlist_title):
- # Per the SoundCloud documentation, the maximum limit for a linked partioning query is 200.
+ # Per the SoundCloud documentation, the maximum limit for a linked partitioning query is 200.
# https://developers.soundcloud.com/blog/offset-pagination-deprecated
COMMON_QUERY = {
'limit': 200,
diff --git a/youtube_dlc/extractor/southpark.py b/youtube_dlc/extractor/southpark.py
index 20ae7c5e7..95e6d2890 100644
--- a/youtube_dlc/extractor/southpark.py
+++ b/youtube_dlc/extractor/southpark.py
@@ -44,7 +44,7 @@ class SouthParkEsIE(SouthParkIE):
class SouthParkDeIE(SouthParkIE):
IE_NAME = 'southpark.de'
- _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:videoclip|collections|folgen)/(?P<id>(?P<unique_id>.+?)/.+?)(?:\?|#|$))'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:en/(?:videoclip|collections|episodes)|videoclip|collections|folgen)/(?P<id>(?P<unique_id>.+?)/.+?)(?:\?|#|$))'
# _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
_TESTS = [{
diff --git a/youtube_dlc/extractor/spiegel.py b/youtube_dlc/extractor/spiegel.py
index 4df7f4ddc..2da32b9b2 100644
--- a/youtube_dlc/extractor/spiegel.py
+++ b/youtube_dlc/extractor/spiegel.py
@@ -1,159 +1,54 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from .nexx import (
- NexxIE,
- NexxEmbedIE,
-)
-from .spiegeltv import SpiegeltvIE
-from ..compat import compat_urlparse
-from ..utils import (
- parse_duration,
- strip_or_none,
- unified_timestamp,
-)
+from .jwplatform import JWPlatformIE
class SpiegelIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<id>[0-9]+)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$'
+ _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+ _VALID_URL = r'https?://(?:www\.)?(?:spiegel|manager-magazin)\.de(?:/[^/]+)+/[^/]*-(?P<id>[0-9]+|%s)(?:-embed|-iframe)?(?:\.html)?(?:#.*)?$' % _UUID_RE
_TESTS = [{
'url': 'http://www.spiegel.de/video/vulkan-tungurahua-in-ecuador-ist-wieder-aktiv-video-1259285.html',
- 'md5': 'b57399839d055fccfeb9a0455c439868',
+ 'md5': '50c7948883ec85a3e431a0a44b7ad1d6',
'info_dict': {
- 'id': '563747',
+ 'id': 'II0BUyxY',
+ 'display_id': '1259285',
'ext': 'mp4',
- 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv',
+ 'title': 'Vulkan Tungurahua in Ecuador ist wieder aktiv - DER SPIEGEL - Wissenschaft',
'description': 'md5:8029d8310232196eb235d27575a8b9f4',
- 'duration': 49,
+ 'duration': 48.0,
'upload_date': '20130311',
- 'timestamp': 1362994320,
+ 'timestamp': 1362997920,
},
}, {
'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html',
- 'md5': '5b6c2f4add9d62912ed5fc78a1faed80',
- 'info_dict': {
- 'id': '580988',
- 'ext': 'mp4',
- 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers',
- 'description': 'md5:c2322b65e58f385a820c10fa03b2d088',
- 'duration': 983,
- 'upload_date': '20131115',
- 'timestamp': 1384546642,
- },
+ 'only_matching': True,
}, {
- 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html',
- 'md5': '97b91083a672d72976faa8433430afb9',
- 'info_dict': {
- 'id': '601883',
- 'ext': 'mp4',
- 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.',
- 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"',
- 'upload_date': '20140904',
- 'timestamp': 1409834160,
- }
+ 'url': 'https://www.spiegel.de/video/eifel-zoo-aufregung-um-ausgebrochene-raubtiere-video-99018031.html',
+ 'only_matching': True,
}, {
- 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html',
+ 'url': 'https://www.spiegel.de/panorama/urteile-im-goldmuenzenprozess-haftstrafen-fuer-clanmitglieder-a-aae8df48-43c1-4c61-867d-23f0a2d254b7',
'only_matching': True,
}, {
- # nexx video
'url': 'http://www.spiegel.de/video/spiegel-tv-magazin-ueber-guellekrise-in-schleswig-holstein-video-99012776.html',
'only_matching': True,
+ }, {
+ 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
- metadata_url = 'http://www.spiegel.de/video/metadata/video-%s.json' % video_id
- handle = self._request_webpage(metadata_url, video_id)
-
- # 302 to spiegel.tv, like http://www.spiegel.de/video/der-film-zum-wochenende-die-wahrheit-ueber-maenner-video-99003272.html
- if SpiegeltvIE.suitable(handle.geturl()):
- return self.url_result(handle.geturl(), 'Spiegeltv')
-
- video_data = self._parse_json(self._webpage_read_content(
- handle, metadata_url, video_id), video_id)
- title = video_data['title']
- nexx_id = video_data['nexxOmniaId']
- domain_id = video_data.get('nexxOmniaDomain') or '748'
-
+ webpage = self._download_webpage(url, video_id)
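+        # the JWPlatform media id is embedded in the page as "mediaId"; quotes may be HTML-encoded (&#34;)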
+ media_id = self._html_search_regex(
+ r'(&#34;|["\'])mediaId\1\s*:\s*(&#34;|["\'])(?P<id>(?:(?!\2).)+)\2',
+ webpage, 'media id', group='id')
return {
'_type': 'url_transparent',
'id': video_id,
- 'url': 'nexx:%s:%s' % (domain_id, nexx_id),
- 'title': title,
- 'description': strip_or_none(video_data.get('teaser')),
- 'duration': parse_duration(video_data.get('duration')),
- 'timestamp': unified_timestamp(video_data.get('datum')),
- 'ie_key': NexxIE.ie_key(),
+ 'display_id': video_id,
+ 'url': 'jwplatform:%s' % media_id,
+ 'title': self._og_search_title(webpage, default=None),
+ 'ie_key': JWPlatformIE.ie_key(),
}
-
-
-class SpiegelArticleIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.de/(?!video/)[^?#]*?-(?P<id>[0-9]+)\.html'
- IE_NAME = 'Spiegel:Article'
- IE_DESC = 'Articles on spiegel.de'
- _TESTS = [{
- 'url': 'http://www.spiegel.de/sport/sonst/badminton-wm-die-randsportart-soll-populaerer-werden-a-987092.html',
- 'info_dict': {
- 'id': '1516455',
- 'ext': 'mp4',
- 'title': 'Faszination Badminton: Nennt es bloß nicht Federball',
- 'description': 're:^Patrick Kämnitz gehört.{100,}',
- 'upload_date': '20140825',
- },
- }, {
- 'url': 'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html',
- 'info_dict': {
-
- },
- 'playlist_count': 6,
- }, {
- # Nexx iFrame embed
- 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
- 'info_dict': {
- 'id': '161464',
- 'ext': 'mp4',
- 'title': 'Nervenkitzel Achterbahn',
- 'alt_title': 'Karussellbauer in Deutschland',
- 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
- 'release_year': 2005,
- 'creator': 'SPIEGEL TV',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 2761,
- 'timestamp': 1394021479,
- 'upload_date': '20140305',
- },
- 'params': {
- 'format': 'bestvideo',
- 'skip_download': True,
- },
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- # Single video on top of the page
- video_link = self._search_regex(
- r'<a href="([^"]+)" onclick="return spOpenVideo\(this,', webpage,
- 'video page URL', default=None)
- if video_link:
- video_url = compat_urlparse.urljoin(
- self.http_scheme() + '//spiegel.de/', video_link)
- return self.url_result(video_url)
-
- # Multiple embedded videos
- embeds = re.findall(
- r'<div class="vid_holder[0-9]+.*?</div>\s*.*?url\s*=\s*"([^"]+)"',
- webpage)
- entries = [
- self.url_result(compat_urlparse.urljoin(
- self.http_scheme() + '//spiegel.de/', embed_path))
- for embed_path in embeds]
- if embeds:
- return self.playlist_result(entries)
-
- return self.playlist_from_matches(
- NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key())
diff --git a/youtube_dlc/extractor/spreaker.py b/youtube_dlc/extractor/spreaker.py
new file mode 100644
index 000000000..6c7e40ae4
--- /dev/null
+++ b/youtube_dlc/extractor/spreaker.py
@@ -0,0 +1,176 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import itertools
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ float_or_none,
+ int_or_none,
+ str_or_none,
+ try_get,
+ unified_timestamp,
+ url_or_none,
+)
+
+
+def _extract_episode(data, episode_id=None):
+ title = data['title']
+ download_url = data['download_url']
+
+ series = try_get(data, lambda x: x['show']['title'], compat_str)
+ uploader = try_get(data, lambda x: x['author']['fullname'], compat_str)
+
+ thumbnails = []
+ for image in ('image_original', 'image_medium', 'image'):
+ image_url = url_or_none(data.get('%s_url' % image))
+ if image_url:
+ thumbnails.append({'url': image_url})
+
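+    # counts live either at the top level ('plays_count') or under 'stats' ('plays')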
+ def stats(key):
+ return int_or_none(try_get(
+ data,
+ (lambda x: x['%ss_count' % key],
+ lambda x: x['stats']['%ss' % key])))
+
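+    # scale=1000 converts the API's millisecond durations to seconds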
+ def duration(key):
+ return float_or_none(data.get(key), scale=1000)
+
+ return {
+ 'id': compat_str(episode_id or data['episode_id']),
+ 'url': download_url,
+ 'display_id': data.get('permalink'),
+ 'title': title,
+ 'description': data.get('description'),
+ 'timestamp': unified_timestamp(data.get('published_at')),
+ 'uploader': uploader,
+ 'uploader_id': str_or_none(data.get('author_id')),
+ 'creator': uploader,
+ 'duration': duration('duration') or duration('length'),
+ 'view_count': stats('play'),
+ 'like_count': stats('like'),
+ 'comment_count': stats('message'),
+ 'format': 'MPEG Layer 3',
+ 'format_id': 'mp3',
+ 'container': 'mp3',
+ 'ext': 'mp3',
+ 'thumbnails': thumbnails,
+ 'series': series,
+ 'extractor_key': SpreakerIE.ie_key(),
+ }
+
+
+class SpreakerIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://
+ api\.spreaker\.com/
+ (?:
+ (?:download/)?episode|
+ v2/episodes
+ )/
+ (?P<id>\d+)
+ '''
+ _TESTS = [{
+ 'url': 'https://api.spreaker.com/episode/12534508',
+ 'info_dict': {
+ 'id': '12534508',
+ 'display_id': 'swm-ep15-how-to-market-your-music-part-2',
+ 'ext': 'mp3',
+ 'title': 'EP:15 | Music Marketing (Likes) - Part 2',
+ 'description': 'md5:0588c43e27be46423e183076fa071177',
+ 'timestamp': 1502250336,
+ 'upload_date': '20170809',
+ 'uploader': 'SWM',
+ 'uploader_id': '9780658',
+ 'duration': 1063.42,
+ 'view_count': int,
+ 'like_count': int,
+ 'comment_count': int,
+ 'series': 'Success With Music (SWM)',
+ },
+ }, {
+ 'url': 'https://api.spreaker.com/download/episode/12534508/swm_ep15_how_to_market_your_music_part_2.mp3',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://api.spreaker.com/v2/episodes/12534508?export=episode_segments',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ episode_id = self._match_id(url)
+ data = self._download_json(
+ 'https://api.spreaker.com/v2/episodes/%s' % episode_id,
+ episode_id)['response']['episode']
+ return _extract_episode(data, episode_id)
+
+
+class SpreakerPageIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?spreaker\.com/user/[^/]+/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.spreaker.com/user/9780658/swm-ep15-how-to-market-your-music-part-2',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ episode_id = self._search_regex(
+ (r'data-episode_id=["\'](?P<id>\d+)',
+ r'episode_id\s*:\s*(?P<id>\d+)'), webpage, 'episode id')
+ return self.url_result(
+ 'https://api.spreaker.com/episode/%s' % episode_id,
+ ie=SpreakerIE.ie_key(), video_id=episode_id)
+
+
+class SpreakerShowIE(InfoExtractor):
+ _VALID_URL = r'https?://api\.spreaker\.com/show/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://api.spreaker.com/show/4652058',
+ 'info_dict': {
+ 'id': '4652058',
+ },
+ 'playlist_mincount': 118,
+ }]
+
+ def _entries(self, show_id):
+ for page_num in itertools.count(1):
+ episodes = self._download_json(
+ 'https://api.spreaker.com/show/%s/episodes' % show_id,
+ show_id, note='Downloading JSON page %d' % page_num, query={
+ 'page': page_num,
+ 'max_per_page': 100,
+ })
+ pager = try_get(episodes, lambda x: x['response']['pager'], dict)
+ if not pager:
+ break
+ results = pager.get('results')
+ if not results or not isinstance(results, list):
+ break
+ for result in results:
+ if not isinstance(result, dict):
+ continue
+ yield _extract_episode(result)
+ if page_num == pager.get('last_page'):
+ break
+
+ def _real_extract(self, url):
+ show_id = self._match_id(url)
+ return self.playlist_result(self._entries(show_id), playlist_id=show_id)
+
+
+class SpreakerShowPageIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?spreaker\.com/show/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://www.spreaker.com/show/success-with-music',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ show_id = self._search_regex(
+ r'show_id\s*:\s*(?P<id>\d+)', webpage, 'show id')
+ return self.url_result(
+ 'https://api.spreaker.com/show/%s' % show_id,
+ ie=SpreakerShowIE.ie_key(), video_id=show_id)
diff --git a/youtube_dlc/extractor/svt.py b/youtube_dlc/extractor/svt.py
index 2f6887d86..a0b6ef4db 100644
--- a/youtube_dlc/extractor/svt.py
+++ b/youtube_dlc/extractor/svt.py
@@ -9,6 +9,7 @@ from ..utils import (
determine_ext,
dict_get,
int_or_none,
+ unified_timestamp,
str_or_none,
strip_or_none,
try_get,
@@ -44,7 +45,8 @@ class SVTBaseIE(InfoExtractor):
'format_id': player_type,
'url': vurl,
})
- if not formats and video_info.get('rights', {}).get('geoBlockedSweden'):
+ rights = try_get(video_info, lambda x: x['rights'], dict) or {}
+ if not formats and rights.get('geoBlockedSweden'):
self.raise_geo_restricted(
'This video is only available in Sweden',
countries=self._GEO_COUNTRIES)
@@ -70,6 +72,7 @@ class SVTBaseIE(InfoExtractor):
episode = video_info.get('episodeTitle')
episode_number = int_or_none(video_info.get('episodeNumber'))
+ timestamp = unified_timestamp(rights.get('validFrom'))
duration = int_or_none(dict_get(video_info, ('materialLength', 'contentDuration')))
age_limit = None
adult = dict_get(
@@ -84,6 +87,7 @@ class SVTBaseIE(InfoExtractor):
'formats': formats,
'subtitles': subtitles,
'duration': duration,
+ 'timestamp': timestamp,
'age_limit': age_limit,
'series': series,
'season_number': season_number,
@@ -136,26 +140,39 @@ class SVTPlayIE(SVTPlayBaseIE):
IE_DESC = 'SVT Play and Öppet arkiv'
_VALID_URL = r'''(?x)
(?:
- svt:(?P<svt_id>[^/?#&]+)|
+ (?:
+ svt:|
+ https?://(?:www\.)?svt\.se/barnkanalen/barnplay/[^/]+/
+ )
+ (?P<svt_id>[^/?#&]+)|
https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp|kanaler)/(?P<id>[^/?#&]+)
)
'''
_TESTS = [{
- 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2',
- 'md5': '2b6704fe4a28801e1a098bbf3c5ac611',
+ 'url': 'https://www.svtplay.se/video/26194546/det-har-ar-himlen',
+ 'md5': '2382036fd6f8c994856c323fe51c426e',
'info_dict': {
- 'id': '5996901',
+ 'id': 'jNwpV9P',
'ext': 'mp4',
- 'title': 'Flygplan till Haile Selassie',
- 'duration': 3527,
- 'thumbnail': r're:^https?://.*[\.-]jpg$',
+ 'title': 'Det här är himlen',
+ 'timestamp': 1586044800,
+ 'upload_date': '20200405',
+ 'duration': 3515,
+ 'thumbnail': r're:^https?://(?:.*[\.-]jpg|www.svtstatic.se/image/.*)$',
'age_limit': 0,
'subtitles': {
'sv': [{
- 'ext': 'wsrt',
+ 'ext': 'vtt',
}]
},
},
+ 'params': {
+ 'format': 'bestvideo',
+            # skip for now: the download test asserts that each segment is larger
+            # than 10000 bytes, but SVT uses init segments that are smaller
+ # AssertionError: Expected test_SVTPlay_jNwpV9P.mp4 to be at least 9.77KiB, but it's only 864.00B
+ 'skip_download': True,
+ },
}, {
# geo restricted to Sweden
'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten',
@@ -172,6 +189,12 @@ class SVTPlayIE(SVTPlayBaseIE):
}, {
'url': 'svt:14278044',
'only_matching': True,
+ }, {
+ 'url': 'https://www.svt.se/barnkanalen/barnplay/kar/eWv5MLX/',
+ 'only_matching': True,
+ }, {
+ 'url': 'svt:eWv5MLX',
+ 'only_matching': True,
}]
def _adjust_title(self, info):
@@ -236,7 +259,10 @@ class SVTPlayIE(SVTPlayBaseIE):
r'["\']svtId["\']\s*:\s*["\']([\da-zA-Z-]+)'),
webpage, 'video id')
- return self._extract_by_video_id(svt_id, webpage)
+ info_dict = self._extract_by_video_id(svt_id, webpage)
+ info_dict['thumbnail'] = thumbnail
+
+ return info_dict
class SVTSeriesIE(SVTPlayBaseIE):
@@ -360,7 +386,7 @@ class SVTPageIE(InfoExtractor):
@classmethod
def suitable(cls, url):
- return False if SVTIE.suitable(url) else super(SVTPageIE, cls).suitable(url)
+ return False if SVTIE.suitable(url) or SVTPlayIE.suitable(url) else super(SVTPageIE, cls).suitable(url)
def _real_extract(self, url):
path, display_id = re.match(self._VALID_URL, url).groups()
diff --git a/youtube_dlc/extractor/tagesschau.py b/youtube_dlc/extractor/tagesschau.py
index c351b7545..8ceab7e35 100644
--- a/youtube_dlc/extractor/tagesschau.py
+++ b/youtube_dlc/extractor/tagesschau.py
@@ -86,7 +86,7 @@ class TagesschauPlayerIE(InfoExtractor):
# return self._extract_via_api(kind, video_id)
# JSON api does not provide some audio formats (e.g. ogg) thus
- # extractiong audio via webpage
+ # extracting audio via webpage
webpage = self._download_webpage(url, video_id)
diff --git a/youtube_dlc/extractor/theplatform.py b/youtube_dlc/extractor/theplatform.py
index 07055513a..41bfbe80f 100644
--- a/youtube_dlc/extractor/theplatform.py
+++ b/youtube_dlc/extractor/theplatform.py
@@ -208,7 +208,7 @@ class ThePlatformIE(ThePlatformBaseIE, AdobePassIE):
if m:
return [m.group('url')]
- # Are whitesapces ignored in URLs?
+ # Are whitespaces ignored in URLs?
# https://github.com/ytdl-org/youtube-dl/issues/12044
matches = re.findall(
r'(?s)<(?:iframe|script)[^>]+src=(["\'])((?:https?:)?//player\.theplatform\.com/p/.+?)\1', webpage)
diff --git a/youtube_dlc/extractor/thisvid.py b/youtube_dlc/extractor/thisvid.py
new file mode 100644
index 000000000..f507e1b06
--- /dev/null
+++ b/youtube_dlc/extractor/thisvid.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+import re
+
+from .common import InfoExtractor
+
+
+class ThisVidIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?thisvid\.com/(?P<type>videos|embed)/(?P<id>[A-Za-z0-9-]+)/?'
+ _TESTS = [{
+ 'url': 'https://thisvid.com/videos/french-boy-pantsed/',
+ 'md5': '3397979512c682f6b85b3b04989df224',
+ 'info_dict': {
+ 'id': '2400174',
+ 'ext': 'mp4',
+ 'title': 'French Boy Pantsed',
+ 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
+ 'age_limit': 18,
+ }
+ }, {
+ 'url': 'https://thisvid.com/embed/2400174/',
+ 'md5': '3397979512c682f6b85b3b04989df224',
+ 'info_dict': {
+ 'id': '2400174',
+ 'ext': 'mp4',
+ 'title': 'French Boy Pantsed',
+ 'thumbnail': 'https://media.thisvid.com/contents/videos_screenshots/2400000/2400174/preview.mp4.jpg',
+ 'age_limit': 18,
+ }
+ }]
+
+ def _real_extract(self, url):
+ main_id = self._match_id(url)
+ webpage = self._download_webpage(url, main_id)
+
+        # The URL decryptor was reverse-engineered from kt_player.js 4.0.4 and
+        # later verified against 5.2.0; it may change in future player versions.
+        kvs_version = self._html_search_regex(
+            r'<script [^>]+?src="https://thisvid\.com/player/kt_player\.js\?v=(\d+(\.\d+)+)">',
+            webpage, 'kvs_version', fatal=False)
+        if kvs_version and not kvs_version.startswith('5.'):
+            self.report_warning(
+                'Major version change (%s) in player engine - download may fail' % kvs_version)
+
+        title = self._html_search_regex(
+            r'<title>(?:Video: )?(.+?)(?: - (?:\w+ porn at )?ThisVid(?:\.com| tube))?</title>',
+            webpage, 'title')
+        # video_id, video_url and license_code from the 'flashvars' JSON object:
+        video_id = self._html_search_regex(r"video_id: '([0-9]+)',", webpage, 'video_id')
+        video_url = self._html_search_regex(r"video_url: '(function/0/.+?)',", webpage, 'video_url')
+        license_code = self._html_search_regex(r"license_code: '([0-9$]{16})',", webpage, 'license_code')
+        thumbnail = self._html_search_regex(
+            r"preview_url: '((?:https?:)?//media\.thisvid\.com/.+?\.jpg)',",
+            webpage, 'thumbnail', fatal=False)
+        if thumbnail and thumbnail.startswith('//'):
+            thumbnail = 'https:' + thumbnail
+        if re.match(self._VALID_URL, url).group('type') == 'videos':
+            display_id = main_id
+        else:
+            display_id = self._search_regex(
+                r'<link rel="canonical" href="' + self._VALID_URL + r'">',
+                webpage, 'display id', group='id', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'url': getrealurl(video_url, license_code),
+ 'thumbnail': thumbnail,
+ 'age_limit': 18,
+ }
+
+
+def getrealurl(video_url, license_code):
+ urlparts = video_url.split('/')[2:]
+ license = getlicensetoken(license_code)
+ newmagic = urlparts[5][:32]
+
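+    # de-shuffle the first 32 chars of urlparts[5]: walking backwards, swap each
+    # position o with an index computed from the remaining license token digits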
+ for o in range(len(newmagic) - 1, -1, -1):
+ new = ""
+ l = (o + sum([int(n) for n in license[o:]])) % 32
+
+ for i in range(0, len(newmagic)):
+ if i == o:
+ new += newmagic[l]
+ elif i == l:
+ new += newmagic[o]
+ else:
+ new += newmagic[i]
+ newmagic = new
+
+ urlparts[5] = newmagic + urlparts[5][32:]
+ return "/".join(urlparts)
+
+
+def getlicensetoken(license):
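+    # strip '$' and map '0' to '1', then mix the original license digits with
+    # the digits of 4 * |front half - back half| modulo 10 to build the token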
+ modlicense = license.replace("$", "").replace("0", "1")
+ center = int(len(modlicense) / 2)
+ fronthalf = int(modlicense[:center + 1])
+ backhalf = int(modlicense[center:])
+
+ modlicense = str(4 * abs(fronthalf - backhalf))
+ retval = ""
+ for o in range(0, center + 1):
+ for i in range(1, 5):
+ retval += str((int(license[o + i]) + int(modlicense[o])) % 10)
+ return retval
diff --git a/youtube_dlc/extractor/turner.py b/youtube_dlc/extractor/turner.py
index 4a6cbfbb8..2964504a2 100644
--- a/youtube_dlc/extractor/turner.py
+++ b/youtube_dlc/extractor/turner.py
@@ -56,9 +56,9 @@ class TurnerBaseIE(AdobePassIE):
content_id = xpath_text(video_data, 'contentId') or video_id
# rtmp_src = xpath_text(video_data, 'akamai/src')
# if rtmp_src:
- # splited_rtmp_src = rtmp_src.split(',')
- # if len(splited_rtmp_src) == 2:
- # rtmp_src = splited_rtmp_src[1]
+ # split_rtmp_src = rtmp_src.split(',')
+ # if len(split_rtmp_src) == 2:
+ # rtmp_src = split_rtmp_src[1]
# aifp = xpath_text(video_data, 'akamai/aifp', default='')
urls = []
diff --git a/youtube_dlc/extractor/tvland.py b/youtube_dlc/extractor/tvland.py
index 791144128..225b6b078 100644
--- a/youtube_dlc/extractor/tvland.py
+++ b/youtube_dlc/extractor/tvland.py
@@ -3,6 +3,8 @@ from __future__ import unicode_literals
from .spike import ParamountNetworkIE
+# TODO: Remove - no longer used, the service has moved to YouTube
+
class TVLandIE(ParamountNetworkIE):
IE_NAME = 'tvland.com'
diff --git a/youtube_dlc/extractor/twentythreevideo.py b/youtube_dlc/extractor/twentythreevideo.py
index aa0c6e90f..dc5609192 100644
--- a/youtube_dlc/extractor/twentythreevideo.py
+++ b/youtube_dlc/extractor/twentythreevideo.py
@@ -8,8 +8,8 @@ from ..utils import int_or_none
class TwentyThreeVideoIE(InfoExtractor):
IE_NAME = '23video'
- _VALID_URL = r'https?://video\.(?P<domain>twentythree\.net|23video\.com|filmweb\.no)/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
- _TEST = {
+ _VALID_URL = r'https?://(?P<domain>[^.]+\.(?:twentythree\.net|23video\.com|filmweb\.no))/v\.ihtml/player\.html\?(?P<query>.*?\bphoto(?:_|%5f)id=(?P<id>\d+).*)'
+ _TESTS = [{
'url': 'https://video.twentythree.net/v.ihtml/player.html?showDescriptions=0&source=site&photo%5fid=20448876&autoPlay=1',
'md5': '75fcf216303eb1dae9920d651f85ced4',
'info_dict': {
@@ -21,11 +21,14 @@ class TwentyThreeVideoIE(InfoExtractor):
'uploader_id': '12258964',
'uploader': 'Rasmus Bysted',
}
- }
+ }, {
+ 'url': 'https://bonnier-publications-danmark.23video.com/v.ihtml/player.html?token=f0dc46476e06e13afd5a1f84a29e31e8&source=embed&photo%5fid=36137620',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
domain, query, photo_id = re.match(self._VALID_URL, url).groups()
- base_url = 'https://video.%s' % domain
+ base_url = 'https://%s' % domain
photo_data = self._download_json(
base_url + '/api/photo/list?' + query, photo_id, query={
'format': 'json',
diff --git a/youtube_dlc/extractor/urplay.py b/youtube_dlc/extractor/urplay.py
index 6030b7cb5..2c41f78bd 100644
--- a/youtube_dlc/extractor/urplay.py
+++ b/youtube_dlc/extractor/urplay.py
@@ -2,7 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import unified_timestamp
+from ..utils import (
+ dict_get,
+ int_or_none,
+ unified_timestamp,
+)
class URPlayIE(InfoExtractor):
@@ -15,8 +19,8 @@ class URPlayIE(InfoExtractor):
'ext': 'mp4',
'title': 'UR Samtiden - Livet, universum och rymdens märkliga musik : Om vetenskap, kritiskt tänkande och motstånd',
'description': 'md5:5344508a52aa78c1ced6c1b8b9e44e9a',
- 'timestamp': 1513512768,
- 'upload_date': '20171217',
+ 'timestamp': 1513292400,
+ 'upload_date': '20171214',
},
}, {
'url': 'https://urskola.se/Produkter/190031-Tripp-Trapp-Trad-Sovkudde',
@@ -25,7 +29,7 @@ class URPlayIE(InfoExtractor):
'ext': 'mp4',
'title': 'Tripp, Trapp, Träd : Sovkudde',
'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1',
- 'timestamp': 1440093600,
+ 'timestamp': 1440086400,
'upload_date': '20150820',
},
}, {
@@ -35,37 +39,65 @@ class URPlayIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
+ url = url.replace('skola.se/Produkter', 'play.se/program')
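+        # urskola.se 'Produkter' pages are also served on urplay.se under /program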
webpage = self._download_webpage(url, video_id)
- urplayer_data = self._parse_json(self._search_regex(
- r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id)
- host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
+ urplayer_data = self._parse_json(self._html_search_regex(
+ r'data-react-class="components/Player/Player"[^>]+data-react-props="({.+?})"',
+ webpage, 'urplayer data'), video_id)['currentProduct']
+ episode = urplayer_data['title']
+ host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect']
formats = []
- for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)):
- file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr)
+ urplayer_streams = urplayer_data.get('streamingInfo', {})
+
+ for k, v in urplayer_streams.get('raw', {}).items():
+ if not (k in ('sd', 'hd') and isinstance(v, dict)):
+ continue
+ file_http = v.get('location')
if file_http:
formats.extend(self._extract_wowza_formats(
- 'http://%s/%splaylist.m3u8' % (host, file_http), video_id, skip_protocols=['rtmp', 'rtsp']))
+ 'http://%s/%splaylist.m3u8' % (host, file_http),
+ video_id, skip_protocols=['f4m', 'rtmp', 'rtsp']))
self._sort_formats(formats)
subtitles = {}
- for subtitle in urplayer_data.get('subtitles', []):
- subtitle_url = subtitle.get('file')
- kind = subtitle.get('kind')
- if not subtitle_url or (kind and kind != 'captions'):
- continue
- subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({
- 'url': subtitle_url,
+ subs = urplayer_streams.get("sweComplete", {}).get("tt", {}).get("location")
+ if subs:
+ subtitles.setdefault('Svenska', []).append({
+ 'url': subs,
})
+ image = urplayer_data.get('image') or {}
+ thumbnails = []
+ for k, v in image.items():
+ t = {
+ 'id': k,
+ 'url': v,
+ }
+ wh = k.split('x')
+ if len(wh) == 2:
+ t.update({
+ 'width': int_or_none(wh[0]),
+ 'height': int_or_none(wh[1]),
+ })
+ thumbnails.append(t)
+
+ series = urplayer_data.get('series') or {}
+ series_title = dict_get(series, ('seriesTitle', 'title')) or dict_get(urplayer_data, ('seriesTitle', 'mainTitle'))
+
return {
'id': video_id,
- 'title': urplayer_data['title'],
- 'description': self._og_search_description(webpage),
- 'thumbnail': urplayer_data.get('image'),
- 'timestamp': unified_timestamp(self._html_search_meta(('uploadDate', 'schema:uploadDate'), webpage, 'timestamp')),
- 'series': urplayer_data.get('series_title'),
'subtitles': subtitles,
+ 'title': '%s : %s' % (series_title, episode) if series_title else episode,
+ 'description': urplayer_data.get('description'),
+ 'thumbnails': thumbnails,
+ 'timestamp': unified_timestamp(urplayer_data.get('publishedAt')),
+ 'series': series_title,
'formats': formats,
+ 'duration': int_or_none(urplayer_data.get('duration')),
+ 'categories': urplayer_data.get('categories'),
+ 'tags': urplayer_data.get('keywords'),
+ 'season': series.get('label'),
+ 'episode': episode,
+ 'episode_number': int_or_none(urplayer_data.get('episodeNumber')),
}
diff --git a/youtube_dlc/extractor/usanetwork.py b/youtube_dlc/extractor/usanetwork.py
index 54c7495cc..d953e460b 100644
--- a/youtube_dlc/extractor/usanetwork.py
+++ b/youtube_dlc/extractor/usanetwork.py
@@ -1,74 +1,24 @@
# coding: utf-8
from __future__ import unicode_literals
-from .adobepass import AdobePassIE
-from ..utils import (
- NO_DEFAULT,
- smuggle_url,
- update_url_query,
-)
+from .nbc import NBCIE
-class USANetworkIE(AdobePassIE):
- _VALID_URL = r'https?://(?:www\.)?usanetwork\.com/(?:[^/]+/videos|movies)/(?P<id>[^/?#]+)'
- _TEST = {
- 'url': 'http://www.usanetwork.com/mrrobot/videos/hpe-cybersecurity',
- 'md5': '33c0d2ba381571b414024440d08d57fd',
+class USANetworkIE(NBCIE):
+ _VALID_URL = r'https?(?P<permalink>://(?:www\.)?usanetwork\.com/(?:[^/]+/videos?|movies?)/(?:[^/]+/)?(?P<id>\d+))'
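+    # the 'permalink' group is captured without a scheme; NBCIE prepends 'http' to it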
+ _TESTS = [{
+ 'url': 'https://www.usanetwork.com/peacock-trailers/video/intelligence-trailer/4185302',
'info_dict': {
- 'id': '3086229',
+ 'id': '4185302',
'ext': 'mp4',
- 'title': 'HPE Cybersecurity',
- 'description': 'The more we digitize our world, the more vulnerable we are.',
- 'upload_date': '20160818',
- 'timestamp': 1471535460,
- 'uploader': 'NBCU-USA',
+ 'title': 'Intelligence (Trailer)',
+ 'description': 'A maverick NSA agent enlists the help of a junior systems analyst in a workplace power grab.',
+ 'upload_date': '20200715',
+ 'timestamp': 1594785600,
+ 'uploader': 'NBCU-MPAT',
},
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- def _x(name, default=NO_DEFAULT):
- return self._search_regex(
- r'data-%s\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % name,
- webpage, name, default=default, group='value')
-
- video_id = _x('mpx-guid')
- title = _x('episode-title')
- mpx_account_id = _x('mpx-account-id', '2304992029')
-
- query = {
- 'mbr': 'true',
- }
- if _x('is-full-episode', None) == '1':
- query['manifest'] = 'm3u'
-
- if _x('is-entitlement', None) == '1':
- adobe_pass = {}
- drupal_settings = self._search_regex(
- r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
- webpage, 'drupal settings', fatal=False)
- if drupal_settings:
- drupal_settings = self._parse_json(drupal_settings, video_id, fatal=False)
- if drupal_settings:
- adobe_pass = drupal_settings.get('adobePass', {})
- resource = self._get_mvpd_resource(
- adobe_pass.get('adobePassResourceId', 'usa'),
- title, video_id, _x('episode-rating', 'TV-14'))
- query['auth'] = self._extract_mvpd_auth(
- url, video_id, adobe_pass.get('adobePassRequestorId', 'usa'), resource)
-
- info = self._search_json_ld(webpage, video_id, default={})
- info.update({
- '_type': 'url_transparent',
- 'url': smuggle_url(update_url_query(
- 'http://link.theplatform.com/s/HNK2IC/media/guid/%s/%s' % (mpx_account_id, video_id),
- query), {'force_smil_url': True}),
- 'id': video_id,
- 'title': title,
- 'series': _x('show-title', None),
- 'episode': title,
- 'ie_key': 'ThePlatform',
- })
- return info
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
diff --git a/youtube_dlc/extractor/ustream.py b/youtube_dlc/extractor/ustream.py
index 582090d0d..9e860aeb7 100644
--- a/youtube_dlc/extractor/ustream.py
+++ b/youtube_dlc/extractor/ustream.py
@@ -19,7 +19,7 @@ from ..utils import (
class UstreamIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ustream\.tv/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/(?P<type>recorded|embed|embed/recorded)/(?P<id>\d+)'
IE_NAME = 'ustream'
_TESTS = [{
'url': 'http://www.ustream.tv/recorded/20274954',
@@ -67,12 +67,15 @@ class UstreamIE(InfoExtractor):
'params': {
'skip_download': True, # m3u8 download
},
+ }, {
+ 'url': 'https://video.ibm.com/embed/recorded/128240221?&autoplay=true&controls=true&volume=100',
+ 'only_matching': True,
}]
@staticmethod
def _extract_url(webpage):
mobj = re.search(
- r'<iframe[^>]+?src=(["\'])(?P<url>http://www\.ustream\.tv/embed/.+?)\1', webpage)
+ r'<iframe[^>]+?src=(["\'])(?P<url>http://(?:www\.)?(?:ustream\.tv|video\.ibm\.com)/embed/.+?)\1', webpage)
if mobj is not None:
return mobj.group('url')
diff --git a/youtube_dlc/extractor/viki.py b/youtube_dlc/extractor/viki.py
index f8e360338..09da4338d 100644
--- a/youtube_dlc/extractor/viki.py
+++ b/youtube_dlc/extractor/viki.py
@@ -1,6 +1,7 @@
# coding: utf-8
from __future__ import unicode_literals
+import base64
import hashlib
import hmac
import itertools
@@ -9,6 +10,10 @@ import re
import time
from .common import InfoExtractor
+from ..compat import (
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
ExtractorError,
int_or_none,
@@ -16,6 +21,7 @@ from ..utils import (
parse_age_limit,
parse_iso8601,
sanitized_Request,
+ std_headers,
)
@@ -166,19 +172,20 @@ class VikiIE(VikiBaseIE):
}, {
# episode
'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1',
- 'md5': '5fa476a902e902783ac7a4d615cdbc7a',
+ 'md5': '94e0e34fd58f169f40c184f232356cfe',
'info_dict': {
'id': '44699v',
'ext': 'mp4',
'title': 'Boys Over Flowers - Episode 1',
'description': 'md5:b89cf50038b480b88b5b3c93589a9076',
- 'duration': 4204,
+ 'duration': 4172,
'timestamp': 1270496524,
'upload_date': '20100405',
'uploader': 'group8',
'like_count': int,
'age_limit': 13,
- }
+ },
+ 'expected_warnings': ['Unknown MIME type image/jpeg in DASH manifest'],
}, {
# youtube external
'url': 'http://www.viki.com/videos/50562v-poor-nastya-complete-episode-1',
@@ -195,14 +202,15 @@ class VikiIE(VikiBaseIE):
'uploader_id': 'ad14065n',
'like_count': int,
'age_limit': 13,
- }
+ },
+ 'skip': 'Page not found!',
}, {
'url': 'http://www.viki.com/player/44699v',
'only_matching': True,
}, {
# non-English description
'url': 'http://www.viki.com/videos/158036v-love-in-magic',
- 'md5': '1713ae35df5a521b31f6dc40730e7c9c',
+ 'md5': 'adf9e321a0ae5d0aace349efaaff7691',
'info_dict': {
'id': '158036v',
'ext': 'mp4',
@@ -218,71 +226,13 @@ class VikiIE(VikiBaseIE):
def _real_extract(self, url):
video_id = self._match_id(url)
- video = self._call_api(
- 'videos/%s.json' % video_id, video_id, 'Downloading video JSON')
-
- streams = self._call_api(
- 'videos/%s/streams.json' % video_id, video_id,
- 'Downloading video streams JSON')
-
- formats = []
- for format_id, stream_dict in streams.items():
- height = int_or_none(self._search_regex(
- r'^(\d+)[pP]$', format_id, 'height', default=None))
- for protocol, format_dict in stream_dict.items():
- # rtmps URLs does not seem to work
- if protocol == 'rtmps':
- continue
- format_url = format_dict.get('url')
- format_drms = format_dict.get('drms')
- format_stream_id = format_dict.get('id')
- if format_id == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
- format_url, video_id, 'mp4',
- entry_protocol='m3u8_native',
- m3u8_id='m3u8-%s' % protocol, fatal=False)
- # Despite CODECS metadata in m3u8 all video-only formats
- # are actually video+audio
- for f in m3u8_formats:
- if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
- f['acodec'] = None
- formats.extend(m3u8_formats)
- elif format_id == 'mpd':
- mpd_formats = self._extract_mpd_formats(
- format_url, video_id,
- mpd_id='mpd-%s' % protocol, fatal=False)
- formats.extend(mpd_formats)
- elif format_id == 'mpd':
-
- formats.extend(mpd_formats)
- elif format_url.startswith('rtmp'):
- mobj = re.search(
- r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
- format_url)
- if not mobj:
- continue
- formats.append({
- 'format_id': 'rtmp-%s' % format_id,
- 'ext': 'flv',
- 'url': mobj.group('url'),
- 'play_path': mobj.group('playpath'),
- 'app': mobj.group('app'),
- 'page_url': url,
- 'drms': format_drms,
- 'stream_id': format_stream_id,
- })
- else:
- urlh = self._request_webpage(
- HEADRequest(format_url), video_id, 'Checking file size', fatal=False)
- formats.append({
- 'url': format_url,
- 'format_id': '%s-%s' % (format_id, protocol),
- 'height': height,
- 'drms': format_drms,
- 'stream_id': format_stream_id,
- 'filesize': int_or_none(urlh.headers.get('Content-Length')),
- })
- self._sort_formats(formats)
+ resp = self._download_json(
+ 'https://www.viki.com/api/videos/' + video_id,
+ video_id, 'Downloading video JSON', headers={
+ 'x-client-user-agent': std_headers['User-Agent'],
+ 'x-viki-app-ver': '4.0.57',
+ })
+ video = resp['video']
self._check_errors(video)
@@ -308,19 +258,26 @@ class VikiIE(VikiBaseIE):
'url': thumbnail.get('url'),
})
- stream_ids = []
- for f in formats:
- s_id = f.get('stream_id')
- if s_id is not None:
- stream_ids.append(s_id)
-
subtitles = {}
- for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
- subtitles[subtitle_lang] = [{
- 'ext': subtitles_format,
- 'url': self._prepare_call(
- 'videos/%s/subtitles/%s.%s?stream_id=%s' % (video_id, subtitle_lang, subtitles_format, stream_ids[0])),
- } for subtitles_format in ('srt', 'vtt')]
+ try:
+ # New way to fetch subtitles
+ new_video = self._download_json(
+ 'https://www.viki.com/api/videos/%s' % video_id, video_id,
+ 'Downloading new video JSON to get subtitles', headers={'x-viki-app-ver': '2.2.5.1428709186'}, expected_status=[200, 400, 404])
+ for sub in new_video.get('streamSubtitles').get('dash'):
+ subtitles[sub.get('srclang')] = [{
+ 'ext': 'vtt',
+ 'url': sub.get('src'),
+ 'completion': sub.get('percentage'),
+ }]
+        except (AttributeError, TypeError):
+            # fall back to the old way if streamSubtitles is missing or not iterable
+ for subtitle_lang, _ in video.get('subtitle_completions', {}).items():
+ subtitles[subtitle_lang] = [{
+ 'ext': subtitles_format,
+ 'url': self._prepare_call(
+ 'videos/%s/subtitles/%s.%s' % (video_id, subtitle_lang, subtitles_format)),
+ } for subtitles_format in ('srt', 'vtt')]
result = {
'id': video_id,
@@ -335,12 +292,84 @@ class VikiIE(VikiBaseIE):
'subtitles': subtitles,
}
- if 'external' in streams:
- result.update({
- '_type': 'url_transparent',
- 'url': streams['external']['url'],
- })
- return result
+ formats = []
+
+ def add_format(format_id, format_dict, protocol='http'):
+        # rtmps URLs do not seem to work
+ if protocol == 'rtmps':
+ return
+ format_url = format_dict.get('url')
+ if not format_url:
+ return
+ format_drms = format_dict.get('drms')
+ format_stream_id = format_dict.get('id')
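+        # the real stream location may arrive base64-encoded in a 'stream' query parameter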
+ qs = compat_parse_qs(compat_urllib_parse_urlparse(format_url).query)
+ stream = qs.get('stream', [None])[0]
+ if stream:
+ format_url = base64.b64decode(stream).decode()
+ if format_id in ('m3u8', 'hls'):
+ m3u8_formats = self._extract_m3u8_formats(
+ format_url, video_id, 'mp4',
+ entry_protocol='m3u8_native',
+ m3u8_id='m3u8-%s' % protocol, fatal=False)
+ # Despite CODECS metadata in m3u8 all video-only formats
+ # are actually video+audio
+ for f in m3u8_formats:
+ if '_drm/index_' in f['url']:
+ continue
+ if f.get('acodec') == 'none' and f.get('vcodec') != 'none':
+ f['acodec'] = None
+ formats.append(f)
+ elif format_id in ('mpd', 'dash'):
+ formats.extend(self._extract_mpd_formats(
+ format_url, video_id, 'mpd-%s' % protocol, fatal=False))
+ elif format_url.startswith('rtmp'):
+ mobj = re.search(
+ r'^(?P<url>rtmp://[^/]+/(?P<app>.+?))/(?P<playpath>mp4:.+)$',
+ format_url)
+ if not mobj:
+ return
+ formats.append({
+ 'format_id': 'rtmp-%s' % format_id,
+ 'ext': 'flv',
+ 'url': mobj.group('url'),
+ 'play_path': mobj.group('playpath'),
+ 'app': mobj.group('app'),
+ 'page_url': url,
+ 'drms': format_drms,
+ 'stream_id': format_stream_id,
+ })
+ else:
+ urlh = self._request_webpage(
+ HEADRequest(format_url), video_id, 'Checking file size', fatal=False)
+ formats.append({
+ 'url': format_url,
+ 'format_id': '%s-%s' % (format_id, protocol),
+ 'height': int_or_none(self._search_regex(
+ r'^(\d+)[pP]$', format_id, 'height', default=None)),
+ 'drms': format_drms,
+ 'stream_id': format_stream_id,
+ 'filesize': int_or_none(urlh.headers.get('Content-Length')),
+ })
+
+ for format_id, format_dict in (resp.get('streams') or {}).items():
+ add_format(format_id, format_dict)
+ if not formats:
+ streams = self._call_api(
+ 'videos/%s/streams.json' % video_id, video_id,
+ 'Downloading video streams JSON')
+
+ if 'external' in streams:
+ result.update({
+ '_type': 'url_transparent',
+ 'url': streams['external']['url'],
+ })
+ return result
+
+ for format_id, stream_dict in streams.items():
+ for protocol, format_dict in stream_dict.items():
+ add_format(format_id, format_dict, protocol)
+ self._sort_formats(formats)
result['formats'] = formats
return result
diff --git a/youtube_dlc/extractor/vimeo.py b/youtube_dlc/extractor/vimeo.py
index 9839657ca..51a0ab2fa 100644
--- a/youtube_dlc/extractor/vimeo.py
+++ b/youtube_dlc/extractor/vimeo.py
@@ -922,7 +922,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
}]
_PAGE_SIZE = 100
- def _fetch_page(self, album_id, authorizaion, hashed_pass, page):
+ def _fetch_page(self, album_id, authorization, hashed_pass, page):
api_page = page + 1
query = {
'fields': 'link,uri',
@@ -934,7 +934,7 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
videos = self._download_json(
'https://api.vimeo.com/albums/%s/videos' % album_id,
album_id, 'Downloading page %d' % api_page, query=query, headers={
- 'Authorization': 'jwt ' + authorizaion,
+ 'Authorization': 'jwt ' + authorization,
})['data']
for video in videos:
link = video.get('link')
@@ -946,10 +946,13 @@ class VimeoAlbumIE(VimeoBaseInfoExtractor):
def _real_extract(self, url):
album_id = self._match_id(url)
- webpage = self._download_webpage(url, album_id)
- viewer = self._parse_json(self._search_regex(
- r'bootstrap_data\s*=\s*({.+?})</script>',
- webpage, 'bootstrap data'), album_id)['viewer']
+ viewer = self._download_json(
+ 'https://vimeo.com/_rv/viewer', album_id, fatal=False)
+ if not viewer:
+ webpage = self._download_webpage(url, album_id)
+ viewer = self._parse_json(self._search_regex(
+ r'bootstrap_data\s*=\s*({.+?})</script>',
+ webpage, 'bootstrap data'), album_id)['viewer']
jwt = viewer['jwt']
album = self._download_json(
'https://api.vimeo.com/albums/' + album_id,
diff --git a/youtube_dlc/extractor/vlive.py b/youtube_dlc/extractor/vlive.py
index f79531e6f..c07550810 100644
--- a/youtube_dlc/extractor/vlive.py
+++ b/youtube_dlc/extractor/vlive.py
@@ -1,25 +1,32 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-import time
import itertools
+import json
-from .common import InfoExtractor
from .naver import NaverBaseIE
-from ..compat import compat_str
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
from ..utils import (
ExtractorError,
+ int_or_none,
merge_dicts,
- remove_start,
+ str_or_none,
+ strip_or_none,
try_get,
urlencode_postdata,
)
-class VLiveIE(NaverBaseIE):
+class VLiveBaseIE(NaverBaseIE):
+ _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+
+
+class VLiveIE(VLiveBaseIE):
IE_NAME = 'vlive'
- _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/(?:video|embed)/(?P<id>[0-9]+)'
_NETRC_MACHINE = 'vlive'
_TESTS = [{
'url': 'http://www.vlive.tv/video/1326',
@@ -27,7 +34,7 @@ class VLiveIE(NaverBaseIE):
'info_dict': {
'id': '1326',
'ext': 'mp4',
- 'title': "[V LIVE] Girl's Day's Broadcast",
+ 'title': "Girl's Day's Broadcast",
'creator': "Girl's Day",
'view_count': int,
'uploader_id': 'muploader_a',
@@ -37,7 +44,7 @@ class VLiveIE(NaverBaseIE):
'info_dict': {
'id': '16937',
'ext': 'mp4',
- 'title': '[V LIVE] 첸백시 걍방',
+ 'title': '첸백시 걍방',
'creator': 'EXO',
'view_count': int,
'subtitles': 'mincount:12',
@@ -58,12 +65,15 @@ class VLiveIE(NaverBaseIE):
'subtitles': 'mincount:10',
},
'skip': 'This video is only available for CH+ subscribers',
+ }, {
+ 'url': 'https://www.vlive.tv/embed/1326',
+ 'only_matching': True,
+ }, {
+ # works only with gcc=KR
+ 'url': 'https://www.vlive.tv/video/225019',
+ 'only_matching': True,
}]
- @classmethod
- def suitable(cls, url):
- return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)
-
def _real_initialize(self):
self._login()
@@ -95,173 +105,199 @@ class VLiveIE(NaverBaseIE):
if not is_logged_in():
raise ExtractorError('Unable to log in', expected=True)
+ def _call_api(self, path_template, video_id, fields=None):
+ query = {'appId': self._APP_ID, 'gcc': 'KR'}
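+        # pin gcc (geo country code) to KR, as some videos are only exposed there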
+ if fields:
+ query['fields'] = fields
+ try:
+ return self._download_json(
+ 'https://www.vlive.tv/globalv-web/vam-web/' + path_template % video_id, video_id,
+ 'Downloading %s JSON metadata' % path_template.split('/')[-1].split('-')[0],
+ headers={'Referer': 'https://www.vlive.tv/'}, query=query)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ self.raise_login_required(json.loads(e.cause.read().decode())['message'])
+ raise
+
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(
- 'https://www.vlive.tv/video/%s' % video_id, video_id)
-
- VIDEO_PARAMS_RE = r'\bvlive\.video\.init\(([^)]+)'
- VIDEO_PARAMS_FIELD = 'video params'
-
- params = self._parse_json(self._search_regex(
- VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD, default=''), video_id,
- transform_source=lambda s: '[' + s + ']', fatal=False)
-
- if not params or len(params) < 7:
- params = self._search_regex(
- VIDEO_PARAMS_RE, webpage, VIDEO_PARAMS_FIELD)
- params = [p.strip(r'"') for p in re.split(r'\s*,\s*', params)]
-
- status, long_video_id, key = params[2], params[5], params[6]
- status = remove_start(status, 'PRODUCT_')
-
- if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
- return self._live(video_id, webpage)
- elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
- return self._replay(video_id, webpage, long_video_id, key)
-
- if status == 'LIVE_END':
- raise ExtractorError('Uploading for replay. Please wait...',
- expected=True)
- elif status == 'COMING_SOON':
- raise ExtractorError('Coming soon!', expected=True)
- elif status == 'CANCELED':
- raise ExtractorError('We are sorry, '
- 'but the live broadcast has been canceled.',
- expected=True)
- elif status == 'ONLY_APP':
- raise ExtractorError('Unsupported video type', expected=True)
- else:
- raise ExtractorError('Unknown status %s' % status)
-
- def _get_common_fields(self, webpage):
- title = self._og_search_title(webpage)
- creator = self._html_search_regex(
- r'<div[^>]+class="info_area"[^>]*>\s*(?:<em[^>]*>.*?</em\s*>\s*)?<a\s+[^>]*>([^<]+)',
- webpage, 'creator', fatal=False)
- thumbnail = self._og_search_thumbnail(webpage)
- return {
- 'title': title,
- 'creator': creator,
- 'thumbnail': thumbnail,
- }
+ post = self._call_api(
+ 'post/v1.0/officialVideoPost-%s', video_id,
+ 'author{nickname},channel{channelCode,channelName},officialVideo{commentCount,exposeStatus,likeCount,playCount,playTime,status,title,type,vodId}')
+
+ video = post['officialVideo']
+
+ def get_common_fields():
+ channel = post.get('channel') or {}
+ return {
+ 'title': video.get('title'),
+ 'creator': post.get('author', {}).get('nickname'),
+ 'channel': channel.get('channelName'),
+ 'channel_id': channel.get('channelCode'),
+ 'duration': int_or_none(video.get('playTime')),
+ 'view_count': int_or_none(video.get('playCount')),
+ 'like_count': int_or_none(video.get('likeCount')),
+ 'comment_count': int_or_none(video.get('commentCount')),
+ }
+
+ video_type = video.get('type')
+ if video_type == 'VOD':
+ inkey = self._call_api('video/v1.0/vod/%s/inkey', video_id)['inkey']
+ vod_id = video['vodId']
+ return merge_dicts(
+ get_common_fields(),
+ self._extract_video_info(video_id, vod_id, inkey))
+ elif video_type == 'LIVE':
+ status = video.get('status')
+ if status == 'ON_AIR':
+ stream_url = self._call_api(
+ 'old/v3/live/%s/playInfo',
+ video_id)['result']['adaptiveStreamUrl']
+ formats = self._extract_m3u8_formats(stream_url, video_id, 'mp4')
+ info = get_common_fields()
+ info.update({
+ 'title': self._live_title(video['title']),
+ 'id': video_id,
+ 'formats': formats,
+ 'is_live': True,
+ })
+ return info
+ elif status == 'ENDED':
+ raise ExtractorError(
+ 'Uploading for replay. Please wait...', expected=True)
+ elif status == 'RESERVED':
+ raise ExtractorError('Coming soon!', expected=True)
+ elif video.get('exposeStatus') == 'CANCEL':
+ raise ExtractorError(
+ 'We are sorry, but the live broadcast has been canceled.',
+ expected=True)
+ else:
+ raise ExtractorError('Unknown status ' + status)
+
+
+class VLivePostIE(VLiveIE):
+ IE_NAME = 'vlive:post'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/post/(?P<id>\d-\d+)'
+ _TESTS = [{
+ # uploadType = SOS
+ 'url': 'https://www.vlive.tv/post/1-20088044',
+ 'info_dict': {
+ 'id': '1-20088044',
+ 'title': 'Hola estrellitas la tierra les dice hola (si era así no?) Ha...',
+ 'description': 'md5:fab8a1e50e6e51608907f46c7fa4b407',
+ },
+ 'playlist_count': 3,
+ }, {
+ # uploadType = V
+ 'url': 'https://www.vlive.tv/post/1-20087926',
+ 'info_dict': {
+ 'id': '1-20087926',
+ 'title': 'James Corden: And so, the baby becamos the Papa💜😭💪😭',
+ },
+ 'playlist_count': 1,
+ }]
+ _FVIDEO_TMPL = 'fvideo/v1.0/fvideo-%%s/%s'
+ _SOS_TMPL = _FVIDEO_TMPL % 'sosPlayInfo'
+ _INKEY_TMPL = _FVIDEO_TMPL % 'inKey'
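+    # the doubled '%%s' survives this substitution, leaving a '%s' slot for the video id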
- def _live(self, video_id, webpage):
- init_page = self._download_init_page(video_id)
-
- live_params = self._search_regex(
- r'"liveStreamInfo"\s*:\s*(".*"),',
- init_page, 'live stream info')
- live_params = self._parse_json(live_params, video_id)
- live_params = self._parse_json(live_params, video_id)
-
- formats = []
- for vid in live_params.get('resolutions', []):
- formats.extend(self._extract_m3u8_formats(
- vid['cdnUrl'], video_id, 'mp4',
- m3u8_id=vid.get('name'),
- fatal=False, live=True))
- self._sort_formats(formats)
-
- info = self._get_common_fields(webpage)
- info.update({
- 'title': self._live_title(info['title']),
- 'id': video_id,
- 'formats': formats,
- 'is_live': True,
- })
- return info
-
- def _replay(self, video_id, webpage, long_video_id, key):
- if '' in (long_video_id, key):
- init_page = self._download_init_page(video_id)
- video_info = self._parse_json(self._search_regex(
- (r'(?s)oVideoStatus\s*=\s*({.+?})\s*</script',
- r'(?s)oVideoStatus\s*=\s*({.+})'), init_page, 'video info'),
- video_id)
- if video_info.get('status') == 'NEED_CHANNEL_PLUS':
- self.raise_login_required(
- 'This video is only available for CH+ subscribers')
- long_video_id, key = video_info['vid'], video_info['inkey']
-
- return merge_dicts(
- self._get_common_fields(webpage),
- self._extract_video_info(video_id, long_video_id, key))
-
- def _download_init_page(self, video_id):
- return self._download_webpage(
- 'https://www.vlive.tv/video/init/view',
- video_id, note='Downloading live webpage',
- data=urlencode_postdata({'videoSeq': video_id}),
- headers={
- 'Referer': 'https://www.vlive.tv/video/%s' % video_id,
- 'Content-Type': 'application/x-www-form-urlencoded'
- })
+ def _real_extract(self, url):
+ post_id = self._match_id(url)
+
+ post = self._call_api(
+ 'post/v1.0/post-%s', post_id,
+ 'attachments{video},officialVideo{videoSeq},plainBody,title')
+
+ video_seq = str_or_none(try_get(
+ post, lambda x: x['officialVideo']['videoSeq']))
+ if video_seq:
+ return self.url_result(
+ 'http://www.vlive.tv/video/' + video_seq,
+ VLiveIE.ie_key(), video_seq)
+ title = post['title']
+ entries = []
+ for idx, video in enumerate(post['attachments']['video'].values()):
+ video_id = video.get('videoId')
+ if not video_id:
+ continue
+ upload_type = video.get('uploadType')
+ upload_info = video.get('uploadInfo') or {}
+ entry = None
+ if upload_type == 'SOS':
+ download = self._call_api(
+ self._SOS_TMPL, video_id)['videoUrl']['download']
+ formats = []
+ for f_id, f_url in download.items():
+ formats.append({
+ 'format_id': f_id,
+ 'url': f_url,
+ 'height': int_or_none(f_id[:-1]),
+ })
+ self._sort_formats(formats)
+ entry = {
+ 'formats': formats,
+ 'id': video_id,
+ 'thumbnail': upload_info.get('imageUrl'),
+ }
+ elif upload_type == 'V':
+ vod_id = upload_info.get('videoId')
+ if not vod_id:
+ continue
+ inkey = self._call_api(self._INKEY_TMPL, video_id)['inKey']
+ entry = self._extract_video_info(video_id, vod_id, inkey)
+ if entry:
+ entry['title'] = '%s_part%s' % (title, idx)
+ entries.append(entry)
+ return self.playlist_result(
+ entries, post_id, title, strip_or_none(post.get('plainBody')))
-class VLiveChannelIE(InfoExtractor):
+
+class VLiveChannelIE(VLiveBaseIE):
IE_NAME = 'vlive:channel'
- _VALID_URL = r'https?://channels\.vlive\.tv/(?P<id>[0-9A-Z]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:channels\.vlive\.tv|(?:(?:www|m)\.)?vlive\.tv/channel)/(?P<id>[0-9A-Z]+)'
+ _TESTS = [{
'url': 'http://channels.vlive.tv/FCD4B',
'info_dict': {
'id': 'FCD4B',
'title': 'MAMAMOO',
},
'playlist_mincount': 110
- }
- _APP_ID = '8c6cc7b45d2568fb668be6e05b6e5a3b'
+ }, {
+ 'url': 'https://www.vlive.tv/channel/FCD4B',
+ 'only_matching': True,
+ }]
+
+ def _call_api(self, path, channel_key_suffix, channel_value, note, query):
+ q = {
+ 'app_id': self._APP_ID,
+ 'channel' + channel_key_suffix: channel_value,
+ }
+ q.update(query)
+ return self._download_json(
+ 'http://api.vfan.vlive.tv/vproxy/channelplus/' + path,
+ channel_value, note='Downloading ' + note, query=q)['result']
def _real_extract(self, url):
channel_code = self._match_id(url)
- webpage = self._download_webpage(
- 'http://channels.vlive.tv/%s/video' % channel_code, channel_code)
-
- app_id = None
-
- app_js_url = self._search_regex(
- r'<script[^>]+src=(["\'])(?P<url>http.+?/app\.js.*?)\1',
- webpage, 'app js', default=None, group='url')
-
- if app_js_url:
- app_js = self._download_webpage(
- app_js_url, channel_code, 'Downloading app JS', fatal=False)
- if app_js:
- app_id = self._search_regex(
- r'Global\.VFAN_APP_ID\s*=\s*[\'"]([^\'"]+)[\'"]',
- app_js, 'app id', default=None)
+ channel_seq = self._call_api(
+ 'decodeChannelCode', 'Code', channel_code,
+ 'decode channel code', {})['channelSeq']
- app_id = app_id or self._APP_ID
-
- channel_info = self._download_json(
- 'http://api.vfan.vlive.tv/vproxy/channelplus/decodeChannelCode',
- channel_code, note='Downloading decode channel code',
- query={
- 'app_id': app_id,
- 'channelCode': channel_code,
- '_': int(time.time())
- })
-
- channel_seq = channel_info['result']['channelSeq']
channel_name = None
entries = []
for page_num in itertools.count(1):
- video_list = self._download_json(
- 'http://api.vfan.vlive.tv/vproxy/channelplus/getChannelVideoList',
- channel_code, note='Downloading channel list page #%d' % page_num,
- query={
- 'app_id': app_id,
- 'channelSeq': channel_seq,
+ video_list = self._call_api(
+ 'getChannelVideoList', 'Seq', channel_seq,
+ 'channel list page #%d' % page_num, {
# Large values of maxNumOfRows (~300 or above) may cause
# empty responses (see [1]), e.g. this happens for [2] that
# has more than 300 videos.
# 1. https://github.com/ytdl-org/youtube-dl/issues/13830
# 2. http://channels.vlive.tv/EDBF.
'maxNumOfRows': 100,
- '_': int(time.time()),
'pageNo': page_num
}
)
@@ -269,99 +305,44 @@ class VLiveChannelIE(InfoExtractor):
if not channel_name:
channel_name = try_get(
video_list,
- lambda x: x['result']['channelInfo']['channelName'],
+ lambda x: x['channelInfo']['channelName'],
compat_str)
videos = try_get(
- video_list, lambda x: x['result']['videoList'], list)
+ video_list, lambda x: x['videoList'], list)
if not videos:
break
for video in videos:
video_id = video.get('videoSeq')
- if not video_id:
+ video_type = video.get('videoType')
+
+ if not video_id or not video_type:
continue
video_id = compat_str(video_id)
- entries.append(
- self.url_result(
- 'http://www.vlive.tv/video/%s' % video_id,
- ie=VLiveIE.ie_key(), video_id=video_id))
+
+            if video_type == 'PLAYLIST':
+ playlist_videos = try_get(
+ video,
+ lambda x: x['videoPlaylist']['videoList'], list)
+ if not playlist_videos:
+ continue
+
+ for playlist_video in playlist_videos:
+ playlist_video_id = playlist_video.get('videoSeq')
+ if not playlist_video_id:
+ continue
+ playlist_video_id = compat_str(playlist_video_id)
+
+ entries.append(
+ self.url_result(
+ 'http://www.vlive.tv/video/%s' % playlist_video_id,
+ ie=VLiveIE.ie_key(), video_id=playlist_video_id))
+ else:
+ entries.append(
+ self.url_result(
+ 'http://www.vlive.tv/video/%s' % video_id,
+ ie=VLiveIE.ie_key(), video_id=video_id))
return self.playlist_result(
entries, channel_code, channel_name)
-
-
-class VLivePlaylistIE(InfoExtractor):
- IE_NAME = 'vlive:playlist'
- _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
- _VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s'
- _TESTS = [{
- # regular working playlist
- 'url': 'https://www.vlive.tv/video/117956/playlist/117963',
- 'info_dict': {
- 'id': '117963',
- 'title': '아이돌룸(IDOL ROOM) 41회 - (여자)아이들'
- },
- 'playlist_mincount': 10
- }, {
- # playlist with no playlistVideoSeqs
- 'url': 'http://www.vlive.tv/video/22867/playlist/22912',
- 'info_dict': {
- 'id': '22867',
- 'ext': 'mp4',
- 'title': '[V LIVE] Valentine Day Message from MINA',
- 'creator': 'TWICE',
- 'view_count': int
- },
- 'params': {
- 'skip_download': True,
- }
- }]
-
- def _build_video_result(self, video_id, message):
- self.to_screen(message)
- return self.url_result(
- self._VIDEO_URL_TEMPLATE % video_id,
- ie=VLiveIE.ie_key(), video_id=video_id)
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id, playlist_id = mobj.group('video_id', 'id')
-
- if self._downloader.params.get('noplaylist'):
- return self._build_video_result(
- video_id,
- 'Downloading just video %s because of --no-playlist'
- % video_id)
-
- self.to_screen(
- 'Downloading playlist %s - add --no-playlist to just download video'
- % playlist_id)
-
- webpage = self._download_webpage(
- 'http://www.vlive.tv/video/%s/playlist/%s'
- % (video_id, playlist_id), playlist_id)
-
- raw_item_ids = self._search_regex(
- r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage,
- 'playlist video seqs', default=None, fatal=False)
-
- if not raw_item_ids:
- return self._build_video_result(
- video_id,
- 'Downloading just video %s because no playlist was found'
- % video_id)
-
- item_ids = self._parse_json(raw_item_ids, playlist_id)
-
- entries = [
- self.url_result(
- self._VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(),
- video_id=compat_str(item_id))
- for item_id in item_ids]
-
- playlist_name = self._html_search_regex(
- r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)',
- webpage, 'playlist title', fatal=False)
-
- return self.playlist_result(entries, playlist_id, playlist_name)
diff --git a/youtube_dlc/extractor/xiami.py b/youtube_dlc/extractor/xiami.py
index 618da8382..769aab331 100644
--- a/youtube_dlc/extractor/xiami.py
+++ b/youtube_dlc/extractor/xiami.py
@@ -54,17 +54,17 @@ class XiamiBaseIE(InfoExtractor):
def _decrypt(origin):
n = int(origin[0])
origin = origin[1:]
- short_lenth = len(origin) // n
- long_num = len(origin) - short_lenth * n
+ short_length = len(origin) // n
+ long_num = len(origin) - short_length * n
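+        # columnar transposition: split into n rows (the first long_num rows get
+        # one extra char), then read the characters back column by column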
l = tuple()
for i in range(0, n):
- length = short_lenth
+ length = short_length
if i < long_num:
length += 1
l += (origin[0:length], )
origin = origin[length:]
ans = ''
- for i in range(0, short_lenth + 1):
+ for i in range(0, short_length + 1):
for j in range(0, n):
if len(l[j]) > i:
ans += l[j][i]
diff --git a/youtube_dlc/extractor/xtube.py b/youtube_dlc/extractor/xtube.py
index 01b253dcb..98d2adb99 100644
--- a/youtube_dlc/extractor/xtube.py
+++ b/youtube_dlc/extractor/xtube.py
@@ -39,22 +39,6 @@ class XTubeIE(InfoExtractor):
'age_limit': 18,
}
}, {
- # FLV videos with duplicated formats
- 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
- 'md5': 'a406963eb349dd43692ec54631efd88b',
- 'info_dict': {
- 'id': '9299752',
- 'display_id': 'A-Super-Run-Part-1-YT',
- 'ext': 'flv',
- 'title': 'A Super Run - Part 1 (YT)',
- 'description': 'md5:4cc3af1aa1b0413289babc88f0d4f616',
- 'uploader': 'tshirtguy59',
- 'duration': 579,
- 'view_count': int,
- 'comment_count': int,
- 'age_limit': 18,
- },
- }, {
# new URL schema
'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
'only_matching': True,
@@ -90,7 +74,7 @@ class XTubeIE(InfoExtractor):
title, thumbnail, duration = [None] * 3
config = self._parse_json(self._search_regex(
- r'playerConf\s*=\s*({.+?})\s*,\s*\n', webpage, 'config',
+ r'playerConf\s*=\s*({.+?})\s*,\s*(?:\n|loaderConf)', webpage, 'config',
default='{}'), video_id, transform_source=js_to_json, fatal=False)
if config:
config = config.get('mainRoll')
diff --git a/youtube_dlc/extractor/youporn.py b/youtube_dlc/extractor/youporn.py
index e7fca22de..7b9feafeb 100644
--- a/youtube_dlc/extractor/youporn.py
+++ b/youtube_dlc/extractor/youporn.py
@@ -29,7 +29,6 @@ class YouPornIE(InfoExtractor):
'upload_date': '20101217',
'average_rating': int,
'view_count': int,
- 'comment_count': int,
'categories': list,
'tags': list,
'age_limit': 18,
@@ -48,7 +47,6 @@ class YouPornIE(InfoExtractor):
'upload_date': '20110418',
'average_rating': int,
'view_count': int,
- 'comment_count': int,
'categories': list,
'tags': list,
'age_limit': 18,
@@ -156,7 +154,8 @@ class YouPornIE(InfoExtractor):
r'(?s)<div[^>]+class=["\']submitByLink["\'][^>]*>(.+?)</div>',
webpage, 'uploader', fatal=False)
upload_date = unified_strdate(self._html_search_regex(
- [r'Date\s+[Aa]dded:\s*<span>([^<]+)',
+ [r'UPLOADED:\s*<span>([^<]+)',
+ r'Date\s+[Aa]dded:\s*<span>([^<]+)',
r'(?s)<div[^>]+class=["\']videoInfo(?:Date|Time)["\'][^>]*>(.+?)</div>'],
webpage, 'upload date', fatal=False))
@@ -171,7 +170,7 @@ class YouPornIE(InfoExtractor):
webpage, 'view count', fatal=False, group='count'))
comment_count = str_to_int(self._search_regex(
r'>All [Cc]omments? \(([\d,.]+)\)',
- webpage, 'comment count', fatal=False))
+ webpage, 'comment count', default=None))
def extract_tag_box(regex, title):
tag_box = self._search_regex(regex, webpage, title, default=None)
diff --git a/youtube_dlc/extractor/youtube.py b/youtube_dlc/extractor/youtube.py
index 4fb49b864..e87692754 100644
--- a/youtube_dlc/extractor/youtube.py
+++ b/youtube_dlc/extractor/youtube.py
@@ -16,7 +16,6 @@ from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter
from ..compat import (
compat_chr,
- compat_HTTPError,
compat_kwargs,
compat_parse_qs,
compat_urllib_parse_unquote,
@@ -30,14 +29,11 @@ from ..utils import (
bool_or_none,
clean_html,
error_to_compat_str,
- extract_attributes,
ExtractorError,
float_or_none,
- get_element_by_attribute,
get_element_by_id,
int_or_none,
mimetype2ext,
- orderedSet,
parse_codecs,
parse_count,
parse_duration,
@@ -50,9 +46,11 @@ from ..utils import (
unescapeHTML,
unified_strdate,
unsmuggle_url,
+ update_url_query,
uppercase_escape,
url_or_none,
urlencode_postdata,
+ urljoin,
)
@@ -65,11 +63,16 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
_CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
_TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
+ _RESERVED_NAMES = (
+ r'course|embed|channel|c|user|playlist|watch|w|results|storefront|oops|'
+ r'shared|index|account|reporthistory|t/terms|about|upload|signin|logout|'
+ r'feed/(watch_later|history|subscriptions|library|trending|recommended)')
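+    # first URL path components that can never be a channel or user name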
+
_NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
+ _PLAYLIST_ID_RE = r'(?:(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}|RDMM|WL|LL|LM)'
_YOUTUBE_CLIENT_HEADERS = {
'x-youtube-client-name': '1',
@@ -274,11 +277,19 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def _download_webpage_handle(self, *args, **kwargs):
query = kwargs.get('query', {}).copy()
- query['disable_polymer'] = 'true'
kwargs['query'] = query
return super(YoutubeBaseInfoExtractor, self)._download_webpage_handle(
*args, **compat_kwargs(kwargs))
+ def _get_yt_initial_data(self, video_id, webpage):
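+        # the (?<=}) lookbehind ensures the lazy capture stops at a closing brace before ';'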
+ config = self._search_regex(
+ (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
+ r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
+ webpage, 'ytInitialData', default=None)
+ if config:
+ return self._parse_json(
+ uppercase_escape(config), video_id, fatal=False)
+
def _real_initialize(self):
if self._downloader is None:
return
@@ -286,93 +297,36 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if not self._login():
return
+ _DEFAULT_API_DATA = {
+ 'context': {
+ 'client': {
+ 'clientName': 'WEB',
+ 'clientVersion': '2.20201021.03.00',
+ }
+ },
+ }
-class YoutubeEntryListBaseInfoExtractor(YoutubeBaseInfoExtractor):
- # Extract entries from page with "Load more" button
- def _entries(self, page, playlist_id):
- more_widget_html = content_html = page
- for page_num in itertools.count(1):
- for entry in self._process_page(content_html):
- yield entry
-
- mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
- if not mobj:
- break
-
- count = 0
- retries = 3
- while count <= retries:
- try:
- # Downloading page may result in intermittent 5xx HTTP error
- # that is usually worked around with a retry
- more = self._download_json(
- 'https://www.youtube.com/%s' % mobj.group('more'), playlist_id,
- 'Downloading page #%s%s'
- % (page_num, ' (retry #%d)' % count if count else ''),
- transform_source=uppercase_escape,
- headers=self._YOUTUBE_CLIENT_HEADERS)
- break
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (500, 503):
- count += 1
- if count <= retries:
- continue
- raise
-
- content_html = more['content_html']
- if not content_html.strip():
- # Some webpages show a "Load more" button but they don't
- # have more videos
- break
- more_widget_html = more['load_more_widget_html']
+ _YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;'
+ def _call_api(self, ep, query, video_id):
+ data = self._DEFAULT_API_DATA.copy()
+ data.update(query)
-class YoutubePlaylistBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
- def _process_page(self, content):
- for video_id, video_title in self.extract_videos_from_page(content):
- yield self.url_result(video_id, 'Youtube', video_id, video_title)
+ response = self._download_json(
+ 'https://www.youtube.com/youtubei/v1/%s' % ep, video_id=video_id,
+ note='Downloading API JSON', errnote='Unable to download API page',
+ data=json.dumps(data).encode('utf8'),
+ headers={'content-type': 'application/json'},
+ query={'key': 'AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8'})
- def extract_videos_from_page_impl(self, video_re, page, ids_in_page, titles_in_page):
- for mobj in re.finditer(video_re, page):
- # The link with index 0 is not the first video of the playlist (not sure if still actual)
- if 'index' in mobj.groupdict() and mobj.group('id') == '0':
- continue
- video_id = mobj.group('id')
- video_title = unescapeHTML(
- mobj.group('title')) if 'title' in mobj.groupdict() else None
- if video_title:
- video_title = video_title.strip()
- if video_title == '► Play all':
- video_title = None
- try:
- idx = ids_in_page.index(video_id)
- if video_title and not titles_in_page[idx]:
- titles_in_page[idx] = video_title
- except ValueError:
- ids_in_page.append(video_id)
- titles_in_page.append(video_title)
-
- def extract_videos_from_page(self, page):
- ids_in_page = []
- titles_in_page = []
- self.extract_videos_from_page_impl(
- self._VIDEO_RE, page, ids_in_page, titles_in_page)
- return zip(ids_in_page, titles_in_page)
-
-
-class YoutubePlaylistsBaseInfoExtractor(YoutubeEntryListBaseInfoExtractor):
- def _process_page(self, content):
- for playlist_id in orderedSet(re.findall(
- r'<h3[^>]+class="[^"]*yt-lockup-title[^"]*"[^>]*><a[^>]+href="/?playlist\?list=([0-9A-Za-z-_]{10,})"',
- content)):
- yield self.url_result(
- 'https://www.youtube.com/playlist?list=%s' % playlist_id, 'YoutubePlaylist')
+ return response
- def _real_extract(self, url):
- playlist_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
- title = self._og_search_title(webpage, fatal=False)
- return self.playlist_result(self._entries(webpage, playlist_id), playlist_id, title)
+ def _extract_yt_initial_data(self, video_id, webpage):
+ return self._parse_json(
+ self._search_regex(
+ (r'%s\s*\n' % self._YT_INITIAL_DATA_RE,
+ self._YT_INITIAL_DATA_RE), webpage, 'yt initial data'),
+ video_id)
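For orientation, a standalone Python 3 sketch of the request _call_api assembles: the default WEB client context plus the caller's query are POSTed as JSON to the InnerTube endpoint, keyed with the public API key shown above. The browse id is a made-up placeholder, and nothing in the diff guarantees the endpoint still answers this way.

    import json
    import urllib.request

    # Body mirrors _DEFAULT_API_DATA with the caller's query merged in
    data = {
        'context': {
            'client': {'clientName': 'WEB', 'clientVersion': '2.20201021.03.00'},
        },
        'browseId': 'UC0123456789abcdefghijkl',  # placeholder channel id
    }
    req = urllib.request.Request(
        'https://www.youtube.com/youtubei/v1/browse'
        '?key=AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8',
        data=json.dumps(data).encode('utf8'),
        headers={'content-type': 'application/json'})
    response = json.load(urllib.request.urlopen(req))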
class YoutubeIE(YoutubeBaseInfoExtractor):
@@ -433,7 +387,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
|(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
)
)? # all until now is optional -> you can pass the naked ID
- ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
+ (?P<id>[0-9A-Za-z_-]{11}) # here it is! the YouTube video ID
(?!.*?\blist=
(?:
%(playlist_id)s| # combined list/video URLs are handled by the playlist IE
@@ -597,7 +551,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
},
{
- 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=UxxajLWwzqY',
+ 'url': 'https://www.youtube.com/watch?v=BaW_jenozKc&v=yZIXLfi8CZQ',
'note': 'Use the first video ID in the URL',
'info_dict': {
'id': 'BaW_jenozKc',
@@ -638,6 +592,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
'skip': 'format 141 not served anymore',
},
+ # DASH manifest with encrypted signature
+ {
+ 'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
+ 'info_dict': {
+ 'id': 'IB3lcPjvWLA',
+ 'ext': 'm4a',
+ 'title': 'Afrojack, Spree Wilson - The Spark (Official Music Video) ft. Spree Wilson',
+ 'description': 'md5:8f5e2b82460520b619ccac1f509d43bf',
+ 'duration': 244,
+ 'uploader': 'AfrojackVEVO',
+ 'uploader_id': 'AfrojackVEVO',
+ 'upload_date': '20131011',
+ },
+ 'params': {
+ 'youtube_include_dash_manifest': True,
+ 'format': '141/bestaudio[ext=m4a]',
+ },
+ },
# Controversy video
{
'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
@@ -669,6 +641,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'age_limit': 18,
},
},
+ # video_info is None (https://github.com/ytdl-org/youtube-dl/issues/4421)
+ # YouTube Red ad is not captured for creator
+ {
+ 'url': '__2ABJjxzNo',
+ 'info_dict': {
+ 'id': '__2ABJjxzNo',
+ 'ext': 'mp4',
+ 'duration': 266,
+ 'upload_date': '20100430',
+ 'uploader_id': 'deadmau5',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/deadmau5',
+ 'creator': 'Dada Life, deadmau5',
+ 'description': 'md5:12c56784b8032162bb936a5f76d55360',
+ 'uploader': 'deadmau5',
+ 'title': 'Deadmau5 - Some Chords (HD)',
+ 'alt_title': 'This Machine Kills Some Chords',
+ },
+ 'expected_warnings': [
+ 'DASH manifest missing',
+ ]
+ },
# Olympics (https://github.com/ytdl-org/youtube-dl/issues/4431)
{
'url': 'lqQg6PlCWgI',
@@ -1008,10 +1001,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'only_matching': True,
},
{
- 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
- 'only_matching': True,
- },
- {
'url': 'https://invidio.us/watch?v=BaW_jenozKc',
'only_matching': True,
},
@@ -1063,73 +1052,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
},
{
- # Youtube Music Auto-generated description
- # Retrieve 'artist' field from 'Artist:' in video description
- # when it is present on youtube music video
- 'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
- 'info_dict': {
- 'id': 'k0jLE7tTwjY',
- 'ext': 'mp4',
- 'title': 'Latch Feat. Sam Smith',
- 'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
- 'upload_date': '20150110',
- 'uploader': 'Various Artists - Topic',
- 'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
- 'artist': 'Disclosure',
- 'track': 'Latch Feat. Sam Smith',
- 'album': 'Latch Featuring Sam Smith',
- 'release_date': '20121008',
- 'release_year': 2012,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- # Youtube Music Auto-generated description
- # handle multiple artists on youtube music video
- 'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
- 'info_dict': {
- 'id': '74qn0eJSjpA',
- 'ext': 'mp4',
- 'title': 'Eastside',
- 'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
- 'upload_date': '20180710',
- 'uploader': 'Benny Blanco - Topic',
- 'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
- 'artist': 'benny blanco, Halsey, Khalid',
- 'track': 'Eastside',
- 'album': 'Eastside',
- 'release_date': '20180713',
- 'release_year': 2018,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
- # Youtube Music Auto-generated description
- # handle youtube music video with release_year and no release_date
- 'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
- 'info_dict': {
- 'id': '-hcAI0g-f5M',
- 'ext': 'mp4',
- 'title': 'Put It On Me',
- 'description': 'md5:f6422397c07c4c907c6638e1fee380a5',
- 'upload_date': '20180426',
- 'uploader': 'Matt Maeson - Topic',
- 'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
- 'artist': 'Matt Maeson',
- 'track': 'Put It On Me',
- 'album': 'The Hearse',
- 'release_date': None,
- 'release_year': 2018,
- },
- 'params': {
- 'skip_download': True,
- },
- },
- {
'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
'only_matching': True,
},
@@ -1169,6 +1091,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': True,
},
},
+ {
+ # with '};' inside yt initial data (see https://github.com/ytdl-org/youtube-dl/issues/27093)
+ 'url': 'https://www.youtube.com/watch?v=CHqg6qOn4no',
+ 'info_dict': {
+ 'id': 'CHqg6qOn4no',
+ 'ext': 'mp4',
+ 'title': 'Part 77 Sort a list of simple types in c#',
+ 'description': 'md5:b8746fa52e10cdbf47997903f13b20dc',
+ 'upload_date': '20130831',
+ 'uploader_id': 'kudvenkat',
+ 'uploader': 'kudvenkat',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
]
def __init__(self, *args, **kwargs):
@@ -1397,15 +1335,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self._parse_json(
uppercase_escape(config), video_id, fatal=False)
- def _get_yt_initial_data(self, video_id, webpage):
- config = self._search_regex(
- (r'window\["ytInitialData"\]\s*=\s*(.*?)(?<=});',
- r'var\s+ytInitialData\s*=\s*(.*?)(?<=});'),
- webpage, 'ytInitialData', default=None)
- if config:
- return self._parse_json(
- uppercase_escape(config), video_id, fatal=False)
-
def _get_automatic_captions(self, video_id, webpage):
"""We need the webpage for getting the captions url, pass it as an
argument to speed up the process."""
@@ -1481,21 +1410,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_response, video_id, fatal=False)
if player_response:
renderer = player_response['captions']['playerCaptionsTracklistRenderer']
- caption_tracks = renderer['captionTracks']
- for caption_track in caption_tracks:
- if 'kind' not in caption_track:
- # not an automatic transcription
- continue
- base_url = caption_track['baseUrl']
- sub_lang_list = []
- for lang in renderer['translationLanguages']:
- lang_code = lang.get('languageCode')
- if lang_code:
- sub_lang_list.append(lang_code)
- return make_captions(base_url, sub_lang_list)
-
- self._downloader.report_warning("Couldn't find automatic captions for %s" % video_id)
- return {}
+ base_url = renderer['captionTracks'][0]['baseUrl']
+ sub_lang_list = []
+ for lang in renderer['translationLanguages']:
+ lang_code = lang.get('languageCode')
+ if lang_code:
+ sub_lang_list.append(lang_code)
+ return make_captions(base_url, sub_lang_list)
+
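The replacement takes the first caption track unconditionally, where the removed code only accepted tracks carrying a 'kind' key (i.e. automatic transcriptions), so a manually uploaded track's baseUrl may now be used as the translation source. A hedged sample of the renderer fragment the new block consumes, with invented values:

    renderer = {
        'captionTracks': [
            {'baseUrl': 'https://www.youtube.com/api/timedtext?v=PLACEHOLDER&lang=en'},
        ],
        'translationLanguages': [{'languageCode': 'en'}, {'languageCode': 'de'}],
    }
    base_url = renderer['captionTracks'][0]['baseUrl']
    sub_lang_list = [
        lang['languageCode'] for lang in renderer['translationLanguages']
        if lang.get('languageCode')]
    # make_captions(base_url, sub_lang_list) then derives one URL per language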
# Some videos don't provide ttsurl but rather caption_tracks and
# caption_translation_languages (e.g. 20LmZk1hakA)
# No longer used as of 22.06.2017
@@ -1589,15 +1511,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_chapters_from_json(self, webpage, video_id, duration):
if not webpage:
return
- initial_data = self._parse_json(
- self._search_regex(
- r'window\["ytInitialData"\] = (.+);\n', webpage,
- 'player args', default='{}'),
- video_id, fatal=False)
- if not initial_data or not isinstance(initial_data, dict):
+ data = self._extract_yt_initial_data(video_id, webpage)
+ if not data or not isinstance(data, dict):
return
chapters_list = try_get(
- initial_data,
+ data,
lambda x: x['playerOverlays']
['playerOverlayRenderer']
['decoratedPlayerBarRenderer']
@@ -1784,21 +1702,24 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Try looking directly into the video webpage
ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
if ytplayer_config:
- args = ytplayer_config['args']
- if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
- # Convert to the same format returned by compat_parse_qs
- video_info = dict((k, [v]) for k, v in args.items())
- add_dash_mpd(video_info)
- # Rental video is not rented but preview is available (e.g.
- # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
- # https://github.com/ytdl-org/youtube-dl/issues/10532)
- if not video_info and args.get('ypc_vid'):
- return self.url_result(
- args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
- if args.get('livestream') == '1' or args.get('live_playback') == 1:
- is_live = True
- if not player_response:
- player_response = extract_player_response(args.get('player_response'), video_id)
+ args = ytplayer_config.get("args")
+ if args is not None:
+ if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
+ # Convert to the same format returned by compat_parse_qs
+ video_info = dict((k, [v]) for k, v in args.items())
+ add_dash_mpd(video_info)
+ # Rental video is not rented but preview is available (e.g.
+ # https://www.youtube.com/watch?v=yYr8q0y5Jfg,
+ # https://github.com/ytdl-org/youtube-dl/issues/10532)
+ if not video_info and args.get('ypc_vid'):
+ return self.url_result(
+ args['ypc_vid'], YoutubeIE.ie_key(), video_id=args['ypc_vid'])
+ if args.get('livestream') == '1' or args.get('live_playback') == 1:
+ is_live = True
+ if not player_response:
+ player_response = extract_player_response(args.get('player_response'), video_id)
+ elif not player_response:
+ player_response = ytplayer_config
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
add_dash_mpd_pr(player_response)
else:
@@ -1829,7 +1750,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Try looking directly into the video webpage
ytplayer_config = self._get_ytplayer_config(video_id, video_webpage)
if ytplayer_config:
- args = ytplayer_config['args']
+ args = ytplayer_config.get('args', {})
if args.get('url_encoded_fmt_stream_map') or args.get('hlsvp'):
# Convert to the same format returned by compat_parse_qs
video_info = dict((k, [v]) for k, v in args.items())
@@ -1847,6 +1768,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not video_info or self._downloader.params.get('youtube_include_dash_manifest', True):
add_dash_mpd_pr(player_response)
+ if not video_info and not player_response:
+ player_response = extract_player_response(
+ self._search_regex(
+ r'ytInitialPlayerResponse\s*=\s*({.+?})\s*;', video_webpage,
+ 'initial player response', default='{}'),
+ video_id)
+
def extract_unavailable_message():
messages = []
for tag, kind in (('h1', 'message'), ('div', 'submessage')):
@@ -2051,7 +1979,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if cipher:
if 's' in url_data or self._downloader.params.get('youtube_include_dash_manifest', True):
- ASSETS_RE = r'"assets":.+?"js":\s*("[^"]+")'
+ ASSETS_RE = (
+ r'<script[^>]+\bsrc=("[^"]+")[^>]+\bname=["\']player_ias/base',
+ r'"jsUrl"\s*:\s*("[^"]+")',
+ r'"assets":.+?"js":\s*("[^"]+")')
jsplayer_url_json = self._search_regex(
ASSETS_RE,
embed_webpage if age_gate else video_webpage,
@@ -2187,6 +2118,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
else:
error_message = extract_unavailable_message()
if not error_message:
+ reason_list = try_get(
+ player_response,
+ lambda x: x['playabilityStatus']['errorScreen']['playerErrorMessageRenderer']['subreason']['runs'],
+ list) or []
+ for reason in reason_list:
+ if not isinstance(reason, dict):
+ continue
+ reason_text = try_get(reason, lambda x: x['text'], compat_str)
+ if reason_text:
+ if not error_message:
+ error_message = ''
+ error_message += reason_text
+ if error_message:
+ error_message = clean_html(error_message)
+ if not error_message:
error_message = clean_html(try_get(
player_response, lambda x: x['playabilityStatus']['reason'],
compat_str))
@@ -2311,7 +2257,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
# Youtube Music Auto-generated description
release_date = release_year = None
if video_description:
- mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description)
+ mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description)
if mobj:
if not track:
track = mobj.group('track').strip()
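A worked example of the updated regex, reusing the Latch/Disclosure metadata from the Music test removed above (the label text is invented). The anchor is now the trailing "Auto-generated by YouTube." line rather than the "Provided to YouTube by" prefix, so descriptions missing that prefix still match.

    import re

    DESC_RE = (r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)'
               r'(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?'
               r'(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?'
               r'(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?'
               r'.+\nAuto-generated by YouTube\.\s*$')
    description = ('Provided to YouTube by SomeLabel\n\n'
                   'Latch Feat. Sam Smith · Disclosure\n\nSettle\n\n'
                   '℗ 2012 SomeLabel\n\nReleased on: 2012-10-08\n\n'
                   'Auto-generated by YouTube.')
    mobj = re.search(DESC_RE, description)
    print(mobj.group('track').strip(), '|', mobj.group('artist').strip())
    # Latch Feat. Sam Smith | Disclosure
    print(mobj.group('album'), mobj.group('release_year'), mobj.group('release_date'))
    # Settle 2012 2012-10-08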
@@ -2328,6 +2274,34 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if release_year:
release_year = int(release_year)
+ yt_initial_data = self._extract_yt_initial_data(video_id, video_webpage)
+ contents = try_get(yt_initial_data, lambda x: x['contents']['twoColumnWatchNextResults']['results']['results']['contents'], list) or []
+ for content in contents:
+ rows = try_get(content, lambda x: x['videoSecondaryInfoRenderer']['metadataRowContainer']['metadataRowContainerRenderer']['rows'], list) or []
+ multiple_songs = False
+ for row in rows:
+ if try_get(row, lambda x: x['metadataRowRenderer']['hasDividerLine']) is True:
+ multiple_songs = True
+ break
+ for row in rows:
+ mrr = row.get('metadataRowRenderer') or {}
+ mrr_title = try_get(
+ mrr, lambda x: x['title']['simpleText'], compat_str)
+ mrr_contents = try_get(
+ mrr, lambda x: x['contents'][0], dict) or {}
+ mrr_contents_text = try_get(mrr_contents, [lambda x: x['simpleText'], lambda x: x['runs'][0]['text']], compat_str)
+ if not (mrr_title and mrr_contents_text):
+ continue
+ if mrr_title == 'License':
+ video_license = mrr_contents_text
+ elif not multiple_songs:
+ if mrr_title == 'Album':
+ album = mrr_contents_text
+ elif mrr_title == 'Artist':
+ artist = mrr_contents_text
+ elif mrr_title == 'Song':
+ track = mrr_contents_text
+
m_episode = re.search(
r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
video_webpage)
@@ -2359,8 +2333,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_count(count_name):
return str_to_int(self._search_regex(
- r'"accessibilityData":\{"label":"([\d,\w]+) %ss"\}'
- % re.escape(count_name),
+ (r'-%s-button[^>]+><span[^>]+class="yt-uix-button-content"[^>]*>([\d,]+)</span>' % re.escape(count_name),
+ r'["\']label["\']\s*:\s*["\']([\d,.]+)\s+%ss["\']' % re.escape(count_name)),
video_webpage, count_name, default=None))
like_count = _extract_count('like')
@@ -2537,38 +2511,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
}
-class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
- IE_DESC = 'YouTube.com playlists'
- _VALID_URL = r"""(?x)(?:
- (?:https?://)?
+class YoutubeTabIE(YoutubeBaseInfoExtractor):
+ IE_DESC = 'YouTube.com tab'
+ _VALID_URL = r'''(?x)
+ https?://
(?:\w+\.)?
(?:
- (?:
- youtube(?:kids)?\.com|
- invidio\.us
- )
- /
- (?:
- (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11}))
- \? (?:.*?[&;])*? (?:p|a|list)=
- | p/
+ youtube(?:kids)?\.com|
+ invidio\.us
+ )/
+ (?:
+ (?:channel|c|user)/|
+ (?P<not_channel>
+ feed/|
+ (?:playlist|watch)\?.*?\blist=
)|
- youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
+ (?!(%s)([/#?]|$)) # Direct URLs
)
- (
- (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
- # Top tracks, they can also include dots
- |(?:MC)[\w\.]*
- )
- .*
- |
- (%(playlist_id)s)
- )""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
- _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
- _VIDEO_RE_TPL = r'href="\s*/watch\?v=%s(?:&amp;(?:[^"]*?index=(?P<index>\d+))?(?:[^>]+>(?P<title>[^<]+))?)?'
- _VIDEO_RE = _VIDEO_RE_TPL % r'(?P<id>[0-9A-Za-z_-]{11})'
- IE_NAME = 'youtube:playlist'
+ (?P<id>[^/?\#&]+)
+ ''' % YoutubeBaseInfoExtractor._RESERVED_NAMES
+ IE_NAME = 'youtube:tab'
+
_TESTS = [{
+ # playlists, multipage
+ 'url': 'https://www.youtube.com/c/ИгорьКлейнер/playlists?view=1&flow=grid',
+ 'playlist_mincount': 94,
+ 'info_dict': {
+ 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'title': 'Игорь Клейнер - Playlists',
+ 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
+ },
+ }, {
+ # playlists, multipage, different order
+ 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
+ 'playlist_mincount': 94,
+ 'info_dict': {
+ 'id': 'UCqj7Cz7revf5maW9g5pgNcg',
+ 'title': 'Игорь Клейнер - Playlists',
+ 'description': 'md5:be97ee0f14ee314f1f002cf187166ee2',
+ },
+ }, {
+ # playlists, singlepage
+ 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
+ 'playlist_mincount': 4,
+ 'info_dict': {
+ 'id': 'UCAEtajcuhQ6an9WEzY9LEMQ',
+ 'title': 'ThirstForScience - Playlists',
+ 'description': 'md5:609399d937ea957b0f53cbffb747a14c',
+ }
+ }, {
+ 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
+ 'only_matching': True,
+ }, {
+ # basic, single video playlist
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
'info_dict': {
'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
@@ -2578,6 +2573,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
},
'playlist_count': 1,
}, {
+ # empty playlist
'url': 'https://www.youtube.com/playlist?list=PL4lCao7KL_QFodcLWhDpGCYnngnHtQ-Xf',
'info_dict': {
'uploader_id': 'UCmlqkdCBesrv2Lak1mF_MxA',
@@ -2587,71 +2583,92 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
},
'playlist_count': 0,
}, {
- 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
- 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ # Home tab
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/featured',
'info_dict': {
- 'title': '29C3: Not my department',
- 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
- 'uploader': 'Christiaan008',
- 'uploader_id': 'ChRiStIaAn008',
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Home',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
},
- 'playlist_count': 96,
+ 'playlist_mincount': 2,
}, {
- 'note': 'issue #673',
- 'url': 'PLBB231211A4F62143',
+ # Videos tab
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos',
'info_dict': {
- 'title': '[OLD]Team Fortress 2 (Class-based LP)',
- 'id': 'PLBB231211A4F62143',
- 'uploader': 'Wickydoo',
- 'uploader_id': 'Wickydoo',
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Videos',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
},
- 'playlist_mincount': 26,
+ 'playlist_mincount': 975,
}, {
- 'note': 'Large playlist',
- 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
+ # Videos tab, sorted by popular
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/videos?view=0&sort=p&flow=grid',
'info_dict': {
- 'title': 'Uploads from Cauchemar',
- 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
- 'uploader': 'Cauchemar',
- 'uploader_id': 'Cauchemar89',
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Videos',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
},
- 'playlist_mincount': 799,
+ 'playlist_mincount': 199,
}, {
- 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+ # Playlists tab
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/playlists',
'info_dict': {
- 'title': 'YDL_safe_search',
- 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Playlists',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
},
- 'playlist_count': 2,
- 'skip': 'This playlist is private',
+ 'playlist_mincount': 17,
}, {
- 'note': 'embedded',
- 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
- 'playlist_count': 4,
+ # Community tab
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/community',
'info_dict': {
- 'title': 'JODA15',
- 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
- 'uploader': 'milan',
- 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
- }
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Community',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ },
+ 'playlist_mincount': 18,
}, {
- 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
- 'playlist_mincount': 485,
+ # Channels tab
+ 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w/channels',
'info_dict': {
- 'title': '2018 Chinese New Singles (11/6 updated)',
- 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
- 'uploader': 'LBK',
- 'uploader_id': 'sdragonfang',
- }
+ 'id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
+ 'title': 'lex will - Channels',
+ 'description': 'md5:2163c5d0ff54ed5f598d6a7e6211e488',
+ },
+ 'playlist_mincount': 138,
}, {
- 'note': 'Embedded SWF player',
- 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
- 'playlist_count': 4,
+ 'url': 'https://invidio.us/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtubekids.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://music.youtube.com/channel/UCmlqkdCBesrv2Lak1mF_MxA',
+ 'only_matching': True,
+ }, {
+ 'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
+ 'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ 'info_dict': {
+ 'title': '29C3: Not my department',
+ 'id': 'PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
+ 'uploader': 'Christiaan008',
+ 'uploader_id': 'UCEPzS1rYsrkqzSLNp76nrcg',
+ },
+ 'playlist_count': 96,
+ }, {
+ 'note': 'Large playlist',
+ 'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
'info_dict': {
- 'title': 'JODA7',
- 'id': 'YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ',
+ 'title': 'Uploads from Cauchemar',
+ 'id': 'UUBABnxM4Ar9ten8Mdjj1j0Q',
+ 'uploader': 'Cauchemar',
+ 'uploader_id': 'UCBABnxM4Ar9ten8Mdjj1j0Q',
},
- 'skip': 'This playlist does not exist',
+ 'playlist_mincount': 1123,
+ }, {
+ # even larger playlist, 8832 videos
+ 'url': 'http://www.youtube.com/user/NASAgovVideo/videos',
+ 'only_matching': True,
}, {
'note': 'Buggy playlist: the webpage has a "Load more" button but it doesn\'t have more videos',
'url': 'https://www.youtube.com/playlist?list=UUXw-G3eDE9trcvY2sBMM_aA',
@@ -2659,10 +2676,23 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'title': 'Uploads from Interstellar Movie',
'id': 'UUXw-G3eDE9trcvY2sBMM_aA',
'uploader': 'Interstellar Movie',
- 'uploader_id': 'InterstellarMovie1',
+ 'uploader_id': 'UCXw-G3eDE9trcvY2sBMM_aA',
},
'playlist_mincount': 21,
}, {
+ # https://github.com/ytdl-org/youtube-dl/issues/21844
+ 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+ 'info_dict': {
+ 'title': 'Data Analysis with Dr Mike Pound',
+ 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+ 'uploader_id': 'UC9-y-6csu5WGm29I7JiwpnA',
+ 'uploader': 'Computerphile',
+ },
+ 'playlist_mincount': 11,
+ }, {
+ 'url': 'https://invidio.us/playlist?list=PL4lCao7KL_QFVb7Iudeipvc2BCavECqzc',
+ 'only_matching': True,
+ }, {
# Playlist URL that does not actually serve a playlist
'url': 'https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4',
'info_dict': {
@@ -2687,470 +2717,719 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
'skip': 'This video is not available.',
'add_ie': [YoutubeIE.ie_key()],
}, {
- 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
+ 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/watch?v=MuAGGZNfUkU&list=RDMM',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.youtube.com/channel/UCoMdktPbSTixAyNGwb-UYkQ/live',
'info_dict': {
- 'id': 'yeWKywCrFtk',
+ 'id': '9Auq9mYxFEE',
'ext': 'mp4',
- 'title': 'Small Scale Baler and Braiding Rugs',
- 'uploader': 'Backus-Page House Museum',
- 'uploader_id': 'backuspagemuseum',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
- 'upload_date': '20161008',
- 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
- 'categories': ['Nonprofits & Activism'],
+ 'title': 'Watch Sky News live',
+ 'uploader': 'Sky News',
+ 'uploader_id': 'skynews',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/skynews',
+ 'upload_date': '20191102',
+ 'description': 'md5:78de4e1c2359d0ea3ed829678e38b662',
+ 'categories': ['News & Politics'],
'tags': list,
'like_count': int,
'dislike_count': int,
},
'params': {
- 'noplaylist': True,
'skip_download': True,
},
}, {
- # https://github.com/ytdl-org/youtube-dl/issues/21844
- 'url': 'https://www.youtube.com/playlist?list=PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
+ 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
'info_dict': {
- 'title': 'Data Analysis with Dr Mike Pound',
- 'id': 'PLzH6n4zXuckpfMu_4Ff8E7Z1behQks5ba',
- 'uploader_id': 'Computerphile',
- 'uploader': 'Computerphile',
+ 'id': 'a48o2S1cPoo',
+ 'ext': 'mp4',
+ 'title': 'The Young Turks - Live Main Show',
+ 'uploader': 'The Young Turks',
+ 'uploader_id': 'TheYoungTurks',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
+ 'upload_date': '20150715',
+ 'license': 'Standard YouTube License',
+ 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
+ 'categories': ['News & Politics'],
+ 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
+ 'like_count': int,
+ 'dislike_count': int,
},
- 'playlist_mincount': 11,
+ 'params': {
+ 'skip_download': True,
+ },
+ 'only_matching': True,
}, {
- 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
+ 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
'only_matching': True,
}, {
- 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
+ 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
'only_matching': True,
}, {
- # music album playlist
- 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
+ 'url': 'https://www.youtube.com/feed/trending',
'only_matching': True,
}, {
- 'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
+ # needs auth
+ 'url': 'https://www.youtube.com/feed/library',
'only_matching': True,
}, {
- 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
+ # needs auth
+ 'url': 'https://www.youtube.com/feed/history',
'only_matching': True,
- }]
+ }, {
+ # needs auth
+ 'url': 'https://www.youtube.com/feed/subscriptions',
+ 'only_matching': True,
+ }, {
+ # needs auth
+ 'url': 'https://www.youtube.com/feed/watch_later',
+ 'only_matching': True,
+ }, {
+ # no longer available?
+ 'url': 'https://www.youtube.com/feed/recommended',
+ 'only_matching': True,
+ }
+ # TODO
+ # {
+ # 'url': 'https://www.youtube.com/TheYoungTurks/live',
+ # 'only_matching': True,
+ # }
+ ]
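To see what the new _VALID_URL accepts, here is a simplified, runnable approximation (the reserved-name list is truncated and the not_channel branch is omitted): vanity URLs match only when the first path component is not a reserved name, while plain watch URLs fail the negative lookahead and are left to YoutubeIE.

    import re

    RESERVED = r'watch|playlist|results|feed/(?:history|subscriptions)'
    TAB_RE = re.compile(
        r'https?://(?:\w+\.)?youtube\.com/'
        r'(?:(?:channel|c|user)/|(?!(?:%s)(?:[/#?]|$)))'
        r'(?P<id>[^/?#&]+)' % RESERVED)

    for url in ('https://www.youtube.com/c/ChristophLaimer/playlists',
                'https://www.youtube.com/NASAgovVideo/videos',
                'https://www.youtube.com/watch?v=BaW_jenozKc'):
        m = TAB_RE.match(url)
        print(url, '->', m.group('id') if m else 'no match')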
- def _real_initialize(self):
- self._login()
+ def _extract_channel_id(self, webpage):
+ channel_id = self._html_search_meta(
+ 'channelId', webpage, 'channel id', default=None)
+ if channel_id:
+ return channel_id
+ channel_url = self._html_search_meta(
+ ('og:url', 'al:ios:url', 'al:android:url', 'al:web:url',
+ 'twitter:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad',
+ 'twitter:app:url:googleplay'), webpage, 'channel url')
+ return self._search_regex(
+ r'https?://(?:www\.)?youtube\.com/channel/([^/?#&]+)',
+ channel_url, 'channel id')
- def extract_videos_from_page(self, page):
- ids_in_page = []
- titles_in_page = []
-
- for item in re.findall(
- r'(<[^>]*\bdata-video-id\s*=\s*["\'][0-9A-Za-z_-]{11}[^>]+>)', page):
- attrs = extract_attributes(item)
- video_id = attrs['data-video-id']
- video_title = unescapeHTML(attrs.get('data-title'))
- if video_title:
- video_title = video_title.strip()
- ids_in_page.append(video_id)
- titles_in_page.append(video_title)
-
- # Fallback with old _VIDEO_RE
- self.extract_videos_from_page_impl(
- self._VIDEO_RE, page, ids_in_page, titles_in_page)
-
- # Relaxed fallbacks
- self.extract_videos_from_page_impl(
- r'href="\s*/watch\?v\s*=\s*(?P<id>[0-9A-Za-z_-]{11})', page,
- ids_in_page, titles_in_page)
- self.extract_videos_from_page_impl(
- r'data-video-ids\s*=\s*["\'](?P<id>[0-9A-Za-z_-]{11})', page,
- ids_in_page, titles_in_page)
-
- return zip(ids_in_page, titles_in_page)
-
- def _extract_mix(self, playlist_id):
- # The mixes are generated from a single video
- # the id of the playlist is just 'RD' + video_id
- ids = []
- last_id = playlist_id[-11:]
- for n in itertools.count(1):
- url = 'https://www.youtube.com/watch?v=%s&list=%s' % (last_id, playlist_id)
- webpage = self._download_webpage(
- url, playlist_id, 'Downloading page {0} of Youtube mix'.format(n))
- new_ids = orderedSet(re.findall(
- r'''(?xs)data-video-username=".*?".*?
- href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
- webpage))
- # Fetch new pages until all the videos are repeated, it seems that
- # there are always 51 unique videos.
- new_ids = [_id for _id in new_ids if _id not in ids]
- if not new_ids:
- break
- ids.extend(new_ids)
- last_id = ids[-1]
+ @staticmethod
+ def _extract_grid_item_renderer(item):
+ for item_kind in ('Playlist', 'Video', 'Channel'):
+ renderer = item.get('grid%sRenderer' % item_kind)
+ if renderer:
+ return renderer
+
+ def _extract_video(self, renderer):
+ video_id = renderer.get('videoId')
+ title = try_get(
+ renderer,
+ (lambda x: x['title']['runs'][0]['text'],
+ lambda x: x['title']['simpleText']), compat_str)
+ description = try_get(
+ renderer, lambda x: x['descriptionSnippet']['runs'][0]['text'],
+ compat_str)
+ duration = parse_duration(try_get(
+ renderer, lambda x: x['lengthText']['simpleText'], compat_str))
+ view_count_text = try_get(
+ renderer, lambda x: x['viewCountText']['simpleText'], compat_str) or ''
+ view_count = str_to_int(self._search_regex(
+ r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
+ 'view count', default=None))
+ uploader = try_get(
+ renderer, lambda x: x['ownerText']['runs'][0]['text'], compat_str)
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': YoutubeIE.ie_key(),
+ 'id': video_id,
+ 'url': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'view_count': view_count,
+ 'uploader': uploader,
+ }
- url_results = self._ids_to_results(ids)
+ def _grid_entries(self, grid_renderer):
+ for item in grid_renderer['items']:
+ if not isinstance(item, dict):
+ continue
+ renderer = self._extract_grid_item_renderer(item)
+ if not isinstance(renderer, dict):
+ continue
+ title = try_get(
+ renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+ # playlist
+ playlist_id = renderer.get('playlistId')
+ if playlist_id:
+ yield self.url_result(
+ 'https://www.youtube.com/playlist?list=%s' % playlist_id,
+ ie=YoutubeTabIE.ie_key(), video_id=playlist_id,
+ video_title=title)
+ # video
+ video_id = renderer.get('videoId')
+ if video_id:
+ yield self._extract_video(renderer)
+ # channel
+ channel_id = renderer.get('channelId')
+ if channel_id:
+ title = try_get(
+ renderer, lambda x: x['title']['simpleText'], compat_str)
+ yield self.url_result(
+ 'https://www.youtube.com/channel/%s' % channel_id,
+ ie=YoutubeTabIE.ie_key(), video_title=title)
+
+ def _shelf_entries_from_content(self, shelf_renderer):
+ content = shelf_renderer.get('content')
+ if not isinstance(content, dict):
+ return
+ renderer = content.get('gridRenderer')
+ if renderer:
+ # TODO: add support for nested playlists so each shelf is processed
+ # as a separate playlist
+ # TODO: this only includes the first N items
+ for entry in self._grid_entries(renderer):
+ yield entry
+ renderer = content.get('horizontalListRenderer')
+ if renderer:
+ # TODO
+ pass
+
+ def _shelf_entries(self, shelf_renderer):
+ ep = try_get(
+ shelf_renderer, lambda x: x['endpoint']['commandMetadata']['webCommandMetadata']['url'],
+ compat_str)
+ shelf_url = urljoin('https://www.youtube.com', ep)
+ if shelf_url:
+ title = try_get(
+ shelf_renderer, lambda x: x['title']['runs'][0]['text'], compat_str)
+ yield self.url_result(shelf_url, video_title=title)
+ # Shelf may not contain a shelf URL; fall back to extraction from content
+ for entry in self._shelf_entries_from_content(shelf_renderer):
+ yield entry
+
+ def _playlist_entries(self, video_list_renderer):
+ for content in video_list_renderer['contents']:
+ if not isinstance(content, dict):
+ continue
+ renderer = content.get('playlistVideoRenderer') or content.get('playlistPanelVideoRenderer')
+ if not isinstance(renderer, dict):
+ continue
+ video_id = renderer.get('videoId')
+ if not video_id:
+ continue
+ yield self._extract_video(renderer)
- search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
- title_span = (
- search_title('playlist-title')
- or search_title('title long-title')
- or search_title('title'))
- title = clean_html(title_span)
+ r""" # Not needed in the new implementation
+ def _itemSection_entries(self, item_sect_renderer):
+ for content in item_sect_renderer['contents']:
+ if not isinstance(content, dict):
+ continue
+ renderer = content.get('videoRenderer', {})
+ if not isinstance(renderer, dict):
+ continue
+ video_id = renderer.get('videoId')
+ if not video_id:
+ continue
+ yield self._extract_video(renderer)
+ """
- return self.playlist_result(url_results, playlist_id, title)
+ def _rich_entries(self, rich_grid_renderer):
+ renderer = try_get(
+ rich_grid_renderer, lambda x: x['content']['videoRenderer'], dict) or {}
+ video_id = renderer.get('videoId')
+ if not video_id:
+ return
+ yield self._extract_video(renderer)
- def _extract_playlist(self, playlist_id):
- url = self._TEMPLATE_URL % playlist_id
- page = self._download_webpage(url, playlist_id)
+ def _video_entry(self, video_renderer):
+ video_id = video_renderer.get('videoId')
+ if video_id:
+ return self._extract_video(video_renderer)
- # the yt-alert-message now has tabindex attribute (see https://github.com/ytdl-org/youtube-dl/issues/11604)
- for match in re.findall(r'<div class="yt-alert-message"[^>]*>([^<]+)</div>', page):
- match = match.strip()
- # Check if the playlist exists or is private
- mobj = re.match(r'[^<]*(?:The|This) playlist (?P<reason>does not exist|is private)[^<]*', match)
- if mobj:
- reason = mobj.group('reason')
- message = 'This playlist %s' % reason
- if 'private' in reason:
- message += ', use --username or --netrc to access it'
- message += '.'
- raise ExtractorError(message, expected=True)
- elif re.match(r'[^<]*Invalid parameters[^<]*', match):
- raise ExtractorError(
- 'Invalid parameters. Maybe URL is incorrect.',
- expected=True)
- elif re.match(r'[^<]*Choose your language[^<]*', match):
+ def _post_thread_entries(self, post_thread_renderer):
+ post_renderer = try_get(
+ post_thread_renderer, lambda x: x['post']['backstagePostRenderer'], dict)
+ if not post_renderer:
+ return
+ # video attachment
+ video_renderer = try_get(
+ post_renderer, lambda x: x['backstageAttachment']['videoRenderer'], dict)
+ video_id = None
+ if video_renderer:
+ entry = self._video_entry(video_renderer)
+ if entry:
+ yield entry
+ # inline video links
+ runs = try_get(post_renderer, lambda x: x['contentText']['runs'], list) or []
+ for run in runs:
+ if not isinstance(run, dict):
continue
- else:
- self.report_warning('Youtube gives an alert message: ' + match)
+ ep_url = try_get(
+ run, lambda x: x['navigationEndpoint']['urlEndpoint']['url'], compat_str)
+ if not ep_url:
+ continue
+ if not YoutubeIE.suitable(ep_url):
+ continue
+ ep_video_id = YoutubeIE._match_id(ep_url)
+ if video_id == ep_video_id:
+ continue
+ yield self.url_result(ep_url, ie=YoutubeIE.ie_key(), video_id=ep_video_id)
- playlist_title = self._html_search_regex(
- r'(?s)<h1 class="pl-header-title[^"]*"[^>]*>\s*(.*?)\s*</h1>',
- page, 'title', default=None)
+ def _post_thread_continuation_entries(self, post_thread_continuation):
+ contents = post_thread_continuation.get('contents')
+ if not isinstance(contents, list):
+ return
+ for content in contents:
+ renderer = content.get('backstagePostThreadRenderer')
+ if not isinstance(renderer, dict):
+ continue
+ for entry in self._post_thread_entries(renderer):
+ yield entry
- _UPLOADER_BASE = r'class=["\']pl-header-details[^>]+>\s*<li>\s*<a[^>]+\bhref='
- uploader = self._html_search_regex(
- r'%s["\']/(?:user|channel)/[^>]+>([^<]+)' % _UPLOADER_BASE,
- page, 'uploader', default=None)
- mobj = re.search(
- r'%s(["\'])(?P<path>/(?:user|channel)/(?P<uploader_id>.+?))\1' % _UPLOADER_BASE,
- page)
- if mobj:
- uploader_id = mobj.group('uploader_id')
- uploader_url = compat_urlparse.urljoin(url, mobj.group('path'))
- else:
- uploader_id = uploader_url = None
+ @staticmethod
+ def _extract_next_continuation_data(renderer):
+ next_continuation = try_get(
+ renderer, lambda x: x['continuations'][0]['nextContinuationData'], dict)
+ if not next_continuation:
+ return
+ continuation = next_continuation.get('continuation')
+ if not continuation:
+ return
+ ctp = next_continuation.get('clickTrackingParams')
+ return {
+ 'ctoken': continuation,
+ 'continuation': continuation,
+ 'itct': ctp,
+ }
- has_videos = True
+ @classmethod
+ def _extract_continuation(cls, renderer):
+ next_continuation = cls._extract_next_continuation_data(renderer)
+ if next_continuation:
+ return next_continuation
+ contents = renderer.get('contents')
+ if not isinstance(contents, list):
+ return
+ for content in contents:
+ if not isinstance(content, dict):
+ continue
+ continuation_ep = try_get(
+ content, lambda x: x['continuationItemRenderer']['continuationEndpoint'],
+ dict)
+ if not continuation_ep:
+ continue
+ continuation = try_get(
+ continuation_ep, lambda x: x['continuationCommand']['token'], compat_str)
+ if not continuation:
+ continue
+ ctp = continuation_ep.get('clickTrackingParams')
+ if not ctp:
+ continue
+ return {
+ 'ctoken': continuation,
+ 'continuation': continuation,
+ 'itct': ctp,
+ }
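Hedged examples of the two continuation shapes this helper understands, with placeholder token values; both normalize to the same {'ctoken', 'continuation', 'itct'} dict that the pagination loop below sends as the /browse_ajax query.

    # Older renderers carry a nextContinuationData entry:
    old_style = {'continuations': [{'nextContinuationData': {
        'continuation': 'PLACEHOLDER_TOKEN',
        'clickTrackingParams': 'PLACEHOLDER_CTP',
    }}]}
    # Newer renderers end their contents with a continuationItemRenderer:
    new_style = {'contents': [{'continuationItemRenderer': {
        'continuationEndpoint': {
            'continuationCommand': {'token': 'PLACEHOLDER_TOKEN'},
            'clickTrackingParams': 'PLACEHOLDER_CTP',
        },
    }}]}
    # Either shape yields:
    # {'ctoken': 'PLACEHOLDER_TOKEN', 'continuation': 'PLACEHOLDER_TOKEN',
    #  'itct': 'PLACEHOLDER_CTP'}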
- if not playlist_title:
- try:
- # Some playlist URLs don't actually serve a playlist (e.g.
- # https://www.youtube.com/watch?v=FqZTN594JQw&list=PLMYEtVRpaqY00V9W81Cwmzp6N6vZqfUKD4)
- next(self._entries(page, playlist_id))
- except StopIteration:
- has_videos = False
+ def _entries(self, tab, identity_token):
- playlist = self.playlist_result(
- self._entries(page, playlist_id), playlist_id, playlist_title)
- playlist.update({
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'uploader_url': uploader_url,
- })
+ def extract_entries(parent_renderer): # this needs to be called again for continuation to work with feeds
+ contents = try_get(parent_renderer, lambda x: x['contents'], list) or []
+ for content in contents:
+ if not isinstance(content, dict):
+ continue
+ is_renderer = try_get(content, lambda x: x['itemSectionRenderer'], dict)
+ if not is_renderer:
+ renderer = content.get('richItemRenderer')
+ if renderer:
+ for entry in self._rich_entries(renderer):
+ yield entry
+ continuation_list[0] = self._extract_continuation(parent_renderer)
+ continue
+ isr_contents = try_get(is_renderer, lambda x: x['contents'], list) or []
+ for isr_content in isr_contents:
+ if not isinstance(isr_content, dict):
+ continue
+ renderer = isr_content.get('playlistVideoListRenderer')
+ if renderer:
+ for entry in self._playlist_entries(renderer):
+ yield entry
+ continuation_list[0] = self._extract_continuation(renderer)
+ continue
+ renderer = isr_content.get('gridRenderer')
+ if renderer:
+ for entry in self._grid_entries(renderer):
+ yield entry
+ continuation_list[0] = self._extract_continuation(renderer)
+ continue
+ renderer = isr_content.get('shelfRenderer')
+ if renderer:
+ for entry in self._shelf_entries(renderer):
+ yield entry
+ continue
+ renderer = isr_content.get('backstagePostThreadRenderer')
+ if renderer:
+ for entry in self._post_thread_entries(renderer):
+ yield entry
+ continuation_list[0] = self._extract_continuation(renderer)
+ continue
+ renderer = isr_content.get('videoRenderer')
+ if renderer:
+ entry = self._video_entry(renderer)
+ if entry:
+ yield entry
+
+ if not continuation_list[0]:
+ continuation_list[0] = self._extract_continuation(is_renderer)
+
+ if not continuation_list[0]:
+ continuation_list[0] = self._extract_continuation(parent_renderer)
+
+ continuation_list = [None] # Python 2 does not support nonlocal
+ parent_renderer = (
+ try_get(tab, lambda x: x['sectionListRenderer'], dict)
+ or try_get(tab, lambda x: x['richGridRenderer'], dict) or {})
+ for entry in extract_entries(parent_renderer):
+ yield entry
+ continuation = continuation_list[0]
+
+ headers = {
+ 'x-youtube-client-name': '1',
+ 'x-youtube-client-version': '2.20201112.04.01',
+ }
+ if identity_token:
+ headers['x-youtube-identity-token'] = identity_token
- return has_videos, playlist
+ for page_num in itertools.count(1):
+ if not continuation:
+ break
+ browse = self._download_json(
+ 'https://www.youtube.com/browse_ajax', None,
+ 'Downloading page %d' % page_num,
+ headers=headers, query=continuation, fatal=False)
+ if not browse:
+ break
+ response = try_get(browse, lambda x: x[1]['response'], dict)
+ if not response:
+ break
- def _check_download_just_video(self, url, playlist_id):
- # Check if it's a video-specific URL
- query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
- video_id = query_dict.get('v', [None])[0] or self._search_regex(
- r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url,
- 'video id', default=None)
- if video_id:
- if self._downloader.params.get('noplaylist'):
- self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
- return video_id, self.url_result(video_id, 'Youtube', video_id=video_id)
- else:
- self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
- return video_id, None
- return None, None
+ continuation_contents = try_get(
+ response, lambda x: x['continuationContents'], dict)
+ if continuation_contents:
+ continuation_renderer = continuation_contents.get('playlistVideoListContinuation')
+ if continuation_renderer:
+ for entry in self._playlist_entries(continuation_renderer):
+ yield entry
+ continuation = self._extract_continuation(continuation_renderer)
+ continue
+ continuation_renderer = continuation_contents.get('gridContinuation')
+ if continuation_renderer:
+ for entry in self._grid_entries(continuation_renderer):
+ yield entry
+ continuation = self._extract_continuation(continuation_renderer)
+ continue
+ continuation_renderer = continuation_contents.get('itemSectionContinuation')
+ if continuation_renderer:
+ for entry in self._post_thread_continuation_entries(continuation_renderer):
+ yield entry
+ continuation = self._extract_continuation(continuation_renderer)
+ continue
+ continuation_renderer = continuation_contents.get('sectionListContinuation') # for feeds
+ if continuation_renderer:
+ continuation_list = [None]
+ for entry in extract_entries(continuation_renderer):
+ yield entry
+ continuation = continuation_list[0]
+ continue
- def _real_extract(self, url):
- # Extract playlist id
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
- playlist_id = mobj.group(1) or mobj.group(2)
+ continuation_items = try_get(
+ response, lambda x: x['onResponseReceivedActions'][0]['appendContinuationItemsAction']['continuationItems'], list)
+ if continuation_items:
+ continuation_item = continuation_items[0]
+ if not isinstance(continuation_item, dict):
+ continue
+ renderer = continuation_item.get('playlistVideoRenderer') or continuation_item.get('itemSectionRenderer')
+ if renderer:
+ video_list_renderer = {'contents': continuation_items}
+ for entry in self._playlist_entries(video_list_renderer):
+ yield entry
+ continuation = self._extract_continuation(video_list_renderer)
+ continue
+ break
- video_id, video = self._check_download_just_video(url, playlist_id)
- if video:
- return video
+ @staticmethod
+ def _extract_selected_tab(tabs):
+ for tab in tabs:
+ if try_get(tab, lambda x: x['tabRenderer']['selected'], bool):
+ return tab['tabRenderer']
+ else:
+ raise ExtractorError('Unable to find selected tab')
- if playlist_id.startswith(('RD', 'UL', 'PU')):
- # Mixes require a custom extraction process
- return self._extract_mix(playlist_id)
+ @staticmethod
+ def _extract_uploader(data):
+ uploader = {}
+ sidebar_renderer = try_get(
+ data, lambda x: x['sidebar']['playlistSidebarRenderer']['items'], list)
+ if sidebar_renderer:
+ for item in sidebar_renderer:
+ if not isinstance(item, dict):
+ continue
+ renderer = item.get('playlistSidebarSecondaryInfoRenderer')
+ if not isinstance(renderer, dict):
+ continue
+ owner = try_get(
+ renderer, lambda x: x['videoOwner']['videoOwnerRenderer']['title']['runs'][0], dict)
+ if owner:
+ uploader['uploader'] = owner.get('text')
+ uploader['uploader_id'] = try_get(
+ owner, lambda x: x['navigationEndpoint']['browseEndpoint']['browseId'], compat_str)
+ uploader['uploader_url'] = urljoin(
+ 'https://www.youtube.com/',
+ try_get(owner, lambda x: x['navigationEndpoint']['browseEndpoint']['canonicalBaseUrl'], compat_str))
+ return uploader
+
+ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
+ selected_tab = self._extract_selected_tab(tabs)
+ renderer = try_get(
+ data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
+ playlist_id = title = description = None
+ if renderer:
+ channel_title = renderer.get('title') or item_id
+ tab_title = selected_tab.get('title')
+ title = channel_title or item_id
+ if tab_title:
+ title += ' - %s' % tab_title
+ description = renderer.get('description')
+ playlist_id = renderer.get('externalId')
+ renderer = try_get(
+ data, lambda x: x['metadata']['playlistMetadataRenderer'], dict)
+ if renderer:
+ title = renderer.get('title')
+ description = None
+ playlist_id = item_id
+ if playlist_id is None:
+ playlist_id = item_id
+ if title is None:
+ title = "Youtube " + playlist_id.title()
+ playlist = self.playlist_result(
+ self._entries(selected_tab['content'], identity_token),
+ playlist_id=playlist_id, playlist_title=title,
+ playlist_description=description)
+ playlist.update(self._extract_uploader(data))
+ return playlist
- has_videos, playlist = self._extract_playlist(playlist_id)
- if has_videos or not video_id:
- return playlist
+ def _extract_from_playlist(self, item_id, data, playlist):
+ title = playlist.get('title') or try_get(
+ data, lambda x: x['titleText']['simpleText'], compat_str)
+ playlist_id = playlist.get('playlistId') or item_id
+ return self.playlist_result(
+ self._playlist_entries(playlist), playlist_id=playlist_id,
+ playlist_title=title)
+
+ def _extract_alerts(self, data):
+ for alert_dict in try_get(data, lambda x: x['alerts'], list) or []:
+ for renderer in alert_dict:
+ alert = alert_dict[renderer]
+ alert_type = alert.get('type')
+ if not alert_type:
+ continue
+ message = try_get(alert, lambda x: x['text']['simpleText'], compat_str)
+ if message:
+ yield alert_type, message
+ for run in try_get(alert, lambda x: x['text']['runs'], list) or []:
+ message = try_get(run, lambda x: x['text'], compat_str)
+ if message:
+ yield alert_type, message
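A hedged sketch of the 'alerts' structure the generator above walks; the renderer key names follow the InnerTube response, the message text is invented for illustration.

    data = {'alerts': [
        {'alertRenderer': {
            'type': 'ERROR',
            'text': {'simpleText': 'This playlist does not exist.'}}},
        {'alertWithButtonRenderer': {
            'type': 'INFO',
            'text': {'runs': [{'text': 'Unavailable videos are hidden'}]}}},
    ]}
    # _extract_alerts(data) would yield:
    #   ('ERROR', 'This playlist does not exist.')
    #   ('INFO', 'Unavailable videos are hidden')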
- # Some playlist URLs don't actually serve a playlist (see
- # https://github.com/ytdl-org/youtube-dl/issues/10537).
- # Fallback to plain video extraction if there is a video id
- # along with playlist id.
- return self.url_result(video_id, 'Youtube', video_id=video_id)
+ def _real_extract(self, url):
+ item_id = self._match_id(url)
+ url = compat_urlparse.urlunparse(
+ compat_urlparse.urlparse(url)._replace(netloc='www.youtube.com'))
+ is_home = re.match(r'(?P<pre>%s)(?P<post>/?(?![^#?]).*$)' % self._VALID_URL, url)
+ if is_home is not None and is_home.group('not_channel') is None and item_id != 'feed':
+ self._downloader.report_warning(
+ 'A channel/user page was given. All the channel\'s videos will be downloaded. '
+ 'To download only the videos in the home page, add a "/home" to the URL')
+ url = '%s/videos%s' % (is_home.group('pre'), is_home.group('post') or '')
+
+ # Handle both video/playlist URLs
+ qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ video_id = qs.get('v', [None])[0]
+ playlist_id = qs.get('list', [None])[0]
+
+ if is_home is not None and is_home.group('not_channel') is not None and is_home.group('not_channel').startswith('watch') and not video_id:
+ if playlist_id:
+ self._downloader.report_warning('%s is not a valid Youtube URL. Trying to download playlist %s' % (url, playlist_id))
+ url = 'https://www.youtube.com/playlist?list=%s' % playlist_id
+ # return self.url_result(playlist_id, ie=YoutubePlaylistIE.ie_key())
+ else:
+ raise ExtractorError('Unable to recognize tab page')
+ if video_id and playlist_id:
+ if self._downloader.params.get('noplaylist'):
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+ return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+
+ webpage = self._download_webpage(url, item_id)
+ identity_token = self._search_regex(
+ r'\bID_TOKEN["\']\s*:\s*["\'](.+?)["\']', webpage,
+ 'identity token', default=None)
+ data = self._extract_yt_initial_data(item_id, webpage)
+ for alert_type, alert_message in self._extract_alerts(data):
+ self._downloader.report_warning('YouTube said: %s - %s' % (alert_type, alert_message))
+ tabs = try_get(
+ data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
+ if tabs:
+ return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
+ playlist = try_get(
+ data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
+ if playlist:
+ return self._extract_from_playlist(item_id, data, playlist)
+ # Fall back to video extraction if no playlist-like page is recognized.
+ # First check for the current video then try the v attribute of URL query.
+ video_id = try_get(
+ data, lambda x: x['currentVideoEndpoint']['watchEndpoint']['videoId'],
+ compat_str) or video_id
+ if video_id:
+ return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
+ # Failed to recognize
+ raise ExtractorError('Unable to recognize tab page')
-class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
- IE_DESC = 'YouTube.com channels'
- _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
- _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
- _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
- IE_NAME = 'youtube:channel'
+class YoutubePlaylistIE(InfoExtractor):
+ IE_DESC = 'YouTube.com playlists'
+ _VALID_URL = r'''(?x)(?:
+ (?:https?://)?
+ (?:\w+\.)?
+ (?:
+ (?:
+ youtube(?:kids)?\.com|
+ invidio\.us|
+ youtu\.be
+ )
+ /.*?\?.*?\blist=
+ )?
+ (?P<id>%(playlist_id)s)
+ )''' % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
+ IE_NAME = 'youtube:playlist'
_TESTS = [{
- 'note': 'paginated channel',
- 'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
- 'playlist_mincount': 91,
+ 'note': 'issue #673',
+ 'url': 'PLBB231211A4F62143',
'info_dict': {
- 'id': 'UUKfVa3S1e4PHvxWcwyMMg8w',
- 'title': 'Uploads from lex will',
- 'uploader': 'lex will',
- 'uploader_id': 'UCKfVa3S1e4PHvxWcwyMMg8w',
- }
+ 'title': '[OLD]Team Fortress 2 (Class-based LP)',
+ 'id': 'PLBB231211A4F62143',
+ 'uploader': 'Wickydoo',
+ 'uploader_id': 'UCKSpbfbl5kRQpTdL7kMc-1Q',
+ },
+ 'playlist_mincount': 29,
}, {
- 'note': 'Age restricted channel',
- # from https://www.youtube.com/user/DeusExOfficial
- 'url': 'https://www.youtube.com/channel/UCs0ifCMCm1icqRbqhUINa0w',
- 'playlist_mincount': 64,
+ 'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
'info_dict': {
- 'id': 'UUs0ifCMCm1icqRbqhUINa0w',
- 'title': 'Uploads from Deus Ex',
- 'uploader': 'Deus Ex',
- 'uploader_id': 'DeusExOfficial',
+ 'title': 'YDL_safe_search',
+ 'id': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
},
+ 'playlist_count': 2,
+ 'skip': 'This playlist is private',
}, {
- 'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
- 'only_matching': True,
- }]
-
- @classmethod
- def suitable(cls, url):
- return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url)
- else super(YoutubeChannelIE, cls).suitable(url))
-
- def _build_template_url(self, url, channel_id):
- return self._TEMPLATE_URL % channel_id
-
- def _real_extract(self, url):
- channel_id = self._match_id(url)
-
- url = self._build_template_url(url, channel_id)
-
- # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778)
- # Workaround by extracting as a playlist if managed to obtain channel playlist URL
- # otherwise fallback on channel by page extraction
- channel_page = self._download_webpage(
- url + '?view=57', channel_id,
- 'Downloading channel page', fatal=False)
- if channel_page is False:
- channel_playlist_id = False
- else:
- channel_playlist_id = self._html_search_meta(
- 'channelId', channel_page, 'channel id', default=None)
- if not channel_playlist_id:
- channel_url = self._html_search_meta(
- ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'),
- channel_page, 'channel url', default=None)
- if channel_url:
- channel_playlist_id = self._search_regex(
- r'vnd\.youtube://user/([0-9A-Za-z_-]+)',
- channel_url, 'channel id', default=None)
- if channel_playlist_id and channel_playlist_id.startswith('UC'):
- playlist_id = 'UU' + channel_playlist_id[2:]
- return self.url_result(
- compat_urlparse.urljoin(url, '/playlist?list=%s' % playlist_id), 'YoutubePlaylist')
-
- channel_page = self._download_webpage(url, channel_id, 'Downloading page #1')
- autogenerated = re.search(r'''(?x)
- class="[^"]*?(?:
- channel-header-autogenerated-label|
- yt-channel-title-autogenerated
- )[^"]*"''', channel_page) is not None
-
- if autogenerated:
- # The videos are contained in a single page
- # the ajax pages can't be used, they are empty
- entries = [
- self.url_result(
- video_id, 'Youtube', video_id=video_id,
- video_title=video_title)
- for video_id, video_title in self.extract_videos_from_page(channel_page)]
- return self.playlist_result(entries, channel_id)
-
- try:
- next(self._entries(channel_page, channel_id))
- except StopIteration:
- alert_message = self._html_search_regex(
- r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>',
- channel_page, 'alert', default=None, group='alert')
- if alert_message:
- raise ExtractorError('Youtube said: %s' % alert_message, expected=True)
-
- return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
-
-
-class YoutubeUserIE(YoutubeChannelIE):
- IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
- _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9%-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_%-]+)'
- _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos'
- IE_NAME = 'youtube:user'
-
- _TESTS = [{
- 'url': 'https://www.youtube.com/user/TheLinuxFoundation',
- 'playlist_mincount': 320,
+ 'note': 'embedded',
+ 'url': 'https://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+ 'playlist_count': 4,
'info_dict': {
- 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ',
- 'title': 'Uploads from The Linux Foundation',
- 'uploader': 'The Linux Foundation',
- 'uploader_id': 'TheLinuxFoundation',
+ 'title': 'JODA15',
+ 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
+ 'uploader': 'milan',
+ 'uploader_id': 'UCEI1-PVPcYXjB73Hfelbmaw',
}
}, {
- # Only available via https://www.youtube.com/c/12minuteathlete/videos
- # but not https://www.youtube.com/user/12minuteathlete/videos
- 'url': 'https://www.youtube.com/c/12minuteathlete/videos',
- 'playlist_mincount': 249,
+ 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+ 'playlist_mincount': 982,
'info_dict': {
- 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ',
- 'title': 'Uploads from 12 Minute Athlete',
- 'uploader': '12 Minute Athlete',
- 'uploader_id': 'the12minuteathlete',
+ 'title': '2018 Chinese New Singles (11/6 updated)',
+ 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl',
+ 'uploader': 'LBK',
+ 'uploader_id': 'UC21nz3_MesPLqtDqwdvnoxA',
}
}, {
- 'url': 'ytuser:phihag',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/c/gametrailers',
- 'only_matching': True,
+ 'url': 'https://youtu.be/yeWKywCrFtk?list=PL2qgrgXsNUG5ig9cat4ohreBjYLAPC0J5',
+ 'info_dict': {
+ 'id': 'yeWKywCrFtk',
+ 'ext': 'mp4',
+ 'title': 'Small Scale Baler and Braiding Rugs',
+ 'uploader': 'Backus-Page House Museum',
+ 'uploader_id': 'backuspagemuseum',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/backuspagemuseum',
+ 'upload_date': '20161008',
+ 'description': 'md5:800c0c78d5eb128500bffd4f0b4f2e8a',
+ 'categories': ['Nonprofits & Activism'],
+ 'tags': list,
+ 'like_count': int,
+ 'dislike_count': int,
+ },
+ 'params': {
+ 'noplaylist': True,
+ 'skip_download': True,
+ },
}, {
- 'url': 'https://www.youtube.com/c/Pawe%C5%82Zadro%C5%BCniak',
+ 'url': 'https://youtu.be/uWyaPkt-VOI?list=PL9D9FC436B881BA21',
'only_matching': True,
}, {
- 'url': 'https://www.youtube.com/gametrailers',
+ 'url': 'TLGGrESM50VT6acwMjAyMjAxNw',
'only_matching': True,
}, {
- # This channel is not available, geo restricted to JP
- 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos',
+ # music album playlist
+ 'url': 'OLAK5uy_m4xAFdmMC5rX3Ji3g93pQe3hqLZw_9LhM',
'only_matching': True,
}]
@classmethod
def suitable(cls, url):
- # Don't return True if the url can be extracted with other youtube
- # extractor, the regex would is too permissive and it would match.
- other_yt_ies = iter(klass for (name, klass) in globals().items() if name.startswith('Youtube') and name.endswith('IE') and klass is not cls)
- if any(ie.suitable(url) for ie in other_yt_ies):
- return False
- else:
- return super(YoutubeUserIE, cls).suitable(url)
-
- def _build_template_url(self, url, channel_id):
- mobj = re.match(self._VALID_URL, url)
- return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id'))
+ return False if YoutubeTabIE.suitable(url) else super(
+ YoutubePlaylistIE, cls).suitable(url)

 def _real_extract(self, url):
+ playlist_id = self._match_id(url)
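 # a bare playlist ID (e.g. 'PLBB231211A4F62143') matches without a
 # query string, so synthesize one before delegating to YoutubeTabIE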
+ qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+ if not qs:
+ qs = {'list': playlist_id}
+ return self.url_result(
+ update_url_query('https://www.youtube.com/playlist', qs),
 ie=YoutubeTabIE.ie_key(), video_id=playlist_id)


-class YoutubeLiveIE(YoutubeBaseInfoExtractor):
- IE_DESC = 'YouTube.com live streams'
- _VALID_URL = r'(?P<base_url>https?://(?:\w+\.)?youtube\.com/(?:(?:user|channel|c)/)?(?P<id>[^/]+))/live'
- IE_NAME = 'youtube:live'
+class YoutubeYtUserIE(InfoExtractor):
+ _VALID_URL = r'ytuser:(?P<id>.+)'
_TESTS = [{
- 'url': 'https://www.youtube.com/user/TheYoungTurks/live',
- 'info_dict': {
- 'id': 'a48o2S1cPoo',
- 'ext': 'mp4',
- 'title': 'The Young Turks - Live Main Show',
- 'uploader': 'The Young Turks',
- 'uploader_id': 'TheYoungTurks',
- 'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/TheYoungTurks',
- 'upload_date': '20150715',
- 'license': 'Standard YouTube License',
- 'description': 'md5:438179573adcdff3c97ebb1ee632b891',
- 'categories': ['News & Politics'],
- 'tags': ['Cenk Uygur (TV Program Creator)', 'The Young Turks (Award-Winning Work)', 'Talk Show (TV Genre)'],
- 'like_count': int,
- 'dislike_count': int,
- },
- 'params': {
- 'skip_download': True,
- },
- }, {
- 'url': 'https://www.youtube.com/channel/UC1yBKRuGpC1tSM73A0ZjYjQ/live',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/c/CommanderVideoHq/live',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/TheYoungTurks/live',
+ 'url': 'ytuser:phihag',
'only_matching': True,
 }]

 def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- channel_id = mobj.group('id')
- base_url = mobj.group('base_url')
- webpage = self._download_webpage(url, channel_id, fatal=False)
- if webpage:
- page_type = self._og_search_property(
- 'type', webpage, 'page type', default='')
- video_id = self._html_search_meta(
- 'videoId', webpage, 'video id', default=None)
- if page_type.startswith('video') and video_id and re.match(
- r'^[0-9A-Za-z_-]{11}$', video_id):
- return self.url_result(video_id, YoutubeIE.ie_key())
- return self.url_result(base_url)
-
-
-class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor):
- IE_DESC = 'YouTube.com user/channel playlists'
- _VALID_URL = r'https?://(?:\w+\.)?youtube\.com/(?:user|channel|c)/(?P<id>[^/]+)/playlists'
- IE_NAME = 'youtube:playlists'
+ user_id = self._match_id(url)
+ return self.url_result(
+ 'https://www.youtube.com/user/%s' % user_id,
+ ie=YoutubeTabIE.ie_key(), video_id=user_id)
+
+class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
+ IE_NAME = 'youtube:favorites'
+ IE_DESC = 'YouTube.com liked videos, ":ytfav" for short (requires authentication)'
+ _VALID_URL = r':ytfav(?:ou?rite)?s?'
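+ # matches :ytfav, :ytfavs and the :ytfavo(u)rite(s) spellings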
+ _LOGIN_REQUIRED = True
_TESTS = [{
- 'url': 'https://www.youtube.com/user/ThirstForScience/playlists',
- 'playlist_mincount': 4,
- 'info_dict': {
- 'id': 'ThirstForScience',
- 'title': 'ThirstForScience',
- },
- }, {
- # with "Load more" button
- 'url': 'https://www.youtube.com/user/igorkle1/playlists?view=1&sort=dd',
- 'playlist_mincount': 70,
- 'info_dict': {
- 'id': 'igorkle1',
- 'title': 'Игорь Клейнер',
- },
- }, {
- 'url': 'https://www.youtube.com/channel/UCiU1dHvZObB2iP6xkJ__Icw/playlists',
- 'playlist_mincount': 17,
- 'info_dict': {
- 'id': 'UCiU1dHvZObB2iP6xkJ__Icw',
- 'title': 'Chem Player',
- },
- 'skip': 'Blocked',
+ 'url': ':ytfav',
+ 'only_matching': True,
}, {
- 'url': 'https://www.youtube.com/c/ChristophLaimer/playlists',
+ 'url': ':ytfavorites',
'only_matching': True,
}]
-
-class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor):
- _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?'
+ def _real_extract(self, url):
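+ # 'LL' is the auto-generated playlist ID of the user's liked videos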
+ return self.url_result(
+ 'https://www.youtube.com/playlist?list=LL',
+ ie=YoutubeTabIE.ie_key())


-class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
+class YoutubeSearchIE(SearchInfoExtractor, YoutubeBaseInfoExtractor):
IE_DESC = 'YouTube.com searches'
# there doesn't appear to be a real limit, for example if you search for
# 'python' you get more than 8.000.000 results
@@ -3190,10 +3469,33 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
list)
if not slr_contents:
break
- isr_contents = try_get(
- slr_contents,
- lambda x: x[0]['itemSectionRenderer']['contents'],
- list)
+
+ isr_contents = []
+ continuation_token = None
+ # YouTube sometimes adds promoted content to search results,
+ # shifting the position of both the video entries and the
+ # continuation token, so scan every entry until both are found.
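+ # (each slr_contents entry is expected to contain either an
+ # itemSectionRenderer with the video results or a
+ # continuationItemRenderer holding the next-page token)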
+ for index, isr in enumerate(slr_contents):
+ if not isr_contents:
+ isr_contents = try_get(
+ slr_contents,
+ (lambda x: x[index]['itemSectionRenderer']['contents']),
+ list)
+ for content in isr_contents:
+ if content.get('videoRenderer') is not None:
+ break
+ else:
+ isr_contents = []
+
+ if continuation_token is None:
+ continuation_token = try_get(
+ slr_contents,
+ lambda x: x[index]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
+ compat_str)
+ if continuation_token is not None and isr_contents:
+ break
+
if not isr_contents:
break
for content in isr_contents:
@@ -3227,13 +3529,9 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor):
}
if total == n:
return
- token = try_get(
- slr_contents,
- lambda x: x[1]['continuationItemRenderer']['continuationEndpoint']['continuationCommand']['token'],
- compat_str)
- if not token:
+ if not continuation_token:
break
- data['continuation'] = token
+ data['continuation'] = continuation_token

 def _get_n_results(self, query, n):
"""Get a specified number of results for a query"""
@@ -3247,11 +3545,11 @@ class YoutubeSearchDateIE(YoutubeSearchIE):
_SEARCH_PARAMS = 'CAI%3D'
-class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
+class YoutubeSearchURLIE(YoutubeSearchIE):
IE_DESC = 'YouTube.com search URLs'
- IE_NAME = 'youtube:search_url'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)'
- _SEARCH_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
+ IE_NAME = YoutubeSearchIE.IE_NAME + '_url'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?:[^&]+)(?:[&]|$)'
+ # _MAX_RESULTS = 100
_TESTS = [{
'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
'playlist_mincount': 5,
@@ -3263,92 +3561,25 @@ class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor):
'only_matching': True,
 }]

- def _find_videos_in_json(self, extracted):
- videos = []
-
- def _real_find(obj):
- if obj is None or isinstance(obj, str):
- return
-
- if type(obj) is list:
- for elem in obj:
- _real_find(elem)
-
- if type(obj) is dict:
- if "videoId" in obj:
- videos.append(obj)
- return
-
- for _, o in obj.items():
- _real_find(o)
-
- _real_find(extracted)
-
- return videos
-
- def extract_videos_from_page_impl(self, page, ids_in_page, titles_in_page):
- search_response = self._parse_json(self._search_regex(self._SEARCH_DATA, page, 'ytInitialData'), None)
-
- result_items = self._find_videos_in_json(search_response)
-
- for renderer in result_items:
- video_id = try_get(renderer, lambda x: x['videoId'])
- video_title = try_get(renderer, lambda x: x['title']['runs'][0]['text']) or try_get(renderer, lambda x: x['title']['simpleText'])
-
- if video_id is None or video_title is None:
- # we do not have a videoRenderer or title extraction broke
- continue
-
- video_title = video_title.strip()
-
- try:
- idx = ids_in_page.index(video_id)
- if video_title and not titles_in_page[idx]:
- titles_in_page[idx] = video_title
- except ValueError:
- ids_in_page.append(video_id)
- titles_in_page.append(video_title)
-
- def extract_videos_from_page(self, page):
- ids_in_page = []
- titles_in_page = []
- self.extract_videos_from_page_impl(page, ids_in_page, titles_in_page)
- return zip(ids_in_page, titles_in_page)
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- query = compat_urllib_parse_unquote_plus(mobj.group('query'))
- webpage = self._download_webpage(url, query)
- return self.playlist_result(self._process_page(webpage), playlist_title=query)
-
-
-class YoutubeShowIE(YoutubePlaylistsBaseInfoExtractor):
- IE_DESC = 'YouTube.com (multi-season) shows'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
- IE_NAME = 'youtube:show'
- _TESTS = [{
- 'url': 'https://www.youtube.com/show/airdisasters',
- 'playlist_mincount': 5,
- 'info_dict': {
- 'id': 'airdisasters',
- 'title': 'Air Disasters',
- }
- }]
+ @classmethod
+ def _make_valid_url(cls):
+ return cls._VALID_URL

 def _real_extract(self, url):
- playlist_id = self._match_id(url)
- return super(YoutubeShowIE, self)._real_extract(
- 'https://www.youtube.com/show/%s/playlists' % playlist_id)
+ qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ query = (qs.get('search_query') or qs.get('q'))[0]
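+ # the 'sp' query argument carries YouTube's encoded filter and sort
+ # options; it is passed through verbatim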
+ self._SEARCH_PARAMS = qs.get('sp', ('',))[0]
+ return self._get_n_results(query, self._MAX_RESULTS)


-class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
+class YoutubeFeedsInfoExtractor(YoutubeTabIE):
"""
Base class for feed extractors
- Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
+ Subclasses must define the _FEED_NAME property.
"""
_LOGIN_REQUIRED = True
- _FEED_DATA = r'(?:window\["ytInitialData"\]|ytInitialData)\W?=\W?({.*?});'
- _YTCFG_DATA = r"ytcfg.set\(({.*?})\)"
+ # _MAX_PAGES = 5
+ _TESTS = []

 @property
def IE_NAME(self):
@@ -3357,150 +3588,63 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
def _real_initialize(self):
 self._login()

- def _find_videos_in_json(self, extracted):
- videos = []
- c = {}
-
- def _real_find(obj):
- if obj is None or isinstance(obj, str):
- return
-
- if type(obj) is list:
- for elem in obj:
- _real_find(elem)
-
- if type(obj) is dict:
- if "videoId" in obj:
- videos.append(obj)
- return
-
- if "nextContinuationData" in obj:
- c["continuation"] = obj["nextContinuationData"]
- return
-
- for _, o in obj.items():
- _real_find(o)
-
- _real_find(extracted)
-
- return videos, try_get(c, lambda x: x["continuation"])
-
- def _entries(self, page):
- info = []
-
- yt_conf = self._parse_json(self._search_regex(self._YTCFG_DATA, page, 'ytcfg.set', default="null"), None, fatal=False)
-
- search_response = self._parse_json(self._search_regex(self._FEED_DATA, page, 'ytInitialData'), None)
-
- for page_num in itertools.count(1):
- video_info, continuation = self._find_videos_in_json(search_response)
-
- new_info = []
-
- for v in video_info:
- v_id = try_get(v, lambda x: x['videoId'])
- if not v_id:
- continue
-
- have_video = False
- for old in info:
- if old['videoId'] == v_id:
- have_video = True
- break
-
- if not have_video:
- new_info.append(v)
-
- if not new_info:
- break
-
- info.extend(new_info)
-
- for video in new_info:
- yield self.url_result(try_get(video, lambda x: x['videoId']), YoutubeIE.ie_key(), video_title=try_get(video, lambda x: x['title']['runs'][0]['text']) or try_get(video, lambda x: x['title']['simpleText']))
-
- if not continuation or not yt_conf:
- break
-
- search_response = self._download_json(
- 'https://www.youtube.com/browse_ajax', self._PLAYLIST_TITLE,
- 'Downloading page #%s' % page_num,
- transform_source=uppercase_escape,
- query={
- "ctoken": try_get(continuation, lambda x: x["continuation"]),
- "continuation": try_get(continuation, lambda x: x["continuation"]),
- "itct": try_get(continuation, lambda x: x["clickTrackingParams"])
- },
- headers={
- "X-YouTube-Client-Name": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_NAME"]),
- "X-YouTube-Client-Version": try_get(yt_conf, lambda x: x["INNERTUBE_CONTEXT_CLIENT_VERSION"]),
- "X-Youtube-Identity-Token": try_get(yt_conf, lambda x: x["ID_TOKEN"]),
- "X-YouTube-Device": try_get(yt_conf, lambda x: x["DEVICE"]),
- "X-YouTube-Page-CL": try_get(yt_conf, lambda x: x["PAGE_CL"]),
- "X-YouTube-Page-Label": try_get(yt_conf, lambda x: x["PAGE_BUILD_LABEL"]),
- "X-YouTube-Variants-Checksum": try_get(yt_conf, lambda x: x["VARIANTS_CHECKSUM"]),
- })
-
def _real_extract(self, url):
- page = self._download_webpage(
+ return self.url_result(
'https://www.youtube.com/feed/%s' % self._FEED_NAME,
- self._PLAYLIST_TITLE)
- return self.playlist_result(
- self._entries(page), playlist_title=self._PLAYLIST_TITLE)
+ ie=YoutubeTabIE.ie_key())


-class YoutubeWatchLaterIE(YoutubePlaylistIE):
+class YoutubeWatchLaterIE(InfoExtractor):
IE_NAME = 'youtube:watchlater'
IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/(?:feed/watch_later|(?:playlist|watch)\?(?:.+&)?list=WL)|:ytwatchlater'
-
+ _VALID_URL = r':ytwatchlater'
_TESTS = [{
- 'url': 'https://www.youtube.com/playlist?list=WL',
- 'only_matching': True,
- }, {
- 'url': 'https://www.youtube.com/watch?v=bCNU9TrbiRk&index=1&list=WL',
+ 'url': ':ytwatchlater',
'only_matching': True,
 }]

 def _real_extract(self, url):
- _, video = self._check_download_just_video(url, 'WL')
- if video:
- return video
- _, playlist = self._extract_playlist('WL')
- return playlist
-
-
-class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
- IE_NAME = 'youtube:favorites'
- IE_DESC = 'YouTube.com favourite videos, ":ytfav" for short (requires authentication)'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
- _LOGIN_REQUIRED = True
-
- def _real_extract(self, url):
- webpage = self._download_webpage('https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
- playlist_id = self._search_regex(r'list=(.+?)["&]', webpage, 'favourites playlist id')
- return self.url_result(playlist_id, 'YoutubePlaylist')
+ return self.url_result(
+ 'https://www.youtube.com/playlist?list=WL', ie=YoutubeTabIE.ie_key())


class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/recommended|:ytrec(?:ommended)?'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/?(?:[?#]|$)|:ytrec(?:ommended)?'
_FEED_NAME = 'recommended'
- _PLAYLIST_TITLE = 'Youtube Recommended videos'
+ _TESTS = [{
+ 'url': ':ytrec',
+ 'only_matching': True,
+ }, {
+ 'url': ':ytrecommended',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://youtube.com',
+ 'only_matching': True,
+ }]


class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
- IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
+ IE_DESC = 'YouTube.com subscriptions feed, ":ytsubs" for short (requires authentication)'
+ _VALID_URL = r':ytsub(?:scription)?s?'
_FEED_NAME = 'subscriptions'
- _PLAYLIST_TITLE = 'Youtube Subscriptions'
+ _TESTS = [{
+ 'url': ':ytsubs',
+ 'only_matching': True,
+ }, {
+ 'url': ':ytsubscriptions',
+ 'only_matching': True,
+ }]


class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
- _VALID_URL = r'https?://(?:www\.)?youtube\.com/feed/history|:ythistory'
+ _VALID_URL = r':ythistory'
_FEED_NAME = 'history'
- _PLAYLIST_TITLE = 'Youtube History'
+ _TESTS = [{
+ 'url': ':ythistory',
+ 'only_matching': True,
+ }]


class YoutubeTruncatedURLIE(InfoExtractor):
@@ -3567,3 +3711,25 @@ class YoutubeTruncatedIDIE(InfoExtractor):
raise ExtractorError(
'Incomplete YouTube ID %s. URL %s looks truncated.' % (video_id, url),
expected=True)
+
+
+# Do YouTube show URLs even exist anymore? I couldn't find any.
+r'''
+class YoutubeShowIE(YoutubeTabIE):
+ IE_DESC = 'YouTube.com (multi-season) shows'
+ _VALID_URL = r'https?://(?:www\.)?youtube\.com/show/(?P<id>[^?#]*)'
+ IE_NAME = 'youtube:show'
+ _TESTS = [{
+ 'url': 'https://www.youtube.com/show/airdisasters',
+ 'playlist_mincount': 5,
+ 'info_dict': {
+ 'id': 'airdisasters',
+ 'title': 'Air Disasters',
+ }
+ }]
+
+ def _real_extract(self, url):
+ playlist_id = self._match_id(url)
+ return super(YoutubeShowIE, self)._real_extract(
+ 'https://www.youtube.com/show/%s/playlists' % playlist_id)
+'''
diff --git a/youtube_dlc/extractor/zoom.py b/youtube_dlc/extractor/zoom.py
new file mode 100644
index 000000000..038a90297
--- /dev/null
+++ b/youtube_dlc/extractor/zoom.py
@@ -0,0 +1,82 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ url_or_none,
+ parse_filesize,
+ urlencode_postdata
+)
+
+
+class ZoomIE(InfoExtractor):
+ IE_NAME = 'zoom'
+ _VALID_URL = r'https?://(?:[\w-]+\.)*zoom\.us/rec(?:ording)?/(?:play|share)/(?P<id>[A-Za-z0-9\-_.]+)'
+
+ _TEST = {
+ 'url': 'https://zoom.us/recording/play/SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK',
+ 'md5': '031a5b379f1547a8b29c5c4c837dccf2',
+ 'info_dict': {
+ 'id': 'SILVuCL4bFtRwWTtOCFQQxAsBQsJljFtm9e4Z_bvo-A8B-nzUSYZRNuPl3qW5IGK',
+ 'ext': 'mp4',
+ 'title': 'GAZ Transformational Tuesdays W/ Landon & Stapes',
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
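+ # password-protected recordings render a <form id="password_form">;
+ # validate the password first, then re-fetch the page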
+ password_protected = self._search_regex(
+ r'<form[^>]+?id="password_form"', webpage, 'password form',
+ default=None, fatal=False)
+ if password_protected is not None:
+ self._verify_video_password(url, display_id, webpage)
+ webpage = self._download_webpage(url, display_id)
+
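+ # 'viewResolvtions*' appears misspelled in Zoom's own page markup,
+ # so the regexes below must keep that spelling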
+ video_url = self._search_regex(r"viewMp4Url: \'(.*)\'", webpage, 'video url')
+ title = self._html_search_regex([r"topic: \"(.*)\",", r"<title>(.*) - Zoom</title>"], webpage, 'title')
+ width = self._search_regex(r"viewResolvtionsWidth: (\d*)", webpage, 'resolution width', fatal=False)
+ height = self._search_regex(r"viewResolvtionsHeight: (\d*)", webpage, 'resolution height', fatal=False)
+ file_size = parse_filesize(self._search_regex(r"fileSize: \'(.+)\'", webpage, 'file size', fatal=False))
+
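+ # downloads appear to require a Referer from the same zoom.us host
+ # as the input URL, so rebuild the prefix from it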
+ url_prefix = url.split('zoom.us')[0] + 'zoom.us/'
+
+ formats = [{
+ 'url': url_or_none(video_url),
+ 'width': int_or_none(width),
+ 'height': int_or_none(height),
+ 'http_headers': {
+ 'Accept': 'video/webm,video/ogg,video/*;q=0.9,application/ogg;q=0.7,audio/*;q=0.6,*/*;q=0.5',
+ 'Referer': url_prefix,
+ },
+ 'ext': 'mp4',
+ 'filesize_approx': int_or_none(file_size),
+ }]
+ self._sort_formats(formats)
+
+ return {
+ 'id': display_id,
+ 'title': title,
+ 'formats': formats
+ }
+
+ def _verify_video_password(self, url, video_id, webpage):
+ password = self._downloader.params.get('videopassword')
+ if password is None:
+ raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
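+ # the hidden "meetId" form input identifies the recording when
+ # validating the password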
+ meet_id = self._search_regex(r'<input[^>]+?id="meetId" value="([^\"]+)"', webpage, 'meeting id')
+ data = urlencode_postdata({
+ 'id': meet_id,
+ 'passwd': password,
+ 'action': 'viewdetailedpage',
+ 'recaptcha': '',
+ })
+ validation_url = url.split('zoom.us')[0] + 'zoom.us/rec/validate_meet_passwd'
+ validation_response = self._download_json(
+ validation_url, video_id,
+ note='Validating Password...',
+ errnote='Wrong password?',
+ data=data)
+
+ if validation_response['errorCode'] != 0:
+ raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, validation_response['errorMessage']))
diff --git a/youtube_dlc/options.py b/youtube_dlc/options.py
index 1d7a7fed2..9ad8a6ddd 100644
--- a/youtube_dlc/options.py
+++ b/youtube_dlc/options.py
@@ -345,6 +345,10 @@ def parseOpts(overrideArguments=None):
dest='download_archive',
help='Download only videos not listed in the archive file. Record the IDs of all downloaded videos in it.')
selection.add_option(
+ '--break-on-existing',
+ action='store_true', dest='break_on_existing', default=False,
+ help="Stop the download process after attempting to download a file that's in the archive.")
+ selection.add_option(
'--include-ads',
dest='include_ads', action='store_true',
help='Download advertisements as well (experimental)')
@@ -582,7 +586,7 @@ def parseOpts(overrideArguments=None):
'along with --min-sleep-interval.'))
workarounds.add_option(
'--sleep-subtitles',
- dest='sleep_interval_subtitles', action='store_true', default=False,
+ dest='sleep_interval_subtitles', default=0, type=int,
help='Enforce sleep interval on subtitles as well')
verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options')
diff --git a/youtube_dlc/postprocessor/embedthumbnail.py b/youtube_dlc/postprocessor/embedthumbnail.py
index 4a0d02fc4..e9f2161a0 100644
--- a/youtube_dlc/postprocessor/embedthumbnail.py
+++ b/youtube_dlc/postprocessor/embedthumbnail.py
@@ -89,12 +89,15 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
elif info['ext'] == 'mkv':
- os.rename(encodeFilename(thumbnail_filename), encodeFilename('cover.jpg'))
old_thumbnail_filename = thumbnail_filename
- thumbnail_filename = 'cover.jpg'
+ thumbnail_filename = os.path.join(os.path.dirname(old_thumbnail_filename), 'cover.jpg')
+ if os.path.exists(thumbnail_filename):
+ os.remove(encodeFilename(thumbnail_filename))
+ os.rename(encodeFilename(old_thumbnail_filename), encodeFilename(thumbnail_filename))
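+ # Matroska players recognise an attachment named cover.jpg with a
+ # JPEG mimetype as cover art; '-map 0' keeps all original streams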
options = [
- '-c', 'copy', '-attach', thumbnail_filename, '-metadata:s:t', 'mimetype=image/jpeg']
+ '-c', 'copy', '-map', '0',
+ '-attach', thumbnail_filename, '-metadata:s:t', 'mimetype=image/jpeg']
self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename)
@@ -140,6 +143,6 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
else:
- raise EmbedThumbnailPPError('Only mp3 and m4a/mp4 are supported for thumbnail embedding for now.')
+ raise EmbedThumbnailPPError('Only mp3, mkv, m4a and mp4 are supported for thumbnail embedding for now.')
return [], info
diff --git a/youtube_dlc/postprocessor/ffmpeg.py b/youtube_dlc/postprocessor/ffmpeg.py
index 5e85f4eeb..c7071d73d 100644
--- a/youtube_dlc/postprocessor/ffmpeg.py
+++ b/youtube_dlc/postprocessor/ffmpeg.py
@@ -359,7 +359,7 @@ class FFmpegVideoRemuxerPP(FFmpegPostProcessor):
if information['ext'] == self._preferedformat:
self._downloader.to_screen('[ffmpeg] Not remuxing video file %s - already is in target format %s' % (path, self._preferedformat))
return [], information
- options = ['-c', 'copy']
+ options = ['-c', 'copy', '-map', '0']
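+ # '-map 0' copies every input stream; by default ffmpeg keeps only
+ # one stream per type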
prefix, sep, ext = path.rpartition('.')
outpath = prefix + sep + self._preferedformat
self._downloader.to_screen('[' + 'ffmpeg' + '] Remuxing video from %s to %s, Destination: ' % (information['ext'], self._preferedformat) + outpath)
@@ -412,7 +412,9 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
for lang, sub_info in subtitles.items():
sub_ext = sub_info['ext']
- if ext != 'webm' or ext == 'webm' and sub_ext == 'vtt':
+ if sub_ext == 'json':
+ self._downloader.to_screen('[ffmpeg] JSON subtitles cannot be embedded')
+ elif ext != 'webm' or ext == 'webm' and sub_ext == 'vtt':
sub_langs.append(lang)
sub_filenames.append(subtitles_filename(filename, lang, sub_ext, ext))
else:
@@ -426,8 +428,7 @@ class FFmpegEmbedSubtitlePP(FFmpegPostProcessor):
input_files = [filename] + sub_filenames
opts = [
- '-map', '0',
- '-c', 'copy',
+ '-c', 'copy', '-map', '0',
# Don't copy the existing subtitles, we may be running the
# postprocessor a second time
'-map', '-0:s',
@@ -577,7 +578,7 @@ class FFmpegFixupStretchedPP(FFmpegPostProcessor):
filename = info['filepath']
temp_filename = prepend_extension(filename, 'temp')
- options = ['-c', 'copy', '-aspect', '%f' % stretched_ratio]
+ options = ['-c', 'copy', '-map', '0', '-aspect', '%f' % stretched_ratio]
self._downloader.to_screen('[ffmpeg] Fixing aspect ratio in "%s"' % filename)
self.run_ffmpeg(filename, temp_filename, options)
@@ -595,7 +596,7 @@ class FFmpegFixupM4aPP(FFmpegPostProcessor):
filename = info['filepath']
temp_filename = prepend_extension(filename, 'temp')
- options = ['-c', 'copy', '-f', 'mp4']
+ options = ['-c', 'copy', '-map', '0', '-f', 'mp4']
self._downloader.to_screen('[ffmpeg] Correcting container in "%s"' % filename)
self.run_ffmpeg(filename, temp_filename, options)
@@ -611,7 +612,7 @@ class FFmpegFixupM3u8PP(FFmpegPostProcessor):
if self.get_audio_codec(filename) == 'aac':
temp_filename = prepend_extension(filename, 'temp')
- options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
+ options = ['-c', 'copy', '-map', '0', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
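+ # the aac_adtstoasc bitstream filter rewrites the ADTS audio framing
+ # used in HLS segments into the MP4 (ASC) form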
self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename)
self.run_ffmpeg(filename, temp_filename, options)
@@ -643,13 +644,18 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
self._downloader.to_screen(
'[ffmpeg] Subtitle file for %s is already in the requested format' % new_ext)
continue
+ elif ext == 'json':
+ self._downloader.to_screen(
+ '[ffmpeg] You have requested to convert json subtitles into another format, '
+ 'which is currently not possible')
+ continue
old_file = subtitles_filename(filename, lang, ext, info.get('ext'))
sub_filenames.append(old_file)
new_file = subtitles_filename(filename, lang, new_ext, info.get('ext'))
if ext in ('dfxp', 'ttml', 'tt'):
self._downloader.report_warning(
- 'You have requested to convert dfxp (TTML) subtitles into another format, '
+ '[ffmpeg] You have requested to convert dfxp (TTML) subtitles into another format, '
'which results in style information loss')
dfxp_file = old_file
diff --git a/youtube_dlc/update.py b/youtube_dlc/update.py
index e49e09c17..b358e902b 100644
--- a/youtube_dlc/update.py
+++ b/youtube_dlc/update.py
@@ -37,10 +37,26 @@ def update_self(to_screen, verbose, opener):
JSON_URL = UPDATE_URL + 'versions.json'
UPDATES_RSA_KEY = (0x9d60ee4d8f805312fdb15a62f87b95bd66177b91df176765d13514a0f1754bcd2057295c5b6f1d35daa6742c3ffc9a82d3e118861c207995a8031e151d863c9927e304576bc80692bc8e094896fcf11b66f3e29e04e3a71e9a11558558acea1840aec37fc396fb6b65dc81a1c4144e03bd1c011de62e3f1357b327d08426fe93, 65537)
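+ # hash the running binary in 128 KiB chunks so the whole file never
+ # has to be read into memory at once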
+ def sha256sum():
+ h = hashlib.sha256()
+ b = bytearray(128 * 1024)
+ mv = memoryview(b)
+ with open(os.path.realpath(sys.executable), 'rb', buffering=0) as f:
+ for n in iter(lambda: f.readinto(mv), 0):
+ h.update(mv[:n])
+ return h.hexdigest()
+
+ to_screen('Current Build Hash %s' % sha256sum())
+
if not isinstance(globals().get('__loader__'), zipimporter) and not hasattr(sys, 'frozen'):
to_screen('It looks like you installed youtube-dlc with a package manager, pip, setup.py or a tarball. Please use that to update.')
return
+ # a compiled .exe can locate itself through sys.executable:
+ # os.path.basename(sys.executable) -> executable name
+ # os.path.realpath(sys.executable) -> full path to the .py or .exe
+
# Check if there is a new version
try:
newversion = opener.open(VERSION_URL).read().decode('utf-8').strip()
@@ -48,6 +64,7 @@ def update_self(to_screen, verbose, opener):
if verbose:
to_screen(encode_compat_str(traceback.format_exc()))
to_screen('ERROR: can\'t find the current version. Please try again later.')
+ to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest')
return
if newversion == __version__:
to_screen('youtube-dlc is up-to-date (' + __version__ + ')')
@@ -61,6 +78,7 @@ def update_self(to_screen, verbose, opener):
if verbose:
to_screen(encode_compat_str(traceback.format_exc()))
to_screen('ERROR: can\'t obtain versions info. Please try again later.')
+ to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest')
return
if 'signature' not in versions_info:
to_screen('ERROR: the versions file is not signed or corrupted. Aborting.')
@@ -109,6 +127,7 @@ def update_self(to_screen, verbose, opener):
if verbose:
to_screen(encode_compat_str(traceback.format_exc()))
to_screen('ERROR: unable to download latest version')
+ to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest')
return
newcontent_hash = hashlib.sha256(newcontent).hexdigest()
@@ -155,6 +174,7 @@ start /b "" cmd /c del "%%~f0"&exit /b"
if verbose:
to_screen(encode_compat_str(traceback.format_exc()))
to_screen('ERROR: unable to download latest version')
+ to_screen('Visit https://github.com/blackjack4494/yt-dlc/releases/latest')
return
newcontent_hash = hashlib.sha256(newcontent).hexdigest()
diff --git a/youtube_dlc/utils.py b/youtube_dlc/utils.py
index 54a4ea2aa..68b4ca944 100644
--- a/youtube_dlc/utils.py
+++ b/youtube_dlc/utils.py
@@ -2320,8 +2320,8 @@ def bug_reports_message():
if ytdl_is_updateable():
update_cmd = 'type youtube-dlc -U to update'
else:
- update_cmd = 'see https://yt-dl.org/update on how to update'
- msg = '; please report this issue on https://yt-dl.org/bug .'
+ update_cmd = 'see https://github.com/blackjack4494/yt-dlc on how to update'
+ msg = '; please report this issue on https://github.com/blackjack4494/yt-dlc .'
msg += ' Make sure you are using the latest version; %s.' % update_cmd
msg += ' Be sure to call youtube-dlc with the --verbose flag and include its complete output.'
return msg
@@ -2460,7 +2460,7 @@ class XAttrMetadataError(YoutubeDLError):
# Parsing code and msg
if (self.code in (errno.ENOSPC, errno.EDQUOT)
- or 'No space left' in self.msg or 'Disk quota excedded' in self.msg):
+ or 'No space left' in self.msg or 'Disk quota exceeded' in self.msg):
self.reason = 'NO_SPACE'
elif self.code == errno.E2BIG or 'Argument list too long' in self.msg:
self.reason = 'VALUE_TOO_LONG'
@@ -4085,7 +4085,7 @@ def js_to_json(code):
v = m.group(0)
if v in ('true', 'false', 'null'):
return v
- elif v.startswith('/*') or v.startswith('//') or v == ',':
+ elif v.startswith('/*') or v.startswith('//') or v.startswith('!') or v == ',':
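+ # runs of '!' come from minified-JS shorthands such as !0/!1;
+ # strip them so the remaining token parses as plain JSON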
return ""
if v[0] in ("'", '"'):
@@ -4095,12 +4095,12 @@ def js_to_json(code):
'\\\n': '',
'\\x': '\\u00',
}.get(m.group(0), m.group(0)), v[1:-1])
-
- for regex, base in INTEGER_TABLE:
- im = re.match(regex, v)
- if im:
- i = int(im.group(1), base)
- return '"%d":' % i if v.endswith(':') else '%d' % i
+ else:
+ for regex, base in INTEGER_TABLE:
+ im = re.match(regex, v)
+ if im:
+ i = int(im.group(1), base)
+ return '"%d":' % i if v.endswith(':') else '%d' % i
return '"%s"' % v
@@ -4110,7 +4110,8 @@ def js_to_json(code):
{comment}|,(?={skip}[\]}}])|
(?:(?<![0-9])[eE]|[a-df-zA-DF-Z_])[.a-zA-Z_0-9]*|
\b(?:0[xX][0-9a-fA-F]+|0+[0-7]+)(?:{skip}:)?|
- [0-9]+(?={skip}:)
+ [0-9]+(?={skip}:)|
+ !+
'''.format(comment=COMMENT_RE, skip=SKIP_RE), fix_kv, code)
@@ -4214,10 +4215,10 @@ def parse_codecs(codecs_str):
# http://tools.ietf.org/html/rfc6381
if not codecs_str:
return {}
- splited_codecs = list(filter(None, map(
+ split_codecs = list(filter(None, map(
lambda str: str.strip(), codecs_str.strip().strip(',').split(','))))
vcodec, acodec = None, None
- for full_codec in splited_codecs:
+ for full_codec in split_codecs:
codec = full_codec.split('.')[0]
if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v', 'hvc1', 'av01', 'theora'):
if not vcodec:
@@ -4228,10 +4229,10 @@ def parse_codecs(codecs_str):
else:
write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
if not vcodec and not acodec:
- if len(splited_codecs) == 2:
+ if len(split_codecs) == 2:
return {
- 'vcodec': splited_codecs[0],
- 'acodec': splited_codecs[1],
+ 'vcodec': split_codecs[0],
+ 'acodec': split_codecs[1],
}
else:
return {
@@ -5470,7 +5471,7 @@ def encode_base_n(num, n, table=None):
def decode_packed_codes(code):
mobj = re.search(PACKED_CODES_RE, code)
- obfucasted_code, base, count, symbols = mobj.groups()
+ obfuscated_code, base, count, symbols = mobj.groups()
base = int(base)
count = int(count)
symbols = symbols.split('|')
@@ -5483,7 +5484,7 @@ def decode_packed_codes(code):
return re.sub(
r'\b(\w+)\b', lambda mobj: symbol_table[mobj.group(0)],
- obfucasted_code)
+ obfuscated_code)
def caesar(s, alphabet, shift):
diff --git a/youtube_dlc/version.py b/youtube_dlc/version.py
index 440d8e488..201a981cf 100644
--- a/youtube_dlc/version.py
+++ b/youtube_dlc/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2020.10.25'
+__version__ = '2020.11.11-2'