aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--.github/ISSUE_TEMPLATE/1_broken_site.md6
-rw-r--r--.github/ISSUE_TEMPLATE/2_site_support_request.md4
-rw-r--r--.github/ISSUE_TEMPLATE/3_site_feature_request.md4
-rw-r--r--.github/ISSUE_TEMPLATE/4_bug_report.md6
-rw-r--r--.github/ISSUE_TEMPLATE/5_feature_request.md4
-rw-r--r--.travis.yml6
-rw-r--r--ChangeLog227
-rw-r--r--README.md4
-rw-r--r--docs/supportedsites.md35
-rw-r--r--test/test_all_urls.py6
-rw-r--r--test/test_utils.py24
-rw-r--r--youtube_dl/downloader/hls.py4
-rw-r--r--youtube_dl/extractor/abcotvs.py79
-rw-r--r--youtube_dl/extractor/addanime.py95
-rw-r--r--youtube_dl/extractor/adobetv.py241
-rw-r--r--youtube_dl/extractor/bambuser.py142
-rw-r--r--youtube_dl/extractor/bellmedia.py4
-rw-r--r--youtube_dl/extractor/bitchute.py7
-rw-r--r--youtube_dl/extractor/channel9.py20
-rw-r--r--youtube_dl/extractor/chaturbate.py42
-rw-r--r--youtube_dl/extractor/comcarcoff.py74
-rw-r--r--youtube_dl/extractor/common.py32
-rw-r--r--youtube_dl/extractor/corus.py169
-rw-r--r--youtube_dl/extractor/dailymotion.py561
-rw-r--r--youtube_dl/extractor/daisuki.py154
-rw-r--r--youtube_dl/extractor/daum.py106
-rw-r--r--youtube_dl/extractor/discoverynetworks.py63
-rw-r--r--youtube_dl/extractor/dplay.py403
-rw-r--r--youtube_dl/extractor/drtv.py57
-rw-r--r--youtube_dl/extractor/extractors.py65
-rw-r--r--youtube_dl/extractor/facebook.py2
-rw-r--r--youtube_dl/extractor/flipagram.py115
-rw-r--r--youtube_dl/extractor/fox9.py43
-rw-r--r--youtube_dl/extractor/gameone.py134
-rw-r--r--youtube_dl/extractor/generic.py46
-rw-r--r--youtube_dl/extractor/go.py53
-rw-r--r--youtube_dl/extractor/go90.py149
-rw-r--r--youtube_dl/extractor/hark.py33
-rw-r--r--youtube_dl/extractor/hotstar.py9
-rw-r--r--youtube_dl/extractor/iconosquare.py85
-rw-r--r--youtube_dl/extractor/imggaming.py133
-rw-r--r--youtube_dl/extractor/internetvideoarchive.py92
-rw-r--r--youtube_dl/extractor/ivi.py109
-rw-r--r--youtube_dl/extractor/jamendo.py169
-rw-r--r--youtube_dl/extractor/kakao.py60
-rw-r--r--youtube_dl/extractor/keek.py39
-rw-r--r--youtube_dl/extractor/kinja.py221
-rw-r--r--youtube_dl/extractor/kontrtube.py73
-rw-r--r--youtube_dl/extractor/la7.py4
-rw-r--r--youtube_dl/extractor/learnr.py33
-rw-r--r--youtube_dl/extractor/lnkgo.py100
-rw-r--r--youtube_dl/extractor/macgamestore.py42
-rw-r--r--youtube_dl/extractor/makertv.py32
-rw-r--r--youtube_dl/extractor/mediaset.py25
-rw-r--r--youtube_dl/extractor/minhateca.py70
-rw-r--r--youtube_dl/extractor/mixcloud.py509
-rw-r--r--youtube_dl/extractor/msn.py196
-rw-r--r--youtube_dl/extractor/mtv.py88
-rw-r--r--youtube_dl/extractor/musicplayon.py66
-rw-r--r--youtube_dl/extractor/myspass.py77
-rw-r--r--youtube_dl/extractor/nbc.py84
-rw-r--r--youtube_dl/extractor/nexx.py14
-rw-r--r--youtube_dl/extractor/nintendo.py28
-rw-r--r--youtube_dl/extractor/nrl.py4
-rw-r--r--youtube_dl/extractor/ntvru.py49
-rw-r--r--youtube_dl/extractor/odnoklassniki.py9
-rw-r--r--youtube_dl/extractor/onet.py54
-rw-r--r--youtube_dl/extractor/onionstudios.py62
-rw-r--r--youtube_dl/extractor/ooyala.py103
-rw-r--r--youtube_dl/extractor/openload.py263
-rw-r--r--youtube_dl/extractor/patreon.py52
-rw-r--r--youtube_dl/extractor/periscope.py80
-rw-r--r--youtube_dl/extractor/revision3.py170
-rw-r--r--youtube_dl/extractor/roosterteeth.py55
-rw-r--r--youtube_dl/extractor/scte.py144
-rw-r--r--youtube_dl/extractor/seeker.py45
-rw-r--r--youtube_dl/extractor/shared.py27
-rw-r--r--youtube_dl/extractor/slideslive.py42
-rw-r--r--youtube_dl/extractor/soundcloud.py511
-rw-r--r--youtube_dl/extractor/streamango.py128
-rw-r--r--youtube_dl/extractor/stv.py89
-rw-r--r--youtube_dl/extractor/teachingchannel.py26
-rw-r--r--youtube_dl/extractor/teamcoco.py68
-rw-r--r--youtube_dl/extractor/telegraaf.py75
-rw-r--r--youtube_dl/extractor/tenplay.py55
-rw-r--r--youtube_dl/extractor/thesun.py14
-rw-r--r--youtube_dl/extractor/tutv.py36
-rw-r--r--youtube_dl/extractor/tv2.py105
-rw-r--r--youtube_dl/extractor/tv2dk.py154
-rw-r--r--youtube_dl/extractor/twitch.py122
-rw-r--r--youtube_dl/extractor/twitter.py578
-rw-r--r--youtube_dl/extractor/ufctv.py73
-rw-r--r--youtube_dl/extractor/videodetective.py11
-rw-r--r--youtube_dl/extractor/videopremium.py46
-rw-r--r--youtube_dl/extractor/vimeo.py177
-rw-r--r--youtube_dl/extractor/vk.py362
-rw-r--r--youtube_dl/extractor/vzaar.py31
-rw-r--r--youtube_dl/extractor/wistia.py15
-rw-r--r--youtube_dl/extractor/yahoo.py658
-rw-r--r--youtube_dl/extractor/youtube.py20
-rw-r--r--youtube_dl/utils.py97
-rw-r--r--youtube_dl/version.py2
102 files changed, 4609 insertions, 5580 deletions
diff --git a/.github/ISSUE_TEMPLATE/1_broken_site.md b/.github/ISSUE_TEMPLATE/1_broken_site.md
index f1afe704c..3a94bd621 100644
--- a/.github/ISSUE_TEMPLATE/1_broken_site.md
+++ b/.github/ISSUE_TEMPLATE/1_broken_site.md
@@ -18,7 +18,7 @@ title: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl:
-- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser.
- Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape.
- Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates.
@@ -26,7 +26,7 @@ Carefully read and work through this check list in order to prevent the most com
-->
- [ ] I'm reporting a broken site support
-- [ ] I've verified that I'm running youtube-dl version **2019.10.22**
+- [ ] I've verified that I'm running youtube-dl version **2019.11.28**
- [ ] I've checked that all provided URLs are alive and playable in a browser
- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped
- [ ] I've searched the bugtracker for similar issues including closed ones
@@ -41,7 +41,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v <
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
- [debug] youtube-dl version 2019.10.22
+ [debug] youtube-dl version 2019.11.28
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}
diff --git a/.github/ISSUE_TEMPLATE/2_site_support_request.md b/.github/ISSUE_TEMPLATE/2_site_support_request.md
index a4dc9b005..72bee12aa 100644
--- a/.github/ISSUE_TEMPLATE/2_site_support_request.md
+++ b/.github/ISSUE_TEMPLATE/2_site_support_request.md
@@ -19,7 +19,7 @@ labels: 'site-support-request'
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl:
-- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser.
- Make sure that site you are requesting is not dedicated to copyright infringement, see https://yt-dl.org/copyright-infringement. youtube-dl does not support such sites. In order for site support request to be accepted all provided example URLs should not violate any copyrights.
- Search the bugtracker for similar site support requests: http://yt-dl.org/search-issues. DO NOT post duplicates.
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com
-->
- [ ] I'm reporting a new site support request
-- [ ] I've verified that I'm running youtube-dl version **2019.10.22**
+- [ ] I've verified that I'm running youtube-dl version **2019.11.28**
- [ ] I've checked that all provided URLs are alive and playable in a browser
- [ ] I've checked that none of provided URLs violate any copyrights
- [ ] I've searched the bugtracker for similar site support requests including closed ones
diff --git a/.github/ISSUE_TEMPLATE/3_site_feature_request.md b/.github/ISSUE_TEMPLATE/3_site_feature_request.md
index 5bf86adce..ddf67e951 100644
--- a/.github/ISSUE_TEMPLATE/3_site_feature_request.md
+++ b/.github/ISSUE_TEMPLATE/3_site_feature_request.md
@@ -18,13 +18,13 @@ title: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl:
-- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
- Search the bugtracker for similar site feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates.
- Finally, put x into all relevant boxes (like this [x])
-->
- [ ] I'm reporting a site feature request
-- [ ] I've verified that I'm running youtube-dl version **2019.10.22**
+- [ ] I've verified that I'm running youtube-dl version **2019.11.28**
- [ ] I've searched the bugtracker for similar site feature requests including closed ones
diff --git a/.github/ISSUE_TEMPLATE/4_bug_report.md b/.github/ISSUE_TEMPLATE/4_bug_report.md
index 7aa5534e5..7122e2714 100644
--- a/.github/ISSUE_TEMPLATE/4_bug_report.md
+++ b/.github/ISSUE_TEMPLATE/4_bug_report.md
@@ -18,7 +18,7 @@ title: ''
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl:
-- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
- Make sure that all provided video/audio/playlist URLs (if any) are alive and playable in a browser.
- Make sure that all URLs and arguments with special characters are properly quoted or escaped as explained in http://yt-dl.org/escape.
- Search the bugtracker for similar issues: http://yt-dl.org/search-issues. DO NOT post duplicates.
@@ -27,7 +27,7 @@ Carefully read and work through this check list in order to prevent the most com
-->
- [ ] I'm reporting a broken site support issue
-- [ ] I've verified that I'm running youtube-dl version **2019.10.22**
+- [ ] I've verified that I'm running youtube-dl version **2019.11.28**
- [ ] I've checked that all provided URLs are alive and playable in a browser
- [ ] I've checked that all URLs and arguments with special characters are properly quoted or escaped
- [ ] I've searched the bugtracker for similar bug reports including closed ones
@@ -43,7 +43,7 @@ Add the `-v` flag to your command line you run youtube-dl with (`youtube-dl -v <
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
- [debug] youtube-dl version 2019.10.22
+ [debug] youtube-dl version 2019.11.28
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}
diff --git a/.github/ISSUE_TEMPLATE/5_feature_request.md b/.github/ISSUE_TEMPLATE/5_feature_request.md
index 5d3645e3d..a93882b39 100644
--- a/.github/ISSUE_TEMPLATE/5_feature_request.md
+++ b/.github/ISSUE_TEMPLATE/5_feature_request.md
@@ -19,13 +19,13 @@ labels: 'request'
<!--
Carefully read and work through this check list in order to prevent the most common mistakes and misuse of youtube-dl:
-- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.10.22. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
+- First of, make sure you are using the latest version of youtube-dl. Run `youtube-dl --version` and ensure your version is 2019.11.28. If it's not, see https://yt-dl.org/update on how to update. Issues with outdated version will be REJECTED.
- Search the bugtracker for similar feature requests: http://yt-dl.org/search-issues. DO NOT post duplicates.
- Finally, put x into all relevant boxes (like this [x])
-->
- [ ] I'm reporting a feature request
-- [ ] I've verified that I'm running youtube-dl version **2019.10.22**
+- [ ] I've verified that I'm running youtube-dl version **2019.11.28**
- [ ] I've searched the bugtracker for similar feature requests including closed ones
diff --git a/.travis.yml b/.travis.yml
index 6d16c2955..14d95fa84 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -21,6 +21,12 @@ matrix:
- python: 3.7
dist: xenial
env: YTDL_TEST_SET=download
+ - python: 3.8
+ dist: xenial
+ env: YTDL_TEST_SET=core
+ - python: 3.8
+ dist: xenial
+ env: YTDL_TEST_SET=download
- python: 3.8-dev
dist: xenial
env: YTDL_TEST_SET=core
diff --git a/ChangeLog b/ChangeLog
index 64233b03b..d2f17ee06 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,226 @@
+version <unreleased>
+
+Core
+* [utils] Improve str_to_int
++ [downloader/hls] Add ability to override AES decryption key URL (#17521)
+
+Extractors
++ [tv2dk:bornholm:play] Add support for play.tv2bornholm.dk (#23291)
++ [slideslive] Add support for url and vimeo service names (#23414)
+* [slideslive] Fix extraction (#23413)
+* [twitch:clips] Fix extraction (#23375)
++ [soundcloud] Add support for token protected embeds (#18954)
+* [vk] Improve extraction
+ * Fix User Videos extraction (#23356)
+ * Extract all videos for lists with more than 1000 videos (#23356)
+ + Add support for video albums (#14327, #14492)
+- [kontrtube] Remove extractor
+- [videopremium] Remove extractor
+- [musicplayon] Remove extractor (#9225)
++ [ufctv] Add support for ufcfightpass.imgdge.com and
+ ufcfightpass.imggaming.com (#23343)
++ [twitch] Extract m3u8 formats frame rate (#23333)
++ [imggaming] Add support for playlists and extract subtitles
++ [ufcarabia] Add support for UFC Arabia (#23312)
+* [ufctv] Fix extraction
+* [yahoo] Fix gyao brightcove player id (#23303)
+* [vzaar] Override AES decryption key URL (#17521)
++ [vzaar] Add support for AES HLS manifests (#17521, #23299)
+* [nrl] Fix extraction
+* [teachingchannel] Fix extraction
+* [nintendo] Fix extraction and partially add support for Nintendo Direct
+ videos (#4592)
++ [ooyala] Add better fallback values for domain and streams variables
++ [youtube] Add support youtubekids.com (#23272)
+* [tv2] Detect DRM protection
++ [tv2] Add support for katsomo.fi and mtv.fi (#10543)
+* [tv2] Fix tv2.no article extraction
+* [msn] Improve extraction
+ + Add support for YouTube and NBCSports embeds
+ + Add support for articles with multiple videos
+ * Improve AOL embed support
+ * Improve format extraction
+* [abcotvs] Relax URL regular expression and improve metadata extraction
+ (#18014)
+* [channel9] Reduce response size
+* [adobetv] Improve extraction
+ * Use OnDemandPagedList for list extractors
+ * Reduce show extraction requests
+ * Extract original video format and subtitles
+ + Add support for adobe tv embeds
+
+
+version 2019.11.28
+
+Core
++ [utils] Add generic caesar cipher and rot47
+* [utils] Handle rd-suffixed day parts in unified_strdate (#23199)
+
+Extractors
+* [vimeo] Improve extraction
+ * Fix review extraction
+ * Fix ondemand extraction
+ * Make password protected player case as an expected error (#22896)
+ * Simplify channel based extractors code
+- [openload] Remove extractor (#11999)
+- [verystream] Remove extractor
+- [streamango] Remove extractor (#15406)
+* [dailymotion] Improve extraction
+ * Extract http formats included in m3u8 manifest
+ * Fix user extraction (#3553, #21415)
+ + Add support for User Authentication (#11491)
+ * Fix password protected videos extraction (#23176)
+ * Respect age limit option and family filter cookie value (#18437)
+ * Handle video url playlist query param
+ * Report allowed countries for geo-restricted videos
+* [corus] Improve extraction
+ + Add support for Series Plus, W Network, YTV, ABC Spark, disneychannel.com
+ and disneylachaine.ca (#20861)
+ + Add support for self hosted videos (#22075)
+ * Detect DRM protection (#14910, #9164)
+* [vivo] Fix extraction (#22328, #22279)
++ [bitchute] Extract upload date (#22990, #23193)
+* [soundcloud] Update client id (#23214)
+
+
+version 2019.11.22
+
+Core
++ [extractor/common] Clean jwplayer description HTML tags
++ [extractor/common] Add data, headers and query to all major extract formats
+ methods
+
+Extractors
+* [chaturbate] Fix extraction (#23010, #23012)
++ [ntvru] Add support for non relative file URLs (#23140)
+* [vk] Fix wall audio thumbnails extraction (#23135)
+* [ivi] Fix format extraction (#21991)
+- [comcarcoff] Remove extractor
++ [drtv] Add support for new URL schema (#23059)
++ [nexx] Add support for Multi Player JS Setup (#23052)
++ [teamcoco] Add support for new videos (#23054)
+* [soundcloud] Check if the soundtrack has downloads left (#23045)
+* [facebook] Fix posts video data extraction (#22473)
+- [addanime] Remove extractor
+- [minhateca] Remove extractor
+- [daisuki] Remove extractor
+* [seeker] Fix extraction
+- [revision3] Remove extractors
+* [twitch] Fix video comments URL (#18593, #15828)
+* [twitter] Improve extraction
+ + Add support for generic embeds (#22168)
+ * Always extract http formats for native videos (#14934)
+ + Add support for Twitter Broadcasts (#21369)
+ + Extract more metadata
+ * Improve VMap format extraction
+ * Unify extraction code for both twitter statuses and cards
++ [twitch] Add support for Clip embed URLs
+* [lnkgo] Fix extraction (#16834)
+* [mixcloud] Improve extraction
+ * Improve metadata extraction (#11721)
+ * Fix playlist extraction (#22378)
+ * Fix user mixes extraction (#15197, #17865)
++ [kinja] Add support for Kinja embeds (#5756, #11282, #22237, #22384)
+* [onionstudios] Fix extraction
++ [hotstar] Pass Referer header to format requests (#22836)
+* [dplay] Minimize response size
++ [patreon] Extract uploader_id and filesize
+* [patreon] Minimize response size
+* [roosterteeth] Fix login request (#16094, #22689)
+
+
+version 2019.11.05
+
+Extractors
++ [scte] Add support for learning.scte.org (#22975)
++ [msn] Add support for Vidible and AOL embeds (#22195, #22227)
+* [myspass] Fix video URL extraction and improve metadata extraction (#22448)
+* [jamendo] Improve extraction
+ * Fix album extraction (#18564)
+ * Improve metadata extraction (#18565, #21379)
+* [mediaset] Relax URL guid matching (#18352)
++ [mediaset] Extract unprotected M3U and MPD manifests (#17204)
+* [telegraaf] Fix extraction
++ [bellmedia] Add support for marilyn.ca videos (#22193)
+* [stv] Fix extraction (#22928)
+- [iconosquare] Remove extractor
+- [keek] Remove extractor
+- [gameone] Remove extractor (#21778)
+- [flipagram] Remove extractor
+- [bambuser] Remove extractor
+* [wistia] Reduce embed extraction false positives
++ [wistia] Add support for inline embeds (#22931)
+- [go90] Remove extractor
+* [kakao] Remove raw request
++ [kakao] Extract format total bitrate
+* [daum] Fix VOD and Clip extraction (#15015)
+* [kakao] Improve extraction
+ + Add support for embed URLs
+ + Add support for Kakao Legacy vid based embed URLs
+ * Only extract fields used for extraction
+ * Strip description and extract tags
+* [mixcloud] Fix cloudcast data extraction (#22821)
+* [yahoo] Improve extraction
+ + Add support for live streams (#3597, #3779, #22178)
+ * Bypass cookie consent page for european domains (#16948, #22576)
+ + Add generic support for embeds (#20332)
+* [tv2] Fix and improve extraction (#22787)
++ [tv2dk] Add support for TV2 DK sites
+* [onet] Improve extraction
+ + Add support for onet100.vod.pl
+ + Extract m3u8 formats
+ * Correct audio only format info
+* [fox9] Fix extraction
+
+
+version 2019.10.29
+
+Core
+* [utils] Actualize major IPv4 address blocks per country
+
+Extractors
++ [go] Add support for abc.com and freeform.com (#22823, #22864)
++ [mtv] Add support for mtvjapan.com
+* [mtv] Fix extraction for mtv.de (#22113)
+* [videodetective] Fix extraction
+* [internetvideoarchive] Fix extraction
+* [nbcnews] Fix extraction (#12569, #12576, #21703, #21923)
+- [hark] Remove extractor
+- [tutv] Remove extractor
+- [learnr] Remove extractor
+- [macgamestore] Remove extractor
+* [la7] Update Kaltura service URL (#22358)
+* [thesun] Fix extraction (#16966)
+- [makertv] Remove extractor
++ [tenplay] Add support for 10play.com.au (#21446)
+* [soundcloud] Improve extraction
+ * Improve format extraction (#22123)
+ + Extract uploader_id and uploader_url (#21916)
+ + Extract all known thumbnails (#19071, #20659)
+ * Fix extraction for private playlists (#20976)
+ + Add support for playlist embeds (#20976)
+ * Skip preview formats (#22806)
+* [dplay] Improve extraction
+ + Add support for dplay.fi, dplay.jp and es.dplay.com (#16969)
+ * Fix it.dplay.com extraction (#22826)
+ + Extract creator, tags and thumbnails
+ * Handle playback API call errors
++ [discoverynetworks] Add support for dplay.co.uk
+* [vk] Improve extraction
+ + Add support for Odnoklassniki embeds
+ + Extract more videos from user lists (#4470)
+ + Fix wall post audio extraction (#18332)
+ * Improve error detection (#22568)
++ [odnoklassniki] Add support for embeds
+* [puhutv] Improve extraction
+ * Fix subtitles extraction
+ * Transform HLS URLs to HTTP URLs
+ * Improve metadata extraction
+* [ceskatelevize] Skip DRM media
++ [facebook] Extract subtitles (#22777)
+* [globo] Handle alternative hash signing method
+
+
version 2019.10.22
Core
@@ -412,7 +635,7 @@ Extractors
version 2019.04.17
Extractors
-* [openload] Randomize User-Agent (closes #20688)
+* [openload] Randomize User-Agent (#20688)
+ [openload] Add support for oladblock domains (#20471)
* [adn] Fix subtitle extraction (#12724)
+ [aol] Add support for localized websites
@@ -977,7 +1200,7 @@ Extractors
+ [youtube] Extract channel meta fields (#9676, #12939)
* [porntube] Fix extraction (#17541)
* [asiancrush] Fix extraction (#15630)
-+ [twitch:clips] Extend URL regular expression (closes #17559)
++ [twitch:clips] Extend URL regular expression (#17559)
+ [vzaar] Add support for HLS
* [tube8] Fix metadata extraction (#17520)
* [eporner] Extract JSON-LD (#17519)
diff --git a/README.md b/README.md
index c39b13616..01f975958 100644
--- a/README.md
+++ b/README.md
@@ -752,8 +752,8 @@ As a last resort, you can also uninstall the version installed by your package m
Afterwards, simply follow [our manual installation instructions](https://ytdl-org.github.io/youtube-dl/download.html):
```
-sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl
-sudo chmod a+x /usr/local/bin/youtube-dl
+sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl
+sudo chmod a+rx /usr/local/bin/youtube-dl
hash -r
```
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index a1b0edeeb..2744dfca8 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -26,7 +26,6 @@
- **AcademicEarth:Course**
- **acast**
- **acast:channel**
- - **AddAnime**
- **ADN**: Anime Digital Network
- **AdobeConnect**
- **AdobeTV**
@@ -76,8 +75,6 @@
- **awaan:video**
- **AZMedien**: AZ Medien videos
- **BaiduVideo**: 百度视频
- - **bambuser**
- - **bambuser:channel**
- **Bandcamp**
- **Bandcamp:album**
- **Bandcamp:weekly**
@@ -177,7 +174,6 @@
- **CNN**
- **CNNArticle**
- **CNNBlogs**
- - **ComCarCoff**
- **ComedyCentral**
- **ComedyCentralFullEpisodes**
- **ComedyCentralShortname**
@@ -205,8 +201,6 @@
- **dailymotion**
- **dailymotion:playlist**
- **dailymotion:user**
- - **DaisukiMotto**
- - **DaisukiMottoPlaylist**
- **daum.net**
- **daum.net:clip**
- **daum.net:playlist**
@@ -232,7 +226,6 @@
- **DouyuShow**
- **DouyuTV**: 斗鱼
- **DPlay**
- - **DPlayIt**
- **DRBonanza**
- **Dropbox**
- **DrTuber**
@@ -285,12 +278,12 @@
- **FiveThirtyEight**
- **FiveTV**
- **Flickr**
- - **Flipagram**
- **Folketinget**: Folketinget (ft.dk; Danish parliament)
- **FootyRoom**
- **Formula1**
- **FOX**
- **FOX9**
+ - **FOX9News**
- **Foxgay**
- **foxnews**: Fox News and Fox Business Video
- **foxnews:article**
@@ -316,8 +309,6 @@
- **FXNetworks**
- **Gaia**
- **GameInformer**
- - **GameOne**
- - **gameone:playlist**
- **GameSpot**
- **GameStar**
- **Gaskrank**
@@ -332,14 +323,12 @@
- **Globo**
- **GloboArticle**
- **Go**
- - **Go90**
- **GodTube**
- **Golem**
- **GoogleDrive**
- **Goshgay**
- **GPUTechConf**
- **Groupon**
- - **Hark**
- **hbo**
- **HearThisAt**
- **Heise**
@@ -368,7 +357,6 @@
- **Hungama**
- **HungamaSong**
- **Hypem**
- - **Iconosquare**
- **ign.com**
- **imdb**: Internet Movie Database trailers
- **imdb:list**: Internet Movie Database lists
@@ -408,11 +396,11 @@
- **Kankan**
- **Karaoketv**
- **KarriereVideos**
- - **keek**
- **KeezMovies**
- **Ketnet**
- **KhanAcademy**
- **KickStarter**
+ - **KinjaEmbed**
- **KinoPoisk**
- **KonserthusetPlay**
- **kontrtube**: KontrTube.ru - Труба зовёт
@@ -432,7 +420,6 @@
- **Lcp**
- **LcpPlay**
- **Le**: 乐视网
- - **Learnr**
- **Lecture2Go**
- **Lecturio**
- **LecturioCourse**
@@ -466,11 +453,9 @@
- **lynda**: lynda.com videos
- **lynda:course**: lynda.com online courses
- **m6**
- - **macgamestore**: MacGameStore trailers
- **mailru**: Видео@Mail.Ru
- **mailru:music**: Музыка@Mail.Ru
- **mailru:music:search**: Музыка@Mail.Ru
- - **MakerTV**
- **MallTV**
- **mangomolo:live**
- **mangomolo:video**
@@ -497,14 +482,12 @@
- **Mgoon**
- **MGTV**: 芒果TV
- **MiaoPai**
- - **Minhateca**
- **MinistryGrid**
- **Minoto**
- **miomio.tv**
- **MiTele**: mitele.es
- **mixcloud**
- **mixcloud:playlist**
- - **mixcloud:stream**
- **mixcloud:user**
- **Mixer:live**
- **Mixer:vod**
@@ -526,8 +509,8 @@
- **mtg**: MTG services
- **mtv**
- **mtv.de**
- - **mtv81**
- **mtv:video**
+ - **mtvjapan**
- **mtvservices:embedded**
- **MuenchenTV**: münchen.tv
- **MusicPlayOn**
@@ -635,7 +618,6 @@
- **OnionStudios**
- **Ooyala**
- **OoyalaExternal**
- - **Openload**
- **OraTV**
- **orf:fm4**: radio FM4
- **orf:fm4:story**: fm4.orf.at stories
@@ -735,8 +717,6 @@
- **Restudy**
- **Reuters**
- **ReverbNation**
- - **revision**
- - **revision3:embed**
- **RICE**
- **RMCDecouverte**
- **RockstarGames**
@@ -782,6 +762,8 @@
- **Screencast**
- **ScreencastOMatic**
- **scrippsnetworks:watch**
+ - **SCTE**
+ - **SCTECourse**
- **Seeker**
- **SenateISVP**
- **SendtoNews**
@@ -815,6 +797,7 @@
- **soundcloud:set**
- **soundcloud:trackstation**
- **soundcloud:user**
+ - **SoundcloudEmbed**
- **soundgasm**
- **soundgasm:profile**
- **southpark.cc.com**
@@ -841,7 +824,6 @@
- **Steam**
- **Stitcher**
- **Streamable**
- - **Streamango**
- **streamcloud.eu**
- **StreamCZ**
- **StreetVoice**
@@ -887,6 +869,7 @@
- **TeleTask**
- **Telewebion**
- **TennisTV**
+ - **TenPlay**
- **TF1**
- **TFO**
- **TheIntercept**
@@ -925,11 +908,11 @@
- **tunein:topic**
- **TunePk**
- **Turbo**
- - **Tutv**
- **tv.dfb.de**
- **TV2**
- **tv2.hu**
- **TV2Article**
+ - **TV2DK**
- **TV4**: tv4.se and tv4play.se
- **TV5MondePlus**: TV5MONDE+
- **TVA**
@@ -966,6 +949,7 @@
- **twitch:vod**
- **twitter**
- **twitter:amplify**
+ - **twitter:broadcast**
- **twitter:card**
- **udemy**
- **udemy:course**
@@ -990,7 +974,6 @@
- **Vbox7**
- **VeeHD**
- **Veoh**
- - **verystream**
- **Vesti**: Вести.Ru
- **Vevo**
- **VevoPlaylist**
diff --git a/test/test_all_urls.py b/test/test_all_urls.py
index 465ce0050..81056a999 100644
--- a/test/test_all_urls.py
+++ b/test/test_all_urls.py
@@ -123,12 +123,6 @@ class TestAllURLsMatching(unittest.TestCase):
self.assertMatch('http://video.pbs.org/viralplayer/2365173446/', ['pbs'])
self.assertMatch('http://video.pbs.org/widget/partnerplayer/980042464/', ['pbs'])
- def test_yahoo_https(self):
- # https://github.com/ytdl-org/youtube-dl/issues/2701
- self.assertMatch(
- 'https://screen.yahoo.com/smartwatches-latest-wearable-gadgets-163745379-cbs.html',
- ['Yahoo'])
-
def test_no_duplicated_ie_names(self):
name_accu = collections.defaultdict(list)
for ie in self.ies:
diff --git a/test/test_utils.py b/test/test_utils.py
index 3920542bb..0896f4150 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -19,6 +19,7 @@ from youtube_dl.utils import (
age_restricted,
args_to_str,
encode_base_n,
+ caesar,
clean_html,
date_from_str,
DateRange,
@@ -69,6 +70,7 @@ from youtube_dl.utils import (
remove_start,
remove_end,
remove_quotes,
+ rot47,
shell_quote,
smuggle_url,
str_to_int,
@@ -340,6 +342,8 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_strdate('July 15th, 2013'), '20130715')
self.assertEqual(unified_strdate('September 1st, 2013'), '20130901')
self.assertEqual(unified_strdate('Sep 2nd, 2013'), '20130902')
+ self.assertEqual(unified_strdate('November 3rd, 2019'), '20191103')
+ self.assertEqual(unified_strdate('October 23rd, 2005'), '20051023')
def test_unified_timestamps(self):
self.assertEqual(unified_timestamp('December 21, 2010'), 1292889600)
@@ -495,6 +499,12 @@ class TestUtil(unittest.TestCase):
def test_str_to_int(self):
self.assertEqual(str_to_int('123,456'), 123456)
self.assertEqual(str_to_int('123.456'), 123456)
+ self.assertEqual(str_to_int(523), 523)
+ # Python 3 has no long
+ if sys.version_info < (3, 0):
+ eval('self.assertEqual(str_to_int(123456L), 123456)')
+ self.assertEqual(str_to_int('noninteger'), None)
+ self.assertEqual(str_to_int([]), None)
def test_url_basename(self):
self.assertEqual(url_basename('http://foo.de/'), '')
@@ -1367,6 +1377,20 @@ Line 1
self.assertRaises(ValueError, encode_base_n, 0, 70)
self.assertRaises(ValueError, encode_base_n, 0, 60, custom_table)
+ def test_caesar(self):
+ self.assertEqual(caesar('ace', 'abcdef', 2), 'cea')
+ self.assertEqual(caesar('cea', 'abcdef', -2), 'ace')
+ self.assertEqual(caesar('ace', 'abcdef', -2), 'eac')
+ self.assertEqual(caesar('eac', 'abcdef', 2), 'ace')
+ self.assertEqual(caesar('ace', 'abcdef', 0), 'ace')
+ self.assertEqual(caesar('xyz', 'abcdef', 2), 'xyz')
+ self.assertEqual(caesar('abc', 'acegik', 2), 'ebg')
+ self.assertEqual(caesar('ebg', 'acegik', -2), 'abc')
+
+ def test_rot47(self):
+ self.assertEqual(rot47('youtube-dl'), r'J@FEF36\5=')
+ self.assertEqual(rot47('YOUTUBE-DL'), r'*~&%&qt\s{')
+
def test_urshift(self):
self.assertEqual(urshift(3, 1), 1)
self.assertEqual(urshift(-3, 1), 2147483646)
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index b59aad73f..84bc34928 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -64,7 +64,7 @@ class HlsFD(FragmentFD):
s = urlh.read().decode('utf-8', 'ignore')
if not self.can_download(s, info_dict):
- if info_dict.get('extra_param_to_segment_url'):
+ if info_dict.get('extra_param_to_segment_url') or info_dict.get('_decryption_key_url'):
self.report_error('pycrypto not found. Please install it.')
return False
self.report_warning(
@@ -169,7 +169,7 @@ class HlsFD(FragmentFD):
if decrypt_info['METHOD'] == 'AES-128':
iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(
- self._prepare_url(info_dict, decrypt_info['URI'])).read()
+ self._prepare_url(info_dict, info_dict.get('_decryption_key_url') or decrypt_info['URI'])).read()
frag_content = AES.new(
decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
self._append_fragment(ctx, frag_content)
diff --git a/youtube_dl/extractor/abcotvs.py b/youtube_dl/extractor/abcotvs.py
index 03b92a39c..0bc69a64f 100644
--- a/youtube_dl/extractor/abcotvs.py
+++ b/youtube_dl/extractor/abcotvs.py
@@ -4,29 +4,30 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
+ dict_get,
int_or_none,
- parse_iso8601,
+ try_get,
)
class ABCOTVSIE(InfoExtractor):
IE_NAME = 'abcotvs'
IE_DESC = 'ABC Owned Television Stations'
- _VALID_URL = r'https?://(?:abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:/[^/]+/(?P<display_id>[^/]+))?/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?P<site>abc(?:7(?:news|ny|chicago)?|11|13|30)|6abc)\.com(?:(?:/[^/]+)*/(?P<display_id>[^/]+))?/(?P<id>\d+)'
_TESTS = [
{
'url': 'http://abc7news.com/entertainment/east-bay-museum-celebrates-vintage-synthesizers/472581/',
'info_dict': {
- 'id': '472581',
+ 'id': '472548',
'display_id': 'east-bay-museum-celebrates-vintage-synthesizers',
'ext': 'mp4',
- 'title': 'East Bay museum celebrates vintage synthesizers',
+ 'title': 'East Bay museum celebrates synthesized music',
'description': 'md5:24ed2bd527096ec2a5c67b9d5a9005f3',
'thumbnail': r're:^https?://.*\.jpg$',
- 'timestamp': 1421123075,
+ 'timestamp': 1421118520,
'upload_date': '20150113',
- 'uploader': 'Jonathan Bloom',
},
'params': {
# m3u8 download
@@ -37,39 +38,63 @@ class ABCOTVSIE(InfoExtractor):
'url': 'http://abc7news.com/472581',
'only_matching': True,
},
+ {
+ 'url': 'https://6abc.com/man-75-killed-after-being-struck-by-vehicle-in-chester/5725182/',
+ 'only_matching': True,
+ },
]
+ _SITE_MAP = {
+ '6abc': 'wpvi',
+ 'abc11': 'wtvd',
+ 'abc13': 'ktrk',
+ 'abc30': 'kfsn',
+ 'abc7': 'kabc',
+ 'abc7chicago': 'wls',
+ 'abc7news': 'kgo',
+ 'abc7ny': 'wabc',
+ }
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- display_id = mobj.group('display_id') or video_id
-
- webpage = self._download_webpage(url, display_id)
+ site, display_id, video_id = re.match(self._VALID_URL, url).groups()
+ display_id = display_id or video_id
+ station = self._SITE_MAP[site]
- m3u8 = self._html_search_meta(
- 'contentURL', webpage, 'm3u8 url', fatal=True).split('?')[0]
+ data = self._download_json(
+ 'https://api.abcotvs.com/v2/content', display_id, query={
+ 'id': video_id,
+ 'key': 'otv.web.%s.story' % station,
+ 'station': station,
+ })['data']
+ video = try_get(data, lambda x: x['featuredMedia']['video'], dict) or data
+ video_id = compat_str(dict_get(video, ('id', 'publishedKey'), video_id))
+ title = video.get('title') or video['linkText']
- formats = self._extract_m3u8_formats(m3u8, display_id, 'mp4')
+ formats = []
+ m3u8_url = video.get('m3u8')
+ if m3u8_url:
+ formats = self._extract_m3u8_formats(
+ video['m3u8'].split('?')[0], display_id, 'mp4', m3u8_id='hls', fatal=False)
+ mp4_url = video.get('mp4')
+ if mp4_url:
+ formats.append({
+ 'abr': 128,
+ 'format_id': 'https',
+ 'height': 360,
+ 'url': mp4_url,
+ 'width': 640,
+ })
self._sort_formats(formats)
- title = self._og_search_title(webpage).strip()
- description = self._og_search_description(webpage).strip()
- thumbnail = self._og_search_thumbnail(webpage)
- timestamp = parse_iso8601(self._search_regex(
- r'<div class="meta">\s*<time class="timeago" datetime="([^"]+)">',
- webpage, 'upload date', fatal=False))
- uploader = self._search_regex(
- r'rel="author">([^<]+)</a>',
- webpage, 'uploader', default=None)
+ image = video.get('image') or {}
return {
'id': video_id,
'display_id': display_id,
'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'uploader': uploader,
+ 'description': dict_get(video, ('description', 'caption'), try_get(video, lambda x: x['meta']['description'])),
+ 'thumbnail': dict_get(image, ('source', 'dynamicSource')),
+ 'timestamp': int_or_none(video.get('date')),
+ 'duration': int_or_none(video.get('length')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py
deleted file mode 100644
index 5e7c0724e..000000000
--- a/youtube_dl/extractor/addanime.py
+++ /dev/null
@@ -1,95 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_str,
- compat_urllib_parse_urlencode,
- compat_urllib_parse_urlparse,
-)
-from ..utils import (
- ExtractorError,
- qualities,
-)
-
-
-class AddAnimeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:\w+\.)?add-anime\.net/(?:watch_video\.php\?(?:.*?)v=|video/)(?P<id>[\w_]+)'
- _TESTS = [{
- 'url': 'http://www.add-anime.net/watch_video.php?v=24MR3YO5SAS9',
- 'md5': '72954ea10bc979ab5e2eb288b21425a0',
- 'info_dict': {
- 'id': '24MR3YO5SAS9',
- 'ext': 'mp4',
- 'description': 'One Piece 606',
- 'title': 'One Piece 606',
- },
- 'skip': 'Video is gone',
- }, {
- 'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- try:
- webpage = self._download_webpage(url, video_id)
- except ExtractorError as ee:
- if not isinstance(ee.cause, compat_HTTPError) or \
- ee.cause.code != 503:
- raise
-
- redir_webpage = ee.cause.read().decode('utf-8')
- action = self._search_regex(
- r'<form id="challenge-form" action="([^"]+)"',
- redir_webpage, 'Redirect form')
- vc = self._search_regex(
- r'<input type="hidden" name="jschl_vc" value="([^"]+)"/>',
- redir_webpage, 'redirect vc value')
- av = re.search(
- r'a\.value = ([0-9]+)[+]([0-9]+)[*]([0-9]+);',
- redir_webpage)
- if av is None:
- raise ExtractorError('Cannot find redirect math task')
- av_res = int(av.group(1)) + int(av.group(2)) * int(av.group(3))
-
- parsed_url = compat_urllib_parse_urlparse(url)
- av_val = av_res + len(parsed_url.netloc)
- confirm_url = (
- parsed_url.scheme + '://' + parsed_url.netloc
- + action + '?'
- + compat_urllib_parse_urlencode({
- 'jschl_vc': vc, 'jschl_answer': compat_str(av_val)}))
- self._download_webpage(
- confirm_url, video_id,
- note='Confirming after redirect')
- webpage = self._download_webpage(url, video_id)
-
- FORMATS = ('normal', 'hq')
- quality = qualities(FORMATS)
- formats = []
- for format_id in FORMATS:
- rex = r"var %s_video_file = '(.*?)';" % re.escape(format_id)
- video_url = self._search_regex(rex, webpage, 'video file URLx',
- fatal=False)
- if not video_url:
- continue
- formats.append({
- 'format_id': format_id,
- 'url': video_url,
- 'quality': quality(format_id),
- })
- self._sort_formats(formats)
- video_title = self._og_search_title(webpage)
- video_description = self._og_search_description(webpage)
-
- return {
- '_type': 'video',
- 'id': video_id,
- 'formats': formats,
- 'title': video_title,
- 'description': video_description
- }
diff --git a/youtube_dl/extractor/adobetv.py b/youtube_dl/extractor/adobetv.py
index 008c98e51..80060f037 100644
--- a/youtube_dl/extractor/adobetv.py
+++ b/youtube_dl/extractor/adobetv.py
@@ -1,25 +1,119 @@
from __future__ import unicode_literals
+import functools
import re
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
- parse_duration,
- unified_strdate,
- str_to_int,
- int_or_none,
float_or_none,
+ int_or_none,
ISO639Utils,
- determine_ext,
+ OnDemandPagedList,
+ parse_duration,
+ str_or_none,
+ str_to_int,
+ unified_strdate,
)
class AdobeTVBaseIE(InfoExtractor):
- _API_BASE_URL = 'http://tv.adobe.com/api/v4/'
+ def _call_api(self, path, video_id, query, note=None):
+ return self._download_json(
+ 'http://tv.adobe.com/api/v4/' + path,
+ video_id, note, query=query)['data']
+
+ def _parse_subtitles(self, video_data, url_key):
+ subtitles = {}
+ for translation in video_data.get('translations', []):
+ vtt_path = translation.get(url_key)
+ if not vtt_path:
+ continue
+ lang = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
+ subtitles.setdefault(lang, []).append({
+ 'ext': 'vtt',
+ 'url': vtt_path,
+ })
+ return subtitles
+
+ def _parse_video_data(self, video_data):
+ video_id = compat_str(video_data['id'])
+ title = video_data['title']
+
+ s3_extracted = False
+ formats = []
+ for source in video_data.get('videos', []):
+ source_url = source.get('url')
+ if not source_url:
+ continue
+ f = {
+ 'format_id': source.get('quality_level'),
+ 'fps': int_or_none(source.get('frame_rate')),
+ 'height': int_or_none(source.get('height')),
+ 'tbr': int_or_none(source.get('video_data_rate')),
+ 'width': int_or_none(source.get('width')),
+ 'url': source_url,
+ }
+ original_filename = source.get('original_filename')
+ if original_filename:
+ if not (f.get('height') and f.get('width')):
+ mobj = re.search(r'_(\d+)x(\d+)', original_filename)
+ if mobj:
+ f.update({
+ 'height': int(mobj.group(2)),
+ 'width': int(mobj.group(1)),
+ })
+ if original_filename.startswith('s3://') and not s3_extracted:
+ formats.append({
+ 'format_id': 'original',
+ 'preference': 1,
+ 'url': original_filename.replace('s3://', 'https://s3.amazonaws.com/'),
+ })
+ s3_extracted = True
+ formats.append(f)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': video_data.get('description'),
+ 'thumbnail': video_data.get('thumbnail'),
+ 'upload_date': unified_strdate(video_data.get('start_date')),
+ 'duration': parse_duration(video_data.get('duration')),
+ 'view_count': str_to_int(video_data.get('playcount')),
+ 'formats': formats,
+ 'subtitles': self._parse_subtitles(video_data, 'vtt'),
+ }
+
+
+class AdobeTVEmbedIE(AdobeTVBaseIE):
+ IE_NAME = 'adobetv:embed'
+ _VALID_URL = r'https?://tv\.adobe\.com/embed/\d+/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://tv.adobe.com/embed/22/4153',
+ 'md5': 'c8c0461bf04d54574fc2b4d07ac6783a',
+ 'info_dict': {
+ 'id': '4153',
+ 'ext': 'flv',
+ 'title': 'Creating Graphics Optimized for BlackBerry',
+ 'description': 'md5:eac6e8dced38bdaae51cd94447927459',
+ 'thumbnail': r're:https?://.*\.jpg$',
+ 'upload_date': '20091109',
+ 'duration': 377,
+ 'view_count': int,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video_data = self._call_api(
+ 'episode/' + video_id, video_id, {'disclosure': 'standard'})[0]
+ return self._parse_video_data(video_data)
class AdobeTVIE(AdobeTVBaseIE):
+ IE_NAME = 'adobetv'
_VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?watch/(?P<show_urlname>[^/]+)/(?P<id>[^/]+)'
_TEST = {
@@ -42,45 +136,33 @@ class AdobeTVIE(AdobeTVBaseIE):
if not language:
language = 'en'
- video_data = self._download_json(
- self._API_BASE_URL + 'episode/get/?language=%s&show_urlname=%s&urlname=%s&disclosure=standard' % (language, show_urlname, urlname),
- urlname)['data'][0]
-
- formats = [{
- 'url': source['url'],
- 'format_id': source.get('quality_level') or source['url'].split('-')[-1].split('.')[0] or None,
- 'width': int_or_none(source.get('width')),
- 'height': int_or_none(source.get('height')),
- 'tbr': int_or_none(source.get('video_data_rate')),
- } for source in video_data['videos']]
- self._sort_formats(formats)
-
- return {
- 'id': compat_str(video_data['id']),
- 'title': video_data['title'],
- 'description': video_data.get('description'),
- 'thumbnail': video_data.get('thumbnail'),
- 'upload_date': unified_strdate(video_data.get('start_date')),
- 'duration': parse_duration(video_data.get('duration')),
- 'view_count': str_to_int(video_data.get('playcount')),
- 'formats': formats,
- }
+ video_data = self._call_api(
+ 'episode/get', urlname, {
+ 'disclosure': 'standard',
+ 'language': language,
+ 'show_urlname': show_urlname,
+ 'urlname': urlname,
+ })[0]
+ return self._parse_video_data(video_data)
class AdobeTVPlaylistBaseIE(AdobeTVBaseIE):
- def _parse_page_data(self, page_data):
- return [self.url_result(self._get_element_url(element_data)) for element_data in page_data]
+ _PAGE_SIZE = 25
+
+ def _fetch_page(self, display_id, query, page):
+ page += 1
+ query['page'] = page
+ for element_data in self._call_api(
+ self._RESOURCE, display_id, query, 'Download Page %d' % page):
+ yield self._process_data(element_data)
- def _extract_playlist_entries(self, url, display_id):
- page = self._download_json(url, display_id)
- entries = self._parse_page_data(page['data'])
- for page_num in range(2, page['paging']['pages'] + 1):
- entries.extend(self._parse_page_data(
- self._download_json(url + '&page=%d' % page_num, display_id)['data']))
- return entries
+ def _extract_playlist_entries(self, display_id, query):
+ return OnDemandPagedList(functools.partial(
+ self._fetch_page, display_id, query), self._PAGE_SIZE)
class AdobeTVShowIE(AdobeTVPlaylistBaseIE):
+ IE_NAME = 'adobetv:show'
_VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?show/(?P<id>[^/]+)'
_TEST = {
@@ -92,26 +174,31 @@ class AdobeTVShowIE(AdobeTVPlaylistBaseIE):
},
'playlist_mincount': 136,
}
-
- def _get_element_url(self, element_data):
- return element_data['urls'][0]
+ _RESOURCE = 'episode'
+ _process_data = AdobeTVBaseIE._parse_video_data
def _real_extract(self, url):
language, show_urlname = re.match(self._VALID_URL, url).groups()
if not language:
language = 'en'
- query = 'language=%s&show_urlname=%s' % (language, show_urlname)
+ query = {
+ 'disclosure': 'standard',
+ 'language': language,
+ 'show_urlname': show_urlname,
+ }
- show_data = self._download_json(self._API_BASE_URL + 'show/get/?%s' % query, show_urlname)['data'][0]
+ show_data = self._call_api(
+ 'show/get', show_urlname, query)[0]
return self.playlist_result(
- self._extract_playlist_entries(self._API_BASE_URL + 'episode/?%s' % query, show_urlname),
- compat_str(show_data['id']),
- show_data['show_name'],
- show_data['show_description'])
+ self._extract_playlist_entries(show_urlname, query),
+ str_or_none(show_data.get('id')),
+ show_data.get('show_name'),
+ show_data.get('show_description'))
class AdobeTVChannelIE(AdobeTVPlaylistBaseIE):
+ IE_NAME = 'adobetv:channel'
_VALID_URL = r'https?://tv\.adobe\.com/(?:(?P<language>fr|de|es|jp)/)?channel/(?P<id>[^/]+)(?:/(?P<category_urlname>[^/]+))?'
_TEST = {
@@ -121,24 +208,30 @@ class AdobeTVChannelIE(AdobeTVPlaylistBaseIE):
},
'playlist_mincount': 96,
}
+ _RESOURCE = 'show'
- def _get_element_url(self, element_data):
- return element_data['url']
+ def _process_data(self, show_data):
+ return self.url_result(
+ show_data['url'], 'AdobeTVShow', str_or_none(show_data.get('id')))
def _real_extract(self, url):
language, channel_urlname, category_urlname = re.match(self._VALID_URL, url).groups()
if not language:
language = 'en'
- query = 'language=%s&channel_urlname=%s' % (language, channel_urlname)
+ query = {
+ 'channel_urlname': channel_urlname,
+ 'language': language,
+ }
if category_urlname:
- query += '&category_urlname=%s' % category_urlname
+ query['category_urlname'] = category_urlname
return self.playlist_result(
- self._extract_playlist_entries(self._API_BASE_URL + 'show/?%s' % query, channel_urlname),
+ self._extract_playlist_entries(channel_urlname, query),
channel_urlname)
-class AdobeTVVideoIE(InfoExtractor):
+class AdobeTVVideoIE(AdobeTVBaseIE):
+ IE_NAME = 'adobetv:video'
_VALID_URL = r'https?://video\.tv\.adobe\.com/v/(?P<id>\d+)'
_TEST = {
@@ -160,38 +253,36 @@ class AdobeTVVideoIE(InfoExtractor):
video_data = self._parse_json(self._search_regex(
r'var\s+bridge\s*=\s*([^;]+);', webpage, 'bridged data'), video_id)
-
- formats = [{
- 'format_id': '%s-%s' % (determine_ext(source['src']), source.get('height')),
- 'url': source['src'],
- 'width': int_or_none(source.get('width')),
- 'height': int_or_none(source.get('height')),
- 'tbr': int_or_none(source.get('bitrate')),
- } for source in video_data['sources']]
+ title = video_data['title']
+
+ formats = []
+ sources = video_data.get('sources') or []
+ for source in sources:
+ source_src = source.get('src')
+ if not source_src:
+ continue
+ formats.append({
+ 'filesize': int_or_none(source.get('kilobytes') or None, invscale=1000),
+ 'format_id': '-'.join(filter(None, [source.get('format'), source.get('label')])),
+ 'height': int_or_none(source.get('height') or None),
+ 'tbr': int_or_none(source.get('bitrate') or None),
+ 'width': int_or_none(source.get('width') or None),
+ 'url': source_src,
+ })
self._sort_formats(formats)
# For both metadata and downloaded files the duration varies among
# formats. I just pick the max one
duration = max(filter(None, [
float_or_none(source.get('duration'), scale=1000)
- for source in video_data['sources']]))
-
- subtitles = {}
- for translation in video_data.get('translations', []):
- lang_id = translation.get('language_w3c') or ISO639Utils.long2short(translation['language_medium'])
- if lang_id not in subtitles:
- subtitles[lang_id] = []
- subtitles[lang_id].append({
- 'url': translation['vttPath'],
- 'ext': 'vtt',
- })
+ for source in sources]))
return {
'id': video_id,
'formats': formats,
- 'title': video_data['title'],
+ 'title': title,
'description': video_data.get('description'),
- 'thumbnail': video_data['video'].get('poster'),
+ 'thumbnail': video_data.get('video', {}).get('poster'),
'duration': duration,
- 'subtitles': subtitles,
+ 'subtitles': self._parse_subtitles(video_data, 'vttPath'),
}
diff --git a/youtube_dl/extractor/bambuser.py b/youtube_dl/extractor/bambuser.py
deleted file mode 100644
index 4400ff9c1..000000000
--- a/youtube_dl/extractor/bambuser.py
+++ /dev/null
@@ -1,142 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-import itertools
-
-from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
- ExtractorError,
- float_or_none,
- int_or_none,
- sanitized_Request,
- urlencode_postdata,
-)
-
-
-class BambuserIE(InfoExtractor):
- IE_NAME = 'bambuser'
- _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)'
- _API_KEY = '005f64509e19a868399060af746a00aa'
- _LOGIN_URL = 'https://bambuser.com/user'
- _NETRC_MACHINE = 'bambuser'
-
- _TEST = {
- 'url': 'http://bambuser.com/v/4050584',
- # MD5 seems to be flaky, see https://travis-ci.org/ytdl-org/youtube-dl/jobs/14051016#L388
- # 'md5': 'fba8f7693e48fd4e8641b3fd5539a641',
- 'info_dict': {
- 'id': '4050584',
- 'ext': 'flv',
- 'title': 'Education engineering days - lightning talks',
- 'duration': 3741,
- 'uploader': 'pixelversity',
- 'uploader_id': '344706',
- 'timestamp': 1382976692,
- 'upload_date': '20131028',
- 'view_count': int,
- },
- 'params': {
- # It doesn't respect the 'Range' header, it would download the whole video
- # caused the travis builds to fail: https://travis-ci.org/ytdl-org/youtube-dl/jobs/14493845#L59
- 'skip_download': True,
- },
- }
-
- def _login(self):
- username, password = self._get_login_info()
- if username is None:
- return
-
- login_form = {
- 'form_id': 'user_login',
- 'op': 'Log in',
- 'name': username,
- 'pass': password,
- }
-
- request = sanitized_Request(
- self._LOGIN_URL, urlencode_postdata(login_form))
- request.add_header('Referer', self._LOGIN_URL)
- response = self._download_webpage(
- request, None, 'Logging in')
-
- login_error = self._html_search_regex(
- r'(?s)<div class="messages error">(.+?)</div>',
- response, 'login error', default=None)
- if login_error:
- raise ExtractorError(
- 'Unable to login: %s' % login_error, expected=True)
-
- def _real_initialize(self):
- self._login()
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- info = self._download_json(
- 'http://player-c.api.bambuser.com/getVideo.json?api_key=%s&vid=%s'
- % (self._API_KEY, video_id), video_id)
-
- error = info.get('error')
- if error:
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, error), expected=True)
-
- result = info['result']
-
- return {
- 'id': video_id,
- 'title': result['title'],
- 'url': result['url'],
- 'thumbnail': result.get('preview'),
- 'duration': int_or_none(result.get('length')),
- 'uploader': result.get('username'),
- 'uploader_id': compat_str(result.get('owner', {}).get('uid')),
- 'timestamp': int_or_none(result.get('created')),
- 'fps': float_or_none(result.get('framerate')),
- 'view_count': int_or_none(result.get('views_total')),
- 'comment_count': int_or_none(result.get('comment_count')),
- }
-
-
-class BambuserChannelIE(InfoExtractor):
- IE_NAME = 'bambuser:channel'
- _VALID_URL = r'https?://bambuser\.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
- # The maximum number we can get with each request
- _STEP = 50
- _TEST = {
- 'url': 'http://bambuser.com/channel/pixelversity',
- 'info_dict': {
- 'title': 'pixelversity',
- },
- 'playlist_mincount': 60,
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- user = mobj.group('user')
- urls = []
- last_id = ''
- for i in itertools.count(1):
- req_url = (
- 'http://bambuser.com/xhr-api/index.php?username={user}'
- '&sort=created&access_mode=0%2C1%2C2&limit={count}'
- '&method=broadcast&format=json&vid_older_than={last}'
- ).format(user=user, count=self._STEP, last=last_id)
- req = sanitized_Request(req_url)
- # Without setting this header, we wouldn't get any result
- req.add_header('Referer', 'http://bambuser.com/channel/%s' % user)
- data = self._download_json(
- req, user, 'Downloading page %d' % i)
- results = data['result']
- if not results:
- break
- last_id = results[-1]['vid']
- urls.extend(self.url_result(v['page'], 'Bambuser') for v in results)
-
- return {
- '_type': 'playlist',
- 'title': user,
- 'entries': urls,
- }
diff --git a/youtube_dl/extractor/bellmedia.py b/youtube_dl/extractor/bellmedia.py
index f36a2452d..485173774 100644
--- a/youtube_dl/extractor/bellmedia.py
+++ b/youtube_dl/extractor/bellmedia.py
@@ -22,7 +22,8 @@ class BellMediaIE(InfoExtractor):
bravo|
mtv|
space|
- etalk
+ etalk|
+ marilyn
)\.ca|
much\.com
)/.*?(?:\bvid(?:eoid)?=|-vid|~|%7E|/(?:episode)?)(?P<id>[0-9]{6,})'''
@@ -70,6 +71,7 @@ class BellMediaIE(InfoExtractor):
'animalplanet': 'aniplan',
'etalk': 'ctv',
'bnnbloomberg': 'bnn',
+ 'marilyn': 'ctv_marilyn',
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/bitchute.py b/youtube_dl/extractor/bitchute.py
index 430663fbf..0c773e66e 100644
--- a/youtube_dl/extractor/bitchute.py
+++ b/youtube_dl/extractor/bitchute.py
@@ -7,6 +7,7 @@ import re
from .common import InfoExtractor
from ..utils import (
orderedSet,
+ unified_strdate,
urlencode_postdata,
)
@@ -23,6 +24,7 @@ class BitChuteIE(InfoExtractor):
'description': 'md5:3f21f6fb5b1d17c3dee9cf6b5fe60b3a',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Victoria X Rave',
+ 'upload_date': '20170813',
},
}, {
'url': 'https://www.bitchute.com/embed/lbb5G1hjPhw/',
@@ -74,12 +76,17 @@ class BitChuteIE(InfoExtractor):
r'(?s)<p\b[^>]+\bclass=["\']video-author[^>]+>(.+?)</p>'),
webpage, 'uploader', fatal=False)
+ upload_date = unified_strdate(self._search_regex(
+ r'class=["\']video-publish-date[^>]+>[^<]+ at \d+:\d+ UTC on (.+?)\.',
+ webpage, 'upload date', fatal=False))
+
return {
'id': video_id,
'title': title,
'description': description,
'thumbnail': thumbnail,
'uploader': uploader,
+ 'upload_date': upload_date,
'formats': formats,
}
diff --git a/youtube_dl/extractor/channel9.py b/youtube_dl/extractor/channel9.py
index 81108e704..09cacf6d3 100644
--- a/youtube_dl/extractor/channel9.py
+++ b/youtube_dl/extractor/channel9.py
@@ -32,7 +32,7 @@ class Channel9IE(InfoExtractor):
'upload_date': '20130828',
'session_code': 'KOS002',
'session_room': 'Arena 1A',
- 'session_speakers': ['Andrew Coates', 'Brady Gaster', 'Mads Kristensen', 'Ed Blankenship', 'Patrick Klug'],
+ 'session_speakers': 'count:5',
},
}, {
'url': 'http://channel9.msdn.com/posts/Self-service-BI-with-Power-BI-nuclear-testing',
@@ -65,14 +65,14 @@ class Channel9IE(InfoExtractor):
'skip_download': True,
},
}, {
- 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
+ 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
'info_dict': {
- 'id': 'Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b',
- 'title': 'Channel 9',
+ 'id': 'Events/DEVintersection/DEVintersection-2016',
+ 'title': 'DEVintersection 2016 Orlando Sessions',
},
- 'playlist_mincount': 100,
+ 'playlist_mincount': 14,
}, {
- 'url': 'https://channel9.msdn.com/Events/DEVintersection/DEVintersection-2016/RSS',
+ 'url': 'https://channel9.msdn.com/Niners/Splendid22/Queue/76acff796e8f411184b008028e0d492b/RSS',
'only_matching': True,
}, {
'url': 'https://channel9.msdn.com/Events/Speakers/scott-hanselman/RSS?UrlSafeName=scott-hanselman',
@@ -112,11 +112,11 @@ class Channel9IE(InfoExtractor):
episode_data), content_path)
content_id = episode_data['contentId']
is_session = '/Sessions(' in episode_data['api']
- content_url = 'https://channel9.msdn.com/odata' + episode_data['api']
+ content_url = 'https://channel9.msdn.com/odata' + episode_data['api'] + '?$select=Captions,CommentCount,MediaLengthInSeconds,PublishedDate,Rating,RatingCount,Title,VideoMP4High,VideoMP4Low,VideoMP4Medium,VideoPlayerPreviewImage,VideoWMV,VideoWMVHQ,Views,'
if is_session:
- content_url += '?$expand=Speakers'
+ content_url += 'Code,Description,Room,Slides,Speakers,ZipFile&$expand=Speakers'
else:
- content_url += '?$expand=Authors'
+ content_url += 'Authors,Body&$expand=Authors'
content_data = self._download_json(content_url, content_id)
title = content_data['Title']
@@ -210,7 +210,7 @@ class Channel9IE(InfoExtractor):
'id': content_id,
'title': title,
'description': clean_html(content_data.get('Description') or content_data.get('Body')),
- 'thumbnail': content_data.get('Thumbnail') or content_data.get('VideoPlayerPreviewImage'),
+ 'thumbnail': content_data.get('VideoPlayerPreviewImage'),
'duration': int_or_none(content_data.get('MediaLengthInSeconds')),
'timestamp': parse_iso8601(content_data.get('PublishedDate')),
'avg_rating': int_or_none(content_data.get('Rating')),
diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py
index 656e715ae..a459dcb8d 100644
--- a/youtube_dl/extractor/chaturbate.py
+++ b/youtube_dl/extractor/chaturbate.py
@@ -3,7 +3,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ lowercase_escape,
+ url_or_none,
+)
class ChaturbateIE(InfoExtractor):
@@ -38,12 +42,31 @@ class ChaturbateIE(InfoExtractor):
'https://chaturbate.com/%s/' % video_id, video_id,
headers=self.geo_verification_headers())
- m3u8_urls = []
+ found_m3u8_urls = []
+
+ data = self._parse_json(
+ self._search_regex(
+ r'initialRoomDossier\s*=\s*(["\'])(?P<value>(?:(?!\1).)+)\1',
+ webpage, 'data', default='{}', group='value'),
+ video_id, transform_source=lowercase_escape, fatal=False)
+ if data:
+ m3u8_url = url_or_none(data.get('hls_source'))
+ if m3u8_url:
+ found_m3u8_urls.append(m3u8_url)
+
+ if not found_m3u8_urls:
+ for m in re.finditer(
+ r'(\\u002[27])(?P<url>http.+?\.m3u8.*?)\1', webpage):
+ found_m3u8_urls.append(lowercase_escape(m.group('url')))
- for m in re.finditer(
- r'(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage):
- m3u8_fast_url, m3u8_no_fast_url = m.group('url'), m.group(
- 'url').replace('_fast', '')
+ if not found_m3u8_urls:
+ for m in re.finditer(
+ r'(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage):
+ found_m3u8_urls.append(m.group('url'))
+
+ m3u8_urls = []
+ for found_m3u8_url in found_m3u8_urls:
+ m3u8_fast_url, m3u8_no_fast_url = found_m3u8_url, found_m3u8_url.replace('_fast', '')
for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url):
if m3u8_url not in m3u8_urls:
m3u8_urls.append(m3u8_url)
@@ -63,7 +86,12 @@ class ChaturbateIE(InfoExtractor):
formats = []
for m3u8_url in m3u8_urls:
- m3u8_id = 'fast' if '_fast' in m3u8_url else 'slow'
+ for known_id in ('fast', 'slow'):
+ if '_%s' % known_id in m3u8_url:
+ m3u8_id = known_id
+ break
+ else:
+ m3u8_id = None
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4',
# ffmpeg skips segments for fast m3u8
diff --git a/youtube_dl/extractor/comcarcoff.py b/youtube_dl/extractor/comcarcoff.py
deleted file mode 100644
index 588aad0d9..000000000
--- a/youtube_dl/extractor/comcarcoff.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
- int_or_none,
- parse_duration,
- parse_iso8601,
-)
-
-
-class ComCarCoffIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?comediansincarsgettingcoffee\.com/(?P<id>[a-z0-9\-]*)'
- _TESTS = [{
- 'url': 'http://comediansincarsgettingcoffee.com/miranda-sings-happy-thanksgiving-miranda/',
- 'info_dict': {
- 'id': '2494164',
- 'ext': 'mp4',
- 'upload_date': '20141127',
- 'timestamp': 1417107600,
- 'duration': 1232,
- 'title': 'Happy Thanksgiving Miranda',
- 'description': 'Jerry Seinfeld and his special guest Miranda Sings cruise around town in search of coffee, complaining and apologizing along the way.',
- },
- 'params': {
- 'skip_download': 'requires ffmpeg',
- }
- }]
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- if not display_id:
- display_id = 'comediansincarsgettingcoffee.com'
- webpage = self._download_webpage(url, display_id)
-
- full_data = self._parse_json(
- self._search_regex(
- r'window\.app\s*=\s*({.+?});\n', webpage, 'full data json'),
- display_id)['videoData']
-
- display_id = full_data['activeVideo']['video']
- video_data = full_data.get('videos', {}).get(display_id) or full_data['singleshots'][display_id]
-
- video_id = compat_str(video_data['mediaId'])
- title = video_data['title']
- formats = self._extract_m3u8_formats(
- video_data['mediaUrl'], video_id, 'mp4')
- self._sort_formats(formats)
-
- thumbnails = [{
- 'url': video_data['images']['thumb'],
- }, {
- 'url': video_data['images']['poster'],
- }]
-
- timestamp = int_or_none(video_data.get('pubDateTime')) or parse_iso8601(
- video_data.get('pubDate'))
- duration = int_or_none(video_data.get('durationSeconds')) or parse_duration(
- video_data.get('duration'))
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'description': video_data.get('description'),
- 'timestamp': timestamp,
- 'duration': duration,
- 'thumbnails': thumbnails,
- 'formats': formats,
- 'season_number': int_or_none(video_data.get('season')),
- 'episode_number': int_or_none(video_data.get('episode')),
- 'webpage_url': 'http://comediansincarsgettingcoffee.com/%s' % (video_data.get('urlSlug', video_data.get('slug'))),
- }
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 50d48c40d..eaae5e484 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1455,14 +1455,14 @@ class InfoExtractor(object):
def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
transform_source=lambda s: fix_xml_ampersands(s).strip(),
- fatal=True, m3u8_id=None):
+ fatal=True, m3u8_id=None, data=None, headers={}, query={}):
manifest = self._download_xml(
manifest_url, video_id, 'Downloading f4m manifest',
'Unable to download f4m manifest',
# Some manifests may be malformed, e.g. prosiebensat1 generated manifests
# (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
transform_source=transform_source,
- fatal=fatal)
+ fatal=fatal, data=data, headers=headers, query=query)
if manifest is False:
return []
@@ -1586,12 +1586,13 @@ class InfoExtractor(object):
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None,
- fatal=True, live=False):
+ fatal=True, live=False, data=None, headers={},
+ query={}):
res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
errnote=errnote or 'Failed to download m3u8 information',
- fatal=fatal)
+ fatal=fatal, data=data, headers=headers, query=query)
if res is False:
return []
@@ -1765,6 +1766,19 @@ class InfoExtractor(object):
# the same GROUP-ID
f['acodec'] = 'none'
formats.append(f)
+
+ # for DailyMotion
+ progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
+ if progressive_uri:
+ http_f = f.copy()
+ del http_f['manifest_url']
+ http_f.update({
+ 'format_id': f['format_id'].replace('hls-', 'http-'),
+ 'protocol': 'http',
+ 'url': progressive_uri,
+ })
+ formats.append(http_f)
+
last_stream_inf = {}
return formats
@@ -2009,12 +2023,12 @@ class InfoExtractor(object):
})
return entries
- def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
+ def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}):
res = self._download_xml_handle(
mpd_url, video_id,
note=note or 'Downloading MPD manifest',
errnote=errnote or 'Failed to download MPD manifest',
- fatal=fatal)
+ fatal=fatal, data=data, headers=headers, query=query)
if res is False:
return []
mpd_doc, urlh = res
@@ -2317,12 +2331,12 @@ class InfoExtractor(object):
self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
return formats
- def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
+ def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
res = self._download_xml_handle(
ism_url, video_id,
note=note or 'Downloading ISM manifest',
errnote=errnote or 'Failed to download ISM manifest',
- fatal=fatal)
+ fatal=fatal, data=data, headers=headers, query=query)
if res is False:
return []
ism_doc, urlh = res
@@ -2689,7 +2703,7 @@ class InfoExtractor(object):
entry = {
'id': this_video_id,
'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
- 'description': video_data.get('description'),
+ 'description': clean_html(video_data.get('description')),
'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
'timestamp': int_or_none(video_data.get('pubdate')),
'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
diff --git a/youtube_dl/extractor/corus.py b/youtube_dl/extractor/corus.py
index a1b251804..e11aadf14 100644
--- a/youtube_dl/extractor/corus.py
+++ b/youtube_dl/extractor/corus.py
@@ -4,7 +4,12 @@ from __future__ import unicode_literals
import re
from .theplatform import ThePlatformFeedIE
-from ..utils import int_or_none
+from ..utils import (
+ dict_get,
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+)
class CorusIE(ThePlatformFeedIE):
@@ -12,24 +17,49 @@ class CorusIE(ThePlatformFeedIE):
https?://
(?:www\.)?
(?P<domain>
- (?:globaltv|etcanada)\.com|
- (?:hgtv|foodnetwork|slice|history|showcase|bigbrothercanada)\.ca
+ (?:
+ globaltv|
+ etcanada|
+ seriesplus|
+ wnetwork|
+ ytv
+ )\.com|
+ (?:
+ hgtv|
+ foodnetwork|
+ slice|
+ history|
+ showcase|
+ bigbrothercanada|
+ abcspark|
+ disney(?:channel|lachaine)
+ )\.ca
+ )
+ /(?:[^/]+/)*
+ (?:
+ video\.html\?.*?\bv=|
+ videos?/(?:[^/]+/)*(?:[a-z0-9-]+-)?
+ )
+ (?P<id>
+ [\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}|
+ (?:[A-Z]{4})?\d{12,20}
)
- /(?:video/(?:[^/]+/)?|(?:[^/]+/)+(?:videos/[a-z0-9-]+-|video\.html\?.*?\bv=))
- (?P<id>\d+)
'''
_TESTS = [{
'url': 'http://www.hgtv.ca/shows/bryan-inc/videos/movie-night-popcorn-with-bryan-870923331648/',
- 'md5': '05dcbca777bf1e58c2acbb57168ad3a6',
'info_dict': {
'id': '870923331648',
'ext': 'mp4',
'title': 'Movie Night Popcorn with Bryan',
'description': 'Bryan whips up homemade popcorn, the old fashion way for Jojo and Lincoln.',
- 'uploader': 'SHWM-NEW',
'upload_date': '20170206',
'timestamp': 1486392197,
},
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to parse JSON'],
}, {
'url': 'http://www.foodnetwork.ca/shows/chopped/video/episode/chocolate-obsession/video.html?v=872683587753',
'only_matching': True,
@@ -48,58 +78,83 @@ class CorusIE(ThePlatformFeedIE):
}, {
'url': 'https://www.bigbrothercanada.ca/video/big-brother-canada-704/1457812035894/',
'only_matching': True
+ }, {
+ 'url': 'https://www.seriesplus.com/emissions/dre-mary-mort-sur-ordonnance/videos/deux-coeurs-battant/SERP0055626330000200/',
+ 'only_matching': True
+ }, {
+ 'url': 'https://www.disneychannel.ca/shows/gabby-duran-the-unsittables/video/crybaby-duran-clip/2f557eec-0588-11ea-ae2b-e2c6776b770e/',
+ 'only_matching': True
}]
-
- _TP_FEEDS = {
- 'globaltv': {
- 'feed_id': 'ChQqrem0lNUp',
- 'account_id': 2269680845,
- },
- 'etcanada': {
- 'feed_id': 'ChQqrem0lNUp',
- 'account_id': 2269680845,
- },
- 'hgtv': {
- 'feed_id': 'L0BMHXi2no43',
- 'account_id': 2414428465,
- },
- 'foodnetwork': {
- 'feed_id': 'ukK8o58zbRmJ',
- 'account_id': 2414429569,
- },
- 'slice': {
- 'feed_id': '5tUJLgV2YNJ5',
- 'account_id': 2414427935,
- },
- 'history': {
- 'feed_id': 'tQFx_TyyEq4J',
- 'account_id': 2369613659,
- },
- 'showcase': {
- 'feed_id': '9H6qyshBZU3E',
- 'account_id': 2414426607,
- },
- 'bigbrothercanada': {
- 'feed_id': 'ChQqrem0lNUp',
- 'account_id': 2269680845,
- },
+ _GEO_BYPASS = False
+ _SITE_MAP = {
+ 'globaltv': 'series',
+ 'etcanada': 'series',
+ 'foodnetwork': 'food',
+ 'bigbrothercanada': 'series',
+ 'disneychannel': 'disneyen',
+ 'disneylachaine': 'disneyfr',
}
def _real_extract(self, url):
domain, video_id = re.match(self._VALID_URL, url).groups()
- feed_info = self._TP_FEEDS[domain.split('.')[0]]
- return self._extract_feed_info('dtjsEC', feed_info['feed_id'], 'byId=' + video_id, video_id, lambda e: {
- 'episode_number': int_or_none(e.get('pl1$episode')),
- 'season_number': int_or_none(e.get('pl1$season')),
- 'series': e.get('pl1$show'),
- }, {
- 'HLS': {
- 'manifest': 'm3u',
- },
- 'DesktopHLS Default': {
- 'manifest': 'm3u',
- },
- 'MP4 MBR': {
- 'manifest': 'm3u',
- },
- }, feed_info['account_id'])
+ site = domain.split('.')[0]
+ path = self._SITE_MAP.get(site, site)
+ if path != 'series':
+ path = 'migration/' + path
+ video = self._download_json(
+ 'https://globalcontent.corusappservices.com/templates/%s/playlist/' % path,
+ video_id, query={'byId': video_id},
+ headers={'Accept': 'application/json'})[0]
+ title = video['title']
+
+ formats = []
+ for source in video.get('sources', []):
+ smil_url = source.get('file')
+ if not smil_url:
+ continue
+ source_type = source.get('type')
+ note = 'Downloading%s smil file' % (' ' + source_type if source_type else '')
+ resp = self._download_webpage(
+ smil_url, video_id, note, fatal=False,
+ headers=self.geo_verification_headers())
+ if not resp:
+ continue
+ error = self._parse_json(resp, video_id, fatal=False)
+ if error:
+ if error.get('exception') == 'GeoLocationBlocked':
+ self.raise_geo_restricted(countries=['CA'])
+ raise ExtractorError(error['description'])
+ smil = self._parse_xml(resp, video_id, fatal=False)
+ if smil is None:
+ continue
+ namespace = self._parse_smil_namespace(smil)
+ formats.extend(self._parse_smil_formats(
+ smil, smil_url, video_id, namespace))
+ if not formats and video.get('drm'):
+ raise ExtractorError('This video is DRM protected.', expected=True)
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for track in video.get('tracks', []):
+ track_url = track.get('file')
+ if not track_url:
+ continue
+ lang = 'fr' if site in ('disneylachaine', 'seriesplus') else 'en'
+ subtitles.setdefault(lang, []).append({'url': track_url})
+
+ metadata = video.get('metadata') or {}
+ get_number = lambda x: int_or_none(video.get('pl1$' + x) or metadata.get(x + 'Number'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': dict_get(video, ('defaultThumbnailUrl', 'thumbnail', 'image')),
+ 'description': video.get('description'),
+ 'timestamp': int_or_none(video.get('availableDate'), 1000),
+ 'subtitles': subtitles,
+ 'duration': float_or_none(metadata.get('duration')),
+ 'series': dict_get(video, ('show', 'pl1$show')),
+ 'season_number': get_number('season'),
+ 'episode_number': get_number('episode'),
+ }
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 745971900..327fdb04a 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -1,50 +1,93 @@
# coding: utf-8
from __future__ import unicode_literals
-import base64
import functools
-import hashlib
-import itertools
import json
-import random
import re
-import string
from .common import InfoExtractor
-from ..compat import compat_struct_pack
+from ..compat import compat_HTTPError
from ..utils import (
- determine_ext,
- error_to_compat_str,
+ age_restricted,
+ clean_html,
ExtractorError,
int_or_none,
- mimetype2ext,
OnDemandPagedList,
- parse_iso8601,
- sanitized_Request,
- str_to_int,
try_get,
unescapeHTML,
- update_url_query,
- url_or_none,
urlencode_postdata,
)
class DailymotionBaseInfoExtractor(InfoExtractor):
- @staticmethod
- def _build_request(url):
- """Build a request with the family filter disabled"""
- request = sanitized_Request(url)
- request.add_header('Cookie', 'family_filter=off; ff=off')
- return request
+ _FAMILY_FILTER = None
+ _HEADERS = {
+ 'Content-Type': 'application/json',
+ 'Origin': 'https://www.dailymotion.com',
+ }
+ _NETRC_MACHINE = 'dailymotion'
- def _download_webpage_handle_no_ff(self, url, *args, **kwargs):
- request = self._build_request(url)
- return self._download_webpage_handle(request, *args, **kwargs)
+ def _get_dailymotion_cookies(self):
+ return self._get_cookies('https://www.dailymotion.com/')
- def _download_webpage_no_ff(self, url, *args, **kwargs):
- request = self._build_request(url)
- return self._download_webpage(request, *args, **kwargs)
+ @staticmethod
+ def _get_cookie_value(cookies, name):
+ cookie = cookies.get('name')
+ if cookie:
+ return cookie.value
+
+ def _set_dailymotion_cookie(self, name, value):
+ self._set_cookie('www.dailymotion.com', name, value)
+
+ def _real_initialize(self):
+ cookies = self._get_dailymotion_cookies()
+ ff = self._get_cookie_value(cookies, 'ff')
+ self._FAMILY_FILTER = ff == 'on' if ff else age_restricted(18, self._downloader.params.get('age_limit'))
+ self._set_dailymotion_cookie('ff', 'on' if self._FAMILY_FILTER else 'off')
+
+ def _call_api(self, object_type, xid, object_fields, note, filter_extra=None):
+ if not self._HEADERS.get('Authorization'):
+ cookies = self._get_dailymotion_cookies()
+ token = self._get_cookie_value(cookies, 'access_token') or self._get_cookie_value(cookies, 'client_token')
+ if not token:
+ data = {
+ 'client_id': 'f1a362d288c1b98099c7',
+ 'client_secret': 'eea605b96e01c796ff369935357eca920c5da4c5',
+ }
+ username, password = self._get_login_info()
+ if username:
+ data.update({
+ 'grant_type': 'password',
+ 'password': password,
+ 'username': username,
+ })
+ else:
+ data['grant_type'] = 'client_credentials'
+ try:
+ token = self._download_json(
+ 'https://graphql.api.dailymotion.com/oauth/token',
+ None, 'Downloading Access Token',
+ data=urlencode_postdata(data))['access_token']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
+ raise ExtractorError(self._parse_json(
+ e.cause.read().decode(), xid)['error_description'], expected=True)
+ raise
+ self._set_dailymotion_cookie('access_token' if username else 'client_token', token)
+ self._HEADERS['Authorization'] = 'Bearer ' + token
+
+ resp = self._download_json(
+ 'https://graphql.api.dailymotion.com/', xid, note, data=json.dumps({
+ 'query': '''{
+ %s(xid: "%s"%s) {
+ %s
+ }
+}''' % (object_type, xid, ', ' + filter_extra if filter_extra else '', object_fields),
+ }).encode(), headers=self._HEADERS)
+ obj = resp['data'][object_type]
+ if not obj:
+ raise ExtractorError(resp['errors'][0]['message'], expected=True)
+ return obj
class DailymotionIE(DailymotionBaseInfoExtractor):
@@ -54,18 +97,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
(?:(?:www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|\#)/)?video|swf)|
(?:www\.)?lequipe\.fr/video
)
- /(?P<id>[^/?_]+)
+ /(?P<id>[^/?_]+)(?:.+?\bplaylist=(?P<playlist_id>x[0-9a-z]+))?
'''
IE_NAME = 'dailymotion'
-
- _FORMATS = [
- ('stream_h264_ld_url', 'ld'),
- ('stream_h264_url', 'standard'),
- ('stream_h264_hq_url', 'hq'),
- ('stream_h264_hd_url', 'hd'),
- ('stream_h264_hd1080_url', 'hd180'),
- ]
-
_TESTS = [{
'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
'md5': '074b95bdee76b9e3654137aee9c79dfe',
@@ -74,7 +108,6 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
'ext': 'mp4',
'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller',
'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller',
- 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
'duration': 187,
'timestamp': 1493651285,
'upload_date': '20170501',
@@ -146,7 +179,16 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
}, {
'url': 'https://www.lequipe.fr/video/k7MtHciueyTcrFtFKA2',
'only_matching': True,
+ }, {
+ 'url': 'https://www.dailymotion.com/video/x3z49k?playlist=xv4bw',
+ 'only_matching': True,
}]
+ _GEO_BYPASS = False
+ _COMMON_MEDIA_FIELDS = '''description
+ geoblockedCountries {
+ allowed
+ }
+ xid'''
@staticmethod
def _extract_urls(webpage):
@@ -162,264 +204,140 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
return urls
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage_no_ff(
- 'https://www.dailymotion.com/video/%s' % video_id, video_id)
-
- age_limit = self._rta_search(webpage)
-
- description = self._og_search_description(
- webpage, default=None) or self._html_search_meta(
- 'description', webpage, 'description')
-
- view_count_str = self._search_regex(
- (r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',
- r'video_views_count[^>]+>\s+([\s\d\,.]+)'),
- webpage, 'view count', default=None)
- if view_count_str:
- view_count_str = re.sub(r'\s', '', view_count_str)
- view_count = str_to_int(view_count_str)
- comment_count = int_or_none(self._search_regex(
- r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserComments:(\d+)"',
- webpage, 'comment count', default=None))
-
- player_v5 = self._search_regex(
- [r'buildPlayer\(({.+?})\);\n', # See https://github.com/ytdl-org/youtube-dl/issues/7826
- r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
- r'buildPlayer\(({.+?})\);',
- r'var\s+config\s*=\s*({.+?});',
- # New layout regex (see https://github.com/ytdl-org/youtube-dl/issues/13580)
- r'__PLAYER_CONFIG__\s*=\s*({.+?});'],
- webpage, 'player v5', default=None)
- if player_v5:
- player = self._parse_json(player_v5, video_id, fatal=False) or {}
- metadata = try_get(player, lambda x: x['metadata'], dict)
- if not metadata:
- metadata_url = url_or_none(try_get(
- player, lambda x: x['context']['metadata_template_url1']))
- if metadata_url:
- metadata_url = metadata_url.replace(':videoId', video_id)
- else:
- metadata_url = update_url_query(
- 'https://www.dailymotion.com/player/metadata/video/%s'
- % video_id, {
- 'embedder': url,
- 'integration': 'inline',
- 'GK_PV5_NEON': '1',
- })
- metadata = self._download_json(
- metadata_url, video_id, 'Downloading metadata JSON')
-
- if try_get(metadata, lambda x: x['error']['type']) == 'password_protected':
- password = self._downloader.params.get('videopassword')
- if password:
- r = int(metadata['id'][1:], 36)
- us64e = lambda x: base64.urlsafe_b64encode(x).decode().strip('=')
- t = ''.join(random.choice(string.ascii_letters) for i in range(10))
- n = us64e(compat_struct_pack('I', r))
- i = us64e(hashlib.md5(('%s%d%s' % (password, r, t)).encode()).digest())
- metadata = self._download_json(
- 'http://www.dailymotion.com/player/metadata/video/p' + i + t + n, video_id)
-
- self._check_error(metadata)
-
- formats = []
- for quality, media_list in metadata['qualities'].items():
- for media in media_list:
- media_url = media.get('url')
- if not media_url:
- continue
- type_ = media.get('type')
- if type_ == 'application/vnd.lumberjack.manifest':
- continue
- ext = mimetype2ext(type_) or determine_ext(media_url)
- if ext == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
- media_url, video_id, 'mp4', preference=-1,
- m3u8_id='hls', fatal=False)
- for f in m3u8_formats:
- f['url'] = f['url'].split('#')[0]
- formats.append(f)
- elif ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- media_url, video_id, preference=-1, f4m_id='hds', fatal=False))
- else:
- f = {
- 'url': media_url,
- 'format_id': 'http-%s' % quality,
- 'ext': ext,
- }
- m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url)
- if m:
- f.update({
- 'width': int(m.group('width')),
- 'height': int(m.group('height')),
- })
- formats.append(f)
- self._sort_formats(formats)
-
- title = metadata['title']
- duration = int_or_none(metadata.get('duration'))
- timestamp = int_or_none(metadata.get('created_time'))
- thumbnail = metadata.get('poster_url')
- uploader = metadata.get('owner', {}).get('screenname')
- uploader_id = metadata.get('owner', {}).get('id')
-
- subtitles = {}
- subtitles_data = metadata.get('subtitles', {}).get('data', {})
- if subtitles_data and isinstance(subtitles_data, dict):
- for subtitle_lang, subtitle in subtitles_data.items():
- subtitles[subtitle_lang] = [{
- 'ext': determine_ext(subtitle_url),
- 'url': subtitle_url,
- } for subtitle_url in subtitle.get('urls', [])]
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'timestamp': timestamp,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'age_limit': age_limit,
- 'view_count': view_count,
- 'comment_count': comment_count,
- 'formats': formats,
- 'subtitles': subtitles,
- }
-
- # vevo embed
- vevo_id = self._search_regex(
- r'<link rel="video_src" href="[^"]*?vevo\.com[^"]*?video=(?P<id>[\w]*)',
- webpage, 'vevo embed', default=None)
- if vevo_id:
- return self.url_result('vevo:%s' % vevo_id, 'Vevo')
-
- # fallback old player
- embed_page = self._download_webpage_no_ff(
- 'https://www.dailymotion.com/embed/video/%s' % video_id,
- video_id, 'Downloading embed page')
-
- timestamp = parse_iso8601(self._html_search_meta(
- 'video:release_date', webpage, 'upload date'))
-
- info = self._parse_json(
- self._search_regex(
- r'var info = ({.*?}),$', embed_page,
- 'video info', flags=re.MULTILINE),
- video_id)
-
- self._check_error(info)
+ video_id, playlist_id = re.match(self._VALID_URL, url).groups()
+
+ if playlist_id:
+ if not self._downloader.params.get('noplaylist'):
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
+ return self.url_result(
+ 'http://www.dailymotion.com/playlist/' + playlist_id,
+ 'DailymotionPlaylist', playlist_id)
+ self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+
+ password = self._downloader.params.get('videopassword')
+ media = self._call_api(
+ 'media', video_id, '''... on Video {
+ %s
+ stats {
+ likes {
+ total
+ }
+ views {
+ total
+ }
+ }
+ }
+ ... on Live {
+ %s
+ audienceCount
+ isOnAir
+ }''' % (self._COMMON_MEDIA_FIELDS, self._COMMON_MEDIA_FIELDS), 'Downloading media JSON metadata',
+ 'password: "%s"' % self._downloader.params.get('videopassword') if password else None)
+ xid = media['xid']
+
+ metadata = self._download_json(
+ 'https://www.dailymotion.com/player/metadata/video/' + xid,
+ xid, 'Downloading metadata JSON',
+ query={'app': 'com.dailymotion.neon'})
+
+ error = metadata.get('error')
+ if error:
+ title = error.get('title') or error['raw_message']
+ # See https://developer.dailymotion.com/api#access-error
+ if error.get('code') == 'DM007':
+ allowed_countries = try_get(media, lambda x: x['geoblockedCountries']['allowed'], list)
+ self.raise_geo_restricted(msg=title, countries=allowed_countries)
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, title), expected=True)
+ title = metadata['title']
+ is_live = media.get('isOnAir')
formats = []
- for (key, format_id) in self._FORMATS:
- video_url = info.get(key)
- if video_url is not None:
- m_size = re.search(r'H264-(\d+)x(\d+)', video_url)
- if m_size is not None:
- width, height = map(int_or_none, (m_size.group(1), m_size.group(2)))
+ for quality, media_list in metadata['qualities'].items():
+ for m in media_list:
+ media_url = m.get('url')
+ media_type = m.get('type')
+ if not media_url or media_type == 'application/vnd.lumberjack.manifest':
+ continue
+ if media_type == 'application/x-mpegURL':
+ formats.extend(self._extract_m3u8_formats(
+ media_url, video_id, 'mp4',
+ 'm3u8' if is_live else 'm3u8_native',
+ m3u8_id='hls', fatal=False))
else:
- width, height = None, None
- formats.append({
- 'url': video_url,
- 'ext': 'mp4',
- 'format_id': format_id,
- 'width': width,
- 'height': height,
- })
+ f = {
+ 'url': media_url,
+ 'format_id': 'http-' + quality,
+ }
+ m = re.search(r'/H264-(\d+)x(\d+)(?:-(60)/)?', media_url)
+ if m:
+ width, height, fps = map(int_or_none, m.groups())
+ f.update({
+ 'fps': fps,
+ 'height': height,
+ 'width': width,
+ })
+ formats.append(f)
+ for f in formats:
+ f['url'] = f['url'].split('#')[0]
+ if not f.get('fps') and f['format_id'].endswith('@60'):
+ f['fps'] = 60
self._sort_formats(formats)
- # subtitles
- video_subtitles = self.extract_subtitles(video_id, webpage)
-
- title = self._og_search_title(webpage, default=None)
- if title is None:
- title = self._html_search_regex(
- r'(?s)<span\s+id="video_title"[^>]*>(.*?)</span>', webpage,
- 'title')
+ subtitles = {}
+ subtitles_data = try_get(metadata, lambda x: x['subtitles']['data'], dict) or {}
+ for subtitle_lang, subtitle in subtitles_data.items():
+ subtitles[subtitle_lang] = [{
+ 'url': subtitle_url,
+ } for subtitle_url in subtitle.get('urls', [])]
+
+ thumbnails = []
+ for height, poster_url in metadata.get('posters', {}).items():
+ thumbnails.append({
+ 'height': int_or_none(height),
+ 'id': height,
+ 'url': poster_url,
+ })
+
+ owner = metadata.get('owner') or {}
+ stats = media.get('stats') or {}
+ get_count = lambda x: int_or_none(try_get(stats, lambda y: y[x + 's']['total']))
return {
'id': video_id,
+ 'title': self._live_title(title) if is_live else title,
+ 'description': clean_html(media.get('description')),
+ 'thumbnails': thumbnails,
+ 'duration': int_or_none(metadata.get('duration')) or None,
+ 'timestamp': int_or_none(metadata.get('created_time')),
+ 'uploader': owner.get('screenname'),
+ 'uploader_id': owner.get('id') or metadata.get('screenname'),
+ 'age_limit': 18 if metadata.get('explicit') else 0,
+ 'tags': metadata.get('tags'),
+ 'view_count': get_count('view') or int_or_none(media.get('audienceCount')),
+ 'like_count': get_count('like'),
'formats': formats,
- 'uploader': info['owner.screenname'],
- 'timestamp': timestamp,
- 'title': title,
- 'description': description,
- 'subtitles': video_subtitles,
- 'thumbnail': info['thumbnail_url'],
- 'age_limit': age_limit,
- 'view_count': view_count,
- 'duration': info['duration']
+ 'subtitles': subtitles,
+ 'is_live': is_live,
}
- def _check_error(self, info):
- error = info.get('error')
- if error:
- title = error.get('title') or error['message']
- # See https://developer.dailymotion.com/api#access-error
- if error.get('code') == 'DM007':
- self.raise_geo_restricted(msg=title)
- raise ExtractorError(
- '%s said: %s' % (self.IE_NAME, title), expected=True)
-
- def _get_subtitles(self, video_id, webpage):
- try:
- sub_list = self._download_webpage(
- 'https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id,
- video_id, note=False)
- except ExtractorError as err:
- self._downloader.report_warning('unable to download video subtitles: %s' % error_to_compat_str(err))
- return {}
- info = json.loads(sub_list)
- if (info['total'] > 0):
- sub_lang_list = dict((l['language'], [{'url': l['url'], 'ext': 'srt'}]) for l in info['list'])
- return sub_lang_list
- self._downloader.report_warning('video doesn\'t have subtitles')
- return {}
-
-class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
- IE_NAME = 'dailymotion:playlist'
- _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)'
- _TESTS = [{
- 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
- 'info_dict': {
- 'title': 'SPORT',
- 'id': 'xv4bw',
- },
- 'playlist_mincount': 20,
- }]
+class DailymotionPlaylistBaseIE(DailymotionBaseInfoExtractor):
_PAGE_SIZE = 100
- def _fetch_page(self, playlist_id, authorizaion, page):
+ def _fetch_page(self, playlist_id, page):
page += 1
- videos = self._download_json(
- 'https://graphql.api.dailymotion.com',
- playlist_id, 'Downloading page %d' % page,
- data=json.dumps({
- 'query': '''{
- collection(xid: "%s") {
- videos(first: %d, page: %d) {
- pageInfo {
- hasNextPage
- nextPage
- }
+ videos = self._call_api(
+ self._OBJECT_TYPE, playlist_id,
+ '''videos(allowExplicit: %s, first: %d, page: %d) {
edges {
node {
xid
url
}
}
- }
- }
-}''' % (playlist_id, self._PAGE_SIZE, page)
- }).encode(), headers={
- 'Authorization': authorizaion,
- 'Origin': 'https://www.dailymotion.com',
- })['data']['collection']['videos']
+ }''' % ('false' if self._FAMILY_FILTER else 'true', self._PAGE_SIZE, page),
+ 'Downloading page %d' % page)['videos']
for edge in videos['edges']:
node = edge['node']
yield self.url_result(
@@ -427,86 +345,49 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor):
def _real_extract(self, url):
playlist_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
- api = self._parse_json(self._search_regex(
- r'__PLAYER_CONFIG__\s*=\s*({.+?});',
- webpage, 'player config'), playlist_id)['context']['api']
- auth = self._download_json(
- api.get('auth_url', 'https://graphql.api.dailymotion.com/oauth/token'),
- playlist_id, data=urlencode_postdata({
- 'client_id': api.get('client_id', 'f1a362d288c1b98099c7'),
- 'client_secret': api.get('client_secret', 'eea605b96e01c796ff369935357eca920c5da4c5'),
- 'grant_type': 'client_credentials',
- }))
- authorizaion = '%s %s' % (auth.get('token_type', 'Bearer'), auth['access_token'])
entries = OnDemandPagedList(functools.partial(
- self._fetch_page, playlist_id, authorizaion), self._PAGE_SIZE)
+ self._fetch_page, playlist_id), self._PAGE_SIZE)
return self.playlist_result(
- entries, playlist_id,
- self._og_search_title(webpage))
+ entries, playlist_id)
+
+
+class DailymotionPlaylistIE(DailymotionPlaylistBaseIE):
+ IE_NAME = 'dailymotion:playlist'
+ _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>x[0-9a-z]+)'
+ _TESTS = [{
+ 'url': 'http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q',
+ 'info_dict': {
+ 'id': 'xv4bw',
+ },
+ 'playlist_mincount': 20,
+ }]
+ _OBJECT_TYPE = 'collection'
-class DailymotionUserIE(DailymotionBaseInfoExtractor):
+class DailymotionUserIE(DailymotionPlaylistBaseIE):
IE_NAME = 'dailymotion:user'
- _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<user>[^/]+)'
- _MORE_PAGES_INDICATOR = r'(?s)<div class="pages[^"]*">.*?<a\s+class="[^"]*?icon-arrow_right[^"]*?"'
- _PAGE_TEMPLATE = 'http://www.dailymotion.com/user/%s/%s'
+ _VALID_URL = r'https?://(?:www\.)?dailymotion\.[a-z]{2,3}/(?!(?:embed|swf|#|video|playlist)/)(?:(?:old/)?user/)?(?P<id>[^/]+)'
_TESTS = [{
'url': 'https://www.dailymotion.com/user/nqtv',
'info_dict': {
'id': 'nqtv',
- 'title': 'Rémi Gaillard',
},
- 'playlist_mincount': 100,
+ 'playlist_mincount': 152,
}, {
'url': 'http://www.dailymotion.com/user/UnderProject',
'info_dict': {
'id': 'UnderProject',
- 'title': 'UnderProject',
},
- 'playlist_mincount': 1800,
- 'expected_warnings': [
- 'Stopped at duplicated page',
- ],
+ 'playlist_mincount': 1000,
'skip': 'Takes too long time',
+ }, {
+ 'url': 'https://www.dailymotion.com/user/nqtv',
+ 'info_dict': {
+ 'id': 'nqtv',
+ },
+ 'playlist_mincount': 148,
+ 'params': {
+ 'age_limit': 0,
+ },
}]
-
- def _extract_entries(self, id):
- video_ids = set()
- processed_urls = set()
- for pagenum in itertools.count(1):
- page_url = self._PAGE_TEMPLATE % (id, pagenum)
- webpage, urlh = self._download_webpage_handle_no_ff(
- page_url, id, 'Downloading page %s' % pagenum)
- if urlh.geturl() in processed_urls:
- self.report_warning('Stopped at duplicated page %s, which is the same as %s' % (
- page_url, urlh.geturl()), id)
- break
-
- processed_urls.add(urlh.geturl())
-
- for video_id in re.findall(r'data-xid="(.+?)"', webpage):
- if video_id not in video_ids:
- yield self.url_result(
- 'http://www.dailymotion.com/video/%s' % video_id,
- DailymotionIE.ie_key(), video_id)
- video_ids.add(video_id)
-
- if re.search(self._MORE_PAGES_INDICATOR, webpage) is None:
- break
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- user = mobj.group('user')
- webpage = self._download_webpage(
- 'https://www.dailymotion.com/user/%s' % user, user)
- full_user = unescapeHTML(self._html_search_regex(
- r'<a class="nav-image" title="([^"]+)" href="/%s">' % re.escape(user),
- webpage, 'user'))
-
- return {
- '_type': 'playlist',
- 'id': user,
- 'title': full_user,
- 'entries': self._extract_entries(user),
- }
+ _OBJECT_TYPE = 'channel'
diff --git a/youtube_dl/extractor/daisuki.py b/youtube_dl/extractor/daisuki.py
deleted file mode 100644
index dbc1aa5d4..000000000
--- a/youtube_dl/extractor/daisuki.py
+++ /dev/null
@@ -1,154 +0,0 @@
-from __future__ import unicode_literals
-
-import base64
-import json
-import random
-import re
-
-from .common import InfoExtractor
-from ..aes import (
- aes_cbc_decrypt,
- aes_cbc_encrypt,
-)
-from ..compat import compat_b64decode
-from ..utils import (
- bytes_to_intlist,
- bytes_to_long,
- extract_attributes,
- ExtractorError,
- intlist_to_bytes,
- js_to_json,
- int_or_none,
- long_to_bytes,
- pkcs1pad,
-)
-
-
-class DaisukiMottoIE(InfoExtractor):
- _VALID_URL = r'https?://motto\.daisuki\.net/framewatch/embed/[^/]+/(?P<id>[0-9a-zA-Z]{3})'
-
- _TEST = {
- 'url': 'http://motto.daisuki.net/framewatch/embed/embedDRAGONBALLSUPERUniverseSurvivalsaga/V2e/760/428',
- 'info_dict': {
- 'id': 'V2e',
- 'ext': 'mp4',
- 'title': '#117 SHOWDOWN OF LOVE! ANDROIDS VS UNIVERSE 2!!',
- 'subtitles': {
- 'mul': [{
- 'ext': 'ttml',
- }],
- },
- },
- 'params': {
- 'skip_download': True, # AES-encrypted HLS stream
- },
- }
-
- # The public key in PEM format can be found in clientlibs_anime_watch.min.js
- _RSA_KEY = (0xc5524c25e8e14b366b3754940beeb6f96cb7e2feef0b932c7659a0c5c3bf173d602464c2df73d693b513ae06ff1be8f367529ab30bf969c5640522181f2a0c51ea546ae120d3d8d908595e4eff765b389cde080a1ef7f1bbfb07411cc568db73b7f521cedf270cbfbe0ddbc29b1ac9d0f2d8f4359098caffee6d07915020077d, 65537)
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- flashvars = self._parse_json(self._search_regex(
- r'(?s)var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'),
- video_id, transform_source=js_to_json)
-
- iv = [0] * 16
-
- data = {}
- for key in ('device_cd', 'mv_id', 'ss1_prm', 'ss2_prm', 'ss3_prm', 'ss_id'):
- data[key] = flashvars.get(key, '')
-
- encrypted_rtn = None
-
- # Some AES keys are rejected. Try it with different AES keys
- for idx in range(5):
- aes_key = [random.randint(0, 254) for _ in range(32)]
- padded_aeskey = intlist_to_bytes(pkcs1pad(aes_key, 128))
-
- n, e = self._RSA_KEY
- encrypted_aeskey = long_to_bytes(pow(bytes_to_long(padded_aeskey), e, n))
- init_data = self._download_json(
- 'http://motto.daisuki.net/fastAPI/bgn/init/',
- video_id, query={
- 's': flashvars.get('s', ''),
- 'c': flashvars.get('ss3_prm', ''),
- 'e': url,
- 'd': base64.b64encode(intlist_to_bytes(aes_cbc_encrypt(
- bytes_to_intlist(json.dumps(data)),
- aes_key, iv))).decode('ascii'),
- 'a': base64.b64encode(encrypted_aeskey).decode('ascii'),
- }, note='Downloading JSON metadata' + (' (try #%d)' % (idx + 1) if idx > 0 else ''))
-
- if 'rtn' in init_data:
- encrypted_rtn = init_data['rtn']
- break
-
- self._sleep(5, video_id)
-
- if encrypted_rtn is None:
- raise ExtractorError('Failed to fetch init data')
-
- rtn = self._parse_json(
- intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist(
- compat_b64decode(encrypted_rtn)),
- aes_key, iv)).decode('utf-8').rstrip('\0'),
- video_id)
-
- title = rtn['title_str']
-
- formats = self._extract_m3u8_formats(
- rtn['play_url'], video_id, ext='mp4', entry_protocol='m3u8_native')
-
- subtitles = {}
- caption_url = rtn.get('caption_url')
- if caption_url:
- # mul: multiple languages
- subtitles['mul'] = [{
- 'url': caption_url,
- 'ext': 'ttml',
- }]
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'subtitles': subtitles,
- }
-
-
-class DaisukiMottoPlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://motto\.daisuki\.net/(?P<id>information)/'
-
- _TEST = {
- 'url': 'http://motto.daisuki.net/information/',
- 'info_dict': {
- 'title': 'DRAGON BALL SUPER',
- },
- 'playlist_mincount': 117,
- }
-
- def _real_extract(self, url):
- playlist_id = self._match_id(url)
-
- webpage = self._download_webpage(url, playlist_id)
-
- entries = []
- for li in re.findall(r'(<li[^>]+?data-product_id="[a-zA-Z0-9]{3}"[^>]+>)', webpage):
- attr = extract_attributes(li)
- ad_id = attr.get('data-ad_id')
- product_id = attr.get('data-product_id')
- if ad_id and product_id:
- episode_id = attr.get('data-chapter')
- entries.append({
- '_type': 'url_transparent',
- 'url': 'http://motto.daisuki.net/framewatch/embed/%s/%s/760/428' % (ad_id, product_id),
- 'episode_id': episode_id,
- 'episode_number': int_or_none(episode_id),
- 'ie_key': 'DaisukiMotto',
- })
-
- return self.playlist_result(entries, playlist_title='DRAGON BALL SUPER')
diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py
index 76f021892..137095577 100644
--- a/youtube_dl/extractor/daum.py
+++ b/youtube_dl/extractor/daum.py
@@ -2,25 +2,21 @@
from __future__ import unicode_literals
-import re
import itertools
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urllib_parse_unquote,
- compat_urllib_parse_urlencode,
compat_urlparse,
)
-from ..utils import (
- int_or_none,
- str_to_int,
- xpath_text,
- unescapeHTML,
-)
-class DaumIE(InfoExtractor):
+class DaumBaseIE(InfoExtractor):
+ _KAKAO_EMBED_BASE = 'http://tv.kakao.com/embed/player/cliplink/'
+
+
+class DaumIE(DaumBaseIE):
_VALID_URL = r'https?://(?:(?:m\.)?tvpot\.daum\.net/v/|videofarm\.daum\.net/controller/player/VodPlayer\.swf\?vid=)(?P<id>[^?#&]+)'
IE_NAME = 'daum.net'
@@ -36,6 +32,9 @@ class DaumIE(InfoExtractor):
'duration': 2117,
'view_count': int,
'comment_count': int,
+ 'uploader_id': 186139,
+ 'uploader': '콘간지',
+ 'timestamp': 1387310323,
},
}, {
'url': 'http://m.tvpot.daum.net/v/65139429',
@@ -44,11 +43,14 @@ class DaumIE(InfoExtractor):
'ext': 'mp4',
'title': '1297회, \'아빠 아들로 태어나길 잘 했어\' 민수, 감동의 눈물[아빠 어디가] 20150118',
'description': 'md5:79794514261164ff27e36a21ad229fc5',
- 'upload_date': '20150604',
+ 'upload_date': '20150118',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'duration': 154,
'view_count': int,
'comment_count': int,
+ 'uploader': 'MBC 예능',
+ 'uploader_id': 132251,
+ 'timestamp': 1421604228,
},
}, {
'url': 'http://tvpot.daum.net/v/07dXWRka62Y%24',
@@ -59,12 +61,15 @@ class DaumIE(InfoExtractor):
'id': 'vwIpVpCQsT8$',
'ext': 'flv',
'title': '01-Korean War ( Trouble on the horizon )',
- 'description': '\nKorean War 01\nTrouble on the horizon\n전쟁의 먹구름',
+ 'description': 'Korean War 01\r\nTrouble on the horizon\r\n전쟁의 먹구름',
'upload_date': '20080223',
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'duration': 249,
'view_count': int,
'comment_count': int,
+ 'uploader': '까칠한 墮落始祖 황비홍님의',
+ 'uploader_id': 560824,
+ 'timestamp': 1203770745,
},
}, {
# Requires dte_type=WEB (#9972)
@@ -73,60 +78,24 @@ class DaumIE(InfoExtractor):
'info_dict': {
'id': 's3794Uf1NZeZ1qMpGpeqeRU',
'ext': 'mp4',
- 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611',
- 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회',
- 'upload_date': '20160611',
+ 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)',
+ 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회',
+ 'upload_date': '20170129',
+ 'uploader': '쇼! 음악중심',
+ 'uploader_id': 2653210,
+ 'timestamp': 1485684628,
},
}]
def _real_extract(self, url):
video_id = compat_urllib_parse_unquote(self._match_id(url))
- movie_data = self._download_json(
- 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json',
- video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'})
-
- # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid
- if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id):
- return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id)
-
- info = self._download_xml(
- 'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id,
- 'Downloading video info', query={'vid': video_id})
-
- formats = []
- for format_el in movie_data['output_list']['output_list']:
- profile = format_el['profile']
- format_query = compat_urllib_parse_urlencode({
- 'vid': video_id,
- 'profile': profile,
- })
- url_doc = self._download_xml(
- 'http://videofarm.daum.net/controller/api/open/v1_2/MovieLocation.apixml?' + format_query,
- video_id, note='Downloading video data for %s format' % profile)
- format_url = url_doc.find('result/url').text
- formats.append({
- 'url': format_url,
- 'format_id': profile,
- 'width': int_or_none(format_el.get('width')),
- 'height': int_or_none(format_el.get('height')),
- 'filesize': int_or_none(format_el.get('filesize')),
- })
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': info.find('TITLE').text,
- 'formats': formats,
- 'thumbnail': xpath_text(info, 'THUMB_URL'),
- 'description': xpath_text(info, 'CONTENTS'),
- 'duration': int_or_none(xpath_text(info, 'DURATION')),
- 'upload_date': info.find('REGDTTM').text[:8],
- 'view_count': str_to_int(xpath_text(info, 'PLAY_CNT')),
- 'comment_count': str_to_int(xpath_text(info, 'COMMENT_CNT')),
- }
+ if not video_id.isdigit():
+ video_id += '@my'
+ return self.url_result(
+ self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id)
-class DaumClipIE(InfoExtractor):
+class DaumClipIE(DaumBaseIE):
_VALID_URL = r'https?://(?:m\.)?tvpot\.daum\.net/(?:clip/ClipView.(?:do|tv)|mypot/View.do)\?.*?clipid=(?P<id>\d+)'
IE_NAME = 'daum.net:clip'
_URL_TEMPLATE = 'http://tvpot.daum.net/clip/ClipView.do?clipid=%s'
@@ -142,6 +111,9 @@ class DaumClipIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.(?:jpg|png)',
'duration': 3868,
'view_count': int,
+ 'uploader': 'GOMeXP',
+ 'uploader_id': 6667,
+ 'timestamp': 1377911092,
},
}, {
'url': 'http://m.tvpot.daum.net/clip/ClipView.tv?clipid=54999425',
@@ -154,22 +126,8 @@ class DaumClipIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- clip_info = self._download_json(
- 'http://tvpot.daum.net/mypot/json/GetClipInfo.do?clipid=%s' % video_id,
- video_id, 'Downloading clip info')['clip_bean']
-
- return {
- '_type': 'url_transparent',
- 'id': video_id,
- 'url': 'http://tvpot.daum.net/v/%s' % clip_info['vid'],
- 'title': unescapeHTML(clip_info['title']),
- 'thumbnail': clip_info.get('thumb_url'),
- 'description': clip_info.get('contents'),
- 'duration': int_or_none(clip_info.get('duration')),
- 'upload_date': clip_info.get('up_date')[:8],
- 'view_count': int_or_none(clip_info.get('play_count')),
- 'ie_key': 'Daum',
- }
+ return self.url_result(
+ self._KAKAO_EMBED_BASE + video_id, 'Kakao', video_id)
class DaumListIE(InfoExtractor):
diff --git a/youtube_dl/extractor/discoverynetworks.py b/youtube_dl/extractor/discoverynetworks.py
index fba1ef221..607a54948 100644
--- a/youtube_dl/extractor/discoverynetworks.py
+++ b/youtube_dl/extractor/discoverynetworks.py
@@ -3,63 +3,38 @@ from __future__ import unicode_literals
import re
-from .brightcove import BrightcoveLegacyIE
from .dplay import DPlayIE
-from ..compat import (
- compat_parse_qs,
- compat_urlparse,
-)
-from ..utils import smuggle_url
class DiscoveryNetworksDeIE(DPlayIE):
- _VALID_URL = r'''(?x)https?://(?:www\.)?(?P<site>discovery|tlc|animalplanet|dmax)\.de/
- (?:
- .*\#(?P<id>\d+)|
- (?:[^/]+/)*videos/(?P<display_id>[^/?#]+)|
- programme/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)
- )'''
+ _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:tlc|dmax)\.de|dplay\.co\.uk)/(?:programme|show)/(?P<programme>[^/]+)/video/(?P<alternate_id>[^/]+)'
_TESTS = [{
- 'url': 'http://www.tlc.de/sendungen/breaking-amish/videos/#3235167922001',
+ 'url': 'https://www.tlc.de/programme/breaking-amish/video/die-welt-da-drauen/DCB331270001100',
'info_dict': {
- 'id': '3235167922001',
+ 'id': '78867',
'ext': 'mp4',
- 'title': 'Breaking Amish: Die Welt da draußen',
- 'description': (
- 'Vier Amische und eine Mennonitin wagen in New York'
- ' den Sprung in ein komplett anderes Leben. Begleitet sie auf'
- ' ihrem spannenden Weg.'),
- 'timestamp': 1396598084,
- 'upload_date': '20140404',
- 'uploader_id': '1659832546',
+ 'title': 'Die Welt da draußen',
+ 'description': 'md5:61033c12b73286e409d99a41742ef608',
+ 'timestamp': 1554069600,
+ 'upload_date': '20190331',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
},
}, {
- 'url': 'http://www.dmax.de/programme/storage-hunters-uk/videos/storage-hunters-uk-episode-6/',
+ 'url': 'https://www.dmax.de/programme/dmax-highlights/video/tuning-star-sidney-hoffmann-exklusiv-bei-dmax/191023082312316',
'only_matching': True,
}, {
- 'url': 'http://www.discovery.de/#5332316765001',
+ 'url': 'https://www.dplay.co.uk/show/ghost-adventures/video/hotel-leger-103620/EHD_280313B',
'only_matching': True,
}]
- BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1659832546/default_default/index.html?videoId=%s'
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- alternate_id = mobj.group('alternate_id')
- if alternate_id:
- self._initialize_geo_bypass({
- 'countries': ['DE'],
- })
- return self._get_disco_api_info(
- url, '%s/%s' % (mobj.group('programme'), alternate_id),
- 'sonic-eu1-prod.disco-api.com', mobj.group('site') + 'de')
- brightcove_id = mobj.group('id')
- if not brightcove_id:
- title = mobj.group('title')
- webpage = self._download_webpage(url, title)
- brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- brightcove_id = compat_parse_qs(compat_urlparse.urlparse(
- brightcove_legacy_url).query)['@videoPlayer'][0]
- return self.url_result(smuggle_url(
- self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['DE']}),
- 'BrightcoveNew', brightcove_id)
+ domain, programme, alternate_id = re.match(self._VALID_URL, url).groups()
+ country = 'GB' if domain == 'dplay.co.uk' else 'DE'
+ realm = 'questuk' if country == 'GB' else domain.replace('.', '')
+ return self._get_disco_api_info(
+ url, '%s/%s' % (programme, alternate_id),
+ 'sonic-eu1-prod.disco-api.com', realm, country)
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
index ebf59512c..a7b9db568 100644
--- a/youtube_dl/extractor/dplay.py
+++ b/youtube_dl/extractor/dplay.py
@@ -1,74 +1,68 @@
# coding: utf-8
from __future__ import unicode_literals
-import json
import re
-import time
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_str,
- compat_urlparse,
-)
+from ..compat import compat_HTTPError
from ..utils import (
determine_ext,
ExtractorError,
float_or_none,
int_or_none,
- remove_end,
- try_get,
- unified_strdate,
unified_timestamp,
- update_url_query,
- urljoin,
- USER_AGENTS,
)
class DPlayIE(InfoExtractor):
- _VALID_URL = r'https?://(?P<domain>www\.(?P<host>dplay\.(?P<country>dk|se|no)))/(?:video(?:er|s)/)?(?P<id>[^/]+/[^/?#]+)'
+ _VALID_URL = r'''(?x)https?://
+ (?P<domain>
+ (?:www\.)?(?P<host>dplay\.(?P<country>dk|fi|jp|se|no))|
+ (?P<subdomain_country>es|it)\.dplay\.com
+ )/[^/]+/(?P<id>[^/]+/[^/?#]+)'''
_TESTS = [{
# non geo restricted, via secure api, unsigned download hls URL
- 'url': 'http://www.dplay.se/nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet/',
+ 'url': 'https://www.dplay.se/videos/nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101',
'info_dict': {
- 'id': '3172',
- 'display_id': 'nugammalt-77-handelser-som-format-sverige/season-1-svensken-lar-sig-njuta-av-livet',
+ 'id': '13628',
+ 'display_id': 'nugammalt-77-handelser-som-format-sverige/nugammalt-77-handelser-som-format-sverige-101',
'ext': 'mp4',
'title': 'Svensken lär sig njuta av livet',
'description': 'md5:d3819c9bccffd0fe458ca42451dd50d8',
- 'duration': 2650,
- 'timestamp': 1365454320,
+ 'duration': 2649.856,
+ 'timestamp': 1365453720,
'upload_date': '20130408',
- 'creator': 'Kanal 5 (Home)',
+ 'creator': 'Kanal 5',
'series': 'Nugammalt - 77 händelser som format Sverige',
'season_number': 1,
'episode_number': 1,
- 'age_limit': 0,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
},
}, {
# geo restricted, via secure api, unsigned download hls URL
- 'url': 'http://www.dplay.dk/mig-og-min-mor/season-6-episode-12/',
+ 'url': 'http://www.dplay.dk/videoer/ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster',
'info_dict': {
- 'id': '70816',
- 'display_id': 'mig-og-min-mor/season-6-episode-12',
+ 'id': '104465',
+ 'display_id': 'ted-bundy-mind-of-a-monster/ted-bundy-mind-of-a-monster',
'ext': 'mp4',
- 'title': 'Episode 12',
- 'description': 'md5:9c86e51a93f8a4401fc9641ef9894c90',
- 'duration': 2563,
- 'timestamp': 1429696800,
- 'upload_date': '20150422',
- 'creator': 'Kanal 4 (Home)',
- 'series': 'Mig og min mor',
- 'season_number': 6,
- 'episode_number': 12,
- 'age_limit': 0,
+ 'title': 'Ted Bundy: Mind Of A Monster',
+ 'description': 'md5:8b780f6f18de4dae631668b8a9637995',
+ 'duration': 5290.027,
+ 'timestamp': 1570694400,
+ 'upload_date': '20191010',
+ 'creator': 'ID - Investigation Discovery',
+ 'series': 'Ted Bundy: Mind Of A Monster',
+ 'season_number': 1,
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
},
- }, {
- # geo restricted, via direct unsigned hls URL
- 'url': 'http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/',
- 'only_matching': True,
}, {
# disco-api
'url': 'https://www.dplay.no/videoer/i-kongens-klr/sesong-1-episode-7',
@@ -89,19 +83,59 @@ class DPlayIE(InfoExtractor):
'format': 'bestvideo',
'skip_download': True,
},
+ 'skip': 'Available for Premium users',
}, {
-
- 'url': 'https://www.dplay.dk/videoer/singleliv/season-5-episode-3',
+ 'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/',
+ 'md5': '2b808ffb00fc47b884a172ca5d13053c',
+ 'info_dict': {
+ 'id': '6918',
+ 'display_id': 'biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij',
+ 'ext': 'mp4',
+ 'title': 'Luigi Di Maio: la psicosi di Stanislawskij',
+ 'description': 'md5:3c7a4303aef85868f867a26f5cc14813',
+ 'thumbnail': r're:^https?://.*\.jpe?g',
+ 'upload_date': '20160524',
+ 'timestamp': 1464076800,
+ 'series': 'Biografie imbarazzanti',
+ 'season_number': 1,
+ 'episode': 'Episode 1',
+ 'episode_number': 1,
+ },
+ }, {
+ 'url': 'https://es.dplay.com/dmax/la-fiebre-del-oro/temporada-8-episodio-1/',
+ 'info_dict': {
+ 'id': '21652',
+ 'display_id': 'la-fiebre-del-oro/temporada-8-episodio-1',
+ 'ext': 'mp4',
+ 'title': 'Episodio 1',
+ 'description': 'md5:b9dcff2071086e003737485210675f69',
+ 'thumbnail': r're:^https?://.*\.png',
+ 'upload_date': '20180709',
+ 'timestamp': 1531173540,
+ 'series': 'La fiebre del oro',
+ 'season_number': 8,
+ 'episode': 'Episode 1',
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.dplay.fi/videot/shifting-gears-with-aaron-kaufman/episode-16',
'only_matching': True,
}, {
- 'url': 'https://www.dplay.se/videos/sofias-anglar/sofias-anglar-1001',
+ 'url': 'https://www.dplay.jp/video/gold-rush/24086',
'only_matching': True,
}]
- def _get_disco_api_info(self, url, display_id, disco_host, realm):
- disco_base = 'https://' + disco_host
+ def _get_disco_api_info(self, url, display_id, disco_host, realm, country):
+ geo_countries = [country.upper()]
+ self._initialize_geo_bypass({
+ 'countries': geo_countries,
+ })
+ disco_base = 'https://%s/' % disco_host
token = self._download_json(
- '%s/token' % disco_base, display_id, 'Downloading token',
+ disco_base + 'token', display_id, 'Downloading token',
query={
'realm': realm,
})['data']['attributes']['token']
@@ -110,17 +144,35 @@ class DPlayIE(InfoExtractor):
'Authorization': 'Bearer ' + token,
}
video = self._download_json(
- '%s/content/videos/%s' % (disco_base, display_id), display_id,
+ disco_base + 'content/videos/' + display_id, display_id,
headers=headers, query={
- 'include': 'show'
+ 'fields[channel]': 'name',
+ 'fields[image]': 'height,src,width',
+ 'fields[show]': 'name',
+ 'fields[tag]': 'name',
+ 'fields[video]': 'description,episodeNumber,name,publishStart,seasonNumber,videoDuration',
+ 'include': 'images,primaryChannel,show,tags'
})
video_id = video['data']['id']
info = video['data']['attributes']
- title = info['name']
+ title = info['name'].strip()
formats = []
- for format_id, format_dict in self._download_json(
- '%s/playback/videoPlaybackInfo/%s' % (disco_base, video_id),
- display_id, headers=headers)['data']['attributes']['streaming'].items():
+ try:
+ streaming = self._download_json(
+ disco_base + 'playback/videoPlaybackInfo/' + video_id,
+ display_id, headers=headers)['data']['attributes']['streaming']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
+ error = info['errors'][0]
+ error_code = error.get('code')
+ if error_code == 'access.denied.geoblocked':
+ self.raise_geo_restricted(countries=geo_countries)
+ elif error_code == 'access.denied.missingpackage':
+ self.raise_login_required()
+ raise ExtractorError(info['errors'][0]['detail'], expected=True)
+ raise
+ for format_id, format_dict in streaming.items():
if not isinstance(format_dict, dict):
continue
format_url = format_dict.get('url')
@@ -142,235 +194,54 @@ class DPlayIE(InfoExtractor):
})
self._sort_formats(formats)
- series = None
- try:
- included = video.get('included')
- if isinstance(included, list):
- show = next(e for e in included if e.get('type') == 'show')
- series = try_get(
- show, lambda x: x['attributes']['name'], compat_str)
- except StopIteration:
- pass
+ creator = series = None
+ tags = []
+ thumbnails = []
+ included = video.get('included') or []
+ if isinstance(included, list):
+ for e in included:
+ attributes = e.get('attributes')
+ if not attributes:
+ continue
+ e_type = e.get('type')
+ if e_type == 'channel':
+ creator = attributes.get('name')
+ elif e_type == 'image':
+ src = attributes.get('src')
+ if src:
+ thumbnails.append({
+ 'url': src,
+ 'width': int_or_none(attributes.get('width')),
+ 'height': int_or_none(attributes.get('height')),
+ })
+ if e_type == 'show':
+ series = attributes.get('name')
+ elif e_type == 'tag':
+ name = attributes.get('name')
+ if name:
+ tags.append(name)
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': info.get('description'),
- 'duration': float_or_none(
- info.get('videoDuration'), scale=1000),
+ 'duration': float_or_none(info.get('videoDuration'), 1000),
'timestamp': unified_timestamp(info.get('publishStart')),
'series': series,
'season_number': int_or_none(info.get('seasonNumber')),
'episode_number': int_or_none(info.get('episodeNumber')),
- 'age_limit': int_or_none(info.get('minimum_age')),
+ 'creator': creator,
+ 'tags': tags,
+ 'thumbnails': thumbnails,
'formats': formats,
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
display_id = mobj.group('id')
- domain = mobj.group('domain')
-
- self._initialize_geo_bypass({
- 'countries': [mobj.group('country').upper()],
- })
-
- webpage = self._download_webpage(url, display_id)
-
- video_id = self._search_regex(
- r'data-video-id=["\'](\d+)', webpage, 'video id', default=None)
-
- if not video_id:
- host = mobj.group('host')
- return self._get_disco_api_info(
- url, display_id, 'disco-api.' + host, host.replace('.', ''))
-
- info = self._download_json(
- 'http://%s/api/v2/ajax/videos?video_id=%s' % (domain, video_id),
- video_id)['data'][0]
-
- title = info['title']
-
- PROTOCOLS = ('hls', 'hds')
- formats = []
-
- def extract_formats(protocol, manifest_url):
- if protocol == 'hls':
- m3u8_formats = self._extract_m3u8_formats(
- manifest_url, video_id, ext='mp4',
- entry_protocol='m3u8_native', m3u8_id=protocol, fatal=False)
- # Sometimes final URLs inside m3u8 are unsigned, let's fix this
- # ourselves. Also fragments' URLs are only served signed for
- # Safari user agent.
- query = compat_urlparse.parse_qs(compat_urlparse.urlparse(manifest_url).query)
- for m3u8_format in m3u8_formats:
- m3u8_format.update({
- 'url': update_url_query(m3u8_format['url'], query),
- 'http_headers': {
- 'User-Agent': USER_AGENTS['Safari'],
- },
- })
- formats.extend(m3u8_formats)
- elif protocol == 'hds':
- formats.extend(self._extract_f4m_formats(
- manifest_url + '&hdcore=3.8.0&plugin=flowplayer-3.8.0.0',
- video_id, f4m_id=protocol, fatal=False))
-
- domain_tld = domain.split('.')[-1]
- if domain_tld in ('se', 'dk', 'no'):
- for protocol in PROTOCOLS:
- # Providing dsc-geo allows to bypass geo restriction in some cases
- self._set_cookie(
- 'secure.dplay.%s' % domain_tld, 'dsc-geo',
- json.dumps({
- 'countryCode': domain_tld.upper(),
- 'expiry': (time.time() + 20 * 60) * 1000,
- }))
- stream = self._download_json(
- 'https://secure.dplay.%s/secure/api/v2/user/authorization/stream/%s?stream_type=%s'
- % (domain_tld, video_id, protocol), video_id,
- 'Downloading %s stream JSON' % protocol, fatal=False)
- if stream and stream.get(protocol):
- extract_formats(protocol, stream[protocol])
-
- # The last resort is to try direct unsigned hls/hds URLs from info dictionary.
- # Sometimes this does work even when secure API with dsc-geo has failed (e.g.
- # http://www.dplay.no/pga-tour/season-1-hoydepunkter-18-21-februar/).
- if not formats:
- for protocol in PROTOCOLS:
- if info.get(protocol):
- extract_formats(protocol, info[protocol])
-
- self._sort_formats(formats)
-
- subtitles = {}
- for lang in ('se', 'sv', 'da', 'nl', 'no'):
- for format_id in ('web_vtt', 'vtt', 'srt'):
- subtitle_url = info.get('subtitles_%s_%s' % (lang, format_id))
- if subtitle_url:
- subtitles.setdefault(lang, []).append({'url': subtitle_url})
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'description': info.get('video_metadata_longDescription'),
- 'duration': int_or_none(info.get('video_metadata_length'), scale=1000),
- 'timestamp': int_or_none(info.get('video_publish_date')),
- 'creator': info.get('video_metadata_homeChannel'),
- 'series': info.get('video_metadata_show'),
- 'season_number': int_or_none(info.get('season')),
- 'episode_number': int_or_none(info.get('episode')),
- 'age_limit': int_or_none(info.get('minimum_age')),
- 'formats': formats,
- 'subtitles': subtitles,
- }
-
-
-class DPlayItIE(InfoExtractor):
- _VALID_URL = r'https?://it\.dplay\.com/[^/]+/[^/]+/(?P<id>[^/?#]+)'
- _GEO_COUNTRIES = ['IT']
- _TEST = {
- 'url': 'http://it.dplay.com/nove/biografie-imbarazzanti/luigi-di-maio-la-psicosi-di-stanislawskij/',
- 'md5': '2b808ffb00fc47b884a172ca5d13053c',
- 'info_dict': {
- 'id': '6918',
- 'display_id': 'luigi-di-maio-la-psicosi-di-stanislawskij',
- 'ext': 'mp4',
- 'title': 'Biografie imbarazzanti: Luigi Di Maio: la psicosi di Stanislawskij',
- 'description': 'md5:3c7a4303aef85868f867a26f5cc14813',
- 'thumbnail': r're:^https?://.*\.jpe?g',
- 'upload_date': '20160524',
- 'series': 'Biografie imbarazzanti',
- 'season_number': 1,
- 'episode': 'Luigi Di Maio: la psicosi di Stanislawskij',
- 'episode_number': 1,
- },
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
-
- webpage = self._download_webpage(url, display_id)
-
- title = remove_end(self._og_search_title(webpage), ' | Dplay')
-
- video_id = None
-
- info = self._search_regex(
- r'playback_json\s*:\s*JSON\.parse\s*\(\s*("(?:\\.|[^"\\])+?")',
- webpage, 'playback JSON', default=None)
- if info:
- for _ in range(2):
- info = self._parse_json(info, display_id, fatal=False)
- if not info:
- break
- else:
- video_id = try_get(info, lambda x: x['data']['id'])
-
- if not info:
- info_url = self._search_regex(
- (r'playback_json_url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
- r'url\s*[:=]\s*["\'](?P<url>(?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)'),
- webpage, 'info url', group='url')
-
- info_url = urljoin(url, info_url)
- video_id = info_url.rpartition('/')[-1]
-
- try:
- info = self._download_json(
- info_url, display_id, headers={
- 'Authorization': 'Bearer %s' % self._get_cookies(url).get(
- 'dplayit_token').value,
- 'Referer': url,
- })
- if isinstance(info, compat_str):
- info = self._parse_json(info, display_id)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
- info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
- error = info['errors'][0]
- if error.get('code') == 'access.denied.geoblocked':
- self.raise_geo_restricted(
- msg=error.get('detail'), countries=self._GEO_COUNTRIES)
- raise ExtractorError(info['errors'][0]['detail'], expected=True)
- raise
-
- hls_url = info['data']['attributes']['streaming']['hls']['url']
-
- formats = self._extract_m3u8_formats(
- hls_url, display_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id='hls')
- self._sort_formats(formats)
-
- series = self._html_search_regex(
- r'(?s)<h1[^>]+class=["\'].*?\bshow_title\b.*?["\'][^>]*>(.+?)</h1>',
- webpage, 'series', fatal=False)
- episode = self._search_regex(
- r'<p[^>]+class=["\'].*?\bdesc_ep\b.*?["\'][^>]*>\s*<br/>\s*<b>([^<]+)',
- webpage, 'episode', fatal=False)
-
- mobj = re.search(
- r'(?s)<span[^>]+class=["\']dates["\'][^>]*>.+?\bS\.(?P<season_number>\d+)\s+E\.(?P<episode_number>\d+)\s*-\s*(?P<upload_date>\d{2}/\d{2}/\d{4})',
- webpage)
- if mobj:
- season_number = int(mobj.group('season_number'))
- episode_number = int(mobj.group('episode_number'))
- upload_date = unified_strdate(mobj.group('upload_date'))
- else:
- season_number = episode_number = upload_date = None
-
- return {
- 'id': compat_str(video_id or display_id),
- 'display_id': display_id,
- 'title': title,
- 'description': self._og_search_description(webpage),
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'series': series,
- 'season_number': season_number,
- 'episode': episode,
- 'episode_number': episode_number,
- 'upload_date': upload_date,
- 'formats': formats,
- }
+ domain = mobj.group('domain').lstrip('www.')
+ country = mobj.group('country') or mobj.group('subdomain_country')
+ host = 'disco-api.' + domain if domain.startswith('dplay.') else 'eu2-prod.disco-api.com'
+ return self._get_disco_api_info(
+ url, display_id, host, 'dplay' + country, country)
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index 218f10209..390e79f8c 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -17,6 +17,7 @@ from ..utils import (
float_or_none,
mimetype2ext,
str_or_none,
+ try_get,
unified_timestamp,
update_url_query,
url_or_none,
@@ -24,7 +25,14 @@ from ..utils import (
class DRTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*(?P<id>[\da-z-]+)(?:[/#?]|$)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:www\.)?dr\.dk/(?:tv/se|nyheder|radio(?:/ondemand)?)/(?:[^/]+/)*|
+ (?:www\.)?(?:dr\.dk|dr-massive\.com)/drtv/(?:se|episode)/
+ )
+ (?P<id>[\da-z_-]+)
+ '''
_GEO_BYPASS = False
_GEO_COUNTRIES = ['DK']
IE_NAME = 'drtv'
@@ -83,6 +91,26 @@ class DRTVIE(InfoExtractor):
}, {
'url': 'https://www.dr.dk/radio/p4kbh/regionale-nyheder-kh4/p4-nyheder-2019-06-26-17-30-9',
'only_matching': True,
+ }, {
+ 'url': 'https://www.dr.dk/drtv/se/bonderoeven_71769',
+ 'info_dict': {
+ 'id': '00951930010',
+ 'ext': 'mp4',
+ 'title': 'Bonderøven (1:8)',
+ 'description': 'md5:3cf18fc0d3b205745d4505f896af8121',
+ 'timestamp': 1546542000,
+ 'upload_date': '20190103',
+ 'duration': 2576.6,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.dr.dk/drtv/episode/bonderoeven_71769',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://dr-massive.com/drtv/se/bonderoeven_71769',
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -100,13 +128,32 @@ class DRTVIE(InfoExtractor):
webpage, 'video id', default=None)
if not video_id:
- video_id = compat_urllib_parse_unquote(self._search_regex(
+ video_id = self._search_regex(
r'(urn(?:%3A|:)dr(?:%3A|:)mu(?:%3A|:)programcard(?:%3A|:)[\da-f]+)',
- webpage, 'urn'))
+ webpage, 'urn', default=None)
+ if video_id:
+ video_id = compat_urllib_parse_unquote(video_id)
+
+ _PROGRAMCARD_BASE = 'https://www.dr.dk/mu-online/api/1.4/programcard'
+ query = {'expanded': 'true'}
+
+ if video_id:
+ programcard_url = '%s/%s' % (_PROGRAMCARD_BASE, video_id)
+ else:
+ programcard_url = _PROGRAMCARD_BASE
+ page = self._parse_json(
+ self._search_regex(
+ r'data\s*=\s*({.+?})\s*(?:;|</script)', webpage,
+ 'data'), '1')['cache']['page']
+ page = page[list(page.keys())[0]]
+ item = try_get(
+ page, (lambda x: x['item'], lambda x: x['entries'][0]['item']),
+ dict)
+ video_id = item['customId'].split(':')[-1]
+ query['productionnumber'] = video_id
data = self._download_json(
- 'https://www.dr.dk/mu-online/api/1.4/programcard/%s' % video_id,
- video_id, 'Downloading video JSON', query={'expanded': 'true'})
+ programcard_url, video_id, 'Downloading video JSON', query=query)
title = str_or_none(data.get('Title')) or re.sub(
r'\s*\|\s*(?:TV\s*\|\s*DR|DRTV)$', '',
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index abf5bb48d..50f69f0b6 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -18,10 +18,10 @@ from .acast import (
ACastIE,
ACastChannelIE,
)
-from .addanime import AddAnimeIE
from .adn import ADNIE
from .adobeconnect import AdobeConnectIE
from .adobetv import (
+ AdobeTVEmbedIE,
AdobeTVIE,
AdobeTVShowIE,
AdobeTVChannelIE,
@@ -80,7 +80,6 @@ from .awaan import (
)
from .azmedien import AZMedienIE
from .baidu import BaiduVideoIE
-from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE, BandcampWeeklyIE
from .bbc import (
BBCCoUkIE,
@@ -224,7 +223,6 @@ from .comedycentral import (
ComedyCentralTVIE,
ToshIE,
)
-from .comcarcoff import ComCarCoffIE
from .commonmistakes import CommonMistakesIE, UnicodeBOMIE
from .commonprotocols import (
MmsIE,
@@ -255,10 +253,6 @@ from .dailymotion import (
DailymotionPlaylistIE,
DailymotionUserIE,
)
-from .daisuki import (
- DaisukiMottoIE,
- DaisukiMottoPlaylistIE,
-)
from .daum import (
DaumIE,
DaumClipIE,
@@ -277,10 +271,7 @@ from .douyutv import (
DouyuShowIE,
DouyuTVIE,
)
-from .dplay import (
- DPlayIE,
- DPlayItIE,
-)
+from .dplay import DPlayIE
from .dreisat import DreiSatIE
from .drbonanza import DRBonanzaIE
from .drtuber import DrTuberIE
@@ -359,7 +350,6 @@ from .firsttv import FirstTVIE
from .fivemin import FiveMinIE
from .fivetv import FiveTVIE
from .flickr import FlickrIE
-from .flipagram import FlipagramIE
from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE
from .formula1 import Formula1IE
@@ -370,7 +360,10 @@ from .fourtube import (
FuxIE,
)
from .fox import FOXIE
-from .fox9 import FOX9IE
+from .fox9 import (
+ FOX9IE,
+ FOX9NewsIE,
+)
from .foxgay import FoxgayIE
from .foxnews import (
FoxNewsIE,
@@ -403,10 +396,6 @@ from .fusion import FusionIE
from .fxnetworks import FXNetworksIE
from .gaia import GaiaIE
from .gameinformer import GameInformerIE
-from .gameone import (
- GameOneIE,
- GameOnePlaylistIE,
-)
from .gamespot import GameSpotIE
from .gamestar import GameStarIE
from .gaskrank import GaskrankIE
@@ -422,7 +411,6 @@ from .globo import (
GloboArticleIE,
)
from .go import GoIE
-from .go90 import Go90IE
from .godtube import GodTubeIE
from .golem import GolemIE
from .googledrive import GoogleDriveIE
@@ -431,7 +419,6 @@ from .googlesearch import GoogleSearchIE
from .goshgay import GoshgayIE
from .gputechconf import GPUTechConfIE
from .groupon import GrouponIE
-from .hark import HarkIE
from .hbo import HBOIE
from .hearthisat import HearThisAtIE
from .heise import HeiseIE
@@ -463,7 +450,6 @@ from .hungama import (
HungamaSongIE,
)
from .hypem import HypemIE
-from .iconosquare import IconosquareIE
from .ign import (
IGNIE,
OneUPIE,
@@ -522,10 +508,9 @@ from .keezmovies import KeezMoviesIE
from .ketnet import KetnetIE
from .khanacademy import KhanAcademyIE
from .kickstarter import KickStarterIE
+from .kinja import KinjaEmbedIE
from .kinopoisk import KinoPoiskIE
-from .keek import KeekIE
from .konserthusetplay import KonserthusetPlayIE
-from .kontrtube import KontrTubeIE
from .krasview import KrasViewIE
from .ku6 import Ku6IE
from .kusi import KUSIIE
@@ -549,7 +534,6 @@ from .lcp import (
LcpPlayIE,
LcpIE,
)
-from .learnr import LearnrIE
from .lecture2go import Lecture2GoIE
from .lecturio import (
LecturioIE,
@@ -601,13 +585,11 @@ from .lynda import (
LyndaCourseIE
)
from .m6 import M6IE
-from .macgamestore import MacGameStoreIE
from .mailru import (
MailRuIE,
MailRuMusicIE,
MailRuMusicSearchIE,
)
-from .makertv import MakerTVIE
from .malltv import MallTVIE
from .mangomolo import (
MangomoloVideoIE,
@@ -641,7 +623,6 @@ from .microsoftvirtualacademy import (
MicrosoftVirtualAcademyIE,
MicrosoftVirtualAcademyCourseIE,
)
-from .minhateca import MinhatecaIE
from .ministrygrid import MinistryGridIE
from .minoto import MinotoIE
from .miomio import MioMioIE
@@ -651,7 +632,6 @@ from .mixcloud import (
MixcloudIE,
MixcloudUserIE,
MixcloudPlaylistIE,
- MixcloudStreamIE,
)
from .mlb import MLBIE
from .mnet import MnetIE
@@ -673,10 +653,9 @@ from .mtv import (
MTVVideoIE,
MTVServicesEmbeddedIE,
MTVDEIE,
- MTV81IE,
+ MTVJapanIE,
)
from .muenchentv import MuenchenTVIE
-from .musicplayon import MusicPlayOnIE
from .mwave import MwaveIE, MwaveMeetGreetIE
from .mychannels import MyChannelsIE
from .myspace import MySpaceIE, MySpaceAlbumIE
@@ -816,10 +795,6 @@ from .ooyala import (
OoyalaIE,
OoyalaExternalIE,
)
-from .openload import (
- OpenloadIE,
- VerystreamIE,
-)
from .ora import OraTVIE
from .orf import (
ORFTVthekIE,
@@ -945,10 +920,6 @@ from .rentv import (
from .restudy import RestudyIE
from .reuters import ReutersIE
from .reverbnation import ReverbNationIE
-from .revision3 import (
- Revision3EmbedIE,
- Revision3IE,
-)
from .rice import RICEIE
from .rmcdecouverte import RMCDecouverteIE
from .ro220 import Ro220IE
@@ -993,6 +964,10 @@ from .sbs import SBSIE
from .screencast import ScreencastIE
from .screencastomatic import ScreencastOMaticIE
from .scrippsnetworks import ScrippsNetworksWatchIE
+from .scte import (
+ SCTEIE,
+ SCTECourseIE,
+)
from .seeker import SeekerIE
from .senateisvp import SenateISVPIE
from .sendtonews import SendtoNewsIE
@@ -1036,6 +1011,7 @@ from .snotr import SnotrIE
from .sohu import SohuIE
from .sonyliv import SonyLIVIE
from .soundcloud import (
+ SoundcloudEmbedIE,
SoundcloudIE,
SoundcloudSetIE,
SoundcloudUserIE,
@@ -1079,7 +1055,6 @@ from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .streamable import StreamableIE
-from .streamango import StreamangoIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streetvoice import StreetVoiceIE
@@ -1135,6 +1110,7 @@ from .telequebec import (
from .teletask import TeleTaskIE
from .telewebion import TelewebionIE
from .tennistv import TennisTVIE
+from .tenplay import TenPlayIE
from .testurl import TestURLIE
from .tf1 import TF1IE
from .tfo import TFOIE
@@ -1184,10 +1160,14 @@ from .tunein import (
)
from .tunepk import TunePkIE
from .turbo import TurboIE
-from .tutv import TutvIE
from .tv2 import (
TV2IE,
TV2ArticleIE,
+ KatsomoIE,
+)
+from .tv2dk import (
+ TV2DKIE,
+ TV2DKBornholmPlayIE,
)
from .tv2hu import TV2HuIE
from .tv4 import TV4IE
@@ -1245,13 +1225,17 @@ from .twitter import (
TwitterCardIE,
TwitterIE,
TwitterAmplifyIE,
+ TwitterBroadcastIE,
)
from .udemy import (
UdemyIE,
UdemyCourseIE
)
from .udn import UDNEmbedIE
-from .ufctv import UFCTVIE
+from .ufctv import (
+ UFCTVIE,
+ UFCArabiaIE,
+)
from .uktvplay import UKTVPlayIE
from .digiteka import DigitekaIE
from .dlive import (
@@ -1305,7 +1289,6 @@ from .videomore import (
VideomoreVideoIE,
VideomoreSeasonIE,
)
-from .videopremium import VideoPremiumIE
from .videopress import VideoPressIE
from .vidio import VidioIE
from .vidlii import VidLiiIE
diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py
index c723726b7..ce64e2683 100644
--- a/youtube_dl/extractor/facebook.py
+++ b/youtube_dl/extractor/facebook.py
@@ -334,7 +334,7 @@ class FacebookIE(InfoExtractor):
if not video_data:
server_js_data = self._parse_json(
self._search_regex(
- r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)',
+ r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:pagelet_group_mall|permalink_video_pagelet|hyperfeed_story_id_\d+)',
webpage, 'js data', default='{}'),
video_id, transform_source=js_to_json, fatal=False)
video_data = extract_from_jsmods_instances(server_js_data)
diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py
deleted file mode 100644
index b7be40f1b..000000000
--- a/youtube_dl/extractor/flipagram.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
- int_or_none,
- float_or_none,
- try_get,
- unified_timestamp,
-)
-
-
-class FlipagramIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?flipagram\.com/f/(?P<id>[^/?#&]+)'
- _TEST = {
- 'url': 'https://flipagram.com/f/nyvTSJMKId',
- 'md5': '888dcf08b7ea671381f00fab74692755',
- 'info_dict': {
- 'id': 'nyvTSJMKId',
- 'ext': 'mp4',
- 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction',
- 'description': 'md5:d55e32edc55261cae96a41fa85ff630e',
- 'duration': 35.571,
- 'timestamp': 1461244995,
- 'upload_date': '20160421',
- 'uploader': 'kitty juria',
- 'uploader_id': 'sjuria101',
- 'creator': 'kitty juria',
- 'view_count': int,
- 'like_count': int,
- 'repost_count': int,
- 'comment_count': int,
- 'comments': list,
- 'formats': 'mincount:2',
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- video_data = self._parse_json(
- self._search_regex(
- r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'),
- video_id)
-
- flipagram = video_data['flipagram']
- video = flipagram['video']
-
- json_ld = self._search_json_ld(webpage, video_id, default={})
- title = json_ld.get('title') or flipagram['captionText']
- description = json_ld.get('description') or flipagram.get('captionText')
-
- formats = [{
- 'url': video['url'],
- 'width': int_or_none(video.get('width')),
- 'height': int_or_none(video.get('height')),
- 'filesize': int_or_none(video_data.get('size')),
- }]
-
- preview_url = try_get(
- flipagram, lambda x: x['music']['track']['previewUrl'], compat_str)
- if preview_url:
- formats.append({
- 'url': preview_url,
- 'ext': 'm4a',
- 'vcodec': 'none',
- })
-
- self._sort_formats(formats)
-
- counts = flipagram.get('counts', {})
- user = flipagram.get('user', {})
- video_data = flipagram.get('video', {})
-
- thumbnails = [{
- 'url': self._proto_relative_url(cover['url']),
- 'width': int_or_none(cover.get('width')),
- 'height': int_or_none(cover.get('height')),
- 'filesize': int_or_none(cover.get('size')),
- } for cover in flipagram.get('covers', []) if cover.get('url')]
-
- # Note that this only retrieves comments that are initially loaded.
- # For videos with large amounts of comments, most won't be retrieved.
- comments = []
- for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []):
- text = comment.get('comment')
- if not text or not isinstance(text, list):
- continue
- comments.append({
- 'author': comment.get('user', {}).get('name'),
- 'author_id': comment.get('user', {}).get('username'),
- 'id': comment.get('id'),
- 'text': text[0],
- 'timestamp': unified_timestamp(comment.get('created')),
- })
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'duration': float_or_none(flipagram.get('duration'), 1000),
- 'thumbnails': thumbnails,
- 'timestamp': unified_timestamp(flipagram.get('iso8601Created')),
- 'uploader': user.get('name'),
- 'uploader_id': user.get('username'),
- 'creator': user.get('name'),
- 'view_count': int_or_none(counts.get('plays')),
- 'like_count': int_or_none(counts.get('likes')),
- 'repost_count': int_or_none(counts.get('reflips')),
- 'comment_count': int_or_none(counts.get('comments')),
- 'comments': comments,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/fox9.py b/youtube_dl/extractor/fox9.py
index 17dfffa7b..91f8f7b8a 100644
--- a/youtube_dl/extractor/fox9.py
+++ b/youtube_dl/extractor/fox9.py
@@ -1,13 +1,23 @@
# coding: utf-8
from __future__ import unicode_literals
-from .anvato import AnvatoIE
+from .common import InfoExtractor
-class FOX9IE(AnvatoIE):
- _VALID_URL = r'https?://(?:www\.)?fox9\.com/(?:[^/]+/)+(?P<id>\d+)-story'
- _TESTS = [{
- 'url': 'http://www.fox9.com/news/215123287-story',
+class FOX9IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?fox9\.com/video/(?P<id>\d+)'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ return self.url_result(
+ 'anvato:anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b:' + video_id,
+ 'Anvato', video_id)
+
+
+class FOX9NewsIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?fox9\.com/news/(?P<id>[^/?&#]+)'
+ _TEST = {
+ 'url': 'https://www.fox9.com/news/black-bear-in-tree-draws-crowd-in-downtown-duluth-minnesota',
'md5': 'd6e1b2572c3bab8a849c9103615dd243',
'info_dict': {
'id': '314473',
@@ -21,22 +31,11 @@ class FOX9IE(AnvatoIE):
'categories': ['News', 'Sports'],
'tags': ['news', 'video'],
},
- }, {
- 'url': 'http://www.fox9.com/news/investigators/214070684-story',
- 'only_matching': True,
- }]
+ }
def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- video_id = self._parse_json(
- self._search_regex(
- r"this\.videosJson\s*=\s*'(\[.+?\])';",
- webpage, 'anvato playlist'),
- video_id)[0]['video']
-
- return self._get_anvato_videos(
- 'anvato_epfox_app_web_prod_b3373168e12f423f41504f207000188daf88251b',
- video_id)
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ anvato_id = self._search_regex(
+ r'anvatoId\s*:\s*[\'"](\d+)', webpage, 'anvato id')
+ return self.url_result('https://www.fox9.com/video/' + anvato_id, 'FOX9')
diff --git a/youtube_dl/extractor/gameone.py b/youtube_dl/extractor/gameone.py
deleted file mode 100644
index a07d69841..000000000
--- a/youtube_dl/extractor/gameone.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- xpath_with_ns,
- parse_iso8601,
- float_or_none,
- int_or_none,
-)
-
-NAMESPACE_MAP = {
- 'media': 'http://search.yahoo.com/mrss/',
-}
-
-# URL prefix to download the mp4 files directly instead of streaming via rtmp
-# Credits go to XBox-Maniac
-# http://board.jdownloader.org/showpost.php?p=185835&postcount=31
-RAW_MP4_URL = 'http://cdn.riptide-mtvn.com/'
-
-
-class GameOneIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?gameone\.de/tv/(?P<id>\d+)'
- _TESTS = [
- {
- 'url': 'http://www.gameone.de/tv/288',
- 'md5': '136656b7fb4c9cb4a8e2d500651c499b',
- 'info_dict': {
- 'id': '288',
- 'ext': 'mp4',
- 'title': 'Game One - Folge 288',
- 'duration': 1238,
- 'thumbnail': 'http://s3.gameone.de/gameone/assets/video_metas/teaser_images/000/643/636/big/640x360.jpg',
- 'description': 'FIFA-Pressepokal 2014, Star Citizen, Kingdom Come: Deliverance, Project Cars, Schöner Trants Nerdquiz Folge 2 Runde 1',
- 'age_limit': 16,
- 'upload_date': '20140513',
- 'timestamp': 1399980122,
- }
- },
- {
- 'url': 'http://gameone.de/tv/220',
- 'md5': '5227ca74c4ae6b5f74c0510a7c48839e',
- 'info_dict': {
- 'id': '220',
- 'ext': 'mp4',
- 'upload_date': '20120918',
- 'description': 'Jet Set Radio HD, Tekken Tag Tournament 2, Source Filmmaker',
- 'timestamp': 1347971451,
- 'title': 'Game One - Folge 220',
- 'duration': 896.62,
- 'age_limit': 16,
- }
- }
-
- ]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
- og_video = self._og_search_video_url(webpage, secure=False)
- description = self._html_search_meta('description', webpage)
- age_limit = int(
- self._search_regex(
- r'age=(\d+)',
- self._html_search_meta(
- 'age-de-meta-label',
- webpage),
- 'age_limit',
- '0'))
- mrss_url = self._search_regex(r'mrss=([^&]+)', og_video, 'mrss')
-
- mrss = self._download_xml(mrss_url, video_id, 'Downloading mrss')
- title = mrss.find('.//item/title').text
- thumbnail = mrss.find('.//item/image').get('url')
- timestamp = parse_iso8601(mrss.find('.//pubDate').text, delimiter=' ')
- content = mrss.find(xpath_with_ns('.//media:content', NAMESPACE_MAP))
- content_url = content.get('url')
-
- content = self._download_xml(
- content_url,
- video_id,
- 'Downloading media:content')
- rendition_items = content.findall('.//rendition')
- duration = float_or_none(rendition_items[0].get('duration'))
- formats = [
- {
- 'url': re.sub(r'.*/(r2)', RAW_MP4_URL + r'\1', r.find('./src').text),
- 'width': int_or_none(r.get('width')),
- 'height': int_or_none(r.get('height')),
- 'tbr': int_or_none(r.get('bitrate')),
- }
- for r in rendition_items
- ]
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- 'description': description,
- 'age_limit': age_limit,
- 'timestamp': timestamp,
- }
-
-
-class GameOnePlaylistIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?gameone\.de(?:/tv)?/?$'
- IE_NAME = 'gameone:playlist'
- _TEST = {
- 'url': 'http://www.gameone.de/tv',
- 'info_dict': {
- 'title': 'GameOne',
- },
- 'playlist_mincount': 294,
- }
-
- def _real_extract(self, url):
- webpage = self._download_webpage('http://www.gameone.de/tv', 'TV')
- max_id = max(map(int, re.findall(r'<a href="/tv/(\d+)"', webpage)))
- entries = [
- self.url_result('http://www.gameone.de/tv/%d' %
- video_id, 'GameOne')
- for video_id in range(max_id, 0, -1)]
-
- return {
- '_type': 'playlist',
- 'title': 'GameOne',
- 'entries': entries,
- }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 5ed952b29..743ef47db 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -80,7 +80,7 @@ from .theplatform import ThePlatformIE
from .kaltura import KalturaIE
from .eagleplatform import EaglePlatformIE
from .facebook import FacebookIE
-from .soundcloud import SoundcloudIE
+from .soundcloud import SoundcloudEmbedIE
from .tunein import TuneInBaseIE
from .vbox7 import Vbox7IE
from .dbtv import DBTVIE
@@ -88,10 +88,6 @@ from .piksel import PikselIE
from .videa import VideaIE
from .twentymin import TwentyMinutenIE
from .ustream import UstreamIE
-from .openload import (
- OpenloadIE,
- VerystreamIE,
-)
from .videopress import VideoPressIE
from .rutube import RutubeIE
from .limelight import LimelightBaseIE
@@ -118,6 +114,8 @@ from .foxnews import FoxNewsIE
from .viqeo import ViqeoIE
from .expressen import ExpressenIE
from .zype import ZypeIE
+from .odnoklassniki import OdnoklassnikiIE
+from .kinja import KinjaEmbedIE
class GenericIE(InfoExtractor):
@@ -1486,16 +1484,18 @@ class GenericIE(InfoExtractor):
'timestamp': 1432570283,
},
},
- # OnionStudios embed
+ # Kinja embed
{
'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
'info_dict': {
- 'id': '2855',
+ 'id': '106351',
'ext': 'mp4',
'title': 'Don’t Understand Bitcoin? This Man Will Mumble An Explanation At You',
+ 'description': 'Migrated from OnionStudios',
'thumbnail': r're:^https?://.*\.jpe?g$',
- 'uploader': 'ClickHole',
- 'uploader_id': 'clickhole',
+ 'uploader': 'clickhole',
+ 'upload_date': '20150527',
+ 'timestamp': 1432744860,
}
},
# SnagFilms embed
@@ -2627,9 +2627,9 @@ class GenericIE(InfoExtractor):
return self.url_result(mobj.group('url'), 'VK')
# Look for embedded Odnoklassniki player
- mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1', webpage)
- if mobj is not None:
- return self.url_result(mobj.group('url'), 'Odnoklassniki')
+ odnoklassniki_url = OdnoklassnikiIE._extract_url(webpage)
+ if odnoklassniki_url:
+ return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
# Look for embedded ivi player
mobj = re.search(r'<embed[^>]+?src=(["\'])(?P<url>https?://(?:www\.)?ivi\.ru/video/player.+?)\1', webpage)
@@ -2748,9 +2748,9 @@ class GenericIE(InfoExtractor):
return self.url_result(myvi_url)
# Look for embedded soundcloud player
- soundcloud_urls = SoundcloudIE._extract_urls(webpage)
+ soundcloud_urls = SoundcloudEmbedIE._extract_urls(webpage)
if soundcloud_urls:
- return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML, ie=SoundcloudIE.ie_key())
+ return self.playlist_from_matches(soundcloud_urls, video_id, video_title, getter=unescapeHTML)
# Look for tunein player
tunein_urls = TuneInBaseIE._extract_urls(webpage)
@@ -2893,6 +2893,12 @@ class GenericIE(InfoExtractor):
if senate_isvp_url:
return self.url_result(senate_isvp_url, 'SenateISVP')
+ # Look for Kinja embeds
+ kinja_embed_urls = KinjaEmbedIE._extract_urls(webpage, url)
+ if kinja_embed_urls:
+ return self.playlist_from_matches(
+ kinja_embed_urls, video_id, video_title)
+
# Look for OnionStudios embeds
onionstudios_url = OnionStudiosIE._extract_url(webpage)
if onionstudios_url:
@@ -3038,18 +3044,6 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
twentymin_urls, video_id, video_title, ie=TwentyMinutenIE.ie_key())
- # Look for Openload embeds
- openload_urls = OpenloadIE._extract_urls(webpage)
- if openload_urls:
- return self.playlist_from_matches(
- openload_urls, video_id, video_title, ie=OpenloadIE.ie_key())
-
- # Look for Verystream embeds
- verystream_urls = VerystreamIE._extract_urls(webpage)
- if verystream_urls:
- return self.playlist_from_matches(
- verystream_urls, video_id, video_title, ie=VerystreamIE.ie_key())
-
# Look for VideoPress embeds
videopress_urls = VideoPressIE._extract_urls(webpage)
if videopress_urls:
diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py
index 03e48f4ea..03cfba91f 100644
--- a/youtube_dl/extractor/go.py
+++ b/youtube_dl/extractor/go.py
@@ -40,8 +40,17 @@ class GoIE(AdobePassIE):
'resource_id': 'Disney',
}
}
- _VALID_URL = r'https?://(?:(?:(?P<sub_domain>%s)\.)?go|(?P<sub_domain_2>disneynow))\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))'\
- % '|'.join(list(_SITE_INFO.keys()) + ['disneynow'])
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:(?P<sub_domain>%s)\.)?go|
+ (?P<sub_domain_2>abc|freeform|disneynow)
+ )\.com/
+ (?:
+ (?:[^/]+/)*(?P<id>[Vv][Dd][Kk][Aa]\w+)|
+ (?:[^/]+/)*(?P<display_id>[^/?\#]+)
+ )
+ ''' % '|'.join(list(_SITE_INFO.keys()))
_TESTS = [{
'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643',
'info_dict': {
@@ -54,6 +63,7 @@ class GoIE(AdobePassIE):
# m3u8 download
'skip_download': True,
},
+ 'skip': 'This content is no longer available.',
}, {
'url': 'http://watchdisneyxd.go.com/doraemon',
'info_dict': {
@@ -62,6 +72,34 @@ class GoIE(AdobePassIE):
},
'playlist_mincount': 51,
}, {
+ 'url': 'http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood',
+ 'info_dict': {
+ 'id': 'VDKA3609139',
+ 'ext': 'mp4',
+ 'title': 'This Guilty Blood',
+ 'description': 'md5:f18e79ad1c613798d95fdabfe96cd292',
+ 'age_limit': 14,
+ },
+ 'params': {
+ 'geo_bypass_ip_block': '3.244.239.0/24',
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet',
+ 'info_dict': {
+ 'id': 'VDKA13435179',
+ 'ext': 'mp4',
+ 'title': 'The Bet',
+ 'description': 'md5:c66de8ba2e92c6c5c113c3ade84ab404',
+ 'age_limit': 14,
+ },
+ 'params': {
+ 'geo_bypass_ip_block': '3.244.239.0/24',
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
'only_matching': True,
}, {
@@ -95,10 +133,13 @@ class GoIE(AdobePassIE):
if not video_id or not site_info:
webpage = self._download_webpage(url, display_id or video_id)
video_id = self._search_regex(
- # There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
- # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
- r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id',
- default=video_id)
+ (
+ # There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
+ # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
+ r'data-video-id=["\']*(VDKA\w+)',
+ # https://abc.com/shows/the-rookie/episode-guide/season-02/03-the-bet
+ r'\b(?:video)?id["\']\s*:\s*["\'](VDKA\w+)'
+ ), webpage, 'video id', default=video_id)
if not site_info:
brand = self._search_regex(
(r'data-brand=\s*["\']\s*(\d+)',
diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py
deleted file mode 100644
index c3ea717bc..000000000
--- a/youtube_dl/extractor/go90.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_HTTPError
-from ..utils import (
- determine_ext,
- ExtractorError,
- int_or_none,
- parse_age_limit,
- parse_iso8601,
-)
-
-
-class Go90IE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?go90\.com/(?:videos|embed)/(?P<id>[0-9a-zA-Z]+)'
- _TESTS = [{
- 'url': 'https://www.go90.com/videos/84BUqjLpf9D',
- 'md5': 'efa7670dbbbf21a7b07b360652b24a32',
- 'info_dict': {
- 'id': '84BUqjLpf9D',
- 'ext': 'mp4',
- 'title': 'Daily VICE - Inside The Utah Coalition Against Pornography Convention',
- 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.',
- 'timestamp': 1491868800,
- 'upload_date': '20170411',
- 'age_limit': 14,
- }
- }, {
- 'url': 'https://www.go90.com/embed/261MflWkD3N',
- 'only_matching': True,
- }]
- _GEO_BYPASS = False
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- try:
- headers = self.geo_verification_headers()
- headers.update({
- 'Content-Type': 'application/json; charset=utf-8',
- })
- video_data = self._download_json(
- 'https://www.go90.com/api/view/items/' + video_id, video_id,
- headers=headers, data=b'{"client":"web","device_type":"pc"}')
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
- message = self._parse_json(e.cause.read().decode(), None)['error']['message']
- if 'region unavailable' in message:
- self.raise_geo_restricted(countries=['US'])
- raise ExtractorError(message, expected=True)
- raise
-
- if video_data.get('requires_drm'):
- raise ExtractorError('This video is DRM protected.', expected=True)
- main_video_asset = video_data['main_video_asset']
-
- episode_number = int_or_none(video_data.get('episode_number'))
- series = None
- season = None
- season_id = None
- season_number = None
- for metadata in video_data.get('__children', {}).get('Item', {}).values():
- if metadata.get('type') == 'show':
- series = metadata.get('title')
- elif metadata.get('type') == 'season':
- season = metadata.get('title')
- season_id = metadata.get('id')
- season_number = int_or_none(metadata.get('season_number'))
-
- title = episode = video_data.get('title') or series
- if series and series != title:
- title = '%s - %s' % (series, title)
-
- thumbnails = []
- formats = []
- subtitles = {}
- for asset in video_data.get('assets'):
- if asset.get('id') == main_video_asset:
- for source in asset.get('sources', []):
- source_location = source.get('location')
- if not source_location:
- continue
- source_type = source.get('type')
- if source_type == 'hls':
- m3u8_formats = self._extract_m3u8_formats(
- source_location, video_id, 'mp4',
- 'm3u8_native', m3u8_id='hls', fatal=False)
- for f in m3u8_formats:
- mobj = re.search(r'/hls-(\d+)-(\d+)K', f['url'])
- if mobj:
- height, tbr = mobj.groups()
- height = int_or_none(height)
- f.update({
- 'height': f.get('height') or height,
- 'width': f.get('width') or int_or_none(height / 9.0 * 16.0 if height else None),
- 'tbr': f.get('tbr') or int_or_none(tbr),
- })
- formats.extend(m3u8_formats)
- elif source_type == 'dash':
- formats.extend(self._extract_mpd_formats(
- source_location, video_id, mpd_id='dash', fatal=False))
- else:
- formats.append({
- 'format_id': source.get('name'),
- 'url': source_location,
- 'width': int_or_none(source.get('width')),
- 'height': int_or_none(source.get('height')),
- 'tbr': int_or_none(source.get('bitrate')),
- })
-
- for caption in asset.get('caption_metadata', []):
- caption_url = caption.get('source_url')
- if not caption_url:
- continue
- subtitles.setdefault(caption.get('language', 'en'), []).append({
- 'url': caption_url,
- 'ext': determine_ext(caption_url, 'vtt'),
- })
- elif asset.get('type') == 'image':
- asset_location = asset.get('location')
- if not asset_location:
- continue
- thumbnails.append({
- 'url': asset_location,
- 'width': int_or_none(asset.get('width')),
- 'height': int_or_none(asset.get('height')),
- })
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'thumbnails': thumbnails,
- 'description': video_data.get('short_description'),
- 'like_count': int_or_none(video_data.get('like_count')),
- 'timestamp': parse_iso8601(video_data.get('released_at')),
- 'series': series,
- 'episode': episode,
- 'season': season,
- 'season_id': season_id,
- 'season_number': season_number,
- 'episode_number': episode_number,
- 'subtitles': subtitles,
- 'age_limit': parse_age_limit(video_data.get('rating')),
- }
diff --git a/youtube_dl/extractor/hark.py b/youtube_dl/extractor/hark.py
deleted file mode 100644
index 342a6130e..000000000
--- a/youtube_dl/extractor/hark.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-
-
-class HarkIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?hark\.com/clips/(?P<id>.+?)-.+'
- _TEST = {
- 'url': 'http://www.hark.com/clips/mmbzyhkgny-obama-beyond-the-afghan-theater-we-only-target-al-qaeda-on-may-23-2013',
- 'md5': '6783a58491b47b92c7c1af5a77d4cbee',
- 'info_dict': {
- 'id': 'mmbzyhkgny',
- 'ext': 'mp3',
- 'title': 'Obama: \'Beyond The Afghan Theater, We Only Target Al Qaeda\' on May 23, 2013',
- 'description': 'President Barack Obama addressed the nation live on May 23, 2013 in a speech aimed at addressing counter-terrorism policies including the use of drone strikes, detainees at Guantanamo Bay prison facility, and American citizens who are terrorists.',
- 'duration': 11,
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- data = self._download_json(
- 'http://www.hark.com/clips/%s.json' % video_id, video_id)
-
- return {
- 'id': video_id,
- 'url': data['url'],
- 'title': data['name'],
- 'description': data.get('description'),
- 'thumbnail': data.get('image_original'),
- 'duration': data.get('duration'),
- }
diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py
index f9f7c5a64..f97eefa3d 100644
--- a/youtube_dl/extractor/hotstar.py
+++ b/youtube_dl/extractor/hotstar.py
@@ -118,6 +118,7 @@ class HotStarIE(HotStarBaseIE):
if video_data.get('drmProtected'):
raise ExtractorError('This video is DRM protected.', expected=True)
+ headers = {'Referer': url}
formats = []
geo_restricted = False
playback_sets = self._call_api_v2('h/v2/play', video_id)['playBackSets']
@@ -137,10 +138,11 @@ class HotStarIE(HotStarBaseIE):
if 'package:hls' in tags or ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
format_url, video_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls'))
+ entry_protocol='m3u8_native',
+ m3u8_id='hls', headers=headers))
elif 'package:dash' in tags or ext == 'mpd':
formats.extend(self._extract_mpd_formats(
- format_url, video_id, mpd_id='dash'))
+ format_url, video_id, mpd_id='dash', headers=headers))
elif ext == 'f4m':
# produce broken files
pass
@@ -158,6 +160,9 @@ class HotStarIE(HotStarBaseIE):
self.raise_geo_restricted(countries=['IN'])
self._sort_formats(formats)
+ for f in formats:
+ f.setdefault('http_headers', {}).update(headers)
+
return {
'id': video_id,
'title': title,
diff --git a/youtube_dl/extractor/iconosquare.py b/youtube_dl/extractor/iconosquare.py
deleted file mode 100644
index a39f422e9..000000000
--- a/youtube_dl/extractor/iconosquare.py
+++ /dev/null
@@ -1,85 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- get_element_by_id,
- remove_end,
-)
-
-
-class IconosquareIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
- _TEST = {
- 'url': 'http://statigr.am/p/522207370455279102_24101272',
- 'md5': '6eb93b882a3ded7c378ee1d6884b1814',
- 'info_dict': {
- 'id': '522207370455279102_24101272',
- 'ext': 'mp4',
- 'title': 'Instagram photo by @aguynamedpatrick (Patrick Janelle)',
- 'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d',
- 'timestamp': 1376471991,
- 'upload_date': '20130814',
- 'uploader': 'aguynamedpatrick',
- 'uploader_id': '24101272',
- 'comment_count': int,
- 'like_count': int,
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- media = self._parse_json(
- get_element_by_id('mediaJson', webpage),
- video_id)
-
- formats = [{
- 'url': f['url'],
- 'format_id': format_id,
- 'width': int_or_none(f.get('width')),
- 'height': int_or_none(f.get('height'))
- } for format_id, f in media['videos'].items()]
- self._sort_formats(formats)
-
- title = remove_end(self._og_search_title(webpage), ' - via Iconosquare')
-
- timestamp = int_or_none(media.get('created_time') or media.get('caption', {}).get('created_time'))
- description = media.get('caption', {}).get('text')
-
- uploader = media.get('user', {}).get('username')
- uploader_id = media.get('user', {}).get('id')
-
- comment_count = int_or_none(media.get('comments', {}).get('count'))
- like_count = int_or_none(media.get('likes', {}).get('count'))
-
- thumbnails = [{
- 'url': t['url'],
- 'id': thumbnail_id,
- 'width': int_or_none(t.get('width')),
- 'height': int_or_none(t.get('height'))
- } for thumbnail_id, t in media.get('images', {}).items()]
-
- comments = [{
- 'id': comment.get('id'),
- 'text': comment['text'],
- 'timestamp': int_or_none(comment.get('created_time')),
- 'author': comment.get('from', {}).get('full_name'),
- 'author_id': comment.get('from', {}).get('username'),
- } for comment in media.get('comments', {}).get('data', []) if 'text' in comment]
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnails': thumbnails,
- 'timestamp': timestamp,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'comment_count': comment_count,
- 'like_count': like_count,
- 'formats': formats,
- 'comments': comments,
- }
diff --git a/youtube_dl/extractor/imggaming.py b/youtube_dl/extractor/imggaming.py
new file mode 100644
index 000000000..e11f92053
--- /dev/null
+++ b/youtube_dl/extractor/imggaming.py
@@ -0,0 +1,133 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_HTTPError
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ str_or_none,
+ try_get,
+)
+
+
+class ImgGamingBaseIE(InfoExtractor):
+ _API_BASE = 'https://dce-frontoffice.imggaming.com/api/v2/'
+ _API_KEY = '857a1e5d-e35e-4fdf-805b-a87b6f8364bf'
+ _HEADERS = None
+ _MANIFEST_HEADERS = {'Accept-Encoding': 'identity'}
+ _REALM = None
+ _VALID_URL_TEMPL = r'https?://(?P<domain>%s)/(?P<type>live|playlist|video)/(?P<id>\d+)(?:\?.*?\bplaylistId=(?P<playlist_id>\d+))?'
+
+ def _real_initialize(self):
+ self._HEADERS = {
+ 'Realm': 'dce.' + self._REALM,
+ 'x-api-key': self._API_KEY,
+ }
+
+ email, password = self._get_login_info()
+ if email is None:
+ self.raise_login_required()
+
+ p_headers = self._HEADERS.copy()
+ p_headers['Content-Type'] = 'application/json'
+ self._HEADERS['Authorization'] = 'Bearer ' + self._download_json(
+ self._API_BASE + 'login',
+ None, 'Logging in', data=json.dumps({
+ 'id': email,
+ 'secret': password,
+ }).encode(), headers=p_headers)['authorisationToken']
+
+ def _call_api(self, path, media_id):
+ return self._download_json(
+ self._API_BASE + path + media_id, media_id, headers=self._HEADERS)
+
+ def _extract_dve_api_url(self, media_id, media_type):
+ stream_path = 'stream'
+ if media_type == 'video':
+ stream_path += '/vod/'
+ else:
+ stream_path += '?eventId='
+ try:
+ return self._call_api(
+ stream_path, media_id)['playerUrlCallback']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ raise ExtractorError(
+ self._parse_json(e.cause.read().decode(), media_id)['messages'][0],
+ expected=True)
+ raise
+
+ def _real_extract(self, url):
+ domain, media_type, media_id, playlist_id = re.match(self._VALID_URL, url).groups()
+
+ if playlist_id:
+ if self._downloader.params.get('noplaylist'):
+ self.to_screen('Downloading just video %s because of --no-playlist' % media_id)
+ else:
+ self.to_screen('Downloading playlist %s - add --no-playlist to just download video' % playlist_id)
+ media_type, media_id = 'playlist', playlist_id
+
+ if media_type == 'playlist':
+ playlist = self._call_api('vod/playlist/', media_id)
+ entries = []
+ for video in try_get(playlist, lambda x: x['videos']['vods']) or []:
+ video_id = str_or_none(video.get('id'))
+ if not video_id:
+ continue
+ entries.append(self.url_result(
+ 'https://%s/video/%s' % (domain, video_id),
+ self.ie_key(), video_id))
+ return self.playlist_result(
+ entries, media_id, playlist.get('title'),
+ playlist.get('description'))
+
+ dve_api_url = self._extract_dve_api_url(media_id, media_type)
+ video_data = self._download_json(dve_api_url, media_id)
+ is_live = media_type == 'live'
+ if is_live:
+ title = self._live_title(self._call_api('event/', media_id)['title'])
+ else:
+ title = video_data['name']
+
+ formats = []
+ for proto in ('hls', 'dash'):
+ media_url = video_data.get(proto + 'Url') or try_get(video_data, lambda x: x[proto]['url'])
+ if not media_url:
+ continue
+ if proto == 'hls':
+ m3u8_formats = self._extract_m3u8_formats(
+ media_url, media_id, 'mp4', 'm3u8' if is_live else 'm3u8_native',
+ m3u8_id='hls', fatal=False, headers=self._MANIFEST_HEADERS)
+ for f in m3u8_formats:
+ f.setdefault('http_headers', {}).update(self._MANIFEST_HEADERS)
+ formats.append(f)
+ else:
+ formats.extend(self._extract_mpd_formats(
+ media_url, media_id, mpd_id='dash', fatal=False,
+ headers=self._MANIFEST_HEADERS))
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for subtitle in video_data.get('subtitles', []):
+ subtitle_url = subtitle.get('url')
+ if not subtitle_url:
+ continue
+ subtitles.setdefault(subtitle.get('lang', 'en_US'), []).append({
+ 'url': subtitle_url,
+ })
+
+ return {
+ 'id': media_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnail': video_data.get('thumbnailUrl'),
+ 'description': video_data.get('description'),
+ 'duration': int_or_none(video_data.get('duration')),
+ 'tags': video_data.get('tags'),
+ 'is_live': is_live,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/internetvideoarchive.py b/youtube_dl/extractor/internetvideoarchive.py
index 76cc5ec3e..59b0a90c3 100644
--- a/youtube_dl/extractor/internetvideoarchive.py
+++ b/youtube_dl/extractor/internetvideoarchive.py
@@ -1,15 +1,13 @@
from __future__ import unicode_literals
+import json
+import re
+
from .common import InfoExtractor
from ..compat import (
compat_parse_qs,
compat_urlparse,
)
-from ..utils import (
- determine_ext,
- int_or_none,
- xpath_text,
-)
class InternetVideoArchiveIE(InfoExtractor):
@@ -20,7 +18,7 @@ class InternetVideoArchiveIE(InfoExtractor):
'info_dict': {
'id': '194487',
'ext': 'mp4',
- 'title': 'KICK-ASS 2',
+ 'title': 'Kick-Ass 2',
'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a',
},
'params': {
@@ -33,68 +31,34 @@ class InternetVideoArchiveIE(InfoExtractor):
def _build_json_url(query):
return 'http://video.internetvideoarchive.net/player/6/configuration.ashx?' + query
- @staticmethod
- def _build_xml_url(query):
- return 'http://video.internetvideoarchive.net/flash/players/flashconfiguration.aspx?' + query
-
def _real_extract(self, url):
- query = compat_urlparse.urlparse(url).query
- query_dic = compat_parse_qs(query)
- video_id = query_dic['publishedid'][0]
-
- if '/player/' in url:
- configuration = self._download_json(url, video_id)
-
- # There are multiple videos in the playlist whlie only the first one
- # matches the video played in browsers
- video_info = configuration['playlist'][0]
- title = video_info['title']
-
- formats = []
- for source in video_info['sources']:
- file_url = source['file']
- if determine_ext(file_url) == 'm3u8':
- m3u8_formats = self._extract_m3u8_formats(
- file_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
- if m3u8_formats:
- formats.extend(m3u8_formats)
- file_url = m3u8_formats[0]['url']
- formats.extend(self._extract_f4m_formats(
- file_url.replace('.m3u8', '.f4m'),
- video_id, f4m_id='hds', fatal=False))
- formats.extend(self._extract_mpd_formats(
- file_url.replace('.m3u8', '.mpd'),
- video_id, mpd_id='dash', fatal=False))
- else:
- a_format = {
- 'url': file_url,
- }
-
- if source.get('label') and source['label'][-4:] == ' kbs':
- tbr = int_or_none(source['label'][:-4])
- a_format.update({
- 'tbr': tbr,
- 'format_id': 'http-%d' % tbr,
- })
- formats.append(a_format)
-
- self._sort_formats(formats)
-
- description = video_info.get('description')
- thumbnail = video_info.get('image')
- else:
- configuration = self._download_xml(url, video_id)
- formats = [{
- 'url': xpath_text(configuration, './file', 'file URL', fatal=True),
- }]
- thumbnail = xpath_text(configuration, './image', 'thumbnail')
- title = 'InternetVideoArchive video %s' % video_id
- description = None
+ query = compat_parse_qs(compat_urlparse.urlparse(url).query)
+ video_id = query['publishedid'][0]
+ data = self._download_json(
+ 'https://video.internetvideoarchive.net/videojs7/videojs7.ivasettings.ashx',
+ video_id, data=json.dumps({
+ 'customerid': query['customerid'][0],
+ 'publishedid': video_id,
+ }).encode())
+ title = data['Title']
+ formats = self._extract_m3u8_formats(
+ data['VideoUrl'], video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False)
+ file_url = formats[0]['url']
+ if '.ism/' in file_url:
+ replace_url = lambda x: re.sub(r'\.ism/[^?]+', '.ism/' + x, file_url)
+ formats.extend(self._extract_f4m_formats(
+ replace_url('.f4m'), video_id, f4m_id='hds', fatal=False))
+ formats.extend(self._extract_mpd_formats(
+ replace_url('.mpd'), video_id, mpd_id='dash', fatal=False))
+ formats.extend(self._extract_ism_formats(
+ replace_url('Manifest'), video_id, ism_id='mss', fatal=False))
+ self._sort_formats(formats)
return {
'id': video_id,
'title': title,
'formats': formats,
- 'thumbnail': thumbnail,
- 'description': description,
+ 'thumbnail': data.get('PosterUrl'),
+ 'description': data.get('Description'),
}
diff --git a/youtube_dl/extractor/ivi.py b/youtube_dl/extractor/ivi.py
index 86c014b07..a502e8806 100644
--- a/youtube_dl/extractor/ivi.py
+++ b/youtube_dl/extractor/ivi.py
@@ -1,8 +1,9 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
import json
+import re
+import sys
from .common import InfoExtractor
from ..utils import (
@@ -18,6 +19,8 @@ class IviIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?ivi\.(?:ru|tv)/(?:watch/(?:[^/]+/)?|video/player\?.*?videoId=)(?P<id>\d+)'
_GEO_BYPASS = False
_GEO_COUNTRIES = ['RU']
+ _LIGHT_KEY = b'\xf1\x02\x32\xb7\xbc\x5c\x7a\xe8\xf7\x96\xc1\x33\x2b\x27\xa1\x8c'
+ _LIGHT_URL = 'https://api.ivi.ru/light/'
_TESTS = [
# Single movie
@@ -80,48 +83,96 @@ class IviIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- data = {
+ data = json.dumps({
'method': 'da.content.get',
'params': [
video_id, {
- 'site': 's183',
+ 'site': 's%d',
'referrer': 'http://www.ivi.ru/watch/%s' % video_id,
'contentid': video_id
}
]
- }
+ })
- video_json = self._download_json(
- 'http://api.digitalaccess.ru/api/json/', video_id,
- 'Downloading video JSON', data=json.dumps(data))
-
- if 'error' in video_json:
- error = video_json['error']
- origin = error['origin']
- if origin == 'NotAllowedForLocation':
- self.raise_geo_restricted(
- msg=error['message'], countries=self._GEO_COUNTRIES)
- elif origin == 'NoRedisValidData':
- raise ExtractorError('Video %s does not exist' % video_id, expected=True)
- raise ExtractorError(
- 'Unable to download video %s: %s' % (video_id, error['message']),
- expected=True)
+ bundled = hasattr(sys, 'frozen')
- result = video_json['result']
+ for site in (353, 183):
+ content_data = (data % site).encode()
+ if site == 353:
+ if bundled:
+ continue
+ try:
+ from Cryptodome.Cipher import Blowfish
+ from Cryptodome.Hash import CMAC
+ pycryptodomex_found = True
+ except ImportError:
+ pycryptodomex_found = False
+ continue
- quality = qualities(self._KNOWN_FORMATS)
+ timestamp = (self._download_json(
+ self._LIGHT_URL, video_id,
+ 'Downloading timestamp JSON', data=json.dumps({
+ 'method': 'da.timestamp.get',
+ 'params': []
+ }).encode(), fatal=False) or {}).get('result')
+ if not timestamp:
+ continue
- formats = [{
- 'url': x['url'],
- 'format_id': x.get('content_format'),
- 'quality': quality(x.get('content_format')),
- } for x in result['files'] if x.get('url')]
+ query = {
+ 'ts': timestamp,
+ 'sign': CMAC.new(self._LIGHT_KEY, timestamp.encode() + content_data, Blowfish).hexdigest(),
+ }
+ else:
+ query = {}
- self._sort_formats(formats)
+ video_json = self._download_json(
+ self._LIGHT_URL, video_id,
+ 'Downloading video JSON', data=content_data, query=query)
+ error = video_json.get('error')
+ if error:
+ origin = error.get('origin')
+ message = error.get('message') or error.get('user_message')
+ extractor_msg = 'Unable to download video %s'
+ if origin == 'NotAllowedForLocation':
+ self.raise_geo_restricted(message, self._GEO_COUNTRIES)
+ elif origin == 'NoRedisValidData':
+ extractor_msg = 'Video %s does not exist'
+ elif site == 353:
+ continue
+ elif bundled:
+ raise ExtractorError(
+ 'This feature does not work from bundled exe. Run youtube-dl from sources.',
+ expected=True)
+ elif not pycryptodomex_found:
+ raise ExtractorError(
+ 'pycryptodomex not found. Please install it.',
+ expected=True)
+ elif message:
+ extractor_msg += ': ' + message
+ raise ExtractorError(extractor_msg % video_id, expected=True)
+ else:
+ break
+
+ result = video_json['result']
title = result['title']
- duration = int_or_none(result.get('duration'))
+ quality = qualities(self._KNOWN_FORMATS)
+
+ formats = []
+ for f in result.get('files', []):
+ f_url = f.get('url')
+ content_format = f.get('content_format')
+ if not f_url or '-MDRM-' in content_format or '-FPS-' in content_format:
+ continue
+ formats.append({
+ 'url': f_url,
+ 'format_id': content_format,
+ 'quality': quality(content_format),
+ 'filesize': int_or_none(f.get('size_in_bytes')),
+ })
+ self._sort_formats(formats)
+
compilation = result.get('compilation')
episode = title if compilation else None
@@ -158,7 +209,7 @@ class IviIE(InfoExtractor):
'episode_number': episode_number,
'thumbnails': thumbnails,
'description': description,
- 'duration': duration,
+ 'duration': int_or_none(result.get('duration')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/jamendo.py b/youtube_dl/extractor/jamendo.py
index c21827618..490efa8fb 100644
--- a/youtube_dl/extractor/jamendo.py
+++ b/youtube_dl/extractor/jamendo.py
@@ -1,38 +1,26 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
+import hashlib
+import random
-from ..compat import compat_urlparse
+from ..compat import compat_str
from .common import InfoExtractor
-from ..utils import parse_duration
-
-
-class JamendoBaseIE(InfoExtractor):
- def _extract_meta(self, webpage, fatal=True):
- title = self._og_search_title(
- webpage, default=None) or self._search_regex(
- r'<title>([^<]+)', webpage,
- 'title', default=None)
- if title:
- title = self._search_regex(
- r'(.+?)\s*\|\s*Jamendo Music', title, 'title', default=None)
- if not title:
- title = self._html_search_meta(
- 'name', webpage, 'title', fatal=fatal)
- mobj = re.search(r'(.+) - (.+)', title or '')
- artist, second = mobj.groups() if mobj else [None] * 2
- return title, artist, second
-
-
-class JamendoIE(JamendoBaseIE):
+from ..utils import (
+ clean_html,
+ int_or_none,
+ try_get,
+)
+
+
+class JamendoIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
licensing\.jamendo\.com/[^/]+|
(?:www\.)?jamendo\.com
)
- /track/(?P<id>[0-9]+)/(?P<display_id>[^/?#&]+)
+ /track/(?P<id>[0-9]+)(?:/(?P<display_id>[^/?#&]+))?
'''
_TESTS = [{
'url': 'https://www.jamendo.com/track/196219/stories-from-emona-i',
@@ -45,7 +33,9 @@ class JamendoIE(JamendoBaseIE):
'artist': 'Maya Filipič',
'track': 'Stories from Emona I',
'duration': 210,
- 'thumbnail': r're:^https?://.*\.jpg'
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'timestamp': 1217438117,
+ 'upload_date': '20080730',
}
}, {
'url': 'https://licensing.jamendo.com/en/track/1496667/energetic-rock',
@@ -53,15 +43,20 @@ class JamendoIE(JamendoBaseIE):
}]
def _real_extract(self, url):
- mobj = self._VALID_URL_RE.match(url)
- track_id = mobj.group('id')
- display_id = mobj.group('display_id')
-
+ track_id, display_id = self._VALID_URL_RE.match(url).groups()
webpage = self._download_webpage(
- 'https://www.jamendo.com/track/%s/%s' % (track_id, display_id),
- display_id)
-
- title, artist, track = self._extract_meta(webpage)
+ 'https://www.jamendo.com/track/' + track_id, track_id)
+ models = self._parse_json(self._html_search_regex(
+ r"data-bundled-models='([^']+)",
+ webpage, 'bundled models'), track_id)
+ track = models['track']['models'][0]
+ title = track_name = track['name']
+ get_model = lambda x: try_get(models, lambda y: y[x]['models'][0], dict) or {}
+ artist = get_model('artist')
+ artist_name = artist.get('name')
+ if artist_name:
+ title = '%s - %s' % (artist_name, title)
+ album = get_model('album')
formats = [{
'url': 'https://%s.jamendo.com/?trackid=%s&format=%s&from=app-97dab294'
@@ -77,31 +72,58 @@ class JamendoIE(JamendoBaseIE):
))]
self._sort_formats(formats)
- thumbnail = self._html_search_meta(
- 'image', webpage, 'thumbnail', fatal=False)
- duration = parse_duration(self._search_regex(
- r'<span[^>]+itemprop=["\']duration["\'][^>]+content=["\'](.+?)["\']',
- webpage, 'duration', fatal=False))
+ urls = []
+ thumbnails = []
+ for _, covers in track.get('cover', {}).items():
+ for cover_id, cover_url in covers.items():
+ if not cover_url or cover_url in urls:
+ continue
+ urls.append(cover_url)
+ size = int_or_none(cover_id.lstrip('size'))
+ thumbnails.append({
+ 'id': cover_id,
+ 'url': cover_url,
+ 'width': size,
+ 'height': size,
+ })
+
+ tags = []
+ for tag in track.get('tags', []):
+ tag_name = tag.get('name')
+ if not tag_name:
+ continue
+ tags.append(tag_name)
+
+ stats = track.get('stats') or {}
return {
'id': track_id,
'display_id': display_id,
- 'thumbnail': thumbnail,
+ 'thumbnails': thumbnails,
'title': title,
- 'duration': duration,
- 'artist': artist,
- 'track': track,
- 'formats': formats
+ 'description': track.get('description'),
+ 'duration': int_or_none(track.get('duration')),
+ 'artist': artist_name,
+ 'track': track_name,
+ 'album': album.get('name'),
+ 'formats': formats,
+ 'license': '-'.join(track.get('licenseCC', [])) or None,
+ 'timestamp': int_or_none(track.get('dateCreated')),
+ 'view_count': int_or_none(stats.get('listenedAll')),
+ 'like_count': int_or_none(stats.get('favorited')),
+ 'average_rating': int_or_none(stats.get('averageNote')),
+ 'tags': tags,
}
-class JamendoAlbumIE(JamendoBaseIE):
- _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)/(?P<display_id>[\w-]+)'
+class JamendoAlbumIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?jamendo\.com/album/(?P<id>[0-9]+)'
_TEST = {
'url': 'https://www.jamendo.com/album/121486/duck-on-cover',
'info_dict': {
'id': '121486',
- 'title': 'Shearer - Duck On Cover'
+ 'title': 'Duck On Cover',
+ 'description': 'md5:c2920eaeef07d7af5b96d7c64daf1239',
},
'playlist': [{
'md5': 'e1a2fcb42bda30dfac990212924149a8',
@@ -111,6 +133,8 @@ class JamendoAlbumIE(JamendoBaseIE):
'title': 'Shearer - Warmachine',
'artist': 'Shearer',
'track': 'Warmachine',
+ 'timestamp': 1368089771,
+ 'upload_date': '20130509',
}
}, {
'md5': '1f358d7b2f98edfe90fd55dac0799d50',
@@ -120,6 +144,8 @@ class JamendoAlbumIE(JamendoBaseIE):
'title': 'Shearer - Without Your Ghost',
'artist': 'Shearer',
'track': 'Without Your Ghost',
+ 'timestamp': 1368089771,
+ 'upload_date': '20130509',
}
}],
'params': {
@@ -127,24 +153,35 @@ class JamendoAlbumIE(JamendoBaseIE):
}
}
+ def _call_api(self, resource, resource_id):
+ path = '/api/%ss' % resource
+ rand = compat_str(random.random())
+ return self._download_json(
+ 'https://www.jamendo.com' + path, resource_id, query={
+ 'id[]': resource_id,
+ }, headers={
+ 'X-Jam-Call': '$%s*%s~' % (hashlib.sha1((path + rand).encode()).hexdigest(), rand)
+ })[0]
+
def _real_extract(self, url):
- mobj = self._VALID_URL_RE.match(url)
- album_id = mobj.group('id')
-
- webpage = self._download_webpage(url, mobj.group('display_id'))
-
- title, artist, album = self._extract_meta(webpage, fatal=False)
-
- entries = [{
- '_type': 'url_transparent',
- 'url': compat_urlparse.urljoin(url, m.group('path')),
- 'ie_key': JamendoIE.ie_key(),
- 'id': self._search_regex(
- r'/track/(\d+)', m.group('path'), 'track id', default=None),
- 'artist': artist,
- 'album': album,
- } for m in re.finditer(
- r'<a[^>]+href=(["\'])(?P<path>(?:(?!\1).)+)\1[^>]+class=["\'][^>]*js-trackrow-albumpage-link',
- webpage)]
-
- return self.playlist_result(entries, album_id, title)
+ album_id = self._match_id(url)
+ album = self._call_api('album', album_id)
+ album_name = album.get('name')
+
+ entries = []
+ for track in album.get('tracks', []):
+ track_id = track.get('id')
+ if not track_id:
+ continue
+ track_id = compat_str(track_id)
+ entries.append({
+ '_type': 'url_transparent',
+ 'url': 'https://www.jamendo.com/track/' + track_id,
+ 'ie_key': JamendoIE.ie_key(),
+ 'id': track_id,
+ 'album': album_name,
+ })
+
+ return self.playlist_result(
+ entries, album_id, album_name,
+ clean_html(try_get(album, lambda x: x['description']['en'], compat_str)))
diff --git a/youtube_dl/extractor/kakao.py b/youtube_dl/extractor/kakao.py
index 7fa140b0c..32935bb28 100644
--- a/youtube_dl/extractor/kakao.py
+++ b/youtube_dl/extractor/kakao.py
@@ -6,14 +6,15 @@ from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
int_or_none,
+ strip_or_none,
unified_timestamp,
update_url_query,
)
class KakaoIE(InfoExtractor):
- _VALID_URL = r'https?://tv\.kakao\.com/channel/(?P<channel>\d+)/cliplink/(?P<id>\d+)'
- _API_BASE = 'http://tv.kakao.com/api/v1/ft/cliplinks'
+ _VALID_URL = r'https?://(?:play-)?tv\.kakao\.com/(?:channel/\d+|embed/player)/cliplink/(?P<id>\d+|[^?#&]+@my)'
+ _API_BASE_TMPL = 'http://tv.kakao.com/api/v1/ft/cliplinks/%s/'
_TESTS = [{
'url': 'http://tv.kakao.com/channel/2671005/cliplink/301965083',
@@ -36,7 +37,7 @@ class KakaoIE(InfoExtractor):
'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\r\n\r\n[쇼! 음악중심] 20160611, 507회',
'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)',
'uploader_id': 2653210,
- 'uploader': '쇼 음악중심',
+ 'uploader': '쇼! 음악중심',
'timestamp': 1485684628,
'upload_date': '20170129',
}
@@ -44,6 +45,8 @@ class KakaoIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ display_id = video_id.rstrip('@my')
+ api_base = self._API_BASE_TMPL % video_id
player_header = {
'Referer': update_url_query(
@@ -55,20 +58,23 @@ class KakaoIE(InfoExtractor):
})
}
- QUERY_COMMON = {
+ query = {
'player': 'monet_html5',
'referer': url,
'uuid': '',
'service': 'kakao_tv',
'section': '',
'dteType': 'PC',
+ 'fields': ','.join([
+ '-*', 'tid', 'clipLink', 'displayTitle', 'clip', 'title',
+ 'description', 'channelId', 'createTime', 'duration', 'playCount',
+ 'likeCount', 'commentCount', 'tagList', 'channel', 'name',
+ 'clipChapterThumbnailList', 'thumbnailUrl', 'timeInSec', 'isDefault',
+ 'videoOutputList', 'width', 'height', 'kbps', 'profile', 'label'])
}
- query = QUERY_COMMON.copy()
- query['fields'] = 'clipLink,clip,channel,hasPlusFriend,-service,-tagList'
impress = self._download_json(
- '%s/%s/impress' % (self._API_BASE, video_id),
- video_id, 'Downloading video info',
+ api_base + 'impress', display_id, 'Downloading video info',
query=query, headers=player_header)
clip_link = impress['clipLink']
@@ -76,32 +82,22 @@ class KakaoIE(InfoExtractor):
title = clip.get('title') or clip_link.get('displayTitle')
- tid = impress.get('tid', '')
-
- query = QUERY_COMMON.copy()
- query.update({
- 'tid': tid,
- 'profile': 'HIGH',
- })
- raw = self._download_json(
- '%s/%s/raw' % (self._API_BASE, video_id),
- video_id, 'Downloading video formats info',
- query=query, headers=player_header)
+ query['tid'] = impress.get('tid', '')
formats = []
- for fmt in raw.get('outputList', []):
+ for fmt in clip.get('videoOutputList', []):
try:
profile_name = fmt['profile']
+ if profile_name == 'AUDIO':
+ continue
+ query.update({
+ 'profile': profile_name,
+ 'fields': '-*,url',
+ })
fmt_url_json = self._download_json(
- '%s/%s/raw/videolocation' % (self._API_BASE, video_id),
- video_id,
+ api_base + 'raw/videolocation', display_id,
'Downloading video URL for profile %s' % profile_name,
- query={
- 'service': 'kakao_tv',
- 'section': '',
- 'tid': tid,
- 'profile': profile_name
- }, headers=player_header, fatal=False)
+ query=query, headers=player_header, fatal=False)
if fmt_url_json is None:
continue
@@ -113,7 +109,8 @@ class KakaoIE(InfoExtractor):
'width': int_or_none(fmt.get('width')),
'height': int_or_none(fmt.get('height')),
'format_note': fmt.get('label'),
- 'filesize': int_or_none(fmt.get('filesize'))
+ 'filesize': int_or_none(fmt.get('filesize')),
+ 'tbr': int_or_none(fmt.get('kbps')),
})
except KeyError:
pass
@@ -134,9 +131,9 @@ class KakaoIE(InfoExtractor):
})
return {
- 'id': video_id,
+ 'id': display_id,
'title': title,
- 'description': clip.get('description'),
+ 'description': strip_or_none(clip.get('description')),
'uploader': clip_link.get('channel', {}).get('name'),
'uploader_id': clip_link.get('channelId'),
'thumbnails': thumbs,
@@ -146,4 +143,5 @@ class KakaoIE(InfoExtractor):
'like_count': int_or_none(clip.get('likeCount')),
'comment_count': int_or_none(clip.get('commentCount')),
'formats': formats,
+ 'tags': clip.get('tagList'),
}
diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py
deleted file mode 100644
index 94a03d277..000000000
--- a/youtube_dl/extractor/keek.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-
-
-class KeekIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?keek\.com/keek/(?P<id>\w+)'
- IE_NAME = 'keek'
- _TEST = {
- 'url': 'https://www.keek.com/keek/NODfbab',
- 'md5': '9b0636f8c0f7614afa4ea5e4c6e57e83',
- 'info_dict': {
- 'id': 'NODfbab',
- 'ext': 'mp4',
- 'title': 'md5:35d42050a3ece241d5ddd7fdcc6fd896',
- 'uploader': 'ytdl',
- 'uploader_id': 'eGT5bab',
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- return {
- 'id': video_id,
- 'url': self._og_search_video_url(webpage),
- 'ext': 'mp4',
- 'title': self._og_search_description(webpage).strip(),
- 'thumbnail': self._og_search_thumbnail(webpage),
- 'uploader': self._search_regex(
- r'data-username=(["\'])(?P<uploader>.+?)\1', webpage,
- 'uploader', fatal=False, group='uploader'),
- 'uploader_id': self._search_regex(
- r'data-user-id=(["\'])(?P<uploader_id>.+?)\1', webpage,
- 'uploader id', fatal=False, group='uploader_id'),
- }
diff --git a/youtube_dl/extractor/kinja.py b/youtube_dl/extractor/kinja.py
new file mode 100644
index 000000000..79e3026d2
--- /dev/null
+++ b/youtube_dl/extractor/kinja.py
@@ -0,0 +1,221 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_urllib_parse_unquote,
+)
+from ..utils import (
+ int_or_none,
+ parse_iso8601,
+ strip_or_none,
+ try_get,
+ unescapeHTML,
+ urljoin,
+)
+
+
+class KinjaEmbedIE(InfoExtractor):
+ IENAME = 'kinja:embed'
+ _DOMAIN_REGEX = r'''(?:[^.]+\.)?
+ (?:
+ avclub|
+ clickhole|
+ deadspin|
+ gizmodo|
+ jalopnik|
+ jezebel|
+ kinja|
+ kotaku|
+ lifehacker|
+ splinternews|
+ the(?:inventory|onion|root|takeout)
+ )\.com'''
+ _COMMON_REGEX = r'''/
+ (?:
+ ajax/inset|
+ embed/video
+ )/iframe\?.*?\bid='''
+ _VALID_URL = r'''(?x)https?://%s%s
+ (?P<type>
+ fb|
+ imgur|
+ instagram|
+ jwp(?:layer)?-video|
+ kinjavideo|
+ mcp|
+ megaphone|
+ ooyala|
+ soundcloud(?:-playlist)?|
+ tumblr-post|
+ twitch-stream|
+ twitter|
+ ustream-channel|
+ vimeo|
+ vine|
+ youtube-(?:list|video)
+ )-(?P<id>[^&]+)''' % (_DOMAIN_REGEX, _COMMON_REGEX)
+ _TESTS = [{
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=fb-10103303356633621',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=kinjavideo-100313',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=megaphone-PPY1300931075',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=ooyala-xzMXhleDpopuT0u1ijt_qZj3Va-34pEX%2FZTIxYmJjZDM2NWYzZDViZGRiOWJjYzc5',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-128574047',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=soundcloud-playlist-317413750',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=tumblr-post-160130699814-daydreams-at-midnight',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=twitch-stream-libratus_extra',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=twitter-1068875942473404422',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=ustream-channel-10414700',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=vimeo-120153502',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=vine-5BlvV5qqPrD',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-list-BCQ3KyrPjgA/PLE6509247C270A72E',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://kinja.com/ajax/inset/iframe?id=youtube-video-00QyL0AgPAE',
+ 'only_matching': True,
+ }]
+ _JWPLATFORM_PROVIDER = ('cdn.jwplayer.com/v2/media/', 'JWPlatform')
+ _PROVIDER_MAP = {
+ 'fb': ('facebook.com/video.php?v=', 'Facebook'),
+ 'imgur': ('imgur.com/', 'Imgur'),
+ 'instagram': ('instagram.com/p/', 'Instagram'),
+ 'jwplayer-video': _JWPLATFORM_PROVIDER,
+ 'jwp-video': _JWPLATFORM_PROVIDER,
+ 'megaphone': ('player.megaphone.fm/', 'Generic'),
+ 'ooyala': ('player.ooyala.com/player.js?embedCode=', 'Ooyala'),
+ 'soundcloud': ('api.soundcloud.com/tracks/', 'Soundcloud'),
+ 'soundcloud-playlist': ('api.soundcloud.com/playlists/', 'SoundcloudPlaylist'),
+ 'tumblr-post': ('%s.tumblr.com/post/%s', 'Tumblr'),
+ 'twitch-stream': ('twitch.tv/', 'TwitchStream'),
+ 'twitter': ('twitter.com/i/cards/tfw/v1/', 'TwitterCard'),
+ 'ustream-channel': ('ustream.tv/embed/', 'Ustream'),
+ 'vimeo': ('vimeo.com/', 'Vimeo'),
+ 'vine': ('vine.co/v/', 'Vine'),
+ 'youtube-list': ('youtube.com/embed/%s?list=%s', 'YoutubePlaylist'),
+ 'youtube-video': ('youtube.com/embed/', 'Youtube'),
+ }
+
+ @staticmethod
+ def _extract_urls(webpage, url):
+ return [urljoin(url, unescapeHTML(mobj.group('url'))) for mobj in re.finditer(
+ r'(?x)<iframe[^>]+?src=(?P<q>["\'])(?P<url>(?:(?:https?:)?//%s)?%s(?:(?!\1).)+)\1' % (KinjaEmbedIE._DOMAIN_REGEX, KinjaEmbedIE._COMMON_REGEX),
+ webpage)]
+
+ def _real_extract(self, url):
+ video_type, video_id = re.match(self._VALID_URL, url).groups()
+
+ provider = self._PROVIDER_MAP.get(video_type)
+ if provider:
+ video_id = compat_urllib_parse_unquote(video_id)
+ if video_type == 'tumblr-post':
+ video_id, blog = video_id.split('-', 1)
+ result_url = provider[0] % (blog, video_id)
+ elif video_type == 'youtube-list':
+ video_id, playlist_id = video_id.split('/')
+ result_url = provider[0] % (video_id, playlist_id)
+ else:
+ if video_type == 'ooyala':
+ video_id = video_id.split('/')[0]
+ result_url = provider[0] + video_id
+ return self.url_result('http://' + result_url, provider[1])
+
+ if video_type == 'kinjavideo':
+ data = self._download_json(
+ 'https://kinja.com/api/core/video/views/videoById',
+ video_id, query={'videoId': video_id})['data']
+ title = data['title']
+
+ formats = []
+ for k in ('signedPlaylist', 'streaming'):
+ m3u8_url = data.get(k + 'Url')
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ self._sort_formats(formats)
+
+ thumbnail = None
+ poster = data.get('poster') or {}
+ poster_id = poster.get('id')
+ if poster_id:
+ thumbnail = 'https://i.kinja-img.com/gawker-media/image/upload/%s.%s' % (poster_id, poster.get('format') or 'jpg')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': strip_or_none(data.get('description')),
+ 'formats': formats,
+ 'tags': data.get('tags'),
+ 'timestamp': int_or_none(try_get(
+ data, lambda x: x['postInfo']['publishTimeMillis']), 1000),
+ 'thumbnail': thumbnail,
+ 'uploader': data.get('network'),
+ }
+ else:
+ video_data = self._download_json(
+ 'https://api.vmh.univision.com/metadata/v1/content/' + video_id,
+ video_id)['videoMetadata']
+ iptc = video_data['photoVideoMetadataIPTC']
+ title = iptc['title']['en']
+ fmg = video_data.get('photoVideoMetadata_fmg') or {}
+ tvss_domain = fmg.get('tvssDomain') or 'https://auth.univision.com'
+ data = self._download_json(
+ tvss_domain + '/api/v3/video-auth/url-signature-tokens',
+ video_id, query={'mcpids': video_id})['data'][0]
+ formats = []
+
+ rendition_url = data.get('renditionUrl')
+ if rendition_url:
+ formats = self._extract_m3u8_formats(
+ rendition_url, video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False)
+
+ fallback_rendition_url = data.get('fallbackRenditionUrl')
+ if fallback_rendition_url:
+ formats.append({
+ 'format_id': 'fallback',
+ 'tbr': int_or_none(self._search_regex(
+ r'_(\d+)\.mp4', fallback_rendition_url,
+ 'bitrate', default=None)),
+ 'url': fallback_rendition_url,
+ })
+
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': try_get(iptc, lambda x: x['cloudinaryLink']['link'], compat_str),
+ 'uploader': fmg.get('network'),
+ 'duration': int_or_none(iptc.get('fileDuration')),
+ 'formats': formats,
+ 'description': try_get(iptc, lambda x: x['description']['en'], compat_str),
+ 'timestamp': parse_iso8601(iptc.get('dateReleased')),
+ }
diff --git a/youtube_dl/extractor/kontrtube.py b/youtube_dl/extractor/kontrtube.py
deleted file mode 100644
index 1fda45107..000000000
--- a/youtube_dl/extractor/kontrtube.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- parse_duration,
-)
-
-
-class KontrTubeIE(InfoExtractor):
- IE_NAME = 'kontrtube'
- IE_DESC = 'KontrTube.ru - Труба зовёт'
- _VALID_URL = r'https?://(?:www\.)?kontrtube\.ru/videos/(?P<id>\d+)/(?P<display_id>[^/]+)/'
-
- _TEST = {
- 'url': 'http://www.kontrtube.ru/videos/2678/nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag/',
- 'md5': '975a991a4926c9a85f383a736a2e6b80',
- 'info_dict': {
- 'id': '2678',
- 'display_id': 'nad-olimpiyskoy-derevney-v-sochi-podnyat-rossiyskiy-flag',
- 'ext': 'mp4',
- 'title': 'Над олимпийской деревней в Сочи поднят российский флаг',
- 'description': 'md5:80edc4c613d5887ae8ccf1d59432be41',
- 'thumbnail': 'http://www.kontrtube.ru/contents/videos_screenshots/2000/2678/preview.mp4.jpg',
- 'duration': 270,
- }
- }
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
- display_id = mobj.group('display_id')
-
- webpage = self._download_webpage(
- url, display_id, 'Downloading page')
-
- video_url = self._search_regex(
- r"video_url\s*:\s*'(.+?)/?',", webpage, 'video URL')
- thumbnail = self._search_regex(
- r"preview_url\s*:\s*'(.+?)/?',", webpage, 'thumbnail', fatal=False)
- title = self._html_search_regex(
- r'(?s)<h2>(.+?)</h2>', webpage, 'title')
- description = self._html_search_meta(
- 'description', webpage, 'description')
-
- duration = self._search_regex(
- r'Длительность: <em>([^<]+)</em>', webpage, 'duration', fatal=False)
- if duration:
- duration = parse_duration(duration.replace('мин', 'min').replace('сек', 'sec'))
-
- view_count = self._search_regex(
- r'Просмотров: <em>([^<]+)</em>',
- webpage, 'view count', fatal=False)
- if view_count:
- view_count = int_or_none(view_count.replace(' ', ''))
-
- comment_count = int_or_none(self._search_regex(
- r'Комментарии \((\d+)\)<', webpage, ' comment count', fatal=False))
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'url': video_url,
- 'thumbnail': thumbnail,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'view_count': int_or_none(view_count),
- 'comment_count': int_or_none(comment_count),
- }
diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py
index 6373268c4..c3b4ffa7e 100644
--- a/youtube_dl/extractor/la7.py
+++ b/youtube_dl/extractor/la7.py
@@ -20,7 +20,7 @@ class LA7IE(InfoExtractor):
'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722',
'md5': '8b613ffc0c4bf9b9e377169fc19c214c',
'info_dict': {
- 'id': 'inccool8-02-10-2015-163722',
+ 'id': '0_42j6wd36',
'ext': 'mp4',
'title': 'Inc.Cool8',
'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico',
@@ -57,7 +57,7 @@ class LA7IE(InfoExtractor):
return {
'_type': 'url_transparent',
'url': smuggle_url('kaltura:103:%s' % player_data['vid'], {
- 'service_url': 'http://kdam.iltrovatore.it',
+ 'service_url': 'http://nkdam.iltrovatore.it',
}),
'id': video_id,
'title': player_data['title'],
diff --git a/youtube_dl/extractor/learnr.py b/youtube_dl/extractor/learnr.py
deleted file mode 100644
index 1435e090e..000000000
--- a/youtube_dl/extractor/learnr.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-
-
-class LearnrIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?learnr\.pro/view/video/(?P<id>[0-9]+)'
- _TEST = {
- 'url': 'http://www.learnr.pro/view/video/51624-web-development-tutorial-for-beginners-1-how-to-build-webpages-with-html-css-javascript',
- 'md5': '3719fdf0a68397f49899e82c308a89de',
- 'info_dict': {
- 'id': '51624',
- 'ext': 'mp4',
- 'title': 'Web Development Tutorial for Beginners (#1) - How to build webpages with HTML, CSS, Javascript',
- 'description': 'md5:b36dbfa92350176cdf12b4d388485503',
- 'uploader': 'LearnCode.academy',
- 'uploader_id': 'learncodeacademy',
- 'upload_date': '20131021',
- },
- 'add_ie': ['Youtube'],
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- return {
- '_type': 'url_transparent',
- 'url': self._search_regex(
- r"videoId\s*:\s*'([^']+)'", webpage, 'youtube id'),
- 'id': video_id,
- }
diff --git a/youtube_dl/extractor/lnkgo.py b/youtube_dl/extractor/lnkgo.py
index cfec0d3d0..3e71852aa 100644
--- a/youtube_dl/extractor/lnkgo.py
+++ b/youtube_dl/extractor/lnkgo.py
@@ -5,24 +5,27 @@ import re
from .common import InfoExtractor
from ..utils import (
+ clean_html,
+ compat_str,
int_or_none,
- unified_strdate,
+ parse_iso8601,
)
class LnkGoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?lnkgo\.(?:alfa\.)?lt/visi-video/(?P<show>[^/]+)/ziurek-(?P<id>[A-Za-z0-9-]+)'
+ _VALID_URL = r'https?://(?:www\.)?lnk(?:go)?\.(?:alfa\.)?lt/(?:visi-video/[^/]+|video)/(?P<id>[A-Za-z0-9-]+)(?:/(?P<episode_id>\d+))?'
_TESTS = [{
- 'url': 'http://lnkgo.alfa.lt/visi-video/yra-kaip-yra/ziurek-yra-kaip-yra-162',
+ 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai',
'info_dict': {
- 'id': '46712',
+ 'id': '10809',
'ext': 'mp4',
- 'title': 'Yra kaip yra',
- 'upload_date': '20150107',
- 'description': 'md5:d82a5e36b775b7048617f263a0e3475e',
- 'age_limit': 7,
- 'duration': 3019,
- 'thumbnail': r're:^https?://.*\.jpg$'
+ 'title': "Put'ka: Trys Klausimai",
+ 'upload_date': '20161216',
+ 'description': 'Seniai matytas Put’ka užduoda tris klausimėlius. Pabandykime surasti atsakymus.',
+ 'age_limit': 18,
+ 'duration': 117,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'timestamp': 1481904000,
},
'params': {
'skip_download': True, # HLS download
@@ -30,20 +33,21 @@ class LnkGoIE(InfoExtractor):
}, {
'url': 'http://lnkgo.alfa.lt/visi-video/aktualai-pratesimas/ziurek-nerdas-taiso-kompiuteri-2',
'info_dict': {
- 'id': '47289',
+ 'id': '10467',
'ext': 'mp4',
'title': 'Nėrdas: Kompiuterio Valymas',
'upload_date': '20150113',
'description': 'md5:7352d113a242a808676ff17e69db6a69',
'age_limit': 18,
'duration': 346,
- 'thumbnail': r're:^https?://.*\.jpg$'
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'timestamp': 1421164800,
},
'params': {
'skip_download': True, # HLS download
},
}, {
- 'url': 'http://www.lnkgo.lt/visi-video/aktualai-pratesimas/ziurek-putka-trys-klausimai',
+ 'url': 'https://lnk.lt/video/neigalieji-tv-bokste/37413',
'only_matching': True,
}]
_AGE_LIMITS = {
@@ -51,66 +55,34 @@ class LnkGoIE(InfoExtractor):
'N-14': 14,
'S': 18,
}
+ _M3U8_TEMPL = 'https://vod.lnk.lt/lnk_vod/lnk/lnk/%s:%s/playlist.m3u8%s'
def _real_extract(self, url):
- display_id = self._match_id(url)
-
- webpage = self._download_webpage(
- url, display_id, 'Downloading player webpage')
-
- video_id = self._search_regex(
- r'data-ep="([^"]+)"', webpage, 'video ID')
- title = self._og_search_title(webpage)
- description = self._og_search_description(webpage)
- upload_date = unified_strdate(self._search_regex(
- r'class="[^"]*meta-item[^"]*air-time[^"]*">.*?<strong>([^<]+)</strong>', webpage, 'upload date', fatal=False))
-
- thumbnail_w = int_or_none(
- self._og_search_property('image:width', webpage, 'thumbnail width', fatal=False))
- thumbnail_h = int_or_none(
- self._og_search_property('image:height', webpage, 'thumbnail height', fatal=False))
- thumbnail = {
- 'url': self._og_search_thumbnail(webpage),
- }
- if thumbnail_w and thumbnail_h:
- thumbnail.update({
- 'width': thumbnail_w,
- 'height': thumbnail_h,
- })
-
- config = self._parse_json(self._search_regex(
- r'episodePlayer\((\{.*?\}),\s*\{', webpage, 'sources'), video_id)
-
- if config.get('pGeo'):
- self.report_warning(
- 'This content might not be available in your country due to copyright reasons')
+ display_id, video_id = re.match(self._VALID_URL, url).groups()
- formats = [{
- 'format_id': 'hls',
- 'ext': 'mp4',
- 'url': config['EpisodeVideoLink_HLS'],
- }]
-
- m = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<play_path>.+)$', config['EpisodeVideoLink'])
- if m:
- formats.append({
- 'format_id': 'rtmp',
- 'ext': 'flv',
- 'url': m.group('url'),
- 'play_path': m.group('play_path'),
- 'page_url': url,
- })
+ video_info = self._download_json(
+ 'https://lnk.lt/api/main/video-page/%s/%s/false' % (display_id, video_id or '0'),
+ display_id)['videoConfig']['videoInfo']
+ video_id = compat_str(video_info['id'])
+ title = video_info['title']
+ prefix = 'smil' if video_info.get('isQualityChangeAvailable') else 'mp4'
+ formats = self._extract_m3u8_formats(
+ self._M3U8_TEMPL % (prefix, video_info['videoUrl'], video_info.get('secureTokenParams') or ''),
+ video_id, 'mp4', 'm3u8_native')
self._sort_formats(formats)
+ poster_image = video_info.get('posterImage')
+
return {
'id': video_id,
'display_id': display_id,
'title': title,
'formats': formats,
- 'thumbnails': [thumbnail],
- 'duration': int_or_none(config.get('VideoTime')),
- 'description': description,
- 'age_limit': self._AGE_LIMITS.get(config.get('PGRating'), 0),
- 'upload_date': upload_date,
+ 'thumbnail': 'https://lnk.lt/all-images/' + poster_image if poster_image else None,
+ 'duration': int_or_none(video_info.get('duration')),
+ 'description': clean_html(video_info.get('htmlDescription')),
+ 'age_limit': self._AGE_LIMITS.get(video_info.get('pgRating'), 0),
+ 'timestamp': parse_iso8601(video_info.get('airDate')),
+ 'view_count': int_or_none(video_info.get('viewsCount')),
}
diff --git a/youtube_dl/extractor/macgamestore.py b/youtube_dl/extractor/macgamestore.py
deleted file mode 100644
index 43db9929c..000000000
--- a/youtube_dl/extractor/macgamestore.py
+++ /dev/null
@@ -1,42 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import ExtractorError
-
-
-class MacGameStoreIE(InfoExtractor):
- IE_NAME = 'macgamestore'
- IE_DESC = 'MacGameStore trailers'
- _VALID_URL = r'https?://(?:www\.)?macgamestore\.com/mediaviewer\.php\?trailer=(?P<id>\d+)'
-
- _TEST = {
- 'url': 'http://www.macgamestore.com/mediaviewer.php?trailer=2450',
- 'md5': '8649b8ea684b6666b4c5be736ecddc61',
- 'info_dict': {
- 'id': '2450',
- 'ext': 'm4v',
- 'title': 'Crow',
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(
- url, video_id, 'Downloading trailer page')
-
- if '>Missing Media<' in webpage:
- raise ExtractorError(
- 'Trailer %s does not exist' % video_id, expected=True)
-
- video_title = self._html_search_regex(
- r'<title>MacGameStore: (.*?) Trailer</title>', webpage, 'title')
-
- video_url = self._html_search_regex(
- r'(?s)<div\s+id="video-player".*?href="([^"]+)"\s*>',
- webpage, 'video URL')
-
- return {
- 'id': video_id,
- 'url': video_url,
- 'title': video_title
- }
diff --git a/youtube_dl/extractor/makertv.py b/youtube_dl/extractor/makertv.py
deleted file mode 100644
index 8eda69cfc..000000000
--- a/youtube_dl/extractor/makertv.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-
-
-class MakerTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:www\.)?maker\.tv/(?:[^/]+/)*video|makerplayer\.com/embed/maker)/(?P<id>[a-zA-Z0-9]{12})'
- _TEST = {
- 'url': 'http://www.maker.tv/video/Fh3QgymL9gsc',
- 'md5': 'ca237a53a8eb20b6dc5bd60564d4ab3e',
- 'info_dict': {
- 'id': 'Fh3QgymL9gsc',
- 'ext': 'mp4',
- 'title': 'Maze Runner: The Scorch Trials Official Movie Review',
- 'description': 'md5:11ff3362d7ef1d679fdb649f6413975a',
- 'upload_date': '20150918',
- 'timestamp': 1442549540,
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- jwplatform_id = self._search_regex(r'jw_?id="([^"]+)"', webpage, 'jwplatform id')
-
- return {
- '_type': 'url_transparent',
- 'id': video_id,
- 'url': 'jwplatform:%s' % jwplatform_id,
- 'ie_key': 'JWPlatform',
- }
diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py
index df3748798..f976506f4 100644
--- a/youtube_dl/extractor/mediaset.py
+++ b/youtube_dl/extractor/mediaset.py
@@ -27,7 +27,7 @@ class MediasetIE(ThePlatformBaseIE):
(?:video|on-demand)/(?:[^/]+/)+[^/]+_|
player/index\.html\?.*?\bprogramGuid=
)
- )(?P<id>[0-9A-Z]{16})
+ )(?P<id>[0-9A-Z]{16,})
'''
_TESTS = [{
# full episode
@@ -62,7 +62,6 @@ class MediasetIE(ThePlatformBaseIE):
'uploader': 'Canale 5',
'uploader_id': 'C5',
},
- 'expected_warnings': ['HTTP Error 403: Forbidden'],
}, {
# clip
'url': 'https://www.mediasetplay.mediaset.it/video/gogglebox/un-grande-classico-della-commedia-sexy_FAFU000000661680',
@@ -78,6 +77,18 @@ class MediasetIE(ThePlatformBaseIE):
}, {
'url': 'mediaset:FAFU000000665924',
'only_matching': True,
+ }, {
+ 'url': 'https://www.mediasetplay.mediaset.it/video/mediasethaacuoreilfuturo/palmieri-alicudi-lisola-dei-tre-bambini-felici--un-decreto-per-alicudi-e-tutte-le-microscuole_FD00000000102295',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.mediasetplay.mediaset.it/video/cherryseason/anticipazioni-degli-episodi-del-23-ottobre_F306837101005C02',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.mediasetplay.mediaset.it/video/tg5/ambiente-onda-umana-per-salvare-il-pianeta_F309453601079D01',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.mediasetplay.mediaset.it/video/grandefratellovip/benedetta-una-doccia-gelata_F309344401044C135',
+ 'only_matching': True,
}]
@staticmethod
@@ -109,6 +120,11 @@ class MediasetIE(ThePlatformBaseIE):
entries.append(embed_url)
return entries
+    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
+        for video in smil.findall(self._xpath_ns('.//video', namespace)):  # rewrite each <video> src before delegating to the base SMIL parser
+            video.attrib['src'] = re.sub(r'(https?://vod05)t(-mediaset-it\.akamaized\.net/.+?\.mpd)\?.+', r'\1\2', video.attrib['src'])  # vod05t -> vod05 host and drop the query string of .mpd URLs; dot before "mpd" is escaped so only a literal ".mpd" matches
+        return super(MediasetIE, self)._parse_smil_formats(smil, smil_url, video_id, namespace, f4m_params, transform_rtmp_url)  # explicit two-argument super(): the zero-argument form only exists on Python 3
+
def _real_extract(self, url):
guid = self._match_id(url)
tp_path = 'PR1GhC/media/guid/2702976343/' + guid
@@ -118,14 +134,15 @@ class MediasetIE(ThePlatformBaseIE):
subtitles = {}
first_e = None
for asset_type in ('SD', 'HD'):
- for f in ('MPEG4', 'MPEG-DASH', 'M3U', 'ISM'):
+ # TODO: fixup ISM+none manifest URLs
+ for f in ('MPEG4', 'MPEG-DASH+none', 'M3U+none'):
try:
tp_formats, tp_subtitles = self._extract_theplatform_smil(
update_url_query('http://link.theplatform.%s/s/%s' % (self._TP_TLD, tp_path), {
'mbr': 'true',
'formats': f,
'assetTypes': asset_type,
- }), guid, 'Downloading %s %s SMIL data' % (f, asset_type))
+ }), guid, 'Downloading %s %s SMIL data' % (f.split('+')[0], asset_type))
except ExtractorError as e:
if not first_e:
first_e = e
diff --git a/youtube_dl/extractor/minhateca.py b/youtube_dl/extractor/minhateca.py
deleted file mode 100644
index dccc54249..000000000
--- a/youtube_dl/extractor/minhateca.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- parse_duration,
- parse_filesize,
- sanitized_Request,
- urlencode_postdata,
-)
-
-
-class MinhatecaIE(InfoExtractor):
- _VALID_URL = r'https?://minhateca\.com\.br/[^?#]+,(?P<id>[0-9]+)\.'
- _TEST = {
- 'url': 'http://minhateca.com.br/pereba/misc/youtube-dl+test+video,125848331.mp4(video)',
- 'info_dict': {
- 'id': '125848331',
- 'ext': 'mp4',
- 'title': 'youtube-dl test video',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'filesize_approx': 1530000,
- 'duration': 9,
- 'view_count': int,
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- token = self._html_search_regex(
- r'<input name="__RequestVerificationToken".*?value="([^"]+)"',
- webpage, 'request token')
- token_data = [
- ('fileId', video_id),
- ('__RequestVerificationToken', token),
- ]
- req = sanitized_Request(
- 'http://minhateca.com.br/action/License/Download',
- data=urlencode_postdata(token_data))
- req.add_header('Content-Type', 'application/x-www-form-urlencoded')
- data = self._download_json(
- req, video_id, note='Downloading metadata')
-
- video_url = data['redirectUrl']
- title_str = self._html_search_regex(
- r'<h1.*?>(.*?)</h1>', webpage, 'title')
- title, _, ext = title_str.rpartition('.')
- filesize_approx = parse_filesize(self._html_search_regex(
- r'<p class="fileSize">(.*?)</p>',
- webpage, 'file size approximation', fatal=False))
- duration = parse_duration(self._html_search_regex(
- r'(?s)<p class="fileLeng[ht][th]">.*?class="bold">(.*?)<',
- webpage, 'duration', fatal=False))
- view_count = int_or_none(self._html_search_regex(
- r'<p class="downloadsCounter">([0-9]+)</p>',
- webpage, 'view count', fatal=False))
-
- return {
- 'id': video_id,
- 'url': video_url,
- 'title': title,
- 'ext': ext,
- 'filesize_approx': filesize_approx,
- 'duration': duration,
- 'view_count': view_count,
- 'thumbnail': self._og_search_thumbnail(webpage),
- }
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index bf5353ef9..9759560f1 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import functools
import itertools
import re
@@ -11,28 +10,37 @@ from ..compat import (
compat_ord,
compat_str,
compat_urllib_parse_unquote,
- compat_urlparse,
compat_zip
)
from ..utils import (
- clean_html,
- ExtractorError,
int_or_none,
- OnDemandPagedList,
- str_to_int,
+ parse_iso8601,
+ strip_or_none,
try_get,
- urljoin,
)
-class MixcloudIE(InfoExtractor):
+class MixcloudBaseIE(InfoExtractor):
+ def _call_api(self, object_type, object_fields, display_id, username, slug=None):
+ lookup_key = object_type + 'Lookup'
+ return self._download_json(
+ 'https://www.mixcloud.com/graphql', display_id, query={
+ 'query': '''{
+ %s(lookup: {username: "%s"%s}) {
+ %s
+ }
+}''' % (lookup_key, username, ', slug: "%s"' % slug if slug else '', object_fields)
+ })['data'][lookup_key]
+
+
+class MixcloudIE(MixcloudBaseIE):
_VALID_URL = r'https?://(?:(?:www|beta|m)\.)?mixcloud\.com/([^/]+)/(?!stream|uploads|favorites|listens|playlists)([^/]+)'
IE_NAME = 'mixcloud'
_TESTS = [{
'url': 'http://www.mixcloud.com/dholbach/cryptkeeper/',
'info_dict': {
- 'id': 'dholbach-cryptkeeper',
+ 'id': 'dholbach_cryptkeeper',
'ext': 'm4a',
'title': 'Cryptkeeper',
'description': 'After quite a long silence from myself, finally another Drum\'n\'Bass mix with my favourite current dance floor bangers.',
@@ -40,11 +48,13 @@ class MixcloudIE(InfoExtractor):
'uploader_id': 'dholbach',
'thumbnail': r're:https?://.*\.jpg',
'view_count': int,
+ 'timestamp': 1321359578,
+ 'upload_date': '20111115',
},
}, {
'url': 'http://www.mixcloud.com/gillespeterson/caribou-7-inch-vinyl-mix-chat/',
'info_dict': {
- 'id': 'gillespeterson-caribou-7-inch-vinyl-mix-chat',
+ 'id': 'gillespeterson_caribou-7-inch-vinyl-mix-chat',
'ext': 'mp3',
'title': 'Caribou 7 inch Vinyl Mix & Chat',
'description': 'md5:2b8aec6adce69f9d41724647c65875e8',
@@ -52,11 +62,14 @@ class MixcloudIE(InfoExtractor):
'uploader_id': 'gillespeterson',
'thumbnail': 're:https?://.*',
'view_count': int,
+ 'timestamp': 1422987057,
+ 'upload_date': '20150203',
},
}, {
'url': 'https://beta.mixcloud.com/RedLightRadio/nosedrip-15-red-light-radio-01-18-2016/',
'only_matching': True,
}]
+ _DECRYPTION_KEY = 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'
@staticmethod
def _decrypt_xor_cipher(key, ciphertext):
@@ -66,176 +79,193 @@ class MixcloudIE(InfoExtractor):
for ch, k in compat_zip(ciphertext, itertools.cycle(key))])
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- uploader = mobj.group(1)
- cloudcast_name = mobj.group(2)
- track_id = compat_urllib_parse_unquote('-'.join((uploader, cloudcast_name)))
+ username, slug = re.match(self._VALID_URL, url).groups()
+ username, slug = compat_urllib_parse_unquote(username), compat_urllib_parse_unquote(slug)
+ track_id = '%s_%s' % (username, slug)
+
+ cloudcast = self._call_api('cloudcast', '''audioLength
+ comments(first: 100) {
+ edges {
+ node {
+ comment
+ created
+ user {
+ displayName
+ username
+ }
+ }
+ }
+ totalCount
+ }
+ description
+ favorites {
+ totalCount
+ }
+ featuringArtistList
+ isExclusive
+ name
+ owner {
+ displayName
+ url
+ username
+ }
+ picture(width: 1024, height: 1024) {
+ url
+ }
+ plays
+ publishDate
+ reposts {
+ totalCount
+ }
+ streamInfo {
+ dashUrl
+ hlsUrl
+ url
+ }
+ tags {
+ tag {
+ name
+ }
+ }''', track_id, username, slug)
- webpage = self._download_webpage(url, track_id)
+ title = cloudcast['name']
- # Legacy path
- encrypted_play_info = self._search_regex(
- r'm-play-info="([^"]+)"', webpage, 'play info', default=None)
+ stream_info = cloudcast['streamInfo']
+ formats = []
- if encrypted_play_info is not None:
- # Decode
- encrypted_play_info = compat_b64decode(encrypted_play_info)
- else:
- # New path
- full_info_json = self._parse_json(self._html_search_regex(
- r'<script id="relay-data" type="text/x-mixcloud">([^<]+)</script>',
- webpage, 'play info'), 'play info')
- for item in full_info_json:
- item_data = try_get(
- item, lambda x: x['cloudcast']['data']['cloudcastLookup'],
- dict)
- if try_get(item_data, lambda x: x['streamInfo']['url']):
- info_json = item_data
- break
- else:
- raise ExtractorError('Failed to extract matching stream info')
-
- message = self._html_search_regex(
- r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
- webpage, 'error message', default=None)
-
- js_url = self._search_regex(
- r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/(?:js2/www_js_4|js/www)\.[^>]+\.js)',
- webpage, 'js url')
- js = self._download_webpage(js_url, track_id, 'Downloading JS')
- # Known plaintext attack
- if encrypted_play_info:
- kps = ['{"stream_url":']
- kpa_target = encrypted_play_info
- else:
- kps = ['https://', 'http://']
- kpa_target = compat_b64decode(info_json['streamInfo']['url'])
- for kp in kps:
- partial_key = self._decrypt_xor_cipher(kpa_target, kp)
- for quote in ["'", '"']:
- key = self._search_regex(
- r'{0}({1}[^{0}]*){0}'.format(quote, re.escape(partial_key)),
- js, 'encryption key', default=None)
- if key is not None:
- break
+ for url_key in ('url', 'hlsUrl', 'dashUrl'):
+ format_url = stream_info.get(url_key)
+ if not format_url:
+ continue
+ decrypted = self._decrypt_xor_cipher(
+ self._DECRYPTION_KEY, compat_b64decode(format_url))
+ if url_key == 'hlsUrl':
+ formats.extend(self._extract_m3u8_formats(
+ decrypted, track_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif url_key == 'dashUrl':
+ formats.extend(self._extract_mpd_formats(
+ decrypted, track_id, mpd_id='dash', fatal=False))
else:
+ formats.append({
+ 'format_id': 'http',
+ 'url': decrypted,
+ 'downloader_options': {
+ # Mixcloud starts throttling at >~5M
+ 'http_chunk_size': 5242880,
+ },
+ })
+
+ if not formats and cloudcast.get('isExclusive'):
+ self.raise_login_required()
+
+ self._sort_formats(formats)
+
+ comments = []
+ for edge in (try_get(cloudcast, lambda x: x['comments']['edges']) or []):
+ node = edge.get('node') or {}
+ text = strip_or_none(node.get('comment'))
+ if not text:
continue
- break
- else:
- raise ExtractorError('Failed to extract encryption key')
-
- if encrypted_play_info is not None:
- play_info = self._parse_json(self._decrypt_xor_cipher(key, encrypted_play_info), 'play info')
- if message and 'stream_url' not in play_info:
- raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
- song_url = play_info['stream_url']
- formats = [{
- 'format_id': 'normal',
- 'url': song_url
- }]
-
- title = self._html_search_regex(r'm-title="([^"]+)"', webpage, 'title')
- thumbnail = self._proto_relative_url(self._html_search_regex(
- r'm-thumbnail-url="([^"]+)"', webpage, 'thumbnail', fatal=False))
- uploader = self._html_search_regex(
- r'm-owner-name="([^"]+)"', webpage, 'uploader', fatal=False)
- uploader_id = self._search_regex(
- r'\s+"profile": "([^"]+)",', webpage, 'uploader id', fatal=False)
- description = self._og_search_description(webpage)
- view_count = str_to_int(self._search_regex(
- [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
- r'/listeners/?">([0-9,.]+)</a>',
- r'(?:m|data)-tooltip=["\']([\d,.]+) plays'],
- webpage, 'play count', default=None))
+ user = node.get('user') or {}
+ comments.append({
+ 'author': user.get('displayName'),
+ 'author_id': user.get('username'),
+ 'text': text,
+ 'timestamp': parse_iso8601(node.get('created')),
+ })
- else:
- title = info_json['name']
- thumbnail = urljoin(
- 'https://thumbnailer.mixcloud.com/unsafe/600x600/',
- try_get(info_json, lambda x: x['picture']['urlRoot'], compat_str))
- uploader = try_get(info_json, lambda x: x['owner']['displayName'])
- uploader_id = try_get(info_json, lambda x: x['owner']['username'])
- description = try_get(info_json, lambda x: x['description'])
- view_count = int_or_none(try_get(info_json, lambda x: x['plays']))
-
- stream_info = info_json['streamInfo']
- formats = []
-
- def decrypt_url(f_url):
- for k in (key, 'IFYOUWANTTHEARTISTSTOGETPAIDDONOTDOWNLOADFROMMIXCLOUD'):
- decrypted_url = self._decrypt_xor_cipher(k, f_url)
- if re.search(r'^https?://[0-9A-Za-z.]+/[0-9A-Za-z/.?=&_-]+$', decrypted_url):
- return decrypted_url
-
- for url_key in ('url', 'hlsUrl', 'dashUrl'):
- format_url = stream_info.get(url_key)
- if not format_url:
- continue
- decrypted = decrypt_url(compat_b64decode(format_url))
- if not decrypted:
- continue
- if url_key == 'hlsUrl':
- formats.extend(self._extract_m3u8_formats(
- decrypted, track_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
- elif url_key == 'dashUrl':
- formats.extend(self._extract_mpd_formats(
- decrypted, track_id, mpd_id='dash', fatal=False))
- else:
- formats.append({
- 'format_id': 'http',
- 'url': decrypted,
- 'downloader_options': {
- # Mixcloud starts throttling at >~5M
- 'http_chunk_size': 5242880,
- },
- })
- self._sort_formats(formats)
+        tags = []
+        for t in cloudcast.get('tags') or []:  # tolerate a missing/null tags field, like the comments and featuringArtistList lookups do
+            tag = try_get(t, lambda x: x['tag']['name'], compat_str)
+            if tag:  # keep only real tag names; the previous `if not tag` test collected nothing but None/empty values
+                tags.append(tag)
+
+ get_count = lambda x: int_or_none(try_get(cloudcast, lambda y: y[x]['totalCount']))
+
+ owner = cloudcast.get('owner') or {}
return {
'id': track_id,
'title': title,
'formats': formats,
- 'description': description,
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'view_count': view_count,
+ 'description': cloudcast.get('description'),
+ 'thumbnail': try_get(cloudcast, lambda x: x['picture']['url'], compat_str),
+ 'uploader': owner.get('displayName'),
+ 'timestamp': parse_iso8601(cloudcast.get('publishDate')),
+ 'uploader_id': owner.get('username'),
+ 'uploader_url': owner.get('url'),
+ 'duration': int_or_none(cloudcast.get('audioLength')),
+ 'view_count': int_or_none(cloudcast.get('plays')),
+ 'like_count': get_count('favorites'),
+ 'repost_count': get_count('reposts'),
+ 'comment_count': get_count('comments'),
+ 'comments': comments,
+ 'tags': tags,
+ 'artist': ', '.join(cloudcast.get('featuringArtistList') or []) or None,
}
-class MixcloudPlaylistBaseIE(InfoExtractor):
- _PAGE_SIZE = 24
+class MixcloudPlaylistBaseIE(MixcloudBaseIE):
+ def _get_cloudcast(self, node):
+ return node
- def _find_urls_in_page(self, page):
- for url in re.findall(r'm-play-button m-url="(?P<url>[^"]+)"', page):
- yield self.url_result(
- compat_urlparse.urljoin('https://www.mixcloud.com', clean_html(url)),
- MixcloudIE.ie_key())
+ def _get_playlist_title(self, title, slug):
+ return title
+
+ def _real_extract(self, url):
+ username, slug = re.match(self._VALID_URL, url).groups()
+ username = compat_urllib_parse_unquote(username)
+ if not slug:
+ slug = 'uploads'
+ else:
+ slug = compat_urllib_parse_unquote(slug)
+ playlist_id = '%s_%s' % (username, slug)
- def _fetch_tracks_page(self, path, video_id, page_name, current_page, real_page_number=None):
- real_page_number = real_page_number or current_page + 1
- return self._download_webpage(
- 'https://www.mixcloud.com/%s/' % path, video_id,
- note='Download %s (page %d)' % (page_name, current_page + 1),
- errnote='Unable to download %s' % page_name,
- query={'page': real_page_number, 'list': 'main', '_ajax': '1'},
- headers={'X-Requested-With': 'XMLHttpRequest'})
+ is_playlist_type = self._ROOT_TYPE == 'playlist'
+ playlist_type = 'items' if is_playlist_type else slug
+ list_filter = ''
- def _tracks_page_func(self, page, video_id, page_name, current_page):
- resp = self._fetch_tracks_page(page, video_id, page_name, current_page)
+ has_next_page = True
+ entries = []
+ while has_next_page:
+ playlist = self._call_api(
+ self._ROOT_TYPE, '''%s
+ %s
+ %s(first: 100%s) {
+ edges {
+ node {
+ %s
+ }
+ }
+ pageInfo {
+ endCursor
+ hasNextPage
+ }
+ }''' % (self._TITLE_KEY, self._DESCRIPTION_KEY, playlist_type, list_filter, self._NODE_TEMPLATE),
+ playlist_id, username, slug if is_playlist_type else None)
+
+ items = playlist.get(playlist_type) or {}
+ for edge in items.get('edges', []):
+ cloudcast = self._get_cloudcast(edge.get('node') or {})
+ cloudcast_url = cloudcast.get('url')
+ if not cloudcast_url:
+ continue
+ entries.append(self.url_result(
+ cloudcast_url, MixcloudIE.ie_key(), cloudcast.get('slug')))
- for item in self._find_urls_in_page(resp):
- yield item
+            page_info = items.get('pageInfo') or {}  # items may be the {} fallback from above, so avoid a bare KeyError on 'pageInfo'
+            has_next_page = bool(page_info.get('hasNextPage') and page_info.get('endCursor'))  # stop paging when there is no cursor to continue from
+            list_filter = ', after: "%s"' % page_info.get('endCursor')  # only used by the next iteration, i.e. when a cursor exists
- def _get_user_description(self, page_content):
- return self._html_search_regex(
- r'<div[^>]+class="profile-bio"[^>]*>(.+?)</div>',
- page_content, 'user description', fatal=False)
+ return self.playlist_result(
+ entries, playlist_id,
+ self._get_playlist_title(playlist[self._TITLE_KEY], slug),
+ playlist.get(self._DESCRIPTION_KEY))
class MixcloudUserIE(MixcloudPlaylistBaseIE):
- _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
+ _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/(?P<type>uploads|favorites|listens|stream)?/?$'
IE_NAME = 'mixcloud:user'
_TESTS = [{
@@ -243,68 +273,58 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_uploads',
'title': 'Daniel Holbach (uploads)',
- 'description': 'md5:def36060ac8747b3aabca54924897e47',
+ 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
},
- 'playlist_mincount': 11,
+ 'playlist_mincount': 36,
}, {
'url': 'http://www.mixcloud.com/dholbach/uploads/',
'info_dict': {
'id': 'dholbach_uploads',
'title': 'Daniel Holbach (uploads)',
- 'description': 'md5:def36060ac8747b3aabca54924897e47',
+ 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
},
- 'playlist_mincount': 11,
+ 'playlist_mincount': 36,
}, {
'url': 'http://www.mixcloud.com/dholbach/favorites/',
'info_dict': {
'id': 'dholbach_favorites',
'title': 'Daniel Holbach (favorites)',
- 'description': 'md5:def36060ac8747b3aabca54924897e47',
- },
- 'params': {
- 'playlist_items': '1-100',
+ 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
},
- 'playlist_mincount': 100,
+ # 'params': {
+ # 'playlist_items': '1-100',
+ # },
+ 'playlist_mincount': 396,
}, {
'url': 'http://www.mixcloud.com/dholbach/listens/',
'info_dict': {
'id': 'dholbach_listens',
'title': 'Daniel Holbach (listens)',
- 'description': 'md5:def36060ac8747b3aabca54924897e47',
+ 'description': 'md5:b60d776f0bab534c5dabe0a34e47a789',
},
- 'params': {
- 'playlist_items': '1-100',
+ # 'params': {
+ # 'playlist_items': '1-100',
+ # },
+ 'playlist_mincount': 1623,
+ 'skip': 'Large list',
+ }, {
+ 'url': 'https://www.mixcloud.com/FirstEar/stream/',
+ 'info_dict': {
+ 'id': 'FirstEar_stream',
+ 'title': 'First Ear (stream)',
+ 'description': 'Curators of good music\r\n\r\nfirstearmusic.com',
},
- 'playlist_mincount': 100,
+ 'playlist_mincount': 271,
}]
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- user_id = mobj.group('user')
- list_type = mobj.group('type')
-
- # if only a profile URL was supplied, default to download all uploads
- if list_type is None:
- list_type = 'uploads'
-
- video_id = '%s_%s' % (user_id, list_type)
+ _TITLE_KEY = 'displayName'
+ _DESCRIPTION_KEY = 'biog'
+ _ROOT_TYPE = 'user'
+ _NODE_TEMPLATE = '''slug
+ url'''
- profile = self._download_webpage(
- 'https://www.mixcloud.com/%s/' % user_id, video_id,
- note='Downloading user profile',
- errnote='Unable to download user profile')
-
- username = self._og_search_title(profile)
- description = self._get_user_description(profile)
-
- entries = OnDemandPagedList(
- functools.partial(
- self._tracks_page_func,
- '%s/%s' % (user_id, list_type), video_id, 'list of %s' % list_type),
- self._PAGE_SIZE)
-
- return self.playlist_result(
- entries, video_id, '%s (%s)' % (username, list_type), description)
+ def _get_playlist_title(self, title, slug):
+ return '%s (%s)' % (title, slug)
class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
@@ -312,87 +332,20 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
IE_NAME = 'mixcloud:playlist'
_TESTS = [{
- 'url': 'https://www.mixcloud.com/RedBullThre3style/playlists/tokyo-finalists-2015/',
- 'info_dict': {
- 'id': 'RedBullThre3style_tokyo-finalists-2015',
- 'title': 'National Champions 2015',
- 'description': 'md5:6ff5fb01ac76a31abc9b3939c16243a3',
- },
- 'playlist_mincount': 16,
- }, {
'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- user_id = mobj.group('user')
- playlist_id = mobj.group('playlist')
- video_id = '%s_%s' % (user_id, playlist_id)
-
- webpage = self._download_webpage(
- url, user_id,
- note='Downloading playlist page',
- errnote='Unable to download playlist page')
-
- title = self._html_search_regex(
- r'<a[^>]+class="parent active"[^>]*><b>\d+</b><span[^>]*>([^<]+)',
- webpage, 'playlist title',
- default=None) or self._og_search_title(webpage, fatal=False)
- description = self._get_user_description(webpage)
-
- entries = OnDemandPagedList(
- functools.partial(
- self._tracks_page_func,
- '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'),
- self._PAGE_SIZE)
-
- return self.playlist_result(entries, video_id, title, description)
-
-
-class MixcloudStreamIE(MixcloudPlaylistBaseIE):
- _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
- IE_NAME = 'mixcloud:stream'
-
- _TEST = {
- 'url': 'https://www.mixcloud.com/FirstEar/stream/',
'info_dict': {
- 'id': 'FirstEar',
- 'title': 'First Ear',
- 'description': 'Curators of good music\nfirstearmusic.com',
+ 'id': 'maxvibes_jazzcat-on-ness-radio',
+ 'title': 'Ness Radio sessions',
},
- 'playlist_mincount': 192,
- }
-
- def _real_extract(self, url):
- user_id = self._match_id(url)
-
- webpage = self._download_webpage(url, user_id)
-
- entries = []
- prev_page_url = None
-
- def _handle_page(page):
- entries.extend(self._find_urls_in_page(page))
- return self._search_regex(
- r'm-next-page-url="([^"]+)"', page,
- 'next page URL', default=None)
-
- next_page_url = _handle_page(webpage)
-
- for idx in itertools.count(0):
- if not next_page_url or prev_page_url == next_page_url:
- break
-
- prev_page_url = next_page_url
- current_page = int(self._search_regex(
- r'\?page=(\d+)', next_page_url, 'next page number'))
-
- next_page_url = _handle_page(self._fetch_tracks_page(
- '%s/stream' % user_id, user_id, 'stream', idx,
- real_page_number=current_page))
-
- username = self._og_search_title(webpage)
- description = self._get_user_description(webpage)
-
- return self.playlist_result(entries, user_id, username, description)
+ 'playlist_mincount': 59,
+ }]
+ _TITLE_KEY = 'name'
+ _DESCRIPTION_KEY = 'description'
+ _ROOT_TYPE = 'playlist'
+ _NODE_TEMPLATE = '''cloudcast {
+ slug
+ url
+ }'''
+
+ def _get_cloudcast(self, node):
+ return node.get('cloudcast') or {}
diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py
index 0460cf4d5..e59b0b7b0 100644
--- a/youtube_dl/extractor/msn.py
+++ b/youtube_dl/extractor/msn.py
@@ -14,21 +14,28 @@ from ..utils import (
class MSNIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)'
+ _VALID_URL = r'https?://(?:(?:www|preview)\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)'
_TESTS = [{
- 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE',
- 'md5': '8442f66c116cbab1ff7098f986983458',
+ 'url': 'https://www.msn.com/en-in/money/video/7-ways-to-get-rid-of-chest-congestion/vi-BBPxU6d',
+ 'md5': '087548191d273c5c55d05028f8d2cbcd',
'info_dict': {
- 'id': 'BBqQYNE',
- 'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message',
+ 'id': 'BBPxU6d',
+ 'display_id': '7-ways-to-get-rid-of-chest-congestion',
'ext': 'mp4',
- 'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message',
- 'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25',
- 'duration': 104,
- 'uploader': 'CBS Entertainment',
- 'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v',
+ 'title': 'Seven ways to get rid of chest congestion',
+ 'description': '7 Ways to Get Rid of Chest Congestion',
+ 'duration': 88,
+ 'uploader': 'Health',
+ 'uploader_id': 'BBPrMqa',
},
}, {
+ # Article, multiple Dailymotion Embeds
+ 'url': 'https://www.msn.com/en-in/money/sports/hottest-football-wags-greatest-footballers-turned-managers-and-more/ar-BBpc7Nl',
+ 'info_dict': {
+ 'id': 'BBpc7Nl',
+ },
+ 'playlist_mincount': 4,
+ }, {
'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf',
'only_matching': True,
}, {
@@ -41,75 +48,124 @@ class MSNIE(InfoExtractor):
}, {
'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6',
'only_matching': True,
+ }, {
+ # Vidible(AOL) Embed
+ 'url': 'https://www.msn.com/en-us/money/other/jupiter-is-about-to-come-so-close-you-can-see-its-moons-with-binoculars/vi-AACqsHR',
+ 'only_matching': True,
+ }, {
+ # Dailymotion Embed
+ 'url': 'https://www.msn.com/es-ve/entretenimiento/watch/winston-salem-paire-refait-des-siennes-en-perdant-sa-raquette-au-service/vp-AAG704L',
+ 'only_matching': True,
+ }, {
+ # YouTube Embed
+ 'url': 'https://www.msn.com/en-in/money/news/meet-vikram-%E2%80%94-chandrayaan-2s-lander/vi-AAGUr0v',
+ 'only_matching': True,
+ }, {
+ # NBCSports Embed
+ 'url': 'https://www.msn.com/en-us/money/football_nfl/week-13-preview-redskins-vs-panthers/vi-BBXsCDb',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id, display_id = mobj.group('id', 'display_id')
+ display_id, page_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
- video = self._parse_json(
- self._search_regex(
- r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1',
- webpage, 'video data', default='{}', group='data'),
- display_id, transform_source=unescapeHTML)
+ entries = []
+ for _, metadata in re.findall(r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1', webpage):
+ video = self._parse_json(unescapeHTML(metadata), display_id)
+
+ provider_id = video.get('providerId')
+ player_name = video.get('playerName')
+ if player_name and provider_id:
+ entry = None
+ if player_name == 'AOL':
+ if provider_id.startswith('http'):
+ provider_id = self._search_regex(
+ r'https?://delivery\.vidible\.tv/video/redirect/([0-9a-f]{24})',
+ provider_id, 'vidible id')
+ entry = self.url_result(
+ 'aol-video:' + provider_id, 'Aol', provider_id)
+ elif player_name == 'Dailymotion':
+ entry = self.url_result(
+ 'https://www.dailymotion.com/video/' + provider_id,
+ 'Dailymotion', provider_id)
+ elif player_name == 'YouTube':
+ entry = self.url_result(
+ provider_id, 'Youtube', provider_id)
+ elif player_name == 'NBCSports':
+ entry = self.url_result(
+ 'http://vplayer.nbcsports.com/p/BxmELC/nbcsports_embed/select/media/' + provider_id,
+ 'NBCSportsVPlayer', provider_id)
+ if entry:
+ entries.append(entry)
+ continue
+
+ video_id = video['uuid']
+ title = video['title']
+
+ formats = []
+ for file_ in video.get('videoFiles', []):
+ format_url = file_.get('url')
+ if not format_url:
+ continue
+ if 'format=m3u8-aapl' in format_url:
+ # m3u8_native should not be used here until
+ # https://github.com/ytdl-org/youtube-dl/issues/9913 is fixed
+ formats.extend(self._extract_m3u8_formats(
+ format_url, display_id, 'mp4',
+ m3u8_id='hls', fatal=False))
+ elif 'format=mpd-time-csf' in format_url:
+ formats.extend(self._extract_mpd_formats(
+ format_url, display_id, 'dash', fatal=False))
+ elif '.ism' in format_url:
+ if format_url.endswith('.ism'):
+ format_url += '/manifest'
+ formats.extend(self._extract_ism_formats(
+ format_url, display_id, 'mss', fatal=False))
+ else:
+ format_id = file_.get('formatCode')
+ formats.append({
+ 'url': format_url,
+ 'ext': 'mp4',
+ 'format_id': format_id,
+ 'width': int_or_none(file_.get('width')),
+ 'height': int_or_none(file_.get('height')),
+ 'vbr': int_or_none(self._search_regex(r'_(\d+)\.mp4', format_url, 'vbr', default=None)),
+ 'preference': 1 if format_id == '1001' else None,
+ })
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for file_ in video.get('files', []):
+ format_url = file_.get('url')
+ format_code = file_.get('formatCode')
+ if not format_url or not format_code:
+ continue
+ if compat_str(format_code) == '3100':
+ subtitles.setdefault(file_.get('culture', 'en'), []).append({
+ 'ext': determine_ext(format_url, 'ttml'),
+ 'url': format_url,
+ })
- if not video:
+ entries.append({
+ 'id': video_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': video.get('description'),
+ 'thumbnail': video.get('headlineImage', {}).get('url'),
+ 'duration': int_or_none(video.get('durationSecs')),
+ 'uploader': video.get('sourceFriendly'),
+ 'uploader_id': video.get('providerId'),
+ 'creator': video.get('creator'),
+ 'subtitles': subtitles,
+ 'formats': formats,
+ })
+
+ if not entries:
error = unescapeHTML(self._search_regex(
r'data-error=(["\'])(?P<error>.+?)\1',
webpage, 'error', group='error'))
raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
- title = video['title']
-
- formats = []
- for file_ in video.get('videoFiles', []):
- format_url = file_.get('url')
- if not format_url:
- continue
- if 'm3u8' in format_url:
- # m3u8_native should not be used here until
- # https://github.com/ytdl-org/youtube-dl/issues/9913 is fixed
- m3u8_formats = self._extract_m3u8_formats(
- format_url, display_id, 'mp4',
- m3u8_id='hls', fatal=False)
- formats.extend(m3u8_formats)
- elif determine_ext(format_url) == 'ism':
- formats.extend(self._extract_ism_formats(
- format_url + '/Manifest', display_id, 'mss', fatal=False))
- else:
- formats.append({
- 'url': format_url,
- 'ext': 'mp4',
- 'format_id': 'http',
- 'width': int_or_none(file_.get('width')),
- 'height': int_or_none(file_.get('height')),
- })
- self._sort_formats(formats)
-
- subtitles = {}
- for file_ in video.get('files', []):
- format_url = file_.get('url')
- format_code = file_.get('formatCode')
- if not format_url or not format_code:
- continue
- if compat_str(format_code) == '3100':
- subtitles.setdefault(file_.get('culture', 'en'), []).append({
- 'ext': determine_ext(format_url, 'ttml'),
- 'url': format_url,
- })
-
- return {
- 'id': video_id,
- 'display_id': display_id,
- 'title': title,
- 'description': video.get('description'),
- 'thumbnail': video.get('headlineImage', {}).get('url'),
- 'duration': int_or_none(video.get('durationSecs')),
- 'uploader': video.get('sourceFriendly'),
- 'uploader_id': video.get('providerId'),
- 'creator': video.get('creator'),
- 'subtitles': subtitles,
- 'formats': formats,
- }
+ return self.playlist_result(entries, page_id)
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 7a3b57abd..fedd5f46b 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import re
@@ -349,33 +350,29 @@ class MTVIE(MTVServicesInfoExtractor):
}]
-class MTV81IE(InfoExtractor):
- IE_NAME = 'mtv81'
- _VALID_URL = r'https?://(?:www\.)?mtv81\.com/videos/(?P<id>[^/?#.]+)'
+class MTVJapanIE(MTVServicesInfoExtractor):
+ IE_NAME = 'mtvjapan'
+ _VALID_URL = r'https?://(?:www\.)?mtvjapan\.com/videos/(?P<id>[0-9a-z]+)'
_TEST = {
- 'url': 'http://www.mtv81.com/videos/artist-to-watch/the-godfather-of-japanese-hip-hop-segment-1/',
- 'md5': '1edbcdf1e7628e414a8c5dcebca3d32b',
+ 'url': 'http://www.mtvjapan.com/videos/prayht/fresh-info-cadillac-escalade',
'info_dict': {
- 'id': '5e14040d-18a4-47c4-a582-43ff602de88e',
+ 'id': 'bc01da03-6fe5-4284-8880-f291f4e368f5',
'ext': 'mp4',
- 'title': 'Unlocking The Truth|July 18, 2016|1|101|Trailer',
- 'description': '"Unlocking the Truth" premieres August 17th at 11/10c.',
- 'timestamp': 1468846800,
- 'upload_date': '20160718',
+ 'title': '【Fresh Info】Cadillac ESCALADE Sport Edition',
+ },
+ 'params': {
+ 'skip_download': True,
},
}
+ _GEO_COUNTRIES = ['JP']
+ _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
- def _extract_mgid(self, webpage):
- return self._search_regex(
- r'getTheVideo\((["\'])(?P<id>mgid:.+?)\1', webpage,
- 'mgid', group='id')
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- mgid = self._extract_mgid(webpage)
- return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
+ def _get_feed_query(self, uri):
+ return {
+ 'arcEp': 'mtvjapan.com',
+ 'mgid': uri,
+ }
class MTVVideoIE(MTVServicesInfoExtractor):
@@ -425,14 +422,14 @@ class MTVVideoIE(MTVServicesInfoExtractor):
class MTVDEIE(MTVServicesInfoExtractor):
IE_NAME = 'mtv.de'
- _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$'
+ _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:musik/videoclips|folgen|news)/(?P<id>[0-9a-z]+)'
_TESTS = [{
- 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum',
+ 'url': 'http://www.mtv.de/musik/videoclips/2gpnv7/Traum',
'info_dict': {
- 'id': 'music_video-a50bc5f0b3aa4b3190aa',
- 'ext': 'flv',
- 'title': 'MusicVideo_cro-traum',
- 'description': 'Cro - Traum',
+ 'id': 'd5d472bc-f5b7-11e5-bffd-a4badb20dab5',
+ 'ext': 'mp4',
+ 'title': 'Traum',
+ 'description': 'Traum',
},
'params': {
# rtmp download
@@ -441,11 +438,12 @@ class MTVDEIE(MTVServicesInfoExtractor):
'skip': 'Blocked at Travis CI',
}, {
# mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97)
- 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen',
+ 'url': 'http://www.mtv.de/folgen/6b1ylu/teen-mom-2-enthuellungen-S5-F1',
'info_dict': {
- 'id': 'local_playlist-f5ae778b9832cc837189',
- 'ext': 'flv',
- 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1',
+ 'id': '1e5a878b-31c5-11e7-a442-0e40cf2fc285',
+ 'ext': 'mp4',
+ 'title': 'Teen Mom 2',
+ 'description': 'md5:dc65e357ef7e1085ed53e9e9d83146a7',
},
'params': {
# rtmp download
@@ -453,7 +451,7 @@ class MTVDEIE(MTVServicesInfoExtractor):
},
'skip': 'Blocked at Travis CI',
}, {
- 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3',
+ 'url': 'http://www.mtv.de/news/glolix/77491-mtv-movies-spotlight--pixels--teil-3',
'info_dict': {
'id': 'local_playlist-4e760566473c4c8c5344',
'ext': 'mp4',
@@ -466,25 +464,11 @@ class MTVDEIE(MTVServicesInfoExtractor):
},
'skip': 'Das Video kann zur Zeit nicht abgespielt werden.',
}]
+ _GEO_COUNTRIES = ['DE']
+ _FEED_URL = 'http://feeds.mtvnservices.com/od/feed/intl-mrss-player-feed'
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- playlist = self._parse_json(
- self._search_regex(
- r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'),
- video_id)
-
- def _mrss_url(item):
- return item['mrss'] + item.get('mrssvars', '')
-
- # news pages contain single video in playlist with different id
- if len(playlist) == 1:
- return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id)
-
- for item in playlist:
- item_id = item.get('id')
- if item_id and compat_str(item_id) == video_id:
- return self._get_videos_info_from_url(_mrss_url(item), video_id)
+ def _get_feed_query(self, uri):
+ return {
+ 'arcEp': 'mtv.de',
+ 'mgid': uri,
+ }
diff --git a/youtube_dl/extractor/musicplayon.py b/youtube_dl/extractor/musicplayon.py
deleted file mode 100644
index 1854d59a5..000000000
--- a/youtube_dl/extractor/musicplayon.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..compat import compat_urlparse
-from ..utils import (
- int_or_none,
- js_to_json,
- mimetype2ext,
-)
-
-
-class MusicPlayOnIE(InfoExtractor):
- _VALID_URL = r'https?://(?:.+?\.)?musicplayon\.com/play(?:-touch)?\?(?:v|pl=\d+&play)=(?P<id>\d+)'
-
- _TESTS = [{
- 'url': 'http://en.musicplayon.com/play?v=433377',
- 'md5': '00cdcdea1726abdf500d1e7fd6dd59bb',
- 'info_dict': {
- 'id': '433377',
- 'ext': 'mp4',
- 'title': 'Rick Ross - Interview On Chelsea Lately (2014)',
- 'description': 'Rick Ross Interview On Chelsea Lately',
- 'duration': 342,
- 'uploader': 'ultrafish',
- },
- }, {
- 'url': 'http://en.musicplayon.com/play?pl=102&play=442629',
- 'only_matching': True,
- }]
-
- _URL_TEMPLATE = 'http://en.musicplayon.com/play?v=%s'
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- url = self._URL_TEMPLATE % video_id
-
- page = self._download_webpage(url, video_id)
-
- title = self._og_search_title(page)
- description = self._og_search_description(page)
- thumbnail = self._og_search_thumbnail(page)
- duration = self._html_search_meta('video:duration', page, 'duration', fatal=False)
- view_count = self._og_search_property('count', page, fatal=False)
- uploader = self._html_search_regex(
- r'<div>by&nbsp;<a href="[^"]+" class="purple">([^<]+)</a></div>', page, 'uploader', fatal=False)
-
- sources = self._parse_json(
- self._search_regex(r'setup\[\'_sources\'\]\s*=\s*([^;]+);', page, 'video sources'),
- video_id, transform_source=js_to_json)
- formats = [{
- 'url': compat_urlparse.urljoin(url, source['src']),
- 'ext': mimetype2ext(source.get('type')),
- 'format_note': source.get('data-res'),
- } for source in sources]
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'uploader': uploader,
- 'duration': int_or_none(duration),
- 'view_count': int_or_none(view_count),
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/myspass.py b/youtube_dl/extractor/myspass.py
index 2afe535b5..db7ebc94c 100644
--- a/youtube_dl/extractor/myspass.py
+++ b/youtube_dl/extractor/myspass.py
@@ -1,73 +1,56 @@
+# coding: utf-8
from __future__ import unicode_literals
-import os.path
+
+import re
from .common import InfoExtractor
-from ..compat import (
- compat_urllib_parse_urlparse,
-)
+from ..compat import compat_str
from ..utils import (
- ExtractorError,
+ int_or_none,
+ parse_duration,
+ xpath_text,
)
class MySpassIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?myspass\.de/.*'
+ _VALID_URL = r'https?://(?:www\.)?myspass\.de/([^/]+/)*(?P<id>\d+)'
_TEST = {
'url': 'http://www.myspass.de/myspass/shows/tvshows/absolute-mehrheit/Absolute-Mehrheit-vom-17022013-Die-Highlights-Teil-2--/11741/',
'md5': '0b49f4844a068f8b33f4b7c88405862b',
'info_dict': {
'id': '11741',
'ext': 'mp4',
- 'description': 'Wer kann in die Fu\u00dfstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?',
- 'title': 'Absolute Mehrheit vom 17.02.2013 - Die Highlights, Teil 2',
+ 'description': 'Wer kann in die Fußstapfen von Wolfgang Kubicki treten und die Mehrheit der Zuschauer hinter sich versammeln? Wird vielleicht sogar die Absolute Mehrheit geknackt und der Jackpot von 200.000 Euro mit nach Hause genommen?',
+ 'title': '17.02.2013 - Die Highlights, Teil 2',
},
}
def _real_extract(self, url):
- META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
+ video_id = self._match_id(url)
- # video id is the last path element of the URL
- # usually there is a trailing slash, so also try the second but last
- url_path = compat_urllib_parse_urlparse(url).path
- url_parent_path, video_id = os.path.split(url_path)
- if not video_id:
- _, video_id = os.path.split(url_parent_path)
-
- # get metadata
- metadata_url = META_DATA_URL_TEMPLATE % video_id
metadata = self._download_xml(
- metadata_url, video_id, transform_source=lambda s: s.strip())
-
- # extract values from metadata
- url_flv_el = metadata.find('url_flv')
- if url_flv_el is None:
- raise ExtractorError('Unable to extract download url')
- video_url = url_flv_el.text
- title_el = metadata.find('title')
- if title_el is None:
- raise ExtractorError('Unable to extract title')
- title = title_el.text
- format_id_el = metadata.find('format_id')
- if format_id_el is None:
- format = 'mp4'
- else:
- format = format_id_el.text
- description_el = metadata.find('description')
- if description_el is not None:
- description = description_el.text
- else:
- description = None
- imagePreview_el = metadata.find('imagePreview')
- if imagePreview_el is not None:
- thumbnail = imagePreview_el.text
- else:
- thumbnail = None
+ 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=' + video_id,
+ video_id)
+
+ title = xpath_text(metadata, 'title', fatal=True)
+ video_url = xpath_text(metadata, 'url_flv', 'download url', True)
+ video_id_int = int(video_id)
+ for group in re.search(r'/myspass2009/\d+/(\d+)/(\d+)/(\d+)/', video_url).groups():
+ group_int = int(group)
+ if group_int > video_id_int:
+ video_url = video_url.replace(
+ group, compat_str(group_int // video_id_int))
return {
'id': video_id,
'url': video_url,
'title': title,
- 'format': format,
- 'thumbnail': thumbnail,
- 'description': description,
+ 'thumbnail': xpath_text(metadata, 'imagePreview'),
+ 'description': xpath_text(metadata, 'description'),
+ 'duration': parse_duration(xpath_text(metadata, 'duration')),
+ 'series': xpath_text(metadata, 'format'),
+ 'season_number': int_or_none(xpath_text(metadata, 'season')),
+ 'season_id': xpath_text(metadata, 'season_id'),
+ 'episode': title,
+ 'episode_number': int_or_none(xpath_text(metadata, 'episode')),
}
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index 10680b202..5bc39d002 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -9,9 +9,13 @@ from .theplatform import ThePlatformIE
from .adobepass import AdobePassIE
from ..compat import compat_urllib_parse_unquote
from ..utils import (
+ int_or_none,
+ js_to_json,
+ parse_duration,
smuggle_url,
+ try_get,
+ unified_timestamp,
update_url_query,
- int_or_none,
)
@@ -285,13 +289,12 @@ class NBCNewsIE(ThePlatformIE):
_TESTS = [
{
'url': 'http://www.nbcnews.com/watch/nbcnews-com/how-twitter-reacted-to-the-snowden-interview-269389891880',
- 'md5': 'af1adfa51312291a017720403826bb64',
+ 'md5': 'cf4bc9e6ce0130f00f545d80ecedd4bf',
'info_dict': {
'id': '269389891880',
'ext': 'mp4',
'title': 'How Twitter Reacted To The Snowden Interview',
'description': 'md5:65a0bd5d76fe114f3c2727aa3a81fe64',
- 'uploader': 'NBCU-NEWS',
'timestamp': 1401363060,
'upload_date': '20140529',
},
@@ -309,28 +312,26 @@ class NBCNewsIE(ThePlatformIE):
},
{
'url': 'http://www.nbcnews.com/nightly-news/video/nightly-news-with-brian-williams-full-broadcast-february-4-394064451844',
- 'md5': '73135a2e0ef819107bbb55a5a9b2a802',
+ 'md5': '8eb831eca25bfa7d25ddd83e85946548',
'info_dict': {
'id': '394064451844',
'ext': 'mp4',
'title': 'Nightly News with Brian Williams Full Broadcast (February 4)',
'description': 'md5:1c10c1eccbe84a26e5debb4381e2d3c5',
'timestamp': 1423104900,
- 'uploader': 'NBCU-NEWS',
'upload_date': '20150205',
},
},
{
'url': 'http://www.nbcnews.com/business/autos/volkswagen-11-million-vehicles-could-have-suspect-software-emissions-scandal-n431456',
- 'md5': 'a49e173825e5fcd15c13fc297fced39d',
+ 'md5': '4a8c4cec9e1ded51060bdda36ff0a5c0',
'info_dict': {
- 'id': '529953347624',
+ 'id': 'n431456',
'ext': 'mp4',
- 'title': 'Volkswagen U.S. Chief:\xa0 We Have Totally Screwed Up',
- 'description': 'md5:c8be487b2d80ff0594c005add88d8351',
+ 'title': "Volkswagen U.S. Chief: We 'Totally Screwed Up'",
+ 'description': 'md5:d22d1281a24f22ea0880741bb4dd6301',
'upload_date': '20150922',
'timestamp': 1442917800,
- 'uploader': 'NBCU-NEWS',
},
},
{
@@ -343,7 +344,6 @@ class NBCNewsIE(ThePlatformIE):
'description': 'md5:74752b7358afb99939c5f8bb2d1d04b1',
'upload_date': '20160420',
'timestamp': 1461152093,
- 'uploader': 'NBCU-NEWS',
},
},
{
@@ -357,7 +357,6 @@ class NBCNewsIE(ThePlatformIE):
'thumbnail': r're:^https?://.*\.jpg$',
'timestamp': 1406937606,
'upload_date': '20140802',
- 'uploader': 'NBCU-NEWS',
},
},
{
@@ -373,20 +372,61 @@ class NBCNewsIE(ThePlatformIE):
def _real_extract(self, url):
video_id = self._match_id(url)
- if not video_id.isdigit():
- webpage = self._download_webpage(url, video_id)
+ webpage = self._download_webpage(url, video_id)
- data = self._parse_json(self._search_regex(
- r'window\.__data\s*=\s*({.+});', webpage,
- 'bootstrap json'), video_id)
- video_id = data['article']['content'][0]['primaryMedia']['video']['mpxMetadata']['id']
+ data = self._parse_json(self._search_regex(
+ r'window\.__data\s*=\s*({.+});', webpage,
+ 'bootstrap json'), video_id, js_to_json)
+ video_data = try_get(data, lambda x: x['video']['current'], dict)
+ if not video_data:
+ video_data = data['article']['content'][0]['primaryMedia']['video']
+ title = video_data['headline']['primary']
+
+ formats = []
+ for va in video_data.get('videoAssets', []):
+ public_url = va.get('publicUrl')
+ if not public_url:
+ continue
+ if '://link.theplatform.com/' in public_url:
+ public_url = update_url_query(public_url, {'format': 'redirect'})
+ format_id = va.get('format')
+ if format_id == 'M3U':
+ formats.extend(self._extract_m3u8_formats(
+ public_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id=format_id, fatal=False))
+ continue
+ tbr = int_or_none(va.get('bitrate'), 1000)
+ if tbr:
+ format_id += '-%d' % tbr
+ formats.append({
+ 'format_id': format_id,
+ 'url': public_url,
+ 'width': int_or_none(va.get('width')),
+ 'height': int_or_none(va.get('height')),
+ 'tbr': tbr,
+ 'ext': 'mp4',
+ })
+ self._sort_formats(formats)
+
+ subtitles = {}
+ closed_captioning = video_data.get('closedCaptioning')
+ if closed_captioning:
+ for cc_url in closed_captioning.values():
+ if not cc_url:
+ continue
+ subtitles.setdefault('en', []).append({
+ 'url': cc_url,
+ })
return {
- '_type': 'url_transparent',
'id': video_id,
- # http://feed.theplatform.com/f/2E2eJC/nbcnews also works
- 'url': update_url_query('http://feed.theplatform.com/f/2E2eJC/nnd_NBCNews', {'byId': video_id}),
- 'ie_key': 'ThePlatformFeed',
+ 'title': title,
+ 'description': try_get(video_data, lambda x: x['description']['primary']),
+ 'thumbnail': try_get(video_data, lambda x: x['primaryImage']['url']['primary']),
+ 'duration': parse_duration(video_data.get('duration')),
+ 'timestamp': unified_timestamp(video_data.get('datePublished')),
+ 'formats': formats,
+ 'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py
index f9aad83c4..586c1b7eb 100644
--- a/youtube_dl/extractor/nexx.py
+++ b/youtube_dl/extractor/nexx.py
@@ -108,7 +108,7 @@ class NexxIE(InfoExtractor):
@staticmethod
def _extract_domain_id(webpage):
mobj = re.search(
- r'<script\b[^>]+\bsrc=["\'](?:https?:)?//require\.nexx(?:\.cloud|cdn\.com)/(?P<id>\d+)',
+ r'<script\b[^>]+\bsrc=["\'](?:https?:)?//(?:require|arc)\.nexx(?:\.cloud|cdn\.com)/(?:sdk/)?(?P<id>\d+)',
webpage)
return mobj.group('id') if mobj else None
@@ -123,7 +123,7 @@ class NexxIE(InfoExtractor):
domain_id = NexxIE._extract_domain_id(webpage)
if domain_id:
for video_id in re.findall(
- r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)',
+ r'(?is)onPLAYReady.+?_play\.(?:init|(?:control\.)?addPlayer)\s*\(.+?\s*,\s*["\']?(\d+)',
webpage):
entries.append(
'https://api.nexx.cloud/v3/%s/videos/byid/%s'
@@ -410,8 +410,8 @@ class NexxIE(InfoExtractor):
class NexxEmbedIE(InfoExtractor):
- _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P<id>[^/?#&]+)'
- _TEST = {
+ _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:video/)?(?P<id>[^/?#&]+)'
+ _TESTS = [{
'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
'md5': '16746bfc28c42049492385c989b26c4a',
'info_dict': {
@@ -420,7 +420,6 @@ class NexxEmbedIE(InfoExtractor):
'title': 'Nervenkitzel Achterbahn',
'alt_title': 'Karussellbauer in Deutschland',
'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
- 'release_year': 2005,
'creator': 'SPIEGEL TV',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 2761,
@@ -431,7 +430,10 @@ class NexxEmbedIE(InfoExtractor):
'format': 'bestvideo',
'skip_download': True,
},
- }
+ }, {
+ 'url': 'https://embed.nexx.cloud/11888/video/DSRTO7UVOX06S7',
+ 'only_matching': True,
+ }]
@staticmethod
def _extract_urls(webpage):
diff --git a/youtube_dl/extractor/nintendo.py b/youtube_dl/extractor/nintendo.py
index 4b4e66b05..ff8f70ba6 100644
--- a/youtube_dl/extractor/nintendo.py
+++ b/youtube_dl/extractor/nintendo.py
@@ -5,13 +5,12 @@ import re
from .common import InfoExtractor
from .ooyala import OoyalaIE
-from ..utils import unescapeHTML
class NintendoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?nintendo\.com/games/detail/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?nintendo\.com/(?:games/detail|nintendo-direct)/(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'http://www.nintendo.com/games/detail/yEiAzhU2eQI1KZ7wOHhngFoAHc1FpHwj',
+ 'url': 'https://www.nintendo.com/games/detail/duck-hunt-wii-u/',
'info_dict': {
'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW',
'ext': 'flv',
@@ -28,7 +27,19 @@ class NintendoIE(InfoExtractor):
'id': 'tokyo-mirage-sessions-fe-wii-u',
'title': 'Tokyo Mirage Sessions ♯FE',
},
- 'playlist_count': 3,
+ 'playlist_count': 4,
+ }, {
+ 'url': 'https://www.nintendo.com/nintendo-direct/09-04-2019/',
+ 'info_dict': {
+ 'id': 'J2bXdmaTE6fe3dWJTPcc7m23FNbc_A1V',
+ 'ext': 'mp4',
+ 'title': 'Switch_ROS_ND0904-H264.mov',
+ 'duration': 2324.758,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['Ooyala'],
}]
def _real_extract(self, url):
@@ -39,8 +50,11 @@ class NintendoIE(InfoExtractor):
entries = [
OoyalaIE._build_url_result(m.group('code'))
for m in re.finditer(
- r'class=(["\'])embed-video\1[^>]+data-video-code=(["\'])(?P<code>(?:(?!\2).)+)\2',
- webpage)]
+ r'data-(?:video-id|directVideoId)=(["\'])(?P<code>(?:(?!\1).)+)\1', webpage)]
+
+ title = self._html_search_regex(
+ r'(?s)<(?:span|div)[^>]+class="(?:title|wrapper)"[^>]*>.*?<h1>(.+?)</h1>',
+ webpage, 'title', fatal=False)
return self.playlist_result(
- entries, page_id, unescapeHTML(self._og_search_title(webpage, fatal=False)))
+ entries, page_id, title)
diff --git a/youtube_dl/extractor/nrl.py b/youtube_dl/extractor/nrl.py
index 798b91e04..22a2df8d3 100644
--- a/youtube_dl/extractor/nrl.py
+++ b/youtube_dl/extractor/nrl.py
@@ -23,8 +23,8 @@ class NRLTVIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- q_data = self._parse_json(self._search_regex(
- r"(?s)q-data='({.+?})'", webpage, 'player data'), display_id)
+ q_data = self._parse_json(self._html_search_regex(
+ r'(?s)q-data="({.+?})"', webpage, 'player data'), display_id)
ooyala_id = q_data['videoId']
return self.url_result(
'ooyala:' + ooyala_id, 'Ooyala', ooyala_id, q_data.get('title'))
diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py
index 4f9cedb84..c47d1dfa4 100644
--- a/youtube_dl/extractor/ntvru.py
+++ b/youtube_dl/extractor/ntvru.py
@@ -3,9 +3,10 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- clean_html,
- xpath_text,
int_or_none,
+ strip_or_none,
+ unescapeHTML,
+ xpath_text,
)
@@ -47,10 +48,10 @@ class NTVRuIE(InfoExtractor):
'duration': 1496,
},
}, {
- 'url': 'http://www.ntv.ru/kino/Koma_film',
- 'md5': 'f825770930937aa7e5aca0dc0d29319a',
+ 'url': 'https://www.ntv.ru/kino/Koma_film/m70281/o336036/video/',
+ 'md5': 'e9c7cde24d9d3eaed545911a04e6d4f4',
'info_dict': {
- 'id': '1007609',
+ 'id': '1126480',
'ext': 'mp4',
'title': 'Остросюжетный фильм «Кома»',
'description': 'Остросюжетный фильм «Кома»',
@@ -68,6 +69,10 @@ class NTVRuIE(InfoExtractor):
'thumbnail': r're:^http://.*\.jpg',
'duration': 2590,
},
+ }, {
+ # Schemeless file URL
+ 'url': 'https://www.ntv.ru/video/1797442',
+ 'only_matching': True,
}]
_VIDEO_ID_REGEXES = [
@@ -96,37 +101,31 @@ class NTVRuIE(InfoExtractor):
'http://www.ntv.ru/vi%s/' % video_id,
video_id, 'Downloading video XML')
- title = clean_html(xpath_text(player, './data/title', 'title', fatal=True))
- description = clean_html(xpath_text(player, './data/description', 'description'))
+ title = strip_or_none(unescapeHTML(xpath_text(player, './data/title', 'title', fatal=True)))
video = player.find('./data/video')
- video_id = xpath_text(video, './id', 'video id')
- thumbnail = xpath_text(video, './splash', 'thumbnail')
- duration = int_or_none(xpath_text(video, './totaltime', 'duration'))
- view_count = int_or_none(xpath_text(video, './views', 'view count'))
-
- token = self._download_webpage(
- 'http://stat.ntv.ru/services/access/token',
- video_id, 'Downloading access token')
formats = []
for format_id in ['', 'hi', 'webm']:
- file_ = video.find('./%sfile' % format_id)
- if file_ is None:
+ file_ = xpath_text(video, './%sfile' % format_id)
+ if not file_:
continue
- size = video.find('./%ssize' % format_id)
+ if file_.startswith('//'):
+ file_ = self._proto_relative_url(file_)
+ elif not file_.startswith('http'):
+ file_ = 'http://media.ntv.ru/vod/' + file_
formats.append({
- 'url': 'http://media2.ntv.ru/vod/%s&tok=%s' % (file_.text, token),
- 'filesize': int_or_none(size.text if size is not None else None),
+ 'url': file_,
+ 'filesize': int_or_none(xpath_text(video, './%ssize' % format_id)),
})
self._sort_formats(formats)
return {
- 'id': video_id,
+ 'id': xpath_text(video, './id'),
'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'view_count': view_count,
+ 'description': strip_or_none(unescapeHTML(xpath_text(player, './data/description'))),
+ 'thumbnail': xpath_text(video, './splash'),
+ 'duration': int_or_none(xpath_text(video, './totaltime')),
+ 'view_count': int_or_none(xpath_text(video, './views')),
'formats': formats,
}
diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py
index 114b93c07..7ed9fac55 100644
--- a/youtube_dl/extractor/odnoklassniki.py
+++ b/youtube_dl/extractor/odnoklassniki.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import (
compat_etree_fromstring,
@@ -121,6 +123,13 @@ class OdnoklassnikiIE(InfoExtractor):
'only_matching': True,
}]
+    @staticmethod
+    def _extract_url(webpage):
+        """Return the URL of the first odnoklassniki.ru / ok.ru ``videoembed``
+        iframe found in *webpage*, or None when no such iframe is present."""
+        embed_match = re.search(
+            r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:odnoklassniki|ok)\.ru/videoembed/.+?)\1',
+            webpage)
+        return embed_match.group('url') if embed_match else None
+
def _real_extract(self, url):
start_time = int_or_none(compat_parse_qs(
compat_urllib_parse_urlparse(url).query).get('fromTime', [None])[0])
diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py
index 58da1bc27..e55b2ac89 100644
--- a/youtube_dl/extractor/onet.py
+++ b/youtube_dl/extractor/onet.py
@@ -20,6 +20,8 @@ from ..utils import (
class OnetBaseIE(InfoExtractor):
+ _URL_BASE_RE = r'https?://(?:(?:www\.)?onet\.tv|onet100\.vod\.pl)/[a-z]/'
+
def _search_mvp_id(self, webpage):
return self._search_regex(
r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id')
@@ -45,7 +47,7 @@ class OnetBaseIE(InfoExtractor):
video = response['result'].get('0')
formats = []
- for _, formats_dict in video['formats'].items():
+ for format_type, formats_dict in video['formats'].items():
if not isinstance(formats_dict, dict):
continue
for format_id, format_list in formats_dict.items():
@@ -56,21 +58,31 @@ class OnetBaseIE(InfoExtractor):
if not video_url:
continue
ext = determine_ext(video_url)
- if format_id == 'ism':
+ if format_id.startswith('ism'):
formats.extend(self._extract_ism_formats(
video_url, video_id, 'mss', fatal=False))
elif ext == 'mpd':
formats.extend(self._extract_mpd_formats(
video_url, video_id, mpd_id='dash', fatal=False))
+ elif format_id.startswith('hls'):
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
else:
- formats.append({
+ http_f = {
'url': video_url,
'format_id': format_id,
- 'height': int_or_none(f.get('vertical_resolution')),
- 'width': int_or_none(f.get('horizontal_resolution')),
'abr': float_or_none(f.get('audio_bitrate')),
- 'vbr': float_or_none(f.get('video_bitrate')),
- })
+ }
+ if format_type == 'audio':
+ http_f['vcodec'] = 'none'
+ else:
+ http_f.update({
+ 'height': int_or_none(f.get('vertical_resolution')),
+ 'width': int_or_none(f.get('horizontal_resolution')),
+ 'vbr': float_or_none(f.get('video_bitrate')),
+ })
+ formats.append(http_f)
self._sort_formats(formats)
meta = video.get('meta', {})
@@ -105,12 +117,12 @@ class OnetMVPIE(OnetBaseIE):
class OnetIE(OnetBaseIE):
- _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)'
+ _VALID_URL = OnetBaseIE._URL_BASE_RE + r'[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)'
IE_NAME = 'onet.tv'
- _TEST = {
+ _TESTS = [{
'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
- 'md5': 'e3ffbf47590032ac3f27249204173d50',
+ 'md5': '436102770fb095c75b8bb0392d3da9ff',
'info_dict': {
'id': 'qbpyqc',
'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd',
@@ -120,7 +132,10 @@ class OnetIE(OnetBaseIE):
'upload_date': '20160705',
'timestamp': 1467721580,
},
- }
+ }, {
+ 'url': 'https://onet100.vod.pl/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -140,18 +155,21 @@ class OnetIE(OnetBaseIE):
class OnetChannelIE(OnetBaseIE):
- _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]|$)'
+ _VALID_URL = OnetBaseIE._URL_BASE_RE + r'(?P<id>[a-z]+)(?:[?#]|$)'
IE_NAME = 'onet.tv:channel'
- _TEST = {
+ _TESTS = [{
'url': 'http://onet.tv/k/openerfestival',
'info_dict': {
'id': 'openerfestival',
- 'title': 'Open\'er Festival Live',
- 'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.',
+ 'title': "Open'er Festival",
+ 'description': "Tak było na Open'er Festival 2016! Oglądaj nasze reportaże i wywiady z artystami.",
},
- 'playlist_mincount': 46,
- }
+ 'playlist_mincount': 35,
+ }, {
+ 'url': 'https://onet100.vod.pl/k/openerfestival',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
channel_id = self._match_id(url)
@@ -173,7 +191,7 @@ class OnetChannelIE(OnetBaseIE):
'Downloading channel %s - add --no-playlist to just download video %s' % (
channel_id, video_name))
matches = re.findall(
- r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)',
+ r'<a[^>]+href=[\'"](%s[a-z]+/[0-9a-z-]+/[0-9a-z]+)' % self._URL_BASE_RE,
webpage)
entries = [
self.url_result(video_link, OnetIE.ie_key())
diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py
index c6e3d5640..cf5c39e66 100644
--- a/youtube_dl/extractor/onionstudios.py
+++ b/youtube_dl/extractor/onionstudios.py
@@ -4,12 +4,8 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import (
- determine_ext,
- int_or_none,
- float_or_none,
- mimetype2ext,
-)
+from ..compat import compat_str
+from ..utils import js_to_json
class OnionStudiosIE(InfoExtractor):
@@ -17,14 +13,16 @@ class OnionStudiosIE(InfoExtractor):
_TESTS = [{
'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937',
- 'md5': '719d1f8c32094b8c33902c17bcae5e34',
+ 'md5': '5a118d466d62b5cd03647cf2c593977f',
'info_dict': {
- 'id': '2937',
+ 'id': '3459881',
'ext': 'mp4',
'title': 'Hannibal charges forward, stops for a cocktail',
+ 'description': 'md5:545299bda6abf87e5ec666548c6a9448',
'thumbnail': r're:^https?://.*\.jpg$',
- 'uploader': 'The A.V. Club',
- 'uploader_id': 'the-av-club',
+ 'uploader': 'a.v. club',
+ 'upload_date': '20150619',
+ 'timestamp': 1434728546,
},
}, {
'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true',
@@ -44,38 +42,12 @@ class OnionStudiosIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- video_data = self._download_json(
- 'http://www.onionstudios.com/video/%s.json' % video_id, video_id)
-
- title = video_data['title']
-
- formats = []
- for source in video_data.get('sources', []):
- source_url = source.get('url')
- if not source_url:
- continue
- ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url)
- if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
- else:
- tbr = int_or_none(source.get('bitrate'))
- formats.append({
- 'format_id': ext + ('-%d' % tbr if tbr else ''),
- 'url': source_url,
- 'width': int_or_none(source.get('width')),
- 'tbr': tbr,
- 'ext': ext,
- })
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': video_data.get('poster_url'),
- 'uploader': video_data.get('channel_name'),
- 'uploader_id': video_data.get('channel_slug'),
- 'duration': float_or_none(video_data.get('duration', 1000)),
- 'tags': video_data.get('tags'),
- 'formats': formats,
- }
+ webpage = self._download_webpage(
+ 'http://onionstudios.com/embed/dc94dc2899fe644c0e7241fa04c1b732.js',
+ video_id)
+ mcp_id = compat_str(self._parse_json(self._search_regex(
+ r'window\.mcpMapping\s*=\s*({.+?});', webpage,
+ 'MCP Mapping'), video_id, js_to_json)[video_id]['mcp_id'])
+ return self.url_result(
+ 'http://kinja.com/ajax/inset/iframe?id=mcp-' + mcp_id,
+ 'KinjaEmbed', mcp_id)
diff --git a/youtube_dl/extractor/ooyala.py b/youtube_dl/extractor/ooyala.py
index 995b24d1b..eb957b8fe 100644
--- a/youtube_dl/extractor/ooyala.py
+++ b/youtube_dl/extractor/ooyala.py
@@ -1,12 +1,12 @@
from __future__ import unicode_literals
+import base64
import re
from .common import InfoExtractor
from ..compat import (
compat_b64decode,
compat_str,
- compat_urllib_parse_urlencode,
)
from ..utils import (
determine_ext,
@@ -21,9 +21,9 @@ from ..utils import (
class OoyalaBaseIE(InfoExtractor):
_PLAYER_BASE = 'http://player.ooyala.com/'
_CONTENT_TREE_BASE = _PLAYER_BASE + 'player_api/v1/content_tree/'
- _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s?'
+ _AUTHORIZATION_URL_TEMPLATE = _PLAYER_BASE + 'sas/player_api/v2/authorization/embed_code/%s/%s'
- def _extract(self, content_tree_url, video_id, domain='example.org', supportedformats=None, embed_token=None):
+ def _extract(self, content_tree_url, video_id, domain=None, supportedformats=None, embed_token=None):
content_tree = self._download_json(content_tree_url, video_id)['content_tree']
metadata = content_tree[list(content_tree)[0]]
embed_code = metadata['embed_code']
@@ -31,59 +31,62 @@ class OoyalaBaseIE(InfoExtractor):
title = metadata['title']
auth_data = self._download_json(
- self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code)
- + compat_urllib_parse_urlencode({
- 'domain': domain,
+ self._AUTHORIZATION_URL_TEMPLATE % (pcode, embed_code),
+ video_id, headers=self.geo_verification_headers(), query={
+ 'domain': domain or 'player.ooyala.com',
'supportedFormats': supportedformats or 'mp4,rtmp,m3u8,hds,dash,smooth',
'embedToken': embed_token,
- }), video_id, headers=self.geo_verification_headers())
-
- cur_auth_data = auth_data['authorization_data'][embed_code]
+ })['authorization_data'][embed_code]
urls = []
formats = []
- if cur_auth_data['authorized']:
- for stream in cur_auth_data['streams']:
- url_data = try_get(stream, lambda x: x['url']['data'], compat_str)
- if not url_data:
- continue
- s_url = compat_b64decode(url_data).decode('utf-8')
- if not s_url or s_url in urls:
- continue
- urls.append(s_url)
- ext = determine_ext(s_url, None)
- delivery_type = stream.get('delivery_type')
- if delivery_type == 'hls' or ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native',
- m3u8_id='hls', fatal=False))
- elif delivery_type == 'hds' or ext == 'f4m':
- formats.extend(self._extract_f4m_formats(
- s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
- elif delivery_type == 'dash' or ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- s_url, embed_code, mpd_id='dash', fatal=False))
- elif delivery_type == 'smooth':
- self._extract_ism_formats(
- s_url, embed_code, ism_id='mss', fatal=False)
- elif ext == 'smil':
- formats.extend(self._extract_smil_formats(
- s_url, embed_code, fatal=False))
- else:
- formats.append({
- 'url': s_url,
- 'ext': ext or delivery_type,
- 'vcodec': stream.get('video_codec'),
- 'format_id': delivery_type,
- 'width': int_or_none(stream.get('width')),
- 'height': int_or_none(stream.get('height')),
- 'abr': int_or_none(stream.get('audio_bitrate')),
- 'vbr': int_or_none(stream.get('video_bitrate')),
- 'fps': float_or_none(stream.get('framerate')),
- })
- else:
+        # When the authorization response lists no streams, try the generic
+        # HLS manifest URL for this embed code (base64-encoded like real
+        # stream entries so it goes through the same decoding path below).
+        streams = auth_data.get('streams') or [{
+            'delivery_type': 'hls',
+            'url': {
+                'data': base64.b64encode(('http://player.ooyala.com/hls/player/all/%s.m3u8' % embed_code).encode()).decode(),
+            }
+        }]
+        for stream in streams:
+            url_data = try_get(stream, lambda x: x['url']['data'], compat_str)
+            if not url_data:
+                continue
+            s_url = compat_b64decode(url_data).decode('utf-8')
+            # skip empty and already processed stream URLs
+            if not s_url or s_url in urls:
+                continue
+            urls.append(s_url)
+            ext = determine_ext(s_url, None)
+            delivery_type = stream.get('delivery_type')
+            if delivery_type == 'hls' or ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    re.sub(r'/ip(?:ad|hone)/', '/all/', s_url), embed_code, 'mp4', 'm3u8_native',
+                    m3u8_id='hls', fatal=False))
+            elif delivery_type == 'hds' or ext == 'f4m':
+                formats.extend(self._extract_f4m_formats(
+                    s_url + '?hdcore=3.7.0', embed_code, f4m_id='hds', fatal=False))
+            elif delivery_type == 'dash' or ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    s_url, embed_code, mpd_id='dash', fatal=False))
+            elif delivery_type == 'smooth':
+                # The extracted formats have to be added to the list like in
+                # every other branch; the result used to be thrown away.
+                formats.extend(self._extract_ism_formats(
+                    s_url, embed_code, ism_id='mss', fatal=False))
+            elif ext == 'smil':
+                formats.extend(self._extract_smil_formats(
+                    s_url, embed_code, fatal=False))
+            else:
+                formats.append({
+                    'url': s_url,
+                    'ext': ext or delivery_type,
+                    'vcodec': stream.get('video_codec'),
+                    'format_id': delivery_type,
+                    'width': int_or_none(stream.get('width')),
+                    'height': int_or_none(stream.get('height')),
+                    'abr': int_or_none(stream.get('audio_bitrate')),
+                    'vbr': int_or_none(stream.get('video_bitrate')),
+                    'fps': float_or_none(stream.get('framerate')),
+                })
+        # Only report the server's refusal when nothing could be extracted.
+        if not formats and not auth_data.get('authorized'):
+            raise ExtractorError('%s said: %s' % (
+                self.IE_NAME, auth_data['message']), expected=True)
self._sort_formats(formats)
subtitles = {}
diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py
index 66e38cdb4..0c20d0177 100644
--- a/youtube_dl/extractor/openload.py
+++ b/youtube_dl/extractor/openload.py
@@ -3,21 +3,17 @@ from __future__ import unicode_literals
import json
import os
-import re
import subprocess
import tempfile
-from .common import InfoExtractor
from ..compat import (
compat_urlparse,
compat_kwargs,
)
from ..utils import (
check_executable,
- determine_ext,
encodeArgument,
ExtractorError,
- get_element_by_id,
get_exe_version,
is_outdated_version,
std_headers,
@@ -240,262 +236,3 @@ class PhantomJSwrapper(object):
self._load_cookies()
return (html, encodeArgument(out))
-
-
-class OpenloadIE(InfoExtractor):
- _DOMAINS = r'''
- (?:
- openload\.(?:co|io|link|pw)|
- oload\.(?:tv|best|biz|stream|site|xyz|win|download|cloud|cc|icu|fun|club|info|online|monster|press|pw|life|live|space|services|website|vip)|
- oladblock\.(?:services|xyz|me)|openloed\.co
- )
- '''
- _VALID_URL = r'''(?x)
- https?://
- (?P<host>
- (?:www\.)?
- %s
- )/
- (?:f|embed)/
- (?P<id>[a-zA-Z0-9-_]+)
- ''' % _DOMAINS
- _EMBED_WORD = 'embed'
- _STREAM_WORD = 'f'
- _REDIR_WORD = 'stream'
- _URL_IDS = ('streamurl', 'streamuri', 'streamurj')
- _TESTS = [{
- 'url': 'https://openload.co/f/kUEfGclsU9o',
- 'md5': 'bf1c059b004ebc7a256f89408e65c36e',
- 'info_dict': {
- 'id': 'kUEfGclsU9o',
- 'ext': 'mp4',
- 'title': 'skyrim_no-audio_1080.mp4',
- 'thumbnail': r're:^https?://.*\.jpg$',
- },
- }, {
- 'url': 'https://openload.co/embed/rjC09fkPLYs',
- 'info_dict': {
- 'id': 'rjC09fkPLYs',
- 'ext': 'mp4',
- 'title': 'movie.mp4',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'subtitles': {
- 'en': [{
- 'ext': 'vtt',
- }],
- },
- },
- 'params': {
- 'skip_download': True, # test subtitles only
- },
- }, {
- 'url': 'https://openload.co/embed/kUEfGclsU9o/skyrim_no-audio_1080.mp4',
- 'only_matching': True,
- }, {
- 'url': 'https://openload.io/f/ZAn6oz-VZGE/',
- 'only_matching': True,
- }, {
- 'url': 'https://openload.co/f/_-ztPaZtMhM/',
- 'only_matching': True,
- }, {
- # unavailable via https://openload.co/f/Sxz5sADo82g/, different layout
- # for title and ext
- 'url': 'https://openload.co/embed/Sxz5sADo82g/',
- 'only_matching': True,
- }, {
- # unavailable via https://openload.co/embed/e-Ixz9ZR5L0/ but available
- # via https://openload.co/f/e-Ixz9ZR5L0/
- 'url': 'https://openload.co/f/e-Ixz9ZR5L0/',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.tv/embed/KnG-kKZdcfY/',
- 'only_matching': True,
- }, {
- 'url': 'http://www.openload.link/f/KnG-kKZdcfY',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.stream/f/KnG-kKZdcfY',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.xyz/f/WwRBpzW8Wtk',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.win/f/kUEfGclsU9o',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.download/f/kUEfGclsU9o',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.cloud/f/4ZDnBXRWiB8',
- 'only_matching': True,
- }, {
- # Its title has not got its extension but url has it
- 'url': 'https://oload.download/f/N4Otkw39VCw/Tomb.Raider.2018.HDRip.XviD.AC3-EVO.avi.mp4',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.cc/embed/5NEAbI2BDSk',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.icu/f/-_i4y_F_Hs8',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.fun/f/gb6G1H4sHXY',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.club/f/Nr1L-aZ2dbQ',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.info/f/5NEAbI2BDSk',
- 'only_matching': True,
- }, {
- 'url': 'https://openload.pw/f/WyKgK8s94N0',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.pw/f/WyKgK8s94N0',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.live/f/-Z58UZ-GR4M',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.space/f/IY4eZSst3u8/',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.services/embed/bs1NWj1dCag/',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.online/f/W8o2UfN1vNY/',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.monster/f/W8o2UfN1vNY/',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.press/embed/drTBl1aOTvk/',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.website/embed/drTBl1aOTvk/',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.life/embed/oOzZjNPw9Dc/',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.biz/f/bEk3Gp8ARr4/',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.best/embed/kkz9JgVZeWc/',
- 'only_matching': True,
- }, {
- 'url': 'https://oladblock.services/f/b8NWEgkqNLI/',
- 'only_matching': True,
- }, {
- 'url': 'https://oladblock.xyz/f/b8NWEgkqNLI/',
- 'only_matching': True,
- }, {
- 'url': 'https://oladblock.me/f/b8NWEgkqNLI/',
- 'only_matching': True,
- }, {
- 'url': 'https://openloed.co/f/b8NWEgkqNLI/',
- 'only_matching': True,
- }, {
- 'url': 'https://oload.vip/f/kUEfGclsU9o',
- 'only_matching': True,
- }]
-
- @classmethod
- def _extract_urls(cls, webpage):
- return re.findall(
- r'(?x)<iframe[^>]+src=["\']((?:https?://)?%s/%s/[a-zA-Z0-9-_]+)'
- % (cls._DOMAINS, cls._EMBED_WORD), webpage)
-
- def _extract_decrypted_page(self, page_url, webpage, video_id):
- phantom = PhantomJSwrapper(self, required_version='2.0')
- webpage, _ = phantom.get(page_url, html=webpage, video_id=video_id)
- return webpage
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- host = mobj.group('host')
- video_id = mobj.group('id')
-
- url_pattern = 'https://%s/%%s/%s/' % (host, video_id)
-
- for path in (self._EMBED_WORD, self._STREAM_WORD):
- page_url = url_pattern % path
- last = path == self._STREAM_WORD
- webpage = self._download_webpage(
- page_url, video_id, 'Downloading %s webpage' % path,
- fatal=last)
- if not webpage:
- continue
- if 'File not found' in webpage or 'deleted by the owner' in webpage:
- if not last:
- continue
- raise ExtractorError('File not found', expected=True, video_id=video_id)
- break
-
- webpage = self._extract_decrypted_page(page_url, webpage, video_id)
- for element_id in self._URL_IDS:
- decoded_id = get_element_by_id(element_id, webpage)
- if decoded_id:
- break
- if not decoded_id:
- decoded_id = self._search_regex(
- (r'>\s*([\w-]+~\d{10,}~\d+\.\d+\.0\.0~[\w-]+)\s*<',
- r'>\s*([\w~-]+~\d+\.\d+\.\d+\.\d+~[\w~-]+)',
- r'>\s*([\w-]+~\d{10,}~(?:[a-f\d]+:){2}:~[\w-]+)\s*<',
- r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)\s*<',
- r'>\s*([\w~-]+~[a-f0-9:]+~[\w~-]+)'), webpage,
- 'stream URL')
- video_url = 'https://%s/%s/%s?mime=true' % (host, self._REDIR_WORD, decoded_id)
-
- title = self._og_search_title(webpage, default=None) or self._search_regex(
- r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
- 'title', default=None) or self._html_search_meta(
- 'description', webpage, 'title', fatal=True)
-
- entries = self._parse_html5_media_entries(page_url, webpage, video_id)
- entry = entries[0] if entries else {}
- subtitles = entry.get('subtitles')
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': entry.get('thumbnail') or self._og_search_thumbnail(webpage, default=None),
- 'url': video_url,
- 'ext': determine_ext(title, None) or determine_ext(url, 'mp4'),
- 'subtitles': subtitles,
- }
-
-
-class VerystreamIE(OpenloadIE):
- IE_NAME = 'verystream'
-
- _DOMAINS = r'(?:verystream\.com|woof\.tube)'
- _VALID_URL = r'''(?x)
- https?://
- (?P<host>
- (?:www\.)?
- %s
- )/
- (?:stream|e)/
- (?P<id>[a-zA-Z0-9-_]+)
- ''' % _DOMAINS
- _EMBED_WORD = 'e'
- _STREAM_WORD = 'stream'
- _REDIR_WORD = 'gettoken'
- _URL_IDS = ('videolink', )
- _TESTS = [{
- 'url': 'https://verystream.com/stream/c1GWQ9ngBBx/',
- 'md5': 'd3e8c5628ccb9970b65fd65269886795',
- 'info_dict': {
- 'id': 'c1GWQ9ngBBx',
- 'ext': 'mp4',
- 'title': 'Big Buck Bunny.mp4',
- 'thumbnail': r're:^https?://.*\.jpg$',
- },
- }, {
- 'url': 'https://verystream.com/e/c1GWQ9ngBBx/',
- 'only_matching': True,
- }]
-
- def _extract_decrypted_page(self, page_url, webpage, video_id):
- return webpage # for Verystream, the webpage is already decrypted
diff --git a/youtube_dl/extractor/patreon.py b/youtube_dl/extractor/patreon.py
index 426dd8121..761a4b1de 100644
--- a/youtube_dl/extractor/patreon.py
+++ b/youtube_dl/extractor/patreon.py
@@ -6,7 +6,11 @@ from ..utils import (
clean_html,
determine_ext,
int_or_none,
+ KNOWN_EXTENSIONS,
+ mimetype2ext,
parse_iso8601,
+ str_or_none,
+ try_get,
)
@@ -24,6 +28,7 @@ class PatreonIE(InfoExtractor):
'thumbnail': 're:^https?://.*$',
'timestamp': 1406473987,
'upload_date': '20140727',
+ 'uploader_id': '87145',
},
}, {
'url': 'http://www.patreon.com/creation?hid=754133',
@@ -90,7 +95,13 @@ class PatreonIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
post = self._download_json(
- 'https://www.patreon.com/api/posts/' + video_id, video_id)
+ 'https://www.patreon.com/api/posts/' + video_id, video_id, query={
+ 'fields[media]': 'download_url,mimetype,size_bytes',
+ 'fields[post]': 'comment_count,content,embed,image,like_count,post_file,published_at,title',
+ 'fields[user]': 'full_name,url',
+ 'json-api-use-default-includes': 'false',
+ 'include': 'media,user',
+ })
attributes = post['data']['attributes']
title = attributes['title'].strip()
image = attributes.get('image') or {}
@@ -104,33 +115,42 @@ class PatreonIE(InfoExtractor):
'comment_count': int_or_none(attributes.get('comment_count')),
}
- def add_file(file_data):
- file_url = file_data.get('url')
- if file_url:
- info.update({
- 'url': file_url,
- 'ext': determine_ext(file_data.get('name'), 'mp3'),
- })
-
for i in post.get('included', []):
i_type = i.get('type')
- if i_type == 'attachment':
- add_file(i.get('attributes') or {})
+ if i_type == 'media':
+ media_attributes = i.get('attributes') or {}
+ download_url = media_attributes.get('download_url')
+ ext = mimetype2ext(media_attributes.get('mimetype'))
+ if download_url and ext in KNOWN_EXTENSIONS:
+ info.update({
+ 'ext': ext,
+ 'filesize': int_or_none(media_attributes.get('size_bytes')),
+ 'url': download_url,
+ })
elif i_type == 'user':
user_attributes = i.get('attributes')
if user_attributes:
info.update({
'uploader': user_attributes.get('full_name'),
+ 'uploader_id': str_or_none(i.get('id')),
'uploader_url': user_attributes.get('url'),
})
if not info.get('url'):
- add_file(attributes.get('post_file') or {})
+ embed_url = try_get(attributes, lambda x: x['embed']['url'])
+ if embed_url:
+ info.update({
+ '_type': 'url',
+ 'url': embed_url,
+ })
if not info.get('url'):
- info.update({
- '_type': 'url',
- 'url': attributes['embed']['url'],
- })
+ post_file = attributes['post_file']
+ ext = determine_ext(post_file.get('name'))
+ if ext in KNOWN_EXTENSIONS:
+ info.update({
+ 'ext': ext,
+ 'url': post_file['url'],
+ })
return info
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index b337a56c0..c02e34aba 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -17,12 +17,54 @@ class PeriscopeBaseIE(InfoExtractor):
'https://api.periscope.tv/api/v2/%s' % method,
item_id, query=query)
+    def _parse_broadcast_data(self, broadcast, video_id):
+        """Build the format-independent part of an info dict from *broadcast*.
+
+        *broadcast* is the broadcast dict; its 'status' key is mandatory
+        (KeyError otherwise), every other field is read optionally.
+        *video_id* is used as the id when the broadcast carries none.
+        The returned dict has no 'formats' entry; the caller adds it.
+        """
+        title = broadcast['status']
+        uploader = broadcast.get('user_display_name') or broadcast.get('username')
+        title = '%s - %s' % (uploader, title) if uploader else title
+        # 'state' is read with .get(), so it may be absent or null; treat
+        # that as "not running" instead of failing on None.lower()
+        is_live = (broadcast.get('state') or '').lower() == 'running'
+
+        thumbnails = [{
+            'url': broadcast[image],
+        } for image in ('image_url', 'image_url_small') if broadcast.get(image)]
+
+        return {
+            'id': broadcast.get('id') or video_id,
+            'title': self._live_title(title) if is_live else title,
+            'timestamp': parse_iso8601(broadcast.get('created_at')),
+            'uploader': uploader,
+            'uploader_id': broadcast.get('user_id') or broadcast.get('username'),
+            'thumbnails': thumbnails,
+            'view_count': int_or_none(broadcast.get('total_watched')),
+            'tags': broadcast.get('tags'),
+            'is_live': is_live,
+        }
+
+    @staticmethod
+    def _extract_common_format_info(broadcast):
+        """Return a (state, width, height) tuple for *broadcast*.
+
+        state is the lower-cased 'state' field, or '' when the field is
+        missing or null (previously this raised AttributeError); width and
+        height are int_or_none() of the respective fields.
+        """
+        state = (broadcast.get('state') or '').lower()
+        return state, int_or_none(broadcast.get('width')), int_or_none(broadcast.get('height'))
+
+    @staticmethod
+    def _add_width_and_height(f, width, height):
+        """Fill in 'width' and 'height' of format dict *f* in place.
+
+        A dimension is only written when *f* has no truthy value for it yet.
+        """
+        if not f.get('width'):
+            f['width'] = width
+        if not f.get('height'):
+            f['height'] = height
+
+    def _extract_pscp_m3u8_formats(self, m3u8_url, video_id, format_id, state, width, height, fatal=True):
+        """Extract the formats of one HLS manifest of a broadcast.
+
+        *state* is the lower-cased broadcast state: 'ended' and 'timed_out'
+        select the 'm3u8_native' entry protocol, anything else 'm3u8'.
+        When the manifest yields exactly one format, *width* / *height* are
+        used to fill in dimensions that format does not have.
+        """
+        protocol = 'm3u8_native' if state in ('ended', 'timed_out') else 'm3u8'
+        pscp_formats = self._extract_m3u8_formats(
+            m3u8_url, video_id, 'mp4', entry_protocol=protocol,
+            m3u8_id=format_id, fatal=fatal)
+        if len(pscp_formats) == 1:
+            self._add_width_and_height(pscp_formats[0], width, height)
+        return pscp_formats
+
class PeriscopeIE(PeriscopeBaseIE):
IE_DESC = 'Periscope'
IE_NAME = 'periscope'
_VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)'
- # Alive example URLs can be found here http://onperiscope.com/
+ # Alive example URLs can be found here https://www.periscope.tv/
_TESTS = [{
'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
'md5': '65b57957972e503fcbbaeed8f4fa04ca',
@@ -61,21 +103,9 @@ class PeriscopeIE(PeriscopeBaseIE):
'accessVideoPublic', {'broadcast_id': token}, token)
broadcast = stream['broadcast']
- title = broadcast['status']
-
- uploader = broadcast.get('user_display_name') or broadcast.get('username')
- uploader_id = (broadcast.get('user_id') or broadcast.get('username'))
+ info = self._parse_broadcast_data(broadcast, token)
- title = '%s - %s' % (uploader, title) if uploader else title
state = broadcast.get('state').lower()
- if state == 'running':
- title = self._live_title(title)
- timestamp = parse_iso8601(broadcast.get('created_at'))
-
- thumbnails = [{
- 'url': broadcast[image],
- } for image in ('image_url', 'image_url_small') if broadcast.get(image)]
-
width = int_or_none(broadcast.get('width'))
height = int_or_none(broadcast.get('height'))
@@ -92,32 +122,20 @@ class PeriscopeIE(PeriscopeBaseIE):
continue
video_urls.add(video_url)
if format_id != 'rtmp':
- m3u8_formats = self._extract_m3u8_formats(
- video_url, token, 'mp4',
- entry_protocol='m3u8_native'
- if state in ('ended', 'timed_out') else 'm3u8',
- m3u8_id=format_id, fatal=False)
- if len(m3u8_formats) == 1:
- add_width_and_height(m3u8_formats[0])
+ m3u8_formats = self._extract_pscp_m3u8_formats(
+ video_url, token, format_id, state, width, height, False)
formats.extend(m3u8_formats)
continue
rtmp_format = {
'url': video_url,
'ext': 'flv' if format_id == 'rtmp' else 'mp4',
}
- add_width_and_height(rtmp_format)
+ self._add_width_and_height(rtmp_format, width, height)
formats.append(rtmp_format)
self._sort_formats(formats)
- return {
- 'id': broadcast.get('id') or token,
- 'title': title,
- 'timestamp': timestamp,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'thumbnails': thumbnails,
- 'formats': formats,
- }
+ info['formats'] = formats
+ return info
class PeriscopeUserIE(PeriscopeBaseIE):
diff --git a/youtube_dl/extractor/revision3.py b/youtube_dl/extractor/revision3.py
deleted file mode 100644
index 833d8a2f0..000000000
--- a/youtube_dl/extractor/revision3.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_str
-from ..utils import (
- int_or_none,
- parse_iso8601,
- unescapeHTML,
- qualities,
-)
-
-
-class Revision3EmbedIE(InfoExtractor):
- IE_NAME = 'revision3:embed'
- _VALID_URL = r'(?:revision3:(?:(?P<playlist_type>[^:]+):)?|https?://(?:(?:(?:www|embed)\.)?(?:revision3|animalist)|(?:(?:api|embed)\.)?seekernetwork)\.com/player/embed\?videoId=)(?P<playlist_id>\d+)'
- _TEST = {
- 'url': 'http://api.seekernetwork.com/player/embed?videoId=67558',
- 'md5': '83bcd157cab89ad7318dd7b8c9cf1306',
- 'info_dict': {
- 'id': '67558',
- 'ext': 'mp4',
- 'title': 'The Pros & Cons Of Zoos',
- 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?',
- 'uploader_id': 'dnews',
- 'uploader': 'DNews',
- }
- }
- _API_KEY = 'ba9c741bce1b9d8e3defcc22193f3651b8867e62'
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('playlist_id')
- playlist_type = mobj.group('playlist_type') or 'video_id'
- video_data = self._download_json(
- 'http://revision3.com/api/getPlaylist.json', playlist_id, query={
- 'api_key': self._API_KEY,
- 'codecs': 'h264,vp8,theora',
- playlist_type: playlist_id,
- })['items'][0]
-
- formats = []
- for vcodec, media in video_data['media'].items():
- for quality_id, quality in media.items():
- if quality_id == 'hls':
- formats.extend(self._extract_m3u8_formats(
- quality['url'], playlist_id, 'mp4',
- 'm3u8_native', m3u8_id='hls', fatal=False))
- else:
- formats.append({
- 'url': quality['url'],
- 'format_id': '%s-%s' % (vcodec, quality_id),
- 'tbr': int_or_none(quality.get('bitrate')),
- 'vcodec': vcodec,
- })
- self._sort_formats(formats)
-
- return {
- 'id': playlist_id,
- 'title': unescapeHTML(video_data['title']),
- 'description': unescapeHTML(video_data.get('summary')),
- 'uploader': video_data.get('show', {}).get('name'),
- 'uploader_id': video_data.get('show', {}).get('slug'),
- 'duration': int_or_none(video_data.get('duration')),
- 'formats': formats,
- }
-
-
-class Revision3IE(InfoExtractor):
- IE_NAME = 'revision'
- _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:revision3|animalist)\.com)/(?P<id>[^/]+(?:/[^/?#]+)?)'
- _TESTS = [{
- 'url': 'http://www.revision3.com/technobuffalo/5-google-predictions-for-2016',
- 'md5': 'd94a72d85d0a829766de4deb8daaf7df',
- 'info_dict': {
- 'id': '71089',
- 'display_id': 'technobuffalo/5-google-predictions-for-2016',
- 'ext': 'webm',
- 'title': '5 Google Predictions for 2016',
- 'description': 'Google had a great 2015, but it\'s already time to look ahead. Here are our five predictions for 2016.',
- 'upload_date': '20151228',
- 'timestamp': 1451325600,
- 'duration': 187,
- 'uploader': 'TechnoBuffalo',
- 'uploader_id': 'technobuffalo',
- }
- }, {
- # Show
- 'url': 'http://revision3.com/variant',
- 'only_matching': True,
- }, {
- # Tag
- 'url': 'http://revision3.com/vr',
- 'only_matching': True,
- }]
- _PAGE_DATA_TEMPLATE = 'http://www.%s/apiProxy/ddn/%s?domain=%s'
-
- def _real_extract(self, url):
- domain, display_id = re.match(self._VALID_URL, url).groups()
- site = domain.split('.')[0]
- page_info = self._download_json(
- self._PAGE_DATA_TEMPLATE % (domain, display_id, domain), display_id)
-
- page_data = page_info['data']
- page_type = page_data['type']
- if page_type in ('episode', 'embed'):
- show_data = page_data['show']['data']
- page_id = compat_str(page_data['id'])
- video_id = compat_str(page_data['video']['data']['id'])
-
- preference = qualities(['mini', 'small', 'medium', 'large'])
- thumbnails = [{
- 'url': image_url,
- 'id': image_id,
- 'preference': preference(image_id)
- } for image_id, image_url in page_data.get('images', {}).items()]
-
- info = {
- 'id': page_id,
- 'display_id': display_id,
- 'title': unescapeHTML(page_data['name']),
- 'description': unescapeHTML(page_data.get('summary')),
- 'timestamp': parse_iso8601(page_data.get('publishTime'), ' '),
- 'author': page_data.get('author'),
- 'uploader': show_data.get('name'),
- 'uploader_id': show_data.get('slug'),
- 'thumbnails': thumbnails,
- 'extractor_key': site,
- }
-
- if page_type == 'embed':
- info.update({
- '_type': 'url_transparent',
- 'url': page_data['video']['data']['embed'],
- })
- return info
-
- info.update({
- '_type': 'url_transparent',
- 'url': 'revision3:%s' % video_id,
- })
- return info
- else:
- list_data = page_info[page_type]['data']
- episodes_data = page_info['episodes']['data']
- num_episodes = page_info['meta']['totalEpisodes']
- processed_episodes = 0
- entries = []
- page_num = 1
- while True:
- entries.extend([{
- '_type': 'url',
- 'url': 'http://%s%s' % (domain, episode['path']),
- 'id': compat_str(episode['id']),
- 'ie_key': 'Revision3',
- 'extractor_key': site,
- } for episode in episodes_data])
- processed_episodes += len(episodes_data)
- if processed_episodes == num_episodes:
- break
- page_num += 1
- episodes_data = self._download_json(self._PAGE_DATA_TEMPLATE % (
- domain, display_id + '/' + compat_str(page_num), domain),
- display_id)['episodes']['data']
-
- return self.playlist_result(
- entries, compat_str(list_data['id']),
- list_data.get('name'), list_data.get('summary'))
diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py
index 8d88ee499..8883639b2 100644
--- a/youtube_dl/extractor/roosterteeth.py
+++ b/youtube_dl/extractor/roosterteeth.py
@@ -1,8 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
from ..compat import (
compat_HTTPError,
@@ -18,7 +16,6 @@ from ..utils import (
class RoosterTeethIE(InfoExtractor):
_VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/(?:episode|watch)/(?P<id>[^/?#&]+)'
- _LOGIN_URL = 'https://roosterteeth.com/login'
_NETRC_MACHINE = 'roosterteeth'
_TESTS = [{
'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
@@ -53,48 +50,40 @@ class RoosterTeethIE(InfoExtractor):
'url': 'https://roosterteeth.com/watch/million-dollars-but-season-2-million-dollars-but-the-game-announcement',
'only_matching': True,
}]
+ _EPISODE_BASE_URL = 'https://svod-be.roosterteeth.com/api/v1/episodes/'
def _login(self):
username, password = self._get_login_info()
if username is None:
return
- login_page = self._download_webpage(
- self._LOGIN_URL, None,
- note='Downloading login page',
- errnote='Unable to download login page')
-
- login_form = self._hidden_inputs(login_page)
-
- login_form.update({
- 'username': username,
- 'password': password,
- })
-
- login_request = self._download_webpage(
- self._LOGIN_URL, None,
- note='Logging in',
- data=urlencode_postdata(login_form),
- headers={
- 'Referer': self._LOGIN_URL,
- })
-
- if not any(re.search(p, login_request) for p in (
- r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"',
- r'>Sign Out<')):
- error = self._html_search_regex(
- r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>',
- login_request, 'alert', default=None, group='error')
- if error:
- raise ExtractorError('Unable to login: %s' % error, expected=True)
- raise ExtractorError('Unable to log in')
+ try:
+ self._download_json(
+ 'https://auth.roosterteeth.com/oauth/token',
+ None, 'Logging in', data=urlencode_postdata({
+ 'client_id': '4338d2b4bdc8db1239360f28e72f0d9ddb1fd01e7a38fbb07b4b1f4ba4564cc5',
+ 'grant_type': 'password',
+ 'username': username,
+ 'password': password,
+ }))
+ except ExtractorError as e:
+ msg = 'Unable to login'
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ resp = self._parse_json(e.cause.read().decode(), None, fatal=False)
+ if resp:
+ error = resp.get('extra_info') or resp.get('error_description') or resp.get('error')
+ if error:
+ msg += ': ' + error
+ self.report_warning(msg)
def _real_initialize(self):
+ if self._get_cookies(self._EPISODE_BASE_URL).get('rt_access_token'):
+ return
self._login()
def _real_extract(self, url):
display_id = self._match_id(url)
- api_episode_url = 'https://svod-be.roosterteeth.com/api/v1/episodes/%s' % display_id
+ api_episode_url = self._EPISODE_BASE_URL + display_id
try:
m3u8_url = self._download_json(
diff --git a/youtube_dl/extractor/scte.py b/youtube_dl/extractor/scte.py
new file mode 100644
index 000000000..ca1de63b6
--- /dev/null
+++ b/youtube_dl/extractor/scte.py
@@ -0,0 +1,144 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ decode_packed_codes,
+ ExtractorError,
+ urlencode_postdata,
+)
+
+
+class SCTEBaseIE(InfoExtractor):
+ _LOGIN_URL = 'https://www.scte.org/SCTE/Sign_In.aspx'
+ _NETRC_MACHINE = 'scte'
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ login_popup = self._download_webpage(
+ self._LOGIN_URL, None, 'Downloading login popup')
+
+ def is_logged(webpage):
+ return any(re.search(p, webpage) for p in (
+ r'class=["\']welcome\b', r'>Sign Out<'))
+
+ # already logged in
+ if is_logged(login_popup):
+ return
+
+ login_form = self._hidden_inputs(login_popup)
+
+ login_form.update({
+ 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInUserName': username,
+ 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$signInPassword': password,
+ 'ctl01$TemplateBody$WebPartManager1$gwpciNewContactSignInCommon$ciNewContactSignInCommon$RememberMe': 'on',
+ })
+
+ response = self._download_webpage(
+ self._LOGIN_URL, None, 'Logging in',
+ data=urlencode_postdata(login_form))
+
+ if '|pageRedirect|' not in response and not is_logged(response):
+ error = self._html_search_regex(
+ r'(?s)<[^>]+class=["\']AsiError["\'][^>]*>(.+?)</',
+ response, 'error message', default=None)
+ if error:
+ raise ExtractorError('Unable to login: %s' % error, expected=True)
+ raise ExtractorError('Unable to log in')
+
+
+class SCTEIE(SCTEBaseIE):
+ _VALID_URL = r'https?://learning\.scte\.org/mod/scorm/view\.php?.*?\bid=(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://learning.scte.org/mod/scorm/view.php?id=31484',
+ 'info_dict': {
+ 'title': 'Introduction to DOCSIS Engineering Professional',
+ 'id': '31484',
+ },
+ 'playlist_count': 5,
+ 'skip': 'Requires account credentials',
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._search_regex(r'<h1>(.+?)</h1>', webpage, 'title')
+
+ context_id = self._search_regex(r'context-(\d+)', webpage, video_id)
+ content_base = 'https://learning.scte.org/pluginfile.php/%s/mod_scorm/content/8/' % context_id
+ context = decode_packed_codes(self._download_webpage(
+ '%smobile/data.js' % content_base, video_id))
+
+ data = self._parse_xml(
+ self._search_regex(
+ r'CreateData\(\s*"(.+?)"', context, 'data').replace(r"\'", "'"),
+ video_id)
+
+ entries = []
+ for asset in data.findall('.//asset'):
+ asset_url = asset.get('url')
+ if not asset_url or not asset_url.endswith('.mp4'):
+ continue
+ asset_id = self._search_regex(
+ r'video_([^_]+)_', asset_url, 'asset id', default=None)
+ if not asset_id:
+ continue
+ entries.append({
+ 'id': asset_id,
+ 'title': title,
+ 'url': content_base + asset_url,
+ })
+
+ return self.playlist_result(entries, video_id, title)
+
+
+class SCTECourseIE(SCTEBaseIE):
+ _VALID_URL = r'https?://learning\.scte\.org/(?:mod/sub)?course/view\.php?.*?\bid=(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://learning.scte.org/mod/subcourse/view.php?id=31491',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://learning.scte.org/course/view.php?id=3639',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://learning.scte.org/course/view.php?id=3073',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ course_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, course_id)
+
+ title = self._search_regex(
+ r'<h1>(.+?)</h1>', webpage, 'title', default=None)
+
+ entries = []
+ for mobj in re.finditer(
+ r'''(?x)
+ <a[^>]+
+ href=(["\'])
+ (?P<url>
+ https?://learning\.scte\.org/mod/
+ (?P<kind>scorm|subcourse)/view\.php?(?:(?!\1).)*?
+ \bid=\d+
+ )
+ ''',
+ webpage):
+ item_url = mobj.group('url')
+ if item_url == url:
+ continue
+ ie = (SCTEIE.ie_key() if mobj.group('kind') == 'scorm'
+ else SCTECourseIE.ie_key())
+ entries.append(self.url_result(item_url, ie=ie))
+
+ return self.playlist_result(entries, course_id, title)
diff --git a/youtube_dl/extractor/seeker.py b/youtube_dl/extractor/seeker.py
index 3b9c65e7e..7872dc80d 100644
--- a/youtube_dl/extractor/seeker.py
+++ b/youtube_dl/extractor/seeker.py
@@ -4,34 +4,37 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..utils import (
+ get_element_by_class,
+ strip_or_none,
+)
class SeekerIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?seeker\.com/(?P<display_id>.*)-(?P<article_id>\d+)\.html'
_TESTS = [{
- # player.loadRevision3Item
'url': 'http://www.seeker.com/should-trump-be-required-to-release-his-tax-returns-1833805621.html',
- 'md5': '30c1dc4030cc715cf05b423d0947ac18',
+ 'md5': '897d44bbe0d8986a2ead96de565a92db',
'info_dict': {
- 'id': '76243',
- 'ext': 'webm',
+ 'id': 'Elrn3gnY',
+ 'ext': 'mp4',
'title': 'Should Trump Be Required To Release His Tax Returns?',
- 'description': 'Donald Trump has been secretive about his "big," "beautiful" tax returns. So what can we learn if he decides to release them?',
- 'uploader': 'Seeker Daily',
- 'uploader_id': 'seekerdaily',
+ 'description': 'md5:41efa8cfa8d627841045eec7b018eb45',
+ 'timestamp': 1490090165,
+ 'upload_date': '20170321',
}
}, {
'url': 'http://www.seeker.com/changes-expected-at-zoos-following-recent-gorilla-lion-shootings-1834116536.html',
'playlist': [
{
- 'md5': '83bcd157cab89ad7318dd7b8c9cf1306',
+ 'md5': '0497b9f20495174be73ae136949707d2',
'info_dict': {
- 'id': '67558',
+ 'id': 'FihYQ8AE',
'ext': 'mp4',
'title': 'The Pros & Cons Of Zoos',
- 'description': 'Zoos are often depicted as a terrible place for animals to live, but is there any truth to this?',
- 'uploader': 'DNews',
- 'uploader_id': 'dnews',
+ 'description': 'md5:d88f99a8ea8e7d25e6ff77f271b1271c',
+ 'timestamp': 1490039133,
+ 'upload_date': '20170320',
},
}
],
@@ -45,13 +48,11 @@ class SeekerIE(InfoExtractor):
def _real_extract(self, url):
display_id, article_id = re.match(self._VALID_URL, url).groups()
webpage = self._download_webpage(url, display_id)
- mobj = re.search(r"player\.loadRevision3Item\('([^']+)'\s*,\s*(\d+)\);", webpage)
- if mobj:
- playlist_type, playlist_id = mobj.groups()
- return self.url_result(
- 'revision3:%s:%s' % (playlist_type, playlist_id), 'Revision3Embed', playlist_id)
- else:
- entries = [self.url_result('revision3:video_id:%s' % video_id, 'Revision3Embed', video_id) for video_id in re.findall(
- r'<iframe[^>]+src=[\'"](?:https?:)?//api\.seekernetwork\.com/player/embed\?videoId=(\d+)', webpage)]
- return self.playlist_result(
- entries, article_id, self._og_search_title(webpage), self._og_search_description(webpage))
+ entries = []
+ for jwp_id in re.findall(r'data-video-id="([a-zA-Z0-9]{8})"', webpage):
+ entries.append(self.url_result(
+ 'jwplatform:' + jwp_id, 'JWPlatform', jwp_id))
+ return self.playlist_result(
+ entries, article_id,
+ self._og_search_title(webpage),
+ strip_or_none(get_element_by_class('subtitle__text', webpage)) or self._og_search_description(webpage))
diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py
index ff575f592..02295d1a4 100644
--- a/youtube_dl/extractor/shared.py
+++ b/youtube_dl/extractor/shared.py
@@ -1,13 +1,18 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_b64decode
+from ..compat import (
+ compat_b64decode,
+ compat_urllib_parse_unquote_plus,
+)
from ..utils import (
determine_ext,
ExtractorError,
int_or_none,
+ js_to_json,
KNOWN_EXTENSIONS,
parse_filesize,
+ rot47,
url_or_none,
urlencode_postdata,
)
@@ -112,16 +117,22 @@ class VivoIE(SharedBaseIE):
webpage, 'filesize', fatal=False))
def _extract_video_url(self, webpage, video_id, url):
- def decode_url(encoded_url):
+ def decode_url_old(encoded_url):
return compat_b64decode(encoded_url).decode('utf-8')
- stream_url = url_or_none(decode_url(self._search_regex(
+ stream_url = self._search_regex(
r'data-stream\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
- 'stream url', default=None, group='url')))
+ 'stream url', default=None, group='url')
+ if stream_url:
+ stream_url = url_or_none(decode_url_old(stream_url))
if stream_url:
return stream_url
- return self._parse_json(
+
+ def decode_url(encoded_url):
+ return rot47(compat_urllib_parse_unquote_plus(encoded_url))
+
+ return decode_url(self._parse_json(
self._search_regex(
- r'InitializeStream\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
- webpage, 'stream', group='url'),
- video_id, transform_source=decode_url)[0]
+ r'(?s)InitializeStream\s*\(\s*({.+?})\s*\)\s*;', webpage,
+ 'stream'),
+ video_id, transform_source=js_to_json)['source'])
diff --git a/youtube_dl/extractor/slideslive.py b/youtube_dl/extractor/slideslive.py
index ed84322c5..d9ea76831 100644
--- a/youtube_dl/extractor/slideslive.py
+++ b/youtube_dl/extractor/slideslive.py
@@ -2,7 +2,7 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import smuggle_url
class SlidesLiveIE(InfoExtractor):
@@ -14,9 +14,9 @@ class SlidesLiveIE(InfoExtractor):
'info_dict': {
'id': 'LMtgR8ba0b0',
'ext': 'mp4',
- 'title': '38902413: external video',
- 'description': '3890241320170925-9-1yd6ech.mp4',
- 'uploader': 'SlidesLive Administrator',
+ 'title': 'GCC IA16 backend',
+ 'description': 'Watch full version of this video at https://slideslive.com/38902413.',
+ 'uploader': 'SlidesLive Videos - A',
'uploader_id': 'UC62SdArr41t_-_fX40QCLRw',
'upload_date': '20170925',
}
@@ -24,16 +24,38 @@ class SlidesLiveIE(InfoExtractor):
# video_service_name = youtube
'url': 'https://slideslive.com/38903721/magic-a-scientific-resurrection-of-an-esoteric-legend',
'only_matching': True,
+ }, {
+ # video_service_name = url
+ 'url': 'https://slideslive.com/38922070/learning-transferable-skills-1',
+ 'only_matching': True,
+ }, {
+ # video_service_name = vimeo
+ 'url': 'https://slideslive.com/38921896/retrospectives-a-venue-for-selfreflection-in-ml-research-3',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
- url, video_id, headers={'Accept': 'application/json'})
+ 'https://ben.slideslive.com/player/' + video_id, video_id)
service_name = video_data['video_service_name'].lower()
- if service_name == 'youtube':
- yt_video_id = video_data['video_service_id']
- return self.url_result(yt_video_id, 'Youtube', video_id=yt_video_id)
+ assert service_name in ('url', 'vimeo', 'youtube')
+ service_id = video_data['video_service_id']
+ info = {
+ 'id': video_id,
+ 'thumbnail': video_data.get('thumbnail'),
+ 'url': service_id,
+ }
+ if service_name == 'url':
+ info['title'] = video_data['title']
else:
- raise ExtractorError(
- 'Unsupported service name: {0}'.format(service_name), expected=True)
+ info.update({
+ '_type': 'url_transparent',
+ 'ie_key': service_name.capitalize(),
+ 'title': video_data.get('title'),
+ })
+ if service_name == 'vimeo':
+ info['url'] = smuggle_url(
+ 'https://player.vimeo.com/video/' + service_id,
+ {'http_headers': {'Referer': url}})
+ return info
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 05538f3d6..c2ee54457 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -11,14 +11,13 @@ from .common import (
from ..compat import (
compat_str,
compat_urlparse,
- compat_urllib_parse_urlencode,
)
from ..utils import (
ExtractorError,
float_or_none,
+ HEADRequest,
int_or_none,
KNOWN_EXTENSIONS,
- merge_dicts,
mimetype2ext,
str_or_none,
try_get,
@@ -28,6 +27,30 @@ from ..utils import (
)
+class SoundcloudEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:w|player|p)\.soundcloud\.com/player/?.*?\burl=(?P<id>.+)'
+ _TEST = {
+ # from https://www.soundi.fi/uutiset/ennakkokuuntelussa-timo-kaukolammen-station-to-station-to-station-julkaisua-juhlitaan-tanaan-g-livelabissa/
+ 'url': 'https://w.soundcloud.com/player/?visual=true&url=https%3A%2F%2Fapi.soundcloud.com%2Fplaylists%2F922213810&show_artwork=true&maxwidth=640&maxheight=960&dnt=1&secret_token=s-ziYey',
+ 'only_matching': True,
+ }
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [m.group('url') for m in re.finditer(
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',
+ webpage)]
+
+ def _real_extract(self, url):
+ query = compat_urlparse.parse_qs(
+ compat_urlparse.urlparse(url).query)
+ api_url = query['url'][0]
+ secret_token = query.get('secret_token')
+ if secret_token:
+ api_url = update_url_query(api_url, {'secret_token': secret_token[0]})
+ return self.url_result(api_url)
+
+
class SoundcloudIE(InfoExtractor):
"""Information extractor for soundcloud.com
To access the media, the uid of the song and a stream token
@@ -44,9 +67,8 @@ class SoundcloudIE(InfoExtractor):
(?!(?:tracks|albums|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?P<title>[\w\d-]+)/?
(?P<token>[^?]+?)?(?:[?].*)?$)
- |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+)
+ |(?:api(?:-v2)?\.soundcloud\.com/tracks/(?P<track_id>\d+)
(?:/?\?secret_token=(?P<secret_token>[^&]+))?)
- |(?P<player>(?:w|player|p.)\.soundcloud\.com/player/?.*?url=.*)
)
'''
IE_NAME = 'soundcloud'
@@ -60,6 +82,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Lostin Powers - She so Heavy (SneakPreview) Adrian Ackers Blueprint 1',
'description': 'No Downloads untill we record the finished version this weekend, i was too pumped n i had to post it , earl is prolly gonna b hella p.o\'d',
'uploader': 'E.T. ExTerrestrial Music',
+ 'uploader_id': '1571244',
'timestamp': 1349920598,
'upload_date': '20121011',
'duration': 143.216,
@@ -79,6 +102,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Goldrushed',
'description': 'From Stockholm Sweden\r\nPovel / Magnus / Filip / David\r\nwww.theroyalconcept.com',
'uploader': 'The Royal Concept',
+ 'uploader_id': '9615865',
'timestamp': 1337635207,
'upload_date': '20120521',
'duration': 30,
@@ -92,6 +116,7 @@ class SoundcloudIE(InfoExtractor):
# rtmp
'skip_download': True,
},
+ 'skip': 'Preview',
},
# private link
{
@@ -103,6 +128,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Youtube - Dl Test Video \'\' Ä↭',
'description': 'test chars: \"\'/\\ä↭',
'uploader': 'jaimeMF',
+ 'uploader_id': '69767071',
'timestamp': 1386604920,
'upload_date': '20131209',
'duration': 9.927,
@@ -123,6 +149,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Youtube - Dl Test Video \'\' Ä↭',
'description': 'test chars: \"\'/\\ä↭',
'uploader': 'jaimeMF',
+ 'uploader_id': '69767071',
'timestamp': 1386604920,
'upload_date': '20131209',
'duration': 9.927,
@@ -143,6 +170,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Bus Brakes',
'description': 'md5:0053ca6396e8d2fd7b7e1595ef12ab66',
'uploader': 'oddsamples',
+ 'uploader_id': '73680509',
'timestamp': 1389232924,
'upload_date': '20140109',
'duration': 17.346,
@@ -163,6 +191,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
'uploader': 'Ori Uplift Music',
+ 'uploader_id': '12563093',
'timestamp': 1504206263,
'upload_date': '20170831',
'duration': 7449.096,
@@ -183,6 +212,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Sideways (Prod. Mad Real)',
'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'uploader': 'garyvee',
+ 'uploader_id': '2366352',
'timestamp': 1488152409,
'upload_date': '20170226',
'duration': 207.012,
@@ -207,6 +237,7 @@ class SoundcloudIE(InfoExtractor):
'title': 'Mezzo Valzer',
'description': 'md5:4138d582f81866a530317bae316e8b61',
'uploader': 'Giovanni Sarani',
+ 'uploader_id': '3352531',
'timestamp': 1551394171,
'upload_date': '20190228',
'duration': 180.157,
@@ -221,114 +252,81 @@ class SoundcloudIE(InfoExtractor):
}
]
- _CLIENT_ID = 'BeGVhOrGmfboy1LtiHTQF6Ejpt9ULJCI'
-
- @staticmethod
- def _extract_urls(webpage):
- return [m.group('url') for m in re.finditer(
- r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1',
- webpage)]
+ _API_BASE = 'https://api.soundcloud.com/'
+ _API_V2_BASE = 'https://api-v2.soundcloud.com/'
+ _BASE_URL = 'https://soundcloud.com/'
+ _CLIENT_ID = 'UW9ajvMgVdMMW3cdeBi8lPfN6dvOVGji'
+ _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg'
+
+ _ARTWORK_MAP = {
+ 'mini': 16,
+ 'tiny': 20,
+ 'small': 32,
+ 'badge': 47,
+ 't67x67': 67,
+ 'large': 100,
+ 't300x300': 300,
+ 'crop': 400,
+ 't500x500': 500,
+ 'original': 0,
+ }
@classmethod
def _resolv_url(cls, url):
- return 'https://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
+ return SoundcloudIE._API_V2_BASE + 'resolve?url=' + url + '&client_id=' + cls._CLIENT_ID
- def _extract_info_dict(self, info, full_title=None, quiet=False, secret_token=None):
+ def _extract_info_dict(self, info, full_title=None, secret_token=None, version=2):
track_id = compat_str(info['id'])
title = info['title']
- name = full_title or track_id
- if quiet:
- self.report_extraction(name)
- thumbnail = info.get('artwork_url') or info.get('user', {}).get('avatar_url')
- if isinstance(thumbnail, compat_str):
- thumbnail = thumbnail.replace('-large', '-t500x500')
- username = try_get(info, lambda x: x['user']['username'], compat_str)
-
- def extract_count(key):
- return int_or_none(info.get('%s_count' % key))
-
- like_count = extract_count('favoritings')
- if like_count is None:
- like_count = extract_count('likes')
-
- result = {
- 'id': track_id,
- 'uploader': username,
- 'timestamp': unified_timestamp(info.get('created_at')),
- 'title': title,
- 'description': info.get('description'),
- 'thumbnail': thumbnail,
- 'duration': float_or_none(info.get('duration'), 1000),
- 'webpage_url': info.get('permalink_url'),
- 'license': info.get('license'),
- 'view_count': extract_count('playback'),
- 'like_count': like_count,
- 'comment_count': extract_count('comment'),
- 'repost_count': extract_count('reposts'),
- 'genre': info.get('genre'),
- }
+ track_base_url = self._API_BASE + 'tracks/%s' % track_id
format_urls = set()
formats = []
query = {'client_id': self._CLIENT_ID}
- if secret_token is not None:
+ if secret_token:
query['secret_token'] = secret_token
- if info.get('downloadable', False):
- # We can build a direct link to the song
+
+ if info.get('downloadable') and info.get('has_downloads_left'):
format_url = update_url_query(
- 'https://api.soundcloud.com/tracks/%s/download' % track_id, query)
+ info.get('download_url') or track_base_url + '/download', query)
format_urls.add(format_url)
+ if version == 2:
+ v1_info = self._download_json(
+ track_base_url, track_id, query=query, fatal=False) or {}
+ else:
+ v1_info = info
formats.append({
'format_id': 'download',
- 'ext': info.get('original_format', 'mp3'),
+ 'ext': v1_info.get('original_format') or 'mp3',
+ 'filesize': int_or_none(v1_info.get('original_content_size')),
'url': format_url,
- 'vcodec': 'none',
'preference': 10,
})
- # Old API, does not work for some tracks (e.g.
- # https://soundcloud.com/giovannisarani/mezzo-valzer)
- format_dict = self._download_json(
- 'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id,
- track_id, 'Downloading track url', query=query, fatal=False)
-
- if format_dict:
- for key, stream_url in format_dict.items():
- if stream_url in format_urls:
- continue
- format_urls.add(stream_url)
- ext, abr = 'mp3', None
- mobj = re.search(r'_([^_]+)_(\d+)_url', key)
- if mobj:
- ext, abr = mobj.groups()
- abr = int(abr)
- if key.startswith('http'):
- stream_formats = [{
- 'format_id': key,
- 'ext': ext,
- 'url': stream_url,
- }]
- elif key.startswith('rtmp'):
- # The url doesn't have an rtmp app, we have to extract the playpath
- url, path = stream_url.split('mp3:', 1)
- stream_formats = [{
- 'format_id': key,
- 'url': url,
- 'play_path': 'mp3:' + path,
- 'ext': 'flv',
- }]
- elif key.startswith('hls'):
- stream_formats = self._extract_m3u8_formats(
- stream_url, track_id, ext, entry_protocol='m3u8_native',
- m3u8_id=key, fatal=False)
- else:
- continue
-
- if abr:
- for f in stream_formats:
- f['abr'] = abr
+ def invalid_url(url):
+ return not url or url in format_urls or re.search(r'/(?:preview|playlist)/0/30/', url)
- formats.extend(stream_formats)
+ def add_format(f, protocol):
+ mobj = re.search(r'\.(?P<abr>\d+)\.(?P<ext>[0-9a-z]{3,4})(?=[/?])', stream_url)
+ if mobj:
+ for k, v in mobj.groupdict().items():
+ if not f.get(k):
+ f[k] = v
+ format_id_list = []
+ if protocol:
+ format_id_list.append(protocol)
+ for k in ('ext', 'abr'):
+ v = f.get(k)
+ if v:
+ format_id_list.append(v)
+ abr = f.get('abr')
+ if abr:
+ f['abr'] = int(abr)
+ f.update({
+ 'format_id': '_'.join(format_id_list),
+ 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+ })
+ formats.append(f)
# New API
transcodings = try_get(
@@ -337,129 +335,165 @@ class SoundcloudIE(InfoExtractor):
if not isinstance(t, dict):
continue
format_url = url_or_none(t.get('url'))
- if not format_url:
+ if not format_url or t.get('snipped') or '/preview/' in format_url:
continue
stream = self._download_json(
- update_url_query(format_url, query), track_id, fatal=False)
+ format_url, track_id, query=query, fatal=False)
if not isinstance(stream, dict):
continue
stream_url = url_or_none(stream.get('url'))
- if not stream_url:
- continue
- if stream_url in format_urls:
+ if invalid_url(stream_url):
continue
format_urls.add(stream_url)
- protocol = try_get(t, lambda x: x['format']['protocol'], compat_str)
+ stream_format = t.get('format') or {}
+ protocol = stream_format.get('protocol')
if protocol != 'hls' and '/hls' in format_url:
protocol = 'hls'
ext = None
preset = str_or_none(t.get('preset'))
if preset:
ext = preset.split('_')[0]
- if ext not in KNOWN_EXTENSIONS:
- mimetype = try_get(
- t, lambda x: x['format']['mime_type'], compat_str)
- ext = mimetype2ext(mimetype) or 'mp3'
- format_id_list = []
- if protocol:
- format_id_list.append(protocol)
- format_id_list.append(ext)
- format_id = '_'.join(format_id_list)
- formats.append({
+ if ext not in KNOWN_EXTENSIONS:
+ ext = mimetype2ext(stream_format.get('mime_type'))
+ add_format({
'url': stream_url,
- 'format_id': format_id,
'ext': ext,
- 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
- })
+ }, 'http' if protocol == 'progressive' else protocol)
+
+ if not formats:
+ # Old API, does not work for some tracks (e.g.
+ # https://soundcloud.com/giovannisarani/mezzo-valzer)
+ # and might serve preview URLs (e.g.
+ # http://www.soundcloud.com/snbrn/ele)
+ format_dict = self._download_json(
+ track_base_url + '/streams', track_id,
+ 'Downloading track url', query=query, fatal=False) or {}
+
+ for key, stream_url in format_dict.items():
+ if invalid_url(stream_url):
+ continue
+ format_urls.add(stream_url)
+ mobj = re.search(r'(http|hls)_([^_]+)_(\d+)_url', key)
+ if mobj:
+ protocol, ext, abr = mobj.groups()
+ add_format({
+ 'abr': abr,
+ 'ext': ext,
+ 'url': stream_url,
+ }, protocol)
if not formats:
# We fallback to the stream_url in the original info, this
# cannot be always used, sometimes it can give an HTTP 404 error
- formats.append({
- 'format_id': 'fallback',
- 'url': update_url_query(info['stream_url'], query),
- 'ext': 'mp3',
- })
- self._check_formats(formats, track_id)
+ urlh = self._request_webpage(
+ HEADRequest(info.get('stream_url') or track_base_url + '/stream'),
+ track_id, query=query, fatal=False)
+ if urlh:
+ stream_url = urlh.geturl()
+ if not invalid_url(stream_url):
+ add_format({'url': stream_url}, 'http')
for f in formats:
f['vcodec'] = 'none'
self._sort_formats(formats)
- result['formats'] = formats
- return result
+ user = info.get('user') or {}
+
+ thumbnails = []
+ artwork_url = info.get('artwork_url')
+ thumbnail = artwork_url or user.get('avatar_url')
+ if isinstance(thumbnail, compat_str):
+ if re.search(self._IMAGE_REPL_RE, thumbnail):
+ for image_id, size in self._ARTWORK_MAP.items():
+ i = {
+ 'id': image_id,
+ 'url': re.sub(self._IMAGE_REPL_RE, '-%s.jpg' % image_id, thumbnail),
+ }
+ if image_id == 'tiny' and not artwork_url:
+ size = 18
+ elif image_id == 'original':
+ i['preference'] = 10
+ if size:
+ i.update({
+ 'width': size,
+ 'height': size,
+ })
+ thumbnails.append(i)
+ else:
+ thumbnails = [{'url': thumbnail}]
+
+ def extract_count(key):
+ return int_or_none(info.get('%s_count' % key))
+
+ return {
+ 'id': track_id,
+ 'uploader': user.get('username'),
+ 'uploader_id': str_or_none(user.get('id')) or user.get('permalink'),
+ 'uploader_url': user.get('permalink_url'),
+ 'timestamp': unified_timestamp(info.get('created_at')),
+ 'title': title,
+ 'description': info.get('description'),
+ 'thumbnails': thumbnails,
+ 'duration': float_or_none(info.get('duration'), 1000),
+ 'webpage_url': info.get('permalink_url'),
+ 'license': info.get('license'),
+ 'view_count': extract_count('playback'),
+ 'like_count': extract_count('favoritings') or extract_count('likes'),
+ 'comment_count': extract_count('comment'),
+ 'repost_count': extract_count('reposts'),
+ 'genre': info.get('genre'),
+ 'formats': formats
+ }
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
- if mobj is None:
- raise ExtractorError('Invalid URL: %s' % url)
+ mobj = re.match(self._VALID_URL, url)
track_id = mobj.group('track_id')
- new_info = {}
- if track_id is not None:
- info_json_url = 'https://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
+ query = {
+ 'client_id': self._CLIENT_ID,
+ }
+ if track_id:
+ info_json_url = self._API_V2_BASE + 'tracks/' + track_id
full_title = track_id
token = mobj.group('secret_token')
if token:
- info_json_url += '&secret_token=' + token
- elif mobj.group('player'):
- query = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
- real_url = query['url'][0]
- # If the token is in the query of the original url we have to
- # manually add it
- if 'secret_token' in query:
- real_url += '?secret_token=' + query['secret_token'][0]
- return self.url_result(real_url)
+ query['secret_token'] = token
else:
- # extract uploader (which is in the url)
- uploader = mobj.group('uploader')
- # extract simple title (uploader + slug of song title)
- slug_title = mobj.group('title')
+ full_title = resolve_title = '%s/%s' % mobj.group('uploader', 'title')
token = mobj.group('token')
- full_title = resolve_title = '%s/%s' % (uploader, slug_title)
if token:
resolve_title += '/%s' % token
+ info_json_url = self._resolv_url(self._BASE_URL + resolve_title)
- webpage = self._download_webpage(url, full_title, fatal=False)
- if webpage:
- entries = self._parse_json(
- self._search_regex(
- r'var\s+c\s*=\s*(\[.+?\])\s*,\s*o\s*=Date\b', webpage,
- 'data', default='[]'), full_title, fatal=False)
- if entries:
- for e in entries:
- if not isinstance(e, dict):
- continue
- if e.get('id') != 67:
- continue
- data = try_get(e, lambda x: x['data'][0], dict)
- if data:
- new_info = data
- break
- info_json_url = self._resolv_url(
- 'https://soundcloud.com/%s' % resolve_title)
-
- # Contains some additional info missing from new_info
+ version = 2
info = self._download_json(
- info_json_url, full_title, 'Downloading info JSON')
+ info_json_url, full_title, 'Downloading info JSON', query=query, fatal=False)
+ if not info:
+ info = self._download_json(
+ info_json_url.replace(self._API_V2_BASE, self._API_BASE),
+ full_title, 'Downloading info JSON', query=query)
+ version = 1
- return self._extract_info_dict(
- merge_dicts(info, new_info), full_title, secret_token=token)
+ return self._extract_info_dict(info, full_title, token, version)
class SoundcloudPlaylistBaseIE(SoundcloudIE):
- @staticmethod
- def _extract_id(e):
- return compat_str(e['id']) if e.get('id') else None
-
- def _extract_track_entries(self, tracks):
- return [
- self.url_result(
- track['permalink_url'], SoundcloudIE.ie_key(),
- video_id=self._extract_id(track))
- for track in tracks if track.get('permalink_url')]
+ def _extract_track_entries(self, tracks, token=None):
+ entries = []
+ for track in tracks:
+ track_id = str_or_none(track.get('id'))
+ url = track.get('permalink_url')
+ if not url:
+ if not track_id:
+ continue
+ url = self._API_V2_BASE + 'tracks/' + track_id
+ if token:
+ url += '?secret_token=' + token
+ entries.append(self.url_result(
+ url, SoundcloudIE.ie_key(), track_id))
+ return entries
class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
@@ -480,41 +514,28 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- # extract uploader (which is in the url)
- uploader = mobj.group('uploader')
- # extract simple title (uploader + slug of song title)
- slug_title = mobj.group('slug_title')
- full_title = '%s/sets/%s' % (uploader, slug_title)
- url = 'https://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
-
+ full_title = '%s/sets/%s' % mobj.group('uploader', 'slug_title')
token = mobj.group('token')
if token:
full_title += '/' + token
- url += '/' + token
- resolv_url = self._resolv_url(url)
- info = self._download_json(resolv_url, full_title)
+ info = self._download_json(self._resolv_url(
+ self._BASE_URL + full_title), full_title)
if 'errors' in info:
msgs = (compat_str(err['error_message']) for err in info['errors'])
raise ExtractorError('unable to download video webpage: %s' % ','.join(msgs))
- entries = self._extract_track_entries(info['tracks'])
+ entries = self._extract_track_entries(info['tracks'], token)
- return {
- '_type': 'playlist',
- 'entries': entries,
- 'id': '%s' % info['id'],
- 'title': info['title'],
- }
+ return self.playlist_result(
+ entries, str_or_none(info.get('id')), info.get('title'))
class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
- _API_V2_BASE = 'https://api-v2.soundcloud.com'
-
def _extract_playlist(self, base_url, playlist_id, playlist_title):
COMMON_QUERY = {
- 'limit': 50,
+ 'limit': 2000000000,
'client_id': self._CLIENT_ID,
'linked_partitioning': '1',
}
@@ -522,12 +543,13 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
query = COMMON_QUERY.copy()
query['offset'] = 0
- next_href = base_url + '?' + compat_urllib_parse_urlencode(query)
+ next_href = base_url
entries = []
for i in itertools.count():
response = self._download_json(
- next_href, playlist_id, 'Downloading track page %s' % (i + 1))
+ next_href, playlist_id,
+ 'Downloading track page %s' % (i + 1), query=query)
collection = response['collection']
@@ -546,9 +568,8 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
continue
return self.url_result(
permalink_url,
- ie=SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
- video_id=self._extract_id(cand),
- video_title=cand.get('title'))
+ SoundcloudIE.ie_key() if SoundcloudIE.suitable(permalink_url) else None,
+ str_or_none(cand.get('id')), cand.get('title'))
for e in collection:
entry = resolve_entry((e, e.get('track'), e.get('playlist')))
@@ -559,11 +580,10 @@ class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
if not next_href:
break
- parsed_next_href = compat_urlparse.urlparse(response['next_href'])
- qs = compat_urlparse.parse_qs(parsed_next_href.query)
- qs.update(COMMON_QUERY)
- next_href = compat_urlparse.urlunparse(
- parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True)))
+ next_href = response['next_href']
+ parsed_next_href = compat_urlparse.urlparse(next_href)
+ query = compat_urlparse.parse_qs(parsed_next_href.query)
+ query.update(COMMON_QUERY)
return {
'_type': 'playlist',
@@ -609,7 +629,7 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
'url': 'https://soundcloud.com/jcv246/sets',
'info_dict': {
'id': '12982173',
- 'title': 'Jordi / cv (Playlists)',
+ 'title': 'Jordi / cv (Sets)',
},
'playlist_mincount': 2,
}, {
@@ -636,39 +656,29 @@ class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
}]
_BASE_URL_MAP = {
- 'all': '%s/stream/users/%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
- 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
- 'albums': '%s/users/%%s/albums' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
- 'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
- 'reposts': '%s/stream/users/%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
- 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
- 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
- }
-
- _TITLE_MAP = {
- 'all': 'All',
- 'tracks': 'Tracks',
- 'albums': 'Albums',
- 'sets': 'Playlists',
- 'reposts': 'Reposts',
- 'likes': 'Likes',
- 'spotlight': 'Spotlight',
+ 'all': 'stream/users/%s',
+ 'tracks': 'users/%s/tracks',
+ 'albums': 'users/%s/albums',
+ 'sets': 'users/%s/playlists',
+ 'reposts': 'stream/users/%s/reposts',
+ 'likes': 'users/%s/likes',
+ 'spotlight': 'users/%s/spotlight',
}
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
uploader = mobj.group('user')
- url = 'https://soundcloud.com/%s/' % uploader
- resolv_url = self._resolv_url(url)
user = self._download_json(
- resolv_url, uploader, 'Downloading user info')
+ self._resolv_url(self._BASE_URL + uploader),
+ uploader, 'Downloading user info')
resource = mobj.group('rsrc') or 'all'
return self._extract_playlist(
- self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']),
- '%s (%s)' % (user['username'], self._TITLE_MAP[resource]))
+ self._API_V2_BASE + self._BASE_URL_MAP[resource] % user['id'],
+ str_or_none(user.get('id')),
+ '%s (%s)' % (user['username'], resource.capitalize()))
class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
@@ -678,7 +688,7 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
'info_dict': {
'id': '286017854',
- 'title': 'Track station: your-text',
+ 'title': 'Track station: your text',
},
'playlist_mincount': 47,
}]
@@ -686,19 +696,17 @@ class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
def _real_extract(self, url):
track_name = self._match_id(url)
- webpage = self._download_webpage(url, track_name)
-
+ track = self._download_json(self._resolv_url(url), track_name)
track_id = self._search_regex(
- r'soundcloud:track-stations:(\d+)', webpage, 'track id')
+ r'soundcloud:track-stations:(\d+)', track['id'], 'track id')
return self._extract_playlist(
- '%s/stations/soundcloud:track-stations:%s/tracks'
- % (self._API_V2_BASE, track_id),
- track_id, 'Track station: %s' % track_name)
+ self._API_V2_BASE + 'stations/%s/tracks' % track['id'],
+ track_id, 'Track station: %s' % track['title'])
class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
- _VALID_URL = r'https?://api\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
+ _VALID_URL = r'https?://api(?:-v2)?\.soundcloud\.com/playlists/(?P<id>[0-9]+)(?:/?\?secret_token=(?P<token>[^&]+?))?$'
IE_NAME = 'soundcloud:playlist'
_TESTS = [{
'url': 'https://api.soundcloud.com/playlists/4110309',
@@ -713,29 +721,22 @@ class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
playlist_id = mobj.group('id')
- base_url = '%s//api.soundcloud.com/playlists/%s.json?' % (self.http_scheme(), playlist_id)
- data_dict = {
+ query = {
'client_id': self._CLIENT_ID,
}
token = mobj.group('token')
-
if token:
- data_dict['secret_token'] = token
+ query['secret_token'] = token
- data = compat_urllib_parse_urlencode(data_dict)
data = self._download_json(
- base_url + data, playlist_id, 'Downloading playlist')
+ self._API_V2_BASE + 'playlists/' + playlist_id,
+ playlist_id, 'Downloading playlist', query=query)
- entries = self._extract_track_entries(data['tracks'])
+ entries = self._extract_track_entries(data['tracks'], token)
- return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'title': data.get('title'),
- 'description': data.get('description'),
- 'entries': entries,
- }
+ return self.playlist_result(
+ entries, playlist_id, data.get('title'), data.get('description'))
class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
@@ -753,18 +754,18 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
_SEARCH_KEY = 'scsearch'
_MAX_RESULTS_PER_PAGE = 200
_DEFAULT_RESULTS_PER_PAGE = 50
- _API_V2_BASE = 'https://api-v2.soundcloud.com'
def _get_collection(self, endpoint, collection_id, **query):
limit = min(
query.get('limit', self._DEFAULT_RESULTS_PER_PAGE),
self._MAX_RESULTS_PER_PAGE)
- query['limit'] = limit
- query['client_id'] = self._CLIENT_ID
- query['linked_partitioning'] = '1'
- query['offset'] = 0
- data = compat_urllib_parse_urlencode(query)
- next_url = '{0}{1}?{2}'.format(self._API_V2_BASE, endpoint, data)
+ query.update({
+ 'limit': limit,
+ 'client_id': self._CLIENT_ID,
+ 'linked_partitioning': 1,
+ 'offset': 0,
+ })
+ next_url = update_url_query(self._API_V2_BASE + endpoint, query)
collected_results = 0
@@ -791,5 +792,5 @@ class SoundcloudSearchIE(SearchInfoExtractor, SoundcloudIE):
break
def _get_n_results(self, query, n):
- tracks = self._get_collection('/search/tracks', query, limit=n, q=query)
+ tracks = self._get_collection('search/tracks', query, limit=n, q=query)
return self.playlist_result(tracks, playlist_title=query)
diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py
deleted file mode 100644
index f1e17dd88..000000000
--- a/youtube_dl/extractor/streamango.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_chr
-from ..utils import (
- determine_ext,
- ExtractorError,
- int_or_none,
- js_to_json,
-)
-
-
-class StreamangoIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?(?:streamango\.com|fruithosts\.net|streamcherry\.com)/(?:f|embed)/(?P<id>[^/?#&]+)'
- _TESTS = [{
- 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',
- 'md5': 'e992787515a182f55e38fc97588d802a',
- 'info_dict': {
- 'id': 'clapasobsptpkdfe',
- 'ext': 'mp4',
- 'title': '20170315_150006.mp4',
- }
- }, {
- # no og:title
- 'url': 'https://streamango.com/embed/foqebrpftarclpob/asdf_asd_2_mp4',
- 'info_dict': {
- 'id': 'foqebrpftarclpob',
- 'ext': 'mp4',
- 'title': 'foqebrpftarclpob',
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'gone',
- }, {
- 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
- 'only_matching': True,
- }, {
- 'url': 'https://fruithosts.net/f/mreodparcdcmspsm/w1f1_r4lph_2018_brrs_720p_latino_mp4',
- 'only_matching': True,
- }, {
- 'url': 'https://streamcherry.com/f/clapasobsptpkdfe/',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- def decrypt_src(encoded, val):
- ALPHABET = '=/+9876543210zyxwvutsrqponmlkjihgfedcbaZYXWVUTSRQPONMLKJIHGFEDCBA'
- encoded = re.sub(r'[^A-Za-z0-9+/=]', '', encoded)
- decoded = ''
- sm = [None] * 4
- i = 0
- str_len = len(encoded)
- while i < str_len:
- for j in range(4):
- sm[j % 4] = ALPHABET.index(encoded[i])
- i += 1
- char_code = ((sm[0] << 0x2) | (sm[1] >> 0x4)) ^ val
- decoded += compat_chr(char_code)
- if sm[2] != 0x40:
- char_code = ((sm[1] & 0xf) << 0x4) | (sm[2] >> 0x2)
- decoded += compat_chr(char_code)
- if sm[3] != 0x40:
- char_code = ((sm[2] & 0x3) << 0x6) | sm[3]
- decoded += compat_chr(char_code)
- return decoded
-
- video_id = self._match_id(url)
-
- webpage = self._download_webpage(url, video_id)
-
- title = self._og_search_title(webpage, default=video_id)
-
- formats = []
- for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage):
- mobj = re.search(r'(src\s*:\s*[^(]+\(([^)]*)\)[\s,]*)', format_)
- if mobj is None:
- continue
-
- format_ = format_.replace(mobj.group(0), '')
-
- video = self._parse_json(
- format_, video_id, transform_source=js_to_json,
- fatal=False) or {}
-
- mobj = re.search(
- r'([\'"])(?P<src>(?:(?!\1).)+)\1\s*,\s*(?P<val>\d+)',
- mobj.group(1))
- if mobj is None:
- continue
-
- src = decrypt_src(mobj.group('src'), int_or_none(mobj.group('val')))
- if not src:
- continue
-
- ext = determine_ext(src, default_ext=None)
- if video.get('type') == 'application/dash+xml' or ext == 'mpd':
- formats.extend(self._extract_mpd_formats(
- src, video_id, mpd_id='dash', fatal=False))
- else:
- formats.append({
- 'url': src,
- 'ext': ext or 'mp4',
- 'width': int_or_none(video.get('width')),
- 'height': int_or_none(video.get('height')),
- 'tbr': int_or_none(video.get('bitrate')),
- })
-
- if not formats:
- error = self._search_regex(
- r'<p[^>]+\bclass=["\']lead[^>]+>(.+?)</p>', webpage,
- 'error', default=None)
- if not error and '>Sorry' in webpage:
- error = 'Video %s is not available' % video_id
- if error:
- raise ExtractorError(error, expected=True)
-
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'url': url,
- 'title': title,
- 'formats': formats,
- }
diff --git a/youtube_dl/extractor/stv.py b/youtube_dl/extractor/stv.py
index ccb074cd4..bae8b71f4 100644
--- a/youtube_dl/extractor/stv.py
+++ b/youtube_dl/extractor/stv.py
@@ -4,15 +4,10 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_parse_qs,
- compat_urllib_parse_urlparse
-)
from ..utils import (
- extract_attributes,
+ compat_str,
float_or_none,
int_or_none,
- str_or_none,
)
@@ -20,20 +15,20 @@ class STVPlayerIE(InfoExtractor):
IE_NAME = 'stv:player'
_VALID_URL = r'https?://player\.stv\.tv/(?P<type>episode|video)/(?P<id>[a-z0-9]{4})'
_TEST = {
- 'url': 'https://player.stv.tv/video/7srz/victoria/interview-with-the-cast-ahead-of-new-victoria/',
- 'md5': '2ad867d4afd641fa14187596e0fbc91b',
+ 'url': 'https://player.stv.tv/video/4gwd/emmerdale/60-seconds-on-set-with-laura-norton/',
+ 'md5': '5adf9439c31d554f8be0707c7abe7e0a',
'info_dict': {
- 'id': '6016487034001',
+ 'id': '5333973339001',
'ext': 'mp4',
- 'upload_date': '20190321',
- 'title': 'Interview with the cast ahead of new Victoria',
- 'description': 'Nell Hudson and Lily Travers tell us what to expect in the new season of Victoria.',
- 'timestamp': 1553179628,
+ 'upload_date': '20170301',
+ 'title': '60 seconds on set with Laura Norton',
+ 'description': "How many questions can Laura - a.k.a Kerry Wyatt - answer in 60 seconds? Let\'s find out!",
+ 'timestamp': 1488388054,
'uploader_id': '1486976045',
},
'skip': 'this resource is unavailable outside of the UK',
}
- _PUBLISHER_ID = '1486976045'
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1486976045/default_default/index.html?videoId=%s'
_PTYPE_MAP = {
'episode': 'episodes',
'video': 'shortform',
@@ -41,54 +36,32 @@ class STVPlayerIE(InfoExtractor):
def _real_extract(self, url):
ptype, video_id = re.match(self._VALID_URL, url).groups()
- webpage = self._download_webpage(url, video_id)
+ resp = self._download_json(
+ 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], video_id),
+ video_id)
- qs = compat_parse_qs(compat_urllib_parse_urlparse(self._search_regex(
- r'itemprop="embedURL"[^>]+href="([^"]+)',
- webpage, 'embed URL', default=None)).query)
- publisher_id = qs.get('publisherID', [None])[0] or self._PUBLISHER_ID
+ result = resp['results']
+ video = result['video']
+ video_id = compat_str(video['id'])
- player_attr = extract_attributes(self._search_regex(
- r'(<[^>]+class="bcplayer"[^>]+>)', webpage, 'player', default=None)) or {}
+ subtitles = {}
+ _subtitles = result.get('_subtitles') or {}
+ for ext, sub_url in _subtitles.items():
+ subtitles.setdefault('en', []).append({
+ 'ext': 'vtt' if ext == 'webvtt' else ext,
+ 'url': sub_url,
+ })
- info = {}
- duration = ref_id = series = video_id = None
- api_ref_id = player_attr.get('data-player-api-refid')
- if api_ref_id:
- resp = self._download_json(
- 'https://player.api.stv.tv/v1/%s/%s' % (self._PTYPE_MAP[ptype], api_ref_id),
- api_ref_id, fatal=False)
- if resp:
- result = resp.get('results') or {}
- video = result.get('video') or {}
- video_id = str_or_none(video.get('id'))
- ref_id = video.get('guid')
- duration = video.get('length')
- programme = result.get('programme') or {}
- series = programme.get('name') or programme.get('shortName')
- subtitles = {}
- _subtitles = result.get('_subtitles') or {}
- for ext, sub_url in _subtitles.items():
- subtitles.setdefault('en', []).append({
- 'ext': 'vtt' if ext == 'webvtt' else ext,
- 'url': sub_url,
- })
- info.update({
- 'description': result.get('summary'),
- 'subtitles': subtitles,
- 'view_count': int_or_none(result.get('views')),
- })
- if not video_id:
- video_id = qs.get('videoId', [None])[0] or self._search_regex(
- r'<link\s+itemprop="url"\s+href="(\d+)"',
- webpage, 'video id', default=None) or 'ref:' + (ref_id or player_attr['data-refid'])
+ programme = result.get('programme') or {}
- info.update({
+ return {
'_type': 'url_transparent',
- 'duration': float_or_none(duration or player_attr.get('data-duration'), 1000),
'id': video_id,
+ 'url': self.BRIGHTCOVE_URL_TEMPLATE % video_id,
+ 'description': result.get('summary'),
+ 'duration': float_or_none(video.get('length'), 1000),
+ 'subtitles': subtitles,
+ 'view_count': int_or_none(result.get('views')),
+ 'series': programme.get('name') or programme.get('shortName'),
'ie_key': 'BrightcoveNew',
- 'series': series or player_attr.get('data-programme-name'),
- 'url': 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s' % (publisher_id, video_id),
- })
- return info
+ }
diff --git a/youtube_dl/extractor/teachingchannel.py b/youtube_dl/extractor/teachingchannel.py
index e89759714..624cdb3ad 100644
--- a/youtube_dl/extractor/teachingchannel.py
+++ b/youtube_dl/extractor/teachingchannel.py
@@ -1,35 +1,33 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
-from .ooyala import OoyalaIE
class TeachingChannelIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?teachingchannel\.org/videos/(?P<title>.+)'
+ _VALID_URL = r'https?://(?:www\.)?teachingchannel\.org/videos?/(?P<id>[^/?&#]+)'
_TEST = {
'url': 'https://www.teachingchannel.org/videos/teacher-teaming-evolution',
- 'md5': '3d6361864d7cac20b57c8784da17166f',
'info_dict': {
- 'id': 'F3bnlzbToeI6pLEfRyrlfooIILUjz4nM',
+ 'id': '3swwlzkT',
'ext': 'mp4',
'title': 'A History of Teaming',
'description': 'md5:2a9033db8da81f2edffa4c99888140b3',
- 'duration': 422.255,
+ 'duration': 422,
+ 'upload_date': '20170316',
+ 'timestamp': 1489691297,
},
'params': {
'skip_download': True,
},
- 'add_ie': ['Ooyala'],
+ 'add_ie': ['JWPlatform'],
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- title = mobj.group('title')
- webpage = self._download_webpage(url, title)
- ooyala_code = self._search_regex(
- r'data-embed-code=\'(.+?)\'', webpage, 'ooyala code')
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+ mid = self._search_regex(
+ r'(?:data-mid=["\']|id=["\']jw-video-player-)([a-zA-Z0-9]{8})',
+ webpage, 'media id')
- return OoyalaIE._build_url_result(ooyala_code)
+ return self.url_result('jwplatform:' + mid, 'JWPlatform', mid)
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py
index 7640cf00a..5793b711f 100644
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -84,6 +84,19 @@ class TeamcocoIE(TurnerBaseIE):
'only_matching': True,
}
]
+ _RECORD_TEMPL = '''id
+ title
+ teaser
+ publishOn
+ thumb {
+ preview
+ }
+ tags {
+ name
+ }
+ duration
+ turnerMediaId
+ turnerMediaAuthToken'''
def _graphql_call(self, query_template, object_type, object_id):
find_object = 'find' + object_type
@@ -98,36 +111,36 @@ class TeamcocoIE(TurnerBaseIE):
display_id = self._match_id(url)
response = self._graphql_call('''{
- %s(slug: "%s") {
+ %%s(slug: "%%s") {
... on RecordSlug {
record {
+ %s
+ }
+ }
+ ... on PageSlug {
+ child {
id
- title
- teaser
- publishOn
- thumb {
- preview
- }
- file {
- url
- }
- tags {
- name
- }
- duration
- turnerMediaId
- turnerMediaAuthToken
}
}
... on NotFoundSlug {
status
}
}
-}''', 'Slug', display_id)
+}''' % self._RECORD_TEMPL, 'Slug', display_id)
if response.get('status'):
raise ExtractorError('This video is no longer available.', expected=True)
- record = response['record']
+ child = response.get('child')
+ if child:
+ record = self._graphql_call('''{
+ %%s(id: "%%s") {
+ ... on Video {
+ %s
+ }
+ }
+}''' % self._RECORD_TEMPL, 'Record', child['id'])
+ else:
+ record = response['record']
video_id = record['id']
info = {
@@ -150,25 +163,21 @@ class TeamcocoIE(TurnerBaseIE):
'accessTokenType': 'jws',
}))
else:
- d = self._download_json(
+ video_sources = self._download_json(
'https://teamcoco.com/_truman/d/' + video_id,
- video_id, fatal=False) or {}
- video_sources = d.get('meta') or {}
- if not video_sources:
- video_sources = self._graphql_call('''{
- %s(id: "%s") {
- src
- }
-}''', 'RecordVideoSource', video_id) or {}
+ video_id)['meta']['src']
+ if isinstance(video_sources, dict):
+ video_sources = video_sources.values()
formats = []
get_quality = qualities(['low', 'sd', 'hd', 'uhd'])
- for format_id, src in video_sources.get('src', {}).items():
+ for src in video_sources:
if not isinstance(src, dict):
continue
src_url = src.get('src')
if not src_url:
continue
+ format_id = src.get('label')
ext = determine_ext(src_url, mimetype2ext(src.get('type')))
if format_id == 'hls' or ext == 'm3u8':
# compat_urllib_parse.urljoin does not work here
@@ -190,9 +199,6 @@ class TeamcocoIE(TurnerBaseIE):
'format_id': format_id,
'quality': get_quality(format_id),
})
- if not formats:
- formats = self._extract_m3u8_formats(
- record['file']['url'], video_id, 'mp4', fatal=False)
self._sort_formats(formats)
info['formats'] = formats
diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py
index 0f576c1ab..2dc020537 100644
--- a/youtube_dl/extractor/telegraaf.py
+++ b/youtube_dl/extractor/telegraaf.py
@@ -4,21 +4,25 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
determine_ext,
- remove_end,
+ int_or_none,
+ parse_iso8601,
+ try_get,
)
class TelegraafIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/tv/(?:[^/]+/)+(?P<id>\d+)/[^/]+\.html'
+ _VALID_URL = r'https?://(?:www\.)?telegraaf\.nl/video/(?P<id>\d+)'
_TEST = {
- 'url': 'http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html',
+ 'url': 'https://www.telegraaf.nl/video/734366489/historisch-scheepswrak-slaat-na-100-jaar-los',
'info_dict': {
- 'id': '24353229',
+ 'id': 'gaMItuoSeUg2',
'ext': 'mp4',
- 'title': 'Tikibad ontruimd wegens brand',
- 'description': 'md5:05ca046ff47b931f9b04855015e163a4',
- 'thumbnail': r're:^https?://.*\.jpg$',
- 'duration': 33,
+ 'title': 'Historisch scheepswrak slaat na 100 jaar los',
+ 'description': 'md5:6f53b7c4f55596722ac24d6c0ec00cfb',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 55,
+ 'timestamp': 1572805527,
+ 'upload_date': '20191103',
},
'params': {
# m3u8 download
@@ -27,23 +31,30 @@ class TelegraafIE(InfoExtractor):
}
def _real_extract(self, url):
- video_id = self._match_id(url)
+ article_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ video_id = self._download_json(
+ 'https://www.telegraaf.nl/graphql', article_id, query={
+ 'query': '''{
+ article(uid: %s) {
+ videos {
+ videoId
+ }
+ }
+}''' % article_id,
+ })['data']['article']['videos'][0]['videoId']
- player_url = self._html_search_regex(
- r'<iframe[^>]+src="([^"]+")', webpage, 'player URL')
- player_page = self._download_webpage(
- player_url, video_id, note='Download player webpage')
- playlist_url = self._search_regex(
- r'playlist\s*:\s*"([^"]+)"', player_page, 'playlist URL')
- playlist_data = self._download_json(playlist_url, video_id)
+ item = self._download_json(
+ 'https://content.tmgvideo.nl/playlist/item=%s/playlist.json' % video_id,
+ video_id)['items'][0]
+ title = item['title']
- item = playlist_data['items'][0]
formats = []
- locations = item['locations']
+ locations = item.get('locations') or {}
for location in locations.get('adaptive', []):
- manifest_url = location['src']
+ manifest_url = location.get('src')
+ if not manifest_url:
+ continue
ext = determine_ext(manifest_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
@@ -54,25 +65,25 @@ class TelegraafIE(InfoExtractor):
else:
self.report_warning('Unknown adaptive format %s' % ext)
for location in locations.get('progressive', []):
+ src = try_get(location, lambda x: x['sources'][0]['src'])
+ if not src:
+ continue
+ label = location.get('label')
formats.append({
- 'url': location['sources'][0]['src'],
- 'width': location.get('width'),
- 'height': location.get('height'),
- 'format_id': 'http-%s' % location['label'],
+ 'url': src,
+ 'width': int_or_none(location.get('width')),
+ 'height': int_or_none(location.get('height')),
+ 'format_id': 'http' + ('-%s' % label if label else ''),
})
self._sort_formats(formats)
- title = remove_end(self._og_search_title(webpage), ' - VIDEO')
- description = self._og_search_description(webpage)
- duration = item.get('duration')
- thumbnail = item.get('poster')
-
return {
'id': video_id,
'title': title,
- 'description': description,
+ 'description': item.get('description'),
'formats': formats,
- 'duration': duration,
- 'thumbnail': thumbnail,
+ 'duration': int_or_none(item.get('duration')),
+ 'thumbnail': item.get('poster'),
+ 'timestamp': parse_iso8601(item.get('datecreated'), ' '),
}
diff --git a/youtube_dl/extractor/tenplay.py b/youtube_dl/extractor/tenplay.py
new file mode 100644
index 000000000..dff44a4e2
--- /dev/null
+++ b/youtube_dl/extractor/tenplay.py
@@ -0,0 +1,55 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_age_limit,
+ parse_iso8601,
+ smuggle_url,
+)
+
+
+class TenPlayIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?10play\.com\.au/[^/]+/episodes/[^/]+/[^/]+/(?P<id>tpv\d{6}[a-z]{5})'
+ _TEST = {
+ 'url': 'https://10play.com.au/masterchef/episodes/season-1/masterchef-s1-ep-1/tpv190718kwzga',
+ 'info_dict': {
+ 'id': '6060533435001',
+ 'ext': 'mp4',
+ 'title': 'MasterChef - S1 Ep. 1',
+ 'description': 'md5:4fe7b78e28af8f2d900cd20d900ef95c',
+ 'age_limit': 10,
+ 'timestamp': 1240828200,
+ 'upload_date': '20090427',
+ 'uploader_id': '2199827728001',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ }
+ }
+ BRIGHTCOVE_URL_TEMPLATE = 'https://players.brightcove.net/2199827728001/cN6vRtRQt_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ content_id = self._match_id(url)
+ data = self._download_json(
+ 'https://10play.com.au/api/video/' + content_id, content_id)
+ video = data.get('video') or {}
+ metadata = data.get('metaData') or {}
+ brightcove_id = video.get('videoId') or metadata['showContentVideoId']
+ brightcove_url = smuggle_url(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+ {'geo_countries': ['AU']})
+
+ return {
+ '_type': 'url_transparent',
+ 'url': brightcove_url,
+ 'id': content_id,
+ 'title': video.get('title') or metadata.get('pageContentName') or metadata.get('showContentName'),
+ 'description': video.get('description'),
+ 'age_limit': parse_age_limit(video.get('showRatingClassification') or metadata.get('showProgramClassification')),
+ 'series': metadata.get('showName'),
+ 'season': metadata.get('showContentSeason'),
+ 'timestamp': parse_iso8601(metadata.get('contentPublishDate') or metadata.get('pageContentPublishDate')),
+ 'ie_key': 'BrightcoveNew',
+ }
diff --git a/youtube_dl/extractor/thesun.py b/youtube_dl/extractor/thesun.py
index 22d003776..15d4a6932 100644
--- a/youtube_dl/extractor/thesun.py
+++ b/youtube_dl/extractor/thesun.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from .ooyala import OoyalaIE
+from ..utils import extract_attributes
class TheSunIE(InfoExtractor):
@@ -16,6 +16,7 @@ class TheSunIE(InfoExtractor):
},
'playlist_count': 2,
}
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
def _real_extract(self, url):
article_id = self._match_id(url)
@@ -23,10 +24,15 @@ class TheSunIE(InfoExtractor):
webpage = self._download_webpage(url, article_id)
entries = []
- for ooyala_id in re.findall(
- r'<[^>]+\b(?:id\s*=\s*"thesun-ooyala-player-|data-content-id\s*=\s*")([^"]+)',
+ for video in re.findall(
+ r'<video[^>]+data-video-id-pending=[^>]+>',
webpage):
- entries.append(OoyalaIE._build_url_result(ooyala_id))
+ attrs = extract_attributes(video)
+ video_id = attrs['data-video-id-pending']
+ account_id = attrs.get('data-account', '5067014667001')
+ entries.append(self.url_result(
+ self.BRIGHTCOVE_URL_TEMPLATE % (account_id, video_id),
+ 'BrightcoveNew', video_id))
return self.playlist_result(
entries, article_id, self._og_search_title(webpage, fatal=False))
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py
deleted file mode 100644
index 362318b24..000000000
--- a/youtube_dl/extractor/tutv.py
+++ /dev/null
@@ -1,36 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..compat import (
- compat_b64decode,
- compat_parse_qs,
-)
-
-
-class TutvIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?tu\.tv/videos/(?P<id>[^/?]+)'
- _TEST = {
- 'url': 'http://tu.tv/videos/robots-futbolistas',
- 'md5': '0cd9e28ad270488911b0d2a72323395d',
- 'info_dict': {
- 'id': '2973058',
- 'ext': 'mp4',
- 'title': 'Robots futbolistas',
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, 'internal video ID')
-
- data_content = self._download_webpage(
- 'http://tu.tv/flvurl.php?codVideo=%s' % internal_id, video_id, 'Downloading video info')
- video_url = compat_b64decode(compat_parse_qs(data_content)['kpt'][0]).decode('utf-8')
-
- return {
- 'id': internal_id,
- 'url': video_url,
- 'title': self._og_search_title(webpage),
- }
diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py
index d5071e8a5..4a19b9be6 100644
--- a/youtube_dl/extractor/tv2.py
+++ b/youtube_dl/extractor/tv2.py
@@ -4,13 +4,17 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_HTTPError
from ..utils import (
determine_ext,
+ ExtractorError,
int_or_none,
float_or_none,
js_to_json,
parse_iso8601,
remove_end,
+ strip_or_none,
+ try_get,
)
@@ -20,7 +24,7 @@ class TV2IE(InfoExtractor):
'url': 'http://www.tv2.no/v/916509/',
'info_dict': {
'id': '916509',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Se Frode Gryttens hyllest av Steven Gerrard',
'description': 'TV 2 Sportens huspoet tar avskjed med Liverpools kaptein Steven Gerrard.',
'timestamp': 1431715610,
@@ -29,22 +33,40 @@ class TV2IE(InfoExtractor):
'view_count': int,
'categories': list,
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
}
+ _API_DOMAIN = 'sumo.tv2.no'
+ _PROTOCOLS = ('HDS', 'HLS', 'DASH')
+ _GEO_COUNTRIES = ['NO']
def _real_extract(self, url):
video_id = self._match_id(url)
+ api_base = 'http://%s/api/web/asset/%s' % (self._API_DOMAIN, video_id)
formats = []
format_urls = []
- for protocol in ('HDS', 'HLS'):
- data = self._download_json(
- 'http://sumo.tv2.no/api/web/asset/%s/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % (video_id, protocol),
- video_id, 'Downloading play JSON')['playback']
- for item in data['items']['item']:
+ for protocol in self._PROTOCOLS:
+ try:
+ data = self._download_json(
+ api_base + '/play.json?protocol=%s&videoFormat=SMIL+ISMUSP' % protocol,
+ video_id, 'Downloading play JSON')['playback']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ error = self._parse_json(e.cause.read().decode(), video_id)['error']
+ error_code = error.get('code')
+ if error_code == 'ASSET_PLAYBACK_INVALID_GEO_LOCATION':
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+ elif error_code == 'SESSION_NOT_AUTHENTICATED':
+ self.raise_login_required()
+ raise ExtractorError(error['description'])
+ raise
+ items = try_get(data, lambda x: x['items']['item'])
+ if not items:
+ continue
+ if not isinstance(items, list):
+ items = [items]
+ for item in items:
+ if not isinstance(item, dict):
+ continue
video_url = item.get('url')
if not video_url or video_url in format_urls:
continue
@@ -57,9 +79,13 @@ class TV2IE(InfoExtractor):
formats.extend(self._extract_f4m_formats(
video_url, video_id, f4m_id=format_id, fatal=False))
elif ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- video_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id=format_id, fatal=False))
+ if not data.get('drmProtected'):
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id=format_id, fatal=False))
+ elif ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ video_url, video_id, format_id, fatal=False))
elif ext == 'ism' or video_url.endswith('.ism/Manifest'):
pass
else:
@@ -69,34 +95,30 @@ class TV2IE(InfoExtractor):
'tbr': int_or_none(item.get('bitrate')),
'filesize': int_or_none(item.get('fileSize')),
})
+ if not formats and data.get('drmProtected'):
+ raise ExtractorError('This video is DRM protected.', expected=True)
self._sort_formats(formats)
asset = self._download_json(
- 'http://sumo.tv2.no/api/web/asset/%s.json' % video_id,
- video_id, 'Downloading metadata JSON')['asset']
-
+ api_base + '.json', video_id,
+ 'Downloading metadata JSON')['asset']
title = asset['title']
- description = asset.get('description')
- timestamp = parse_iso8601(asset.get('createTime'))
- duration = float_or_none(asset.get('accurateDuration') or asset.get('duration'))
- view_count = int_or_none(asset.get('views'))
- categories = asset.get('keywords', '').split(',')
thumbnails = [{
'id': thumbnail.get('@type'),
'url': thumbnail.get('url'),
- } for _, thumbnail in asset.get('imageVersions', {}).items()]
+ } for _, thumbnail in (asset.get('imageVersions') or {}).items()]
return {
'id': video_id,
'url': video_url,
'title': title,
- 'description': description,
+ 'description': strip_or_none(asset.get('description')),
'thumbnails': thumbnails,
- 'timestamp': timestamp,
- 'duration': duration,
- 'view_count': view_count,
- 'categories': categories,
+ 'timestamp': parse_iso8601(asset.get('createTime')),
+ 'duration': float_or_none(asset.get('accurateDuration') or asset.get('duration')),
+ 'view_count': int_or_none(asset.get('views')),
+ 'categories': asset.get('keywords', '').split(','),
'formats': formats,
}
@@ -108,7 +130,7 @@ class TV2ArticleIE(InfoExtractor):
'info_dict': {
'id': '6930542',
'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret',
- 'description': 'md5:339573779d3eea3542ffe12006190954',
+ 'description': 'De fire siktede nekter fortsatt for å ha stjålet pingvinbabyene, men innrømmer å ha åpnet luken til de små kyllingene.',
},
'playlist_count': 2,
}, {
@@ -126,7 +148,7 @@ class TV2ArticleIE(InfoExtractor):
if not assets:
# New embed pattern
- for v in re.findall(r'TV2ContentboxVideo\(({.+?})\)', webpage):
+ for v in re.findall(r'(?s)TV2ContentboxVideo\(({.+?})\)', webpage):
video = self._parse_json(
v, playlist_id, transform_source=js_to_json, fatal=False)
if not video:
@@ -143,3 +165,28 @@ class TV2ArticleIE(InfoExtractor):
description = remove_end(self._og_search_description(webpage), ' - TV2.no')
return self.playlist_result(entries, playlist_id, title, description)
+
+
+class KatsomoIE(TV2IE):
+ _VALID_URL = r'https?://(?:www\.)?(?:katsomo|mtv)\.fi/(?:#!/)?(?:[^/]+/[0-9a-z-]+-\d+/[0-9a-z-]+-|[^/]+/\d+/[^/]+/)(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://www.mtv.fi/sarja/mtv-uutiset-live-33001002003/lahden-pelicans-teki-kovan-ratkaisun-ville-nieminen-pihalle-1181321',
+ 'info_dict': {
+ 'id': '1181321',
+ 'ext': 'mp4',
+ 'title': 'MTV Uutiset Live',
+ 'description': 'Päätöksen teki Pelicansin hallitus.',
+ 'timestamp': 1575116484,
+ 'upload_date': '20191130',
+ 'duration': 37.12,
+ 'view_count': int,
+ 'categories': list,
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+ _API_DOMAIN = 'api.katsomo.fi'
+ _PROTOCOLS = ('HLS', 'MPD')
+ _GEO_COUNTRIES = ['FI']
diff --git a/youtube_dl/extractor/tv2dk.py b/youtube_dl/extractor/tv2dk.py
new file mode 100644
index 000000000..611fdc0c6
--- /dev/null
+++ b/youtube_dl/extractor/tv2dk.py
@@ -0,0 +1,154 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ extract_attributes,
+ js_to_json,
+ url_or_none,
+)
+
+
+class TV2DKIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:www\.)?
+ (?:
+ tvsyd|
+ tv2ostjylland|
+ tvmidtvest|
+ tv2fyn|
+ tv2east|
+ tv2lorry|
+ tv2nord
+ )\.dk/
+ (:[^/]+/)*
+ (?P<id>[^/?\#&]+)
+ '''
+ _TESTS = [{
+ 'url': 'https://www.tvsyd.dk/nyheder/28-10-2019/1930/1930-28-okt-2019?autoplay=1#player',
+ 'info_dict': {
+ 'id': '0_52jmwa0p',
+ 'ext': 'mp4',
+ 'title': '19:30 - 28. okt. 2019',
+ 'timestamp': 1572290248,
+ 'upload_date': '20191028',
+ 'uploader_id': 'tvsyd',
+ 'duration': 1347,
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['Kaltura'],
+ }, {
+ 'url': 'https://www.tv2ostjylland.dk/artikel/minister-gaar-ind-i-sag-om-diabetes-teknologi',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tv2ostjylland.dk/nyheder/28-10-2019/22/2200-nyhederne-mandag-d-28-oktober-2019?autoplay=1#player',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tvmidtvest.dk/nyheder/27-10-2019/1930/1930-27-okt-2019',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tv2fyn.dk/artikel/fyn-kan-faa-landets-foerste-fabrik-til-groent-jetbraendstof',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tv2east.dk/artikel/gods-faar-indleveret-tonsvis-af-aebler-100-kilo-aebler-gaar-til-en-aeblebrandy',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tv2lorry.dk/koebenhavn/rasmus-paludan-evakueret-til-egen-demonstration#player',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.tv2nord.dk/artikel/dybt-uacceptabelt',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ entries = []
+ for video_el in re.findall(r'(?s)<[^>]+\bdata-entryid\s*=[^>]*>', webpage):
+ video = extract_attributes(video_el)
+ kaltura_id = video.get('data-entryid')
+ if not kaltura_id:
+ continue
+ partner_id = video.get('data-partnerid')
+ if not partner_id:
+ continue
+ entries.append(self.url_result(
+ 'kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura',
+ video_id=kaltura_id))
+ return self.playlist_result(entries)
+
+
+class TV2DKBornholmPlayIE(InfoExtractor):
+ _VALID_URL = r'https?://play\.tv2bornholm\.dk/\?.*?\bid=(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://play.tv2bornholm.dk/?area=specifikTV&id=781021',
+ 'info_dict': {
+ 'id': '781021',
+ 'ext': 'mp4',
+ 'title': '12Nyheder-27.11.19',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://play.tv2bornholm.dk/controls/AJAX.aspx/specifikVideo', video_id,
+ data=json.dumps({
+ 'playlist_id': video_id,
+ 'serienavn': '',
+ }).encode(), headers={
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'Content-Type': 'application/json; charset=UTF-8',
+ })['d']
+
+ # TODO: generalize flowplayer
+ title = self._search_regex(
+ r'title\s*:\s*(["\'])(?P<value>(?:(?!\1).)+)\1', video, 'title',
+ group='value')
+ sources = self._parse_json(self._search_regex(
+ r'(?s)sources:\s*(\[.+?\]),', video, 'sources'),
+ video_id, js_to_json)
+
+ formats = []
+ srcs = set()
+ for source in sources:
+ src = url_or_none(source.get('src'))
+ if not src:
+ continue
+ if src in srcs:
+ continue
+ srcs.add(src)
+ ext = determine_ext(src)
+ src_type = source.get('type')
+ if src_type == 'application/x-mpegurl' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ elif src_type == 'application/dash+xml' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id='dash', fatal=False))
+ else:
+ formats.append({
+ 'url': src,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py
index ca7676fe2..a8c2502af 100644
--- a/youtube_dl/extractor/twitch.py
+++ b/youtube_dl/extractor/twitch.py
@@ -17,12 +17,10 @@ from ..compat import (
from ..utils import (
clean_html,
ExtractorError,
- float_or_none,
int_or_none,
orderedSet,
parse_duration,
parse_iso8601,
- qualities,
try_get,
unified_timestamp,
update_url_query,
@@ -327,6 +325,7 @@ class TwitchVodIE(TwitchItemBaseIE):
'allow_audio_only': 'true',
'allow_spectre': 'true',
'player': 'twitchweb',
+ 'playlist_include_framerate': 'true',
'nauth': access_token['token'],
'nauthsig': access_token['sig'],
})),
@@ -344,9 +343,8 @@ class TwitchVodIE(TwitchItemBaseIE):
info['subtitles'] = {
'rechat': [{
'url': update_url_query(
- 'https://rechat.twitch.tv/rechat-messages', {
- 'video_id': 'v%s' % item_id,
- 'start': info['timestamp'],
+ 'https://api.twitch.tv/v5/videos/%s/comments' % item_id, {
+ 'client_id': self._CLIENT_ID,
}),
'ext': 'json',
}],
@@ -599,6 +597,7 @@ class TwitchStreamIE(TwitchBaseIE):
'allow_spectre': 'true',
'p': random.randint(1000000, 10000000),
'player': 'twitchweb',
+ 'playlist_include_framerate': 'true',
'segment_preference': '4',
'sig': access_token['sig'].encode('utf-8'),
'token': access_token['token'].encode('utf-8'),
@@ -644,7 +643,7 @@ class TwitchStreamIE(TwitchBaseIE):
class TwitchClipsIE(TwitchBaseIE):
IE_NAME = 'twitch:clips'
- _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:[^/]+/)*|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:clips\.twitch\.tv/(?:embed\?.*?\bclip=|(?:[^/]+/)*)|(?:www\.)?twitch\.tv/[^/]+/clip/)(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'https://clips.twitch.tv/FaintLightGullWholeWheat',
@@ -667,68 +666,89 @@ class TwitchClipsIE(TwitchBaseIE):
}, {
'url': 'https://www.twitch.tv/sergeynixon/clip/StormyThankfulSproutFutureMan',
'only_matching': True,
+ }, {
+ 'url': 'https://clips.twitch.tv/embed?clip=InquisitiveBreakableYogurtJebaited',
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
- status = self._download_json(
- 'https://clips.twitch.tv/api/v2/clips/%s/status' % video_id,
- video_id)
+ clip = self._download_json(
+ 'https://gql.twitch.tv/gql', video_id, data=json.dumps({
+ 'query': '''{
+ clip(slug: "%s") {
+ broadcaster {
+ displayName
+ }
+ createdAt
+ curator {
+ displayName
+ id
+ }
+ durationSeconds
+ id
+ tiny: thumbnailURL(width: 86, height: 45)
+ small: thumbnailURL(width: 260, height: 147)
+ medium: thumbnailURL(width: 480, height: 272)
+ title
+ videoQualities {
+ frameRate
+ quality
+ sourceURL
+ }
+ viewCount
+ }
+}''' % video_id,
+ }).encode(), headers={
+ 'Client-ID': self._CLIENT_ID,
+ })['data']['clip']
+
+ if not clip:
+ raise ExtractorError(
+ 'This clip is no longer available', expected=True)
formats = []
-
- for option in status['quality_options']:
+ for option in clip.get('videoQualities', []):
if not isinstance(option, dict):
continue
- source = url_or_none(option.get('source'))
+ source = url_or_none(option.get('sourceURL'))
if not source:
continue
formats.append({
'url': source,
'format_id': option.get('quality'),
'height': int_or_none(option.get('quality')),
- 'fps': int_or_none(option.get('frame_rate')),
+ 'fps': int_or_none(option.get('frameRate')),
})
-
self._sort_formats(formats)
- info = {
+ thumbnails = []
+ for thumbnail_id in ('tiny', 'small', 'medium'):
+ thumbnail_url = clip.get(thumbnail_id)
+ if not thumbnail_url:
+ continue
+ thumb = {
+ 'id': thumbnail_id,
+ 'url': thumbnail_url,
+ }
+ mobj = re.search(r'-(\d+)x(\d+)\.', thumbnail_url)
+ if mobj:
+ thumb.update({
+ 'height': int(mobj.group(2)),
+ 'width': int(mobj.group(1)),
+ })
+ thumbnails.append(thumb)
+
+ return {
+ 'id': clip.get('id') or video_id,
+ 'title': clip.get('title') or video_id,
'formats': formats,
+ 'duration': int_or_none(clip.get('durationSeconds')),
+ 'views': int_or_none(clip.get('viewCount')),
+ 'timestamp': unified_timestamp(clip.get('createdAt')),
+ 'thumbnails': thumbnails,
+ 'creator': try_get(clip, lambda x: x['broadcaster']['displayName'], compat_str),
+ 'uploader': try_get(clip, lambda x: x['curator']['displayName'], compat_str),
+ 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str),
}
-
- clip = self._call_api(
- 'kraken/clips/%s' % video_id, video_id, fatal=False, headers={
- 'Accept': 'application/vnd.twitchtv.v5+json',
- })
-
- if clip:
- quality_key = qualities(('tiny', 'small', 'medium'))
- thumbnails = []
- thumbnails_dict = clip.get('thumbnails')
- if isinstance(thumbnails_dict, dict):
- for thumbnail_id, thumbnail_url in thumbnails_dict.items():
- thumbnails.append({
- 'id': thumbnail_id,
- 'url': thumbnail_url,
- 'preference': quality_key(thumbnail_id),
- })
-
- info.update({
- 'id': clip.get('tracking_id') or video_id,
- 'title': clip.get('title') or video_id,
- 'duration': float_or_none(clip.get('duration')),
- 'views': int_or_none(clip.get('views')),
- 'timestamp': unified_timestamp(clip.get('created_at')),
- 'thumbnails': thumbnails,
- 'creator': try_get(clip, lambda x: x['broadcaster']['display_name'], compat_str),
- 'uploader': try_get(clip, lambda x: x['curator']['display_name'], compat_str),
- 'uploader_id': try_get(clip, lambda x: x['curator']['id'], compat_str),
- })
- else:
- info.update({
- 'title': video_id,
- 'id': video_id,
- })
-
- return info
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
index cebb6238c..5f8d90fb4 100644
--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@@ -4,32 +4,67 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+ compat_HTTPError,
+ compat_parse_qs,
+ compat_urllib_parse_unquote,
+ compat_urllib_parse_urlparse,
+)
from ..utils import (
- determine_ext,
dict_get,
ExtractorError,
float_or_none,
int_or_none,
- remove_end,
try_get,
+ strip_or_none,
+ unified_timestamp,
+ update_url_query,
xpath_text,
)
-from .periscope import PeriscopeIE
+from .periscope import (
+ PeriscopeBaseIE,
+ PeriscopeIE,
+)
class TwitterBaseIE(InfoExtractor):
+ _API_BASE = 'https://api.twitter.com/1.1/'
+ _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/'
+ _GUEST_TOKEN = None
+
+ def _extract_variant_formats(self, variant, video_id):
+ variant_url = variant.get('url')
+ if not variant_url:
+ return []
+ elif '.m3u8' in variant_url:
+ return self._extract_m3u8_formats(
+ variant_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False)
+ else:
+ tbr = int_or_none(dict_get(variant, ('bitrate', 'bit_rate')), 1000) or None
+ f = {
+ 'url': variant_url,
+ 'format_id': 'http' + ('-%d' % tbr if tbr else ''),
+ 'tbr': tbr,
+ }
+ self._search_dimensions_in_video_url(f, variant_url)
+ return [f]
+
def _extract_formats_from_vmap_url(self, vmap_url, video_id):
vmap_data = self._download_xml(vmap_url, video_id)
- video_url = xpath_text(vmap_data, './/MediaFile').strip()
- if determine_ext(video_url) == 'm3u8':
- return self._extract_m3u8_formats(
- video_url, video_id, ext='mp4', m3u8_id='hls',
- entry_protocol='m3u8_native')
- return [{
- 'url': video_url,
- }]
+ formats = []
+ urls = []
+ for video_variant in vmap_data.findall('.//{http://twitter.com/schema/videoVMapV2.xsd}videoVariant'):
+ video_variant.attrib['url'] = compat_urllib_parse_unquote(
+ video_variant.attrib['url'])
+ urls.append(video_variant.attrib['url'])
+ formats.extend(self._extract_variant_formats(
+ video_variant.attrib, video_id))
+ video_url = strip_or_none(xpath_text(vmap_data, './/MediaFile'))
+ if video_url not in urls:
+ formats.extend(self._extract_variant_formats({'url': video_url}, video_id))
+ return formats
@staticmethod
def _search_dimensions_in_video_url(a_format, video_url):
@@ -40,10 +75,30 @@ class TwitterBaseIE(InfoExtractor):
'height': int(m.group('height')),
})
-
-class TwitterCardIE(TwitterBaseIE):
+ def _call_api(self, path, video_id, query={}):
+ headers = {
+ 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw',
+ }
+ if not self._GUEST_TOKEN:
+ self._GUEST_TOKEN = self._download_json(
+ self._API_BASE + 'guest/activate.json', video_id,
+ 'Downloading guest token', data=b'',
+ headers=headers)['guest_token']
+ headers['x-guest-token'] = self._GUEST_TOKEN
+ try:
+ return self._download_json(
+ self._API_BASE + path, video_id, headers=headers, query=query)
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ raise ExtractorError(self._parse_json(
+ e.cause.read().decode(),
+ video_id)['errors'][0]['message'], expected=True)
+ raise
+
+
+class TwitterCardIE(InfoExtractor):
IE_NAME = 'twitter:card'
- _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/(?P<path>cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)'
+ _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/(?:cards/tfw/v1|videos(?:/tweet)?)/(?P<id>\d+)'
_TESTS = [
{
'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
@@ -51,19 +106,28 @@ class TwitterCardIE(TwitterBaseIE):
'info_dict': {
'id': '560070183650213889',
'ext': 'mp4',
- 'title': 'Twitter web player',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'title': "Twitter - You can now shoot, edit and share video on Twitter. Capture life's most moving moments from your perspective.",
+ 'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96',
+ 'uploader': 'Twitter',
+ 'uploader_id': 'Twitter',
+ 'thumbnail': r're:^https?://.*\.jpg',
'duration': 30.033,
+ 'timestamp': 1422366112,
+ 'upload_date': '20150127',
},
},
{
'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
- 'md5': '7ee2a553b63d1bccba97fbed97d9e1c8',
+ 'md5': '7137eca597f72b9abbe61e5ae0161399',
'info_dict': {
'id': '623160978427936768',
'ext': 'mp4',
- 'title': 'Twitter web player',
- 'thumbnail': r're:^https?://.*$',
+ 'title': "NASA - Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video.",
+ 'description': "Fly over Pluto's icy Norgay Mountains and Sputnik Plain in this @NASANewHorizons #PlutoFlyby video. https://t.co/BJYgOjSeGA",
+ 'uploader': 'NASA',
+ 'uploader_id': 'NASA',
+ 'timestamp': 1437408129,
+ 'upload_date': '20150720',
},
},
{
@@ -75,7 +139,7 @@ class TwitterCardIE(TwitterBaseIE):
'title': 'Ubuntu 11.10 Overview',
'description': 'md5:a831e97fa384863d6e26ce48d1c43376',
'upload_date': '20111013',
- 'uploader': 'OMG! Ubuntu!',
+ 'uploader': 'OMG! UBUNTU!',
'uploader_id': 'omgubuntu',
},
'add_ie': ['Youtube'],
@@ -99,190 +163,30 @@ class TwitterCardIE(TwitterBaseIE):
'info_dict': {
'id': '705235433198714880',
'ext': 'mp4',
- 'title': 'Twitter web player',
- 'thumbnail': r're:^https?://.*',
+ 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.",
+ 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns",
+ 'uploader': 'Brent Yarina',
+ 'uploader_id': 'BTNBrentYarina',
+ 'timestamp': 1456976204,
+ 'upload_date': '20160303',
},
+ 'skip': 'This content is no longer available.',
}, {
'url': 'https://twitter.com/i/videos/752274308186120192',
'only_matching': True,
},
]
- _API_BASE = 'https://api.twitter.com/1.1'
-
- def _parse_media_info(self, media_info, video_id):
- formats = []
- for media_variant in media_info.get('variants', []):
- media_url = media_variant['url']
- if media_url.endswith('.m3u8'):
- formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
- elif media_url.endswith('.mpd'):
- formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
- else:
- tbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000)
- a_format = {
- 'url': media_url,
- 'format_id': 'http-%d' % tbr if tbr else 'http',
- 'tbr': tbr,
- }
- # Reported bitRate may be zero
- if not a_format['tbr']:
- del a_format['tbr']
-
- self._search_dimensions_in_video_url(a_format, media_url)
-
- formats.append(a_format)
- return formats
-
- def _extract_mobile_formats(self, username, video_id):
- webpage = self._download_webpage(
- 'https://mobile.twitter.com/%s/status/%s' % (username, video_id),
- video_id, 'Downloading mobile webpage',
- headers={
- # A recent mobile UA is necessary for `gt` cookie
- 'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0',
- })
- main_script_url = self._html_search_regex(
- r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage, 'main script URL')
- main_script = self._download_webpage(
- main_script_url, video_id, 'Downloading main script')
- bearer_token = self._search_regex(
- r'BEARER_TOKEN\s*:\s*"([^"]+)"',
- main_script, 'bearer token')
- # https://developer.twitter.com/en/docs/tweets/post-and-engage/api-reference/get-statuses-show-id
- api_data = self._download_json(
- '%s/statuses/show/%s.json' % (self._API_BASE, video_id),
- video_id, 'Downloading API data',
- headers={
- 'Authorization': 'Bearer ' + bearer_token,
- })
- media_info = try_get(api_data, lambda o: o['extended_entities']['media'][0]['video_info']) or {}
- return self._parse_media_info(media_info, video_id)
-
def _real_extract(self, url):
- path, video_id = re.search(self._VALID_URL, url).groups()
-
- config = None
- formats = []
- duration = None
-
- urls = [url]
- if path.startswith('cards/'):
- urls.append('https://twitter.com/i/videos/' + video_id)
-
- for u in urls:
- webpage = self._download_webpage(
- u, video_id, headers={'Referer': 'https://twitter.com/'})
-
- iframe_url = self._html_search_regex(
- r'<iframe[^>]+src="((?:https?:)?//(?:www\.youtube\.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"',
- webpage, 'video iframe', default=None)
- if iframe_url:
- return self.url_result(iframe_url)
-
- config = self._parse_json(self._html_search_regex(
- r'data-(?:player-)?config="([^"]+)"', webpage,
- 'data player config', default='{}'),
- video_id)
-
- if config.get('source_type') == 'vine':
- return self.url_result(config['player_url'], 'Vine')
-
- periscope_url = PeriscopeIE._extract_url(webpage)
- if periscope_url:
- return self.url_result(periscope_url, PeriscopeIE.ie_key())
-
- video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source')
+ status_id = self._match_id(url)
+ return self.url_result(
+ 'https://twitter.com/statuses/' + status_id,
+ TwitterIE.ie_key(), status_id)
- if video_url:
- if determine_ext(video_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(video_url, video_id, ext='mp4', m3u8_id='hls'))
- else:
- f = {
- 'url': video_url,
- }
-
- self._search_dimensions_in_video_url(f, video_url)
-
- formats.append(f)
-
- vmap_url = config.get('vmapUrl') or config.get('vmap_url')
- if vmap_url:
- formats.extend(
- self._extract_formats_from_vmap_url(vmap_url, video_id))
-
- media_info = None
- for entity in config.get('status', {}).get('entities', []):
- if 'mediaInfo' in entity:
- media_info = entity['mediaInfo']
-
- if media_info:
- formats.extend(self._parse_media_info(media_info, video_id))
- duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9)
-
- username = config.get('user', {}).get('screen_name')
- if username:
- formats.extend(self._extract_mobile_formats(username, video_id))
-
- if formats:
- title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
- thumbnail = config.get('posterImageUrl') or config.get('image_src')
- duration = float_or_none(config.get('duration'), scale=1000) or duration
- break
-
- if not formats:
- headers = {
- 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw',
- 'Referer': url,
- }
- ct0 = self._get_cookies(url).get('ct0')
- if ct0:
- headers['csrf_token'] = ct0.value
- guest_token = self._download_json(
- '%s/guest/activate.json' % self._API_BASE, video_id,
- 'Downloading guest token', data=b'',
- headers=headers)['guest_token']
- headers['x-guest-token'] = guest_token
- self._set_cookie('api.twitter.com', 'gt', guest_token)
- config = self._download_json(
- '%s/videos/tweet/config/%s.json' % (self._API_BASE, video_id),
- video_id, headers=headers)
- track = config['track']
- vmap_url = track.get('vmapUrl')
- if vmap_url:
- formats = self._extract_formats_from_vmap_url(vmap_url, video_id)
- else:
- playback_url = track['playbackUrl']
- if determine_ext(playback_url) == 'm3u8':
- formats = self._extract_m3u8_formats(
- playback_url, video_id, 'mp4',
- entry_protocol='m3u8_native', m3u8_id='hls')
- else:
- formats = [{
- 'url': playback_url,
- }]
- title = 'Twitter web player'
- thumbnail = config.get('posterImage')
- duration = float_or_none(track.get('durationMs'), scale=1000)
-
- self._remove_duplicate_formats(formats)
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
- 'duration': duration,
- 'formats': formats,
- }
-
-
-class TwitterIE(InfoExtractor):
+class TwitterIE(TwitterBaseIE):
IE_NAME = 'twitter'
- _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?:i/web|(?P<user_id>[^/]+))/status/(?P<id>\d+)'
- _TEMPLATE_URL = 'https://twitter.com/%s/status/%s'
- _TEMPLATE_STATUSES_URL = 'https://twitter.com/statuses/%s'
+ _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://twitter.com/freethenipple/status/643211948184596480',
@@ -291,10 +195,13 @@ class TwitterIE(InfoExtractor):
'ext': 'mp4',
'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
'thumbnail': r're:^https?://.*\.jpg',
- 'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"',
+ 'description': 'FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ',
'uploader': 'FREE THE NIPPLE',
'uploader_id': 'freethenipple',
'duration': 12.922,
+ 'timestamp': 1442188653,
+ 'upload_date': '20150913',
+ 'age_limit': 18,
},
}, {
'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1',
@@ -316,19 +223,23 @@ class TwitterIE(InfoExtractor):
'id': '665052190608723968',
'ext': 'mp4',
'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.',
- 'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."',
+ 'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ',
'uploader_id': 'starwars',
'uploader': 'Star Wars',
+ 'timestamp': 1447395772,
+ 'upload_date': '20151113',
},
}, {
'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880',
'info_dict': {
'id': '705235433198714880',
'ext': 'mp4',
- 'title': 'Brent Yarina - Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight.',
- 'description': 'Brent Yarina on Twitter: "Khalil Iverson\'s missed highlight dunk. And made highlight dunk. In one highlight."',
+ 'title': "Brent Yarina - Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight.",
+ 'description': "Khalil Iverson's missed highlight dunk. And made highlight dunk. In one highlight. https://t.co/OrxcJ28Bns",
'uploader_id': 'BTNBrentYarina',
'uploader': 'Brent Yarina',
+ 'timestamp': 1456976204,
+ 'upload_date': '20160303',
},
'params': {
# The same video as https://twitter.com/i/videos/tweet/705235433198714880
@@ -340,12 +251,14 @@ class TwitterIE(InfoExtractor):
'info_dict': {
'id': '700207533655363584',
'ext': 'mp4',
- 'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel',
- 'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
+ 'title': 'Simon Vertugo - BEAT PROD: @suhmeduh #Damndaniel',
+ 'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ',
'thumbnail': r're:^https?://.*\.jpg',
- 'uploader': 'JG',
- 'uploader_id': 'jaydingeer',
+ 'uploader': 'Simon Vertugo',
+ 'uploader_id': 'simonvertugo',
'duration': 30.0,
+ 'timestamp': 1455777459,
+ 'upload_date': '20160218',
},
}, {
'url': 'https://twitter.com/Filmdrunk/status/713801302971588609',
@@ -353,10 +266,9 @@ class TwitterIE(InfoExtractor):
'info_dict': {
'id': 'MIOxnrUteUd',
'ext': 'mp4',
- 'title': 'Vince Mancini - Vine of the day',
- 'description': 'Vince Mancini on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"',
- 'uploader': 'Vince Mancini',
- 'uploader_id': 'Filmdrunk',
+ 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン',
+ 'uploader': 'TAKUMA',
+ 'uploader_id': '1004126642786242560',
'timestamp': 1402826626,
'upload_date': '20140615',
},
@@ -367,21 +279,22 @@ class TwitterIE(InfoExtractor):
'id': '719944021058060289',
'ext': 'mp4',
'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.',
- 'description': 'Captain America on Twitter: "@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI"',
- 'uploader_id': 'captainamerica',
+ 'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI',
+ 'uploader_id': 'CaptainAmerica',
'uploader': 'Captain America',
'duration': 3.17,
+ 'timestamp': 1460483005,
+ 'upload_date': '20160412',
},
}, {
'url': 'https://twitter.com/OPP_HSD/status/779210622571536384',
'info_dict': {
'id': '1zqKVVlkqLaKB',
'ext': 'mp4',
- 'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence',
- 'description': 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence https://t.co/EKrVgIXF3s"',
+ 'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence',
'upload_date': '20160923',
- 'uploader_id': 'OPP_HSD',
- 'uploader': 'Sgt Kerry Schmidt',
+ 'uploader_id': '1PmKqpJdOJQoY',
+ 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police',
'timestamp': 1474613214,
},
'add_ie': ['Periscope'],
@@ -392,10 +305,12 @@ class TwitterIE(InfoExtractor):
'id': '852138619213144067',
'ext': 'mp4',
'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
- 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"',
+ 'description': 'كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN',
'uploader': 'عالم الأخبار',
'uploader_id': 'news_al3alm',
'duration': 277.4,
+ 'timestamp': 1492000653,
+ 'upload_date': '20170412',
},
}, {
'url': 'https://twitter.com/i/web/status/910031516746514432',
@@ -404,10 +319,12 @@ class TwitterIE(InfoExtractor):
'ext': 'mp4',
'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.',
'thumbnail': r're:^https?://.*\.jpg',
- 'description': 'Préfet de Guadeloupe on Twitter: "[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo"',
+ 'description': '[Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre. https://t.co/mwx01Rs4lo',
'uploader': 'Préfet de Guadeloupe',
'uploader_id': 'Prefet971',
'duration': 47.48,
+ 'timestamp': 1505803395,
+ 'upload_date': '20170919',
},
'params': {
'skip_download': True, # requires ffmpeg
@@ -420,10 +337,12 @@ class TwitterIE(InfoExtractor):
'ext': 'mp4',
'title': 're:.*?Shep is on a roll today.*?',
'thumbnail': r're:^https?://.*\.jpg',
- 'description': 'md5:63b036c228772523ae1924d5f8e5ed6b',
+ 'description': 'md5:37b9f2ff31720cef23b2bd42ee8a0f09',
'uploader': 'Lis Power',
'uploader_id': 'LisPower1',
'duration': 111.278,
+ 'timestamp': 1527623489,
+ 'upload_date': '20180529',
},
'params': {
'skip_download': True, # requires ffmpeg
@@ -435,88 +354,163 @@ class TwitterIE(InfoExtractor):
'ext': 'mp4',
'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!',
'thumbnail': r're:^https?://.*\.jpg',
- 'description': 'md5:66d493500c013e3e2d434195746a7f78',
+ 'description': 'md5:6dfd341a3310fb97d80d2bf7145df976',
'uploader': 'Twitter',
'uploader_id': 'Twitter',
'duration': 61.567,
+ 'timestamp': 1548184644,
+ 'upload_date': '20190122',
+ },
+ }, {
+ # not available in Periscope
+ 'url': 'https://twitter.com/ViviEducation/status/1136534865145286656',
+ 'info_dict': {
+ 'id': '1vOGwqejwoWxB',
+ 'ext': 'mp4',
+ 'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019',
+ 'uploader': 'Vivi',
+ 'uploader_id': '1eVjYOLGkGrQL',
},
+ 'add_ie': ['TwitterBroadcast'],
+ }, {
+ # Twitch Clip Embed
+ 'url': 'https://twitter.com/GunB1g/status/1163218564784017422',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- twid = mobj.group('id')
-
- webpage, urlh = self._download_webpage_handle(
- self._TEMPLATE_STATUSES_URL % twid, twid)
-
- if 'twitter.com/account/suspended' in urlh.geturl():
- raise ExtractorError('Account suspended by Twitter.', expected=True)
-
- user_id = None
-
- redirect_mobj = re.match(self._VALID_URL, urlh.geturl())
- if redirect_mobj:
- user_id = redirect_mobj.group('user_id')
-
- if not user_id:
- user_id = mobj.group('user_id')
-
- username = remove_end(self._og_search_title(webpage), ' on Twitter')
-
- title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”')
+ twid = self._match_id(url)
+ status = self._call_api(
+ 'statuses/show/%s.json' % twid, twid, {
+ 'cards_platform': 'Web-12',
+ 'include_cards': 1,
+ 'include_reply_count': 1,
+ 'include_user_entities': 0,
+ 'tweet_mode': 'extended',
+ })
+ title = description = status['full_text'].replace('\n', ' ')
# strip 'https -_t.co_BJYgOjSeGA' junk from filenames
title = re.sub(r'\s+(https?://[^ ]+)', '', title)
+ user = status.get('user') or {}
+ uploader = user.get('name')
+ if uploader:
+ title = '%s - %s' % (uploader, title)
+ uploader_id = user.get('screen_name')
+
+ tags = []
+ for hashtag in (try_get(status, lambda x: x['entities']['hashtags'], list) or []):
+ hashtag_text = hashtag.get('text')
+ if not hashtag_text:
+ continue
+ tags.append(hashtag_text)
info = {
- 'uploader_id': user_id,
- 'uploader': username,
- 'webpage_url': url,
- 'description': '%s on Twitter: "%s"' % (username, description),
- 'title': username + ' - ' + title,
+ 'id': twid,
+ 'title': title,
+ 'description': description,
+ 'uploader': uploader,
+ 'timestamp': unified_timestamp(status.get('created_at')),
+ 'uploader_id': uploader_id,
+ 'uploader_url': 'https://twitter.com/' + uploader_id if uploader_id else None,
+ 'like_count': int_or_none(status.get('favorite_count')),
+ 'repost_count': int_or_none(status.get('retweet_count')),
+ 'comment_count': int_or_none(status.get('reply_count')),
+ 'age_limit': 18 if status.get('possibly_sensitive') else 0,
+ 'tags': tags,
}
- mobj = re.search(r'''(?x)
- <video[^>]+class="animated-gif"(?P<more_info>[^>]+)>\s*
- <source[^>]+video-src="(?P<url>[^"]+)"
- ''', webpage)
-
- if mobj:
- more_info = mobj.group('more_info')
- height = int_or_none(self._search_regex(
- r'data-height="(\d+)"', more_info, 'height', fatal=False))
- width = int_or_none(self._search_regex(
- r'data-width="(\d+)"', more_info, 'width', fatal=False))
- thumbnail = self._search_regex(
- r'poster="([^"]+)"', more_info, 'poster', fatal=False)
- info.update({
- 'id': twid,
- 'url': mobj.group('url'),
- 'height': height,
- 'width': width,
- 'thumbnail': thumbnail,
- })
- return info
+ media = try_get(status, lambda x: x['extended_entities']['media'][0])
+ if media and media.get('type') != 'photo':
+ video_info = media.get('video_info') or {}
+
+ formats = []
+ for variant in video_info.get('variants', []):
+ formats.extend(self._extract_variant_formats(variant, twid))
+ self._sort_formats(formats)
+
+ thumbnails = []
+ media_url = media.get('media_url_https') or media.get('media_url')
+ if media_url:
+ def add_thumbnail(name, size):
+ thumbnails.append({
+ 'id': name,
+ 'url': update_url_query(media_url, {'name': name}),
+ 'width': int_or_none(size.get('w') or size.get('width')),
+ 'height': int_or_none(size.get('h') or size.get('height')),
+ })
+ for name, size in media.get('sizes', {}).items():
+ add_thumbnail(name, size)
+ add_thumbnail('orig', media.get('original_info') or {})
- twitter_card_url = None
- if 'class="PlayableMedia' in webpage:
- twitter_card_url = '%s//twitter.com/i/videos/tweet/%s' % (self.http_scheme(), twid)
- else:
- twitter_card_iframe_url = self._search_regex(
- r'data-full-card-iframe-url=([\'"])(?P<url>(?:(?!\1).)+)\1',
- webpage, 'Twitter card iframe URL', default=None, group='url')
- if twitter_card_iframe_url:
- twitter_card_url = compat_urlparse.urljoin(url, twitter_card_iframe_url)
-
- if twitter_card_url:
info.update({
- '_type': 'url_transparent',
- 'ie_key': 'TwitterCard',
- 'url': twitter_card_url,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'duration': float_or_none(video_info.get('duration_millis'), 1000),
})
- return info
-
- raise ExtractorError('There\'s no video in this tweet.')
+ else:
+ card = status.get('card')
+ if card:
+ binding_values = card['binding_values']
+
+ def get_binding_value(k):
+ o = binding_values.get(k) or {}
+ return try_get(o, lambda x: x[x['type'].lower() + '_value'])
+
+ card_name = card['name'].split(':')[-1]
+ if card_name == 'amplify':
+ formats = self._extract_formats_from_vmap_url(
+ get_binding_value('amplify_url_vmap'),
+ get_binding_value('amplify_content_id') or twid)
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for suffix in ('_small', '', '_large', '_x_large', '_original'):
+ image = get_binding_value('player_image' + suffix) or {}
+ image_url = image.get('url')
+ if not image_url or '/player-placeholder' in image_url:
+ continue
+ thumbnails.append({
+ 'id': suffix[1:] if suffix else 'medium',
+ 'url': image_url,
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ })
+
+ info.update({
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'duration': int_or_none(get_binding_value(
+ 'content_duration_seconds')),
+ })
+ elif card_name == 'player':
+ info.update({
+ '_type': 'url',
+ 'url': get_binding_value('player_url'),
+ })
+ elif card_name == 'periscope_broadcast':
+ info.update({
+ '_type': 'url',
+ 'url': get_binding_value('url') or get_binding_value('player_url'),
+ 'ie_key': PeriscopeIE.ie_key(),
+ })
+ elif card_name == 'broadcast':
+ info.update({
+ '_type': 'url',
+ 'url': get_binding_value('broadcast_url'),
+ 'ie_key': TwitterBroadcastIE.ie_key(),
+ })
+ else:
+ raise ExtractorError('Unsupported Twitter Card.')
+ else:
+ expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url'])
+ if not expanded_url:
+ raise ExtractorError("There's no video in this tweet.")
+ info.update({
+ '_type': 'url',
+ 'url': expanded_url,
+ })
+ return info
class TwitterAmplifyIE(TwitterBaseIE):
@@ -573,3 +567,27 @@ class TwitterAmplifyIE(TwitterBaseIE):
'formats': formats,
'thumbnails': thumbnails,
}
+
+
+class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
+ IE_NAME = 'twitter:broadcast'
+ _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/broadcasts/(?P<id>[0-9a-zA-Z]{13})'
+
+ def _real_extract(self, url):
+ broadcast_id = self._match_id(url)
+ broadcast = self._call_api(
+ 'broadcasts/show.json', broadcast_id,
+ {'ids': broadcast_id})['broadcasts'][broadcast_id]
+ info = self._parse_broadcast_data(broadcast, broadcast_id)
+ media_key = broadcast['media_key']
+ source = self._call_api(
+ 'live_video_stream/status/' + media_key, media_key)['source']
+ m3u8_url = source.get('noRedirectPlaybackUrl') or source['location']
+ if '/live_video_stream/geoblocked/' in m3u8_url:
+ self.raise_geo_restricted()
+ m3u8_id = compat_parse_qs(compat_urllib_parse_urlparse(
+ m3u8_url).query).get('type', [None])[0]
+ state, width, height = self._extract_common_format_info(broadcast)
+ info['formats'] = self._extract_pscp_m3u8_formats(
+ m3u8_url, broadcast_id, m3u8_id, state, width, height)
+ return info
diff --git a/youtube_dl/extractor/ufctv.py b/youtube_dl/extractor/ufctv.py
index f3eaee6b3..3d74ba071 100644
--- a/youtube_dl/extractor/ufctv.py
+++ b/youtube_dl/extractor/ufctv.py
@@ -1,73 +1,16 @@
# coding: utf-8
from __future__ import unicode_literals
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- parse_duration,
- parse_iso8601,
- urlencode_postdata,
-)
+from .imggaming import ImgGamingBaseIE
-class UFCTVIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ufc\.tv/video/(?P<id>[^/]+)'
+class UFCTVIE(ImgGamingBaseIE):
+ _VALID_URL = ImgGamingBaseIE._VALID_URL_TEMPL % r'(?:(?:app|www)\.)?(?:ufc\.tv|(?:ufc)?fightpass\.com)|ufcfightpass\.img(?:dge|gaming)\.com'
_NETRC_MACHINE = 'ufctv'
- _TEST = {
- 'url': 'https://www.ufc.tv/video/ufc-219-countdown-full-episode',
- 'info_dict': {
- 'id': '34167',
- 'ext': 'mp4',
- 'title': 'UFC 219 Countdown: Full Episode',
- 'description': 'md5:26d4e8bf4665ae5878842d7050c3c646',
- 'timestamp': 1513962360,
- 'upload_date': '20171222',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- }
- }
+ _REALM = 'ufc'
- def _real_initialize(self):
- username, password = self._get_login_info()
- if username is None:
- return
- code = self._download_json(
- 'https://www.ufc.tv/secure/authenticate',
- None, 'Logging in', data=urlencode_postdata({
- 'username': username,
- 'password': password,
- 'format': 'json',
- })).get('code')
- if code and code != 'loginsuccess':
- raise ExtractorError(code, expected=True)
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- video_data = self._download_json(url, display_id, query={
- 'format': 'json',
- })
- video_id = str(video_data['id'])
- title = video_data['name']
- m3u8_url = self._download_json(
- 'https://www.ufc.tv/service/publishpoint', video_id, query={
- 'type': 'video',
- 'format': 'json',
- 'id': video_id,
- }, headers={
- 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0_1 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A402 Safari/604.1',
- })['path']
- m3u8_url = m3u8_url.replace('_iphone.', '.')
- formats = self._extract_m3u8_formats(m3u8_url, video_id, 'mp4')
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': video_data.get('description'),
- 'duration': parse_duration(video_data.get('runtime')),
- 'timestamp': parse_iso8601(video_data.get('releaseDate')),
- 'formats': formats,
- }
+class UFCArabiaIE(ImgGamingBaseIE):
+ _VALID_URL = ImgGamingBaseIE._VALID_URL_TEMPL % r'(?:(?:app|www)\.)?ufcarabia\.(?:ae|com)'
+ _NETRC_MACHINE = 'ufcarabia'
+ _REALM = 'admufc'
diff --git a/youtube_dl/extractor/videodetective.py b/youtube_dl/extractor/videodetective.py
index a19411a05..fe70db713 100644
--- a/youtube_dl/extractor/videodetective.py
+++ b/youtube_dl/extractor/videodetective.py
@@ -1,7 +1,6 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urlparse
from .internetvideoarchive import InternetVideoArchiveIE
@@ -13,7 +12,7 @@ class VideoDetectiveIE(InfoExtractor):
'info_dict': {
'id': '194487',
'ext': 'mp4',
- 'title': 'KICK-ASS 2',
+ 'title': 'Kick-Ass 2',
'description': 'md5:c189d5b7280400630a1d3dd17eaa8d8a',
},
'params': {
@@ -24,7 +23,7 @@ class VideoDetectiveIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- og_video = self._og_search_video_url(webpage)
- query = compat_urlparse.urlparse(og_video).query
- return self.url_result(InternetVideoArchiveIE._build_json_url(query), ie=InternetVideoArchiveIE.ie_key())
+ query = 'customerid=69249&publishedid=' + video_id
+ return self.url_result(
+ InternetVideoArchiveIE._build_json_url(query),
+ ie=InternetVideoArchiveIE.ie_key())
diff --git a/youtube_dl/extractor/videopremium.py b/youtube_dl/extractor/videopremium.py
deleted file mode 100644
index cf690d7b0..000000000
--- a/youtube_dl/extractor/videopremium.py
+++ /dev/null
@@ -1,46 +0,0 @@
-from __future__ import unicode_literals
-
-import re
-import random
-
-from .common import InfoExtractor
-
-
-class VideoPremiumIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?videopremium\.(?:tv|me)/(?P<id>\w+)(?:/.*)?'
- _TEST = {
- 'url': 'http://videopremium.tv/4w7oadjsf156',
- 'info_dict': {
- 'id': '4w7oadjsf156',
- 'ext': 'f4v',
- 'title': 'youtube-dl_test_video____a_________-BaW_jenozKc.mp4.mp4'
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'Test file has been deleted.',
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage_url = 'http://videopremium.tv/' + video_id
- webpage = self._download_webpage(webpage_url, video_id)
-
- if re.match(r'^<html><head><script[^>]*>window\.location\s*=', webpage):
- # Download again, we need a cookie
- webpage = self._download_webpage(
- webpage_url, video_id,
- note='Downloading webpage again (with cookie)')
-
- video_title = self._html_search_regex(
- r'<h2(?:.*?)>\s*(.+?)\s*<', webpage, 'video title')
-
- return {
- 'id': video_id,
- 'url': 'rtmp://e%d.md.iplay.md/play' % random.randint(1, 16),
- 'play_path': 'mp4:%s.f4v' % video_id,
- 'page_url': 'http://videopremium.tv/' + video_id,
- 'player_url': 'http://videopremium.tv/uplayer/uppod.swf',
- 'ext': 'f4v',
- 'title': video_title,
- }
diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py
index 9abd59d98..baa46d5f3 100644
--- a/youtube_dl/extractor/vimeo.py
+++ b/youtube_dl/extractor/vimeo.py
@@ -15,18 +15,20 @@ from ..compat import (
compat_urlparse,
)
from ..utils import (
+ clean_html,
determine_ext,
+ dict_get,
ExtractorError,
js_to_json,
int_or_none,
merge_dicts,
- NO_DEFAULT,
OnDemandPagedList,
parse_filesize,
RegexNotFoundError,
sanitized_Request,
smuggle_url,
std_headers,
+ str_or_none,
try_get,
unified_timestamp,
unsmuggle_url,
@@ -210,7 +212,7 @@ class VimeoBaseInfoExtractor(InfoExtractor):
video_uploader_url = owner.get('url')
return {
- 'id': video_id,
+ 'id': str_or_none(video_data.get('id')) or video_id,
'title': self._live_title(video_title) if is_live else video_title,
'uploader': owner.get('name'),
'uploader_id': video_uploader_url.split('/')[-1] if video_uploader_url else None,
@@ -258,11 +260,11 @@ class VimeoIE(VimeoBaseInfoExtractor):
(?:
(?:
www|
- (?P<player>player)
+ player
)
\.
)?
- vimeo(?P<pro>pro)?\.com/
+ vimeo(?:pro)?\.com/
(?!(?:channels|album|showcase)/[^/?#]+/?(?:$|[?#])|[^/]+/review/|ondemand/)
(?:.*?/)?
(?:
@@ -284,7 +286,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'id': '56015672',
'ext': 'mp4',
'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550",
- 'description': 'md5:509a9ad5c9bf97c60faee9203aca4479',
+ 'description': 'md5:2d3305bad981a06ff79f027f19865021',
'timestamp': 1355990239,
'upload_date': '20121220',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434',
@@ -293,6 +295,9 @@ class VimeoIE(VimeoBaseInfoExtractor):
'duration': 10,
'license': 'by-sa',
},
+ 'params': {
+ 'format': 'best[protocol=https]',
+ },
},
{
'url': 'http://vimeopro.com/openstreetmapus/state-of-the-map-us-2013/video/68093876',
@@ -305,8 +310,13 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader_id': 'openstreetmapus',
'uploader': 'OpenStreetMap US',
'title': 'Andy Allan - Putting the Carto into OpenStreetMap Cartography',
- 'description': 'md5:fd69a7b8d8c34a4e1d2ec2e4afd6ec30',
+ 'description': 'md5:2c362968038d4499f4d79f88458590c1',
'duration': 1595,
+ 'upload_date': '20130610',
+ 'timestamp': 1370893156,
+ },
+ 'params': {
+ 'format': 'best[protocol=https]',
},
},
{
@@ -323,6 +333,10 @@ class VimeoIE(VimeoBaseInfoExtractor):
'duration': 3610,
'description': None,
},
+ 'params': {
+ 'format': 'best[protocol=https]',
+ },
+ 'expected_warnings': ['Unable to download JSON metadata'],
},
{
'url': 'http://vimeo.com/68375962',
@@ -341,6 +355,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'description': 'md5:dca3ea23adb29ee387127bc4ddfce63f',
},
'params': {
+ 'format': 'best[protocol=https]',
'videopassword': 'youtube-dl',
},
},
@@ -441,10 +456,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
'uploader': '10Ft Films',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/tenfootfilms',
'uploader_id': 'tenfootfilms',
+ 'description': 'md5:0fa704e05b04f91f40b7f3ca2e801384',
+ 'upload_date': '20130830',
+ 'timestamp': 1377853339,
},
'params': {
'skip_download': True,
},
+ 'expected_warnings': ['Unable to download JSON metadata'],
},
{
'url': 'http://player.vimeo.com/video/68375962',
@@ -459,6 +478,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
'duration': 10,
},
'params': {
+ 'format': 'best[protocol=https]',
'videopassword': 'youtube-dl',
},
},
@@ -523,7 +543,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
def _verify_player_video_password(self, url, video_id, headers):
password = self._downloader.params.get('videopassword')
if password is None:
- raise ExtractorError('This video is protected by a password, use the --video-password option')
+ raise ExtractorError('This video is protected by a password, use the --video-password option', expected=True)
data = urlencode_postdata({
'password': base64.b64encode(password.encode()),
})
@@ -552,28 +572,26 @@ class VimeoIE(VimeoBaseInfoExtractor):
r'vimeo\.com/channels/([^/]+)', url, 'channel id', default=None)
# Extract ID from URL
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
orig_url = url
- if mobj.group('pro'):
+ is_pro = 'vimeopro.com/' in url
+ is_player = '://player.vimeo.com/video/' in url
+ if is_pro:
# some videos require portfolio_id to be present in player url
# https://github.com/ytdl-org/youtube-dl/issues/20070
url = self._extract_url(url, self._download_webpage(url, video_id))
- elif mobj.group('player'):
+ if not url:
+ url = 'https://vimeo.com/' + video_id
+ elif is_player:
url = 'https://player.vimeo.com/video/' + video_id
elif any(p in url for p in ('play_redirect_hls', 'moogaloop.swf')):
url = 'https://vimeo.com/' + video_id
- # Retrieve video webpage to extract further information
- request = sanitized_Request(url, headers=headers)
try:
- webpage, urlh = self._download_webpage_handle(request, video_id)
+ # Retrieve video webpage to extract further information
+ webpage, urlh = self._download_webpage_handle(
+ url, video_id, headers=headers)
redirect_url = compat_str(urlh.geturl())
- # Some URLs redirect to ondemand can't be extracted with
- # this extractor right away thus should be passed through
- # ondemand extractor (e.g. https://vimeo.com/73445910)
- if VimeoOndemandIE.suitable(redirect_url):
- return self.url_result(redirect_url, VimeoOndemandIE.ie_key())
except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
errmsg = ee.cause.read()
@@ -600,6 +618,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
cc_license = None
timestamp = None
+ video_description = None
# Extract the config JSON
try:
@@ -611,17 +630,17 @@ class VimeoIE(VimeoBaseInfoExtractor):
# Sometimes new react-based page is served instead of old one that require
# different config URL extraction approach (see
# https://github.com/ytdl-org/youtube-dl/pull/7209)
- vimeo_clip_page_config = self._search_regex(
- r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage,
- 'vimeo clip page config')
- page_config = self._parse_json(vimeo_clip_page_config, video_id)
+ page_config = self._parse_json(self._search_regex(
+ r'vimeo\.(?:clip|vod_title)_page_config\s*=\s*({.+?});',
+ webpage, 'page config'), video_id)
config_url = page_config['player']['config_url']
cc_license = page_config.get('cc_license')
timestamp = try_get(
page_config, lambda x: x['clip']['uploaded_on'],
compat_str)
- config_json = self._download_webpage(config_url, video_id)
- config = json.loads(config_json)
+ video_description = clean_html(dict_get(
+ page_config, ('description', 'description_html_escaped')))
+ config = self._download_json(config_url, video_id)
except RegexNotFoundError:
# For pro videos or player.vimeo.com urls
# We try to find out to which variable is assigned the config dic
@@ -675,14 +694,14 @@ class VimeoIE(VimeoBaseInfoExtractor):
{'force_feature_id': True}), 'Vimeo')
# Extract video description
-
- video_description = self._html_search_regex(
- r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
- webpage, 'description', default=None)
+ if not video_description:
+ video_description = self._html_search_regex(
+ r'(?s)<div\s+class="[^"]*description[^"]*"[^>]*>(.*?)</div>',
+ webpage, 'description', default=None)
if not video_description:
video_description = self._html_search_meta(
'description', webpage, default=None)
- if not video_description and mobj.group('pro'):
+ if not video_description and is_pro:
orig_webpage = self._download_webpage(
orig_url, video_id,
note='Downloading webpage for description',
@@ -690,7 +709,7 @@ class VimeoIE(VimeoBaseInfoExtractor):
if orig_webpage:
video_description = self._html_search_meta(
'description', orig_webpage, default=None)
- if not video_description and not mobj.group('player'):
+ if not video_description and not is_player:
self._downloader.report_warning('Cannot find video description')
# Extract upload date
@@ -747,9 +766,9 @@ class VimeoIE(VimeoBaseInfoExtractor):
return info_dict
-class VimeoOndemandIE(VimeoBaseInfoExtractor):
+class VimeoOndemandIE(VimeoIE):
IE_NAME = 'vimeo:ondemand'
- _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?vimeo\.com/ondemand/([^/]+/)?(?P<id>[^/?#&]+)'
_TESTS = [{
# ondemand video not available via https://vimeo.com/id
'url': 'https://vimeo.com/ondemand/20704',
@@ -761,24 +780,32 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor):
'uploader': 'גם סרטים',
'uploader_url': r're:https?://(?:www\.)?vimeo\.com/gumfilms',
'uploader_id': 'gumfilms',
+ 'description': 'md5:4c027c965e439de4baab621e48b60791',
+ 'upload_date': '20140906',
+ 'timestamp': 1410032453,
},
'params': {
'format': 'best[protocol=https]',
},
+ 'expected_warnings': ['Unable to download JSON metadata'],
}, {
# requires Referer to be passed along with og:video:url
'url': 'https://vimeo.com/ondemand/36938/126682985',
'info_dict': {
- 'id': '126682985',
+ 'id': '126584684',
'ext': 'mp4',
'title': 'Rävlock, rätt läte på rätt plats',
'uploader': 'Lindroth & Norin',
- 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user14430847',
- 'uploader_id': 'user14430847',
+ 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/lindrothnorin',
+ 'uploader_id': 'lindrothnorin',
+ 'description': 'md5:c3c46a90529612c8279fb6af803fc0df',
+ 'upload_date': '20150502',
+ 'timestamp': 1430586422,
},
'params': {
'skip_download': True,
},
+ 'expected_warnings': ['Unable to download JSON metadata'],
}, {
'url': 'https://vimeo.com/ondemand/nazmaalik',
'only_matching': True,
@@ -790,16 +817,6 @@ class VimeoOndemandIE(VimeoBaseInfoExtractor):
'only_matching': True,
}]
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- return self.url_result(
- # Some videos require Referer to be passed along with og:video:url
- # similarly to generic vimeo embeds (e.g.
- # https://vimeo.com/ondemand/36938/126682985).
- VimeoIE._smuggle_referrer(self._og_search_video_url(webpage), url),
- VimeoIE.ie_key())
-
class VimeoChannelIE(VimeoBaseInfoExtractor):
IE_NAME = 'vimeo:channel'
@@ -815,6 +832,7 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
},
'playlist_mincount': 25,
}]
+ _BASE_URL_TEMPL = 'https://vimeo.com/channels/%s'
def _page_url(self, base_url, pagenum):
return '%s/videos/page:%d/' % (base_url, pagenum)
@@ -886,14 +904,13 @@ class VimeoChannelIE(VimeoBaseInfoExtractor):
return self.playlist_result(title_and_entries, list_id, list_title)
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- channel_id = mobj.group('id')
- return self._extract_videos(channel_id, 'https://vimeo.com/channels/%s' % channel_id)
+ channel_id = self._match_id(url)
+ return self._extract_videos(channel_id, self._BASE_URL_TEMPL % channel_id)
class VimeoUserIE(VimeoChannelIE):
IE_NAME = 'vimeo:user'
- _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<name>[^/]+)(?:/videos|[#?]|$)'
+ _VALID_URL = r'https://vimeo\.com/(?!(?:[0-9]+|watchlater)(?:$|[?#/]))(?P<id>[^/]+)(?:/videos|[#?]|$)'
_TITLE_RE = r'<a[^>]+?class="user">([^<>]+?)</a>'
_TESTS = [{
'url': 'https://vimeo.com/nkistudio/videos',
@@ -903,11 +920,7 @@ class VimeoUserIE(VimeoChannelIE):
},
'playlist_mincount': 66,
}]
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- name = mobj.group('name')
- return self._extract_videos(name, 'https://vimeo.com/%s' % name)
+ _BASE_URL_TEMPL = 'https://vimeo.com/%s'
class VimeoAlbumIE(VimeoChannelIE):
@@ -969,25 +982,18 @@ class VimeoAlbumIE(VimeoChannelIE):
r'<title>\s*(.+?)(?:\s+on Vimeo)?</title>', webpage, 'title', fatal=False))
-class VimeoGroupsIE(VimeoAlbumIE):
+class VimeoGroupsIE(VimeoChannelIE):
IE_NAME = 'vimeo:group'
- _VALID_URL = r'https://vimeo\.com/groups/(?P<name>[^/]+)(?:/(?!videos?/\d+)|$)'
+ _VALID_URL = r'https://vimeo\.com/groups/(?P<id>[^/]+)(?:/(?!videos?/\d+)|$)'
_TESTS = [{
- 'url': 'https://vimeo.com/groups/rolexawards',
+ 'url': 'https://vimeo.com/groups/kattykay',
'info_dict': {
- 'id': 'rolexawards',
- 'title': 'Rolex Awards for Enterprise',
+ 'id': 'kattykay',
+ 'title': 'Katty Kay',
},
- 'playlist_mincount': 73,
+ 'playlist_mincount': 27,
}]
-
- def _extract_list_title(self, webpage):
- return self._og_search_title(webpage, fatal=False)
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- name = mobj.group('name')
- return self._extract_videos(name, 'https://vimeo.com/groups/%s' % name)
+ _BASE_URL_TEMPL = 'https://vimeo.com/groups/%s'
class VimeoReviewIE(VimeoBaseInfoExtractor):
@@ -1003,7 +1009,9 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
'title': "DICK HARDWICK 'Comedian'",
'uploader': 'Richard Hardwick',
'uploader_id': 'user21297594',
- }
+ 'description': "Comedian Dick Hardwick's five minute demo filmed in front of a live theater audience.\nEdit by Doug Mattocks",
+ },
+ 'expected_warnings': ['Unable to download JSON metadata'],
}, {
'note': 'video player needs Referer',
'url': 'https://vimeo.com/user22258446/review/91613211/13f927e053',
@@ -1016,7 +1024,8 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
'duration': 2773,
'thumbnail': r're:^https?://.*\.jpg$',
'uploader_id': 'user22258446',
- }
+ },
+ 'skip': 'video gone',
}, {
'note': 'Password protected',
'url': 'https://vimeo.com/user37284429/review/138823582/c4d865efde',
@@ -1036,32 +1045,20 @@ class VimeoReviewIE(VimeoBaseInfoExtractor):
def _real_initialize(self):
self._login()
- def _get_config_url(self, webpage_url, video_id, video_password_verified=False):
- webpage = self._download_webpage(webpage_url, video_id)
- config_url = self._html_search_regex(
- r'data-config-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
- 'config URL', default=None, group='url')
- if not config_url:
- data = self._parse_json(self._search_regex(
- r'window\s*=\s*_extend\(window,\s*({.+?})\);', webpage, 'data',
- default=NO_DEFAULT if video_password_verified else '{}'), video_id)
- config = data.get('vimeo_esi', {}).get('config', {})
- config_url = config.get('configUrl') or try_get(config, lambda x: x['clipData']['configUrl'])
- if config_url is None:
- self._verify_video_password(webpage_url, video_id, webpage)
- config_url = self._get_config_url(
- webpage_url, video_id, video_password_verified=True)
- return config_url
-
def _real_extract(self, url):
page_url, video_id = re.match(self._VALID_URL, url).groups()
- config_url = self._get_config_url(url, video_id)
+ clip_data = self._download_json(
+ page_url.replace('/review/', '/review/data/'),
+ video_id)['clipData']
+ config_url = clip_data['configUrl']
config = self._download_json(config_url, video_id)
info_dict = self._parse_config(config, video_id)
- source_format = self._extract_original_format(page_url, video_id)
+ source_format = self._extract_original_format(
+ page_url + '/action', video_id)
if source_format:
info_dict['formats'].append(source_format)
self._vimeo_sort_formats(info_dict['formats'])
+ info_dict['description'] = clean_html(clip_data.get('description'))
return info_dict
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index 8b6dc0e24..00ec006c4 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -2,6 +2,7 @@
from __future__ import unicode_literals
import collections
+import functools
import re
from .common import InfoExtractor
@@ -11,8 +12,8 @@ from ..utils import (
ExtractorError,
get_element_by_class,
int_or_none,
+ OnDemandPagedList,
orderedSet,
- remove_start,
str_or_none,
str_to_int,
unescapeHTML,
@@ -21,6 +22,7 @@ from ..utils import (
urlencode_postdata,
)
from .dailymotion import DailymotionIE
+from .odnoklassniki import OdnoklassnikiIE
from .pladform import PladformIE
from .vimeo import VimeoIE
from .youtube import YoutubeIE
@@ -60,6 +62,18 @@ class VKBaseIE(InfoExtractor):
def _real_initialize(self):
self._login()
+ def _download_payload(self, path, video_id, data, fatal=True):
+ data['al'] = 1
+ code, payload = self._download_json(
+ 'https://vk.com/%s.php' % path, video_id,
+ data=urlencode_postdata(data), fatal=fatal,
+ headers={'X-Requested-With': 'XMLHttpRequest'})['payload']
+ if code == '3':
+ self.raise_login_required()
+ elif code == '8':
+ raise ExtractorError(clean_html(payload[0][1:-1]), expected=True)
+ return payload
+
class VKIE(VKBaseIE):
IE_NAME = 'vk'
@@ -96,7 +110,6 @@ class VKIE(VKBaseIE):
},
{
'url': 'http://vk.com/video205387401_165548505',
- 'md5': '6c0aeb2e90396ba97035b9cbde548700',
'info_dict': {
'id': '205387401_165548505',
'ext': 'mp4',
@@ -110,18 +123,18 @@ class VKIE(VKBaseIE):
},
{
'note': 'Embedded video',
- 'url': 'http://vk.com/video_ext.php?oid=32194266&id=162925554&hash=7d8c2e0d5e05aeaa&hd=1',
- 'md5': 'c7ce8f1f87bec05b3de07fdeafe21a0a',
+ 'url': 'https://vk.com/video_ext.php?oid=-77521&id=162222515&hash=87b046504ccd8bfa',
+ 'md5': '7babad3b85ea2e91948005b1b8b0cb84',
'info_dict': {
- 'id': '32194266_162925554',
+ 'id': '-77521_162222515',
'ext': 'mp4',
- 'uploader': 'Vladimir Gavrin',
- 'title': 'Lin Dan',
- 'duration': 101,
- 'upload_date': '20120730',
- 'view_count': int,
+ 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*',
+ 'title': 'ProtivoGunz - Хуёвая песня',
+ 'duration': 195,
+ 'upload_date': '20120212',
+ 'timestamp': 1329049880,
+ 'uploader_id': '-77521',
},
- 'skip': 'This video has been removed from public access.',
},
{
# VIDEO NOW REMOVED
@@ -138,18 +151,19 @@ class VKIE(VKBaseIE):
'upload_date': '20121218',
'view_count': int,
},
- 'skip': 'Requires vk account credentials',
+ 'skip': 'Removed',
},
{
'url': 'http://vk.com/hd_kino_mania?z=video-43215063_168067957%2F15c66b9b533119788d',
- 'md5': '4d7a5ef8cf114dfa09577e57b2993202',
'info_dict': {
'id': '-43215063_168067957',
'ext': 'mp4',
- 'uploader': 'Киномания - лучшее из мира кино',
+ 'uploader': 'Bro Mazter',
'title': ' ',
'duration': 7291,
'upload_date': '20140328',
+ 'uploader_id': '223413403',
+ 'timestamp': 1396018030,
},
'skip': 'Requires vk account credentials',
},
@@ -165,7 +179,7 @@ class VKIE(VKBaseIE):
'upload_date': '20140626',
'view_count': int,
},
- 'skip': 'Only works from Russia',
+ 'skip': 'Removed',
},
{
# video (removed?) only available with list id
@@ -204,8 +218,7 @@ class VKIE(VKBaseIE):
'id': 'k3lz2cmXyRuJQSjGHUv',
'ext': 'mp4',
'title': 'md5:d52606645c20b0ddbb21655adaa4f56f',
- # TODO: fix test by fixing dailymotion description extraction
- 'description': 'md5:c651358f03c56f1150b555c26d90a0fd',
+ 'description': 'md5:424b8e88cc873217f520e582ba28bb36',
'uploader': 'AniLibria.Tv',
'upload_date': '20160914',
'uploader_id': 'x1p5vl5',
@@ -247,6 +260,9 @@ class VKIE(VKBaseIE):
'uploader_id': '-387766',
'timestamp': 1475137527,
},
+ 'params': {
+ 'skip_download': True,
+ },
},
{
# live stream, hls and rtmp links, most likely already finished live
@@ -288,80 +304,94 @@ class VKIE(VKBaseIE):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('videoid')
+ mv_data = {}
if video_id:
- info_url = 'https://vk.com/al_video.php?act=show_inline&al=1&video=' + video_id
+ data = {
+ 'act': 'show_inline',
+ 'video': video_id,
+ }
# Some videos (removed?) can only be downloaded with list id specified
list_id = mobj.group('list_id')
if list_id:
- info_url += '&list=%s' % list_id
+ data['list'] = list_id
+
+ payload = self._download_payload('al_video', video_id, data)
+ info_page = payload[1]
+ opts = payload[-1]
+ mv_data = opts.get('mvData') or {}
+ player = opts.get('player') or {}
else:
- info_url = 'http://vk.com/video_ext.php?' + mobj.group('embed_query')
video_id = '%s_%s' % (mobj.group('oid'), mobj.group('id'))
- info_page = self._download_webpage(info_url, video_id)
+ info_page = self._download_webpage(
+ 'http://vk.com/video_ext.php?' + mobj.group('embed_query'), video_id)
- error_message = self._html_search_regex(
- [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
- r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
- info_page, 'error message', default=None)
- if error_message:
- raise ExtractorError(error_message, expected=True)
+ error_message = self._html_search_regex(
+ [r'(?s)<!><div[^>]+class="video_layer_message"[^>]*>(.+?)</div>',
+ r'(?s)<div[^>]+id="video_ext_msg"[^>]*>(.+?)</div>'],
+ info_page, 'error message', default=None)
+ if error_message:
+ raise ExtractorError(error_message, expected=True)
- if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
- raise ExtractorError(
- 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
- expected=True)
+ if re.search(r'<!>/login\.php\?.*\bact=security_check', info_page):
+ raise ExtractorError(
+ 'You are trying to log in from an unusual location. You should confirm ownership at vk.com to log in with this IP.',
+ expected=True)
- ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.'
+ ERROR_COPYRIGHT = 'Video %s has been removed from public access due to rightholder complaint.'
- ERRORS = {
- r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
- ERROR_COPYRIGHT,
+ ERRORS = {
+ r'>Видеозапись .*? была изъята из публичного доступа в связи с обращением правообладателя.<':
+ ERROR_COPYRIGHT,
- r'>The video .*? was removed from public access by request of the copyright holder.<':
- ERROR_COPYRIGHT,
+ r'>The video .*? was removed from public access by request of the copyright holder.<':
+ ERROR_COPYRIGHT,
- r'<!>Please log in or <':
- 'Video %s is only available for registered users, '
- 'use --username and --password options to provide account credentials.',
+ r'<!>Please log in or <':
+ 'Video %s is only available for registered users, '
+ 'use --username and --password options to provide account credentials.',
- r'<!>Unknown error':
- 'Video %s does not exist.',
+ r'<!>Unknown error':
+ 'Video %s does not exist.',
- r'<!>Видео временно недоступно':
- 'Video %s is temporarily unavailable.',
+ r'<!>Видео временно недоступно':
+ 'Video %s is temporarily unavailable.',
- r'<!>Access denied':
- 'Access denied to video %s.',
+ r'<!>Access denied':
+ 'Access denied to video %s.',
- r'<!>Видеозапись недоступна, так как её автор был заблокирован.':
- 'Video %s is no longer available, because its author has been blocked.',
+ r'<!>Видеозапись недоступна, так как её автор был заблокирован.':
+ 'Video %s is no longer available, because its author has been blocked.',
- r'<!>This video is no longer available, because its author has been blocked.':
- 'Video %s is no longer available, because its author has been blocked.',
+ r'<!>This video is no longer available, because its author has been blocked.':
+ 'Video %s is no longer available, because its author has been blocked.',
- r'<!>This video is no longer available, because it has been deleted.':
- 'Video %s is no longer available, because it has been deleted.',
+ r'<!>This video is no longer available, because it has been deleted.':
+ 'Video %s is no longer available, because it has been deleted.',
- r'<!>The video .+? is not available in your region.':
- 'Video %s is not available in your region.',
- }
+ r'<!>The video .+? is not available in your region.':
+ 'Video %s is not available in your region.',
+ }
+
+ for error_re, error_msg in ERRORS.items():
+ if re.search(error_re, info_page):
+ raise ExtractorError(error_msg % video_id, expected=True)
- for error_re, error_msg in ERRORS.items():
- if re.search(error_re, info_page):
- raise ExtractorError(error_msg % video_id, expected=True)
+ player = self._parse_json(self._search_regex(
+ r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n',
+ info_page, 'player params'), video_id)
youtube_url = YoutubeIE._extract_url(info_page)
if youtube_url:
- return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
+ return self.url_result(youtube_url, YoutubeIE.ie_key())
vimeo_url = VimeoIE._extract_url(url, info_page)
if vimeo_url is not None:
- return self.url_result(vimeo_url)
+ return self.url_result(vimeo_url, VimeoIE.ie_key())
pladform_url = PladformIE._extract_url(info_page)
if pladform_url:
- return self.url_result(pladform_url)
+ return self.url_result(pladform_url, PladformIE.ie_key())
m_rutube = re.search(
r'\ssrc="((?:https?:)?//rutube\.ru\\?/(?:video|play)\\?/embed(?:.*?))\\?"', info_page)
@@ -374,6 +404,10 @@ class VKIE(VKBaseIE):
if dailymotion_urls:
return self.url_result(dailymotion_urls[0], DailymotionIE.ie_key())
+ odnoklassniki_url = OdnoklassnikiIE._extract_url(info_page)
+ if odnoklassniki_url:
+ return self.url_result(odnoklassniki_url, OdnoklassnikiIE.ie_key())
+
m_opts = re.search(r'(?s)var\s+opts\s*=\s*({.+?});', info_page)
if m_opts:
m_opts_url = re.search(r"url\s*:\s*'((?!/\b)[^']+)", m_opts.group(1))
@@ -383,38 +417,7 @@ class VKIE(VKBaseIE):
opts_url = 'http:' + opts_url
return self.url_result(opts_url)
- # vars does not look to be served anymore since 24.10.2016
- data = self._parse_json(
- self._search_regex(
- r'var\s+vars\s*=\s*({.+?});', info_page, 'vars', default='{}'),
- video_id, fatal=False)
-
- # <!json> is served instead
- if not data:
- data = self._parse_json(
- self._search_regex(
- [r'<!json>\s*({.+?})\s*<!>', r'<!json>\s*({.+})'],
- info_page, 'json', default='{}'),
- video_id)
- if data:
- data = data['player']['params'][0]
-
- if not data:
- data = self._parse_json(
- self._search_regex(
- r'var\s+playerParams\s*=\s*({.+?})\s*;\s*\n', info_page,
- 'player params', default='{}'),
- video_id)
- if data:
- data = data['params'][0]
-
- # <!--{...}
- if not data:
- data = self._parse_json(
- self._search_regex(
- r'<!--\s*({.+})', info_page, 'payload'),
- video_id)['payload'][-1][-1]['player']['params'][0]
-
+ data = player['params'][0]
title = unescapeHTML(data['md_title'])
# 2 = live
@@ -463,12 +466,12 @@ class VKIE(VKBaseIE):
'title': title,
'thumbnail': data.get('jpg'),
'uploader': data.get('md_author'),
- 'uploader_id': str_or_none(data.get('author_id')),
- 'duration': data.get('duration'),
+ 'uploader_id': str_or_none(data.get('author_id') or mv_data.get('authorId')),
+ 'duration': int_or_none(data.get('duration') or mv_data.get('duration')),
'timestamp': timestamp,
'view_count': view_count,
- 'like_count': int_or_none(data.get('liked')),
- 'dislike_count': int_or_none(data.get('nolikes')),
+ 'like_count': int_or_none(mv_data.get('likes')),
+ 'comment_count': int_or_none(mv_data.get('commcount')),
'is_live': is_live,
}
@@ -476,15 +479,23 @@ class VKIE(VKBaseIE):
class VKUserVideosIE(VKBaseIE):
IE_NAME = 'vk:uservideos'
IE_DESC = "VK - User's Videos"
- _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)'
+ _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&](?:.*?\bsection=(?P<section>\w+))?|$)'
_TEMPLATE_URL = 'https://vk.com/videos'
_TESTS = [{
- 'url': 'http://vk.com/videos205387401',
+ 'url': 'https://vk.com/videos-767561',
+ 'info_dict': {
+ 'id': '-767561_all',
+ },
+ 'playlist_mincount': 1150,
+ }, {
+ 'url': 'https://vk.com/videos-767561?section=uploaded',
'info_dict': {
- 'id': '205387401',
- 'title': "Tom Cruise's Videos",
+ 'id': '-767561_uploaded',
},
- 'playlist_mincount': 4,
+ 'playlist_mincount': 425,
+ }, {
+ 'url': 'http://vk.com/videos205387401',
+ 'only_matching': True,
}, {
'url': 'http://vk.com/videos-77521',
'only_matching': True,
@@ -498,22 +509,33 @@ class VKUserVideosIE(VKBaseIE):
'url': 'http://new.vk.com/videos205387401',
'only_matching': True,
}]
+ _PAGE_SIZE = 1000
+ _VIDEO = collections.namedtuple('Video', ['owner_id', 'id'])
+
+ def _fetch_page(self, page_id, section, page):
+ l = self._download_payload('al_video', page_id, {
+ 'act': 'load_videos_silent',
+ 'offset': page * self._PAGE_SIZE,
+ 'oid': page_id,
+ 'section': section,
+ })[0][section]['list']
+
+ for video in l:
+ v = self._VIDEO._make(video[:2])
+ video_id = '%d_%d' % (v.owner_id, v.id)
+ yield self.url_result(
+ 'http://vk.com/video' + video_id, VKIE.ie_key(), video_id)
def _real_extract(self, url):
- page_id = self._match_id(url)
-
- webpage = self._download_webpage(url, page_id)
-
- entries = [
- self.url_result(
- 'http://vk.com/video' + video_id, 'VK', video_id=video_id)
- for video_id in orderedSet(re.findall(r'href="/video(-?[0-9_]+)"', webpage))]
+ page_id, section = re.match(self._VALID_URL, url).groups()
+ if not section:
+ section = 'all'
- title = unescapeHTML(self._search_regex(
- r'<title>\s*([^<]+?)\s+\|\s+\d+\s+videos',
- webpage, 'title', default=page_id))
+ entries = OnDemandPagedList(
+ functools.partial(self._fetch_page, page_id, section),
+ self._PAGE_SIZE)
- return self.playlist_result(entries, page_id, title)
+ return self.playlist_result(entries, '%s_%s' % (page_id, section))
class VKWallPostIE(VKBaseIE):
@@ -523,15 +545,15 @@ class VKWallPostIE(VKBaseIE):
# public page URL, audio playlist
'url': 'https://vk.com/bs.official?w=wall-23538238_35',
'info_dict': {
- 'id': '23538238_35',
- 'title': 'Black Shadow - Wall post 23538238_35',
+ 'id': '-23538238_35',
+ 'title': 'Black Shadow - Wall post -23538238_35',
'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c',
},
'playlist': [{
'md5': '5ba93864ec5b85f7ce19a9af4af080f6',
'info_dict': {
'id': '135220665_111806521',
- 'ext': 'mp3',
+ 'ext': 'mp4',
'title': 'Black Shadow - Слепое Верование',
'duration': 370,
'uploader': 'Black Shadow',
@@ -542,18 +564,16 @@ class VKWallPostIE(VKBaseIE):
'md5': '4cc7e804579122b17ea95af7834c9233',
'info_dict': {
'id': '135220665_111802303',
- 'ext': 'mp3',
+ 'ext': 'mp4',
'title': 'Black Shadow - Война - Негасимое Бездны Пламя!',
'duration': 423,
'uploader': 'Black Shadow',
'artist': 'Black Shadow',
'track': 'Война - Негасимое Бездны Пламя!',
},
- 'params': {
- 'skip_download': True,
- },
}],
'params': {
+ 'skip_download': True,
'usenetrc': True,
},
'skip': 'Requires vk account credentials',
@@ -562,7 +582,7 @@ class VKWallPostIE(VKBaseIE):
'url': 'https://vk.com/wall85155021_6319',
'info_dict': {
'id': '85155021_6319',
- 'title': 'Sergey Gorbunov - Wall post 85155021_6319',
+ 'title': 'Сергей Горбунов - Wall post 85155021_6319',
},
'playlist_count': 1,
'params': {
@@ -578,58 +598,72 @@ class VKWallPostIE(VKBaseIE):
'url': 'https://m.vk.com/wall-23538238_35',
'only_matching': True,
}]
+ _BASE64_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMN0PQRSTUVWXYZO123456789+/='
+ _AUDIO = collections.namedtuple('Audio', ['id', 'owner_id', 'url', 'title', 'performer', 'duration', 'album_id', 'unk', 'author_link', 'lyrics', 'flags', 'context', 'extra', 'hashes', 'cover_url', 'ads'])
+
+ def _decode(self, enc):
+ dec = ''
+ e = n = 0
+ for c in enc:
+ r = self._BASE64_CHARS.index(c)
+ cond = n % 4
+ e = 64 * e + r if cond else r
+ n += 1
+ if cond:
+ dec += chr(255 & e >> (-2 * n & 6))
+ return dec
+
+ def _unmask_url(self, mask_url, vk_id):
+ if 'audio_api_unavailable' in mask_url:
+ extra = mask_url.split('?extra=')[1].split('#')
+ func, base = self._decode(extra[1]).split(chr(11))
+ mask_url = list(self._decode(extra[0]))
+ url_len = len(mask_url)
+ indexes = [None] * url_len
+ index = int(base) ^ vk_id
+ for n in range(url_len - 1, -1, -1):
+ index = (url_len * (n + 1) ^ index + n) % url_len
+ indexes[n] = index
+ for n in range(1, url_len):
+ c = mask_url[n]
+ index = indexes[url_len - 1 - n]
+ mask_url[n] = mask_url[index]
+ mask_url[index] = c
+ mask_url = ''.join(mask_url)
+ return mask_url
def _real_extract(self, url):
post_id = self._match_id(url)
- wall_url = 'https://vk.com/wall%s' % post_id
-
- post_id = remove_start(post_id, '-')
-
- webpage = self._download_webpage(wall_url, post_id)
-
- error = self._html_search_regex(
- r'>Error</div>\s*<div[^>]+class=["\']body["\'][^>]*>([^<]+)',
- webpage, 'error', default=None)
- if error:
- raise ExtractorError('VK said: %s' % error, expected=True)
+ webpage = self._download_payload('wkview', post_id, {
+ 'act': 'show',
+ 'w': 'wall' + post_id,
+ })[1]
description = clean_html(get_element_by_class('wall_post_text', webpage))
uploader = clean_html(get_element_by_class('author', webpage))
- thumbnail = self._og_search_thumbnail(webpage)
entries = []
- audio_ids = re.findall(r'data-full-id=["\'](\d+_\d+)', webpage)
- if audio_ids:
- al_audio = self._download_webpage(
- 'https://vk.com/al_audio.php', post_id,
- note='Downloading audio info', fatal=False,
- data=urlencode_postdata({
- 'act': 'reload_audio',
- 'al': '1',
- 'ids': ','.join(audio_ids)
- }))
- if al_audio:
- Audio = collections.namedtuple(
- 'Audio', ['id', 'user_id', 'url', 'track', 'artist', 'duration'])
- audios = self._parse_json(
- self._search_regex(
- r'<!json>(.+?)<!>', al_audio, 'audios', default='[]'),
- post_id, fatal=False, transform_source=unescapeHTML)
- if isinstance(audios, list):
- for audio in audios:
- a = Audio._make(audio[:6])
- entries.append({
- 'id': '%s_%s' % (a.user_id, a.id),
- 'url': a.url,
- 'title': '%s - %s' % (a.artist, a.track) if a.artist and a.track else a.id,
- 'thumbnail': thumbnail,
- 'duration': a.duration,
- 'uploader': uploader,
- 'artist': a.artist,
- 'track': a.track,
- })
+ for audio in re.findall(r'data-audio="([^"]+)', webpage):
+ audio = self._parse_json(unescapeHTML(audio), post_id)
+ a = self._AUDIO._make(audio[:16])
+ if not a.url:
+ continue
+ title = unescapeHTML(a.title)
+ performer = unescapeHTML(a.performer)
+ entries.append({
+ 'id': '%s_%s' % (a.owner_id, a.id),
+ 'url': self._unmask_url(a.url, a.ads['vk_id']),
+ 'title': '%s - %s' % (performer, title) if performer else title,
+ 'thumbnails': [{'url': c_url} for c_url in a.cover_url.split(',')] if a.cover_url else None,
+ 'duration': int_or_none(a.duration),
+ 'uploader': uploader,
+ 'artist': performer,
+ 'track': title,
+ 'ext': 'mp4',
+ 'protocol': 'm3u8',
+ })
for video in re.finditer(
r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage):
diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py
index 3336e6c15..b7d02fca3 100644
--- a/youtube_dl/extractor/vzaar.py
+++ b/youtube_dl/extractor/vzaar.py
@@ -33,6 +33,18 @@ class VzaarIE(InfoExtractor):
'title': 'MP3',
},
}, {
+ # hlsAes = true
+ 'url': 'https://view.vzaar.com/11379930/player',
+ 'info_dict': {
+ 'id': '11379930',
+ 'ext': 'mp4',
+ 'title': 'Videoaula',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
# with null videoTitle
'url': 'https://view.vzaar.com/20313539/download',
'only_matching': True,
@@ -58,6 +70,7 @@ class VzaarIE(InfoExtractor):
f = {
'url': source_url,
'format_id': 'http',
+ 'preference': 1,
}
if 'audio' in source_url:
f.update({
@@ -75,13 +88,17 @@ class VzaarIE(InfoExtractor):
video_guid = video_data.get('guid')
usp = video_data.get('usp')
- if isinstance(video_guid, compat_str) and isinstance(usp, dict):
- m3u8_url = ('http://fable.vzaar.com/v4/usp/%s/%s.ism/.m3u8?'
- % (video_guid, video_id)) + '&'.join(
- '%s=%s' % (k, v) for k, v in usp.items())
- formats.extend(self._extract_m3u8_formats(
- m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
- m3u8_id='hls', fatal=False))
+ if video_data.get('uspEnabled') and isinstance(video_guid, compat_str) and isinstance(usp, dict):
+ hls_aes = video_data.get('hlsAes')
+ qs = '&'.join('%s=%s' % (k, v) for k, v in usp.items())
+ url_templ = 'http://%%s.vzaar.com/v5/usp%s/%s/%s.ism%%s?' % ('aes' if hls_aes else '', video_guid, video_id)
+ m3u8_formats = self._extract_m3u8_formats(
+ url_templ % ('fable', '/.m3u8') + qs, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False)
+ if hls_aes:
+ for f in m3u8_formats:
+ f['_decryption_key_url'] = url_templ % ('goose', '') + qs
+ formats.extend(m3u8_formats)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index fa142b974..0fbc888ec 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -12,7 +12,7 @@ from ..utils import (
class WistiaIE(InfoExtractor):
- _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P<id>[a-z0-9]+)'
+ _VALID_URL = r'(?:wistia:|https?://(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/)(?P<id>[a-z0-9]{10})'
_API_URL = 'http://fast.wistia.com/embed/medias/%s.json'
_IFRAME_URL = 'http://fast.wistia.net/embed/iframe/%s'
@@ -43,25 +43,26 @@ class WistiaIE(InfoExtractor):
'only_matching': True,
}]
+ # https://wistia.com/support/embed-and-share/video-on-your-website
@staticmethod
def _extract_url(webpage):
match = re.search(
- r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/iframe/.+?)\1', webpage)
+ r'<(?:meta[^>]+?content|(?:iframe|script)[^>]+?src)=["\'](?P<url>(?:https?:)?//(?:fast\.)?wistia\.(?:net|com)/embed/(?:iframe|medias)/[a-z0-9]{10})', webpage)
if match:
return unescapeHTML(match.group('url'))
- match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
- if match:
- return 'wistia:%s' % match.group('id')
-
match = re.search(
r'''(?sx)
<script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
- <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
+ <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]{10})\b.*?\2
''', webpage)
if match:
return 'wistia:%s' % match.group('id')
+ match = re.search(r'(?:data-wistia-?id=["\']|Wistia\.embed\(["\']|id=["\']wistia_)(?P<id>[a-z0-9]{10})', webpage)
+ if match:
+ return 'wistia:%s' % match.group('id')
+
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index e5ebdd180..238d9cea0 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -3,453 +3,313 @@ from __future__ import unicode_literals
import hashlib
import itertools
-import json
import re
from .common import InfoExtractor, SearchInfoExtractor
from ..compat import (
compat_str,
compat_urllib_parse,
- compat_urlparse,
)
from ..utils import (
clean_html,
- determine_ext,
- ExtractorError,
- extract_attributes,
int_or_none,
mimetype2ext,
+ parse_iso8601,
smuggle_url,
try_get,
- unescapeHTML,
url_or_none,
)
-from .brightcove import (
- BrightcoveLegacyIE,
- BrightcoveNewIE,
-)
-from .nbc import NBCSportsVPlayerIE
+from .brightcove import BrightcoveNewIE
class YahooIE(InfoExtractor):
IE_DESC = 'Yahoo screen and movies'
- _VALID_URL = r'(?P<host>https?://(?:(?P<country>[a-zA-Z]{2})\.)?[\da-zA-Z_-]+\.yahoo\.com)/(?:[^/]+/)*(?:(?P<display_id>.+)?-)?(?P<id>[0-9]+)(?:-[a-z]+)?(?:\.html)?'
- _TESTS = [
- {
- 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
- 'info_dict': {
- 'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
- 'ext': 'mp4',
- 'title': 'Julian Smith & Travis Legg Watch Julian Smith',
- 'description': 'Julian and Travis watch Julian Smith',
- 'duration': 6863,
- },
+ _VALID_URL = r'(?P<url>https?://(?:(?P<country>[a-zA-Z]{2}(?:-[a-zA-Z]{2})?|malaysia)\.)?(?:[\da-zA-Z_-]+\.)?yahoo\.com/(?:[^/]+/)*(?P<id>[^?&#]*-[0-9]+(?:-[a-z]+)?)\.html)'
+ _TESTS = [{
+ 'url': 'http://screen.yahoo.com/julian-smith-travis-legg-watch-214727115.html',
+ 'info_dict': {
+ 'id': '2d25e626-2378-391f-ada0-ddaf1417e588',
+ 'ext': 'mp4',
+ 'title': 'Julian Smith & Travis Legg Watch Julian Smith',
+ 'description': 'Julian and Travis watch Julian Smith',
+ 'duration': 6863,
+ 'timestamp': 1369812016,
+ 'upload_date': '20130529',
},
- {
- 'url': 'http://screen.yahoo.com/wired/codefellas-s1-ep12-cougar-lies-103000935.html',
- 'md5': '251af144a19ebc4a033e8ba91ac726bb',
- 'info_dict': {
- 'id': 'd1dedf8c-d58c-38c3-8963-e899929ae0a9',
- 'ext': 'mp4',
- 'title': 'Codefellas - The Cougar Lies with Spanish Moss',
- 'description': 'md5:66b627ab0a282b26352136ca96ce73c1',
- 'duration': 151,
- },
- 'skip': 'HTTP Error 404',
+ }, {
+ 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
+ 'md5': '7993e572fac98e044588d0b5260f4352',
+ 'info_dict': {
+ 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
+ 'ext': 'mp4',
+ 'title': "Yahoo Saves 'Community'",
+ 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
+ 'duration': 170,
+ 'timestamp': 1406838636,
+ 'upload_date': '20140731',
},
- {
- 'url': 'https://screen.yahoo.com/community/community-sizzle-reel-203225340.html?format=embed',
- 'md5': '7993e572fac98e044588d0b5260f4352',
- 'info_dict': {
- 'id': '4fe78544-8d48-39d8-97cd-13f205d9fcdb',
- 'ext': 'mp4',
- 'title': "Yahoo Saves 'Community'",
- 'description': 'md5:4d4145af2fd3de00cbb6c1d664105053',
- 'duration': 170,
- }
+ }, {
+ 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
+ 'md5': '71298482f7c64cbb7fa064e4553ff1c1',
+ 'info_dict': {
+ 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58',
+ 'ext': 'webm',
+ 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder',
+ 'description': 'md5:f66c890e1490f4910a9953c941dee944',
+ 'duration': 97,
+ 'timestamp': 1414489862,
+ 'upload_date': '20141028',
+ }
+ }, {
+ 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
+ 'md5': '88e209b417f173d86186bef6e4d1f160',
+ 'info_dict': {
+ 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521',
+ 'ext': 'mp4',
+ 'title': 'China Moses Is Crazy About the Blues',
+ 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
+ 'duration': 128,
+ 'timestamp': 1385722202,
+ 'upload_date': '20131129',
+ }
+ }, {
+ 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html',
+ 'md5': '2a9752f74cb898af5d1083ea9f661b58',
+ 'info_dict': {
+ 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
+ 'ext': 'mp4',
+ 'title': '\'True Story\' Trailer',
+ 'description': 'True Story',
+ 'duration': 150,
+ 'timestamp': 1418919206,
+ 'upload_date': '20141218',
},
- {
- 'url': 'https://tw.news.yahoo.com/%E6%95%A2%E5%95%8F%E5%B8%82%E9%95%B7%20%E9%BB%83%E7%A7%80%E9%9C%9C%E6%89%B9%E8%B3%B4%E6%B8%85%E5%BE%B7%20%E9%9D%9E%E5%B8%B8%E9%AB%98%E5%82%B2-034024051.html',
- 'md5': '45c024bad51e63e9b6f6fad7a43a8c23',
- 'info_dict': {
- 'id': 'cac903b3-fcf4-3c14-b632-643ab541712f',
- 'ext': 'mp4',
- 'title': '敢問市長/黃秀霜批賴清德「非常高傲」',
- 'description': '直言台南沒捷運 交通居五都之末',
- 'duration': 396,
- },
+ }, {
+ 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
+ 'only_matching': True,
+ }, {
+ 'note': 'NBC Sports embeds',
+ 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313',
+ 'info_dict': {
+ 'id': '9CsDKds0kvHI',
+ 'ext': 'flv',
+ 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
+ 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+ 'upload_date': '20150313',
+ 'uploader': 'NBCU-SPORTS',
+ 'timestamp': 1426270238,
},
- {
- 'url': 'https://uk.screen.yahoo.com/editor-picks/cute-raccoon-freed-drain-using-091756545.html',
- 'md5': '71298482f7c64cbb7fa064e4553ff1c1',
- 'info_dict': {
- 'id': 'b3affa53-2e14-3590-852b-0e0db6cd1a58',
- 'ext': 'webm',
- 'title': 'Cute Raccoon Freed From Drain\u00a0Using Angle Grinder',
- 'description': 'md5:f66c890e1490f4910a9953c941dee944',
- 'duration': 97,
- }
+ }, {
+ 'url': 'https://tw.news.yahoo.com/-100120367.html',
+ 'only_matching': True,
+ }, {
+ # Query result is embedded in webpage, but explicit request to video API fails with geo restriction
+ 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html',
+ 'md5': '4fbafb9c9b6f07aa8f870629f6671b35',
+ 'info_dict': {
+ 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504',
+ 'ext': 'mp4',
+ 'title': 'Communitary - Community Episode 1: Ladders',
+ 'description': 'md5:8fc39608213295748e1e289807838c97',
+ 'duration': 1646,
+ 'timestamp': 1440436550,
+ 'upload_date': '20150824',
+ 'series': 'Communitary',
+ 'season_number': 6,
+ 'episode_number': 1,
},
- {
- 'url': 'https://ca.sports.yahoo.com/video/program-makes-hockey-more-affordable-013127711.html',
- 'md5': '57e06440778b1828a6079d2f744212c4',
- 'info_dict': {
- 'id': 'c9fa2a36-0d4d-3937-b8f6-cc0fb1881e73',
- 'ext': 'mp4',
- 'title': 'Program that makes hockey more affordable not offered in Manitoba',
- 'description': 'md5:c54a609f4c078d92b74ffb9bf1f496f4',
- 'duration': 121,
- },
- 'skip': 'Video gone',
- }, {
- 'url': 'https://ca.finance.yahoo.com/news/hackers-sony-more-trouble-well-154609075.html',
- 'info_dict': {
- 'id': '154609075',
- },
- 'playlist': [{
- 'md5': '000887d0dc609bc3a47c974151a40fb8',
- 'info_dict': {
- 'id': 'e624c4bc-3389-34de-9dfc-025f74943409',
- 'ext': 'mp4',
- 'title': '\'The Interview\' TV Spot: War',
- 'description': 'The Interview',
- 'duration': 30,
- },
- }, {
- 'md5': '81bc74faf10750fe36e4542f9a184c66',
- 'info_dict': {
- 'id': '1fc8ada0-718e-3abe-a450-bf31f246d1a9',
- 'ext': 'mp4',
- 'title': '\'The Interview\' TV Spot: Guys',
- 'description': 'The Interview',
- 'duration': 30,
- },
- }],
- }, {
- 'url': 'http://news.yahoo.com/video/china-moses-crazy-blues-104538833.html',
- 'md5': '88e209b417f173d86186bef6e4d1f160',
- 'info_dict': {
- 'id': 'f885cf7f-43d4-3450-9fac-46ac30ece521',
- 'ext': 'mp4',
- 'title': 'China Moses Is Crazy About the Blues',
- 'description': 'md5:9900ab8cd5808175c7b3fe55b979bed0',
- 'duration': 128,
- }
- }, {
- 'url': 'https://in.lifestyle.yahoo.com/video/connect-dots-dark-side-virgo-090247395.html',
- 'md5': 'd9a083ccf1379127bf25699d67e4791b',
- 'info_dict': {
- 'id': '52aeeaa3-b3d1-30d8-9ef8-5d0cf05efb7c',
- 'ext': 'mp4',
- 'title': 'Connect the Dots: Dark Side of Virgo',
- 'description': 'md5:1428185051cfd1949807ad4ff6d3686a',
- 'duration': 201,
- },
- 'skip': 'Domain name in.lifestyle.yahoo.com gone',
- }, {
- 'url': 'https://www.yahoo.com/movies/v/true-story-trailer-173000497.html',
- 'md5': '989396ae73d20c6f057746fb226aa215',
- 'info_dict': {
- 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
- 'ext': 'mp4',
- 'title': '\'True Story\' Trailer',
- 'description': 'True Story',
- 'duration': 150,
- },
- }, {
- 'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
- 'only_matching': True,
- }, {
- 'note': 'NBC Sports embeds',
- 'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313',
- 'info_dict': {
- 'id': '9CsDKds0kvHI',
- 'ext': 'flv',
- 'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
- 'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
- 'upload_date': '20150313',
- 'uploader': 'NBCU-SPORTS',
- 'timestamp': 1426270238,
- }
- }, {
- 'url': 'https://tw.news.yahoo.com/-100120367.html',
- 'only_matching': True,
- }, {
- # Query result is embedded in webpage, but explicit request to video API fails with geo restriction
- 'url': 'https://screen.yahoo.com/community/communitary-community-episode-1-ladders-154501237.html',
- 'md5': '4fbafb9c9b6f07aa8f870629f6671b35',
- 'info_dict': {
- 'id': '1f32853c-a271-3eef-8cb6-f6d6872cb504',
- 'ext': 'mp4',
- 'title': 'Communitary - Community Episode 1: Ladders',
- 'description': 'md5:8fc39608213295748e1e289807838c97',
- 'duration': 1646,
- },
- }, {
- # it uses an alias to get the video_id
- 'url': 'https://www.yahoo.com/movies/the-stars-of-daddys-home-have-very-different-212843197.html',
- 'info_dict': {
- 'id': '40eda9c8-8e5f-3552-8745-830f67d0c737',
- 'ext': 'mp4',
- 'title': 'Will Ferrell & Mark Wahlberg Are Pro-Spanking',
- 'description': 'While they play feuding fathers in \'Daddy\'s Home,\' star Will Ferrell & Mark Wahlberg share their true feelings on parenthood.',
- },
+ }, {
+ # ytwnews://cavideo/
+ 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html',
+ 'info_dict': {
+ 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff',
+ 'ext': 'mp4',
+ 'title': '單車天使 - 中文版預',
+ 'description': '中文版預',
+ 'timestamp': 1476696196,
+ 'upload_date': '20161017',
},
- {
- # config['models']['applet_model']['data']['sapi'] has no query
- 'url': 'https://www.yahoo.com/music/livenation/event/galactic-2016',
- 'md5': 'dac0c72d502bc5facda80c9e6d5c98db',
- 'info_dict': {
- 'id': 'a6015640-e9e5-3efb-bb60-05589a183919',
- 'ext': 'mp4',
- 'description': 'Galactic',
- 'title': 'Dolla Diva (feat. Maggie Koerner)',
- },
- 'skip': 'redirect to https://www.yahoo.com/music',
+ 'params': {
+ 'skip_download': True,
},
- {
- # yahoo://article/
- 'url': 'https://www.yahoo.com/movies/video/true-story-trailer-173000497.html',
- 'info_dict': {
- 'id': '071c4013-ce30-3a93-a5b2-e0413cd4a9d1',
- 'ext': 'mp4',
- 'title': "'True Story' Trailer",
- 'description': 'True Story',
- },
- 'params': {
- 'skip_download': True,
- },
+ }, {
+ # Contains both a Yahoo hosted video and multiple Youtube embeds
+ 'url': 'https://www.yahoo.com/entertainment/gwen-stefani-reveals-the-pop-hit-she-passed-on-assigns-it-to-her-voice-contestant-instead-033045672.html',
+ 'info_dict': {
+ 'id': '46c5d95a-528f-3d03-b732-732fcadd51de',
+ 'title': 'Gwen Stefani reveals the pop hit she passed on, assigns it to her \'Voice\' contestant instead',
+ 'description': 'Gwen decided not to record this hit herself, but she decided it was the perfect fit for Kyndall Inskeep.',
},
- {
- # ytwnews://cavideo/
- 'url': 'https://tw.video.yahoo.com/movie-tw/單車天使-中文版預-092316541.html',
+ 'playlist': [{
'info_dict': {
- 'id': 'ba133ff2-0793-3510-b636-59dfe9ff6cff',
+ 'id': '966d4262-4fd1-3aaa-b45b-049ca6e38ba6',
'ext': 'mp4',
- 'title': '單車天使 - 中文版預',
- 'description': '中文版預',
+ 'title': 'Gwen Stefani reveals she turned down one of Sia\'s best songs',
+ 'description': 'On "The Voice" Tuesday, Gwen Stefani told Taylor Swift which Sia hit was almost hers.',
+ 'timestamp': 1572406500,
+ 'upload_date': '20191030',
},
- 'params': {
- 'skip_download': True,
- },
- },
- {
- # custom brightcove
- 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37083565/clown-entertainers-say-it-is-hurting-their-business/',
+ }, {
'info_dict': {
- 'id': '5575377707001',
+ 'id': '352CFDOQrKg',
'ext': 'mp4',
- 'title': "Clown entertainers say 'It' is hurting their business",
- 'description': 'Stephen King s horror film has much to answer for. Jelby and Mr Loopy the Clowns join us.',
- 'timestamp': 1505341164,
- 'upload_date': '20170913',
- 'uploader_id': '2376984109001',
- },
- 'params': {
- 'skip_download': True,
+ 'title': 'Kyndal Inskeep "Performs the Hell Out of" Sia\'s "Elastic Heart" - The Voice Knockouts 2019',
+ 'description': 'md5:35b61e94c2ae214bc965ff4245f80d11',
+ 'uploader': 'The Voice',
+ 'uploader_id': 'NBCTheVoice',
+ 'upload_date': '20191029',
},
+ }],
+ 'params': {
+ 'playlistend': 2,
},
- {
- # custom brightcove, geo-restricted to Australia, bypassable
- 'url': 'https://au.tv.yahoo.com/plus7/sunrise/-/watch/37263964/sunrise-episode-wed-27-sep/',
- 'only_matching': True,
- }
- ]
+ 'expected_warnings': ['HTTP Error 404'],
+ }, {
+ 'url': 'https://malaysia.news.yahoo.com/video/bystanders-help-ontario-policeman-bust-190932818.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://es-us.noticias.yahoo.com/es-la-puerta-irrompible-que-110539379.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.yahoo.com/entertainment/v/longtime-cbs-news-60-minutes-032036500-cbs.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- page_id = mobj.group('id')
- display_id = mobj.group('display_id') or page_id
- host = mobj.group('host')
- webpage, urlh = self._download_webpage_handle(url, display_id)
- if 'err=404' in urlh.geturl():
- raise ExtractorError('Video gone', expected=True)
-
- # Look for iframed media first
- entries = []
- iframe_urls = re.findall(r'<iframe[^>]+src="(/video/.+?-\d+\.html\?format=embed.*?)"', webpage)
- for idx, iframe_url in enumerate(iframe_urls):
- entries.append(self.url_result(host + iframe_url, 'Yahoo'))
- if entries:
- return self.playlist_result(entries, page_id)
-
- # Look for NBCSports iframes
- nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
- if nbc_sports_url:
- return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key())
-
- # Look for Brightcove Legacy Studio embeds
- bc_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- if bc_url:
- return self.url_result(bc_url, BrightcoveLegacyIE.ie_key())
-
- def brightcove_url_result(bc_url):
- return self.url_result(
- smuggle_url(bc_url, {'geo_countries': [mobj.group('country')]}),
- BrightcoveNewIE.ie_key())
-
- # Look for Brightcove New Studio embeds
- bc_url = BrightcoveNewIE._extract_url(self, webpage)
- if bc_url:
- return brightcove_url_result(bc_url)
-
- brightcove_iframe = self._search_regex(
- r'(<iframe[^>]+data-video-id=["\']\d+[^>]+>)', webpage,
- 'brightcove iframe', default=None)
- if brightcove_iframe:
- attr = extract_attributes(brightcove_iframe)
- src = attr.get('src')
- if src:
- parsed_src = compat_urlparse.urlparse(src)
- qs = compat_urlparse.parse_qs(parsed_src.query)
- account_id = qs.get('accountId', ['2376984109001'])[0]
- brightcove_id = attr.get('data-video-id') or qs.get('videoId', [None])[0]
- if account_id and brightcove_id:
- return brightcove_url_result(
- 'http://players.brightcove.net/%s/default_default/index.html?videoId=%s'
- % (account_id, brightcove_id))
-
- # Query result is often embedded in webpage as JSON. Sometimes explicit requests
- # to video API results in a failure with geo restriction reason therefore using
- # embedded query result when present sounds reasonable.
- config_json = self._search_regex(
- r'window\.Af\.bootstrap\[[^\]]+\]\s*=\s*({.*?"applet_type"\s*:\s*"td-applet-videoplayer".*?});(?:</script>|$)',
- webpage, 'videoplayer applet', default=None)
- if config_json:
- config = self._parse_json(config_json, display_id, fatal=False)
- if config:
- sapi = config.get('models', {}).get('applet_model', {}).get('data', {}).get('sapi')
- if sapi and 'query' in sapi:
- info = self._extract_info(display_id, sapi, webpage)
- self._sort_formats(info['formats'])
- return info
-
- items_json = self._search_regex(
- r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
- default=None)
- if items_json is None:
- alias = self._search_regex(
- r'"aliases":{"video":"(.*?)"', webpage, 'alias', default=None)
- if alias is not None:
- alias_info = self._download_json(
- 'https://www.yahoo.com/_td/api/resource/VideoService.videos;video_aliases=["%s"]' % alias,
- display_id, 'Downloading alias info')
- video_id = alias_info[0]['id']
- else:
- CONTENT_ID_REGEXES = [
- r'YUI\.namespace\("Media"\)\.CONTENT_ID\s*=\s*"([^"]+)"',
- r'root\.App\.Cache\.context\.videoCache\.curVideo = \{"([^"]+)"',
- r'"first_videoid"\s*:\s*"([^"]+)"',
- r'%s[^}]*"ccm_id"\s*:\s*"([^"]+)"' % re.escape(page_id),
- r'<article[^>]data-uuid=["\']([^"\']+)',
- r'<meta[^<>]+yahoo://article/view\?.*\buuid=([^&"\']+)',
- r'<meta[^<>]+["\']ytwnews://cavideo/(?:[^/]+/)+([\da-fA-F-]+)[&"\']',
- ]
- video_id = self._search_regex(
- CONTENT_ID_REGEXES, webpage, 'content ID')
+ url, country, display_id = re.match(self._VALID_URL, url).groups()
+ if not country:
+ country = 'us'
else:
- items = json.loads(items_json)
- info = items['mediaItems']['query']['results']['mediaObj'][0]
- # The 'meta' field is not always in the video webpage, we request it
- # from another page
- video_id = info['id']
- return self._get_info(video_id, display_id, webpage)
-
- def _extract_info(self, display_id, query, webpage):
- info = query['query']['results']['mediaObj'][0]
- meta = info.get('meta')
- video_id = info.get('id')
-
- if not meta:
- msg = info['status'].get('msg')
- if msg:
- raise ExtractorError(
- '%s returned error: %s' % (self.IE_NAME, msg), expected=True)
- raise ExtractorError('Unable to extract media object meta')
+ country = country.split('-')[0]
+ api_base = 'https://%s.yahoo.com/_td/api/resource/' % country
+
+ for i, uuid in enumerate(['url=' + url, 'ymedia-alias=' + display_id]):
+ content = self._download_json(
+ api_base + 'content;getDetailView=true;uuids=["%s"]' % uuid,
+ display_id, 'Downloading content JSON metadata', fatal=i == 1)
+ if content:
+ item = content['items'][0]
+ break
+
+ if item.get('type') != 'video':
+ entries = []
+
+ cover = item.get('cover') or {}
+ if cover.get('type') == 'yvideo':
+ cover_url = cover.get('url')
+ if cover_url:
+ entries.append(self.url_result(
+ cover_url, 'Yahoo', cover.get('uuid')))
+
+ for e in item.get('body', []):
+ if e.get('type') == 'videoIframe':
+ iframe_url = e.get('url')
+ if not iframe_url:
+ continue
+ entries.append(self.url_result(iframe_url))
+
+ return self.playlist_result(
+ entries, item.get('uuid'),
+ item.get('title'), item.get('summary'))
+
+ video_id = item['uuid']
+ video = self._download_json(
+ api_base + 'VideoService.videos;view=full;video_ids=["%s"]' % video_id,
+ video_id, 'Downloading video JSON metadata')[0]
+ title = video['title']
+
+ if country == 'malaysia':
+ country = 'my'
+ is_live = video.get('live_state') == 'live'
+ fmts = ('m3u8',) if is_live else ('webm', 'mp4')
+
+ urls = []
formats = []
- for s in info['streams']:
- tbr = int_or_none(s.get('bitrate'))
- format_info = {
- 'width': int_or_none(s.get('width')),
- 'height': int_or_none(s.get('height')),
- 'tbr': tbr,
- }
-
- host = s['host']
- path = s['path']
- if host.startswith('rtmp'):
- fmt = 'rtmp'
- format_info.update({
- 'url': host,
- 'play_path': path,
- 'ext': 'flv',
- })
- else:
- if s.get('format') == 'm3u8_playlist':
- fmt = 'hls'
- format_info.update({
- 'protocol': 'm3u8_native',
- 'ext': 'mp4',
- })
- else:
- fmt = format_info['ext'] = determine_ext(path)
- format_url = compat_urlparse.urljoin(host, path)
- format_info['url'] = format_url
- format_info['format_id'] = fmt + ('-%d' % tbr if tbr else '')
- formats.append(format_info)
-
- closed_captions = self._html_search_regex(
- r'"closedcaptions":(\[[^\]]+\])', webpage, 'closed captions',
- default='[]')
-
- cc_json = self._parse_json(closed_captions, video_id, fatal=False)
subtitles = {}
- if cc_json:
- for closed_caption in cc_json:
- lang = closed_caption['lang']
- if lang not in subtitles:
- subtitles[lang] = []
- subtitles[lang].append({
- 'url': closed_caption['url'],
- 'ext': mimetype2ext(closed_caption['content_type']),
+ for fmt in fmts:
+ media_obj = self._download_json(
+ 'https://video-api.yql.yahoo.com/v1/video/sapi/streams/' + video_id,
+ video_id, 'Downloading %s JSON metadata' % fmt,
+ headers=self.geo_verification_headers(), query={
+ 'format': fmt,
+ 'region': country.upper(),
+ })['query']['results']['mediaObj'][0]
+ msg = media_obj.get('status', {}).get('msg')
+
+ for s in media_obj.get('streams', []):
+ host = s.get('host')
+ path = s.get('path')
+ if not host or not path:
+ continue
+ s_url = host + path
+ if s.get('format') == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ s_url, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ continue
+ tbr = int_or_none(s.get('bitrate'))
+ formats.append({
+ 'url': s_url,
+ 'format_id': fmt + ('-%d' % tbr if tbr else ''),
+ 'width': int_or_none(s.get('width')),
+ 'height': int_or_none(s.get('height')),
+ 'tbr': tbr,
+ 'fps': int_or_none(s.get('framerate')),
+ })
+
+ for cc in media_obj.get('closedcaptions', []):
+ cc_url = cc.get('url')
+ if not cc_url or cc_url in urls:
+ continue
+ urls.append(cc_url)
+ subtitles.setdefault(cc.get('lang') or 'en-US', []).append({
+ 'url': cc_url,
+ 'ext': mimetype2ext(cc.get('content_type')),
})
+ streaming_url = video.get('streaming_url')
+ if streaming_url and not is_live:
+ formats.extend(self._extract_m3u8_formats(
+ streaming_url, video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False))
+
+ if not formats and msg == 'geo restricted':
+ self.raise_geo_restricted()
+
+ self._sort_formats(formats)
+
+ thumbnails = []
+ for thumb in video.get('thumbnails', []):
+ thumb_url = thumb.get('url')
+ if not thumb_url:
+ continue
+ thumbnails.append({
+ 'id': thumb.get('tag'),
+ 'url': thumb.get('url'),
+ 'width': int_or_none(thumb.get('width')),
+ 'height': int_or_none(thumb.get('height')),
+ })
+
+ series_info = video.get('series_info') or {}
+
return {
'id': video_id,
- 'display_id': display_id,
- 'title': unescapeHTML(meta['title']),
+ 'title': self._live_title(title) if is_live else title,
'formats': formats,
- 'description': clean_html(meta['description']),
- 'thumbnail': meta['thumbnail'] if meta.get('thumbnail') else self._og_search_thumbnail(webpage),
- 'duration': int_or_none(meta.get('duration')),
+ 'display_id': display_id,
+ 'thumbnails': thumbnails,
+ 'description': clean_html(video.get('description')),
+ 'timestamp': parse_iso8601(video.get('publish_time')),
'subtitles': subtitles,
+ 'duration': int_or_none(video.get('duration')),
+ 'view_count': int_or_none(video.get('view_count')),
+ 'is_live': is_live,
+ 'series': video.get('show_name'),
+ 'season_number': int_or_none(series_info.get('season_number')),
+ 'episode_number': int_or_none(series_info.get('episode_number')),
}
- def _get_info(self, video_id, display_id, webpage):
- region = self._search_regex(
- r'\\?"region\\?"\s*:\s*\\?"([^"]+?)\\?"',
- webpage, 'region', fatal=False, default='US').upper()
- formats = []
- info = {}
- for fmt in ('webm', 'mp4'):
- query_result = self._download_json(
- 'https://video.media.yql.yahoo.com/v1/video/sapi/streams/' + video_id,
- display_id, 'Downloading %s video info' % fmt, query={
- 'protocol': 'http',
- 'region': region,
- 'format': fmt,
- })
- info = self._extract_info(display_id, query_result, webpage)
- formats.extend(info['formats'])
- formats.extend(self._extract_m3u8_formats(
- 'http://video.media.yql.yahoo.com/v1/hls/%s?region=%s' % (video_id, region),
- video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
- self._sort_formats(formats)
- info['formats'] = formats
- return info
-
class YahooSearchIE(SearchInfoExtractor):
IE_DESC = 'Yahoo screen search'
@@ -523,7 +383,7 @@ class YahooGyaOPlayerIE(InfoExtractor):
'id': video_id,
'title': video['title'],
'url': smuggle_url(
- 'http://players.brightcove.net/4235717419001/default_default/index.html?videoId=' + video['videoId'],
+ 'http://players.brightcove.net/4235717419001/SyG5P0gjb_default/index.html?videoId=' + video['videoId'],
{'geo_countries': ['JP']}),
'description': video.get('longDescription'),
'ie_key': BrightcoveNewIE.ie_key(),
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 5e397324b..b913d07a6 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -69,7 +69,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
- _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)[0-9A-Za-z-_]{10,}'
+ _PLAYLIST_ID_RE = r'(?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)[0-9A-Za-z-_]{10,}'
def _set_language(self):
self._set_cookie(
@@ -372,7 +372,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
_VALID_URL = r"""(?x)^
(
(?:https?://|//) # http(s):// or protocol-independent URL
- (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
+ (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie|kids)?\.com/|
(?:www\.)?deturl\.com/www\.youtube\.com/|
(?:www\.)?pwnyoutube\.com/|
(?:www\.)?hooktube\.com/|
@@ -1224,6 +1224,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'skip_download': True,
},
},
+ {
+ 'url': 'https://www.youtubekids.com/watch?v=3b8nCWDgZ6Q',
+ 'only_matching': True,
+ },
]
def __init__(self, *args, **kwargs):
@@ -2465,7 +2469,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
(?:\w+\.)?
(?:
(?:
- youtube\.com|
+ youtube(?:kids)?\.com|
invidio\.us
)
/
@@ -2477,7 +2481,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
youtu\.be/[0-9A-Za-z_-]{11}\?.*?\blist=
)
(
- (?:PL|LL|EC|UU|FL|RD|UL|TL|OLAK5uy_)?[0-9A-Za-z-_]{10,}
+ (?:PL|LL|EC|UU|FL|RD|UL|TL|PU|OLAK5uy_)?[0-9A-Za-z-_]{10,}
# Top tracks, they can also include dots
|(?:MC)[\w\.]*
)
@@ -2647,6 +2651,9 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
}, {
'url': 'https://invidio.us/playlist?list=PLDIoUOhQQPlXr63I_vwF9GD8sAKh77dWU',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtubekids.com/watch?v=Agk7R8I8o5U&list=PUZ6jURNr1WQZCNHF0ao-c0g',
+ 'only_matching': True,
}]
def _real_initialize(self):
@@ -2817,7 +2824,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
IE_DESC = 'YouTube.com channels'
- _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
+ _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie|kids)?\.com|(?:www\.)?invidio\.us)/channel/(?P<id>[0-9A-Za-z_-]+)'
_TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
_VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
IE_NAME = 'youtube:channel'
@@ -2845,6 +2852,9 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
}, {
'url': 'https://invidio.us/channel/UC23qupoDRn9YOAVzeoxjOQA',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youtubekids.com/channel/UCyu8StPfZWapR6rfW_JgqcA',
+ 'only_matching': True,
}]
@classmethod
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 53117ea90..f6204692a 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -46,6 +46,7 @@ from .compat import (
compat_html_entities,
compat_html_entities_html5,
compat_http_client,
+ compat_integer_types,
compat_kwargs,
compat_os_name,
compat_parse_qs,
@@ -1718,13 +1719,16 @@ DATE_FORMATS = (
'%B %d %Y',
'%B %dst %Y',
'%B %dnd %Y',
+ '%B %drd %Y',
'%B %dth %Y',
'%b %d %Y',
'%b %dst %Y',
'%b %dnd %Y',
+ '%b %drd %Y',
'%b %dth %Y',
'%b %dst %Y %I:%M',
'%b %dnd %Y %I:%M',
+ '%b %drd %Y %I:%M',
'%b %dth %Y %I:%M',
'%Y %m %d',
'%Y-%m-%d',
@@ -3516,10 +3520,11 @@ def str_or_none(v, default=None):
def str_to_int(int_str):
""" A more relaxed version of int_or_none """
- if int_str is None:
- return None
- int_str = re.sub(r'[,\.\+]', '', int_str)
- return int(int_str)
+ if isinstance(int_str, compat_integer_types):
+ return int_str
+ elif isinstance(int_str, compat_str):
+ int_str = re.sub(r'[,\.\+]', '', int_str)
+ return int_or_none(int_str)
def float_or_none(v, scale=1, invscale=1, default=None):
@@ -4979,7 +4984,7 @@ class ISO3166Utils(object):
class GeoUtils(object):
# Major IPv4 address blocks per country
_country_ip_map = {
- 'AD': '85.94.160.0/19',
+ 'AD': '46.172.224.0/19',
'AE': '94.200.0.0/13',
'AF': '149.54.0.0/17',
'AG': '209.59.64.0/18',
@@ -4987,28 +4992,30 @@ class GeoUtils(object):
'AL': '46.99.0.0/16',
'AM': '46.70.0.0/15',
'AO': '105.168.0.0/13',
- 'AP': '159.117.192.0/21',
+ 'AP': '182.50.184.0/21',
+ 'AQ': '23.154.160.0/24',
'AR': '181.0.0.0/12',
'AS': '202.70.112.0/20',
- 'AT': '84.112.0.0/13',
+ 'AT': '77.116.0.0/14',
'AU': '1.128.0.0/11',
'AW': '181.41.0.0/18',
- 'AZ': '5.191.0.0/16',
+ 'AX': '185.217.4.0/22',
+ 'AZ': '5.197.0.0/16',
'BA': '31.176.128.0/17',
'BB': '65.48.128.0/17',
'BD': '114.130.0.0/16',
'BE': '57.0.0.0/8',
- 'BF': '129.45.128.0/17',
+ 'BF': '102.178.0.0/15',
'BG': '95.42.0.0/15',
'BH': '37.131.0.0/17',
'BI': '154.117.192.0/18',
'BJ': '137.255.0.0/16',
- 'BL': '192.131.134.0/24',
+ 'BL': '185.212.72.0/23',
'BM': '196.12.64.0/18',
'BN': '156.31.0.0/16',
'BO': '161.56.0.0/16',
'BQ': '161.0.80.0/20',
- 'BR': '152.240.0.0/12',
+ 'BR': '191.128.0.0/12',
'BS': '24.51.64.0/18',
'BT': '119.2.96.0/19',
'BW': '168.167.0.0/16',
@@ -5016,20 +5023,20 @@ class GeoUtils(object):
'BZ': '179.42.192.0/18',
'CA': '99.224.0.0/11',
'CD': '41.243.0.0/16',
- 'CF': '196.32.200.0/21',
- 'CG': '197.214.128.0/17',
+ 'CF': '197.242.176.0/21',
+ 'CG': '160.113.0.0/16',
'CH': '85.0.0.0/13',
- 'CI': '154.232.0.0/14',
+ 'CI': '102.136.0.0/14',
'CK': '202.65.32.0/19',
'CL': '152.172.0.0/14',
- 'CM': '165.210.0.0/15',
+ 'CM': '102.244.0.0/14',
'CN': '36.128.0.0/10',
'CO': '181.240.0.0/12',
'CR': '201.192.0.0/12',
'CU': '152.206.0.0/15',
'CV': '165.90.96.0/19',
'CW': '190.88.128.0/17',
- 'CY': '46.198.0.0/15',
+ 'CY': '31.153.0.0/16',
'CZ': '88.100.0.0/14',
'DE': '53.0.0.0/8',
'DJ': '197.241.0.0/17',
@@ -5046,6 +5053,7 @@ class GeoUtils(object):
'EU': '2.16.0.0/13',
'FI': '91.152.0.0/13',
'FJ': '144.120.0.0/16',
+ 'FK': '80.73.208.0/21',
'FM': '119.252.112.0/20',
'FO': '88.85.32.0/19',
'FR': '90.0.0.0/9',
@@ -5055,8 +5063,8 @@ class GeoUtils(object):
'GE': '31.146.0.0/16',
'GF': '161.22.64.0/18',
'GG': '62.68.160.0/19',
- 'GH': '45.208.0.0/14',
- 'GI': '85.115.128.0/19',
+ 'GH': '154.160.0.0/12',
+ 'GI': '95.164.0.0/16',
'GL': '88.83.0.0/19',
'GM': '160.182.0.0/15',
'GN': '197.149.192.0/18',
@@ -5085,13 +5093,13 @@ class GeoUtils(object):
'JE': '87.244.64.0/18',
'JM': '72.27.0.0/17',
'JO': '176.29.0.0/16',
- 'JP': '126.0.0.0/8',
+ 'JP': '133.0.0.0/8',
'KE': '105.48.0.0/12',
'KG': '158.181.128.0/17',
'KH': '36.37.128.0/17',
'KI': '103.25.140.0/22',
'KM': '197.255.224.0/20',
- 'KN': '198.32.32.0/19',
+ 'KN': '198.167.192.0/19',
'KP': '175.45.176.0/22',
'KR': '175.192.0.0/10',
'KW': '37.36.0.0/14',
@@ -5099,10 +5107,10 @@ class GeoUtils(object):
'KZ': '2.72.0.0/13',
'LA': '115.84.64.0/18',
'LB': '178.135.0.0/16',
- 'LC': '192.147.231.0/24',
+ 'LC': '24.92.144.0/20',
'LI': '82.117.0.0/19',
'LK': '112.134.0.0/15',
- 'LR': '41.86.0.0/19',
+ 'LR': '102.183.0.0/16',
'LS': '129.232.0.0/17',
'LT': '78.56.0.0/13',
'LU': '188.42.0.0/16',
@@ -5127,7 +5135,7 @@ class GeoUtils(object):
'MT': '46.11.0.0/16',
'MU': '105.16.0.0/12',
'MV': '27.114.128.0/18',
- 'MW': '105.234.0.0/16',
+ 'MW': '102.70.0.0/15',
'MX': '187.192.0.0/11',
'MY': '175.136.0.0/13',
'MZ': '197.218.0.0/15',
@@ -5158,23 +5166,23 @@ class GeoUtils(object):
'PW': '202.124.224.0/20',
'PY': '181.120.0.0/14',
'QA': '37.210.0.0/15',
- 'RE': '139.26.0.0/16',
+ 'RE': '102.35.0.0/16',
'RO': '79.112.0.0/13',
- 'RS': '178.220.0.0/14',
+ 'RS': '93.86.0.0/15',
'RU': '5.136.0.0/13',
- 'RW': '105.178.0.0/15',
+ 'RW': '41.186.0.0/16',
'SA': '188.48.0.0/13',
'SB': '202.1.160.0/19',
'SC': '154.192.0.0/11',
- 'SD': '154.96.0.0/13',
+ 'SD': '102.120.0.0/13',
'SE': '78.64.0.0/12',
- 'SG': '152.56.0.0/14',
+ 'SG': '8.128.0.0/10',
'SI': '188.196.0.0/14',
'SK': '78.98.0.0/15',
- 'SL': '197.215.0.0/17',
+ 'SL': '102.143.0.0/17',
'SM': '89.186.32.0/19',
'SN': '41.82.0.0/15',
- 'SO': '197.220.64.0/19',
+ 'SO': '154.115.192.0/18',
'SR': '186.179.128.0/17',
'SS': '105.235.208.0/21',
'ST': '197.159.160.0/19',
@@ -5197,15 +5205,15 @@ class GeoUtils(object):
'TV': '202.2.96.0/19',
'TW': '120.96.0.0/11',
'TZ': '156.156.0.0/14',
- 'UA': '93.72.0.0/13',
- 'UG': '154.224.0.0/13',
- 'US': '3.0.0.0/8',
+ 'UA': '37.52.0.0/14',
+ 'UG': '102.80.0.0/13',
+ 'US': '6.0.0.0/8',
'UY': '167.56.0.0/13',
- 'UZ': '82.215.64.0/18',
+ 'UZ': '84.54.64.0/18',
'VA': '212.77.0.0/19',
- 'VC': '24.92.144.0/20',
+ 'VC': '207.191.240.0/21',
'VE': '186.88.0.0/13',
- 'VG': '172.103.64.0/18',
+ 'VG': '66.81.192.0/20',
'VI': '146.226.0.0/16',
'VN': '14.160.0.0/11',
'VU': '202.80.32.0/20',
@@ -5214,8 +5222,8 @@ class GeoUtils(object):
'YE': '134.35.0.0/16',
'YT': '41.242.116.0/22',
'ZA': '41.0.0.0/11',
- 'ZM': '165.56.0.0/13',
- 'ZW': '41.85.192.0/19',
+ 'ZM': '102.144.0.0/13',
+ 'ZW': '102.177.192.0/18',
}
@classmethod
@@ -5377,6 +5385,19 @@ def decode_packed_codes(code):
obfucasted_code)
+def caesar(s, alphabet, shift):
+ if shift == 0:
+ return s
+ l = len(alphabet)
+ return ''.join(
+ alphabet[(alphabet.index(c) + shift) % l] if c in alphabet else c
+ for c in s)
+
+
+def rot47(s):
+ return caesar(s, r'''!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~''', 47)
+
+
def parse_m3u8_attributes(attrib):
info = {}
for (key, val) in re.findall(r'(?P<key>[A-Z0-9-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)', attrib):
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 39b355b9e..1227abc0a 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2019.10.22'
+__version__ = '2019.11.28'