diff options
Diffstat (limited to 'youtube_dl/extractor/voicerepublic.py')
-rw-r--r-- | youtube_dl/extractor/voicerepublic.py | 76 |
1 files changed, 57 insertions, 19 deletions
diff --git a/youtube_dl/extractor/voicerepublic.py b/youtube_dl/extractor/voicerepublic.py index a52e40afa..59e1359c4 100644 --- a/youtube_dl/extractor/voicerepublic.py +++ b/youtube_dl/extractor/voicerepublic.py @@ -1,12 +1,17 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( ExtractorError, determine_ext, int_or_none, - urljoin, + sanitized_Request, ) @@ -21,7 +26,8 @@ class VoiceRepublicIE(InfoExtractor): 'ext': 'm4a', 'title': 'Watching the Watchers: Building a Sousveillance State', 'description': 'Secret surveillance programs have metadata too. The people and companies that operate secret surveillance programs can be surveilled.', - 'duration': 1556, + 'thumbnail': r're:^https?://.*\.(?:png|jpg)$', + 'duration': 1800, 'view_count': int, } }, { @@ -32,31 +38,63 @@ class VoiceRepublicIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + req = sanitized_Request( + compat_urlparse.urljoin(url, '/talks/%s' % display_id)) + # Older versions of Firefox get redirected to an "upgrade browser" page + req.add_header('User-Agent', 'youtube-dl') + webpage = self._download_webpage(req, display_id) if '>Queued for processing, please stand by...<' in webpage: raise ExtractorError( 'Audio is still queued for processing', expected=True) - talk = self._parse_json(self._search_regex( - r'initialSnapshot\s*=\s*({.+?});', - webpage, 'talk'), display_id)['talk'] - title = talk['title'] - formats = [{ - 'url': urljoin(url, talk_url), - 'format_id': format_id, - 'ext': determine_ext(talk_url) or format_id, - 'vcodec': 'none', - } for format_id, talk_url in talk['media_links'].items()] + config = self._search_regex( + r'(?s)return ({.+?});\s*\n', webpage, + 'data', default=None) + data = self._parse_json(config, display_id, fatal=False) if config else None + if data: + title = data['title'] + description = data.get('teaser') + talk_id = compat_str(data.get('talk_id') or display_id) + talk = data['talk'] + duration = int_or_none(talk.get('duration')) + formats = [{ + 'url': compat_urlparse.urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, + 'vcodec': 'none', + } for format_id, talk_url in talk['links'].items()] + else: + title = self._og_search_title(webpage) + description = self._html_search_regex( + r"(?s)<div class='talk-teaser'[^>]*>(.+?)</div>", + webpage, 'description', fatal=False) + talk_id = self._search_regex( + [r"id='jc-(\d+)'", r"data-shareable-id='(\d+)'"], + webpage, 'talk id', default=None) or display_id + duration = None + player = self._search_regex( + r"class='vr-player jp-jplayer'([^>]+)>", webpage, 'player') + formats = [{ + 'url': compat_urlparse.urljoin(url, talk_url), + 'format_id': format_id, + 'ext': determine_ext(talk_url) or format_id, + 'vcodec': 'none', + } for format_id, talk_url in re.findall(r"data-([^=]+)='([^']+)'", player)] self._sort_formats(formats) + thumbnail = self._og_search_thumbnail(webpage) + view_count = int_or_none(self._search_regex( + r"class='play-count[^']*'>\s*(\d+) plays", + webpage, 'play count', fatal=False)) + return { - 'id': compat_str(talk.get('id') or display_id), + 'id': talk_id, 'display_id': display_id, 'title': title, - 'description': talk.get('teaser'), - 'thumbnail': talk.get('image_url'), - 'duration': int_or_none(talk.get('archived_duration')), - 'view_count': int_or_none(talk.get('play_count')), + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, 'formats': formats, } |