aboutsummaryrefslogtreecommitdiffstats
path: root/hypervideo_dl/extractor/prx.py
diff options
context:
space:
mode:
Diffstat (limited to 'hypervideo_dl/extractor/prx.py')
-rw-r--r--hypervideo_dl/extractor/prx.py431
1 files changed, 431 insertions, 0 deletions
diff --git a/hypervideo_dl/extractor/prx.py b/hypervideo_dl/extractor/prx.py
new file mode 100644
index 0000000..80561b8
--- /dev/null
+++ b/hypervideo_dl/extractor/prx.py
@@ -0,0 +1,431 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
import itertools

from .common import InfoExtractor, SearchInfoExtractor
from ..utils import (
    ExtractorError,
    urljoin,
    traverse_obj,
    int_or_none,
    mimetype2ext,
    clean_html,
    url_or_none,
    unified_timestamp,
    str_or_none,
)
+
+
class PRXBaseIE(InfoExtractor):
    """Shared helpers for the PRX extractors.

    Provides the CMS API call wrapper, converters from API response objects
    (image, account, series, story) to info-dict fragments, and a generator
    that walks paginated list endpoints.
    """

    # Template for _VALID_URL; '%s' is replaced with the resource path pattern.
    # The dot in the host name is escaped so it only matches a literal '.'.
    PRX_BASE_URL_RE = r'https?://(?:(?:beta|listen)\.)?prx\.org/%s'

    def _call_api(self, item_id, path, query=None, fatal=True, note='Downloading CMS API JSON'):
        """Download JSON from *path*, resolved against the CMS API v1 base URL."""
        return self._download_json(
            urljoin('https://cms.prx.org/api/v1/', path), item_id, query=query, fatal=fatal, note=note)

    @staticmethod
    def _get_prx_embed_response(response, section):
        """Return the object stored under ``_embedded`` -> ``prx:<section>``, if any."""
        return traverse_obj(response, ('_embedded', f'prx:{section}'))

    @staticmethod
    def _extract_file_link(response):
        """Return the ``_links`` -> ``enclosure`` -> ``href`` URL, or None if absent/invalid."""
        return url_or_none(traverse_obj(
            response, ('_links', 'enclosure', 'href'), expected_type=str))

    @classmethod
    def _extract_image(cls, image_response):
        """Build a thumbnail dict from an image object.

        Returns None when *image_response* is not a dict or carries no usable
        enclosure URL, so callers never receive a thumbnail without a 'url'.
        """
        if not isinstance(image_response, dict):
            return
        image_url = cls._extract_file_link(image_response)
        if not image_url:
            return
        return {
            'id': str_or_none(image_response.get('id')),
            'filesize': int_or_none(image_response.get('size')),
            'width': int_or_none(image_response.get('width')),
            'height': int_or_none(image_response.get('height')),
            'url': image_url,
        }

    @classmethod
    def _extract_base_info(cls, response):
        """Extract the fields common to stories, series and accounts.

        Returns None when *response* is not a dict or has no 'id'.
        """
        if not isinstance(response, dict):
            return
        item_id = str_or_none(response.get('id'))
        if not item_id:
            return
        thumbnail_dict = cls._extract_image(cls._get_prx_embed_response(response, 'image'))
        # Prefer the (HTML-stripped) full description, fall back to the short one
        description = (
            clean_html(response.get('description'))
            or response.get('shortDescription'))
        return {
            'id': item_id,
            'title': response.get('title') or item_id,
            'thumbnails': [thumbnail_dict] if thumbnail_dict else None,
            'description': description,
            'release_timestamp': unified_timestamp(response.get('releasedAt')),
            'timestamp': unified_timestamp(response.get('createdAt')),
            'modified_timestamp': unified_timestamp(response.get('updatedAt')),
            'duration': int_or_none(response.get('duration')),
            'tags': response.get('tags'),
            'episode_number': int_or_none(response.get('episodeIdentifier')),
            'season_number': int_or_none(response.get('seasonIdentifier'))
        }

    @classmethod
    def _extract_series_info(cls, series_response):
        """Extract series metadata plus the channel fields of its embedded account.

        Returns None when the base info cannot be extracted.
        """
        base_info = cls._extract_base_info(series_response)
        if not base_info:
            return
        account_info = cls._extract_account_info(
            cls._get_prx_embed_response(series_response, 'account')) or {}
        return {
            **base_info,
            'channel_id': account_info.get('channel_id'),
            'channel_url': account_info.get('channel_url'),
            'channel': account_info.get('channel'),
            'series': base_info.get('title'),
            'series_id': base_info.get('id'),
        }

    @classmethod
    def _extract_account_info(cls, account_response):
        """Extract account metadata, exposing the account as the channel.

        Returns None when the base info cannot be extracted.
        """
        base_info = cls._extract_base_info(account_response)
        if not base_info:
            return
        name = account_response.get('name')
        return {
            **base_info,
            'title': name,
            'channel_id': base_info.get('id'),
            'channel_url': 'https://beta.prx.org/accounts/%s' % base_info.get('id'),
            'channel': name,
        }

    @classmethod
    def _extract_story_info(cls, story_response):
        """Extract story metadata plus series/channel fields from embedded objects.

        Returns None when the base info cannot be extracted.
        """
        base_info = cls._extract_base_info(story_response)
        if not base_info:
            return
        series = cls._extract_series_info(
            cls._get_prx_embed_response(story_response, 'series')) or {}
        account = cls._extract_account_info(
            cls._get_prx_embed_response(story_response, 'account')) or {}
        return {
            **base_info,
            'series': series.get('series'),
            'series_id': series.get('series_id'),
            'channel_id': account.get('channel_id'),
            'channel_url': account.get('channel_url'),
            'channel': account.get('channel')
        }

    def _entries(self, item_id, endpoint, entry_func, query=None):
        """
        Extract entries from paginated list API
        @param item_id: Identifier used in the per-page download messages
        @param endpoint: API path of the list endpoint
        @param entry_func: Function to generate entry from response item
        @param query: Extra query parameters sent with every page request
        """
        fetched = 0
        for page in itertools.count(1):
            response = self._call_api(f'{item_id}: page {page}', endpoint, query={
                **(query or {}),
                'page': page,
                'per': 100
            })
            items = self._get_prx_embed_response(response, 'items')
            # A missing/empty response yields no items, so one check suffices
            if not items:
                break

            yield from filter(None, map(entry_func, items))

            # Tolerate responses lacking 'count'/'total' instead of raising
            # KeyError: count the items of this page, and when the total is
            # unknown keep paging until an empty page is returned.
            fetched += int_or_none(response.get('count')) or len(items)
            expected = int_or_none(response.get('total'))
            if expected is not None and fetched >= expected:
                break

    def _story_playlist_entry(self, response):
        """Turn a story list item into a url-type entry handled by PRXStoryIE."""
        story = self._extract_story_info(response)
        if not story:
            return
        story.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/stories/%s' % story['id'],
            'ie_key': PRXStoryIE.ie_key()
        })
        return story

    def _series_playlist_entry(self, response):
        """Turn a series list item into a url-type entry handled by PRXSeriesIE."""
        series = self._extract_series_info(response)
        if not series:
            return
        series.update({
            '_type': 'url',
            'url': 'https://beta.prx.org/series/%s' % series['id'],
            'ie_key': PRXSeriesIE.ie_key()
        })
        return series
+
+
class PRXStoryIE(PRXBaseIE):
    """Extractor for a single PRX story (``/stories/<id>``).

    A story with exactly one audio piece is returned as a single item whose
    formats list holds that piece; otherwise each piece becomes its own entry
    of a ``multi_video`` result.
    """

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'stories/(?P<id>\d+)'

    _TESTS = [
        {
            # Story with season and episode details
            'url': 'https://beta.prx.org/stories/399200',
            'info_dict': {
                'id': '399200',
                'title': 'Fly Me To The Moon',
                'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                'release_timestamp': 1640250000,
                'timestamp': 1640208972,
                'modified_timestamp': 1641318202,
                'duration': 1004,
                'tags': 'count:7',
                'episode_number': 8,
                'season_number': 5,
                'series': 'AirSpace',
                'series_id': '38057',
                'channel_id': '220986',
                'channel_url': 'https://beta.prx.org/accounts/220986',
                'channel': 'Air and Space Museum',
            },
            'playlist': [{
                'info_dict': {
                    'id': '399200_part1',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 530,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }, {
                'info_dict': {
                    'id': '399200_part2',
                    'title': 'Fly Me To The Moon',
                    'description': 'md5:43230168390b95d3322048d8a56bf2bb',
                    'release_timestamp': 1640250000,
                    'timestamp': 1640208972,
                    'modified_timestamp': 1641318202,
                    'duration': 474,
                    'tags': 'count:7',
                    'episode_number': 8,
                    'season_number': 5,
                    'series': 'AirSpace',
                    'series_id': '38057',
                    'channel_id': '220986',
                    'channel_url': 'https://beta.prx.org/accounts/220986',
                    'channel': 'Air and Space Museum',
                    'ext': 'mp3',
                    'upload_date': '20211222',
                    'episode': 'Episode 8',
                    'release_date': '20211223',
                    'season': 'Season 5',
                    'modified_date': '20220104'
                }
            }

            ]
        }, {
            # Story with only split audio
            'url': 'https://beta.prx.org/stories/326414',
            'info_dict': {
                'id': '326414',
                'title': 'Massachusetts v EPA',
                'description': 'md5:744fffba08f19f4deab69fa8d49d5816',
                'timestamp': 1592509124,
                'modified_timestamp': 1592510457,
                'duration': 3088,
                'tags': 'count:0',
                'series': 'Outside/In',
                'series_id': '36252',
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
            },
            'playlist_count': 4
        }, {
            # Story with single combined audio
            'url': 'https://beta.prx.org/stories/400404',
            'info_dict': {
                'id': '400404',
                'title': 'Cafe Chill (Episode 2022-01)',
                'thumbnails': 'count:1',
                'description': 'md5:9f1b5a3cbd64fb159d08c3baa31f1539',
                'timestamp': 1641233952,
                'modified_timestamp': 1641234248,
                'duration': 3540,
                'series': 'Café Chill',
                'series_id': '37762',
                'channel_id': '5767',
                'channel_url': 'https://beta.prx.org/accounts/5767',
                'channel': 'C89.5 - KNHC Seattle',
                'ext': 'mp3',
                'tags': 'count:0',
                'thumbnail': r're:https?://cms\.prx\.org/pub/\w+/0/web/story_image/767965/medium/Aurora_Over_Trees\.jpg',
                'upload_date': '20220103',
                'modified_date': '20220103'
            }
        }, {
            'url': 'https://listen.prx.org/stories/399200',
            'only_matching': True
        }
    ]

    def _extract_audio_pieces(self, audio_response):
        """Build one audio-only format dict per audio piece, ordered by position.

        Pieces without a usable enclosure URL are skipped, since a format
        without a 'url' cannot be downloaded.
        """
        pieces = self._get_prx_embed_response(audio_response, 'items') or []

        def position_of(piece):
            # A missing/invalid position would yield None, and comparing None
            # with int raises TypeError while sorting; such pieces sort as 0.
            return int_or_none(piece.get('position')) or 0

        formats = []
        for piece_response in sorted(pieces, key=position_of):
            file_url = self._extract_file_link(piece_response)
            if not file_url:
                continue
            formats.append({
                'format_id': str_or_none(piece_response.get('id')),
                'format_note': str_or_none(piece_response.get('label')),
                'filesize': int_or_none(piece_response.get('size')),
                'duration': int_or_none(piece_response.get('duration')),
                'ext': mimetype2ext(piece_response.get('contentType')),
                'asr': int_or_none(piece_response.get('frequency'), scale=1000),
                'abr': int_or_none(piece_response.get('bitRate')),
                'url': file_url,
                'vcodec': 'none'
            })
        return formats

    def _extract_story(self, story_response):
        """Convert a story API response into an info dict.

        Returns None when no story metadata can be extracted.
        """
        info = self._extract_story_info(story_response)
        if not info:
            return
        audio_pieces = self._extract_audio_pieces(
            self._get_prx_embed_response(story_response, 'audio'))
        # A single piece is the whole story: return it as a plain item
        if len(audio_pieces) == 1:
            return {
                'formats': audio_pieces,
                **info
            }

        # Otherwise every piece becomes a separately identified part
        entries = [{
            **info,
            'id': '%s_part%d' % (info['id'], (idx + 1)),
            'formats': [fmt],
        } for idx, fmt in enumerate(audio_pieces)]
        return {
            '_type': 'multi_video',
            'entries': entries,
            **info
        }

    def _real_extract(self, url):
        """Fetch the story from the CMS API and return its info dict.

        Raises ExtractorError when the API response holds no story metadata,
        rather than handing None back to the caller.
        """
        story_id = self._match_id(url)
        response = self._call_api(story_id, f'stories/{story_id}')
        story = self._extract_story(response)
        if not story:
            raise ExtractorError('Unable to extract story information from API response')
        return story
+
+
class PRXSeriesIE(PRXBaseIE):
    """Extractor for a PRX series (``/series/<id>``), returned as a playlist of its stories."""

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'series/(?P<id>\d+)'
    _TESTS = [
        {
            'url': 'https://beta.prx.org/series/36252',
            'info_dict': {
                'id': '36252',
                'title': 'Outside/In',
                'thumbnails': 'count:1',
                'description': 'md5:a6bedc5f810777bcb09ab30ff9059114',
                'timestamp': 1470684964,
                'modified_timestamp': 1582308830,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': 'Outside/In',
                'series_id': '36252'
            },
            'playlist_mincount': 39
        }, {
            # Blank series
            'url': 'https://beta.prx.org/series/25038',
            'info_dict': {
                'id': '25038',
                'title': '25038',
                'timestamp': 1207612800,
                'modified_timestamp': 1207612800,
                'channel_id': '206',
                'channel_url': 'https://beta.prx.org/accounts/206',
                'channel': 'New Hampshire Public Radio',
                'series': '25038',
                'series_id': '25038'
            },
            'playlist_count': 0
        }
    ]

    def _extract_series(self, series_response):
        """Convert a series API response into a playlist of its stories.

        The entries are produced lazily from the paginated
        ``series/<id>/stories`` endpoint.

        Raises ExtractorError when no series metadata can be extracted;
        without this guard the code below would fail with a TypeError when
        subscripting None.
        """
        info = self._extract_series_info(series_response)
        if not info:
            raise ExtractorError('Unable to extract series information from API response')
        series_id = info['id']
        return {
            '_type': 'playlist',
            'entries': self._entries(series_id, f'series/{series_id}/stories', self._story_playlist_entry),
            **info
        }

    def _real_extract(self, url):
        """Fetch the series from the CMS API and return its playlist info dict."""
        series_id = self._match_id(url)
        response = self._call_api(series_id, f'series/{series_id}')
        return self._extract_series(response)
+
+
class PRXAccountIE(PRXBaseIE):
    """Extractor for a PRX account (``/accounts/<id>``).

    The resulting playlist lists the account's series first, followed by its
    stories.
    """

    _VALID_URL = PRXBaseIE.PRX_BASE_URL_RE % r'accounts/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://beta.prx.org/accounts/206',
        'info_dict': {
            'id': '206',
            'title': 'New Hampshire Public Radio',
            'description': 'md5:277f2395301d0aca563c80c70a18ee0a',
            'channel_id': '206',
            'channel_url': 'https://beta.prx.org/accounts/206',
            'channel': 'New Hampshire Public Radio',
            'thumbnails': 'count:1'
        },
        'playlist_mincount': 380
    }]

    def _extract_account(self, account_response):
        """Convert an account API response into a playlist of series and stories.

        Both entry sources are lazy generators over paginated endpoints and
        are chained, so stories are only requested once the series listing is
        exhausted.

        Raises ExtractorError when no account metadata can be extracted;
        without this guard the code below would fail with a TypeError when
        subscripting None.
        """
        info = self._extract_account_info(account_response)
        if not info:
            raise ExtractorError('Unable to extract account information from API response')
        account_id = info['id']
        series = self._entries(
            account_id, f'accounts/{account_id}/series', self._series_playlist_entry)
        stories = self._entries(
            account_id, f'accounts/{account_id}/stories', self._story_playlist_entry)
        return {
            '_type': 'playlist',
            'entries': itertools.chain(series, stories),
            **info
        }

    def _real_extract(self, url):
        """Fetch the account from the CMS API and return its playlist info dict."""
        account_id = self._match_id(url)
        response = self._call_api(account_id, f'accounts/{account_id}')
        return self._extract_account(response)
+
+
class PRXStoriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor that queries the ``stories/search`` API endpoint."""

    IE_DESC = 'PRX Stories Search'
    IE_NAME = 'prxstories:search'
    _SEARCH_KEY = 'prxstories'

    def _search_results(self, query):
        """Yield a url-type story entry for every search hit of *query*."""
        page_label = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            page_label, 'stories/search', self._story_playlist_entry, query=search_params)
+
+
class PRXSeriesSearchIE(PRXBaseIE, SearchInfoExtractor):
    """Search extractor that queries the ``series/search`` API endpoint."""

    IE_DESC = 'PRX Series Search'
    IE_NAME = 'prxseries:search'
    _SEARCH_KEY = 'prxseries'

    def _search_results(self, query):
        """Yield a url-type series entry for every search hit of *query*."""
        page_label = f'query {query}'
        search_params = {'q': query}
        yield from self._entries(
            page_label, 'series/search', self._series_playlist_entry, query=search_params)