diff options
Diffstat (limited to 'youtube_dlc/extractor/storyfire.py')
-rw-r--r-- | youtube_dlc/extractor/storyfire.py | 314 |
1 files changed, 105 insertions, 209 deletions
diff --git a/youtube_dlc/extractor/storyfire.py b/youtube_dlc/extractor/storyfire.py index 19cb1ff9e..9c698626f 100644 --- a/youtube_dlc/extractor/storyfire.py +++ b/youtube_dlc/extractor/storyfire.py @@ -1,255 +1,151 @@ # coding: utf-8 from __future__ import unicode_literals -import itertools +import functools + from .common import InfoExtractor +from ..utils import ( + # HEADRequest, + int_or_none, + OnDemandPagedList, + smuggle_url, +) + + +class StoryFireBaseIE(InfoExtractor): + _VALID_URL_BASE = r'https?://(?:www\.)?storyfire\.com/' + + def _call_api(self, path, video_id, resource, query=None): + return self._download_json( + 'https://storyfire.com/app/%s/%s' % (path, video_id), video_id, + 'Downloading %s JSON metadata' % resource, query=query) + + def _parse_video(self, video): + title = video['title'] + vimeo_id = self._search_regex( + r'https?://player\.vimeo\.com/external/(\d+)', + video['vimeoVideoURL'], 'vimeo id') + + # video_url = self._request_webpage( + # HEADRequest(video['vimeoVideoURL']), video_id).geturl() + # formats = [] + # for v_url, suffix in [(video_url, '_sep'), (video_url.replace('/sep/video/', '/video/'), '')]: + # formats.extend(self._extract_m3u8_formats( + # v_url, video_id, 'mp4', 'm3u8_native', + # m3u8_id='hls' + suffix, fatal=False)) + # formats.extend(self._extract_mpd_formats( + # v_url.replace('.m3u8', '.mpd'), video_id, + # mpd_id='dash' + suffix, fatal=False)) + # self._sort_formats(formats) + uploader_id = video.get('hostID') -class StoryFireIE(InfoExtractor): - _VALID_URL = r'(?:(?:https?://(?:www\.)?storyfire\.com/video-details)|(?:https://storyfire.app.link))/(?P<id>[^/\s]+)' - _TESTS = [{ + return { + '_type': 'url_transparent', + 'id': vimeo_id, + 'title': title, + 'description': video.get('description'), + 'url': smuggle_url( + 'https://player.vimeo.com/video/' + vimeo_id, { + 'http_headers': { + 'Referer': 'https://storyfire.com/', + } + }), + # 'formats': formats, + 'thumbnail': video.get('storyImage'), + 'view_count': int_or_none(video.get('views')), + 'like_count': int_or_none(video.get('likesCount')), + 'comment_count': int_or_none(video.get('commentsCount')), + 'duration': int_or_none(video.get('videoDuration')), + 'timestamp': int_or_none(video.get('publishDate')), + 'uploader': video.get('username'), + 'uploader_id': uploader_id, + 'uploader_url': 'https://storyfire.com/user/%s/video' % uploader_id if uploader_id else None, + 'episode_number': int_or_none(video.get('episodeNumber') or video.get('episode_number')), + } + + +class StoryFireIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'video-details/(?P<id>[0-9a-f]{24})' + _TEST = { 'url': 'https://storyfire.com/video-details/5df1d132b6378700117f9181', - 'md5': '560953bfca81a69003cfa5e53ac8a920', + 'md5': 'caec54b9e4621186d6079c7ec100c1eb', 'info_dict': { - 'id': '5df1d132b6378700117f9181', + 'id': '378954662', 'ext': 'mp4', 'title': 'Buzzfeed Teaches You About Memes', 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', 'timestamp': 1576129028, - 'description': 'Mocking Buzzfeed\'s meme lesson. Reuploaded from YouTube because of their new policies', + 'description': 'md5:0b4e28021548e144bed69bb7539e62ea', 'uploader': 'whang!', 'upload_date': '20191212', + 'duration': 418, + 'view_count': int, + 'like_count': int, + 'comment_count': int, }, - 'params': {'format': 'bestvideo'} # There are no merged formats in the playlist. - }, { - 'url': 'https://storyfire.app.link/5GxAvWOQr8', # Alternate URL format, with unrelated short ID - 'md5': '7a2dc6d60c4889edfed459c620fe690d', - 'info_dict': { - 'id': '5f1e11ecd78a57b6c702001d', - 'ext': 'm4a', - 'title': 'Weird Nintendo Prototype Leaks', - 'description': 'A stream taking a look at some weird Nintendo Prototypes with Luigi in Mario 64 and weird Yoshis', - 'timestamp': 1595808576, - 'upload_date': '20200727', - 'uploader': 'whang!', - 'uploader_id': 'ntZAJFECERSgqHSxzonV5K2E89s1', + 'params': { + 'skip_download': True, }, - 'params': {'format': 'bestaudio'} # Verifying audio extraction - - }] - - _aformats = { - 'audio-medium-audio': {'acodec': 'aac', 'abr': 125, 'preference': -10}, - 'audio-high-audio': {'acodec': 'aac', 'abr': 254, 'preference': -1}, + 'expected_warnings': ['Unable to download JSON metadata'] } def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - # Extracting the json blob is mandatory to proceed with extraction. - jsontext = self._html_search_regex( - r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>', - webpage, 'json_data') - - json = self._parse_json(jsontext, video_id) - - # The currentVideo field in the json is mandatory - # because it contains the only link to the m3u playlist - video = json['props']['initialState']['video']['currentVideo'] - videourl = video['vimeoVideoURL'] # Video URL is mandatory - - # Extract other fields from the json in an error tolerant fashion - # ID may be incorrect (on short URL format), correct it. - parsed_id = video.get('_id') - if parsed_id: - video_id = parsed_id + video = self._call_api( + 'generic/video-detail', video_id, 'video')['video'] + return self._parse_video(video) - title = video.get('title') - description = video.get('description') - thumbnail = video.get('storyImage') - views = video.get('views') - likes = video.get('likesCount') - comments = video.get('commentsCount') - duration = video.get('videoDuration') - publishdate = video.get('publishDate') # Apparently epoch time, day only - - uploader = video.get('username') - uploader_id = video.get('hostID') - # Construct an uploader URL - uploader_url = None - if uploader_id: - uploader_url = "https://storyfire.com/user/%s/video" % uploader_id - - # Collect root playlist to determine formats - formats = self._extract_m3u8_formats( - videourl, video_id, 'mp4', 'm3u8_native') - - # Modify formats to fill in missing information about audio codecs - for format in formats: - aformat = self._aformats.get(format['format_id']) - if aformat: - format['acodec'] = aformat['acodec'] - format['abr'] = aformat['abr'] - format['quality'] = aformat['preference'] - format['ext'] = 'm4a' - - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'ext': "mp4", - 'url': videourl, - 'formats': formats, - - 'thumbnail': thumbnail, - 'view_count': views, - 'like_count': likes, - 'comment_count': comments, - 'duration': duration, - 'timestamp': publishdate, - - 'uploader': uploader, - 'uploader_id': uploader_id, - 'uploader_url': uploader_url, - - } - - -class StoryFireUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?storyfire\.com/user/(?P<id>[^/\s]+)/video' - _TESTS = [{ - 'url': 'https://storyfire.com/user/ntZAJFECERSgqHSxzonV5K2E89s1/video', - 'info_dict': { - 'id': 'ntZAJFECERSgqHSxzonV5K2E89s1', - 'title': 'whang!', - }, - 'playlist_mincount': 18 - }, { +class StoryFireUserIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'user/(?P<id>[^/]+)/video' + _TEST = { 'url': 'https://storyfire.com/user/UQ986nFxmAWIgnkZQ0ftVhq4nOk2/video', 'info_dict': { 'id': 'UQ986nFxmAWIgnkZQ0ftVhq4nOk2', - 'title': 'McJuggerNuggets', }, - 'playlist_mincount': 143 - - }] + 'playlist_mincount': 151, + } + _PAGE_SIZE = 20 - # Generator for fetching playlist items - def _enum_videos(self, baseurl, user_id, firstjson): - totalVideos = int(firstjson['videosCount']) - haveVideos = 0 - json = firstjson - - for page in itertools.count(1): - for video in json['videos']: - id = video['_id'] - url = "https://storyfire.com/video-details/%s" % id - haveVideos += 1 - yield { - '_type': 'url', - 'id': id, - 'url': url, - 'ie_key': 'StoryFire', - - 'title': video.get('title'), - 'description': video.get('description'), - 'view_count': video.get('views'), - 'comment_count': video.get('commentsCount'), - 'duration': video.get('videoDuration'), - 'timestamp': video.get('publishDate'), - } - # Are there more pages we could fetch? - if haveVideos < totalVideos: - pageurl = baseurl + ("%i" % haveVideos) - json = self._download_json(pageurl, user_id, - note='Downloading page %s' % page) - - # Are there any videos in the new json? - videos = json.get('videos') - if not videos or len(videos) == 0: - break # no videos - - else: - break # We have fetched all the videos, stop + def _fetch_page(self, user_id, page): + videos = self._call_api( + 'publicVideos', user_id, 'page %d' % (page + 1), { + 'skip': page * self._PAGE_SIZE, + })['videos'] + for video in videos: + yield self._parse_video(video) def _real_extract(self, url): user_id = self._match_id(url) + entries = OnDemandPagedList(functools.partial( + self._fetch_page, user_id), self._PAGE_SIZE) + return self.playlist_result(entries, user_id) - baseurl = "https://storyfire.com/app/publicVideos/%s?skip=" % user_id - - # Download first page to ensure it can be downloaded, and get user information if available. - firstpage = baseurl + "0" - firstjson = self._download_json(firstpage, user_id) - - title = None - videos = firstjson.get('videos') - if videos and len(videos): - title = videos[1].get('username') - return { - '_type': 'playlist', - 'entries': self._enum_videos(baseurl, user_id, firstjson), - 'id': user_id, - 'title': title, - } - - -class StoryFireSeriesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?storyfire\.com/write/series/stories/(?P<id>[^/\s]+)' +class StoryFireSeriesIE(StoryFireBaseIE): + _VALID_URL = StoryFireBaseIE._VALID_URL_BASE + r'write/series/stories/(?P<id>[^/?&#]+)' _TESTS = [{ 'url': 'https://storyfire.com/write/series/stories/-Lq6MsuIHLODO6d2dDkr/', 'info_dict': { 'id': '-Lq6MsuIHLODO6d2dDkr', }, - 'playlist_mincount': 13 + 'playlist_mincount': 13, }, { 'url': 'https://storyfire.com/write/series/stories/the_mortal_one/', 'info_dict': { 'id': 'the_mortal_one', }, - 'playlist_count': 0 # This playlist has entries, but no videos. - }, { - 'url': 'https://storyfire.com/write/series/stories/story_time', - 'info_dict': { - 'id': 'story_time', - }, - 'playlist_mincount': 10 + 'playlist_count': 0, }] - # Generator for returning playlist items - # This object is substantially different than the one in the user videos page above - def _enum_videos(self, jsonlist): - for video in jsonlist: - id = video['_id'] - if video.get('hasVideo'): # Boolean element - url = "https://storyfire.com/video-details/%s" % id - yield { - '_type': 'url', - 'id': id, - 'url': url, - 'ie_key': 'StoryFire', - - 'title': video.get('title'), - 'description': video.get('description'), - 'view_count': video.get('views'), - 'likes_count': video.get('likesCount'), - 'comment_count': video.get('commentsCount'), - 'duration': video.get('videoDuration'), - 'timestamp': video.get('publishDate'), - } + def _extract_videos(self, stories): + for story in stories.values(): + if story.get('hasVideo'): + yield self._parse_video(story) def _real_extract(self, url): - list_id = self._match_id(url) - - listurl = "https://storyfire.com/app/seriesStories/%s/list" % list_id - json = self._download_json(listurl, list_id) - - return { - '_type': 'playlist', - 'entries': self._enum_videos(json), - 'id': list_id - } + series_id = self._match_id(url) + stories = self._call_api( + 'seriesStories', series_id, 'series stories') + return self.playlist_result(self._extract_videos(stories), series_id) |