diff options
author | coletdjnz <colethedj@protonmail.com> | 2022-01-07 11:03:02 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-07 16:33:02 +0530 |
commit | f0d785d3ed59e879a69f69f3c9334754f11747e0 (patch) | |
tree | 3c7b85fadccf75f3b3abb2fe531cafec61b7783d | |
parent | 97a6b117d934cbe2898d5d127f14dcd837678e76 (diff) | |
download | hypervideo-pre-f0d785d3ed59e879a69f69f3c9334754f11747e0.tar.lz hypervideo-pre-f0d785d3ed59e879a69f69f3c9334754f11747e0.tar.xz hypervideo-pre-f0d785d3ed59e879a69f69f3c9334754f11747e0.zip |
[youtube:tab] Extract more playlist metadata (#2069)
* Add fields modified_date, modified_timestamp
* Add field playlist_count
* [youtube:tab] Extract view_count, playlist_count, modified_date
Authored by: coletdjnz, pukkandan
-rw-r--r-- | README.md | 5 | ||||
-rw-r--r-- | yt_dlp/YoutubeDL.py | 17 | ||||
-rw-r--r-- | yt_dlp/extractor/common.py | 16 | ||||
-rw-r--r-- | yt_dlp/extractor/youtube.py | 53 |
4 files changed, 65 insertions, 26 deletions
@@ -1120,8 +1120,10 @@ The available fields are: - `creator` (string): The creator of the video - `timestamp` (numeric): UNIX timestamp of the moment the video became available - `upload_date` (string): Video upload date (YYYYMMDD) - - `release_date` (string): The date (YYYYMMDD) when the video was released - `release_timestamp` (numeric): UNIX timestamp of the moment the video was released + - `release_date` (string): The date (YYYYMMDD) when the video was released + - `modified_timestamp` (numeric): UNIX timestamp of the moment the video was last modified + - `modified_date` (string): The date (YYYYMMDD) when the video was last modified - `uploader_id` (string): Nickname or id of the video uploader - `channel` (string): Full name of the channel the video is uploaded on - `channel_id` (string): Id of the channel @@ -1167,6 +1169,7 @@ The available fields are: - `video_autonumber` (numeric): Number that will be increased with each video - `n_entries` (numeric): Total number of extracted items in the playlist - `playlist` (string): Name or id of the playlist that contains the video + - `playlist_count` (numeric): Total number of items in the playlist. May not be known if entire playlist is not extracted - `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according the final index - `playlist_autonumber` (numeric): Position of the video in the playlist download queue padded with leading zeros according to the total length of the playlist - `playlist_id` (string): Playlist identifier diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 463251789..dff4b17b3 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1636,14 +1636,15 @@ class YoutubeDL(object): playlistitems = orderedSet(iter_playlistitems(playlistitems_str)) ie_entries = ie_result['entries'] - msg = ( - 'Downloading %d videos' if not isinstance(ie_entries, list) - else 'Collected %d videos; downloading %%d of them' % len(ie_entries)) - if isinstance(ie_entries, list): + playlist_count = len(ie_result) + msg = f'Collected {playlist_count} videos; downloading %d of them' + ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count + def get_entry(i): return ie_entries[i - 1] else: + msg = 'Downloading %d videos' if not isinstance(ie_entries, (PagedList, LazyList)): ie_entries = LazyList(ie_entries) @@ -1652,7 +1653,7 @@ class YoutubeDL(object): lambda self, i: ie_entries[i - 1] )(self, i) - entries = [] + entries, broken = [], False items = playlistitems if playlistitems is not None else itertools.count(playliststart) for i in items: if i == 0: @@ -1674,6 +1675,7 @@ class YoutubeDL(object): if entry is not None: self._match_entry(entry, incomplete=True, silent=True) except (ExistingVideoReached, RejectedVideoReached): + broken = True break ie_result['entries'] = entries @@ -1684,6 +1686,9 @@ class YoutubeDL(object): if entry is not None] n_entries = len(entries) + if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend): + ie_result['playlist_count'] = n_entries + if not playlistitems and (playliststart != 1 or playlistend): playlistitems = list(range(playliststart, playliststart + n_entries)) ie_result['requested_entries'] = playlistitems @@ -1733,6 +1738,7 @@ class YoutubeDL(object): extra = { 'n_entries': n_entries, '_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries), + 'playlist_count': ie_result.get('playlist_count'), 'playlist_index': playlist_index, 'playlist_autonumber': i, 'playlist': playlist, @@ -2331,6 +2337,7 @@ class YoutubeDL(object): for ts_key, date_key in ( ('timestamp', 'upload_date'), ('release_timestamp', 'release_date'), + ('modified_timestamp', 'modified_date'), ): if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None: # Working around out-of-range timestamp values (e.g. negative ones on Windows, diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 79f53c9c2..7c83991ea 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -243,11 +243,16 @@ class InfoExtractor(object): uploader: Full name of the video uploader. license: License name the video is licensed under. creator: The creator of the video. - release_timestamp: UNIX timestamp of the moment the video was released. - release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video was uploaded upload_date: Video upload date (YYYYMMDD). - If not explicitly set, calculated from timestamp. + If not explicitly set, calculated from timestamp + release_timestamp: UNIX timestamp of the moment the video was released. + If it is not clear whether to use timestamp or this, use the former + release_date: The date (YYYYMMDD) when the video was released. + If not explicitly set, calculated from release_timestamp + modified_timestamp: UNIX timestamp of the moment the video was last modified. + modified_date: The date (YYYYMMDD) when the video was last modified. + If not explicitly set, calculated from modified_timestamp uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. channel: Full name of the channel the video is uploaded on. @@ -383,6 +388,11 @@ class InfoExtractor(object): Additionally, playlists can have "id", "title", and any other relevent attributes with the same semantics as videos (see above). + It can also have the following optional fields: + + playlist_count: The total number of videos in a playlist. If not given, + YoutubeDL tries to calculate it from "entries" + _type "multi_video" indicates that there are multiple videos that form a single show, for examples multiple acts of an opera or TV episode. diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 65d59802b..d266a36c6 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -62,6 +62,7 @@ from ..utils import ( try_get, unescapeHTML, unified_strdate, + unified_timestamp, unsmuggle_url, update_url_query, url_or_none, @@ -667,6 +668,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if text: return text + def _get_count(self, data, *path_list): + count_text = self._get_text(data, *path_list) or '' + count = parse_count(count_text) + if count is None: + count = str_to_int( + self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None)) + return count + @staticmethod def _extract_thumbnails(data, *path_list): """ @@ -695,12 +704,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def extract_relative_time(relative_time_text): """ Extracts a relative time from string and converts to dt object - e.g. 'streamed 6 days ago', '5 seconds ago (edited)' + e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today' """ - mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text) + mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text) if mobj: + start = mobj.group('start') + if start: + return datetime_from_str(start) try: - return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')), precision='auto') + return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit'))) except ValueError: return None @@ -710,6 +722,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): timestamp = None if isinstance(dt, datetime.datetime): timestamp = calendar.timegm(dt.timetuple()) + + if timestamp is None: + timestamp = ( + unified_timestamp(text) or unified_timestamp( + self._search_regex( + (r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*on)?\s*(.+\d)', r'\w+[\s,\.-]*\w+[\s,\.-]+20\d{2}'), text.lower(), 'time text', default=None))) + if text and timestamp is None: self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True) return timestamp, text @@ -794,10 +813,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): description = self._get_text(renderer, 'descriptionSnippet') duration = parse_duration(self._get_text( renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text'))) - view_count_text = self._get_text(renderer, 'viewCountText') or '' - view_count = str_to_int(self._search_regex( - r'^([\d,]+)', re.sub(r'\s', '', view_count_text), - 'view count', default=None)) + view_count = self._get_count(renderer, 'viewCountText') uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') channel_id = traverse_obj( @@ -2317,8 +2333,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): _continuation = None for content in contents: comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer') - expected_comment_count = parse_count(self._get_text( - comments_header_renderer, 'countText', 'commentsCount', max_runs=1)) + expected_comment_count = self._get_count( + comments_header_renderer, 'countText', 'commentsCount') if expected_comment_count: tracker['est_total'] = expected_comment_count @@ -3603,6 +3619,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): tags = [] selected_tab = self._extract_selected_tab(tabs) + primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer') renderer = try_get( data, lambda x: x['metadata']['channelMetadataRenderer'], dict) if renderer: @@ -3622,17 +3639,18 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): thumbnails = ( self._extract_thumbnails(renderer, 'avatar') or self._extract_thumbnails( - self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'), - ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail'))) + primary_sidebar_renderer, ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail'))) if playlist_id is None: playlist_id = item_id + + playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats') + last_updated_unix, _ = self._extract_time_text(playlist_stats, 2) if title is None: - title = ( - try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText']) - or playlist_id) + title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id title += format_field(selected_tab, 'title', ' - %s') title += format_field(selected_tab, 'expandedText', ' - %s') + metadata = { 'playlist_id': playlist_id, 'playlist_title': title, @@ -3642,10 +3660,11 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor): 'uploader_url': channel_url, 'thumbnails': thumbnails, 'tags': tags, + 'view_count': self._get_count(playlist_stats, 1), + 'availability': self._extract_availability(data), + 'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'), + 'playlist_count': self._get_count(playlist_stats, 0) } - availability = self._extract_availability(data) - if availability: - metadata['availability'] = availability if not channel_id: metadata.update(self._extract_uploader(data)) metadata.update({ |