about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: coletdjnz <colethedj@protonmail.com> 2022-01-07 11:03:02 +0000
committer: GitHub <noreply@github.com> 2022-01-07 16:33:02 +0530
commit: f0d785d3ed59e879a69f69f3c9334754f11747e0 (patch)
tree: 3c7b85fadccf75f3b3abb2fe531cafec61b7783d
parent: 97a6b117d934cbe2898d5d127f14dcd837678e76 (diff)
downloadhypervideo-pre-f0d785d3ed59e879a69f69f3c9334754f11747e0.tar.lz
hypervideo-pre-f0d785d3ed59e879a69f69f3c9334754f11747e0.tar.xz
hypervideo-pre-f0d785d3ed59e879a69f69f3c9334754f11747e0.zip
[youtube:tab] Extract more playlist metadata (#2069)
* Add fields modified_date, modified_timestamp
* Add field playlist_count
* [youtube:tab] Extract view_count, playlist_count, modified_date

Authored by: coletdjnz, pukkandan
-rw-r--r-- README.md 5
-rw-r--r-- yt_dlp/YoutubeDL.py 17
-rw-r--r-- yt_dlp/extractor/common.py 16
-rw-r--r-- yt_dlp/extractor/youtube.py 53
4 files changed, 65 insertions, 26 deletions
diff --git a/README.md b/README.md
index db559c83e..b40f5c693 100644
--- a/README.md
+++ b/README.md
@@ -1120,8 +1120,10 @@ The available fields are:
- `creator` (string): The creator of the video
- `timestamp` (numeric): UNIX timestamp of the moment the video became available
- `upload_date` (string): Video upload date (YYYYMMDD)
- - `release_date` (string): The date (YYYYMMDD) when the video was released
- `release_timestamp` (numeric): UNIX timestamp of the moment the video was released
+ - `release_date` (string): The date (YYYYMMDD) when the video was released
+ - `modified_timestamp` (numeric): UNIX timestamp of the moment the video was last modified
+ - `modified_date` (string): The date (YYYYMMDD) when the video was last modified
- `uploader_id` (string): Nickname or id of the video uploader
- `channel` (string): Full name of the channel the video is uploaded on
- `channel_id` (string): Id of the channel
@@ -1167,6 +1169,7 @@ The available fields are:
- `video_autonumber` (numeric): Number that will be increased with each video
- `n_entries` (numeric): Total number of extracted items in the playlist
- `playlist` (string): Name or id of the playlist that contains the video
+ - `playlist_count` (numeric): Total number of items in the playlist. May not be known if the entire playlist is not extracted
- `playlist_index` (numeric): Index of the video in the playlist padded with leading zeros according to the final index
- `playlist_autonumber` (numeric): Position of the video in the playlist download queue padded with leading zeros according to the total length of the playlist
- `playlist_id` (string): Playlist identifier
diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py
index 463251789..dff4b17b3 100644
--- a/yt_dlp/YoutubeDL.py
+++ b/yt_dlp/YoutubeDL.py
@@ -1636,14 +1636,15 @@ class YoutubeDL(object):
playlistitems = orderedSet(iter_playlistitems(playlistitems_str))
ie_entries = ie_result['entries']
- msg = (
- 'Downloading %d videos' if not isinstance(ie_entries, list)
- else 'Collected %d videos; downloading %%d of them' % len(ie_entries))
-
if isinstance(ie_entries, list):
+ playlist_count = len(ie_entries)  # count the collected entries; len(ie_result) would count the result dict's keys
+ msg = f'Collected {playlist_count} videos; downloading %d of them'
+ ie_result['playlist_count'] = ie_result.get('playlist_count') or playlist_count  # keep a truthy count already present in ie_result
+
def get_entry(i):
return ie_entries[i - 1]
else:
+ msg = 'Downloading %d videos'
if not isinstance(ie_entries, (PagedList, LazyList)):
ie_entries = LazyList(ie_entries)
@@ -1652,7 +1653,7 @@ class YoutubeDL(object):
lambda self, i: ie_entries[i - 1]
)(self, i)
- entries = []
+ entries, broken = [], False
items = playlistitems if playlistitems is not None else itertools.count(playliststart)
for i in items:
if i == 0:
@@ -1674,6 +1675,7 @@ class YoutubeDL(object):
if entry is not None:
self._match_entry(entry, incomplete=True, silent=True)
except (ExistingVideoReached, RejectedVideoReached):
+ broken = True
break
ie_result['entries'] = entries
@@ -1684,6 +1686,9 @@ class YoutubeDL(object):
if entry is not None]
n_entries = len(entries)
+ if not (ie_result.get('playlist_count') or broken or playlistitems or playlistend):
+ ie_result['playlist_count'] = n_entries
+
if not playlistitems and (playliststart != 1 or playlistend):
playlistitems = list(range(playliststart, playliststart + n_entries))
ie_result['requested_entries'] = playlistitems
@@ -1733,6 +1738,7 @@ class YoutubeDL(object):
extra = {
'n_entries': n_entries,
'_last_playlist_index': max(playlistitems) if playlistitems else (playlistend or n_entries),
+ 'playlist_count': ie_result.get('playlist_count'),
'playlist_index': playlist_index,
'playlist_autonumber': i,
'playlist': playlist,
@@ -2331,6 +2337,7 @@ class YoutubeDL(object):
for ts_key, date_key in (
('timestamp', 'upload_date'),
('release_timestamp', 'release_date'),
+ ('modified_timestamp', 'modified_date'),
):
if info_dict.get(date_key) is None and info_dict.get(ts_key) is not None:
# Working around out-of-range timestamp values (e.g. negative ones on Windows,
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py
index 79f53c9c2..7c83991ea 100644
--- a/yt_dlp/extractor/common.py
+++ b/yt_dlp/extractor/common.py
@@ -243,11 +243,16 @@ class InfoExtractor(object):
uploader: Full name of the video uploader.
license: License name the video is licensed under.
creator: The creator of the video.
- release_timestamp: UNIX timestamp of the moment the video was released.
- release_date: The date (YYYYMMDD) when the video was released.
timestamp: UNIX timestamp of the moment the video was uploaded
upload_date: Video upload date (YYYYMMDD).
- If not explicitly set, calculated from timestamp.
+ If not explicitly set, calculated from timestamp
+ release_timestamp: UNIX timestamp of the moment the video was released.
+ If it is not clear whether to use timestamp or this, use the former
+ release_date: The date (YYYYMMDD) when the video was released.
+ If not explicitly set, calculated from release_timestamp
+ modified_timestamp: UNIX timestamp of the moment the video was last modified.
+ modified_date: The date (YYYYMMDD) when the video was last modified.
+ If not explicitly set, calculated from modified_timestamp
uploader_id: Nickname or id of the video uploader.
uploader_url: Full URL to a personal webpage of the video uploader.
channel: Full name of the channel the video is uploaded on.
@@ -383,6 +388,11 @@ class InfoExtractor(object):
Additionally, playlists can have "id", "title", and any other relevant
attributes with the same semantics as videos (see above).
+ It can also have the following optional fields:
+
+ playlist_count: The total number of videos in a playlist. If not given,
+ YoutubeDL tries to calculate it from "entries"
+
_type "multi_video" indicates that there are multiple videos that
form a single show, for example multiple acts of an opera or TV episode.
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index 65d59802b..d266a36c6 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -62,6 +62,7 @@ from ..utils import (
try_get,
unescapeHTML,
unified_strdate,
+ unified_timestamp,
unsmuggle_url,
update_url_query,
url_or_none,
@@ -667,6 +668,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
if text:
return text
+ def _get_count(self, data, *path_list):  # Return a numeric count parsed from the text that _get_text finds in data via path_list
+ count_text = self._get_text(data, *path_list) or ''  # fall back to '' so the parsing below always receives a string
+ count = parse_count(count_text)  # first attempt: parse_count on the text as extracted
+ if count is None:
+ count = str_to_int(  # fallback: convert the leading run of digits/commas, matched after all whitespace is removed
+ self._search_regex(r'^([\d,]+)', re.sub(r'\s', '', count_text), 'count', default=None))
+ return count  # NOTE(review): presumably None when neither attempt yields a number - confirm str_to_int(None) behaviour
+
@staticmethod
def _extract_thumbnails(data, *path_list):
"""
@@ -695,12 +704,15 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
def extract_relative_time(relative_time_text):
"""
Extracts a relative time from string and converts to dt object
- e.g. 'streamed 6 days ago', '5 seconds ago (edited)'
+ e.g. 'streamed 6 days ago', '5 seconds ago (edited)', 'updated today'
"""
- mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
+ mobj = re.search(r'(?P<start>today|yesterday|now)|(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text)
if mobj:
+ start = mobj.group('start')
+ if start:
+ return datetime_from_str(start)
try:
- return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')), precision='auto')
+ return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')))
except ValueError:
return None
@@ -710,6 +722,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
timestamp = None
if isinstance(dt, datetime.datetime):
timestamp = calendar.timegm(dt.timetuple())
+
+ if timestamp is None:
+ timestamp = (
+ unified_timestamp(text) or unified_timestamp(
+ self._search_regex(
+ (r'(?:.+|^)(?:live|premieres|ed|ing)(?:\s*on)?\s*(.+\d)', r'\w+[\s,\.-]*\w+[\s,\.-]+20\d{2}'), text.lower(), 'time text', default=None)))
+
if text and timestamp is None:
self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True)
return timestamp, text
@@ -794,10 +813,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
description = self._get_text(renderer, 'descriptionSnippet')
duration = parse_duration(self._get_text(
renderer, 'lengthText', ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'text')))
- view_count_text = self._get_text(renderer, 'viewCountText') or ''
- view_count = str_to_int(self._search_regex(
- r'^([\d,]+)', re.sub(r'\s', '', view_count_text),
- 'view count', default=None))
+ view_count = self._get_count(renderer, 'viewCountText')
uploader = self._get_text(renderer, 'ownerText', 'shortBylineText')
channel_id = traverse_obj(
@@ -2317,8 +2333,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
_continuation = None
for content in contents:
comments_header_renderer = traverse_obj(content, 'commentsHeaderRenderer')
- expected_comment_count = parse_count(self._get_text(
- comments_header_renderer, 'countText', 'commentsCount', max_runs=1))
+ expected_comment_count = self._get_count(
+ comments_header_renderer, 'countText', 'commentsCount')
if expected_comment_count:
tracker['est_total'] = expected_comment_count
@@ -3603,6 +3619,7 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
tags = []
selected_tab = self._extract_selected_tab(tabs)
+ primary_sidebar_renderer = self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer')
renderer = try_get(
data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
if renderer:
@@ -3622,17 +3639,18 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
thumbnails = (
self._extract_thumbnails(renderer, 'avatar')
or self._extract_thumbnails(
- self._extract_sidebar_info_renderer(data, 'playlistSidebarPrimaryInfoRenderer'),
- ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')))
+ primary_sidebar_renderer, ('thumbnailRenderer', 'playlistVideoThumbnailRenderer', 'thumbnail')))
if playlist_id is None:
playlist_id = item_id
+
+ playlist_stats = traverse_obj(primary_sidebar_renderer, 'stats')
+ last_updated_unix, _ = self._extract_time_text(playlist_stats, 2)
if title is None:
- title = (
- try_get(data, lambda x: x['header']['hashtagHeaderRenderer']['hashtag']['simpleText'])
- or playlist_id)
+ title = self._get_text(data, ('header', 'hashtagHeaderRenderer', 'hashtag')) or playlist_id
title += format_field(selected_tab, 'title', ' - %s')
title += format_field(selected_tab, 'expandedText', ' - %s')
+
metadata = {
'playlist_id': playlist_id,
'playlist_title': title,
@@ -3642,10 +3660,11 @@ class YoutubeTabBaseInfoExtractor(YoutubeBaseInfoExtractor):
'uploader_url': channel_url,
'thumbnails': thumbnails,
'tags': tags,
+ 'view_count': self._get_count(playlist_stats, 1),
+ 'availability': self._extract_availability(data),
+ 'modified_date': strftime_or_none(last_updated_unix, '%Y%m%d'),
+ 'playlist_count': self._get_count(playlist_stats, 0)
}
- availability = self._extract_availability(data)
- if availability:
- metadata['availability'] = availability
if not channel_id:
metadata.update(self._extract_uploader(data))
metadata.update({