diff options
author | coletdjnz <colethedj@protonmail.com> | 2021-12-20 17:47:53 +1300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-12-20 04:47:53 +0000 |
commit | f3aa3c3f98e50f4f25d8744a97f642f5eb589ac9 (patch) | |
tree | c42cfa1bf295861d83ce6ad9f095f98c5005368d | |
parent | ae43a4b9868798097bb1420336294a2a2571be5f (diff) | |
download | hypervideo-pre-f3aa3c3f98e50f4f25d8744a97f642f5eb589ac9.tar.lz hypervideo-pre-f3aa3c3f98e50f4f25d8744a97f642f5eb589ac9.tar.xz hypervideo-pre-f3aa3c3f98e50f4f25d8744a97f642f5eb589ac9.zip |
[youtube:tab] Extract more metadata from feeds/channels/playlists (#1018)
Parse relative time text and extract live/upcoming status, availability and channel ID from feeds, channels and playlists (where applicable).
Closes #1883
Authored-by: coletdjnz
-rw-r--r-- | yt_dlp/extractor/youtube.py | 97 |
1 file changed, 57 insertions, 40 deletions
diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 20452bb70..5a3b98bb5 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -55,6 +55,7 @@ from ..utils import ( smuggle_url, str_or_none, str_to_int, + strftime_or_none, traverse_obj, try_get, unescapeHTML, @@ -358,7 +359,20 @@ class YoutubeBaseInfoExtractor(InfoExtractor): consent_id = random.randint(100, 999) self._set_cookie('.youtube.com', 'CONSENT', 'YES+cb.20210328-17-p0.en+FX+%s' % consent_id) + def _initialize_pref(self): + cookies = self._get_cookies('https://www.youtube.com/') + pref_cookie = cookies.get('PREF') + pref = {} + if pref_cookie: + try: + pref = dict(compat_urlparse.parse_qsl(pref_cookie.value)) + except ValueError: + self.report_warning('Failed to parse user PREF cookie' + bug_reports_message()) + pref.update({'hl': 'en'}) + self._set_cookie('.youtube.com', name='PREF', value=compat_urllib_parse_urlencode(pref)) + def _real_initialize(self): + self._initialize_pref() self._initialize_consent() self._login() @@ -391,23 +405,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): return self._ytcfg_get_safe(ytcfg, lambda x: x['INNERTUBE_API_KEY'], compat_str, default_client) def _extract_context(self, ytcfg=None, default_client='web'): - _get_context = lambda y: try_get(y, lambda x: x['INNERTUBE_CONTEXT'], dict) - context = _get_context(ytcfg) - if context: - return context - - context = _get_context(self._get_default_ytcfg(default_client)) - if not ytcfg: - return context - - # Recreate the client context (required) - context['client'].update({ - 'clientVersion': self._extract_client_version(ytcfg, default_client), - 'clientName': self._extract_client_name(ytcfg, default_client), - }) - visitor_data = try_get(ytcfg, lambda x: x['VISITOR_DATA'], compat_str) - if visitor_data: - context['client']['visitorData'] = visitor_data + context = get_first( + (ytcfg, self._get_default_ytcfg(default_client)), 'INNERTUBE_CONTEXT', expected_type=dict) + # 
Enforce language for extraction + traverse_obj(context, 'client', expected_type=dict, default={})['hl'] = 'en' return context _SAPISID = None @@ -664,6 +665,29 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if text: return text + @staticmethod + def extract_relative_time(relative_time_text): + """ + Extracts a relative time from string and converts to dt object + e.g. 'streamed 6 days ago', '5 seconds ago (edited)' + """ + mobj = re.search(r'(?P<time>\d+)\s*(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?\s*ago', relative_time_text) + if mobj: + try: + return datetime_from_str('now-%s%s' % (mobj.group('time'), mobj.group('unit')), precision='auto') + except ValueError: + return None + + def _extract_time_text(self, renderer, *path_list): + text = self._get_text(renderer, *path_list) or '' + dt = self.extract_relative_time(text) + timestamp = None + if isinstance(dt, datetime.datetime): + timestamp = calendar.timegm(dt.timetuple()) + if text and timestamp is None: + self.report_warning('Cannot parse localized time text' + bug_reports_message(), only_once=True) + return timestamp, text + def _extract_response(self, item_id, query, note='Downloading API JSON', headers=None, ytcfg=None, check_get_keys=None, ep='browse', fatal=True, api_hostname=None, default_client='web'): @@ -750,7 +774,13 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'view count', default=None)) uploader = self._get_text(renderer, 'ownerText', 'shortBylineText') - + channel_id = traverse_obj( + renderer, ('shortBylineText', 'runs', ..., 'navigationEndpoint', 'browseEndpoint', 'browseId'), expected_type=str, get_all=False) + timestamp, time_text = self._extract_time_text(renderer, 'publishedTimeText') + scheduled_timestamp = str_to_int(traverse_obj(renderer, ('upcomingEventData', 'startTime'), get_all=False)) + overlay_style = traverse_obj( + renderer, ('thumbnailOverlays', ..., 'thumbnailOverlayTimeStatusRenderer', 'style'), get_all=False, expected_type=str) + badges = 
self._extract_badges(renderer) return { '_type': 'url', 'ie_key': YoutubeIE.ie_key(), @@ -761,6 +791,14 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'duration': duration, 'view_count': view_count, 'uploader': uploader, + 'channel_id': channel_id, + 'upload_date': strftime_or_none(timestamp, '%Y%m%d'), + 'live_status': ('is_upcoming' if scheduled_timestamp is not None + else 'was_live' if 'streamed' in time_text.lower() + else 'is_live' if overlay_style is not None and overlay_style == 'LIVE' or 'live now' in badges + else None), + 'release_timestamp': scheduled_timestamp, + 'availability': self._availability(needs_premium='premium' in badges, needs_subscription='members only' in badges) } @@ -2064,19 +2102,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): (r'%s\s*%s' % (regex, self._YT_INITIAL_BOUNDARY_RE), regex), webpage, name, default='{}'), video_id, fatal=False) - @staticmethod - def parse_time_text(time_text): - """ - Parse the comment time text - time_text is in the format 'X units ago (edited)' - """ - time_text_split = time_text.split(' ') - if len(time_text_split) >= 3: - try: - return datetime_from_str('now-%s%s' % (time_text_split[0], time_text_split[1]), precision='auto') - except ValueError: - return None - def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') if not comment_id: @@ -2085,10 +2110,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): text = self._get_text(comment_renderer, 'contentText') # note: timestamp is an estimate calculated from the current time and time_text - time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' - time_text_dt = self.parse_time_text(time_text) - if isinstance(time_text_dt, datetime.datetime): - timestamp = calendar.timegm(time_text_dt.timetuple()) + timestamp, time_text = self._extract_time_text(comment_renderer, 'publishedTimeText') author = self._get_text(comment_renderer, 'authorText') author_id = try_get(comment_renderer, lambda x: 
x['authorEndpoint']['browseEndpoint']['browseId'], compat_str) @@ -2261,11 +2283,6 @@ class YoutubeIE(YoutubeBaseInfoExtractor): yield from self._comment_entries(renderer, ytcfg, video_id) max_comments = int_or_none(self._configuration_arg('max_comments', [''])[0]) - # Force English regardless of account setting to prevent parsing issues - # See: https://github.com/yt-dlp/yt-dlp/issues/532 - ytcfg = copy.deepcopy(ytcfg) - traverse_obj( - ytcfg, ('INNERTUBE_CONTEXT', 'client'), expected_type=dict, default={})['hl'] = 'en' return itertools.islice(_real_comment_extract(contents), 0, max_comments) @staticmethod |