diff options
author | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-06-03 21:29:03 +0530 |
---|---|---|
committer | pukkandan <pukkandan.ytdlp@gmail.com> | 2022-06-03 21:45:35 +0530 |
commit | 1890fc6389393ffaa05fa27bd47717f4d862404f (patch) | |
tree | eccbba9c05ba3032d217948ba93e529ca764f7af /yt_dlp/extractor | |
parent | c4910024f3dbb9798554f02d935d0b0604f51182 (diff) | |
download | hypervideo-pre-1890fc6389393ffaa05fa27bd47717f4d862404f.tar.lz hypervideo-pre-1890fc6389393ffaa05fa27bd47717f4d862404f.tar.xz hypervideo-pre-1890fc6389393ffaa05fa27bd47717f4d862404f.zip |
[cleanup] Misc fixes
Cherry-picks from: #3498, #3947
Related: #3949, https://github.com/yt-dlp/yt-dlp/issues/1839#issuecomment-1140313836
Authored by: pukkandan, flashdagger, gamer191
Diffstat (limited to 'yt_dlp/extractor')
-rw-r--r-- | yt_dlp/extractor/common.py | 5 | ||||
-rw-r--r-- | yt_dlp/extractor/youtube.py | 93 |
2 files changed, 48 insertions, 50 deletions
diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index c1a160e82..2e62660c7 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -786,7 +786,8 @@ class InfoExtractor: self.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, + encoding=None, data=None, headers={}, query={}, expected_status=None): """ Return a tuple (page content as string, URL handle). @@ -943,7 +944,7 @@ class InfoExtractor: except ValueError: raise e except ValueError as ve: - errmsg = '%s: Failed to parse JSON ' % video_id + errmsg = f'{video_id}: Failed to parse JSON' if fatal: raise ExtractorError(errmsg, cause=ve) else: diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index c9bdd309d..8b2332dc1 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -15,7 +15,7 @@ import time import traceback from .common import InfoExtractor, SearchInfoExtractor -from ..compat import functools +from ..compat import functools # isort: split from ..compat import ( compat_chr, compat_HTTPError, @@ -483,6 +483,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if data: return self._parse_json(data, item_id, fatal=fatal) + def _extract_yt_initial_variable(self, webpage, regex, video_id, name): + return self._parse_json(self._search_regex( + (fr'{regex}\s*{self._YT_INITIAL_BOUNDARY_RE}', + regex), webpage, name, default='{}'), video_id, fatal=False, lenient=True) + @staticmethod def _extract_session_index(*data): """ @@ -2733,54 +2738,38 @@ class YoutubeIE(YoutubeBaseInfoExtractor): chapter_time = lambda chapter: parse_duration(self._get_text(chapter, 'timeDescription')) chapter_title = lambda chapter: self._get_text(chapter, 'title') - return next(( - filter(None, ( - self._extract_chapters( - traverse_obj(contents, (..., 'macroMarkersListItemRenderer')), - chapter_time, chapter_title, duration) - for contents in content_list - ))), []) + return next(filter(None, ( + self._extract_chapters(traverse_obj(contents, (..., 'macroMarkersListItemRenderer')), + chapter_time, chapter_title, duration) + for contents in content_list)), []) - @staticmethod - def _extract_chapters_from_description(description, duration): - chapters = [{'start_time': 0}] - for timestamp, title in re.findall( - r'(?m)^((?:\d+:)?\d{1,2}:\d{2})\b\W*\s(.+?)\s*$', description or ''): - start = parse_duration(timestamp) - if start and title and chapters[-1]['start_time'] < start < duration: - chapters[-1]['end_time'] = start - chapters.append({ - 'start_time': start, - 'title': title, - }) - chapters[-1]['end_time'] = duration - return chapters[1:] - - def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration): - chapters = [] - last_chapter = {'start_time': 0} - for idx, chapter in enumerate(chapter_list or []): - title = chapter_title(chapter) - start_time = chapter_time(chapter) - if start_time is None: - continue - last_chapter['end_time'] = start_time - if start_time < last_chapter['start_time']: - if idx == 1: - chapters.pop() - self.report_warning('Invalid start time for chapter "%s"' % last_chapter['title']) - else: - self.report_warning(f'Invalid start time for chapter "{title}"') - continue - last_chapter = {'start_time': start_time, 'title': title} - chapters.append(last_chapter) - last_chapter['end_time'] = duration - return chapters + def _extract_chapters_from_description(self, description, duration): + return self._extract_chapters( + re.findall(r'(?m)^((?:\d+:)?\d{1,2}:\d{2})\b\W*\s(.+?)\s*$', description or ''), + chapter_time=lambda x: parse_duration(x[0]), chapter_title=lambda x: x[1], + duration=duration, strict=False) - def _extract_yt_initial_variable(self, webpage, regex, video_id, name): - return self._parse_json(self._search_regex( - (fr'{regex}\s*{self._YT_INITIAL_BOUNDARY_RE}', - regex), webpage, name, default='{}'), video_id, fatal=False, lenient=True) + def _extract_chapters(self, chapter_list, chapter_time, chapter_title, duration, strict=True): + if not duration: + return + chapter_list = [{ + 'start_time': chapter_time(chapter), + 'title': chapter_title(chapter), + } for chapter in chapter_list or []] + if not strict: + chapter_list.sort(key=lambda c: c['start_time'] or 0) + + chapters = [{'start_time': 0, 'title': '<Untitled>'}] + for idx, chapter in enumerate(chapter_list): + if chapter['start_time'] is None or not chapter['title']: + self.report_warning(f'Incomplete chapter {idx}') + elif chapters[-1]['start_time'] <= chapter['start_time'] <= duration: + chapters[-1]['end_time'] = chapter['start_time'] + chapters.append(chapter) + else: + self.report_warning(f'Invalid start time for chapter "{chapter["title"]}"') + chapters[-1]['end_time'] = duration + return chapters if len(chapters) > 1 and chapters[1]['start_time'] else chapters[1:] def _extract_comment(self, comment_renderer, parent=None): comment_id = comment_renderer.get('commentId') @@ -3663,7 +3652,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Youtube Music Auto-generated description if video_description: - mobj = re.search(r'(?s)(?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?.+\nAuto-generated by YouTube\.\s*$', video_description) + mobj = re.search( + r'''(?xs) + (?P<track>[^·\n]+)·(?P<artist>[^\n]+)\n+ + (?P<album>[^\n]+) + (?:.+?℗\s*(?P<release_year>\d{4})(?!\d))? + (?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))? + (.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))? + .+\nAuto-generated\ by\ YouTube\.\s*$ + ''', video_description) if mobj: release_year = mobj.group('release_year') release_date = mobj.group('release_date') |