diff options
| author | HobbyistDev <105957301+HobbyistDev@users.noreply.github.com> | 2023-04-13 03:21:57 +0900 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2023-04-12 23:51:57 +0530 | 
| commit | b093c38cc9f26b59a8504211d792f053142c847d (patch) | |
| tree | 798cb75b385ed75e8e036a02c8f65cf00848685c | |
| parent | 2d97d154fe4fb84fe2ed3a4e1ed5819e89b71e88 (diff) | |
| download | hypervideo-pre-b093c38cc9f26b59a8504211d792f053142c847d.tar.lz hypervideo-pre-b093c38cc9f26b59a8504211d792f053142c847d.tar.xz hypervideo-pre-b093c38cc9f26b59a8504211d792f053142c847d.zip | |
[extractor/biliIntl] Add comment extraction  (#6079)
Authored by: HobbyistDev
| -rw-r--r-- | yt_dlp/extractor/bilibili.py | 111 | 
1 files changed, 109 insertions, 2 deletions
| diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index c34439779..91d436dd8 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -26,6 +26,7 @@ from ..utils import (      srt_subtitles_timecode,      str_or_none,      traverse_obj, +    unified_timestamp,      unsmuggle_url,      url_or_none,      urlencode_postdata, @@ -996,6 +997,53 @@ class BiliIntlIE(BiliIntlBaseIE):              'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$',              'upload_date': '20221212',              'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation', +        }, +    }, { +        # episode comment extraction +        'url': 'https://www.bilibili.tv/en/play/34580/340317', +        'info_dict': { +            'id': '340317', +            'ext': 'mp4', +            'timestamp': 1604057820, +            'upload_date': '20201030', +            'episode_number': 5, +            'title': 'E5 - My Own Steel', +            'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2', +            'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$', +            'episode': 'Episode 5', +            'comment_count': int, +            'chapters': [{ +                'start_time': 0, +                'end_time': 61.0, +                'title': '<Untitled Chapter 1>' +            }, { +                'start_time': 61.0, +                'end_time': 134.0, +                'title': 'Intro' +            }, { +                'start_time': 1290.0, +                'end_time': 1379.0, +                'title': 'Outro' +            }], +        }, +        'params': { +            'getcomments': True +        } +    }, { +        # user generated content comment extraction +        'url': 'https://www.bilibili.tv/en/video/2045730385', +        'info_dict': { +            'id': '2045730385', +            'ext': 'mp4', +            'description': 'md5:693b6f3967fb4e7e7764ea817857c33a', +            'timestamp': 1667891924, +            'upload_date': '20221108', +            'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation', +            'comment_count': int, +            'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg', +        }, +        'params': { +            'getcomments': True          }      }, {          # episode id without intro and outro @@ -1055,11 +1103,69 @@ class BiliIntlIE(BiliIntlBaseIE):          # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found          return merge_dicts( -            self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id), { +            self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), {                  'title': self._html_search_meta('og:title', webpage),                  'description': self._html_search_meta('og:description', webpage)              }) +    def _get_comments_reply(self, root_id, next_id=0, display_id=None): +        comment_api_raw_data = self._download_json( +            'https://api.bilibili.tv/reply/web/detail', display_id, +            note=f'Downloading reply comment of {root_id} - {next_id}', +            query={ +                'platform': 'web', +                'ps': 20,  # comment's reply per page (default: 3) +                'root': root_id, +                'next': next_id, +            }) + +        for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): +            yield { +                'author': traverse_obj(replies, ('member', 'name')), +                'author_id': traverse_obj(replies, ('member', 'mid')), +                'author_thumbnail': traverse_obj(replies, ('member', 'face')), +                'text': traverse_obj(replies, ('content', 'message')), +                'id': replies.get('rpid'), +                'like_count': int_or_none(replies.get('like_count')), +                'parent': replies.get('parent'), +                'timestamp': unified_timestamp(replies.get('ctime_text')) +            } + +        if not traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): +            yield from self._get_comments_reply( +                root_id, comment_api_raw_data['data']['cursor']['next'], display_id) + +    def _get_comments(self, video_id, ep_id): +        for i in itertools.count(0): +            comment_api_raw_data = self._download_json( +                'https://api.bilibili.tv/reply/web/root', video_id, +                note=f'Downloading comment page {i + 1}', +                query={ +                    'platform': 'web', +                    'pn': i,  # page number +                    'ps': 20,  # comment per page (default: 20) +                    'oid': video_id, +                    'type': 3 if ep_id else 1,  # 1: user generated content, 3: series content +                    'sort_type': 1,  # 1: best, 2: recent +                }) + +            for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): +                yield { +                    'author': traverse_obj(replies, ('member', 'name')), +                    'author_id': traverse_obj(replies, ('member', 'mid')), +                    'author_thumbnail': traverse_obj(replies, ('member', 'face')), +                    'text': traverse_obj(replies, ('content', 'message')), +                    'id': replies.get('rpid'), +                    'like_count': int_or_none(replies.get('like_count')), +                    'timestamp': unified_timestamp(replies.get('ctime_text')), +                    'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))), +                } +                if replies.get('count'): +                    yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id) + +            if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): +                break +      def _real_extract(self, url):          season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')          video_id = ep_id or aid @@ -1087,7 +1193,8 @@ class BiliIntlIE(BiliIntlBaseIE):              **self._extract_video_metadata(url, video_id, season_id),              'formats': self._get_formats(ep_id=ep_id, aid=aid),              'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), -            'chapters': chapters +            'chapters': chapters, +            '__post_extractor': self.extract_comments(video_id, ep_id)          } | 
