diff options
Diffstat (limited to 'hypervideo_dl/extractor/bilibili.py')
-rw-r--r-- | hypervideo_dl/extractor/bilibili.py | 571 |
1 files changed, 470 insertions, 101 deletions
diff --git a/hypervideo_dl/extractor/bilibili.py b/hypervideo_dl/extractor/bilibili.py index bc04241..cb7ab2a 100644 --- a/hypervideo_dl/extractor/bilibili.py +++ b/hypervideo_dl/extractor/bilibili.py @@ -1,11 +1,14 @@ import base64 import functools +import hashlib import itertools import math -import urllib.error +import time import urllib.parse from .common import InfoExtractor, SearchInfoExtractor +from ..dependencies import Cryptodome +from ..networking.exceptions import HTTPError from ..utils import ( ExtractorError, GeoRestrictedError, @@ -15,14 +18,20 @@ from ..utils import ( float_or_none, format_field, int_or_none, + join_nonempty, make_archive_id, + merge_dicts, mimetype2ext, parse_count, parse_qs, qualities, + smuggle_url, srt_subtitles_timecode, str_or_none, traverse_obj, + try_call, + unified_timestamp, + unsmuggle_url, url_or_none, urlencode_postdata, ) @@ -77,7 +86,7 @@ class BilibiliBaseIE(InfoExtractor): f'{line["content"]}\n\n') return srt_data - def _get_subtitles(self, video_id, initial_state, cid): + def _get_subtitles(self, video_id, aid, cid): subtitles = { 'danmaku': [{ 'ext': 'xml', @@ -85,7 +94,8 @@ class BilibiliBaseIE(InfoExtractor): }] } - for s in traverse_obj(initial_state, ('videoData', 'subtitle', 'list')) or []: + video_info_json = self._download_json(f'https://api.bilibili.com/x/player/v2?aid={aid}&cid={cid}', video_id) + for s in traverse_obj(video_info_json, ('data', 'subtitle', 'subtitles', ...)): subtitles.setdefault(s['lan'], []).append({ 'ext': 'srt', 'data': self.json2srt(self._download_json(s['subtitle_url'], video_id)) @@ -126,9 +136,20 @@ class BilibiliBaseIE(InfoExtractor): for children in map(self._get_all_children, traverse_obj(reply, ('replies', ...))): yield from children + def _get_episodes_from_season(self, ss_id, url): + season_info = self._download_json( + 'https://api.bilibili.com/pgc/web/season/section', ss_id, + note='Downloading season info', query={'season_id': ss_id}, + headers={'Referer': url, **self.geo_verification_headers()}) + + for entry in traverse_obj(season_info, ( + 'result', 'main_section', 'episodes', + lambda _, v: url_or_none(v['share_url']) and v['id'])): + yield self.url_result(entry['share_url'], BiliBiliBangumiIE, f'ep{entry["id"]}') + class BiliBiliIE(BilibiliBaseIE): - _VALID_URL = r'https?://www\.bilibili\.com/video/[aAbB][vV](?P<id>[^/?#&]+)' + _VALID_URL = r'https?://www\.bilibili\.com/(?:video/|festival/\w+\?(?:[^#]*&)?bvid=)[aAbB][vV](?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://www.bilibili.com/video/BV13x41117TL', @@ -276,19 +297,60 @@ class BiliBiliIE(BilibiliBaseIE): 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', }, 'params': {'skip_download': True}, + }, { + 'note': 'video redirects to festival page', + 'url': 'https://www.bilibili.com/video/BV1wP4y1P72h', + 'info_dict': { + 'id': 'BV1wP4y1P72h', + 'ext': 'mp4', + 'title': '牛虎年相交之际,一首传统民族打击乐《牛斗虎》祝大家新春快乐,虎年大吉!【bilibili音乐虎闹新春】', + 'timestamp': 1643947497, + 'upload_date': '20220204', + 'description': 'md5:8681a0d4d2c06b4ae27e59c8080a7fe6', + 'uploader': '叨叨冯聊音乐', + 'duration': 246.719, + 'uploader_id': '528182630', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'params': {'skip_download': True}, + }, { + 'note': 'newer festival video', + 'url': 'https://www.bilibili.com/festival/2023honkaiimpact3gala?bvid=BV1ay4y1d77f', + 'info_dict': { + 'id': 'BV1ay4y1d77f', + 'ext': 'mp4', + 'title': '【崩坏3新春剧场】为特别的你送上祝福!', + 'timestamp': 1674273600, + 'upload_date': '20230121', + 'description': 'md5:58af66d15c6a0122dc30c8adfd828dd8', + 'uploader': '果蝇轰', + 'duration': 1111.722, + 'uploader_id': '8469526', + 'view_count': int, + 'like_count': int, + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + }, + 'params': {'skip_download': True}, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] - video_data = initial_state['videoData'] + is_festival = 'videoData' not in initial_state + if is_festival: + video_data = initial_state['videoInfo'] + else: + play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + video_data = initial_state['videoData'] + video_id, title = video_data['bvid'], video_data.get('title') # Bilibili anthologies are similar to playlists but all videos share the same video ID as the anthology itself. - page_list_json = traverse_obj( + page_list_json = not is_festival and traverse_obj( self._download_json( 'https://api.bilibili.com/x/player/pagelist', video_id, fatal=False, query={'bvid': video_id, 'jsonp': 'jsonp'}, @@ -303,106 +365,143 @@ class BiliBiliIE(BilibiliBaseIE): getter=lambda entry: f'https://www.bilibili.com/video/{video_id}?p={entry["page"]}') if is_anthology: - title += f' p{part_id:02d} {traverse_obj(page_list_json, ((part_id or 1) - 1, "part")) or ""}' + part_id = part_id or 1 + title += f' p{part_id:02d} {traverse_obj(page_list_json, (part_id - 1, "part")) or ""}' aid = video_data.get('aid') old_video_id = format_field(aid, None, f'%s_part{part_id or 1}') cid = traverse_obj(video_data, ('pages', part_id - 1, 'cid')) if part_id else video_data.get('cid') + festival_info = {} + if is_festival: + play_info = self._download_json( + 'https://api.bilibili.com/x/player/playurl', video_id, + query={'bvid': video_id, 'cid': cid, 'fnval': 4048}, + note='Extracting festival video formats')['data'] + + festival_info = traverse_obj(initial_state, { + 'uploader': ('videoInfo', 'upName'), + 'uploader_id': ('videoInfo', 'upMid', {str_or_none}), + 'like_count': ('videoStatus', 'like', {int_or_none}), + 'thumbnail': ('sectionEpisodes', lambda _, v: v['bvid'] == video_id, 'cover'), + }, get_all=False) + return { + **traverse_obj(initial_state, { + 'uploader': ('upData', 'name'), + 'uploader_id': ('upData', 'mid', {str_or_none}), + 'like_count': ('videoData', 'stat', 'like', {int_or_none}), + 'tags': ('tags', ..., 'tag_name'), + 'thumbnail': ('videoData', 'pic', {url_or_none}), + }), + **festival_info, + **traverse_obj(video_data, { + 'description': 'desc', + 'timestamp': ('pubdate', {int_or_none}), + 'view_count': (('viewCount', ('stat', 'view')), {int_or_none}), + 'comment_count': ('stat', 'reply', {int_or_none}), + }, get_all=False), 'id': f'{video_id}{format_field(part_id, None, "_p%d")}', 'formats': self.extract_formats(play_info), '_old_archive_ids': [make_archive_id(self, old_video_id)] if old_video_id else None, 'title': title, - 'description': traverse_obj(initial_state, ('videoData', 'desc')), - 'view_count': traverse_obj(initial_state, ('videoData', 'stat', 'view')), - 'uploader': traverse_obj(initial_state, ('upData', 'name')), - 'uploader_id': traverse_obj(initial_state, ('upData', 'mid')), - 'like_count': traverse_obj(initial_state, ('videoData', 'stat', 'like')), - 'comment_count': traverse_obj(initial_state, ('videoData', 'stat', 'reply')), - 'tags': traverse_obj(initial_state, ('tags', ..., 'tag_name')), - 'thumbnail': traverse_obj(initial_state, ('videoData', 'pic')), - 'timestamp': traverse_obj(initial_state, ('videoData', 'pubdate')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), 'chapters': self._get_chapters(aid, cid), - 'subtitles': self.extract_subtitles(video_id, initial_state, cid), + 'subtitles': self.extract_subtitles(video_id, aid, cid), '__post_extractor': self.extract_comments(aid), 'http_headers': {'Referer': url}, } class BiliBiliBangumiIE(BilibiliBaseIE): - _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/(?P<id>(?:ss|ep)\d+)' + _VALID_URL = r'https?://(?:www\.)?bilibili\.com/bangumi/play/(?P<id>ep\d+)' _TESTS = [{ - 'url': 'https://www.bilibili.com/bangumi/play/ss897', + 'url': 'https://www.bilibili.com/bangumi/play/ep267851', 'info_dict': { - 'id': 'ss897', + 'id': '267851', 'ext': 'mp4', - 'series': '神的记事本', - 'season': '神的记事本', - 'season_id': 897, + 'series': '鬼灭之刃', + 'series_id': '4358', + 'season': '鬼灭之刃', + 'season_id': '26801', 'season_number': 1, - 'episode': '你与旅行包', - 'episode_number': 2, - 'title': '神的记事本:第2话 你与旅行包', - 'duration': 1428.487, - 'timestamp': 1310809380, - 'upload_date': '20110716', - 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$', + 'episode': '残酷', + 'episode_id': '267851', + 'episode_number': 1, + 'title': '1 残酷', + 'duration': 1425.256, + 'timestamp': 1554566400, + 'upload_date': '20190406', + 'thumbnail': r're:^https?://.*\.(jpg|jpeg|png)$' }, - }, { - 'url': 'https://www.bilibili.com/bangumi/play/ep508406', - 'only_matching': True, + 'skip': 'According to the copyright owner\'s request, you may only watch the video after you are premium member.' }] def _real_extract(self, url): video_id = self._match_id(url) + episode_id = video_id[2:] webpage = self._download_webpage(url, video_id) if '您所在的地区无法观看本片' in webpage: raise GeoRestrictedError('This video is restricted') - elif ('开通大会员观看' in webpage and '__playinfo__' not in webpage - or '正在观看预览,大会员免费看全片' in webpage): + elif '正在观看预览,大会员免费看全片' in webpage: self.raise_login_required('This video is for premium members only') - play_info = self._search_json(r'window\.__playinfo__\s*=', webpage, 'play info', video_id)['data'] + headers = {'Referer': url, **self.geo_verification_headers()} + play_info = self._download_json( + 'https://api.bilibili.com/pgc/player/web/v2/playurl', video_id, + 'Extracting episode', query={'fnval': '4048', 'ep_id': episode_id}, + headers=headers) + premium_only = play_info.get('code') == -10403 + play_info = traverse_obj(play_info, ('result', 'video_info', {dict})) or {} + formats = self.extract_formats(play_info) - if (not formats and '成为大会员抢先看' in webpage - and play_info.get('durl') and not play_info.get('dash')): + if not formats and (premium_only or '成为大会员抢先看' in webpage or '开通大会员观看' in webpage): self.raise_login_required('This video is for premium members only') - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial state', video_id) + bangumi_info = self._download_json( + 'https://api.bilibili.com/pgc/view/web/season', video_id, 'Get episode details', + query={'ep_id': episode_id}, headers=headers)['result'] - season_id = traverse_obj(initial_state, ('mediaInfo', 'season_id')) + episode_number, episode_info = next(( + (idx, ep) for idx, ep in enumerate(traverse_obj( + bangumi_info, ('episodes', ..., {dict})), 1) + if str_or_none(ep.get('id')) == episode_id), (1, {})) + + season_id = bangumi_info.get('season_id') season_number = season_id and next(( idx + 1 for idx, e in enumerate( - traverse_obj(initial_state, ('mediaInfo', 'seasons', ...))) + traverse_obj(bangumi_info, ('seasons', ...))) if e.get('season_id') == season_id ), None) + aid = episode_info.get('aid') + return { 'id': video_id, 'formats': formats, - 'title': traverse_obj(initial_state, 'h1Title'), - 'episode': traverse_obj(initial_state, ('epInfo', 'long_title')), - 'episode_number': int_or_none(traverse_obj(initial_state, ('epInfo', 'title'))), - 'series': traverse_obj(initial_state, ('mediaInfo', 'series')), - 'season': traverse_obj(initial_state, ('mediaInfo', 'season_title')), - 'season_id': season_id, + **traverse_obj(bangumi_info, { + 'series': ('series', 'series_title', {str}), + 'series_id': ('series', 'series_id', {str_or_none}), + 'thumbnail': ('square_cover', {url_or_none}), + }), + 'title': join_nonempty('title', 'long_title', delim=' ', from_dict=episode_info), + 'episode': episode_info.get('long_title'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode_info.get('title')) or episode_number, + 'season_id': str_or_none(season_id), 'season_number': season_number, - 'thumbnail': traverse_obj(initial_state, ('epInfo', 'cover')), - 'timestamp': traverse_obj(initial_state, ('epInfo', 'pub_time')), + 'timestamp': int_or_none(episode_info.get('pub_time')), 'duration': float_or_none(play_info.get('timelength'), scale=1000), - 'subtitles': self.extract_subtitles( - video_id, initial_state, traverse_obj(initial_state, ('epInfo', 'cid'))), - '__post_extractor': self.extract_comments(traverse_obj(initial_state, ('epInfo', 'aid'))), - 'http_headers': {'Referer': url, **self.geo_verification_headers()}, + 'subtitles': self.extract_subtitles(video_id, aid, episode_info.get('cid')), + '__post_extractor': self.extract_comments(aid), + 'http_headers': headers, } -class BiliBiliBangumiMediaIE(InfoExtractor): +class BiliBiliBangumiMediaIE(BilibiliBaseIE): _VALID_URL = r'https?://www\.bilibili\.com/bangumi/media/md(?P<id>\d+)' _TESTS = [{ 'url': 'https://www.bilibili.com/bangumi/media/md24097891', @@ -415,16 +514,26 @@ class BiliBiliBangumiMediaIE(InfoExtractor): def _real_extract(self, url): media_id = self._match_id(url) webpage = self._download_webpage(url, media_id) + ss_id = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id)['mediaInfo']['season_id'] + + return self.playlist_result(self._get_episodes_from_season(ss_id, url), media_id) + + +class BiliBiliBangumiSeasonIE(BilibiliBaseIE): + _VALID_URL = r'(?x)https?://www\.bilibili\.com/bangumi/play/ss(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.bilibili.com/bangumi/play/ss26801', + 'info_dict': { + 'id': '26801' + }, + 'playlist_mincount': 26 + }] - initial_state = self._search_json(r'window\.__INITIAL_STATE__\s*=', webpage, 'initial_state', media_id) - episode_list = self._download_json( - 'https://api.bilibili.com/pgc/web/season/section', media_id, - query={'season_id': initial_state['mediaInfo']['season_id']}, - note='Downloading season info')['result']['main_section']['episodes'] + def _real_extract(self, url): + ss_id = self._match_id(url) - return self.playlist_result(( - self.url_result(entry['share_url'], BiliBiliBangumiIE, entry['aid']) - for entry in episode_list), media_id) + return self.playlist_result(self._get_episodes_from_season(ss_id, url), ss_id) class BilibiliSpaceBaseIE(InfoExtractor): @@ -447,21 +556,65 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): 'id': '3985676', }, 'playlist_mincount': 178, + }, { + 'url': 'https://space.bilibili.com/313580179/video', + 'info_dict': { + 'id': '313580179', + }, + 'playlist_mincount': 92, }] + def _extract_signature(self, playlist_id): + session_data = self._download_json('https://api.bilibili.com/x/web-interface/nav', playlist_id, fatal=False) + + key_from_url = lambda x: x[x.rfind('/') + 1:].split('.')[0] + img_key = traverse_obj( + session_data, ('data', 'wbi_img', 'img_url', {key_from_url})) or '34478ba821254d9d93542680e3b86100' + sub_key = traverse_obj( + session_data, ('data', 'wbi_img', 'sub_url', {key_from_url})) or '7e16a90d190a4355a78fd00b32a38de6' + + session_key = img_key + sub_key + + signature_values = [] + for position in ( + 46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, + 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, + 57, 62, 11, 36, 20, 34, 44, 52 + ): + char_at_position = try_call(lambda: session_key[position]) + if char_at_position: + signature_values.append(char_at_position) + + return ''.join(signature_values)[:32] + def _real_extract(self, url): playlist_id, is_video_url = self._match_valid_url(url).group('id', 'video') if not is_video_url: self.to_screen('A channel URL was given. Only the channel\'s videos will be downloaded. ' 'To download audios, add a "/audio" to the URL') + signature = self._extract_signature(playlist_id) + def fetch_page(page_idx): + query = { + 'keyword': '', + 'mid': playlist_id, + 'order': 'pubdate', + 'order_avoided': 'true', + 'platform': 'web', + 'pn': page_idx + 1, + 'ps': 30, + 'tid': 0, + 'web_location': 1550101, + 'wts': int(time.time()), + } + query['w_rid'] = hashlib.md5(f'{urllib.parse.urlencode(query)}{signature}'.encode()).hexdigest() + try: - response = self._download_json('https://api.bilibili.com/x/space/arc/search', - playlist_id, note=f'Downloading page {page_idx}', - query={'mid': playlist_id, 'pn': page_idx + 1, 'jsonp': 'jsonp'}) + response = self._download_json('https://api.bilibili.com/x/space/wbi/arc/search', + playlist_id, note=f'Downloading page {page_idx}', query=query) except ExtractorError as e: - if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 412: + if isinstance(e.cause, HTTPError) and e.cause.status == 412: raise ExtractorError( 'Request is blocked by server (412), please add cookies, wait and try later.', expected=True) raise @@ -489,9 +642,9 @@ class BilibiliSpaceVideoIE(BilibiliSpaceBaseIE): class BilibiliSpaceAudioIE(BilibiliSpaceBaseIE): _VALID_URL = r'https?://space\.bilibili\.com/(?P<id>\d+)/audio' _TESTS = [{ - 'url': 'https://space.bilibili.com/3985676/audio', + 'url': 'https://space.bilibili.com/313580179/audio', 'info_dict': { - 'id': '3985676', + 'id': '313580179', }, 'playlist_mincount': 1, }] @@ -880,35 +1033,24 @@ class BiliIntlBaseIE(InfoExtractor): return formats - def _extract_video_info(self, video_data, *, ep_id=None, aid=None): + def _parse_video_metadata(self, video_data): return { - 'id': ep_id or aid, 'title': video_data.get('title_display') or video_data.get('title'), 'thumbnail': video_data.get('cover'), 'episode_number': int_or_none(self._search_regex( r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), - 'formats': self._get_formats(ep_id=ep_id, aid=aid), - 'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid), - 'extractor_key': BiliIntlIE.ie_key(), } def _perform_login(self, username, password): - try: - from Cryptodome.PublicKey import RSA - from Cryptodome.Cipher import PKCS1_v1_5 - except ImportError: - try: - from Crypto.PublicKey import RSA - from Crypto.Cipher import PKCS1_v1_5 - except ImportError: - raise ExtractorError('pycryptodomex not found. Please install', expected=True) + if not Cryptodome.RSA: + raise ExtractorError('pycryptodomex not found. Please install', expected=True) key_data = self._download_json( 'https://passport.bilibili.tv/x/intl/passport-login/web/key?lang=en-US', None, note='Downloading login key', errnote='Unable to download login key')['data'] - public_key = RSA.importKey(key_data['key']) - password_hash = PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8')) + public_key = Cryptodome.RSA.importKey(key_data['key']) + password_hash = Cryptodome.PKCS1_v1_5.new(public_key).encrypt((key_data['hash'] + password).encode('utf-8')) login_post = self._download_json( 'https://passport.bilibili.tv/x/intl/passport-login/web/login/password?lang=en-US', None, data=urlencode_postdata({ 'username': username, @@ -935,6 +1077,23 @@ class BiliIntlIE(BiliIntlBaseIE): 'title': 'E2 - The First Night', 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', 'episode_number': 2, + 'upload_date': '20201009', + 'episode': 'Episode 2', + 'timestamp': 1602259500, + 'description': 'md5:297b5a17155eb645e14a14b385ab547e', + 'chapters': [{ + 'start_time': 0, + 'end_time': 76.242, + 'title': '<Untitled Chapter 1>' + }, { + 'start_time': 76.242, + 'end_time': 161.161, + 'title': 'Intro' + }, { + 'start_time': 1325.742, + 'end_time': 1403.903, + 'title': 'Outro' + }], } }, { # Non-Bstation page @@ -945,6 +1104,23 @@ class BiliIntlIE(BiliIntlBaseIE): 'title': 'E3 - Who?', 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', 'episode_number': 3, + 'description': 'md5:e1a775e71a35c43f141484715470ad09', + 'episode': 'Episode 3', + 'upload_date': '20211219', + 'timestamp': 1639928700, + 'chapters': [{ + 'start_time': 0, + 'end_time': 88.0, + 'title': '<Untitled Chapter 1>' + }, { + 'start_time': 88.0, + 'end_time': 156.0, + 'title': 'Intro' + }, { + 'start_time': 1173.0, + 'end_time': 1259.535, + 'title': 'Outro' + }], } }, { # Subtitle with empty content @@ -958,6 +1134,78 @@ class BiliIntlIE(BiliIntlBaseIE): }, 'skip': 'According to the copyright owner\'s request, you may only watch the video after you log in.' }, { + 'url': 'https://www.bilibili.tv/en/video/2041863208', + 'info_dict': { + 'id': '2041863208', + 'ext': 'mp4', + 'timestamp': 1670874843, + 'description': 'Scheduled for April 2023.\nStudio: ufotable', + 'thumbnail': r're:https?://pic[-\.]bstarstatic.+/ugc/.+\.jpg$', + 'upload_date': '20221212', + 'title': 'Kimetsu no Yaiba Season 3 Official Trailer - Bstation', + }, + }, { + # episode comment extraction + 'url': 'https://www.bilibili.tv/en/play/34580/340317', + 'info_dict': { + 'id': '340317', + 'ext': 'mp4', + 'timestamp': 1604057820, + 'upload_date': '20201030', + 'episode_number': 5, + 'title': 'E5 - My Own Steel', + 'description': 'md5:2b17ab10aebb33e3c2a54da9e8e487e2', + 'thumbnail': r're:https?://pic\.bstarstatic\.com/ogv/.+\.png$', + 'episode': 'Episode 5', + 'comment_count': int, + 'chapters': [{ + 'start_time': 0, + 'end_time': 61.0, + 'title': '<Untitled Chapter 1>' + }, { + 'start_time': 61.0, + 'end_time': 134.0, + 'title': 'Intro' + }, { + 'start_time': 1290.0, + 'end_time': 1379.0, + 'title': 'Outro' + }], + }, + 'params': { + 'getcomments': True + } + }, { + # user generated content comment extraction + 'url': 'https://www.bilibili.tv/en/video/2045730385', + 'info_dict': { + 'id': '2045730385', + 'ext': 'mp4', + 'description': 'md5:693b6f3967fb4e7e7764ea817857c33a', + 'timestamp': 1667891924, + 'upload_date': '20221108', + 'title': 'That Time I Got Reincarnated as a Slime: Scarlet Bond - Official Trailer 3| AnimeStan - Bstation', + 'comment_count': int, + 'thumbnail': 'https://pic.bstarstatic.com/ugc/f6c363659efd2eabe5683fbb906b1582.jpg', + }, + 'params': { + 'getcomments': True + } + }, { + # episode id without intro and outro + 'url': 'https://www.bilibili.tv/en/play/1048837/11246489', + 'info_dict': { + 'id': '11246489', + 'ext': 'mp4', + 'title': 'E1 - Operation \'Strix\' <Owl>', + 'description': 'md5:b4434eb1a9a97ad2bccb779514b89f17', + 'timestamp': 1649516400, + 'thumbnail': 'https://pic.bstarstatic.com/ogv/62cb1de23ada17fb70fbe7bdd6ff29c29da02a64.png', + 'episode': 'Episode 1', + 'episode_number': 1, + 'upload_date': '20220409', + }, + }, { 'url': 'https://www.biliintl.com/en/play/34613/341736', 'only_matching': True, }, { @@ -974,36 +1222,139 @@ class BiliIntlIE(BiliIntlBaseIE): 'only_matching': True, }] - def _real_extract(self, url): - season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') - video_id = ep_id or aid + def _make_url(video_id, series_id=None): + if series_id: + return f'https://www.bilibili.tv/en/play/{series_id}/{video_id}' + return f'https://www.bilibili.tv/en/video/{video_id}' + + def _extract_video_metadata(self, url, video_id, season_id): + url, smuggled_data = unsmuggle_url(url, {}) + if smuggled_data.get('title'): + return smuggled_data + webpage = self._download_webpage(url, video_id) # Bstation layout initial_data = ( self._search_json(r'window\.__INITIAL_(?:DATA|STATE)__\s*=', webpage, 'preload state', video_id, default={}) or self._search_nuxt_data(webpage, video_id, '__initialState', fatal=False, traverse=None)) video_data = traverse_obj( - initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict) + initial_data, ('OgvVideo', 'epDetail'), ('UgcVideo', 'videoData'), ('ugc', 'archive'), expected_type=dict) or {} if season_id and not video_data: # Non-Bstation layout, read through episode list season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) - video_data = traverse_obj(season_json, - ('sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == ep_id), - expected_type=dict, get_all=False) - return self._extract_video_info(video_data or {}, ep_id=ep_id, aid=aid) + video_data = traverse_obj(season_json, ( + 'sections', ..., 'episodes', lambda _, v: str(v['episode_id']) == video_id + ), expected_type=dict, get_all=False) + + # XXX: webpage metadata may not accurate, it just used to not crash when video_data not found + return merge_dicts( + self._parse_video_metadata(video_data), self._search_json_ld(webpage, video_id, fatal=False), { + 'title': self._html_search_meta('og:title', webpage), + 'description': self._html_search_meta('og:description', webpage) + }) + + def _get_comments_reply(self, root_id, next_id=0, display_id=None): + comment_api_raw_data = self._download_json( + 'https://api.bilibili.tv/reply/web/detail', display_id, + note=f'Downloading reply comment of {root_id} - {next_id}', + query={ + 'platform': 'web', + 'ps': 20, # comment's reply per page (default: 3) + 'root': root_id, + 'next': next_id, + }) + + for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): + yield { + 'author': traverse_obj(replies, ('member', 'name')), + 'author_id': traverse_obj(replies, ('member', 'mid')), + 'author_thumbnail': traverse_obj(replies, ('member', 'face')), + 'text': traverse_obj(replies, ('content', 'message')), + 'id': replies.get('rpid'), + 'like_count': int_or_none(replies.get('like_count')), + 'parent': replies.get('parent'), + 'timestamp': unified_timestamp(replies.get('ctime_text')) + } + + if not traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): + yield from self._get_comments_reply( + root_id, comment_api_raw_data['data']['cursor']['next'], display_id) + + def _get_comments(self, video_id, ep_id): + for i in itertools.count(0): + comment_api_raw_data = self._download_json( + 'https://api.bilibili.tv/reply/web/root', video_id, + note=f'Downloading comment page {i + 1}', + query={ + 'platform': 'web', + 'pn': i, # page number + 'ps': 20, # comment per page (default: 20) + 'oid': video_id, + 'type': 3 if ep_id else 1, # 1: user generated content, 3: series content + 'sort_type': 1, # 1: best, 2: recent + }) + + for replies in traverse_obj(comment_api_raw_data, ('data', 'replies', ...)): + yield { + 'author': traverse_obj(replies, ('member', 'name')), + 'author_id': traverse_obj(replies, ('member', 'mid')), + 'author_thumbnail': traverse_obj(replies, ('member', 'face')), + 'text': traverse_obj(replies, ('content', 'message')), + 'id': replies.get('rpid'), + 'like_count': int_or_none(replies.get('like_count')), + 'timestamp': unified_timestamp(replies.get('ctime_text')), + 'author_is_uploader': bool(traverse_obj(replies, ('member', 'type'))), + } + if replies.get('count'): + yield from self._get_comments_reply(replies.get('rpid'), display_id=video_id) + + if traverse_obj(comment_api_raw_data, ('data', 'cursor', 'is_end')): + break + + def _real_extract(self, url): + season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') + video_id = ep_id or aid + chapters = None + + if ep_id: + intro_ending_json = self._call_api( + f'/web/v2/ogv/play/episode?episode_id={ep_id}&platform=web', + video_id, fatal=False) or {} + if intro_ending_json.get('skip'): + # FIXME: start time and end time seems a bit off a few second even it corrext based on ogv.*.js + # ref: https://p.bstarstatic.com/fe-static/bstar-web-new/assets/ogv.2b147442.js + chapters = [{ + 'start_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'opening_start_time')), 1000), + 'end_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'opening_end_time')), 1000), + 'title': 'Intro' + }, { + 'start_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'ending_start_time')), 1000), + 'end_time': float_or_none(traverse_obj(intro_ending_json, ('skip', 'ending_end_time')), 1000), + 'title': 'Outro' + }] + + return { + 'id': video_id, + **self._extract_video_metadata(url, video_id, season_id), + 'formats': self._get_formats(ep_id=ep_id, aid=aid), + 'subtitles': self.extract_subtitles(ep_id=ep_id, aid=aid), + 'chapters': chapters, + '__post_extractor': self.extract_comments(video_id, ep_id) + } class BiliIntlSeriesIE(BiliIntlBaseIE): - _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?play/(?P<id>\d+)/?(?:[?#]|$)' + IE_NAME = 'biliIntl:series' + _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-zA-Z]{2}/)?(?:play|media)/(?P<id>\d+)/?(?:[?#]|$)' _TESTS = [{ 'url': 'https://www.bilibili.tv/en/play/34613', 'playlist_mincount': 15, 'info_dict': { 'id': '34613', - 'title': 'Fly Me to the Moon', - 'description': 'md5:a861ee1c4dc0acfad85f557cc42ac627', - 'categories': ['Romance', 'Comedy', 'Slice of life'], + 'title': 'TONIKAWA: Over the Moon For You', + 'description': 'md5:297b5a17155eb645e14a14b385ab547e', + 'categories': ['Slice of life', 'Comedy', 'Romance'], 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.png$', 'view_count': int, }, @@ -1011,6 +1362,17 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): 'skip_download': True, }, }, { + 'url': 'https://www.bilibili.tv/en/media/1048837', + 'info_dict': { + 'id': '1048837', + 'title': 'SPY×FAMILY', + 'description': 'md5:b4434eb1a9a97ad2bccb779514b89f17', + 'categories': ['Adventure', 'Action', 'Comedy'], + 'thumbnail': r're:^https://pic\.bstarstatic\.com/ogv/.+\.jpg$', + 'view_count': int, + }, + 'playlist_mincount': 25, + }, { 'url': 'https://www.biliintl.com/en/play/34613', 'only_matching': True, }, { @@ -1020,9 +1382,12 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): def _entries(self, series_id): series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id) - for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]): - episode_id = str(episode.get('episode_id')) - yield self._extract_video_info(episode, ep_id=episode_id) + for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict): + episode_id = str(episode['episode_id']) + yield self.url_result(smuggle_url( + BiliIntlIE._make_url(episode_id, series_id), + self._parse_video_metadata(episode) + ), BiliIntlIE, episode_id) def _real_extract(self, url): series_id = self._match_id(url) @@ -1034,7 +1399,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE): class BiliLiveIE(InfoExtractor): - _VALID_URL = r'https?://live.bilibili.com/(?P<id>\d+)' + _VALID_URL = r'https?://live.bilibili.com/(?:blanc/)?(?P<id>\d+)' _TESTS = [{ 'url': 'https://live.bilibili.com/196', @@ -1050,6 +1415,9 @@ class BiliLiveIE(InfoExtractor): }, { 'url': 'https://live.bilibili.com/196?broadcast_type=0&is_room_feed=1?spm_id_from=333.999.space_home.strengthen_live_card.click', 'only_matching': True + }, { + 'url': 'https://live.bilibili.com/blanc/196', + 'only_matching': True }] _FORMATS = { @@ -1111,6 +1479,7 @@ class BiliLiveIE(InfoExtractor): 'thumbnail': room_data.get('user_cover'), 'timestamp': stream_data.get('live_time'), 'formats': formats, + 'is_live': True, 'http_headers': { 'Referer': url, }, |