diff options
| author | Luc Ritchie <luc.ritchie@gmail.com> | 2022-03-27 23:21:42 -0400 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-03-27 20:21:42 -0700 | 
| commit | f5f15c9993cf8087753a7ba2b57fee55e366b80e (patch) | |
| tree | 1c0a9891cf042f87571df24a6032eabf64782727 | |
| parent | cb96c5be7002a1b16c1abbb11c2cd0239d86825a (diff) | |
| download | hypervideo-pre-f5f15c9993cf8087753a7ba2b57fee55e366b80e.tar.lz hypervideo-pre-f5f15c9993cf8087753a7ba2b57fee55e366b80e.tar.xz hypervideo-pre-f5f15c9993cf8087753a7ba2b57fee55e366b80e.zip | |
[BiliIntl] Support user-generated videos (#3203)
Authored by: wlritchi
| -rw-r--r-- | yt_dlp/extractor/bilibili.py | 67 | 
1 file changed, 45 insertions, 22 deletions
| diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py index b4eb20642..dd1ff512e 100644 --- a/yt_dlp/extractor/bilibili.py +++ b/yt_dlp/extractor/bilibili.py @@ -15,6 +15,7 @@ from ..compat import (  )  from ..utils import (      ExtractorError, +    filter_dict,      int_or_none,      float_or_none,      mimetype2ext, @@ -755,15 +756,21 @@ class BiliIntlBaseIE(InfoExtractor):              for i, line in enumerate(json['body']) if line.get('content'))          return data -    def _get_subtitles(self, ep_id): -        sub_json = self._call_api(f'/web/v2/subtitle?episode_id={ep_id}&platform=web', ep_id) +    def _get_subtitles(self, *, ep_id=None, aid=None): +        sub_json = self._call_api( +            '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list', +            errnote='Unable to download subtitles list', query=filter_dict({ +                'platform': 'web', +                'episode_id': ep_id, +                'aid': aid, +            }))          subtitles = {}          for sub in sub_json.get('subtitles') or []:              sub_url = sub.get('url')              if not sub_url:                  continue              sub_data = self._download_json( -                sub_url, ep_id, errnote='Unable to download subtitles', fatal=False, +                sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False,                  note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '')              if not sub_data:                  continue @@ -773,9 +780,14 @@ class BiliIntlBaseIE(InfoExtractor):              })          return subtitles -    def _get_formats(self, ep_id): -        video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id, -                                    note='Downloading video formats', errnote='Unable to download video formats') +    def _get_formats(self, *, ep_id=None, aid=None): +        video_json = self._call_api( +            
'/web/playurl', ep_id or aid, note='Downloading video formats', +            errnote='Unable to download video formats', query=filter_dict({ +                'platform': 'web', +                'ep_id': ep_id, +                'aid': aid, +            }))          video_json = video_json['playurl']          formats = []          for vid in video_json.get('video') or []: @@ -809,15 +821,15 @@ class BiliIntlBaseIE(InfoExtractor):          self._sort_formats(formats)          return formats -    def _extract_ep_info(self, episode_data, ep_id): +    def _extract_video_info(self, video_data, *, ep_id=None, aid=None):          return { -            'id': ep_id, -            'title': episode_data.get('title_display') or episode_data['title'], -            'thumbnail': episode_data.get('cover'), +            'id': ep_id or aid, +            'title': video_data.get('title_display') or video_data.get('title'), +            'thumbnail': video_data.get('cover'),              'episode_number': int_or_none(self._search_regex( -                r'^E(\d+)(?:$| - )', episode_data.get('title_display'), 'episode number', default=None)), -            'formats': self._get_formats(ep_id), -            'subtitles': self._get_subtitles(ep_id), +                r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)), +            'formats': self._get_formats(ep_id=ep_id, aid=aid), +            'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid),              'extractor_key': BiliIntlIE.ie_key(),          } @@ -854,7 +866,7 @@ class BiliIntlBaseIE(InfoExtractor):  class BiliIntlIE(BiliIntlBaseIE): -    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)' +    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))'      _TESTS = [{          # Bstation page          'url': 
'https://www.bilibili.tv/en/play/34613/341736', @@ -889,24 +901,35 @@ class BiliIntlIE(BiliIntlBaseIE):      }, {          'url': 'https://www.biliintl.com/en/play/34613/341736',          'only_matching': True, +    }, { +        # User-generated content (as opposed to a series licensed from a studio) +        'url': 'https://bilibili.tv/en/video/2019955076', +        'only_matching': True, +    }, { +        # No language in URL +        'url': 'https://www.bilibili.tv/video/2019955076', +        'only_matching': True,      }]      def _real_extract(self, url): -        season_id, video_id = self._match_valid_url(url).groups() +        season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid') +        video_id = ep_id or aid          webpage = self._download_webpage(url, video_id)          # Bstation layout          initial_data = self._parse_json(self._search_regex( -            r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage, +            r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage,              'preload state', default='{}'), video_id, fatal=False) or {} -        episode_data = traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) +        video_data = ( +            traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict) +            or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {}) -        if not episode_data: +        if season_id and not video_data:              # Non-Bstation layout, read through episode list              season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id) -            episode_data = next( +            video_data = next(                  episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict) -                if str(episode.get('episode_id')) == video_id) -        return self._extract_ep_info(episode_data, video_id) +     
           if str(episode.get('episode_id')) == ep_id) +        return self._extract_video_info(video_data, ep_id=ep_id, aid=aid)  class BiliIntlSeriesIE(BiliIntlBaseIE): @@ -934,7 +957,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):          series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)          for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):              episode_id = str(episode.get('episode_id')) -            yield self._extract_ep_info(episode, episode_id) +            yield self._extract_video_info(episode, ep_id=episode_id)      def _real_extract(self, url):          series_id = self._match_id(url) | 
