[BiliIntl] Support user-generated videos (#3203)

Authored by: wlritchi
author: Luc Ritchie <luc.ritchie@gmail.com> 2022-03-27 23:21:42 -0400
committer: GitHub <noreply@github.com> 2022-03-27 20:21:42 -0700
commit: f5f15c9993cf8087753a7ba2b57fee55e366b80e (patch)
tree: 1c0a9891cf042f87571df24a6032eabf64782727 /yt_dlp/extractor/bilibili.py
parent: cb96c5be7002a1b16c1abbb11c2cd0239d86825a (diff)
download: hypervideo-pre-f5f15c9993cf8087753a7ba2b57fee55e366b80e.tar.lz
hypervideo-pre-f5f15c9993cf8087753a7ba2b57fee55e366b80e.tar.xz
hypervideo-pre-f5f15c9993cf8087753a7ba2b57fee55e366b80e.zip
1 file changed, 45 insertions, 22 deletions
diff --git a/yt_dlp/extractor/bilibili.py b/yt_dlp/extractor/bilibili.py
index b4eb20642..dd1ff512e 100644
--- a/yt_dlp/extractor/bilibili.py
+++ b/yt_dlp/extractor/bilibili.py
@@ -15,6 +15,7 @@ from ..compat import (
 )
 from ..utils import (
     ExtractorError,
+    filter_dict,
     int_or_none,
     float_or_none,
     mimetype2ext,
@@ -755,15 +756,21 @@ class BiliIntlBaseIE(InfoExtractor):
             for i, line in enumerate(json['body']) if line.get('content'))
         return data
 
-    def _get_subtitles(self, ep_id):
-        sub_json = self._call_api(f'/web/v2/subtitle?episode_id={ep_id}&platform=web', ep_id)
+    def _get_subtitles(self, *, ep_id=None, aid=None):
+        sub_json = self._call_api(
+            '/web/v2/subtitle', ep_id or aid, note='Downloading subtitles list',
+            errnote='Unable to download subtitles list', query=filter_dict({
+                'platform': 'web',
+                'episode_id': ep_id,
+                'aid': aid,
+            }))
         subtitles = {}
         for sub in sub_json.get('subtitles') or []:
             sub_url = sub.get('url')
             if not sub_url:
                 continue
             sub_data = self._download_json(
-                sub_url, ep_id, errnote='Unable to download subtitles', fatal=False,
+                sub_url, ep_id or aid, errnote='Unable to download subtitles', fatal=False,
                 note='Downloading subtitles%s' % f' for {sub["lang"]}' if sub.get('lang') else '')
             if not sub_data:
                 continue
@@ -773,9 +780,14 @@ class BiliIntlBaseIE(InfoExtractor):
             })
         return subtitles
 
-    def _get_formats(self, ep_id):
-        video_json = self._call_api(f'/web/playurl?ep_id={ep_id}&platform=web', ep_id,
-                                    note='Downloading video formats', errnote='Unable to download video formats')
+    def _get_formats(self, *, ep_id=None, aid=None):
+        video_json = self._call_api(
+            '/web/playurl', ep_id or aid, note='Downloading video formats',
+            errnote='Unable to download video formats', query=filter_dict({
+                'platform': 'web',
+                'ep_id': ep_id,
+                'aid': aid,
+            }))
         video_json = video_json['playurl']
         formats = []
         for vid in video_json.get('video') or []:
@@ -809,15 +821,15 @@ class BiliIntlBaseIE(InfoExtractor):
         self._sort_formats(formats)
         return formats
 
-    def _extract_ep_info(self, episode_data, ep_id):
+    def _extract_video_info(self, video_data, *, ep_id=None, aid=None):
         return {
-            'id': ep_id,
-            'title': episode_data.get('title_display') or episode_data['title'],
-            'thumbnail': episode_data.get('cover'),
+            'id': ep_id or aid,
+            'title': video_data.get('title_display') or video_data.get('title'),
+            'thumbnail': video_data.get('cover'),
             'episode_number': int_or_none(self._search_regex(
-                r'^E(\d+)(?:$| - )', episode_data.get('title_display'), 'episode number', default=None)),
-            'formats': self._get_formats(ep_id),
-            'subtitles': self._get_subtitles(ep_id),
+                r'^E(\d+)(?:$| - )', video_data.get('title_display') or '', 'episode number', default=None)),
+            'formats': self._get_formats(ep_id=ep_id, aid=aid),
+            'subtitles': self._get_subtitles(ep_id=ep_id, aid=aid),
             'extractor_key': BiliIntlIE.ie_key(),
         }
 
@@ -854,7 +866,7 @@ class BiliIntlBaseIE(InfoExtractor):
 
 
 class BiliIntlIE(BiliIntlBaseIE):
-    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?play/(?P<season_id>\d+)/(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?bili(?:bili\.tv|intl\.com)/(?:[a-z]{2}/)?(play/(?P<season_id>\d+)/(?P<ep_id>\d+)|video/(?P<aid>\d+))'
     _TESTS = [{
         # Bstation page
         'url': 'https://www.bilibili.tv/en/play/34613/341736',
@@ -889,24 +901,35 @@ class BiliIntlIE(BiliIntlBaseIE):
     }, {
         'url': 'https://www.biliintl.com/en/play/34613/341736',
         'only_matching': True,
+    }, {
+        # User-generated content (as opposed to a series licensed from a studio)
+        'url': 'https://bilibili.tv/en/video/2019955076',
+        'only_matching': True,
+    }, {
+        # No language in URL
+        'url': 'https://www.bilibili.tv/video/2019955076',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
-        season_id, video_id = self._match_valid_url(url).groups()
+        season_id, ep_id, aid = self._match_valid_url(url).group('season_id', 'ep_id', 'aid')
+        video_id = ep_id or aid
         webpage = self._download_webpage(url, video_id)
         # Bstation layout
         initial_data = self._parse_json(self._search_regex(
-            r'window\.__INITIAL_DATA__\s*=\s*({.+?});', webpage,
+            r'window\.__INITIAL_(?:DATA|STATE)__\s*=\s*({.+?});', webpage,
             'preload state', default='{}'), video_id, fatal=False) or {}
-        episode_data = traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
+        video_data = (
+            traverse_obj(initial_data, ('OgvVideo', 'epDetail'), expected_type=dict)
+            or traverse_obj(initial_data, ('UgcVideo', 'videoData'), expected_type=dict) or {})
 
-        if not episode_data:
+        if season_id and not video_data:
             # Non-Bstation layout, read through episode list
             season_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={season_id}&platform=web', video_id)
-            episode_data = next(
+            video_data = next(
                 episode for episode in traverse_obj(season_json, ('sections', ..., 'episodes', ...), expected_type=dict)
-                if str(episode.get('episode_id')) == video_id)
-        return self._extract_ep_info(episode_data, video_id)
+                if str(episode.get('episode_id')) == ep_id)
+        return self._extract_video_info(video_data, ep_id=ep_id, aid=aid)
 
 
 class BiliIntlSeriesIE(BiliIntlBaseIE):
@@ -934,7 +957,7 @@ class BiliIntlSeriesIE(BiliIntlBaseIE):
         series_json = self._call_api(f'/web/v2/ogv/play/episodes?season_id={series_id}&platform=web', series_id)
         for episode in traverse_obj(series_json, ('sections', ..., 'episodes', ...), expected_type=dict, default=[]):
             episode_id = str(episode.get('episode_id'))
-            yield self._extract_ep_info(episode, episode_id)
+            yield self._extract_video_info(episode, ep_id=episode_id)
 
     def _real_extract(self, url):
         series_id = self._match_id(url)
author	Luc Ritchie <luc.ritchie@gmail.com>	2022-03-27 23:21:42 -0400
committer	GitHub <noreply@github.com>	2022-03-27 20:21:42 -0700
commit	f5f15c9993cf8087753a7ba2b57fee55e366b80e (patch)
tree	1c0a9891cf042f87571df24a6032eabf64782727 /yt_dlp/extractor/bilibili.py
parent	cb96c5be7002a1b16c1abbb11c2cd0239d86825a (diff)
download	hypervideo-pre-f5f15c9993cf8087753a7ba2b57fee55e366b80e.tar.lz hypervideo-pre-f5f15c9993cf8087753a7ba2b57fee55e366b80e.tar.xz hypervideo-pre-f5f15c9993cf8087753a7ba2b57fee55e366b80e.zip