hypervideo_dl/extractor/tbsjp.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152

from .common import InfoExtractor
from ..networking.exceptions import HTTPError
from ..utils import (
    ExtractorError,
    clean_html,
    get_element_text_and_html_by_tag,
    int_or_none,
    str_or_none,
    traverse_obj,
    try_call,
    unified_timestamp,
    urljoin,
)


class TBSJPEpisodeIE(InfoExtractor):
    _VALID_URL = r'https?://cu\.tbs\.co\.jp/episode/(?P<id>[\d_]+)'
    _GEO_BYPASS = False
    _TESTS = [{
        'url': 'https://cu.tbs.co.jp/episode/23613_2044134_1000049010',
        'skip': 'streams geo-restricted, Japan only. Also, will likely expire eventually',
        'info_dict': {
            'title': 'VIVANT 第三話 誤送金完結へ!絶体絶命の反撃開始',
            'id': '23613_2044134_1000049010',
            'ext': 'mp4',
            'upload_date': '20230728',
            'duration': 3517,
            'release_timestamp': 1691118230,
            'episode': '第三話 誤送金完結へ!絶体絶命の反撃開始',
            'release_date': '20230804',
            'categories': 'count:11',
            'episode_number': 3,
            'timestamp': 1690522538,
            'description': 'md5:2b796341af1ef772034133174ba4a895',
            'series': 'VIVANT',
        },
    }]

    def _real_extract(self, url):
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        meta = self._search_json(r'window\.app\s*=', webpage, 'episode info', video_id, fatal=False)
        episode = traverse_obj(meta, ('falcorCache', 'catalog', 'episode', video_id, 'value'))

        tf_path = self._search_regex(
            r'<script[^>]+src=["\'](/assets/tf\.[^"\']+\.js)["\']', webpage, 'stream API config')
        tf_js = self._download_webpage(urljoin(url, tf_path), video_id, note='Downloading stream API config')
        video_url = self._search_regex(r'videoPlaybackUrl:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API url')
        api_key = self._search_regex(r'api_key:\s*[\'"]([^\'"]+)[\'"]', tf_js, 'stream API key')

        try:
            source_meta = self._download_json(f'{video_url}ref:{video_id}', video_id,
                                              headers={'X-Streaks-Api-Key': api_key},
                                              note='Downloading stream metadata')
        except ExtractorError as e:
            if isinstance(e.cause, HTTPError) and e.cause.status == 403:
                self.raise_geo_restricted(countries=['JP'])
            raise

        formats, subtitles = [], {}
        for src in traverse_obj(source_meta, ('sources', ..., 'src')):
            fmts, subs = self._extract_m3u8_formats_and_subtitles(src, video_id, fatal=False)
            formats.extend(fmts)
            self._merge_subtitles(subs, target=subtitles)

        return {
            'title': try_call(lambda: clean_html(get_element_text_and_html_by_tag('h3', webpage)[0])),
            'id': video_id,
            **traverse_obj(episode, {
                'categories': ('keywords', {list}),
                'id': ('content_id', {str}),
                'description': ('description', 0, 'value'),
                'timestamp': ('created_at', {unified_timestamp}),
                'release_timestamp': ('pub_date', {unified_timestamp}),
                'duration': ('tv_episode_info', 'duration', {int_or_none}),
                'episode_number': ('tv_episode_info', 'episode_number', {int_or_none}),
                'episode': ('title', lambda _, v: not v.get('is_phonetic'), 'value'),
                'series': ('custom_data', 'program_name'),
            }, get_all=False),
            'formats': formats,
            'subtitles': subtitles,
        }


class TBSJPProgramIE(InfoExtractor):
    _VALID_URL = r'https?://cu\.tbs\.co\.jp/program/(?P<id>\d+)'
    _TESTS = [{
        'url': 'https://cu.tbs.co.jp/program/23601',
        'playlist_mincount': 4,
        'info_dict': {
            'id': '23601',
            'categories': ['エンタメ', 'ミライカプセル', '会社', '働く', 'バラエティ', '動画'],
            'description': '幼少期の夢は大人になって、どう成長したのだろうか？\nそしてその夢は今後、どのように広がっていくのか？\nいま話題の会社で働く人の「夢の成長」を描く',
            'series': 'ミライカプセル　-I have a dream-',
            'title': 'ミライカプセル　-I have a dream-'
        }
    }]

    def _real_extract(self, url):
        programme_id = self._match_id(url)
        webpage = self._download_webpage(url, programme_id)
        meta = self._search_json(r'window\.app\s*=', webpage, 'programme info', programme_id)

        programme = traverse_obj(meta, ('falcorCache', 'catalog', 'program', programme_id, 'false', 'value'))

        return {
            '_type': 'playlist',
            'entries': [self.url_result(f'https://cu.tbs.co.jp/episode/{video_id}', TBSJPEpisodeIE, video_id)
                        for video_id in traverse_obj(programme, ('custom_data', 'seriesList', 'episodeCode', ...))],
            'id': programme_id,
            **traverse_obj(programme, {
                'categories': ('keywords', ...),
                'id': ('tv_episode_info', 'show_content_id', {str_or_none}),
                'description': ('custom_data', 'program_description'),
                'series': ('custom_data', 'program_name'),
                'title': ('custom_data', 'program_name'),
            }),
        }


class TBSJPPlaylistIE(InfoExtractor):
    _VALID_URL = r'https?://cu\.tbs\.co\.jp/playlist/(?P<id>[\da-f]+)'
    _TESTS = [{
        'url': 'https://cu.tbs.co.jp/playlist/184f9970e7ba48e4915f1b252c55015e',
        'playlist_mincount': 4,
        'info_dict': {
            'title': 'まもなく配信終了',
            'id': '184f9970e7ba48e4915f1b252c55015e',
        }
    }]

    def _real_extract(self, url):
        playlist_id = self._match_id(url)
        page = self._download_webpage(url, playlist_id)
        meta = self._search_json(r'window\.app\s*=', page, 'playlist info', playlist_id)
        playlist = traverse_obj(meta, ('falcorCache', 'playList', playlist_id))

        def entries():
            for entry in traverse_obj(playlist, ('catalogs', 'value', lambda _, v: v['content_id'])):
                # TODO: it's likely possible to get all metadata from the playlist page json instead
                content_id = entry['content_id']
                content_type = entry.get('content_type')
                if content_type == 'tv_show':
                    yield self.url_result(
                        f'https://cu.tbs.co.jp/program/{content_id}', TBSJPProgramIE, content_id)
                elif content_type == 'tv_episode':
                    yield self.url_result(
                        f'https://cu.tbs.co.jp/episode/{content_id}', TBSJPEpisodeIE, content_id)
                else:
                    self.report_warning(f'Skipping "{content_id}" with unsupported content_type "{content_type}"')

        return self.playlist_result(entries(), playlist_id, traverse_obj(playlist, ('display_name', 'value')))