# yt_dlp/extractor/tvopengr.py
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..utils import (
    determine_ext,
    get_elements_text_and_html_by_attribute,
    merge_dicts,
    unescapeHTML,
)


class TVOpenGrBaseIE(InfoExtractor):
    """Common base for the tvopen.gr / ethnos.gr extractors in this module."""

    def _return_canonical_url(self, url, video_id):
        """Hand extraction over to TVOpenGrWatchIE.

        Downloads *url*, reads the URL and title found by the
        ``_og_search_url`` / ``_og_search_title`` helpers, and returns a
        ``url_result`` pointing at that URL for TVOpenGrWatchIE.
        """
        page = self._download_webpage(url, video_id)
        # Look the URL up before the title, matching the lookup order callers rely on
        target_url = self._og_search_url(page)
        page_title = self._og_search_title(page)
        return self.url_result(
            target_url,
            ie=TVOpenGrWatchIE.ie_key(),
            video_id=video_id,
            video_title=page_title,
        )


class TVOpenGrWatchIE(TVOpenGrBaseIE):
    """Extractor for ``/watch/<id>/<slug>`` pages on tvopen.gr and ethnos.gr.

    Pages whose host is not tvopen.gr are redirected (via
    ``_return_canonical_url``) to the URL advertised by the page itself;
    tvopen.gr pages are extracted from JSON-LD plus the player API.
    """

    IE_NAME = 'tvopengr:watch'
    IE_DESC = 'tvopen.gr (and ethnos.gr) videos'
    _VALID_URL = r'https?://(?P<netloc>(?:www\.)?(?:tvopen|ethnos)\.gr)/watch/(?P<id>\d+)/(?P<slug>[^/]+)'
    # Queried with ``cid=<video id>``; the JSON response maps stream kinds to URLs
    _API_ENDPOINT = 'https://www.tvopen.gr/templates/data/player'

    _TESTS = [{
        'url': 'https://www.ethnos.gr/watch/101009/nikoskaprabelosdenexoymekanenanasthenhsemethmethmetallaxhomikron',
        'md5': '8728570e3a72e0f8d9475ba94859fdc1',
        'info_dict': {
            'id': '101009',
            'title': 'md5:51f68773dcb6c70498cd326f45fefdf0',
            'display_id': 'nikoskaprabelosdenexoymekanenanasthenhsemethmethmetallaxhomikron',
            'description': 'md5:78fff49f18fb3effe41b070e5c7685d6',
            'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/d573ba71-ec5f-43c6-b4cb-d181f327d3a8.jpg',
            'ext': 'mp4',
            'upload_date': '20220109',
            'timestamp': 1641686400,
        },
    }, {
        'url': 'https://www.tvopen.gr/watch/100979/se28099agapaomenalla7cepeisodio267cmhthrargiapashskakias',
        'md5': '38f98a1be0c577db4ea2d1b1c0770c48',
        'info_dict': {
            'id': '100979',
            'title': 'md5:e021f3001e16088ee40fa79b20df305b',
            'display_id': 'se28099agapaomenalla7cepeisodio267cmhthrargiapashskakias',
            'description': 'md5:ba17db53954134eb8d625d199e2919fb',
            'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/9bb71cf1-21da-43a9-9d65-367950fde4e3.jpg',
            'ext': 'mp4',
            'upload_date': '20220108',
            'timestamp': 1641600000,
        },
    }]

    def _extract_formats_and_subs(self, response, video_id):
        """Build formats and subtitles from the player API *response*.

        Only the ``stream``, ``httpstream`` and ``mpegdash`` entries are used.
        Each URL is dispatched on its extension: ``m3u8`` and ``mpd``
        manifests are expanded (non-fatally), anything else is added as a
        single direct format named after its key.

        Returns a ``(formats, subtitles)`` tuple; formats are sorted.
        """
        formats, subs = [], {}
        for format_id, format_url in response.items():
            if format_id not in ('stream', 'httpstream', 'mpegdash'):
                continue
            if not format_url:
                # An entry without a URL cannot produce a usable format
                continue
            ext = determine_ext(format_url)
            if ext == 'm3u8':
                formats_, subs_ = self._extract_m3u8_formats_and_subtitles(
                    format_url, video_id, 'mp4', m3u8_id=format_id,
                    fatal=False)
            elif ext == 'mpd':
                # The third positional parameter of the MPD helper is the
                # manifest id (not an extension), so pass the id by keyword
                formats_, subs_ = self._extract_mpd_formats_and_subtitles(
                    format_url, video_id, mpd_id=format_id, fatal=False)
            else:
                formats.append({
                    'url': format_url,
                    'format_id': format_id,
                })
                continue
            formats.extend(formats_)
            self._merge_subtitles(subs_, target=subs)
        self._sort_formats(formats)
        return formats, subs

    @staticmethod
    def _scale_thumbnails_to_max_width(formats, thumbnails, url_width_re):
        """Rewrite thumbnail URLs to request the largest format's width.

        The largest ``(width, height)`` pair among *formats* is chosen
        (missing values count as 0). If no width is known, *thumbnails* is
        returned unchanged. Otherwise every match of *url_width_re* in each
        thumbnail URL is replaced by that width, and ``width``/``height`` are
        set to the chosen dimensions.
        """
        _keys = ('width', 'height')
        max_dimensions = max(
            [tuple(fmt.get(k) or 0 for k in _keys) for fmt in formats],
            default=(0, 0))
        if not max_dimensions[0]:
            return thumbnails
        # merge_dicts is given the rewritten URL and the new dimensions ahead
        # of the original thumbnail, whose remaining keys are carried over
        return [
            merge_dicts(
                {'url': re.sub(url_width_re, str(max_dimensions[0]), thumbnail['url'])},
                dict(zip(_keys, max_dimensions)), thumbnail)
            for thumbnail in thumbnails
        ]

    def _real_extract(self, url):
        """Extract a watch page, or redirect non-tvopen.gr hosts to their canonical URL."""
        netloc, video_id, display_id = self._match_valid_url(url).group('netloc', 'id', 'slug')
        if 'tvopen.gr' not in netloc:
            return self._return_canonical_url(url, video_id)
        webpage = self._download_webpage(url, video_id)
        info = self._search_json_ld(webpage, video_id, expected_type='VideoObject')
        info['formats'], info['subtitles'] = self._extract_formats_and_subs(
            self._download_json(self._API_ENDPOINT, video_id, query={'cid': video_id}),
            video_id)
        # JSON-LD may carry no thumbnails; fall back to an empty list
        info['thumbnails'] = self._scale_thumbnails_to_max_width(
            info['formats'], info.get('thumbnails') or [], r'(?<=/imgHandler/)\d+')
        # Default avoids a bare StopIteration when no element has class="description"
        description, _html = next(
            get_elements_text_and_html_by_attribute('class', 'description', webpage),
            (None, None))
        # Only a <span> element's text replaces the description already in info
        if description and _html.startswith('<span '):
            info['description'] = description
        info['id'] = video_id
        info['display_id'] = display_id
        return info


class TVOpenGrEmbedIE(TVOpenGrBaseIE):
    """Extractor for ``/embed/<id>`` player URLs on tvopen.gr and ethnos.gr.

    The embed page is only used to find the canonical watch URL, which is
    then handed to TVOpenGrWatchIE by ``_return_canonical_url``.
    """

    IE_NAME = 'tvopengr:embed'
    IE_DESC = 'tvopen.gr embedded videos'
    # The dot before "gr" is escaped so only the literal domains match
    _VALID_URL = r'(?:https?:)?//(?:www\.|cdn\.|)(?:tvopen|ethnos)\.gr/embed/(?P<id>\d+)'
    # Matches an <iframe> whose quoted src attribute is an embed URL
    _EMBED_RE = re.compile(rf'''<iframe[^>]+?src=(?P<_q1>["'])(?P<url>{_VALID_URL})(?P=_q1)''')

    _TESTS = [{
        'url': 'https://cdn.ethnos.gr/embed/100963',
        'md5': '2da147881f45571d81662d94d086628b',
        'info_dict': {
            'id': '100963',
            'display_id': 'koronoiosapotoysdieythyntestonsxoleionselftestgiaosoysdenbrhkan',
            'title': 'md5:2c71876fadf0cda6043da0da5fca2936',
            'description': 'md5:17482b4432e5ed30eccd93b05d6ea509',
            'thumbnail': 'https://opentv-static.siliconweb.com/imgHandler/1920/5804e07f-799a-4247-a696-33842c94ca37.jpg',
            'ext': 'mp4',
            'upload_date': '20220108',
            'timestamp': 1641600000,
        },
    }]

    @classmethod
    def _extract_urls(cls, webpage):
        """Yield the HTML-unescaped embed URL of every matching iframe in *webpage*."""
        for mobj in cls._EMBED_RE.finditer(webpage):
            yield unescapeHTML(mobj.group('url'))

    def _real_extract(self, url):
        """Resolve the embed URL to its canonical watch page."""
        video_id = self._match_id(url)
        return self._return_canonical_url(url, video_id)