aboutsummaryrefslogtreecommitdiffstats
path: root/hypervideo_dl/extractor/telegram.py
diff options
context:
space:
mode:
Diffstat (limited to 'hypervideo_dl/extractor/telegram.py')
-rw-r--r--hypervideo_dl/extractor/telegram.py141
1 files changed, 120 insertions, 21 deletions
diff --git a/hypervideo_dl/extractor/telegram.py b/hypervideo_dl/extractor/telegram.py
index 2dfa261..5ec5485 100644
--- a/hypervideo_dl/extractor/telegram.py
+++ b/hypervideo_dl/extractor/telegram.py
@@ -1,37 +1,136 @@
+import re
+
from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ format_field,
+ get_element_by_class,
+ parse_duration,
+ parse_qs,
+ traverse_obj,
+ unified_timestamp,
+ update_url_query,
+ url_basename,
+)
class TelegramEmbedIE(InfoExtractor):
IE_NAME = 'telegram:embed'
- _VALID_URL = r'https?://t\.me/(?P<channel_name>[^/]+)/(?P<id>\d+)'
+ _VALID_URL = r'https?://t\.me/(?P<channel_id>[^/]+)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://t.me/europa_press/613',
+ 'md5': 'dd707708aea958c11a590e8068825f22',
'info_dict': {
'id': '613',
'ext': 'mp4',
- 'title': 'Europa Press',
- 'description': '6ce2d7e8d56eda16d80607b23db7b252',
- 'thumbnail': r're:^https?:\/\/cdn.*?telesco\.pe\/file\/\w+',
+ 'title': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
+ 'description': 'md5:6ce2d7e8d56eda16d80607b23db7b252',
+ 'channel_id': 'europa_press',
+ 'channel': 'Europa Press ✔',
+ 'thumbnail': r're:^https?://.+',
+ 'timestamp': 1635631203,
+ 'upload_date': '20211030',
+ 'duration': 61,
+ },
+ }, {
+ # 2-video post
+ 'url': 'https://t.me/vorposte/29342',
+ 'info_dict': {
+ 'id': 'vorposte-29342',
+ 'title': 'Форпост 29342',
+ 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
+ },
+ 'playlist_count': 2,
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # 2-video post with --no-playlist
+ 'url': 'https://t.me/vorposte/29343',
+ 'md5': '1724e96053c18e788c8464038876e245',
+ 'info_dict': {
+ 'id': '29343',
+ 'ext': 'mp4',
+ 'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
+ 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
+ 'channel_id': 'vorposte',
+ 'channel': 'Форпост',
+ 'thumbnail': r're:^https?://.+',
+ 'timestamp': 1666384480,
+ 'upload_date': '20221021',
+ 'duration': 35,
+ },
+ 'params': {
+ 'noplaylist': True,
+ }
+ }, {
+ # 2-video post with 'single' query param
+ 'url': 'https://t.me/vorposte/29342?single',
+ 'md5': 'd20b202f1e41400a9f43201428add18f',
+ 'info_dict': {
+ 'id': '29342',
+ 'ext': 'mp4',
+ 'title': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
+ 'description': 'md5:9d92e22169a3e136d5d69df25f82c3dc',
+ 'channel_id': 'vorposte',
+ 'channel': 'Форпост',
+ 'thumbnail': r're:^https?://.+',
+ 'timestamp': 1666384480,
+ 'upload_date': '20221021',
+ 'duration': 33,
},
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- webpage_embed = self._download_webpage(f'{url}?embed=1', video_id)
+ channel_id, msg_id = self._match_valid_url(url).group('channel_id', 'id')
+ embed = self._download_webpage(
+ url, msg_id, query={'embed': '1', 'single': []}, note='Downloading embed frame')
- formats = [{
- 'url': self._proto_relative_url(self._search_regex(
- '<video[^>]+src="([^"]+)"', webpage_embed, 'source')),
- 'ext': 'mp4',
- }]
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': self._html_search_meta(['og:title', 'twitter:title'], webpage, fatal=True),
- 'description': self._html_search_meta(['og:description', 'twitter:description'], webpage, fatal=True),
- 'thumbnail': self._search_regex(r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
- webpage_embed, 'thumbnail'),
- 'formats': formats,
+ def clean_text(html_class, html):
+ text = clean_html(get_element_by_class(html_class, html))
+ return text.replace('\n', ' ') if text else None
+
+ description = clean_text('tgme_widget_message_text', embed)
+ message = {
+ 'title': description or '',
+ 'description': description,
+ 'channel': clean_text('tgme_widget_message_author', embed),
+ 'channel_id': channel_id,
+ 'timestamp': unified_timestamp(self._search_regex(
+ r'<time[^>]*datetime="([^"]*)"', embed, 'timestamp', fatal=False)),
}
+
+ videos = []
+ for video in re.findall(r'<a class="tgme_widget_message_video_player(?s:.+?)</time>', embed):
+ video_url = self._search_regex(
+ r'<video[^>]+src="([^"]+)"', video, 'video URL', fatal=False)
+ webpage_url = self._search_regex(
+ r'<a class="tgme_widget_message_video_player[^>]+href="([^"]+)"',
+ video, 'webpage URL', fatal=False)
+ if not video_url or not webpage_url:
+ continue
+ formats = [{
+ 'url': video_url,
+ 'ext': 'mp4',
+ }]
+ videos.append({
+ 'id': url_basename(webpage_url),
+ 'webpage_url': update_url_query(webpage_url, {'single': True}),
+ 'duration': parse_duration(self._search_regex(
+ r'<time[^>]+duration[^>]*>([\d:]+)</time>', video, 'duration', fatal=False)),
+ 'thumbnail': self._search_regex(
+ r'tgme_widget_message_video_thumb"[^>]+background-image:url\(\'([^\']+)\'\)',
+ video, 'thumbnail', fatal=False),
+ 'formats': formats,
+ **message,
+ })
+
+ playlist_id = None
+ if len(videos) > 1 and 'single' not in parse_qs(url, keep_blank_values=True):
+ playlist_id = f'{channel_id}-{msg_id}'
+
+ if self._yes_playlist(playlist_id, msg_id):
+ return self.playlist_result(
+ videos, playlist_id, format_field(message, 'channel', f'%s {msg_id}'), description)
+ else:
+ return traverse_obj(videos, lambda _, x: x['id'] == msg_id, get_all=False)