aboutsummaryrefslogtreecommitdiffstats
path: root/hypervideo_dl/extractor/twitter.py
diff options
context:
space:
mode:
Diffstat (limited to 'hypervideo_dl/extractor/twitter.py')
-rw-r--r--hypervideo_dl/extractor/twitter.py788
1 files changed, 650 insertions, 138 deletions
diff --git a/hypervideo_dl/extractor/twitter.py b/hypervideo_dl/extractor/twitter.py
index 8ccc38e..18ebb36 100644
--- a/hypervideo_dl/extractor/twitter.py
+++ b/hypervideo_dl/extractor/twitter.py
@@ -1,40 +1,42 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
+import json
import re
+import urllib.error
from .common import InfoExtractor
+from .periscope import PeriscopeBaseIE, PeriscopeIE
+from ..compat import functools # isort: split
from ..compat import (
- compat_HTTPError,
compat_parse_qs,
compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
)
from ..utils import (
- dict_get,
ExtractorError,
- format_field,
+ dict_get,
float_or_none,
+ format_field,
int_or_none,
+ make_archive_id,
+ str_or_none,
+ strip_or_none,
traverse_obj,
+ try_call,
try_get,
- strip_or_none,
unified_timestamp,
update_url_query,
url_or_none,
xpath_text,
)
-from .periscope import (
- PeriscopeBaseIE,
- PeriscopeIE,
-)
-
class TwitterBaseIE(InfoExtractor):
_API_BASE = 'https://api.twitter.com/1.1/'
- _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?twitter\.com/'
- _GUEST_TOKEN = None
+ _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/'
+ _TOKENS = {
+ 'AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA': None,
+ 'AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw': None,
+ }
+ _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'
def _extract_variant_formats(self, variant, video_id):
variant_url = variant.get('url')
@@ -86,28 +88,81 @@ class TwitterBaseIE(InfoExtractor):
'height': int(m.group('height')),
})
- def _call_api(self, path, video_id, query={}):
- headers = {
- 'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAAPYXBAAAAAAACLXUNDekMxqa8h%2F40K4moUkGsoc%3DTYfbDKbT3jJPCEVnMYqilB28NHfOPqkca3qaAxGfsyKCs0wRbw',
- }
- token = self._get_cookies(self._API_BASE).get('ct0')
- if token:
- headers['x-csrf-token'] = token.value
- if not self._GUEST_TOKEN:
- self._GUEST_TOKEN = self._download_json(
- self._API_BASE + 'guest/activate.json', video_id,
- 'Downloading guest token', data=b'',
- headers=headers)['guest_token']
- headers['x-guest-token'] = self._GUEST_TOKEN
- try:
- return self._download_json(
- self._API_BASE + path, video_id, headers=headers, query=query)
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
- raise ExtractorError(self._parse_json(
- e.cause.read().decode(),
- video_id)['errors'][0]['message'], expected=True)
- raise
+ @functools.cached_property
+ def is_logged_in(self):
+ return bool(self._get_cookies(self._API_BASE).get('auth_token'))
+
+ def _call_api(self, path, video_id, query={}, graphql=False):
+ cookies = self._get_cookies(self._API_BASE)
+ headers = {}
+
+ csrf_cookie = cookies.get('ct0')
+ if csrf_cookie:
+ headers['x-csrf-token'] = csrf_cookie.value
+
+ if self.is_logged_in:
+ headers.update({
+ 'x-twitter-auth-type': 'OAuth2Session',
+ 'x-twitter-client-language': 'en',
+ 'x-twitter-active-user': 'yes',
+ })
+
+ last_error = None
+ for bearer_token in self._TOKENS:
+ for first_attempt in (True, False):
+ headers['Authorization'] = f'Bearer {bearer_token}'
+
+ if not self.is_logged_in:
+ if not self._TOKENS[bearer_token]:
+ headers.pop('x-guest-token', None)
+ guest_token_response = self._download_json(
+ self._API_BASE + 'guest/activate.json', video_id,
+ 'Downloading guest token', data=b'', headers=headers)
+
+ self._TOKENS[bearer_token] = guest_token_response.get('guest_token')
+ if not self._TOKENS[bearer_token]:
+ raise ExtractorError('Could not retrieve guest token')
+
+ headers['x-guest-token'] = self._TOKENS[bearer_token]
+
+ try:
+ allowed_status = {400, 403, 404} if graphql else {403}
+ result = self._download_json(
+ (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path,
+ video_id, headers=headers, query=query, expected_status=allowed_status)
+
+ except ExtractorError as e:
+ if last_error:
+ raise last_error
+
+ if not isinstance(e.cause, urllib.error.HTTPError) or e.cause.code != 404:
+ raise
+
+ last_error = e
+ self.report_warning(
+ 'Twitter API gave 404 response, retrying with deprecated auth token. '
+ 'Only one media item can be extracted')
+ break # continue outer loop with next bearer_token
+
+ if result.get('errors'):
+ errors = traverse_obj(result, ('errors', ..., 'message'), expected_type=str)
+ if first_attempt and any('bad guest token' in error.lower() for error in errors):
+ self.to_screen('Guest token has expired. Refreshing guest token')
+ self._TOKENS[bearer_token] = None
+ continue
+
+ error_message = ', '.join(set(errors)) or 'Unknown error'
+ raise ExtractorError(f'Error(s) while querying API: {error_message}', expected=True)
+
+ return result
+
+ def _build_graphql_query(self, media_id):
+ raise NotImplementedError('Method must be implemented to support GraphQL')
+
+ def _call_graphql_api(self, endpoint, media_id):
+ data = self._build_graphql_query(media_id)
+ query = {key: json.dumps(value, separators=(',', ':')) for key, value in data.items()}
+ return traverse_obj(self._call_api(endpoint, media_id, query=query, graphql=True), 'data')
class TwitterCardIE(InfoExtractor):
@@ -118,7 +173,7 @@ class TwitterCardIE(InfoExtractor):
'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
# MD5 checksums are different in different places
'info_dict': {
- 'id': '560070183650213889',
+ 'id': '560070131976392705',
'ext': 'mp4',
'title': "Twitter - You can now shoot, edit and share video on Twitter. Capture life's most moving moments from your perspective.",
'description': 'md5:18d3e24bb4f6e5007487dd546e53bd96',
@@ -128,6 +183,13 @@ class TwitterCardIE(InfoExtractor):
'duration': 30.033,
'timestamp': 1422366112,
'upload_date': '20150127',
+ 'age_limit': 0,
+ 'comment_count': int,
+ 'tags': [],
+ 'repost_count': int,
+ 'like_count': int,
+ 'display_id': '560070183650213889',
+ 'uploader_url': 'https://twitter.com/Twitter',
},
},
{
@@ -142,7 +204,14 @@ class TwitterCardIE(InfoExtractor):
'uploader_id': 'NASA',
'timestamp': 1437408129,
'upload_date': '20150720',
+ 'uploader_url': 'https://twitter.com/NASA',
+ 'age_limit': 0,
+ 'comment_count': int,
+ 'like_count': int,
+ 'repost_count': int,
+ 'tags': ['PlutoFlyby'],
},
+ 'params': {'format': '[protocol=https]'}
},
{
'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
@@ -155,12 +224,27 @@ class TwitterCardIE(InfoExtractor):
'upload_date': '20111013',
'uploader': 'OMG! UBUNTU!',
'uploader_id': 'omgubuntu',
+ 'channel_url': 'https://www.youtube.com/channel/UCIiSwcm9xiFb3Y4wjzR41eQ',
+ 'channel_id': 'UCIiSwcm9xiFb3Y4wjzR41eQ',
+ 'channel_follower_count': int,
+ 'chapters': 'count:8',
+ 'uploader_url': 'http://www.youtube.com/user/omgubuntu',
+ 'duration': 138,
+ 'categories': ['Film & Animation'],
+ 'age_limit': 0,
+ 'comment_count': int,
+ 'availability': 'public',
+ 'like_count': int,
+ 'thumbnail': 'https://i.ytimg.com/vi/dq4Oj5quskI/maxresdefault.jpg',
+ 'view_count': int,
+ 'tags': 'count:12',
+ 'channel': 'OMG! UBUNTU!',
+ 'playable_in_embed': True,
},
'add_ie': ['Youtube'],
},
{
'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568',
- 'md5': '6dabeaca9e68cbb71c99c322a4b42a11',
'info_dict': {
'id': 'iBb2x00UVlv',
'ext': 'mp4',
@@ -169,9 +253,17 @@ class TwitterCardIE(InfoExtractor):
'uploader': 'ArsenalTerje',
'title': 'Vine by ArsenalTerje',
'timestamp': 1447451307,
+ 'alt_title': 'Vine by ArsenalTerje',
+ 'comment_count': int,
+ 'like_count': int,
+ 'thumbnail': r're:^https?://[^?#]+\.jpg',
+ 'view_count': int,
+ 'repost_count': int,
},
'add_ie': ['Vine'],
- }, {
+ 'params': {'skip_download': 'm3u8'},
+ },
+ {
'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
'md5': '884812a2adc8aaf6fe52b15ccbfa3b88',
'info_dict': {
@@ -185,7 +277,8 @@ class TwitterCardIE(InfoExtractor):
'upload_date': '20160303',
},
'skip': 'This content is no longer available.',
- }, {
+ },
+ {
'url': 'https://twitter.com/i/videos/752274308186120192',
'only_matching': True,
},
@@ -205,7 +298,8 @@ class TwitterIE(TwitterBaseIE):
_TESTS = [{
'url': 'https://twitter.com/freethenipple/status/643211948184596480',
'info_dict': {
- 'id': '643211948184596480',
+ 'id': '643211870443208704',
+ 'display_id': '643211948184596480',
'ext': 'mp4',
'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
'thumbnail': r're:^https?://.*\.jpg',
@@ -215,6 +309,11 @@ class TwitterIE(TwitterBaseIE):
'duration': 12.922,
'timestamp': 1442188653,
'upload_date': '20150913',
+ 'uploader_url': 'https://twitter.com/freethenipple',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': [],
'age_limit': 18,
},
}, {
@@ -235,13 +334,20 @@ class TwitterIE(TwitterBaseIE):
'url': 'https://twitter.com/starwars/status/665052190608723968',
'info_dict': {
'id': '665052190608723968',
+ 'display_id': '665052190608723968',
'ext': 'mp4',
- 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.',
+ 'title': 'md5:55fef1d5b811944f1550e91b44abb82e',
'description': 'A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens. https://t.co/OkSqT2fjWJ',
'uploader_id': 'starwars',
- 'uploader': 'Star Wars',
+ 'uploader': r're:Star Wars.*',
'timestamp': 1447395772,
'upload_date': '20151113',
+ 'uploader_url': 'https://twitter.com/starwars',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': ['TV', 'StarWars', 'TheForceAwakens'],
+ 'age_limit': 0,
},
}, {
'url': 'https://twitter.com/BTNBrentYarina/status/705235433198714880',
@@ -254,25 +360,39 @@ class TwitterIE(TwitterBaseIE):
'uploader': 'Brent Yarina',
'timestamp': 1456976204,
'upload_date': '20160303',
+ 'uploader_url': 'https://twitter.com/BTNBrentYarina',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': [],
+ 'age_limit': 0,
},
'params': {
# The same video as https://twitter.com/i/videos/tweet/705235433198714880
# Test case of TwitterCardIE
'skip_download': True,
},
+ 'skip': 'Dead external link',
}, {
'url': 'https://twitter.com/jaydingeer/status/700207533655363584',
'info_dict': {
- 'id': '700207533655363584',
+ 'id': '700207414000242688',
+ 'display_id': '700207533655363584',
'ext': 'mp4',
- 'title': 'simon vertugo - BEAT PROD: @suhmeduh #Damndaniel',
+ 'title': 'jaydin donte geer - BEAT PROD: @suhmeduh #Damndaniel',
'description': 'BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ',
'thumbnail': r're:^https?://.*\.jpg',
- 'uploader': 'simon vertugo',
- 'uploader_id': 'simonvertugo',
+ 'uploader': 'jaydin donte geer',
+ 'uploader_id': 'jaydingeer',
'duration': 30.0,
'timestamp': 1455777459,
'upload_date': '20160218',
+ 'uploader_url': 'https://twitter.com/jaydingeer',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': ['Damndaniel'],
+ 'age_limit': 0,
},
}, {
'url': 'https://twitter.com/Filmdrunk/status/713801302971588609',
@@ -285,12 +405,19 @@ class TwitterIE(TwitterBaseIE):
'uploader_id': '1004126642786242560',
'timestamp': 1402826626,
'upload_date': '20140615',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'alt_title': 'Vine by TAKUMA',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'view_count': int,
},
'add_ie': ['Vine'],
}, {
'url': 'https://twitter.com/captainamerica/status/719944021058060289',
'info_dict': {
- 'id': '719944021058060289',
+ 'id': '717462543795523584',
+ 'display_id': '719944021058060289',
'ext': 'mp4',
'title': 'Captain America - @King0fNerd Are you sure you made the right choice? Find out in theaters.',
'description': '@King0fNerd Are you sure you made the right choice? Find out in theaters. https://t.co/GpgYi9xMJI',
@@ -299,6 +426,13 @@ class TwitterIE(TwitterBaseIE):
'duration': 3.17,
'timestamp': 1460483005,
'upload_date': '20160412',
+ 'uploader_url': 'https://twitter.com/CaptainAmerica',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': [],
+ 'age_limit': 0,
},
}, {
'url': 'https://twitter.com/OPP_HSD/status/779210622571536384',
@@ -310,6 +444,7 @@ class TwitterIE(TwitterBaseIE):
'uploader_id': '1PmKqpJdOJQoY',
'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police',
'timestamp': 1474613214,
+ 'thumbnail': r're:^https?://.*\.jpg',
},
'add_ie': ['Periscope'],
}, {
@@ -330,7 +465,8 @@ class TwitterIE(TwitterBaseIE):
}, {
'url': 'https://twitter.com/i/web/status/910031516746514432',
'info_dict': {
- 'id': '910031516746514432',
+ 'id': '910030238373089285',
+ 'display_id': '910031516746514432',
'ext': 'mp4',
'title': 'Préfet de Guadeloupe - [Direct] #Maria Le centre se trouve actuellement au sud de Basse-Terre. Restez confinés. Réfugiez-vous dans la pièce la + sûre.',
'thumbnail': r're:^https?://.*\.jpg',
@@ -340,6 +476,12 @@ class TwitterIE(TwitterBaseIE):
'duration': 47.48,
'timestamp': 1505803395,
'upload_date': '20170919',
+ 'uploader_url': 'https://twitter.com/Prefet971',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': ['Maria'],
+ 'age_limit': 0,
},
'params': {
'skip_download': True, # requires ffmpeg
@@ -348,7 +490,8 @@ class TwitterIE(TwitterBaseIE):
# card via api.twitter.com/1.1/videos/tweet/config
'url': 'https://twitter.com/LisPower1/status/1001551623938805763',
'info_dict': {
- 'id': '1001551623938805763',
+ 'id': '1001551417340022785',
+ 'display_id': '1001551623938805763',
'ext': 'mp4',
'title': 're:.*?Shep is on a roll today.*?',
'thumbnail': r're:^https?://.*\.jpg',
@@ -358,6 +501,12 @@ class TwitterIE(TwitterBaseIE):
'duration': 111.278,
'timestamp': 1527623489,
'upload_date': '20180529',
+ 'uploader_url': 'https://twitter.com/LisPower1',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': [],
+ 'age_limit': 0,
},
'params': {
'skip_download': True, # requires ffmpeg
@@ -365,7 +514,8 @@ class TwitterIE(TwitterBaseIE):
}, {
'url': 'https://twitter.com/foobar/status/1087791357756956680',
'info_dict': {
- 'id': '1087791357756956680',
+ 'id': '1087791272830607360',
+ 'display_id': '1087791357756956680',
'ext': 'mp4',
'title': 'Twitter - A new is coming. Some of you got an opt-in to try it now. Check out the emoji button, quick keyboard shortcuts, upgraded trends, advanced search, and more. Let us know your thoughts!',
'thumbnail': r're:^https?://.*\.jpg',
@@ -375,6 +525,12 @@ class TwitterIE(TwitterBaseIE):
'duration': 61.567,
'timestamp': 1548184644,
'upload_date': '20190122',
+ 'uploader_url': 'https://twitter.com/Twitter',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': [],
+ 'age_limit': 0,
},
}, {
# not available in Periscope
@@ -385,13 +541,17 @@ class TwitterIE(TwitterBaseIE):
'title': 'Vivi - Vivi founder @lior_rauchy announcing our new student feedback tool live at @EduTECH_AU #EduTECH2019',
'uploader': 'Vivi',
'uploader_id': '1eVjYOLGkGrQL',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'tags': ['EduTECH2019'],
+ 'view_count': int,
},
'add_ie': ['TwitterBroadcast'],
}, {
# unified card
'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20',
'info_dict': {
- 'id': '1349794411333394432',
+ 'id': '1349774757969989634',
+ 'display_id': '1349794411333394432',
'ext': 'mp4',
'title': 'md5:d1c4941658e4caaa6cb579260d85dcba',
'thumbnail': r're:^https?://.*\.jpg',
@@ -401,11 +561,177 @@ class TwitterIE(TwitterBaseIE):
'duration': 324.484,
'timestamp': 1610651040,
'upload_date': '20210114',
+ 'uploader_url': 'https://twitter.com/BrooklynNets',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': [],
+ 'age_limit': 0,
},
'params': {
'skip_download': True,
},
}, {
+ 'url': 'https://twitter.com/oshtru/status/1577855540407197696',
+ 'info_dict': {
+ 'id': '1577855447914409984',
+ 'display_id': '1577855540407197696',
+ 'ext': 'mp4',
+ 'title': 'md5:9d198efb93557b8f8d5b78c480407214',
+ 'description': 'md5:b9c3699335447391d11753ab21c70a74',
+ 'upload_date': '20221006',
+ 'uploader': 'oshtru',
+ 'uploader_id': 'oshtru',
+ 'uploader_url': 'https://twitter.com/oshtru',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 30.03,
+ 'timestamp': 1665025050,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': [],
+ 'age_limit': 0,
+ },
+ 'params': {'skip_download': True},
+ }, {
+ 'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',
+ 'info_dict': {
+ 'id': '1577719286659006464',
+ 'title': 'Ultima | #\u0432\u029f\u043c - Test',
+ 'description': 'Test https://t.co/Y3KEZD7Dad',
+ 'uploader': 'Ultima | #\u0432\u029f\u043c',
+ 'uploader_id': 'UltimaShadowX',
+ 'uploader_url': 'https://twitter.com/UltimaShadowX',
+ 'upload_date': '20221005',
+ 'timestamp': 1664992565,
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': [],
+ 'age_limit': 0,
+ },
+ 'playlist_count': 4,
+ 'params': {'skip_download': True},
+ }, {
+ 'url': 'https://twitter.com/MesoMax919/status/1575560063510810624',
+ 'info_dict': {
+ 'id': '1575559336759263233',
+ 'display_id': '1575560063510810624',
+ 'ext': 'mp4',
+ 'title': 'md5:eec26382babd0f7c18f041db8ae1c9c9',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'description': 'md5:95aea692fda36a12081b9629b02daa92',
+ 'uploader': 'Max Olson',
+ 'uploader_id': 'MesoMax919',
+ 'uploader_url': 'https://twitter.com/MesoMax919',
+ 'duration': 21.321,
+ 'timestamp': 1664477766,
+ 'upload_date': '20220929',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': ['HurricaneIan'],
+ 'age_limit': 0,
+ },
+ }, {
+ # Adult content, uses old token
+ # Fails if not logged in (GraphQL)
+ 'url': 'https://twitter.com/Rizdraws/status/1575199173472927762',
+ 'info_dict': {
+ 'id': '1575199163847000068',
+ 'display_id': '1575199173472927762',
+ 'ext': 'mp4',
+ 'title': str,
+ 'description': str,
+ 'uploader': str,
+ 'uploader_id': 'Rizdraws',
+ 'uploader_url': 'https://twitter.com/Rizdraws',
+ 'upload_date': '20220928',
+ 'timestamp': 1664391723,
+ 'thumbnail': 're:^https?://.*\\.jpg',
+ 'like_count': int,
+ 'repost_count': int,
+ 'comment_count': int,
+ 'age_limit': 18,
+ 'tags': []
+ },
+ 'expected_warnings': ['404'],
+ }, {
+ # Description is missing one https://t.co url (GraphQL)
+ 'url': 'https://twitter.com/Srirachachau/status/1395079556562706435',
+ 'playlist_mincount': 2,
+ 'info_dict': {
+ 'id': '1395079556562706435',
+ 'title': str,
+ 'tags': [],
+ 'uploader': str,
+ 'like_count': int,
+ 'upload_date': '20210519',
+ 'age_limit': 0,
+ 'repost_count': int,
+ 'description': 'Here it is! Finished my gothic western cartoon. Pretty proud of it. It\'s got some goofs and lots of splashy over the top violence, something for everyone, hope you like it https://t.co/fOsG5glUnw https://t.co/kbXZrozlY7',
+ 'uploader_id': 'Srirachachau',
+ 'comment_count': int,
+ 'uploader_url': 'https://twitter.com/Srirachachau',
+ 'timestamp': 1621447860,
+ },
+ }, {
+ # Description is missing one https://t.co url (GraphQL)
+ 'url': 'https://twitter.com/DavidToons_/status/1578353380363501568',
+ 'playlist_mincount': 2,
+ 'info_dict': {
+ 'id': '1578353380363501568',
+ 'title': str,
+ 'uploader_id': 'DavidToons_',
+ 'repost_count': int,
+ 'like_count': int,
+ 'uploader': str,
+ 'timestamp': 1665143744,
+ 'uploader_url': 'https://twitter.com/DavidToons_',
+ 'description': 'Chris sounds like Linda from Bob\'s Burgers, so as an animator: this had to be done. https://t.co/glfQdgfFXH https://t.co/WgJauwIW1w',
+ 'tags': [],
+ 'comment_count': int,
+ 'upload_date': '20221007',
+ 'age_limit': 0,
+ },
+ }, {
+ 'url': 'https://twitter.com/primevideouk/status/1578401165338976258',
+ 'playlist_count': 2,
+ 'info_dict': {
+ 'id': '1578401165338976258',
+ 'title': str,
+ 'description': 'md5:659a6b517a034b4cee5d795381a2dc41',
+ 'uploader': str,
+ 'uploader_id': 'primevideouk',
+ 'timestamp': 1665155137,
+ 'upload_date': '20221007',
+ 'age_limit': 0,
+ 'uploader_url': 'https://twitter.com/primevideouk',
+ 'comment_count': int,
+ 'repost_count': int,
+ 'like_count': int,
+ 'tags': ['TheRingsOfPower'],
+ },
+ }, {
+ # Twitter Spaces
+ 'url': 'https://twitter.com/MoniqueCamarra/status/1550101959377551360',
+ 'info_dict': {
+ 'id': '1lPJqmBeeNAJb',
+ 'ext': 'm4a',
+ 'title': 'EuroFile@6 Ukraine Up-date-Draghi Defenestration-the West',
+ 'uploader': r're:Monique Camarra.+?',
+ 'uploader_id': 'MoniqueCamarra',
+ 'live_status': 'was_live',
+ 'description': 'md5:acce559345fd49f129c20dbcda3f1201',
+ 'timestamp': 1658407771464,
+ },
+ 'add_ie': ['TwitterSpaces'],
+ 'params': {'skip_download': 'm3u8'},
+ }, {
+ # onion route
+ 'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273',
+ 'only_matching': True,
+ }, {
# Twitch Clip Embed
'url': 'https://twitter.com/GunB1g/status/1163218564784017422',
'only_matching': True,
@@ -439,10 +765,77 @@ class TwitterIE(TwitterBaseIE):
'only_matching': True,
}]
+ def _graphql_to_legacy(self, data, twid):
+ result = traverse_obj(data, (
+ 'threaded_conversation_with_injections_v2', 'instructions', 0, 'entries',
+ lambda _, v: v['entryId'] == f'tweet-{twid}', 'content', 'itemContent',
+ 'tweet_results', 'result'
+ ), expected_type=dict, default={}, get_all=False)
+
+ if 'tombstone' in result:
+ cause = traverse_obj(result, ('tombstone', 'text', 'text'), expected_type=str)
+ raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True)
+
+ status = result.get('legacy', {})
+ status.update(traverse_obj(result, {
+ 'user': ('core', 'user_results', 'result', 'legacy'),
+ 'card': ('card', 'legacy'),
+ 'quoted_status': ('quoted_status_result', 'result', 'legacy'),
+ }, expected_type=dict, default={}))
+
+ # extra transformation is needed since result does not match legacy format
+ binding_values = {
+ binding_value.get('key'): binding_value.get('value')
+ for binding_value in traverse_obj(status, ('card', 'binding_values', ...), expected_type=dict)
+ }
+ if binding_values:
+ status['card']['binding_values'] = binding_values
+
+ return status
+
+ def _build_graphql_query(self, media_id):
+ return {
+ 'variables': {
+ 'focalTweetId': media_id,
+ 'includePromotedContent': True,
+ 'with_rux_injections': False,
+ 'withBirdwatchNotes': True,
+ 'withCommunity': True,
+ 'withDownvotePerspective': False,
+ 'withQuickPromoteEligibilityTweetFields': True,
+ 'withReactionsMetadata': False,
+ 'withReactionsPerspective': False,
+ 'withSuperFollowsTweetFields': True,
+ 'withSuperFollowsUserFields': True,
+ 'withV2Timeline': True,
+ 'withVoice': True,
+ },
+ 'features': {
+ 'graphql_is_translatable_rweb_tweet_is_translatable_enabled': False,
+ 'interactive_text_enabled': True,
+ 'responsive_web_edit_tweet_api_enabled': True,
+ 'responsive_web_enhance_cards_enabled': True,
+ 'responsive_web_graphql_timeline_navigation_enabled': False,
+ 'responsive_web_text_conversations_enabled': False,
+ 'responsive_web_uc_gql_enabled': True,
+ 'standardized_nudges_misinfo': True,
+ 'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False,
+ 'tweetypie_unmention_optimization_enabled': True,
+ 'unified_cards_ad_metadata_container_dynamic_card_content_query_enabled': True,
+ 'verified_phone_label_enabled': False,
+ 'vibe_api_enabled': True,
+ },
+ }
+
def _real_extract(self, url):
twid = self._match_id(url)
- status = self._call_api(
- 'statuses/show/%s.json' % twid, twid, {
+ if self.is_logged_in or self._configuration_arg('force_graphql'):
+ self.write_debug(f'Using GraphQL API (Auth = {self.is_logged_in})')
+ result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid)
+ status = self._graphql_to_legacy(result, twid)
+
+ else:
+ status = self._call_api(f'statuses/show/{twid}.json', twid, {
'cards_platform': 'Web-12',
'include_cards': 1,
'include_reply_count': 1,
@@ -456,7 +849,7 @@ class TwitterIE(TwitterBaseIE):
user = status.get('user') or {}
uploader = user.get('name')
if uploader:
- title = '%s - %s' % (uploader, title)
+ title = f'{uploader} - {title}'
uploader_id = user.get('screen_name')
tags = []
@@ -473,7 +866,7 @@ class TwitterIE(TwitterBaseIE):
'uploader': uploader,
'timestamp': unified_timestamp(status.get('created_at')),
'uploader_id': uploader_id,
- 'uploader_url': format_field(uploader_id, template='https://twitter.com/%s'),
+ 'uploader_url': format_field(uploader_id, None, 'https://twitter.com/%s'),
'like_count': int_or_none(status.get('favorite_count')),
'repost_count': int_or_none(status.get('retweet_count')),
'comment_count': int_or_none(status.get('reply_count')),
@@ -482,6 +875,8 @@ class TwitterIE(TwitterBaseIE):
}
def extract_from_video_info(media):
+ media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none)
+ self.write_debug(f'Extracting from video info: {media_id}')
video_info = media.get('video_info') or {}
formats = []
@@ -490,7 +885,6 @@ class TwitterIE(TwitterBaseIE):
fmts, subs = self._extract_variant_formats(variant, twid)
subtitles = self._merge_subtitles(subtitles, subs)
formats.extend(fmts)
- self._sort_formats(formats, ('res', 'br', 'size', 'proto')) # The codec of http formats are unknown
thumbnails = []
media_url = media.get('media_url_https') or media.get('media_url')
@@ -506,90 +900,111 @@ class TwitterIE(TwitterBaseIE):
add_thumbnail(name, size)
add_thumbnail('orig', media.get('original_info') or {})
- info.update({
+ return {
+ 'id': media_id,
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
'duration': float_or_none(video_info.get('duration_millis'), 1000),
- })
+ # The codecs of http formats are unknown
+ '_format_sort_fields': ('res', 'br', 'size', 'proto'),
+ }
- media = traverse_obj(status, ((None, 'quoted_status'), 'extended_entities', 'media', 0), get_all=False)
- if media and media.get('type') != 'photo':
- extract_from_video_info(media)
- else:
- card = status.get('card')
- if card:
- binding_values = card['binding_values']
-
- def get_binding_value(k):
- o = binding_values.get(k) or {}
- return try_get(o, lambda x: x[x['type'].lower() + '_value'])
-
- card_name = card['name'].split(':')[-1]
- if card_name == 'player':
- info.update({
- '_type': 'url',
- 'url': get_binding_value('player_url'),
- })
- elif card_name == 'periscope_broadcast':
- info.update({
- '_type': 'url',
- 'url': get_binding_value('url') or get_binding_value('player_url'),
- 'ie_key': PeriscopeIE.ie_key(),
- })
- elif card_name == 'broadcast':
- info.update({
- '_type': 'url',
- 'url': get_binding_value('broadcast_url'),
- 'ie_key': TwitterBroadcastIE.ie_key(),
- })
- elif card_name == 'summary':
- info.update({
- '_type': 'url',
- 'url': get_binding_value('card_url'),
- })
- elif card_name == 'unified_card':
- media_entities = self._parse_json(get_binding_value('unified_card'), twid)['media_entities']
- extract_from_video_info(next(iter(media_entities.values())))
- # amplify, promo_video_website, promo_video_convo, appplayer,
- # video_direct_message, poll2choice_video, poll3choice_video,
- # poll4choice_video, ...
- else:
- is_amplify = card_name == 'amplify'
- vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
- content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
- formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
- self._sort_formats(formats)
-
- thumbnails = []
- for suffix in ('_small', '', '_large', '_x_large', '_original'):
- image = get_binding_value('player_image' + suffix) or {}
- image_url = image.get('url')
- if not image_url or '/player-placeholder' in image_url:
- continue
- thumbnails.append({
- 'id': suffix[1:] if suffix else 'medium',
- 'url': image_url,
- 'width': int_or_none(image.get('width')),
- 'height': int_or_none(image.get('height')),
- })
-
- info.update({
- 'formats': formats,
- 'subtitles': subtitles,
- 'thumbnails': thumbnails,
- 'duration': int_or_none(get_binding_value(
- 'content_duration_seconds')),
- })
- else:
- expanded_url = try_get(status, lambda x: x['entities']['urls'][0]['expanded_url'])
- if not expanded_url:
- raise ExtractorError("There's no video in this tweet.")
- info.update({
+ def extract_from_card_info(card):
+ if not card:
+ return
+
+ self.write_debug(f'Extracting from card info: {card.get("url")}')
+ binding_values = card['binding_values']
+
+ def get_binding_value(k):
+ o = binding_values.get(k) or {}
+ return try_get(o, lambda x: x[x['type'].lower() + '_value'])
+
+ card_name = card['name'].split(':')[-1]
+ if card_name == 'player':
+ yield {
'_type': 'url',
- 'url': expanded_url,
- })
- return info
+ 'url': get_binding_value('player_url'),
+ }
+ elif card_name == 'periscope_broadcast':
+ yield {
+ '_type': 'url',
+ 'url': get_binding_value('url') or get_binding_value('player_url'),
+ 'ie_key': PeriscopeIE.ie_key(),
+ }
+ elif card_name == 'broadcast':
+ yield {
+ '_type': 'url',
+ 'url': get_binding_value('broadcast_url'),
+ 'ie_key': TwitterBroadcastIE.ie_key(),
+ }
+ elif card_name == 'audiospace':
+ yield {
+ '_type': 'url',
+ 'url': f'https://twitter.com/i/spaces/{get_binding_value("id")}',
+ 'ie_key': TwitterSpacesIE.ie_key(),
+ }
+ elif card_name == 'summary':
+ yield {
+ '_type': 'url',
+ 'url': get_binding_value('card_url'),
+ }
+ elif card_name == 'unified_card':
+ unified_card = self._parse_json(get_binding_value('unified_card'), twid)
+ yield from map(extract_from_video_info, traverse_obj(
+ unified_card, ('media_entities', ...), expected_type=dict))
+ # amplify, promo_video_website, promo_video_convo, appplayer,
+ # video_direct_message, poll2choice_video, poll3choice_video,
+ # poll4choice_video, ...
+ else:
+ is_amplify = card_name == 'amplify'
+ vmap_url = get_binding_value('amplify_url_vmap') if is_amplify else get_binding_value('player_stream_url')
+ content_id = get_binding_value('%s_content_id' % (card_name if is_amplify else 'player'))
+ formats, subtitles = self._extract_formats_from_vmap_url(vmap_url, content_id or twid)
+
+ thumbnails = []
+ for suffix in ('_small', '', '_large', '_x_large', '_original'):
+ image = get_binding_value('player_image' + suffix) or {}
+ image_url = image.get('url')
+ if not image_url or '/player-placeholder' in image_url:
+ continue
+ thumbnails.append({
+ 'id': suffix[1:] if suffix else 'medium',
+ 'url': image_url,
+ 'width': int_or_none(image.get('width')),
+ 'height': int_or_none(image.get('height')),
+ })
+
+ yield {
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnails': thumbnails,
+ 'duration': int_or_none(get_binding_value(
+ 'content_duration_seconds')),
+ }
+
+ media_path = ((None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo')
+ videos = map(extract_from_video_info, traverse_obj(status, media_path, expected_type=dict))
+ cards = extract_from_card_info(status.get('card'))
+ entries = [{**info, **data, 'display_id': twid} for data in (*videos, *cards)]
+
+ if not entries:
+ expanded_url = traverse_obj(status, ('entities', 'urls', 0, 'expanded_url'), expected_type=url_or_none)
+ if not expanded_url or expanded_url == url:
+ raise ExtractorError('No video could be found in this tweet', expected=True)
+
+ return self.url_result(expanded_url, display_id=twid, **info)
+
+ entries[0]['_old_archive_ids'] = [make_archive_id(self, twid)]
+
+ if len(entries) == 1:
+ return entries[0]
+
+ for index, entry in enumerate(entries, 1):
+ entry['title'] += f' #{index}'
+
+ return self.playlist_result(entries, **info)
class TwitterAmplifyIE(TwitterBaseIE):
@@ -598,13 +1013,14 @@ class TwitterAmplifyIE(TwitterBaseIE):
_TEST = {
'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
- 'md5': '7df102d0b9fd7066b86f3159f8e81bf6',
+ 'md5': 'fec25801d18a4557c5c9f33d2c379ffa',
'info_dict': {
'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951',
'ext': 'mp4',
'title': 'Twitter Video',
'thumbnail': 're:^https?://.*',
},
+ 'params': {'format': '[protocol=https]'},
}
def _real_extract(self, url):
@@ -613,7 +1029,7 @@ class TwitterAmplifyIE(TwitterBaseIE):
vmap_url = self._html_search_meta(
'twitter:amplify:vmap', webpage, 'vmap url')
- formats = self._extract_formats_from_vmap_url(vmap_url, video_id)
+ formats, _ = self._extract_formats_from_vmap_url(vmap_url, video_id)
thumbnails = []
thumbnail = self._html_search_meta(
@@ -661,6 +1077,8 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
'title': 'Andrea May Sahouri - Periscope Broadcast',
'uploader': 'Andrea May Sahouri',
'uploader_id': '1PXEdBZWpGwKe',
+ 'thumbnail': r're:^https?://[^?#]+\.jpg\?token=',
+ 'view_count': int,
},
}
@@ -672,7 +1090,7 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
info = self._parse_broadcast_data(broadcast, broadcast_id)
media_key = broadcast['media_key']
source = self._call_api(
- 'live_video_stream/status/' + media_key, media_key)['source']
+ f'live_video_stream/status/{media_key}', media_key)['source']
m3u8_url = source.get('noRedirectPlaybackUrl') or source['location']
if '/live_video_stream/geoblocked/' in m3u8_url:
self.raise_geo_restricted()
@@ -684,6 +1102,100 @@ class TwitterBroadcastIE(TwitterBaseIE, PeriscopeBaseIE):
return info
+class TwitterSpacesIE(TwitterBaseIE):
+    """Extractor for Twitter Spaces URLs of the form ``/i/spaces/<13-char id>``.
+
+    Space metadata is fetched with the GraphQL ``AudioSpaceById`` query and the
+    audio playlist with the ``live_video_stream/status/<media_key>`` endpoint.
+    Every resulting format is marked as audio-only AAC.
+    """
+
+    IE_NAME = 'twitter:spaces'
+    _VALID_URL = TwitterBaseIE._BASE_REGEX + r'i/spaces/(?P<id>[0-9a-zA-Z]{13})'
+
+    _TESTS = [{
+        'url': 'https://twitter.com/i/spaces/1RDxlgyvNXzJL',
+        'info_dict': {
+            'id': '1RDxlgyvNXzJL',
+            'ext': 'm4a',
+            'title': 'King Carlo e la mossa Kansas City per fare il Grande Centro',
+            'description': 'Twitter Space participated by annarita digiorgio, Signor Ernesto, Raffaello Colosimo, Simone M. Sepe',
+            'uploader': r're:Lucio Di Gaetano.*?',
+            'uploader_id': 'luciodigaetano',
+            'live_status': 'was_live',
+            # Seconds since the epoch (the API value 1659877956397 is in milliseconds)
+            'timestamp': 1659877956,
+            'upload_date': '20220807',
+        },
+        'params': {'skip_download': 'm3u8'},
+    }]
+
+    # Maps the lower-cased ``state`` of the Space metadata to a ``live_status`` value
+    SPACE_STATUS = {
+        'notstarted': 'is_upcoming',
+        'ended': 'was_live',
+        'running': 'is_live',
+        'timedout': 'post_live',
+    }
+
+    def _build_graphql_query(self, space_id):
+        """Return the ``variables``/``features`` payload for the AudioSpaceById query.
+
+        NOTE(review): presumably consumed by ``_call_graphql_api`` in the base
+        class, which is not visible in this hunk - confirm against the caller.
+        """
+        return {
+            'variables': {
+                'id': space_id,
+                'isMetatagsQuery': True,
+                'withDownvotePerspective': False,
+                'withReactionsMetadata': False,
+                'withReactionsPerspective': False,
+                'withReplays': True,
+                'withSuperFollowsUserFields': True,
+                'withSuperFollowsTweetFields': True,
+            },
+            'features': {
+                'dont_mention_me_view_api_enabled': True,
+                'interactive_text_enabled': True,
+                'responsive_web_edit_tweet_api_enabled': True,
+                'responsive_web_enhance_cards_enabled': True,
+                'responsive_web_uc_gql_enabled': True,
+                'spaces_2022_h2_clipping': True,
+                'spaces_2022_h2_spaces_communities': False,
+                'standardized_nudges_misinfo': True,
+                'tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled': False,
+                'vibe_api_enabled': True,
+            },
+        }
+
+    def _real_extract(self, url):
+        """Extract the info dict for a single Twitter Space.
+
+        Raises ExtractorError (expected) when the API returns no ``audioSpace``.
+        For Spaces that have not started, or that ended but are not yet
+        downloadable, ``raise_no_formats`` is called and ``formats`` stays empty.
+        """
+        space_id = self._match_id(url)
+        space_data = self._call_graphql_api('HPEisOmj1epUNLCWTYhUWw/AudioSpaceById', space_id)['audioSpace']
+        if not space_data:
+            raise ExtractorError('Twitter Space not found', expected=True)
+
+        metadata = space_data['metadata']
+        # Missing or unknown states yield None and fall through to the download branch
+        live_status = try_call(lambda: self.SPACE_STATUS[metadata['state'].lower()])
+
+        formats = []
+        if live_status == 'is_upcoming':
+            self.raise_no_formats('Twitter Space not started yet', expected=True)
+        elif live_status == 'post_live':
+            self.raise_no_formats('Twitter Space ended but not downloadable yet', expected=True)
+        else:
+            media_key = metadata['media_key']
+            source = self._call_api(
+                f'live_video_stream/status/{media_key}', media_key)['source']
+
+            # XXX: Native downloader does not work
+            formats = self._extract_m3u8_formats(
+                traverse_obj(source, 'noRedirectPlaybackUrl', 'location'),
+                media_key, 'm4a', 'm3u8', live=live_status == 'is_live',
+                headers={'Referer': 'https://twitter.com/'})
+            for fmt in formats:
+                fmt.update({'vcodec': 'none', 'acodec': 'aac'})
+
+        participants = ', '.join(traverse_obj(
+            space_data, ('participants', 'speakers', ..., 'display_name'))) or 'nobody yet'
+        return {
+            'id': space_id,
+            'title': metadata.get('title'),
+            'description': f'Twitter Space participated by {participants}',
+            'uploader': traverse_obj(
+                metadata, ('creator_results', 'result', 'legacy', 'name')),
+            'uploader_id': traverse_obj(
+                metadata, ('creator_results', 'result', 'legacy', 'screen_name')),
+            'live_status': live_status,
+            # ``created_at`` is in milliseconds; ``timestamp`` must be UNIX seconds
+            'timestamp': int_or_none(metadata.get('created_at'), scale=1000),
+            'formats': formats,
+        }
+
+
class TwitterShortenerIE(TwitterBaseIE):
IE_NAME = 'twitter:shortener'
_VALID_URL = r'https?://t.co/(?P<id>[^?]+)|tco:(?P<eid>[^?]+)'