diff options
| -rw-r--r-- | README.md | 3 | ||||
| -rw-r--r-- | yt_dlp/extractor/twitter.py | 180 | 
2 files changed, 113 insertions, 70 deletions
| @@ -1852,6 +1852,9 @@ The following extractors use this feature:  #### rokfinchannel  * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` +#### twitter +* `legacy_api`: Force usage of the legacy Twitter API instead of the GraphQL API for tweet extraction. Has no effect if login cookies are passed +  #### stacommu, wrestleuniverse  * `device_id`: UUID value assigned by the website and used to enforce device limits for paid livestream content. Can be found in browser local storage diff --git a/yt_dlp/extractor/twitter.py b/yt_dlp/extractor/twitter.py index eaf9be526..1fb9524da 100644 --- a/yt_dlp/extractor/twitter.py +++ b/yt_dlp/extractor/twitter.py @@ -1,6 +1,5 @@  import json  import re -import urllib.error  from .common import InfoExtractor  from .periscope import PeriscopeBaseIE, PeriscopeIE @@ -35,6 +34,7 @@ class TwitterBaseIE(InfoExtractor):      _GRAPHQL_API_BASE = 'https://twitter.com/i/api/graphql/'      _BASE_REGEX = r'https?://(?:(?:www|m(?:obile)?)\.)?(?:twitter\.com|twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid\.onion)/'      _AUTH = {'Authorization': 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'} +    _guest_token = None      _flow_token = None      _LOGIN_INIT_DATA = json.dumps({ @@ -145,6 +145,14 @@ class TwitterBaseIE(InfoExtractor):      def is_logged_in(self):          return bool(self._get_cookies(self._API_BASE).get('auth_token')) +    def _fetch_guest_token(self, headers, display_id): +        headers.pop('x-guest-token', None) +        self._guest_token = traverse_obj(self._download_json( +            f'{self._API_BASE}guest/activate.json', display_id, +            'Downloading guest token', data=b'', headers=headers), 'guest_token') +        if not self._guest_token: +            raise ExtractorError('Could not retrieve guest token') +      def _set_base_headers(self):          headers = self._AUTH.copy()          csrf_token = try_call(lambda: self._get_cookies(self._API_BASE)['ct0'].value) @@ -175,15 +183,12 @@ class TwitterBaseIE(InfoExtractor):          if self.is_logged_in:              return -        webpage = self._download_webpage('https://twitter.com/', None, 'Downloading login page') +        self._request_webpage('https://twitter.com/', None, 'Requesting cookies')          headers = self._set_base_headers() -        guest_token = self._search_regex( -            r'\.cookie\s*=\s*["\']gt=(\d+);', webpage, 'gt', default=None) or self._download_json( -            f'{self._API_BASE}guest/activate.json', None, 'Downloading guest token', -            data=b'', headers=headers)['guest_token'] +        self._fetch_guest_token(headers, None)          headers.update({              'content-type': 'application/json', -            'x-guest-token': guest_token, +            'x-guest-token': self._guest_token,              'x-twitter-client-language': 'en',              'x-twitter-active-user': 'yes',              'Referer': 'https://twitter.com/', @@ -280,24 +285,37 @@ class TwitterBaseIE(InfoExtractor):          self.report_login()      def _call_api(self, path, video_id, query={}, graphql=False): -        if not self.is_logged_in: -            self.raise_login_required() - -        result = self._download_json( -            (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, video_id, -            f'Downloading {"GraphQL" if graphql else "legacy API"} JSON', headers={ -                **self._set_base_headers(), +        headers = self._set_base_headers() +        if self.is_logged_in: +            headers.update({                  'x-twitter-auth-type': 'OAuth2Session',                  'x-twitter-client-language': 'en',                  'x-twitter-active-user': 'yes', -            }, query=query, expected_status={400, 401, 403, 404} if graphql else {403}) +            }) + +        for first_attempt in (True, False): +            if not self.is_logged_in: +                if not self._guest_token: +                    self._fetch_guest_token(headers, video_id) +                headers['x-guest-token'] = self._guest_token + +            allowed_status = {400, 401, 403, 404} if graphql else {403} +            result = self._download_json( +                (self._GRAPHQL_API_BASE if graphql else self._API_BASE) + path, +                video_id, headers=headers, query=query, expected_status=allowed_status, +                note=f'Downloading {"GraphQL" if graphql else "legacy API"} JSON') + +            if result.get('errors'): +                errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) +                if not self.is_logged_in and first_attempt and 'bad guest token' in errors.lower(): +                    self.to_screen('Guest token has expired. Refreshing guest token') +                    self._guest_token = None +                    continue -        if result.get('errors'): -            errors = ', '.join(set(traverse_obj(result, ('errors', ..., 'message', {str})))) -            raise ExtractorError( -                f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) +                raise ExtractorError( +                    f'Error(s) while querying API: {errors or "Unknown error"}', expected=True) -        return result +            return result      def _build_graphql_query(self, media_id):          raise NotImplementedError('Method must be implemented to support GraphQL') @@ -439,7 +457,6 @@ class TwitterIE(TwitterBaseIE):      _VALID_URL = TwitterBaseIE._BASE_REGEX + r'(?:(?:i/web|[^/]+)/status|statuses)/(?P<id>\d+)(?:/(?:video|photo)/(?P<index>\d+))?'      _TESTS = [{ -        # comment_count, repost_count, view_count are only available with auth (applies to all tests)          'url': 'https://twitter.com/freethenipple/status/643211948184596480',          'info_dict': {              'id': '643211870443208704', @@ -454,7 +471,10 @@ class TwitterIE(TwitterBaseIE):              'timestamp': 1442188653,              'upload_date': '20150913',              'uploader_url': 'https://twitter.com/freethenipple', +            'comment_count': int, +            'repost_count': int,              'like_count': int, +            'view_count': int,              'tags': [],              'age_limit': 18,          }, @@ -485,6 +505,8 @@ class TwitterIE(TwitterBaseIE):              'timestamp': 1447395772,              'upload_date': '20151113',              'uploader_url': 'https://twitter.com/starwars', +            'comment_count': int, +            'repost_count': int,              'like_count': int,              'tags': ['TV', 'StarWars', 'TheForceAwakens'],              'age_limit': 0, @@ -528,7 +550,10 @@ class TwitterIE(TwitterBaseIE):              'timestamp': 1455777459,              'upload_date': '20160218',              'uploader_url': 'https://twitter.com/jaydingeer', +            'comment_count': int, +            'repost_count': int,              'like_count': int, +            'view_count': int,              'tags': ['Damndaniel'],              'age_limit': 0,          }, @@ -566,7 +591,10 @@ class TwitterIE(TwitterBaseIE):              'upload_date': '20160412',              'uploader_url': 'https://twitter.com/CaptainAmerica',              'thumbnail': r're:^https?://.*\.jpg', +            'comment_count': int, +            'repost_count': int,              'like_count': int, +            'view_count': int,              'tags': [],              'age_limit': 0,          }, @@ -613,7 +641,10 @@ class TwitterIE(TwitterBaseIE):              'timestamp': 1505803395,              'upload_date': '20170919',              'uploader_url': 'https://twitter.com/Prefet971', +            'comment_count': int, +            'repost_count': int,              'like_count': int, +            'view_count': int,              'tags': ['Maria'],              'age_limit': 0,          }, @@ -636,7 +667,10 @@ class TwitterIE(TwitterBaseIE):              'timestamp': 1527623489,              'upload_date': '20180529',              'uploader_url': 'https://twitter.com/LisPower1', +            'comment_count': int, +            'repost_count': int,              'like_count': int, +            'view_count': int,              'tags': [],              'age_limit': 0,          }, @@ -658,7 +692,10 @@ class TwitterIE(TwitterBaseIE):              'timestamp': 1548184644,              'upload_date': '20190122',              'uploader_url': 'https://twitter.com/Twitter', +            'comment_count': int, +            'repost_count': int,              'like_count': int, +            'view_count': int,              'tags': [],              'age_limit': 0,          }, @@ -676,7 +713,6 @@ class TwitterIE(TwitterBaseIE):              'view_count': int,          },          'add_ie': ['TwitterBroadcast'], -        'skip': 'Requires authentication',      }, {          # unified card          'url': 'https://twitter.com/BrooklynNets/status/1349794411333394432?s=20', @@ -693,6 +729,8 @@ class TwitterIE(TwitterBaseIE):              'timestamp': 1610651040,              'upload_date': '20210114',              'uploader_url': 'https://twitter.com/BrooklynNets', +            'comment_count': int, +            'repost_count': int,              'like_count': int,              'tags': [],              'age_limit': 0, @@ -715,7 +753,10 @@ class TwitterIE(TwitterBaseIE):              'thumbnail': r're:^https?://.*\.jpg',              'duration': 30.03,              'timestamp': 1665025050, +            'comment_count': int, +            'repost_count': int,              'like_count': int, +            'view_count': int,              'tags': [],              'age_limit': 0,          }, @@ -724,13 +765,15 @@ class TwitterIE(TwitterBaseIE):          'url': 'https://twitter.com/UltimaShadowX/status/1577719286659006464',          'info_dict': {              'id': '1577719286659006464', -            'title': 'Ultima📛 | #вʟм - Test', +            'title': 'Ultima | #\u0432\u029f\u043c - Test',              'description': 'Test https://t.co/Y3KEZD7Dad', -            'uploader': 'Ultima📛 | #вʟм', +            'uploader': 'Ultima | #\u0432\u029f\u043c',              'uploader_id': 'UltimaShadowX',              'uploader_url': 'https://twitter.com/UltimaShadowX',              'upload_date': '20221005',              'timestamp': 1664992565, +            'comment_count': int, +            'repost_count': int,              'like_count': int,              'tags': [],              'age_limit': 0, @@ -752,7 +795,10 @@ class TwitterIE(TwitterBaseIE):              'duration': 21.321,              'timestamp': 1664477766,              'upload_date': '20220929', +            'comment_count': int, +            'repost_count': int,              'like_count': int, +            'view_count': int,              'tags': ['HurricaneIan'],              'age_limit': 0,          }, @@ -779,20 +825,6 @@ class TwitterIE(TwitterBaseIE):          },          'skip': 'Requires authentication',      }, { -        # Single Vimeo video result without auth -        'url': 'https://twitter.com/Srirachachau/status/1395079556562706435', -        'info_dict': { -            'id': '551578322', -            'ext': 'mp4', -            'title': 'Dusty & The Mayor', -            'uploader': 'Michael Chau', -            'uploader_id': 'user29061007', -            'uploader_url': 'https://vimeo.com/user29061007', -            'duration': 478, -            'thumbnail': 'https://i.vimeocdn.com/video/1139658575-0dfdce6e9a2401fe09feb24bf0d14e6f24a53c12f447ff688ace61009ad4c1ba-d_1280', -        }, -    }, { -        # Playlist result only with auth          'url': 'https://twitter.com/Srirachachau/status/1395079556562706435',          'playlist_mincount': 2,          'info_dict': { @@ -810,7 +842,6 @@ class TwitterIE(TwitterBaseIE):              'uploader_url': 'https://twitter.com/Srirachachau',              'timestamp': 1621447860,          }, -        'skip': 'Requires authentication',      }, {          'url': 'https://twitter.com/DavidToons_/status/1578353380363501568',          'playlist_mincount': 2, @@ -829,7 +860,6 @@ class TwitterIE(TwitterBaseIE):              'upload_date': '20221007',              'age_limit': 0,          }, -        'skip': 'Requires authentication',      }, {          'url': 'https://twitter.com/primevideouk/status/1578401165338976258',          'playlist_count': 2, @@ -843,6 +873,8 @@ class TwitterIE(TwitterBaseIE):              'upload_date': '20221007',              'age_limit': 0,              'uploader_url': 'https://twitter.com/primevideouk', +            'comment_count': int, +            'repost_count': int,              'like_count': int,              'tags': ['TheRingsOfPower'],          }, @@ -864,7 +896,6 @@ class TwitterIE(TwitterBaseIE):          },          'add_ie': ['TwitterSpaces'],          'params': {'skip_download': 'm3u8'}, -        'skip': 'Requires authentication',      }, {          # URL specifies video number but --yes-playlist          'url': 'https://twitter.com/CTVJLaidlaw/status/1600649710662213632/video/1', @@ -874,7 +905,9 @@ class TwitterIE(TwitterBaseIE):              'title': 'md5:be05989b0722e114103ed3851a0ffae2',              'timestamp': 1670459604.0,              'description': 'md5:591c19ce66fadc2359725d5cd0d1052c', +            'comment_count': int,              'uploader_id': 'CTVJLaidlaw', +            'repost_count': int,              'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'],              'upload_date': '20221208',              'age_limit': 0, @@ -893,11 +926,14 @@ class TwitterIE(TwitterBaseIE):              'timestamp': 1670459604.0,              'uploader_id': 'CTVJLaidlaw',              'uploader': 'Jocelyn Laidlaw', +            'repost_count': int, +            'comment_count': int,              'tags': ['colorectalcancer', 'cancerjourney', 'imnotaquitter'],              'duration': 102.226,              'uploader_url': 'https://twitter.com/CTVJLaidlaw',              'display_id': '1600649710662213632',              'like_count': int, +            'view_count': int,              'description': 'md5:591c19ce66fadc2359725d5cd0d1052c',              'upload_date': '20221208',              'age_limit': 0, @@ -923,6 +959,9 @@ class TwitterIE(TwitterBaseIE):              'age_limit': 18,              'tags': [],              'like_count': int, +            'repost_count': int, +            'comment_count': int, +            'view_count': int,          },      }, {          'url': 'https://twitter.com/hlo_again/status/1599108751385972737/video/2', @@ -935,7 +974,10 @@ class TwitterIE(TwitterBaseIE):              'like_count': int,              'uploader_id': 'hlo_again',              'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1599108643743473680/pu/img/UG3xjov4rgg5sbYM.jpg?name=orig', +            'repost_count': int,              'duration': 9.531, +            'comment_count': int, +            'view_count': int,              'upload_date': '20221203',              'age_limit': 0,              'timestamp': 1670092210.0, @@ -952,11 +994,14 @@ class TwitterIE(TwitterBaseIE):              'ext': 'mp4',              'uploader_url': 'https://twitter.com/MunTheShinobi',              'description': 'This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525 https://t.co/cNsA0MoOml', +            'view_count': int,              'thumbnail': 'https://pbs.twimg.com/ext_tw_video_thumb/1600009362759733248/pu/img/XVhFQivj75H_YxxV.jpg?name=orig',              'age_limit': 0,              'uploader': 'Mün The Shinobi', +            'repost_count': int,              'upload_date': '20221206',              'title': 'Mün The Shinobi - This is a genius ad by Apple. \U0001f525\U0001f525\U0001f525\U0001f525\U0001f525', +            'comment_count': int,              'like_count': int,              'tags': [],              'uploader_id': 'MunTheShinobi', @@ -964,14 +1009,14 @@ class TwitterIE(TwitterBaseIE):              'timestamp': 1670306984.0,          },      }, { -        # url to retweet id +        # url to retweet id, legacy API          'url': 'https://twitter.com/liberdalau/status/1623739803874349067',          'info_dict': {              'id': '1623274794488659969',              'display_id': '1623739803874349067',              'ext': 'mp4',              'title': 'Johnny Bullets - Me after going viral to over 30million people:    Whoopsie-daisy', -            'description': 'md5:224d62f54b0cdef8e33d4c56c41ac503', +            'description': 'md5:e873616a4a8fe0f93e71872678a672f3',              'uploader': 'Johnny Bullets',              'uploader_id': 'Johnnybull3ts',              'uploader_url': 'https://twitter.com/Johnnybull3ts', @@ -982,7 +1027,10 @@ class TwitterIE(TwitterBaseIE):              'upload_date': '20230208',              'thumbnail': r're:https://pbs\.twimg\.com/ext_tw_video_thumb/.+',              'like_count': int, +            'repost_count': int, +            'comment_count': int,          }, +        'params': {'extractor_args': {'twitter': {'legacy_api': ['']}}},      }, {          # onion route          'url': 'https://twitter3e4tixl4xyajtrzo62zg5vztmjuricljdp2c5kshju4avyoid.onion/TwitterBlue/status/1484226494708662273', @@ -1033,6 +1081,8 @@ class TwitterIE(TwitterBaseIE):          if 'tombstone' in result:              cause = remove_end(traverse_obj(result, ('tombstone', 'text', 'text', {str})), '. Learn more') +            if cause and 'adult content' in cause: +                self.raise_login_required(cause)              raise ExtractorError(f'Twitter API says: {cause or "Unknown error"}', expected=True)          status = result.get('legacy', {}) @@ -1088,22 +1138,19 @@ class TwitterIE(TwitterBaseIE):      def _real_extract(self, url):          twid, selected_index = self._match_valid_url(url).group('id', 'index') -        if not self.is_logged_in: -            try: -                status = self._download_json( -                    'https://cdn.syndication.twimg.com/tweet-result', twid, 'Downloading syndication JSON', -                    headers={'User-Agent': 'Googlebot'}, query={'id': twid}) -                self.to_screen(f'Some metadata is missing without authentication. {self._login_hint()}') -            except ExtractorError as e: -                if isinstance(e.cause, urllib.error.HTTPError) and e.cause.code == 404: -                    self.raise_login_required('Requested tweet may only be available when logged in') -                raise +        if self._configuration_arg('legacy_api') and not self.is_logged_in: +            status = traverse_obj(self._call_api(f'statuses/show/{twid}.json', twid, { +                'cards_platform': 'Web-12', +                'include_cards': 1, +                'include_reply_count': 1, +                'include_user_entities': 0, +                'tweet_mode': 'extended', +            }), 'retweeted_status', None)          else: -            status = self._graphql_to_legacy( -                self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid), twid) +            result = self._call_graphql_api('zZXycP0V6H7m-2r0mOnFcA/TweetDetail', twid) +            status = self._graphql_to_legacy(result, twid) -        title = description = traverse_obj( -            status, (('full_text', 'text'), {lambda x: x.replace('\n', ' ')}), get_all=False) or '' +        title = description = status['full_text'].replace('\n', ' ')          # strip  'https -_t.co_BJYgOjSeGA' junk from filenames          title = re.sub(r'\s+(https?://[^ ]+)', '', title)          user = status.get('user') or {} @@ -1129,16 +1176,12 @@ class TwitterIE(TwitterBaseIE):          def extract_from_video_info(media):              media_id = traverse_obj(media, 'id_str', 'id', expected_type=str_or_none) -            if not media_id: -                # workaround for non-authenticated responses -                media_id = traverse_obj(media, ( -                    'video_info', 'variants', ..., 'url', -                    {lambda x: re.search(r'_video/(\d+)/', x)[1]}), get_all=False)              self.write_debug(f'Extracting from video info: {media_id}') +            video_info = media.get('video_info') or {}              formats = []              subtitles = {} -            for variant in traverse_obj(media, ('video_info', 'variants', ...)): +            for variant in video_info.get('variants', []):                  fmts, subs = self._extract_variant_formats(variant, twid)                  subtitles = self._merge_subtitles(subtitles, subs)                  formats.extend(fmts) @@ -1158,12 +1201,12 @@ class TwitterIE(TwitterBaseIE):                  add_thumbnail('orig', media.get('original_info') or {})              return { -                'id': media_id or twid, +                'id': media_id,                  'formats': formats,                  'subtitles': subtitles,                  'thumbnails': thumbnails,                  'view_count': traverse_obj(media, ('mediaStats', 'viewCount', {int_or_none})), -                'duration': float_or_none(traverse_obj(media, ('video_info', 'duration_millis')), 1000), +                'duration': float_or_none(video_info.get('duration_millis'), 1000),                  # The codec of http formats are unknown                  '_format_sort_fields': ('res', 'br', 'size', 'proto'),              } @@ -1243,15 +1286,12 @@ class TwitterIE(TwitterBaseIE):                  }          videos = traverse_obj(status, ( -            ('mediaDetails', ((None, 'quoted_status'), 'extended_entities', 'media')), -            lambda _, m: m['type'] != 'photo', {dict})) +            (None, 'quoted_status'), 'extended_entities', 'media', lambda _, m: m['type'] != 'photo', {dict}))          if self._yes_playlist(twid, selected_index, video_label='URL-specified video number'):              selected_entries = (*map(extract_from_video_info, videos), *extract_from_card_info(status.get('card')))          else: -            desired_obj = traverse_obj(status, ( -                ('mediaDetails', ((None, 'quoted_status'), 'extended_entities', 'media')), -                int(selected_index) - 1, {dict}), get_all=False) +            desired_obj = traverse_obj(status, ('extended_entities', 'media', int(selected_index) - 1, {dict}))              if not desired_obj:                  raise ExtractorError(f'Video #{selected_index} is unavailable', expected=True)              elif desired_obj.get('type') != 'video': | 
