Extraction: Bypass age-restriction

author: James Taylor <user234683@users.noreply.github.com> 2019-12-12 22:13:17 -0800
committer: James Taylor <user234683@users.noreply.github.com> 2019-12-12 22:13:17 -0800
commit: 26f37521babbb2fc4b86ad59354e8c69da1f3897 (patch)
tree: 7fd6eb51bc09ce84b18a9193ab99f23cb481af55
parent: 205ad29cb0763dd263a5940cdcb3059d189bbfe7 (diff)
download: yt-local-26f37521babbb2fc4b86ad59354e8c69da1f3897.tar.lz yt-local-26f37521babbb2fc4b86ad59354e8c69da1f3897.tar.xz yt-local-26f37521babbb2fc4b86ad59354e8c69da1f3897.zip
2 files changed, 90 insertions, 35 deletions
diff --git a/youtube/watch.py b/youtube/watch.py
index fa697ba..4575c1e 100644
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -275,17 +275,32 @@ headers = (
 ) + util.mobile_ua
 
 def extract_info(video_id):
-    polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch')
+    polymer_json = util.fetch_url('https://m.youtube.com/watch?v=' + video_id + '&pbj=1', headers=headers, debug_name='watch').decode('utf-8')
+    # TODO: Decide whether this should be done in yt_data_extract.extract_watch_info
     try:
         polymer_json = json.loads(polymer_json)
     except json.decoder.JSONDecodeError:
         traceback.print_exc()
         return {'error': 'Failed to parse json response'}
     info = yt_data_extract.extract_watch_info(polymer_json)
-    error = decrypt_signatures(info)
-    if error:
-        print('Error decrypting url signatures: ' + error)
-        info['playability_error'] = error
+
+    # age restriction bypass
+    if info['age_restricted']:
+        print('Fetching age restriction bypass page')
+        data = {
+            'video_id': video_id,
+            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
+        }
+        url = 'https://www.youtube.com/get_video_info?' + urllib.parse.urlencode(data)
+        video_info_page = util.fetch_url(url, debug_name='get_video_info', report_text='Fetched age restriction bypass page').decode('utf-8')
+        yt_data_extract.update_with_age_restricted_info(info, video_info_page)
+
+    # signature decryption
+    decryption_error = decrypt_signatures(info)
+    if decryption_error:
+        decryption_error = 'Error decrypting url signatures: ' + decryption_error
+        info['playability_error'] = decryption_error
+
     return info
 
 def video_quality_string(format):
@@ -410,6 +425,7 @@ def get_watch_page():
         uploader    = info['author'],
         description = info['description'],
         unlisted    = info['unlisted'],
+        age_restricted    = info['age_restricted'],
         playability_error = info['playability_error'],
     )
 
diff --git a/youtube/yt_data_extract.py b/youtube/yt_data_extract.py
index 7c2b717..8c5c63d 100644
--- a/youtube/yt_data_extract.py
+++ b/youtube/yt_data_extract.py
@@ -943,6 +943,11 @@ def extract_watch_info_mobile(top_level):
     info = {}
     microformat = default_multi_get(top_level, 'playerResponse', 'microformat', 'playerMicroformatRenderer', default={})
 
+    family_safe = microformat.get('isFamilySafe')
+    if family_safe is None:
+        info['age_restricted'] = None
+    else:
+        info['age_restricted'] = not family_safe
     info['allowed_countries'] = microformat.get('availableCountries', [])
     info['published_date'] = microformat.get('publishDate')
 
@@ -1055,6 +1060,34 @@ def get_caption_url(info, language, format, automatic=False, translation_languag
         url += '&tlang=' + translation_language
     return url
 
+def extract_formats(info, player_response):
+    streaming_data = player_response.get('streamingData', {})
+    yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])
+
+    info['formats'] = []
+
+    for yt_fmt in yt_formats:
+        fmt = {}
+        fmt['ext'] = None
+        fmt['audio_bitrate'] = None
+        fmt['acodec'] = None
+        fmt['vcodec'] = None
+        fmt['width'] = yt_fmt.get('width')
+        fmt['height'] = yt_fmt.get('height')
+        fmt['file_size'] = yt_fmt.get('contentLength')
+        fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate')
+        fmt['fps'] = yt_fmt.get('fps')
+        cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', '')))
+        if cipher:
+            fmt['url'] = cipher.get('url')
+        else:
+            fmt['url'] = yt_fmt.get('url')
+        fmt['s'] = cipher.get('s')
+        fmt['sp'] = cipher.get('sp')
+        fmt.update(_formats.get(str(yt_fmt.get('itag')), {}))
+
+        info['formats'].append(fmt)
+
 SUBTITLE_FORMATS = ('srv1', 'srv2', 'srv3', 'ttml', 'vtt')
 def extract_watch_info(polymer_json):
     info = {'playability_error': None, 'error': None}
@@ -1080,10 +1113,6 @@ def extract_watch_info(polymer_json):
 
     player_args = default_multi_get(top_level, 'player', 'args', default={})
     player_response = json.loads(player_args['player_response']) if 'player_response' in player_args else {}
-    playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None)
-    playability_reason = default_multi_get(player_response, 'playabilityStatus', 'reason', default='Unknown error')
-    if playability_status not in (None, 'OK'):
-        info['playability_error'] = playability_reason
 
     # captions
     info['automatic_caption_languages'] = []
@@ -1106,35 +1135,19 @@ def extract_watch_info(polymer_json):
             print('WARNING: Found non-translatable caption language')
 
     # formats
-    streaming_data = player_response.get('streamingData', {})
-    yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])
-
-    info['formats'] = []
-
-    for yt_fmt in yt_formats:
-        fmt = {}
-        fmt['ext'] = None
-        fmt['audio_bitrate'] = None
-        fmt['acodec'] = None
-        fmt['vcodec'] = None
-        fmt['width'] = yt_fmt.get('width')
-        fmt['height'] = yt_fmt.get('height')
-        fmt['file_size'] = yt_fmt.get('contentLength')
-        fmt['audio_sample_rate'] = yt_fmt.get('audioSampleRate')
-        fmt['fps'] = yt_fmt.get('fps')
-        cipher = dict(urllib.parse.parse_qsl(yt_fmt.get('cipher', '')))
-        if cipher:
-            fmt['url'] = cipher.get('url')
+    extract_formats(info, player_response)
+    playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None)
+    playability_reason = default_multi_get(player_response, 'playabilityStatus', 'reason', default='Could not find playability error')
+    if not info['formats']:
+        if playability_status not in (None, 'OK'):
+            info['playability_error'] = playability_reason
         else:
-            fmt['url'] = yt_fmt.get('url')
-        fmt['s'] = cipher.get('s')
-        fmt['sp'] = cipher.get('sp')
-        fmt.update(_formats.get(str(yt_fmt.get('itag')), {}))
+            info['playability_error'] = 'Unknown playability error'
 
-        info['formats'].append(fmt)
-    if info['formats']:
-        info['playability_error'] = None    # in case they lie
+    # check age-restriction
+    info['age_restricted'] = (playability_status == 'LOGIN_REQUIRED' and playability_reason and ' age' in playability_reason)
 
+    # base_js (for decryption of signatures)
     info['base_js'] = default_multi_get(top_level, 'player', 'assets', 'js')
     if info['base_js']:
         info['base_js'] = normalize_url(info['base_js'])
@@ -1162,3 +1175,29 @@ def extract_watch_info(polymer_json):
     # other stuff
     info['author_url'] = 'https://www.youtube.com/channel/' + info['author_id'] if info['author_id'] else None
     return info
+
+def update_with_age_restricted_info(info, video_info_page):
+    ERROR_PREFIX = 'Error bypassing age-restriction: '
+
+    video_info = urllib.parse.parse_qs(video_info_page)
+    player_response = default_multi_get(video_info, 'player_response', 0)
+    if player_response is None:
+        info['playability_error'] = ERROR_PREFIX + 'Could not find player_response in video_info_page'
+        return
+    try:
+        player_response = json.loads(player_response)
+    except json.decoder.JSONDecodeError:
+        traceback.print_exc()
+        info['playability_error'] = ERROR_PREFIX + 'Failed to parse json response'
+        return
+
+    extract_formats(info, player_response)
+    if info['formats']:
+        info['playability_error'] = None
+    else:
+        playability_status = default_multi_get(player_response, 'playabilityStatus', 'status', default=None)
+        playability_reason = default_multi_get(player_response, 'playabilityStatus', 'reason', default=ERROR_PREFIX + 'Could not find playability error')
+        if playability_status not in (None, 'OK'):
+            info['playability_error'] = ERROR_PREFIX + playability_reason
+        else:
+            info['playability_error'] = ERROR_PREFIX + 'Unknown playability error'
author	James Taylor <user234683@users.noreply.github.com>	2019-12-12 22:13:17 -0800
committer	James Taylor <user234683@users.noreply.github.com>	2019-12-12 22:13:17 -0800
commit	26f37521babbb2fc4b86ad59354e8c69da1f3897 (patch)
tree	7fd6eb51bc09ce84b18a9193ab99f23cb481af55
parent	205ad29cb0763dd263a5940cdcb3059d189bbfe7 (diff)
download	yt-local-26f37521babbb2fc4b86ad59354e8c69da1f3897.tar.lz yt-local-26f37521babbb2fc4b86ad59354e8c69da1f3897.tar.xz yt-local-26f37521babbb2fc4b86ad59354e8c69da1f3897.zip