aboutsummaryrefslogtreecommitdiffstats
path: root/youtube/yt_data_extract/watch_extraction.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube/yt_data_extract/watch_extraction.py')
-rw-r--r--youtube/yt_data_extract/watch_extraction.py62
1 files changed, 52 insertions, 10 deletions
diff --git a/youtube/yt_data_extract/watch_extraction.py b/youtube/yt_data_extract/watch_extraction.py
index 9dbb252..5aaa318 100644
--- a/youtube/yt_data_extract/watch_extraction.py
+++ b/youtube/yt_data_extract/watch_extraction.py
@@ -307,6 +307,18 @@ def _extract_watch_info_desktop(top_level):
return info
+def update_format_with_codec_info(fmt, codec):
+ if (codec.startswith('av')
+ or codec in ('vp9', 'vp8', 'vp8.0', 'h263', 'h264', 'mp4v')):
+ if codec == 'vp8.0':
+ codec = 'vp8'
+ conservative_update(fmt, 'vcodec', codec)
+ elif (codec.startswith('mp4a')
+ or codec in ('opus', 'mp3', 'aac', 'dtse', 'ec-3', 'vorbis')):
+ conservative_update(fmt, 'acodec', codec)
+ else:
+ print('Warning: unrecognized codec: ' + codec)
+
fmt_type_re = re.compile(
r'(text|audio|video)/([\w0-9]+); codecs="([\w0-9\.]+(?:, [\w0-9\.]+)*)"')
def update_format_with_type_info(fmt, yt_fmt):
@@ -319,16 +331,7 @@ def update_format_with_type_info(fmt, yt_fmt):
type, fmt['ext'], codecs = match.groups()
codecs = codecs.split(', ')
for codec in codecs:
- if (codec.startswith('av')
- or codec in ('vp9', 'vp8', 'vp8.0', 'h263', 'h264', 'mp4v')):
- if codec == 'vp8.0':
- codec = 'vp8'
- conservative_update(fmt, 'vcodec', codec)
- elif (codec.startswith('mp4a')
- or codec in ('opus', 'mp3', 'aac', 'dtse', 'ec-3', 'vorbis')):
- conservative_update(fmt, 'acodec', codec)
- else:
- print('Warning: unrecognized codec: ' + codec)
+ update_format_with_codec_info(fmt, codec)
if type == 'audio':
assert len(codecs) == 1
@@ -337,6 +340,8 @@ def _extract_formats(info, player_response):
yt_formats = streaming_data.get('formats', []) + streaming_data.get('adaptiveFormats', [])
info['formats'] = []
+ info['hls_manifest_url'] = streaming_data.get('hlsManifestUrl')
+ info['dash_manifest_url'] = streaming_data.get('dashManifestUrl')
for yt_fmt in yt_formats:
fmt = {}
@@ -371,6 +376,43 @@ def _extract_formats(info, player_response):
else:
info['ip_address'] = None
+hls_regex = re.compile(r'[\w_-]+=(?:"[^"]+"|[^",]+),')
+def extract_hls_formats(hls_manifest):
+ '''returns hls_formats, err'''
+ hls_formats = []
+ try:
+ lines = hls_manifest.splitlines()
+ i = 0
+ while i < len(lines):
+ if lines[i].startswith('#EXT-X-STREAM-INF'):
+ fmt = {'acodec': None, 'vcodec': None, 'height': None,
+ 'width': None, 'fps': None, 'audio_bitrate': None,
+ 'itag': None, 'file_size': None,
+ 'audio_sample_rate': None, 'url': None}
+ properties = lines[i].split(':')[1]
+ properties += ',' # make regex work for last key-value pair
+
+ for pair in hls_regex.findall(properties):
+ key, value = pair.rstrip(',').split('=')
+ if key == 'CODECS':
+ for codec in value.strip('"').split(','):
+ update_format_with_codec_info(fmt, codec)
+ elif key == 'RESOLUTION':
+ fmt['width'], fmt['height'] = map(int, value.split('x'))
+ fmt['resolution'] = value
+ elif key == 'FRAME-RATE':
+ fmt['fps'] = int(value)
+ i += 1
+ fmt['url'] = lines[i]
+ assert fmt['url'].startswith('http')
+ fmt['ext'] = 'm3u8'
+ hls_formats.append(fmt)
+ i += 1
+ except Exception as e:
+ traceback.print_exc()
+ return [], str(e)
+ return hls_formats, None
+
def _extract_playability_error(info, player_response, error_prefix=''):
if info['formats']: