From 9fc347e0932c71f9d9840fe9f5941225c8a8b6ee Mon Sep 17 00:00:00 2001
From: James Taylor <user234683@users.noreply.github.com>
Date: Sat, 25 Jul 2020 19:40:37 -0700
Subject: Add video transcript to downloads, generated from the video captions

---
 youtube/templates/watch.html | 10 ++++++
 youtube/watch.py             | 81 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 90 insertions(+), 1 deletion(-)

(limited to 'youtube')
diff --git a/youtube/templates/watch.html b/youtube/templates/watch.html
index 04f963d..4041144 100644
--- a/youtube/templates/watch.html
+++ b/youtube/templates/watch.html
@@ -381,6 +381,16 @@ Reload without invidious (for usage of new identity button).</a>
                         </a>
                     </li>
                 {% endfor %}
+                {% for download in other_downloads %}{# one entry per extra download (e.g. a caption transcript); each is a dict with 'url', 'ext' and 'label' #}
+                    <li class="download-format">
+                        <a href="{{ download['url'] }}">
+                            <ol class="format-attributes">
+                                <li class="format-ext">{{ download['ext'] }}</li>
+                                <li class="format-label">{{ download['label'] }}</li>
+                            </ol>
+                        </a>
+                    </li>
+                {% endfor %}
             </ul>
         </details>
         <input class="checkbox" name="video_info_list" value="{{ video_info }}" form="playlist-edit" type="checkbox">
diff --git a/youtube/watch.py b/youtube/watch.py
index a3d8e24..d65bd8c 100644
--- a/youtube/watch.py
+++ b/youtube/watch.py
@@ -453,6 +453,22 @@ def get_watch_page(video_id=None):
         print('Comment count:', info['comment_count'])
         info['comment_count'] = None # hack to make it obvious there's a bug
 
+    # captions and transcript
+    subtitle_sources = get_subtitle_sources(info)
+    other_downloads = []
+    for source in subtitle_sources:
+        best_caption_parse = urllib.parse.urlparse(
+            source['url'].lstrip('/'))
+        transcript_url = (util.URL_ORIGIN
+            + '/watch/transcript'
+            + best_caption_parse.path
+            + '?' + best_caption_parse.query)
+        other_downloads.append({
+            'label': 'Video Transcript: ' + source['label'],
+            'ext': 'txt',
+            'url': transcript_url
+        })
+
     return flask.render_template('watch.html',
         header_playlist_names   = local_playlist.get_playlist_names(),
         uploader_channel_url    = ('/' + info['author_url']) if info['author_url'] else '',
@@ -461,10 +477,11 @@ def get_watch_page(video_id=None):
         like_count    = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("like_count", None)),
         dislike_count = (lambda x: '{:,}'.format(x) if x is not None else "")(info.get("dislike_count", None)),
         download_formats        = download_formats,
+        other_downloads         = other_downloads,
         video_info              = json.dumps(video_info),
         video_sources           = video_sources,
         hls_formats             = info['hls_formats'],
-        subtitle_sources        = get_subtitle_sources(info),
+        subtitle_sources        = subtitle_sources,
         related                 = info['related_videos'],
         playlist                = info['playlist'],
         music_list              = info['music_list'],
@@ -504,5 +521,85 @@ def get_captions(dummy):
     return result
 
 
+# Matches a caption timing line, e.g.
+# "00:00:01.000 --> 00:00:04.000 align:start position:0%"
+times_reg = re.compile(r'^\d\d:\d\d:\d\d\.\d\d\d --> \d\d:\d\d:\d\d\.\d\d\d.*$')
+# Matches inline tags in caption text, e.g. <00:00:01.500> or <c>
+inner_timestamp_removal_reg = re.compile(r'<[^>]+>')
+@yt_app.route('/watch/transcript/<path:caption_path>')
+def get_transcript(caption_path):
+    '''Return the captions at caption_path as a plain-text transcript.
+
+    The caption file is fetched from https://www.youtube.com/ using
+    caption_path plus the query string of the current request. Each
+    caption segment becomes one "<begin time> <text>" line ending in
+    CRLF. If the fetch raises util.FetchError, the error message is
+    returned as plain text with the error's status code instead.'''
+    try:
+        captions = util.fetch_url('https://www.youtube.com/'
+            + caption_path
+            + '?' + request.environ['QUERY_STRING']).decode('utf-8')
+    except util.FetchError as e:
+        msg = ('Error retrieving captions: ' + str(e) + '\n\n'
+            + 'The caption url may have expired.')
+        print(msg)
+        return flask.Response(msg,
+            status = e.code,
+            mimetype='text/plain;charset=UTF-8')
+
+    lines = captions.splitlines()
+    segments = []
+
+    # skip captions file header (everything before the first blank line);
+    # the bound keeps a body without any blank line from raising IndexError
+    i = 0
+    while i < len(lines) and lines[i] != '':
+        i += 1
+
+    def new_segment():
+        return {'begin': None, 'end': None, 'lines': []}
+
+    current_segment = new_segment()
+    for line in lines[i:]:
+        if line == '':
+            # a blank line ends the segment; keep it only if it had timings
+            if current_segment['begin'] is not None:
+                segments.append(current_segment)
+            current_segment = new_segment()
+        elif times_reg.fullmatch(line.rstrip()):
+            # maxsplit=1 so the unpacking cannot fail on a second ' --> '
+            current_segment['begin'], current_segment['end'] = (
+                line.split(' --> ', 1))
+        else:
+            current_segment['lines'].append(
+                inner_timestamp_removal_reg.sub('', line))
+    # keep the final segment when the file does not end with a blank line
+    if current_segment['begin'] is not None:
+        segments.append(current_segment)
+
+    # if automatic captions, but not translated
+    if request.args.get('kind') == 'asr' and not request.args.get('tlang'):
+        # Automatic captions repeat content. The new segment is displayed
+        # on the bottom row; the old one is displayed on the top row.
+        # So grab the bottom row only
+        for seg in segments:
+            if len(seg['lines']) > 1:
+                seg['text'] = seg['lines'][1]
+            else:
+                # no bottom row: join whatever lines exist, as for regular
+                # captions, instead of raising IndexError
+                seg['text'] = ' '.join(map(str.rstrip, seg['lines']))
+    else:
+        for seg in segments:
+            seg['text'] = ' '.join(map(str.rstrip, seg['lines']))
+
+    # segments whose text is exactly one space are left out
+    result = ''.join(
+        seg['begin'] + ' ' + seg['text'] + '\r\n'
+        for seg in segments if seg['text'] != ' ')
+
+    return flask.Response(result.encode('utf-8'),
+        mimetype='text/plain;charset=UTF-8')
+
 
 
-- 
cgit v1.2.3