aboutsummaryrefslogtreecommitdiffstats
path: root/lib/WWW/FairViewer/GetCaption.pm
blob: 710a2af6e7eb4ed6732d45188ff0ed1b88518649 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
package WWW::FairViewer::GetCaption;

use utf8;
use 5.014;
use warnings;

=head1 NAME

WWW::FairViewer::GetCaption - Save the YouTube closed captions as .srt files for a videoID.

=head1 SYNOPSIS

    use WWW::FairViewer::GetCaption;

    my $yv_cap = WWW::FairViewer::GetCaption->new(%opts);
    my $file = $yv_cap->save_caption($videoID);

=head1 SUBROUTINES/METHODS

=head2 new(%opts)

Options:

=over 4

=item captions => []

The captions data.

=item captions_dir => "."

Where to save the closed captions. When not set, the captions are saved in
the system temporary directory.

=item languages => [qw(en es ro jp)]

Preferred language codes, most preferred first (default: C<en>, C<es>).
The best matching caption is saved and returned.

=back

=cut

sub new {
    my ($class, %opts) = @_;

    # Constructor. Recognised options and their default values; any other
    # key passed by the caller is reported with a warning and ignored.
    my %defaults = (
                    captions_dir  => undef,
                    captions      => [],
                    auto_captions => 0,
                    languages     => [qw(en es)],
                    yv_obj        => undef,
                   );

    my $self = bless {%defaults}, $class;

    # Move every recognised option out of %opts and into the object, so
    # that whatever remains in %opts afterwards is known to be invalid.
    foreach my $name (keys %defaults) {
        next if not exists $opts{$name};
        $self->{$name} = delete $opts{$name};
    }

    # Without a caller-supplied client object, load WWW::FairViewer on
    # demand and create one that uses the captions directory as its cache.
    if (not defined $self->{yv_obj}) {
        require WWW::FairViewer;
        $self->{yv_obj} = WWW::FairViewer->new(cache_dir => $self->{captions_dir},);
    }

    foreach my $unknown (keys %opts) {
        warn "Invalid key: '${unknown}'";
    }

    return $self;
}

=head2 find_caption_data()

Find the caption data that best matches the preferred languages.

=cut

sub find_caption_data {
    my ($self) = @_;

    # Pick one entry of $self->{captions} according to $self->{languages}.
    # Returns the chosen caption hash, or nothing when no caption matches.

    my @wanted     = @{$self->{languages}};
    my $lang_count = scalar(@wanted);

    # Candidates are stored by rank, lower index meaning better:
    #   [0 .. n-1]   exact language-code matches, in preference order
    #   [n .. 2n-1]  prefix matches (e.g. "en-GB" for "en")
    #   [2n .. 3n-1] auto-generated (ASR) captions
    my @ranked;

  CAPTION: foreach my $entry (@{$self->{captions}}) {
        my $code = $entry->{languageCode};
        next CAPTION if not defined $code;

      LANGUAGE: foreach my $pos (0 .. $#wanted) {
            my $lang = $wanted[$pos];

            # The caption code must be the language itself or the language
            # followed by a "_"/"-" separated suffix.
            next LANGUAGE if $code !~ /^\Q$lang\E(?:\z|[_-])/i;

            # "asr" marks Automatic Speech Recognition captions, which are
            # only considered when auto_captions is enabled.
            my $is_asr = defined($entry->{kind}) && lc($entry->{kind}) eq 'asr';
            next LANGUAGE if $is_asr and not $self->{auto_captions};

            if (lc($code) eq lc($lang) and not $is_asr) {

                # An exact match for the most preferred language cannot be
                # beaten, so stop searching right away.
                return $entry if $pos == 0;
                $ranked[$pos] = $entry;
            }
            else {
                $ranked[$pos + ($is_asr ? 2 : 1) * $lang_count] = $entry;
            }
        }
    }

    # The first filled slot holds the best candidate
    foreach my $candidate (@ranked) {
        return $candidate if defined($candidate);
    }

    return;
}

=head2 sec2time(@seconds)

Convert a list of seconds to .srt times.

=cut

sub sec2time {
    my ($self, @seconds) = @_;

    # Convert each number of seconds into a SubRip "HH:MM:SS,mmm" timestamp
    # and return the resulting list in the same order.

    my @out;

    # Round to whole milliseconds first, so that the hour/minute/second
    # fields and the millisecond field are all derived from the same value
    # (e.g. 59.9996 becomes "60.000" -> 00:01:00,000).
    foreach my $sec (map { sprintf '%.3f', $_ } @seconds) {
        push @out,
          sprintf(
                  '%02d:%02d:%02d,%03d',

                  # Timestamps are offsets from the start of the media, not
                  # clock times, so hours must not wrap around at 24.
                  int($sec / 3600),
                  $sec / 60 % 60,
                  $sec % 60,

                  # The digits after the decimal point are the milliseconds
                  substr($sec, index($sec, '.') + 1)
                 );
    }

    return @out;
}

=head2 xml2srt($xml_string)

Convert the XML data to SubRip format.

=cut

sub xml2srt {
    my ($self, $xml) = @_;

    # Turn the caption XML into SubRip text. Returns the SubRip string, or
    # nothing when the XML cannot be parsed or has an unexpected structure.

    require WWW::FairViewer::ParseXML;
    my $tree = eval { WWW::FairViewer::ParseXML::xml2hash($xml) } // return;

    # The caption entries are expected at {transcript}[0]{text}
    return if not exists $tree->{transcript};
    return if ref($tree->{transcript}) ne 'ARRAY';
    return if ref($tree->{transcript}[0]) ne 'HASH';
    return if not exists $tree->{transcript}[0]{text};

    my $cues = $tree->{transcript}[0]{text};

    require HTML::Entities;

    my @blocks;
    foreach my $idx (0 .. $#{$cues}) {
        my $cue = $cues->[$idx];

        # A cue without a duration lasts until the next cue starts; the
        # last cue falls back to 10 seconds.
        $cue->{'-dur'} //=
          exists($cues->[$idx + 1])
          ? $cues->[$idx + 1]{'-start'} - $cue->{'-start'}
          : 10;

        my $begin  = $cue->{'-start'};
        my @stamps = $self->sec2time($begin, $begin + $cue->{'-dur'});
        my $text   = HTML::Entities::decode_entities($cue->{'#text'} // '');

        # One SubRip block: sequence number, time range, caption text
        push @blocks, join("\n", $idx + 1, join(' --> ', @stamps), $text);
    }

    # Blocks are separated by a blank line
    return join("\n\n", @blocks);
}

=head2 get_xml_data($url)

Get the XML content from a given caption URL.

=cut

sub get_xml_data {
    my $self = shift;
    my $url  = shift;

    # Delegate the download to the client object stored in yv_obj and hand
    # its result straight back to the caller.
    my $client = $self->{yv_obj};
    return $client->lwp_get($url, simple => 1);
}

=head2 save_caption($video_ID)

Save the caption in a .srt file and return its file path.

=cut

sub save_caption {
    my ($self, $video_id) = @_;

    # Save the best matching caption of $video_id as a .srt file and return
    # the path of that file. Returns nothing when no caption matches, when
    # the caption cannot be fetched or converted, or when the file cannot be
    # written completely.

    # Find one of the preferred languages
    my $info = $self->find_caption_data() // return;

    # The file goes into captions_dir, or the system temporary directory
    # when no captions_dir was configured.
    require File::Spec;
    my $filename = "${video_id}_$info->{languageCode}.srt";
    my $srt_file = File::Spec->catfile($self->{captions_dir} // File::Spec->tmpdir, $filename);

    # Return the srt file if it already exists
    return $srt_file if (-e $srt_file);

    # Get XML data, then transform it to SubRip data
    my $xml = $self->get_xml_data($info->{baseUrl} // return) // return;
    my $srt = $self->xml2srt($xml)                            // return;

    # Write the SubRip data to the $srt_file
    open(my $fh, '>:utf8', $srt_file) or return;
    my $written = print {$fh} $srt, "\n";

    # Buffered write errors (e.g. a full disk) may only surface on close,
    # so the handle is always closed and both results are checked.
    my $closed = close($fh);

    if (not($written and $closed)) {

        # Do not leave a truncated file behind: the existence check above
        # would otherwise keep returning it on every later call.
        unlink($srt_file);
        return;
    }

    # Return the .srt file path
    return $srt_file;
}

=head1 AUTHOR

Trizen, C<< <echo dHJpemVuQHByb3Rvbm1haWwuY29tCg== | base64 -d> >>

Jesus, C<< <echo aGVja3llbEBoeXBlcmJvbGEuaW5mbw== | base64 -d> >>


=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc WWW::FairViewer::GetCaption


=head1 LICENSE AND COPYRIGHT

Copyright 2012-2015 Trizen.

Copyright 2020 Jesus E.

This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.

See L<http://dev.perl.org/licenses/> for more information.

=cut

1;    # End of WWW::FairViewer::GetCaption