diff options
author | Jesús <heckyel@hyperbola.info> | 2020-03-02 08:18:54 -0500 |
---|---|---|
committer | Jesús <heckyel@hyperbola.info> | 2020-03-02 08:18:54 -0500 |
commit | c67158fa409f1b1b4f98a8621a69bb2013b76451 (patch) | |
tree | bb4ca80f29cda70734a868d372e00c85e172e7d3 /lib/WWW/FairViewer/GetCaption.pm | |
parent | ad7ec1785fc28799e10d10e7a679dc5bb4891ee3 (diff) | |
download | fair-viewer-c67158fa409f1b1b4f98a8621a69bb2013b76451.tar.lz fair-viewer-c67158fa409f1b1b4f98a8621a69bb2013b76451.tar.xz fair-viewer-c67158fa409f1b1b4f98a8621a69bb2013b76451.zip |
rebrand app
Diffstat (limited to 'lib/WWW/FairViewer/GetCaption.pm')
-rw-r--r-- | lib/WWW/FairViewer/GetCaption.pm | 280 |
1 files changed, 280 insertions, 0 deletions
diff --git a/lib/WWW/FairViewer/GetCaption.pm b/lib/WWW/FairViewer/GetCaption.pm new file mode 100644 index 0000000..24741b1 --- /dev/null +++ b/lib/WWW/FairViewer/GetCaption.pm @@ -0,0 +1,280 @@ +package WWW::FairViewer::GetCaption; + +use utf8; +use 5.014; +use warnings; + +=head1 NAME + +WWW::FairViewer::GetCaption - Save the YouTube closed captions as .srt files for a videoID. + +=head1 SYNOPSIS + + use WWW::FairViewer::GetCaption; + + my $yv_cap = WWW::FairViewer::GetCaption->new(%opts); + + print $yv_cap->get_caption($videoID); + +=head1 SUBROUTINES/METHODS + +=head2 new(%opts) + +Options: + +=over 4 + +=item captions => [] + +The captions data. + +=item captions_dir => "." + +Where to save the closed captions. + +=item languages => [qw(en es ro jp)] + +Preferred languages. First found is saved and returned. + +=back + +=cut + +sub new { + my ($class, %opts) = @_; + + my $self = bless {}, $class; + $self->{captions_dir} = undef; + $self->{captions} = []; + $self->{auto_captions} = 0; + $self->{languages} = [qw(en es)]; + + foreach my $key (keys %{$self}) { + $self->{$key} = delete $opts{$key} + if exists $opts{$key}; + } + + foreach my $invalid_key (keys %opts) { + warn "Invalid key: '${invalid_key}'"; + } + + return $self; +} + +=head2 find_caption_data() + +Find a caption data, based on the preferred languages. + +=cut + +sub find_caption_data { + my ($self) = @_; + + my @found; + foreach my $caption (@{$self->{captions}}) { + if (defined $caption->{languageCode}) { + foreach my $i (0 .. $#{$self->{languages}}) { + my $lang = $self->{languages}[$i]; + if ($caption->{languageCode} =~ /^\Q$lang\E(?:\z|[_-])/i) { + + # Automatic Speech Recognition + my $auto = defined($caption->{kind}) && lc($caption->{kind}) eq 'asr'; + + # Check against auto-generated captions + if ($auto and not $self->{auto_captions}) { + next; + } + + # Fuzzy match or auto-generated caption + if (lc($caption->{languageCode}) ne lc($lang) or $auto) { + $found[$i + (($auto ? 2 : 1) * scalar(@{$self->{languages}}))] = $caption; + } + + # Perfect match + else { + $i == 0 and return $caption; + $found[$i] = $caption; + } + } + } + } + } + + foreach my $caption (@found) { + return $caption if defined($caption); + } + + return; +} + +=head2 sec2time(@seconds) + +Convert a list of seconds to .srt times. + +=cut + +sub sec2time { + my $self = shift; + + my @out; + foreach my $sec (map { sprintf '%.3f', $_ } @_) { + push @out, + sprintf('%02d:%02d:%02d,%03d', ($sec / 3600 % 24, $sec / 60 % 60, $sec % 60, substr($sec, index($sec, '.') + 1))); + } + + return @out; +} + +=head2 xml2srt($xml_string) + +Convert the XML data to SubRip format. + +=cut + +sub xml2srt { + my ($self, $xml) = @_; + + require WWW::FairViewer::ParseXML; + my $hash = eval { WWW::FairViewer::ParseXML::xml2hash($xml) } // return; + + my $sections; + if ( exists $hash->{transcript} + and ref($hash->{transcript}) eq 'ARRAY' + and ref($hash->{transcript}[0]) eq 'HASH' + and exists $hash->{transcript}[0]{text}) { + $sections = $hash->{transcript}[0]{text}; + } + else { + return; + } + + require HTML::Entities; + + my @text; + foreach my $i (0 .. $#{$sections}) { + my $line = $sections->[$i]; + + if (not defined($line->{'-dur'})) { + if (exists $sections->[$i + 1]) { + $line->{'-dur'} = $sections->[$i + 1]{'-start'} - $line->{'-start'}; + } + else { + $line->{'-dur'} = 10; + } + } + + my $start = $line->{'-start'}; + my $end = $start + $line->{'-dur'}; + + push @text, + join("\n", + $i + 1, + join(' --> ', $self->sec2time($start, $end)), + HTML::Entities::decode_entities($line->{'#text'} // '')); + } + + return join("\n\n", @text); +} + +=head2 get_xml_data($caption_data) + +Get the XML content for a given caption data. + +=cut + +sub get_xml_data { + my ($self, $url) = @_; + + state $lwp = do { + + require LWP::UserAgent; + + my $agent = LWP::UserAgent->new( + timeout => 30, + env_proxy => 1, + agent => + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36', + ); + + require LWP::ConnCache; + state $cache = LWP::ConnCache->new; + $cache->total_capacity(undef); # no limit + + state $accepted_encodings = do { + require HTTP::Message; + HTTP::Message::decodable(); + }; + + $agent->ssl_opts(Timeout => 30); + $agent->default_header('Accept-Encoding' => $accepted_encodings); + $agent->conn_cache($cache); + + $agent; + }; + + my $req = $lwp->get($url); + + if ($req->is_success) { + return $req->decoded_content; + } + + return; +} + +=head2 save_caption($video_ID) + +Save the caption in a .srt file and return its file path. + +=cut + +sub save_caption { + my ($self, $video_id) = @_; + + # Find one of the preferred languages + my $info = $self->find_caption_data() // return; + + require File::Spec; + my $filename = "${video_id}_$info->{languageCode}.srt"; + my $srt_file = File::Spec->catfile($self->{captions_dir} // File::Spec->tmpdir, $filename); + + # Return the srt file if it already exists + return $srt_file if (-e $srt_file); + + # Get XML data, then transform it to SubRip data + my $xml = $self->get_xml_data($info->{baseUrl} // return) // return; + my $srt = $self->xml2srt($xml) // return; + + # Write the SubRib data to the $srt_file + open(my $fh, '>:utf8', $srt_file) or return; + print {$fh} $srt, "\n"; + close $fh; + + # Return the .srt file path + return $srt_file; +} + +=head1 AUTHOR + +Trizen, C<< <echo dHJpemVuQHByb3Rvbm1haWwuY29tCg== | base64 -d> >> + + +=head1 SUPPORT + +You can find documentation for this module with the perldoc command. + + perldoc WWW::FairViewer::GetCaption + + +=head1 LICENSE AND COPYRIGHT + +Copyright 2012-2015 Trizen. + +This program is free software; you can redistribute it and/or modify it +under the terms of either: the GNU General Public License as published +by the Free Software Foundation; or the Artistic License. + +See L<http://dev.perl.org/licenses/> for more information. + +=cut + +1; # End of WWW::FairViewer::GetCaption |