aboutsummaryrefslogtreecommitdiffstats
path: root/lib/WWW/StrawViewer/GetCaption.pm
diff options
context:
space:
mode:
Diffstat (limited to 'lib/WWW/StrawViewer/GetCaption.pm')
-rw-r--r--lib/WWW/StrawViewer/GetCaption.pm280
1 files changed, 0 insertions, 280 deletions
diff --git a/lib/WWW/StrawViewer/GetCaption.pm b/lib/WWW/StrawViewer/GetCaption.pm
deleted file mode 100644
index 81af4e2..0000000
--- a/lib/WWW/StrawViewer/GetCaption.pm
+++ /dev/null
@@ -1,280 +0,0 @@
-package WWW::StrawViewer::GetCaption;
-
-use utf8;
-use 5.014;
-use warnings;
-
-=head1 NAME
-
-WWW::StrawViewer::GetCaption - Save the YouTube closed captions as .srt files for a videoID.
-
-=head1 SYNOPSIS
-
- use WWW::StrawViewer::GetCaption;
-
- my $yv_cap = WWW::StrawViewer::GetCaption->new(%opts);
-
- print $yv_cap->get_caption($videoID);
-
-=head1 SUBROUTINES/METHODS
-
-=head2 new(%opts)
-
-Options:
-
-=over 4
-
-=item captions => []
-
-The captions data.
-
-=item captions_dir => "."
-
-Where to save the closed captions.
-
-=item languages => [qw(en es ro jp)]
-
-Preferred languages. First found is saved and returned.
-
-=back
-
-=cut
-
-sub new {
- my ($class, %opts) = @_;
-
- my $self = bless {}, $class;
- $self->{captions_dir} = undef;
- $self->{captions} = [];
- $self->{auto_captions} = 0;
- $self->{languages} = [qw(en es)];
-
- foreach my $key (keys %{$self}) {
- $self->{$key} = delete $opts{$key}
- if exists $opts{$key};
- }
-
- foreach my $invalid_key (keys %opts) {
- warn "Invalid key: '${invalid_key}'";
- }
-
- return $self;
-}
-
-=head2 find_caption_data()
-
-Find a caption data, based on the preferred languages.
-
-=cut
-
-sub find_caption_data {
- my ($self) = @_;
-
- my @found;
- foreach my $caption (@{$self->{captions}}) {
- if (defined $caption->{languageCode}) {
- foreach my $i (0 .. $#{$self->{languages}}) {
- my $lang = $self->{languages}[$i];
- if ($caption->{languageCode} =~ /^\Q$lang\E(?:\z|[_-])/i) {
-
- # Automatic Speech Recognition
- my $auto = defined($caption->{kind}) && lc($caption->{kind}) eq 'asr';
-
- # Check against auto-generated captions
- if ($auto and not $self->{auto_captions}) {
- next;
- }
-
- # Fuzzy match or auto-generated caption
- if (lc($caption->{languageCode}) ne lc($lang) or $auto) {
- $found[$i + (($auto ? 2 : 1) * scalar(@{$self->{languages}}))] = $caption;
- }
-
- # Perfect match
- else {
- $i == 0 and return $caption;
- $found[$i] = $caption;
- }
- }
- }
- }
- }
-
- foreach my $caption (@found) {
- return $caption if defined($caption);
- }
-
- return;
-}
-
-=head2 sec2time(@seconds)
-
-Convert a list of seconds to .srt times.
-
-=cut
-
-sub sec2time {
- my $self = shift;
-
- my @out;
- foreach my $sec (map { sprintf '%.3f', $_ } @_) {
- push @out,
- sprintf('%02d:%02d:%02d,%03d', ($sec / 3600 % 24, $sec / 60 % 60, $sec % 60, substr($sec, index($sec, '.') + 1)));
- }
-
- return @out;
-}
-
-=head2 xml2srt($xml_string)
-
-Convert the XML data to SubRip format.
-
-=cut
-
-sub xml2srt {
- my ($self, $xml) = @_;
-
- require WWW::StrawViewer::ParseXML;
- my $hash = eval { WWW::StrawViewer::ParseXML::xml2hash($xml) } // return;
-
- my $sections;
- if ( exists $hash->{transcript}
- and ref($hash->{transcript}) eq 'ARRAY'
- and ref($hash->{transcript}[0]) eq 'HASH'
- and exists $hash->{transcript}[0]{text}) {
- $sections = $hash->{transcript}[0]{text};
- }
- else {
- return;
- }
-
- require HTML::Entities;
-
- my @text;
- foreach my $i (0 .. $#{$sections}) {
- my $line = $sections->[$i];
-
- if (not defined($line->{'-dur'})) {
- if (exists $sections->[$i + 1]) {
- $line->{'-dur'} = $sections->[$i + 1]{'-start'} - $line->{'-start'};
- }
- else {
- $line->{'-dur'} = 10;
- }
- }
-
- my $start = $line->{'-start'};
- my $end = $start + $line->{'-dur'};
-
- push @text,
- join("\n",
- $i + 1,
- join(' --> ', $self->sec2time($start, $end)),
- HTML::Entities::decode_entities($line->{'#text'} // ''));
- }
-
- return join("\n\n", @text);
-}
-
-=head2 get_xml_data($caption_data)
-
-Get the XML content for a given caption data.
-
-=cut
-
-sub get_xml_data {
- my ($self, $url) = @_;
-
- state $lwp = do {
-
- require LWP::UserAgent;
-
- my $agent = LWP::UserAgent->new(
- timeout => 30,
- env_proxy => 1,
- agent =>
- 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.101 Safari/537.36',
- );
-
- require LWP::ConnCache;
- state $cache = LWP::ConnCache->new;
- $cache->total_capacity(undef); # no limit
-
- state $accepted_encodings = do {
- require HTTP::Message;
- HTTP::Message::decodable();
- };
-
- $agent->ssl_opts(Timeout => 30);
- $agent->default_header('Accept-Encoding' => $accepted_encodings);
- $agent->conn_cache($cache);
-
- $agent;
- };
-
- my $req = $lwp->get($url);
-
- if ($req->is_success) {
- return $req->decoded_content;
- }
-
- return;
-}
-
-=head2 save_caption($video_ID)
-
-Save the caption in a .srt file and return its file path.
-
-=cut
-
-sub save_caption {
- my ($self, $video_id) = @_;
-
- # Find one of the preferred languages
- my $info = $self->find_caption_data() // return;
-
- require File::Spec;
- my $filename = "${video_id}_$info->{languageCode}.srt";
- my $srt_file = File::Spec->catfile($self->{captions_dir} // File::Spec->tmpdir, $filename);
-
- # Return the srt file if it already exists
- return $srt_file if (-e $srt_file);
-
- # Get XML data, then transform it to SubRip data
- my $xml = $self->get_xml_data($info->{baseUrl} // return) // return;
- my $srt = $self->xml2srt($xml) // return;
-
- # Write the SubRib data to the $srt_file
- open(my $fh, '>:utf8', $srt_file) or return;
- print {$fh} $srt, "\n";
- close $fh;
-
- # Return the .srt file path
- return $srt_file;
-}
-
-=head1 AUTHOR
-
-Trizen, C<< <echo dHJpemVuQHByb3Rvbm1haWwuY29tCg== | base64 -d> >>
-
-
-=head1 SUPPORT
-
-You can find documentation for this module with the perldoc command.
-
- perldoc WWW::StrawViewer::GetCaption
-
-
-=head1 LICENSE AND COPYRIGHT
-
-Copyright 2012-2015 Trizen.
-
-This program is free software; you can redistribute it and/or modify it
-under the terms of either: the GNU General Public License as published
-by the Free Software Foundation; or the Artistic License.
-
-See L<http://dev.perl.org/licenses/> for more information.
-
-=cut
-
-1; # End of WWW::StrawViewer::GetCaption