<!-- pre { font-family: monospace; color: #000000; background-color: #e5e5e5; } body { font-family: monospace; color: #000000; background-color: #e5e5e5; } .Special { color: #6a5acd; background-color: #f2f2f2; } .Constant { color: #ff00ff; background-color: #f2f2f2; } .Identifier { color: #008080; } .Statement { color: #804040; font-weight: bold; } .Comment { color: #0000ff; } -->
It's a pity that we can't download subtitles for videos at TED.COM. Some people have written
desktop software and web apps to solve the problem, but these tools no longer work at all.
Maybe TED changed the contents of its web pages, so today I wrote this script in Perl.
It gets the subtitle file from TED.COM and converts it to SRT format (SubRip Subtitle File).
The script just barely works for now; I'll polish it next week. Enjoy it!
Latest version: https://gist.github.com/949659
Executable file(Win32): http://www.tinyurl.com/tedsubtitle
Usage: ted.exe URL languageCode output.srt
################################################################################
# File: ted.pl
# Description: Get a TED talk's subtitles from TED.com and convert them to SRT format (SubRip Subtitle File)
# Usage: ted.pl URL languageCode output.srt
# Executable File(Win32): http://www.tinyurl.com/tedsubtitle
# Create: Thinkhy
# Date: 2011.04.30
# ChangeLog: 1 Add language code. 2011.05.06
#
# LanguageCode Language
# alb Albanian
# ara Arabic
# bul Bulgarian
# chi_hans Chinese (Simplified)
# chi_hant Chinese (Traditional)
# cze Czech
# dut Dutch
# eng English
# est Estonian
# fin Finnish
# fre_fr French (France)
# ger German
# gre Greek
# heb Hebrew
# hun Hungarian
# ita Italian
# kor Korean
# pol Polish
# por_br Portuguese (Brazil)
# rum Romanian
# rus Russian
# scr Croatian
# spa Spanish
# tur Turkish
# ukr Ukrainian
#
################################################################################
##!/usr/local/bin/perl
use strict;
use warnings;
use Data::Dumper;
use JSON;
use LWP::Simple qw(get);
# Main script: download a TED talk page, look up its talk ID, fetch the
# subtitles as JSON and write them to the output file via OutputSrt.

# Advertisement time in milliseconds. OutputSrt adds this offset to every
# caption's start time.
my $durationOfAdv = 16000;

# For offline debugging, a saved response can be loaded instead of downloading:
# my $content = GetContentFromFile("back.json");

# Command-line arguments: talk page URL, subtitle language code and the
# output file (SRT format).
die "Usage: $0 URL languageCode output.srt\n" unless @ARGV == 3;
my ($url, $languageCode, $outputFile) = @ARGV;

# Example talk URL:
# http://www.ted.com/talks/stephen_wolfram_computing_a_theory_of_everything.html

print "Get html content from URL: $url\n";

# First of all, download the talk's web page.
my $html = GetUrl($url);
die "Failed to download talk page: $url\n" unless defined $html;

# Keep a copy of the page in out.html; written before parsing so that it is
# available for inspection when the talk ID cannot be found.
open(my $htmlDump, '>', 'out.html') or die "Cannot write out.html: $!\n";
print {$htmlDump} $html;
close($htmlDump) or die "Cannot close out.html: $!\n";
print "Have got html content from URL: $url\n";

# The talk ID is the first run of digits following "var talkID = " in the page.
my ($talkID) = $html =~ m/var talkID = .*?(\d+)/;
die "Could not find the talkID in the page at $url\n" unless defined $talkID;
print "TalkID: $talkID\n";

# Get subtitle content from TED.COM
my $subtitleUrl = "http://www.ted.com/talks/subtitles/id/$talkID/lang/$languageCode/format/text";
my $content = GetUrl($subtitleUrl);
die "Failed to download subtitles: $subtitleUrl\n" unless defined $content;

# Keep a copy of the raw subtitle response in out.json for debugging.
open(my $jsonDump, '>', 'out.json') or die "Cannot write out.json: $!\n";
print {$jsonDump} $content;
close($jsonDump) or die "Cannot close out.json: $!\n";

# Decode JSON text; decode() dies on malformed input, so report it clearly.
my $obj = eval { JSON->new->decode($content) };
die "Subtitle response is not valid JSON: $@" if $@;

my $captions = ref($obj) eq 'HASH' ? $obj->{captions} : undef;
die "No captions found for talk $talkID (language $languageCode)\n"
    unless ref($captions) eq 'ARRAY';

# SRT stays a bareword handle on purpose: OutputSrt prints to it by name.
# NOTE(review): no encoding layer is set on this handle; non-ASCII subtitles
# may trigger "Wide character" warnings depending on what GetUrl returns.
open(SRT, '>', $outputFile) or die "Cannot write $outputFile: $!\n";

my $len = scalar @{$captions};
print "Number of captions: $len\n";

# SRT entries are numbered from 1, captions are indexed from 0.
for my $cnt (0 .. $#{$captions})
{
    my $caption = $captions->[$cnt];
    OutputSrt(1 + $cnt, $caption->{startTime}, $caption->{duration}, $caption->{content});
}

close(SRT) or die "Cannot close $outputFile: $!\n";
###########################################################
# Sub Functions
###########################################################
# Break a time given in milliseconds into its clock components.
#   $milliseconds - the time to split, in milliseconds
# Returns the list (hours, minutes, seconds, milliseconds).
sub GetTime
{
    my ($milliseconds) = @_;

    # Whole seconds contained in the input; the sub-second part is dropped.
    my $wholeSeconds = int($milliseconds / 1000);

    my $hour   = int($milliseconds / 1000 / 3600);
    my $minute = int(($wholeSeconds % 3600) / 60);
    my $second = $wholeSeconds - $hour * 3600 - $minute * 60;

    # Whatever is left after removing the full h/m/s part is the millisecond field.
    my $accountedSeconds = $hour * 3600 + $minute * 60 + $second;
    my $msecond          = $milliseconds - $accountedSeconds * 1000;

    return ($hour, $minute, $second, $msecond);
}
# Write one caption to the SRT filehandle as a SubRip entry:
#
#   <order number>
#   HH:MM:SS,mmm --> HH:MM:SS,mmm
#   <subtitle text>
#   <blank line>
#
#   $orderNum  - sequence number of the entry
#   $startTime - caption start time in milliseconds
#   $duration  - caption duration in milliseconds
#   $subtitle  - caption text
#
# Reads the file-level $durationOfAdv and prints to the file-level SRT handle,
# which the caller must have opened for writing.
sub OutputSrt
{
    my ($orderNum, $startTime, $duration, $subtitle) = @_;

    # plus the duration of advertisement
    $startTime += $durationOfAdv;

    # Calculate endTime by duration
    my $endTime = $startTime + $duration;

    # SubRip timestamps are zero-padded (HH:MM:SS,mmm). Each timestamp takes all
    # four fields from its own GetTime call, so the end time carries its own
    # milliseconds rather than those of the start time.
    my $timeFormat = '%02d:%02d:%02d,%03d';
    my $begin      = sprintf($timeFormat, GetTime($startTime));
    my $end        = sprintf($timeFormat, GetTime($endTime));

    print SRT "$orderNum\n";         # order number
    print SRT "$begin --> $end\n";   # begin time, delimiter, end time
    print SRT "$subtitle\n\n";       # subtitle text, then the blank separator line
    return;
}
# Read a whole file into a string.
#   $file - path of the file to read
# Returns the file's contents ('' for an empty file). If the file cannot be
# opened, a warning is emitted and undef is returned.
sub GetContentFromFile
{
    my $file = shift;

    # Three-argument open with a lexical handle: the file name is never parsed
    # for mode characters, and the handle is released when it goes out of scope.
    my $fh;
    unless (open($fh, '<', $file))
    {
        warn "Cannot open $file: $!\n";
        return;
    }

    # Slurp the whole file in a single read by locally undefining $/.
    my $content = do { local $/; <$fh> };
    close($fh);

    return $content;
}
# Fetch the document at the given URL with LWP::Simple's get() and hand back
# whatever get() returned.
# Test URL: http://www.ted.com/talks/subtitles/id/1130/lang/eng/format/text
sub GetUrl
{
    my ($address) = @_;

    # Force scalar context so exactly one value is returned to the caller.
    return scalar get($address);
}