| 1 |
#!/usr/bin/perl |
| 2 |
|
| 3 |
use warnings; |
| 4 |
use strict; |
| 5 |
use bigint; |
| 6 |
|
| 7 |
BEGIN { |
| 8 |
my @deps = qw( |
| 9 |
Number::Nary |
| 10 |
LWP::Simple |
| 11 |
Getopt::Long |
| 12 |
IO::File |
| 13 |
); |
| 14 |
foreach my $dep (sort @deps) { |
| 15 |
eval "require $dep"; |
| 16 |
if($@) { |
| 17 |
die "Cannot find module dependency $dep - Please install it via cpan\n"; |
| 18 |
} |
| 19 |
} |
| 20 |
} |
| 21 |
|
| 22 |
use IO::File; |
| 23 |
use Number::Nary; |
| 24 |
use LWP::Simple; |
| 25 |
use Getopt::Long; |
| 26 |
|
| 27 |
|
| 28 |
$|++; |
| 29 |
|
| 30 |
my ($enc, $dec) = n_codec( join("", 0 .. 9, 'a' .. 'z', 'A' .. 'Z') ); |
| 31 |
|
| 32 |
my ($file, $length, $start, $output, $revision_dir); |
| 33 |
GetOptions( |
| 34 |
"file=s" => \$file, |
| 35 |
"length=s" => \$length, |
| 36 |
"start=s" => \$start, |
| 37 |
"output=s" => \$output, |
| 38 |
"revisions=s" => \$revision_dir, |
| 39 |
); |
| 40 |
|
| 41 |
$length ||= 10; |
| 42 |
$start ||= 0; |
| 43 |
$file ||= "ep_ids.txt"; |
| 44 |
$output ||= "archive"; |
| 45 |
$revision_dir ||= "$output/revisions"; |
| 46 |
|
| 47 |
unless(-d $output) { |
| 48 |
die "$output is not a valid directory\n"; |
| 49 |
} |
| 50 |
|
| 51 |
unless(-d $revision_dir) { |
| 52 |
die "$revision_dir is not a valid directory\n"; |
| 53 |
} |
| 54 |
|
| 55 |
|
| 56 |
my $i = 0; |
| 57 |
if(-f $file) { |
| 58 |
print "Scanning $file for starting point...\n"; |
| 59 |
my $last; |
| 60 |
|
| 61 |
my $fh = IO::File->new($file, "r"); |
| 62 |
foreach my $line (<$fh>) { |
| 63 |
chomp $line; |
| 64 |
if( (length $line) && ($line !~ /^\s*$/) ) { |
| 65 |
$last = $line; |
| 66 |
} |
| 67 |
} |
| 68 |
$fh->close; |
| 69 |
undef $fh; |
| 70 |
|
| 71 |
if($last) { |
| 72 |
$last =~ s/^- //; |
| 73 |
if( $start ) { |
| 74 |
print "** Overriding the --start option, using the last recorded value in $file\n"; |
| 75 |
} |
| 76 |
$start = $last; |
| 77 |
} |
| 78 |
} |
| 79 |
|
| 80 |
print <<EOT; |
| 81 |
Using input file $file |
| 82 |
Stopping when length cross $length |
| 83 |
Starting with id $start |
| 84 |
Outputting initial versions to $output |
| 85 |
Outputting revisions to $revision_dir |
| 86 |
|
| 87 |
EOT |
| 88 |
|
| 89 |
if($start) { |
| 90 |
$i = $dec->($start); |
| 91 |
} |
| 92 |
|
| 93 |
$i=$i+0; # this overcomes some perl string/number strangness that angers 32bit perls |
| 94 |
my $id = $enc->($i++); |
| 95 |
|
| 96 |
while(length($id) <= $length) { |
| 97 |
my $start = "Looking at $id ..."; |
| 98 |
|
| 99 |
my $c = get("http://etherpad.com/$id"); |
| 100 |
unless($c) { |
| 101 |
until(length $c) { |
| 102 |
print "No content returned from http://etherpad.com/$id. Retrying in 1 second\n"; |
| 103 |
sleep 1; |
| 104 |
print "Looking at $id ..."; |
| 105 |
$c = get("http://etherpad.com/$id"); |
| 106 |
} |
| 107 |
} |
| 108 |
|
| 109 |
if(length $c) { |
| 110 |
if($c =~ /var clientVars = (\{.+\})/) { |
| 111 |
my $js = $1; |
| 112 |
write_to_file("$output/$id.js", "$js\n"); |
| 113 |
print "$start Archived\n"; |
| 114 |
|
| 115 |
$js =~ /"initialRevisionList":\[(.+?)\]/; |
| 116 |
my ($revlist, @revisions); |
| 117 |
$revlist = $1; |
| 118 |
if($revlist) { |
| 119 |
@revisions = $revlist =~ /"id":"(.+?)"/g; |
| 120 |
} |
| 121 |
if(@revisions) { |
| 122 |
foreach my $rev_id (@revisions) { |
| 123 |
my $output_file = "$revision_dir/$id-$rev_id.js"; |
| 124 |
my $rev_start = "** $id Revision: $rev_id ... "; |
| 125 |
|
| 126 |
if(-f $output_file) { |
| 127 |
print "$rev_start Skip\n"; |
| 128 |
} else { |
| 129 |
my $js = grab_clientvars("http://etherpad.com/ep/pad/view/$id/$rev_id"); |
| 130 |
if($js) { |
| 131 |
write_to_file($output_file, $js); |
| 132 |
print "$rev_start Archived\n"; |
| 133 |
} else { |
| 134 |
print "$rev_start No content\n"; |
| 135 |
} |
| 136 |
} |
| 137 |
} |
| 138 |
} |
| 139 |
} else { |
| 140 |
print "$start No\n"; |
| 141 |
} |
| 142 |
} else { |
| 143 |
print "$start No\n"; |
| 144 |
} |
| 145 |
|
| 146 |
write_to_file($file, "$id\n"); |
| 147 |
$id = $enc->($i++); |
| 148 |
} |
| 149 |
|
| 150 |
|
| 151 |
|
| 152 |
##################### |
| 153 |
sub grab_clientvars { |
| 154 |
my $url = shift; |
| 155 |
my $content = get($url); |
| 156 |
|
| 157 |
my ($clientvars) = $content =~ /var clientVars = (.+?);\n/ms; |
| 158 |
|
| 159 |
return $clientvars if $clientvars; |
| 160 |
return; |
| 161 |
} |
| 162 |
|
| 163 |
|
| 164 |
sub write_to_file { |
| 165 |
my ($file, $text) = @_; |
| 166 |
my $fh = IO::File->new($file, '>') |
| 167 |
or die "Could not open $file for writing: $!"; |
| 168 |
print $fh $text; |
| 169 |
undef $fh; |
| 170 |
return; |
| 171 |
} |