1
#!/usr/bin/perl
2
3
use warnings;
4
use strict;
5
use bigint;
6
7
BEGIN {
8
	my @deps = qw(
9
		Number::Nary
10
		LWP::Simple
11
		Getopt::Long
12
		IO::File
13
	);
14
	foreach my $dep (sort @deps) {
15
		eval "require $dep";
16
		if($@) {
17
			die "Cannot find module dependency $dep - Please install it via cpan\n";
18
		}
19
	}
20
}
21
22
use IO::File;
23
use Number::Nary;
24
use LWP::Simple;
25
use Getopt::Long;
26
27
28
$|++;
29
30
my ($enc, $dec) = n_codec( join("", 0 .. 9, 'a' .. 'z', 'A' .. 'Z') );
31
32
my ($file, $length, $start, $output, $revision_dir);
33
GetOptions(
34
	"file=s"   => \$file,
35
	"length=s" => \$length,
36
	"start=s"  => \$start,
37
	"output=s" => \$output,
38
	"revisions=s" => \$revision_dir,
39
);
40
41
$length ||= 10;
42
$start  ||= 0;
43
$file   ||= "ep_ids.txt";
44
$output ||= "archive";
45
$revision_dir ||= "$output/revisions";
46
47
unless(-d $output) {
48
	die "$output is not a valid directory\n";
49
}
50
51
unless(-d $revision_dir) {
52
	die "$revision_dir is not a valid directory\n";
53
}
54
55
56
my $i = 0;
57
if(-f $file) {
58
	print "Scanning $file for starting point...\n";
59
	my $last;
60
61
	my $fh = IO::File->new($file, "r");
62
	foreach my $line (<$fh>) {
63
		chomp $line;
64
		if( (length $line) && ($line !~ /^\s*$/) ) {
65
			$last = $line;
66
		}
67
	}
68
	$fh->close;
69
	undef $fh;
70
71
	if($last) {
72
		$last =~ s/^- //;
73
		if( $start ) {
74
			print "** Overriding the --start option, using the last recorded value in $file\n";
75
		}
76
		$start = $last;
77
	}
78
}
79
80
print <<EOT;
81
Using input file $file
82
Stopping when length cross $length
83
Starting with id $start
84
Outputting initial versions to $output
85
Outputting revisions to $revision_dir
86
87
EOT
88
89
if($start) {
90
	$i = $dec->($start);
91
}
92
93
$i=$i+0; # this overcomes some perl string/number strangness that angers 32bit perls
94
my $id = $enc->($i++);
95
96
while(length($id) <= $length) {
97
	my $start = "Looking at $id ...";
98
99
	my $c = get("http://etherpad.com/$id");
100
	unless($c) {
101
		until(length $c) {
102
			print "No content returned from http://etherpad.com/$id. Retrying in 1 second\n";
103
			sleep 1;
104
			print "Looking at $id ...";
105
			$c = get("http://etherpad.com/$id");
106
		}
107
	}
108
109
	if(length $c) {
110
		if($c =~ /var clientVars = (\{.+\})/) {
111
			my $js = $1;
112
			write_to_file("$output/$id.js", "$js\n");
113
			print "$start Archived\n";
114
115
			$js =~ /"initialRevisionList":\[(.+?)\]/;
116
			my ($revlist, @revisions);
117
			$revlist = $1;
118
			if($revlist) {
119
				@revisions = $revlist =~ /"id":"(.+?)"/g;
120
			}
121
			if(@revisions) {
122
				foreach my $rev_id (@revisions) {
123
					my $output_file = "$revision_dir/$id-$rev_id.js";
124
					my $rev_start = "** $id Revision: $rev_id ... ";
125
					
126
					if(-f $output_file) {
127
						print "$rev_start Skip\n";
128
					} else {
129
						my $js = grab_clientvars("http://etherpad.com/ep/pad/view/$id/$rev_id");
130
						if($js) {
131
							write_to_file($output_file, $js);
132
							print "$rev_start Archived\n";
133
						} else {
134
							print "$rev_start No content\n";
135
						}
136
					}
137
				}
138
			}
139
		} else {
140
			print "$start No\n";
141
		}
142
	} else {
143
		print "$start No\n";
144
	}
145
146
	write_to_file($file, "$id\n");
147
	$id = $enc->($i++);
148
}
149
150
151
152
#####################
153
sub grab_clientvars {
154
	my $url = shift;
155
	my $content = get($url); 
156
157
	my ($clientvars) = $content =~ /var clientVars = (.+?);\n/ms;
158
159
	return $clientvars if $clientvars;
160
	return;
161
}
162
163
164
sub write_to_file {
165
	my ($file, $text) = @_;
166
	my $fh = IO::File->new($file, '>') 
167
		or die "Could not open $file for writing: $!";
168
	print $fh $text;
169
	undef $fh;
170
	return;
171
}