| a18bfc9 by Aurélien Bompard at 2010-04-11 |
1 |
#!/usr/bin/env python |
|
2 |
# vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent: |
|
3 |
u""" |
| 7581434 by Aurélien Bompard at 2011-04-03 |
4 |
|
|
5 |
RSS Mirror |
|
6 |
---------- |
|
7 |
|
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
8 |
Mirrors on the local disk the pages listed in an RSS feed, using Wget or |
|
9 |
HTTrack. |
|
10 |
|
| df41b86 by Aurélien Bompard at 2011-10-14 |
11 |
Requires Python >= 2.6 |
|
12 |
|
| c19ca33 by Aurélien Bompard at 2012-01-04 |
13 |
|
|
14 |
Configuration file |
|
15 |
~~~~~~~~~~~~~~~~~~ |
|
16 |
|
|
17 |
RSS-mirror uses a configuration file to list the RSS feeds that should be |
|
18 |
downloaded. This file must be placed in ``~/.config/rss-mirror.conf`` and is in |
|
19 |
INI format. Example:: |
|
20 |
|
|
21 |
[DEFAULT] |
|
22 |
output = ~/pda/webpages |
|
23 |
|
|
24 |
[owni] |
|
25 |
url = http://owni.fr/feed |
|
26 |
|
|
27 |
[zenhabits] |
|
28 |
url = http://zenhabits.net/feed |
|
29 |
|
|
30 |
[rue89-ecologie] |
|
31 |
url = http://www.rue89.com/tag/ecologie/feed |
|
32 |
title = Rue89 - Ecologie |
|
33 |
|
|
34 |
The ``DEFAULT`` section has an ``output`` key pointing to the output directory |
|
35 |
where the webpages will be downloaded. |
|
36 |
|
|
37 |
Each section (except DEFAULT) is a feed to download. It has a ``url`` key which |
|
38 |
is self-explanatory and an optional ``title`` key which will be used as a title |
|
39 |
for the feed in the summary page. |
|
40 |
|
|
41 |
|
|
42 |
Credits |
|
43 |
~~~~~~~ |
|
44 |
|
| 7581434 by Aurélien Bompard at 2011-04-03 |
45 |
.. :Authors: |
|
46 |
Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org> |
|
47 |
|
|
48 |
.. :License: |
|
49 |
GNU GPL v3 or later |
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
50 |
|
|
51 |
""" |
|
52 |
|
|
53 |
import os |
|
54 |
import sys |
| f31a5e8 by Aurélien Bompard at 2010-05-09 |
55 |
import urllib |
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
56 |
import urllib2 |
|
57 |
import re |
|
58 |
import shutil |
|
59 |
import optparse |
|
60 |
import datetime |
|
61 |
import time |
| df41b86 by Aurélien Bompard at 2011-10-14 |
62 |
import itertools |
| 984214f by Aurélien Bompard at 2011-04-13 |
63 |
import xml.etree.ElementTree as etree |
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
64 |
from urlparse import urlparse |
| ec63db5 by Aurélien Bompard at 2011-10-02 |
65 |
from subprocess import call |
| df41b86 by Aurélien Bompard at 2011-10-14 |
66 |
from zipfile import ZipFile, BadZipfile |
|
67 |
from ConfigParser import SafeConfigParser |
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
68 |
|
| 7581434 by Aurélien Bompard at 2011-04-03 |
69 |
|
| 6a3dead by Aurélien Bompard at 2011-10-14 |
70 |
# Release of the iUI toolkit that Repository.download_iui() fetches.
IUI_VERSION = "0.40-alpha1"
# Configuration file used when no -c/--config option is given.
CONFIG_PATH = "~/.config/rss-mirror.conf"
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
72 |
|
| 7581434 by Aurélien Bompard at 2011-04-03 |
73 |
|
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
74 |
class Downloader(object):
    """
    Abstract base class of the download engines.

    Subclasses provide a ``name`` attribute and implement both methods below.

    :cvar return_codes_ok: non-zero exit codes that still count as success
    :type return_codes_ok: ``list``
    """

    return_codes_ok = []

    def get_command(self, destdir, url, options=None):
        """Build the system command (argument list) that mirrors ``url``."""
        raise NotImplementedError()

    def get_start_path(self, basedir, page):
        """Give the on-disk path of the downloaded page."""
        raise NotImplementedError()
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
91 |
|
| 7581434 by Aurélien Bompard at 2011-04-03 |
92 |
|
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
93 |
class HttrackDownloader(Downloader):
    """
    Download using httrack. More features than wget, but it has some bugs,
    like downloading CSS stylesheets in ``@import`` constructs.
    """

    name = "httrack"
    opts = [
        "-%l", "fr",  # language
        "-Y",   # mirror links
        "-C0",  # no cache
        "-b0",  # no cookies
        "-n",   # download "near" files
        "-L0",  # DOS-compatible file names
        "-d",   # stay on the same domain
        "-x",   # replace external links by error page
        "-%u",  # url hacks: various hacks to limit duplicate URLs
        "-F", "rss-mirror (allow like Gecko)",  # user-agent
    ]

    def __init__(self):
        """Read the recursion depth from the global ``config``."""
        super(HttrackDownloader, self).__init__()
        # Work on a per-instance copy: appending to the class-level list
        # would add one more "-rN" each time a downloader is instantiated.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.append("-r%d" % recursive)

    def get_command(self, destdir, url, options=None):
        """
        Return the httrack argument list mirroring ``url`` into ``destdir``.

        ``options`` is an optional list of extra switches, inserted after the
        engine's own options.
        """
        command = ["httrack"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-O", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the redirection target found in ``<basedir>/<title>/index.html``.

        :raises IOError: if the index file can't be read or contains no
            meta-refresh redirection.
        """
        index_path = os.path.join(basedir, title, "index.html")
        with open(index_path) as indexfile:
            content = indexfile.read()
        mo = re.search('<meta HTTP-EQUIV="Refresh" CONTENT="0; URL=(.*)">',
                       content)
        if mo is None:
            # Same exception type as a missing file, so that callers which
            # already handle IOError skip the page instead of crashing.
            raise IOError("No redirection found in %s" % index_path)
        return mo.group(1)
|
133 |
|
| 7581434 by Aurélien Bompard at 2011-04-03 |
134 |
|
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
135 |
class WgetDownloader(Downloader):
    """
    Download using wget. Simple and fast.

    I use the ``-nd`` switch to avoid creating the whole directory structure
    mirroring the website structure, because the FAT32 filesystem does not like
    very very long names.
    """

    name = "wget"
    opts = [
        "-nv",  # non verbose
        "-k",   # convert links
        "-p",   # download needed files for the page
        "-N",   # timestamping
        "--restrict-file-names=windows,ascii,lowercase",
        "-E",   # adjust extension
        "-H",   # allow going on a different domain
        "--timeout=15",  # it's 900 by default...
        "--tries=2",     # it's 20 by default...
        "-nd",  # avoid having 255+ chars paths
        "--no-check-certificate",  # SSL
        # User-agent: try to get the mobile version of the page
        "-U", ("Mozilla/5.0 (Linux; U; Android 2.2; en-us; rss-mirror) "
               "AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile "
               "Safari/533.1"),
    ]
    return_codes_ok = [3, 4, 8]
    # 3: I/O error, usually because the filename is too long
    # 4: Network error (broken link on the page)
    # 8: Server issued error response (broken link on the page)

    def __init__(self):
        """Read the recursion depth from the global ``config``."""
        super(WgetDownloader, self).__init__()
        # Work on a per-instance copy: extending the class-level list would
        # add the recursion switches again for every new instance.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.extend(["-r", "-l%d" % recursive])

    def get_command(self, destdir, url, options=None):
        """
        Return the wget argument list mirroring ``url`` into ``destdir``.

        ``options`` is an optional list of extra switches, inserted after the
        engine's own options.
        """
        command = ["wget"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-P", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the path of the start page, relative to ``<basedir>/<title>``.

        The URL is read back from the ``url.txt`` file of the page directory.
        Returns ``"."`` when no matching file is found on the disk.

        :raises IOError: if ``url.txt`` can't be read.
        """
        with open(os.path.join(basedir, title, "url.txt")) as urlfile:
            url = urlfile.read().strip()
        if url.endswith("/"):
            url += "index.html"
        # "-nd" (no directories) is the switch that flattens the layout.
        if "-nd" in self.opts:
            local_path = self.get_start_path_nodirs(basedir, url)
        else:
            if os.path.exists(os.path.join(basedir, title, "index.html")):
                # downloaded with httrack
                httrack_dl = HttrackDownloader()
                return httrack_dl.get_start_path(basedir, title)
            local_path = self.get_start_path_dirs(basedir, url)
        candidate = os.path.join(basedir, title, local_path)
        if os.path.isfile(candidate):
            return local_path
        # wget's -E switch may have appended an extension
        if os.path.isfile(candidate + ".html"):
            return local_path + ".html"
        sys.stderr.write("Can't find the start page: %s\n" % local_path)
        sys.stderr.write("Tried: %s %s\n" % (candidate, candidate + ".html"))
        return "."

    def get_start_path_nodirs(self, basedir, url):
        """
        Return the file name expected for ``url`` in a flat download directory:
        the lowercased last path component (``index.html`` when empty),
        followed by ``@`` and the lowercased query string if there is one.
        """
        url_parsed = urlparse(url)
        local_path = os.path.basename(url_parsed[2]).lower()
        if not local_path:
            local_path = "index.html"
        if url_parsed[4]:
            local_path += "@" + url_parsed[4].lower().replace("/", "%2f")
        return local_path

    def get_start_path_dirs(self, basedir, url):
        """
        Return the path expected for ``url`` when the directory structure is
        kept: the lowercased URL without its scheme, ``?`` replaced by ``@``.
        """
        without_scheme = re.sub(r"(?i)^https?://", "", url)
        return without_scheme.lower().replace("?", "@")
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
217 |
|
|
218 |
|
| fa99e26 by Aurélien Bompard at 2010-04-13 |
219 |
def get_engines():
    """Map the ``name`` of every direct Downloader subclass to the class."""
    return dict((engine.name, engine)
                for engine in Downloader.__subclasses__())
|
224 |
|
| df41b86 by Aurélien Bompard at 2011-10-14 |
225 |
|
| 984214f by Aurélien Bompard at 2011-04-13 |
226 |
def extract_options(desc):
    """
    Collect the extra downloader switches embedded in a description.

    Switches are written as ``{options: -a -b}``; several such groups may
    appear and their whitespace-separated words are returned in order.

    :param desc: text of the feed item description
    :returns: list of option strings (empty when there is no group)
    """
    opts = []
    for inner_options in re.findall(r"\{options:\s+([^}]+)\}", desc):
        # split() without argument never yields empty or padded words
        opts.extend(inner_options.split())
    return opts
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
237 |
|
|
238 |
|
| df41b86 by Aurélien Bompard at 2011-10-14 |
239 |
class Page(object):
    """
    A page listed in an RSS feed.

    - ``title`` is a shortened title derived from the page title, also used
      as the name of the download directory,
    - ``link`` is the URL,
    - ``title_full`` is the HTML page title,
    - ``timestamp`` is the UNIX timestamp of the page in the RSS feed, which is
      probably the moment you bookmarked it,
    - ``description`` is the item description without its HTML tags,
    - ``options`` is the list of extra downloader switches found in the
      description.
    """

    # Despite its name, this matches the characters *removed* from titles.
    allowed_chars = re.compile("[^a-zA-Z0-9_-]")
    desc_cleanup = re.compile("<[^>]+>")

    @classmethod
    def parse(cls, item):
        """
        Build a page from an RSS ``<item>`` element.

        The maximum title length is read from the global ``config``.
        """
        titlesize = config.getint("DEFAULT", "title_size")
        page = cls()
        page.title_full = item.findtext("title").strip()
        short_title = page.title_full[:titlesize].strip().lower()
        page.title = cls.allowed_chars.sub("", short_title.replace(" ", "_"))
        page.link = item.findtext("link").strip()
        page.timestamp = cls._parse_timestamp(item.findtext("pubDate"))
        description = item.findtext("description", "")
        page.description = cls.desc_cleanup.sub("", description)
        page.options = extract_options(page.description)
        return page

    @staticmethod
    def _parse_timestamp(pub_date):
        """
        Convert an RFC 822 date string to a UNIX timestamp.

        Falls back to the current time when the date is missing or invalid.
        """
        from email.utils import parsedate_tz, mktime_tz
        try:
            parsed = parsedate_tz(pub_date) if pub_date else None
            if parsed is not None:
                return int(mktime_tz(parsed))
        except (TypeError, ValueError, OverflowError):
            pass
        return int(time.time())

    def download(self, outdir, downloader):
        """
        Use the downloader to mirror the page

        The page goes into ``<outdir>/<title>``, along with ``url.txt``,
        ``title.txt`` and ``timestamp.txt`` files. Nothing is done when that
        directory already exists. In debug mode the command is only printed.
        """
        destdir = os.path.join(outdir, self.title)
        if os.path.exists(destdir):
            feedname = os.path.basename(outdir)
            print("Already downloaded: %s"
                  % os.path.join(feedname, self.title))
            return
        print("Downloading %s %s" % (self.title, self.link))
        try:
            command = downloader.get_command(destdir, self.link,
                                             options=self.options)
            print(" ".join(command))
            if config.getboolean("DEFAULT", "debug"):
                # Dry run: nothing was downloaded, so there is no directory
                # to store the metadata files in.
                return
            retcode = call(command)
            if retcode < 0:
                print("")
                sys.stderr.write("Child was terminated by signal %s\n"
                                 % -retcode)
                return
            if retcode != 0 and retcode not in downloader.return_codes_ok:
                print("")
                sys.stderr.write("Something went wrong while downloading "
                                 "%s(%s)\n" % (self.title, self.link))
                sys.stderr.write("Return code: %s\n" % retcode)
                return
        except OSError as e:
            print("")
            sys.stderr.write("Execution failed: %s\n" % e)
            return
        except KeyboardInterrupt:
            # to avoid partial downloads
            print("Removing downloaded dir in 1 sec...")
            time.sleep(1)
            # the directory may not have been created yet
            shutil.rmtree(destdir, ignore_errors=True)
            return
        if not os.path.isdir(destdir):
            # Tolerated return codes don't guarantee anything was saved.
            sys.stderr.write("Nothing was downloaded for %s(%s)\n"
                             % (self.title, self.link))
            return
        # Backup the URL in the url.txt file
        with open(os.path.join(destdir, "url.txt"), "w") as link_file:
            link_file.write(self.link)
        # Backup the HTML title in the title.txt file
        with open(os.path.join(destdir, "title.txt"), "w") as title_file:
            try:
                title_file.write(unicode(self.title_full).encode("utf-8"))
            except UnicodeError:
                # fall back to the ASCII-only short title
                title_file.write(self.title)
        # Backup the timestamp in the timestamp.txt file
        with open(os.path.join(destdir, "timestamp.txt"), "w") as ts_file:
            ts_file.write(str(self.timestamp))
        print("")
        time.sleep(1)  # Can't remember why this was necessary... FIXME
|
325 |
|
|
326 |
|
|
327 |
|
|
328 |
class Feed(object):
    """
    An RSS feed to mirror.

    - ``name`` is the name of the feed's section in the configuration,
    - ``url`` is the address of the RSS document,
    - ``title`` is the display title, known once the feed is parsed,
    - ``pages`` is the list of pages found in the feed.
    """

    def __init__(self, name, url):
        self.name = name
        self.url = url
        self.title = None
        self.pages = []

    def parse(self):
        """
        Read the RSS feed and store its title and its pages to mirror.
        """
        content = urllib2.urlopen(self.url)
        try:
            feed = etree.parse(content)
        finally:
            # don't keep the HTTP connection open
            content.close()
        self.title = self.get_title(feed)
        self.pages = [Page.parse(item) for item in feed.findall(".//item")]

    def get_title(self, feed):
        """
        Return the ``title`` key of the feed's section in the global
        ``config``, else the channel title of the parsed feed, else the
        feed name.
        """
        if config.has_option(self.name, "title"):
            return config.get(self.name, "title")
        return feed.findtext("channel/title") or self.name
|
355 |
|
|
356 |
|
|
357 |
class Repository(object):
    """
    A folder containing mirrored pages

    ``path`` is the output directory (``~`` is expanded) and ``feeds`` is the
    list of feeds mirrored in it, one sub-directory per feed name.
    """

    def __init__(self, path, feeds):
        self.path = os.path.expanduser(path)
        self.feeds = feeds

    @staticmethod
    def _html_text(text):
        """Return ``text`` as a byte string safe to embed in the HTML index."""
        if not isinstance(text, str):
            # Python 2 ``unicode``: the index file is written as UTF-8
            text = text.encode("utf-8")
        return (text.replace("&", "&amp;").replace("<", "&lt;")
                .replace(">", "&gt;").replace('"', "&quot;"))

    def _collect_start_files(self, downloader):
        """Map each feed to the (full title, relative URL) of its pages."""
        startfiles = {}
        for feed in self.feeds:
            entries = startfiles[feed] = []
            destdir = os.path.join(self.path, feed.name)
            for page in feed.pages:
                if not os.path.exists(os.path.join(destdir, page.title)):
                    continue
                try:
                    local_path = downloader.get_start_path(destdir, page.title)
                except IOError:
                    # no url.txt file, something went wrong
                    sys.stderr.write("Can't find the url.txt file for %s\n"
                                     % page.title)
                    continue
                entries.append(
                    (page.title_full,
                     "/".join([feed.name, page.title, local_path])))
        return startfiles

    def make_index(self, downloader):
        """
        Build the HTML index of the mirrored pages

        Writes ``index.html`` in the repository: a list of feeds (when there
        is more than one) and, for each feed, the list of its downloaded
        pages. ``downloader`` is used to locate the start file of each page.
        """
        escape = self._html_text
        startfiles = self._collect_start_files(downloader)
        ordered_feeds = sorted(startfiles, key=lambda f: f.name)
        with open(os.path.join(self.path, "index.html"), "w") as mainindex:
            mainindex.write("""<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/>
<title>Mirrored pages</title>
<link rel="stylesheet" href="_iui/iui.css" type="text/css" />
<link rel="stylesheet" href="_iui/t/default/default-theme.css" type="text/css"/>
<script type="application/x-javascript" src="_iui/iui.js"></script>
</head>
<body>

<div class="toolbar">
<h1 id="pageTitle"></h1>
<a id="backButton" class="button" href="#"></a>
</div>
""")
            if len(startfiles) > 1:
                mainindex.write("""<ul id="index" title="Mirrored pages" selected="true">\n""")
                for feed in ordered_feeds:
                    mainindex.write(
                        """ <li><a href="#%(name)s">%(title)s</a></li>\n"""
                        % {"name": escape(feed.name),
                           "title": escape(feed.title)})
                mainindex.write("</ul>\n\n")

            for feed in ordered_feeds:
                mainindex.write(
                    """<ul id="%(name)s" title="%(title)s">\n"""
                    % {"name": escape(feed.name),
                       "title": escape(feed.title)})
                for title, index in startfiles[feed]:
                    mainindex.write(
                        """ <li><a href="%s" target="_webapp">%s</a></li>\n"""
                        % (urllib.quote(index.encode("utf-8")),
                           escape(title)))
                mainindex.write("</ul>\n\n")
            mainindex.write("""
</body>
</html>
""")

    def cleanup(self):
        """
        Remove mirrored pages which are not in the feed anymore (probably
        because you read them)

        Also lowercases the directory names and downloads iUI when the
        ``_iui`` folder is missing. In debug mode nothing is removed.
        """
        dirs_to_remove = self._get_old_feed_dirs()
        for feed in self.feeds:
            dirs_to_remove.extend(self._get_old_page_dirs(feed))
        debug = config.getboolean("DEFAULT", "debug") if dirs_to_remove \
            else False
        for dirname in dirs_to_remove:
            print("Removing %s" % dirname)
            if not debug:
                shutil.rmtree(os.path.join(self.path, dirname))
        self.lowercase_dirs()
        if "_iui" not in os.listdir(self.path):
            self.download_iui()

    def _get_old_feed_dirs(self):
        """Return the top-level directories matching no current feed."""
        feed_names = set(f.name for f in self.feeds)
        dirs = []
        for feeddirname in os.listdir(self.path):
            if feeddirname.startswith(".") or feeddirname == "_iui":
                continue
            if not os.path.isdir(os.path.join(self.path, feeddirname)):
                continue  # like "index.html" for example
            if feeddirname not in feed_names:
                dirs.append(feeddirname)
        return dirs

    def _get_old_page_dirs(self, feed):
        """
        Return the page directories of ``feed`` (relative to the repository)
        which match none of the pages currently in the feed.
        """
        feed_dir = os.path.join(self.path, feed.name)
        if not os.path.isdir(feed_dir):
            return []
        titles = set(p.title for p in feed.pages)
        return [os.path.join(feed.name, dirname)
                for dirname in os.listdir(feed_dir)
                if not dirname.startswith(".") and dirname not in titles]

    def lowercase_dirs(self):
        """For FAT32 transparency"""
        for feed in self.feeds:
            # bottom-up, so children are renamed before their parents
            for root, dirs, _files in os.walk(
                    os.path.join(self.path, feed.name), topdown=False):
                for name in dirs:
                    newname = name.lower()
                    if name == newname:
                        continue
                    # ``root`` already starts with the repository path
                    dest = os.path.join(root, newname)
                    if os.path.exists(dest):
                        continue
                    os.rename(os.path.join(root, name), dest)

    def _fetch_and_unzip(self, url):
        """Download the zip archive at ``url`` and unpack it in the repository."""
        filename, _headers = urllib.urlretrieve(url)
        try:
            # ZipFile is only a context manager from Python 2.7 on
            archive = ZipFile(filename, "r")
            try:
                archive.extractall(self.path)
            finally:
                archive.close()
        finally:
            os.remove(filename)

    def download_iui(self):
        """
        Download iUI and install it as the ``_iui`` folder of the repository.

        On failure, instructions for a manual install are printed on stderr.
        """
        sys.stdout.write("Downloading iUI... ")
        sys.stdout.flush()
        iui_url = "http://iui.googlecode.com/files/iui-%s.zip" % IUI_VERSION
        unpacked = os.path.join(self.path, "iui-%s" % IUI_VERSION)
        try:
            self._fetch_and_unzip(iui_url)
            os.rename(os.path.join(unpacked, "web-app", "iui"),
                      os.path.join(self.path, "_iui"))
            shutil.rmtree(unpacked)
        except (IOError, OSError, BadZipfile):
            print("FAILED.")
            sys.stderr.write("WARNING, could not download or unzip iUI\n")
            from formatter import DumbWriter
            dw = DumbWriter(sys.stderr)  # not really necessary, but fun :)
            dw.send_flowing_data("You must download iUI from %s, unpack it, "
                                 "rename the folder to \"_iui\", and put it in the "
                                 "destination directory." % iui_url)
            print("")
            return
        print("done.")

    def download_jqm(self):
        """
        Download jQuery Mobile and jQuery into the ``_jqm`` folder.

        NOTE(review): JQM_VERSION and JQ_VERSION are not defined in the part
        of the file visible here and nothing visible calls this method;
        confirm before using it.
        """
        sys.stdout.write("Downloading jQuery Mobile... ")
        sys.stdout.flush()
        jqm_url = ("http://code.jquery.com/mobile/%(ver)s/jquery.mobile-%(ver)s.zip"
                   % {"ver": JQM_VERSION})
        jq_url = "http://code.jquery.com/jquery-%s.min.js" % JQ_VERSION
        try:
            self._fetch_and_unzip(jqm_url)
            os.rename(os.path.join(self.path, "jquery.mobile-%s" % JQM_VERSION),
                      os.path.join(self.path, "_jqm"))
            urllib.urlretrieve(jq_url, os.path.join(self.path, "_jqm",
                                                    os.path.basename(jq_url)))
        except (IOError, OSError, BadZipfile):
            print("FAILED.")
            sys.stderr.write("WARNING, could not download or unzip "
                             "jQuery Mobile.\n")
            from formatter import DumbWriter
            dw = DumbWriter(sys.stderr)  # not really necessary, but fun :)
            dw.send_flowing_data("You must download jQuery Mobile from "
                                 "%(jqmurl)s, unpack it, rename the folder to \"_jqm\", "
                                 "put it in the destination directory, then download "
                                 "jQuery from %(jqurl)s, and put it in the same folder."
                                 % {"jqmurl": jqm_url, "jqurl": jq_url})
            print("")
            return
        print("done.")
|
533 |
|
| 6d7a295 by Aurélien Bompard at 2010-05-14 |
534 |
|
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
535 |
def parse_opts():
    """
    Command-line options

    Returns the parsed options. Exits after listing the engines when
    ``--list-engines`` is given, and exits with an error when no usable
    configuration file is found or when positional arguments are given.
    """
    usage = "usage: %prog -c <config file>"
    parser = optparse.OptionParser(usage)
    parser.add_option("-c", "--config", help="Configuration file")
    parser.add_option("-o", "--output", dest="output", metavar="DIR",
                      help="Output directory (will be purged !)")
    parser.add_option("--list-engines", dest="lse", action="store_true",
                      help="List available engines and exit")
    # No optparse default here: the effective default is set when the
    # configuration is loaded, so "%default" would misleadingly show "none".
    parser.add_option("-r", "--recursive", dest="recursive",
                      type="int", metavar="DEPTH", help="Download linked "
                      "pages until this depth. Be careful with that. "
                      "(Default: taken from the configuration file, else 0)")
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
                      help="Debug mode")
    options, args = parser.parse_args()
    if options.lse:
        engines = get_engines()
        print("\n".join(engines.keys()))
        sys.exit()
    if not options.config:
        if os.path.exists(os.path.expanduser(CONFIG_PATH)):
            options.config = CONFIG_PATH
        else:
            parser.error("You must provide a configuration file (or put it "
                         "in %s)" % CONFIG_PATH)
    if not os.path.exists(os.path.expanduser(options.config)):
        parser.error("Unable to find the configuration file: %s"
                     % options.config)
    if args:
        parser.error("No arguments allowed")
    return options
|
567 |
|
|
568 |
|
|
569 |
def get_feeds(config):
    """
    Build a Feed for every configuration section that has a ``url`` key.

    Sections without that key are ignored.
    """
    return [Feed(section, config.get(section, "url"))
            for section in config.sections()
            if config.has_option(section, "url")]
|
577 |
|
|
578 |
|
|
579 |
def choose_engine(config):
    """Instantiate the downloader named by the ``engine`` configuration key."""
    engine_class = get_engines()[config.get("DEFAULT", "engine")]
    return engine_class()
|
583 |
|
|
584 |
|
|
585 |
def get_config(options):
    """
    Load the configuration file and apply the command-line overrides.

    ``options`` provides the ``config`` (path), ``output``, ``debug`` and
    ``recursive`` attributes. Exits with status 1 when no output directory
    is configured or when it is not a directory.

    :returns: the parser, with ``title_size``, ``engine`` and ``recursive``
        defaults and the ``output``, ``debug`` and ``recursive`` overrides
        stored in the ``DEFAULT`` section.
    """
    # TODO: create config
    config = SafeConfigParser({"title_size": "50", "engine": "wget",
                               "recursive": "0"})
    config.read(os.path.expanduser(options.config))
    if options.output is not None:
        config.set("DEFAULT", "output", options.output)
    if not config.has_option("DEFAULT", "output"):
        sys.stderr.write("Config file should have an 'output' variable\n")
        sys.exit(1)
    if not os.path.isdir(os.path.expanduser(config.get("DEFAULT", "output"))):
        sys.stderr.write("The output path must be a directory\n")
        sys.exit(1)
    config.set("DEFAULT", "debug", str(bool(options.debug)))
    if options.recursive is not None:
        # the parser only accepts string values; optparse gives an int
        config.set("DEFAULT", "recursive", str(options.recursive))
    return config
|
602 |
|
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
603 |
|
|
604 |
def main():
    """
    The fun starts here

    Loads the configuration into the global ``config``, parses every feed,
    downloads the pages, then rebuilds the index and cleans the repository.
    """
    global config
    options = parse_opts()
    config = get_config(options)
    downloader = choose_engine(config)

    feeds = []
    for feed in get_feeds(config):
        try:
            feed.parse()
        except urllib2.URLError as e:
            # URLError also covers HTTPError (its subclass) as well as
            # network-level failures such as an unreachable host.
            sys.stderr.write("Failed downloading %s: %s\n" % (feed.url, e))
        else:
            feeds.append(feed)

    # NOTE(review): feeds which failed to download are not passed to the
    # repository, so cleanup() sees their directories as belonging to no
    # feed -- confirm that removing them is intended.
    repo = Repository(config.get("DEFAULT", "output"), feeds)

    for feed in feeds:
        outdir = os.path.join(repo.path, feed.name)
        for page in feed.pages:
            page.download(outdir, downloader)

    repo.make_index(downloader)
    repo.cleanup()
|
628 |
|
| a18bfc9 by Aurélien Bompard at 2010-04-11 |
629 |
|
|
630 |
|
|
631 |
# Only run the mirroring when executed as a script, not when imported.
if __name__ == "__main__":
    main()