| 1 |
#!/usr/bin/env python |
| 2 |
# vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent: |
| 3 |
u""" |
| 4 |
|
| 5 |
RSS Mirror |
| 6 |
---------- |
| 7 |
|
| 8 |
Mirrors on the local disk the pages listed in an RSS feed, using Wget or |
| 9 |
HTTrack. |
| 10 |
|
| 11 |
Requires Python >= 2.6 |
| 12 |
|
| 13 |
|
| 14 |
Configuration file |
| 15 |
~~~~~~~~~~~~~~~~~~ |
| 16 |
|
| 17 |
RSS-mirror uses a configuration file to list the RSS feeds that should be
downloaded. By default this file is read from ``~/.config/rss-mirror.conf``
(use the ``-c`` option to point to another file). It is in INI format. Example::
| 20 |
|
| 21 |
[DEFAULT] |
| 22 |
output = ~/pda/webpages |
| 23 |
|
| 24 |
[owni] |
| 25 |
url = http://owni.fr/feed |
| 26 |
|
| 27 |
[zenhabits] |
| 28 |
url = http://zenhabits.net/feed |
| 29 |
|
| 30 |
[rue89-ecologie] |
| 31 |
url = http://www.rue89.com/tag/ecologie/feed |
| 32 |
title = Rue89 - Ecologie |
| 33 |
|
| 34 |
The ``DEFAULT`` section has an ``output`` key pointing to the output directory |
| 35 |
where the webpages will be downloaded. |
| 36 |
|
| 37 |
Each section (except DEFAULT) is a feed to download. It has a ``url`` key which |
| 38 |
is self-explanatory and an optional ``title`` key which will be used as a title |
| 39 |
for the feed in the summary page. |
| 40 |
|
| 41 |
|
| 42 |
Credits |
| 43 |
~~~~~~~ |
| 44 |
|
| 45 |
.. :Authors: |
| 46 |
Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org> |
| 47 |
|
| 48 |
.. :License: |
| 49 |
GNU GPL v3 or later |
| 50 |
|
| 51 |
""" |
| 52 |
|
| 53 |
import os |
| 54 |
import sys |
| 55 |
import urllib |
| 56 |
import urllib2 |
| 57 |
import re |
| 58 |
import shutil |
| 59 |
import optparse |
| 60 |
import datetime |
| 61 |
import time |
| 62 |
import itertools |
| 63 |
import xml.etree.ElementTree as etree |
| 64 |
from urlparse import urlparse |
| 65 |
from subprocess import call |
| 66 |
from zipfile import ZipFile, BadZipfile |
| 67 |
from ConfigParser import SafeConfigParser |
| 68 |
|
| 69 |
|
| 70 |
# Version of the iUI toolkit fetched by Repository.download_iui().
IUI_VERSION = "0.40-alpha1"
# Versions read by Repository.download_jqm(). These names were referenced but
# never defined, so calling that method raised NameError.
# NOTE(review): the values below are assumed -- confirm before relying on them.
JQM_VERSION = "1.0"
JQ_VERSION = "1.6.4"
# Default location of the configuration file ("~" is expanded by the callers).
CONFIG_PATH = "~/.config/rss-mirror.conf"
| 72 |
|
| 73 |
|
| 74 |
class Downloader(object):
    """
    Base class of the download engines.

    The two hooks below are meant to be overridden; here they only raise
    ``NotImplementedError``.

    :cvar return_codes_ok: non-zero return codes that still count as success
    :type return_codes_ok: ``list``
    """

    return_codes_ok = list()

    def get_command(self, destdir, url, options=None):
        """Build the system command to execute (hook, not implemented here)"""
        raise NotImplementedError()

    def get_start_path(self, basedir, page):
        """Give the on-disk path of the downloaded page (hook, not implemented here)"""
        raise NotImplementedError()
| 91 |
|
| 92 |
|
| 93 |
class HttrackDownloader(Downloader):
    """
    Download using httrack. It has more features than wget, but it has some
    bugs, like downloading CSS stylesheets in ``@import`` constructs.

    The recursion depth is read from the module-level ``config`` object when
    an instance is created.
    """

    name = "httrack"
    opts = [
        "-%l", "fr",  # language
        "-Y",   # mirror links
        "-C0",  # no cache
        "-b0",  # no cookies
        "-n",   # download "near" files
        "-L0",  # DOS-compatible file names
        "-d",   # stay on the same domain
        "-x",   # replace external links by error page
        "-%u",  # url hacks: various hacks to limit duplicate URLs
        "-F", "rss-mirror (allow like Gecko)",  # user-agent
    ]

    def __init__(self):
        super(HttrackDownloader, self).__init__()
        # Work on a per-instance copy: appending to the class-level list would
        # add one more "-r" switch every time a downloader is instantiated.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.append("-r%d" % recursive)

    def get_command(self, destdir, url, options=None):
        """
        Return the httrack command line as a list of arguments.

        :param destdir: directory passed to httrack's ``-O`` switch
        :param url: URL of the page to mirror
        :param options: optional extra switches, inserted before ``-O``
        """
        command = ["httrack"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-O", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the target of the refresh redirection found in the
        ``index.html`` file of the ``basedir/title`` directory.

        Falls back to ``"index.html"`` itself when no redirection is found.

        :raises IOError: when ``index.html`` cannot be opened
        """
        index_path = os.path.join(basedir, title, "index.html")
        with open(index_path) as indexfile:
            content = indexfile.read()
        mo = re.search('<meta HTTP-EQUIV="Refresh" CONTENT="0; URL=(.*)">',
                       content)
        if mo is None:
            # No redirection in the file: link to the index page itself
            # rather than failing on ``None.group``.
            return "index.html"
        return mo.group(1)
| 133 |
|
| 134 |
|
| 135 |
class WgetDownloader(Downloader):
    """
    Download using wget. Simple and fast.

    I use the ``-nd`` switch to avoid creating the whole directory structure
    mirroring the website structure, because the FAT32 filesystem does not like
    very very long names.

    The recursion depth is read from the module-level ``config`` object when
    an instance is created.
    """

    name = "wget"
    opts = [
        "-nv",  # non verbose
        "-k",   # convert links
        "-p",   # download needed files for the page
        "-N",   # timestamping
        "--restrict-file-names=windows,ascii,lowercase",
        "-E",   # adjust extension
        "-H",   # allow going on a different domain
        "--timeout=15",  # it's 900 by default...
        "--tries=2",     # it's 20 by default...
        "-nd",  # avoid having 255+ chars paths
        "--no-check-certificate",  # SSL
        # User-agent: try to get the mobile version of the page
        "-U", ("Mozilla/5.0 (Linux; U; Android 2.2; en-us; rss-mirror) "
               "AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile "
               "Safari/533.1"),
    ]
    return_codes_ok = [3, 4, 8]
    # 3: I/O error, usually because the filename is too long
    # 4: Network error (broken link on the page)
    # 8: Server issued error response (broken link on the page)

    def __init__(self):
        super(WgetDownloader, self).__init__()
        # Work on a per-instance copy: extending the class-level list would
        # add the recursion switches again for every new instance.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.extend(["-r", "-l%d" % recursive])

    def get_command(self, destdir, url, options=None):
        """
        Return the wget command line as a list of arguments.

        :param destdir: directory passed to wget's ``-P`` switch
        :param url: URL of the page to mirror
        :param options: optional extra switches, inserted before ``-P``
        """
        command = ["wget"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-P", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the path of the start page, relative to ``basedir/title``.

        The URL is read back from the ``url.txt`` file of that directory.
        When neither the expected file nor its ``.html`` variant exists, a
        message is written to stderr and ``"."`` is returned.

        :raises IOError: when ``url.txt`` cannot be opened
        """
        page_dir = os.path.join(basedir, title)
        with open(os.path.join(page_dir, "url.txt")) as urlfile:
            url = urlfile.read().strip()
        if url.endswith("/"):
            url += "index.html"
        # "-nd" (no directories) is the switch that flattens the layout;
        # "-nv" only controls verbosity.
        if "-nd" in self.opts:
            local_path = self.get_start_path_nodirs(basedir, url)
        else:
            if os.path.exists(os.path.join(page_dir, "index.html")):
                # downloaded with httrack
                httrack_dl = HttrackDownloader()
                return httrack_dl.get_start_path(basedir, title)
            local_path = self.get_start_path_dirs(basedir, url)
        candidate = os.path.join(page_dir, local_path)
        if os.path.isfile(candidate):
            return local_path
        if os.path.isfile(candidate + ".html"):
            # "-E" may have appended the extension
            return local_path + ".html"
        sys.stderr.write("Can't find the start page:  %s\n" % local_path)
        sys.stderr.write("Tried: %s %s\n" % (candidate, candidate + ".html"))
        return "."

    def get_start_path_nodirs(self, basedir, url):
        """
        Return the file name expected for ``url`` in a flat download: the
        lowercased last path component (``index.html`` when empty), followed
        by ``@`` and the lowercased query string when there is one.
        """
        url_parsed = urlparse(url)
        local_path = os.path.basename(url_parsed[2]).lower()
        if not local_path:
            local_path = "index.html"
        query = url_parsed[4]
        if query:
            local_path += "@" + query.lower().replace("/", "%2f")
        return local_path

    def get_start_path_dirs(self, basedir, url):
        """
        Return the path expected for ``url`` when the directory structure is
        kept: the lowercased URL without its scheme, with ``?`` turned
        into ``@``.
        """
        # Drop the scheme whatever it is (the plain "http://" replacement
        # left "https://" URLs untouched).
        without_scheme = url.split("://", 1)[-1]
        return without_scheme.lower().replace("?", "@")
| 217 |
|
| 218 |
|
| 219 |
def get_engines():
    """Map the ``name`` of every direct ``Downloader`` subclass to the class"""
    return dict((engine.name, engine)
                for engine in Downloader.__subclasses__())
| 224 |
|
| 225 |
|
| 226 |
def extract_options(desc):
    """
    Extract the downloader switches embedded in a description.

    Switches are given as ``{options: <switch> <switch> ...}``; several such
    groups may appear and are concatenated in order.

    :param desc: the text to search
    :returns: the list of whitespace-separated switches (possibly empty)
    """
    opts = []
    # Raw string: "\{", "\s" and "\}" are regex escapes, not string escapes.
    for inner_options in re.findall(r"\{options:\s+([^}]+)\}", desc):
        # split() without argument never yields empty items
        opts.extend(inner_options.split())
    return opts
| 237 |
|
| 238 |
|
| 239 |
class Page(object):
    """
    A page listed in an RSS feed.

    - ``title`` is a shortened title derived from the page title,
    - ``link`` is the URL,
    - ``title_full`` is the HTML page title,
    - ``timestamp`` is the UNIX timestamp of the page in the RSS feed, which is
      probably the moment you bookmarked it,
    - ``description`` is the item description with the tags removed,
    - ``options`` are the extra downloader switches found in the description.
    """

    # Characters matching these patterns are removed from, respectively, the
    # short title and the description.
    allowed_chars = re.compile("[^a-zA-Z0-9_-]")
    desc_cleanup = re.compile("<[^>]+>")

    @classmethod
    def parse(cls, item):
        """
        Build a page from an ``<item>`` element of the feed.

        The short title is cut to the ``title_size`` setting of the
        module-level ``config`` object.
        """
        titlesize = config.getint("DEFAULT", "title_size")
        page = cls()
        page.title_full = item.findtext("title").strip()
        short_title = page.title_full[:titlesize].strip().lower()
        page.title = cls.allowed_chars.sub("", short_title.replace(" ", "_"))
        page.link = item.findtext("link").strip()
        page.timestamp = cls._parse_timestamp(item.findtext("pubDate"))
        page.description = cls.desc_cleanup.sub(
            "", item.findtext("description", ""))
        page.options = extract_options(page.description)
        return page

    @staticmethod
    def _parse_timestamp(pubdate):
        """
        Convert a ``pubDate`` value to a UNIX timestamp.

        Any RFC 822 date is accepted, whatever its time zone. The current
        time is used when the value is missing or cannot be parsed.
        """
        from email.utils import parsedate_tz, mktime_tz
        parsed = parsedate_tz(pubdate) if pubdate else None
        if parsed is None:
            return int(time.time())
        try:
            return int(mktime_tz(parsed))
        except (ValueError, OverflowError):
            return int(time.time())

    def download(self, outdir, downloader):
        """
        Use the downloader to mirror the page in ``outdir/<title>``.

        Nothing is done when that directory already exists. In debug mode the
        command is only printed. After a successful download the URL, the
        full title and the timestamp are saved next to the page in
        ``url.txt``, ``title.txt`` and ``timestamp.txt``.
        """
        destdir = os.path.join(outdir, self.title)
        if os.path.exists(destdir):
            feedname = os.path.basename(outdir)
            print("Already downloaded: %s"
                  % os.path.join(feedname, self.title))
            return
        print("Downloading %s %s" % (self.title, self.link))
        try:
            command = downloader.get_command(destdir, self.link,
                                             options=self.options)
            print(" ".join(command))
            if config.getboolean("DEFAULT", "debug"):
                # Nothing was downloaded, so there is no directory to write
                # the metadata files into.
                return
            retcode = call(command)
            if retcode < 0:
                print("")
                sys.stderr.write("Child was terminated by signal %d\n"
                                 % (-retcode))
                return
            if retcode != 0 and retcode not in downloader.return_codes_ok:
                print("")
                sys.stderr.write(
                    "Something went wrong while downloading %s (%s)\n"
                    % (self.title, self.link))
                sys.stderr.write("Return code: %s\n" % retcode)
                return
        except OSError as e:
            print("")
            sys.stderr.write("Execution failed: %s\n" % e)
            return
        except KeyboardInterrupt:
            # to avoid partial downloads
            print("Removing downloaded dir in 1 sec...")
            time.sleep(1)
            # The directory may not exist yet if the interruption came early
            shutil.rmtree(destdir, ignore_errors=True)
            return
        if not os.path.isdir(destdir):
            # A tolerated return code may still mean that nothing was saved
            sys.stderr.write("Nothing was downloaded for %s (%s)\n"
                             % (self.title, self.link))
            return
        # Backup the URL in the url.txt file
        with open(os.path.join(destdir, "url.txt"), "w") as link_file:
            link_file.write(self.link)
        # Backup the HTML title in the title.txt file
        try:
            title_bytes = unicode(self.title_full).encode("utf-8")
        except UnicodeError:
            title_bytes = self.title
        with open(os.path.join(destdir, "title.txt"), "w") as title_file:
            title_file.write(title_bytes)
        # Backup the timestamp in the timestamp.txt file
        with open(os.path.join(destdir, "timestamp.txt"), "w") as ts_file:
            ts_file.write(str(self.timestamp))
        print("")
        time.sleep(1)  # Can't remember why this was necessary... FIXME
| 325 |
|
| 326 |
|
| 327 |
|
| 328 |
class Feed(object):
    """
    An RSS feed listed in the configuration file.

    - ``name`` is the name of the configuration section,
    - ``url`` is the address of the feed,
    - ``title`` is the displayed title (``None`` until ``parse()`` is called),
    - ``pages`` is the list of ``Page`` objects found by ``parse()``.
    """

    def __init__(self, name, url):
        self.name = name
        self.url = url
        self.title = None
        self.pages = []

    def parse(self):
        """
        Read the RSS feed and store its title and the list of pages to
        mirror in ``self.title`` and ``self.pages``.
        """
        content = urllib2.urlopen(self.url)
        try:
            feed = etree.parse(content)
        finally:
            # Release the connection even when the XML is invalid
            content.close()
        self.title = self.get_title(feed)
        self.pages = [Page.parse(item) for item in feed.findall(".//item")]

    def get_title(self, feed):
        """
        Return the title of the feed: the ``title`` key of the configuration
        section if there is one, else the ``channel/title`` text of the
        feed, else the section name.
        """
        if config.has_option(self.name, "title"):
            return config.get(self.name, "title")
        feed_title = feed.findtext("channel/title")
        if not feed_title:
            return self.name
        return feed_title
| 355 |
|
| 356 |
|
| 357 |
class Repository(object):
    """
    A folder containing mirrored pages

    ``path`` is the root folder (``~`` is expanded) and ``feeds`` the list of
    feeds mirrored in it, each one in the sub-folder named after the feed.
    """

    def __init__(self, path, feeds):
        self.path = os.path.expanduser(path)
        self.feeds = feeds

    @staticmethod
    def _html(text):
        """Return ``text`` escaped for HTML (unicode is encoded to UTF-8)"""
        from xml.sax.saxutils import escape
        if not isinstance(text, str):
            # The index is written as bytes: encode explicitly instead of
            # relying on the implicit ASCII conversion at write time.
            text = text.encode("utf-8")
        return escape(text, {'"': "&quot;"})

    @staticmethod
    def _explain_manual_install(text):
        """Write ``text`` on stderr, wrapped by ``DumbWriter``"""
        from formatter import DumbWriter
        dw = DumbWriter(sys.stderr)  # not really necessary, but fun :)
        dw.send_flowing_data(text)
        print("")

    def make_index(self, downloader):
        """
        Build the HTML index of the mirrored pages

        ``index.html`` is written at the root of the repository. It lists,
        feed by feed, the pages whose folder exists and whose start page is
        given by ``downloader.get_start_path()``.
        """
        collected = []
        for feed in self.feeds:
            entries = []
            destdir = os.path.join(self.path, feed.name)
            for page in feed.pages:
                if not os.path.exists(os.path.join(destdir, page.title)):
                    continue
                try:
                    local_path = downloader.get_start_path(destdir,
                                                           page.title)
                except IOError:
                    sys.stderr.write("Can't find the url.txt file for %s\n"
                                     % page.title)
                    continue  # no url.txt file, something went wrong
                href = "/".join([feed.name, page.title, local_path])
                entries.append((self._html(page.title_full),
                                urllib.quote(href.encode("utf-8"))))
            collected.append((feed, entries))
        collected.sort(key=lambda pair: pair[0].name)

        with open(os.path.join(self.path, "index.html"), "w") as mainindex:
            mainindex.write("""<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/>
<title>Mirrored pages</title>
<link rel="stylesheet" href="_iui/iui.css" type="text/css" />
<link rel="stylesheet" href="_iui/t/default/default-theme.css" type="text/css"/>
<script type="application/x-javascript" src="_iui/iui.js"></script>
</head>
<body>

<div class="toolbar">
<h1 id="pageTitle"></h1>
<a id="backButton" class="button" href="#"></a>
</div>
""")
            # Feed names and titles come from the configuration file or
            # from the remote feed: escape them before writing.
            if len(collected) > 1:
                mainindex.write("""<ul id="index" title="Mirrored pages" selected="true">\n""")
                for feed, _entries in collected:
                    mainindex.write(
                        """ <li><a href="#%(name)s">%(title)s</a></li>\n"""
                        % {"name": self._html(feed.name),
                           "title": self._html(feed.title)})
                mainindex.write("</ul>\n\n")

            for feed, entries in collected:
                mainindex.write("""<ul id="%(name)s" title="%(title)s">\n"""
                                % {"name": self._html(feed.name),
                                   "title": self._html(feed.title)})
                for title, href in entries:
                    mainindex.write(
                        """ <li><a href="%s" target="_webapp">%s</a></li>\n"""
                        % (href, title))
                mainindex.write("</ul>\n\n")
            mainindex.write("""
</body>
</html>
""")

    def cleanup(self):
        """
        Remove mirrored pages which are not in the feed anymore (probably
        because you read them)

        Folders are only listed, not removed, in debug mode. The remaining
        folder names are then lowercased and iUI is downloaded if the
        ``_iui`` folder is missing.
        """
        dirs_to_remove = self._get_old_feed_dirs()
        for feed in self.feeds:
            dirs_to_remove.extend(self._get_old_page_dirs(feed))
        for dirname in dirs_to_remove:
            print("Removing %s" % dirname)
            if not config.getboolean("DEFAULT", "debug"):
                shutil.rmtree(os.path.join(self.path, dirname))
        self.lowercase_dirs()
        if "_iui" not in os.listdir(self.path):
            self.download_iui()

    def _get_old_feed_dirs(self):
        """List the top-level folders that do not match any current feed"""
        known = set(f.name for f in self.feeds)
        dirs = []
        for feeddirname in os.listdir(self.path):
            if feeddirname.startswith("."):
                continue
            if feeddirname == "_iui":
                continue
            if not os.path.isdir(os.path.join(self.path, feeddirname)):
                continue  # like "index.html" for example
            if feeddirname not in known:
                dirs.append(feeddirname)
        return dirs

    def _get_old_page_dirs(self, feed):
        """
        List the folders of ``feed`` (relative to the repository root) that
        do not match any of its current pages
        """
        feed_dir = os.path.join(self.path, feed.name)
        if not os.path.isdir(feed_dir):
            return []
        known = set(p.title for p in feed.pages)
        dirs = []
        for dirname in os.listdir(feed_dir):
            if dirname.startswith("."):
                continue
            if not os.path.isdir(os.path.join(feed_dir, dirname)):
                continue  # plain files cannot be handed to rmtree
            if dirname not in known:
                dirs.append(os.path.join(feed.name, dirname))
        return dirs

    def lowercase_dirs(self):
        """For FAT32 transparency"""
        for feed in self.feeds:
            # Bottom-up, so that children are renamed before their parent
            for root, dirs, _files in os.walk(
                    os.path.join(self.path, feed.name), topdown=False):
                for name in dirs:
                    newname = name.lower()
                    if name == newname:
                        continue
                    # "root" already starts with the walked folder: joining
                    # it again to the repository path breaks relative paths.
                    source = os.path.join(root, name)
                    dest = os.path.join(root, newname)
                    if os.path.exists(dest):
                        continue
                    os.rename(source, dest)

    def download_iui(self):
        """
        Download the iUI archive and install its ``web-app/iui`` folder as
        ``_iui`` in the repository. Manual instructions are printed on
        failure.
        """
        sys.stdout.write("Downloading iUI... ")
        sys.stdout.flush()
        iui_url = "http://iui.googlecode.com/files/iui-%s.zip" % IUI_VERSION
        try:
            fn, _headers = urllib.urlretrieve(iui_url)
            try:
                with ZipFile(fn, "r") as archive:
                    archive.extractall(self.path)
            finally:
                # Do not leave the temporary archive behind on failure
                os.remove(fn)
            os.rename(
                os.path.join(self.path, "iui-%s/web-app/iui" % IUI_VERSION),
                os.path.join(self.path, "_iui"))
            shutil.rmtree(os.path.join(self.path, "iui-%s" % IUI_VERSION))
        except (IOError, OSError, BadZipfile):
            print("FAILED.")
            sys.stderr.write("WARNING, could not download or unzip iUI\n")
            self._explain_manual_install(
                "You must download iUI from %s, unpack it, "
                "rename the folder to \"_iui\", and put it in the "
                "destination directory." % iui_url)
            return
        print("done.")

    def download_jqm(self):
        """
        Download jQuery Mobile and jQuery in the ``_jqm`` folder of the
        repository. Manual instructions are printed on failure.

        Reads the module-level ``JQM_VERSION`` and ``JQ_VERSION`` names; a
        ``NameError`` is raised if they are not defined.
        """
        sys.stdout.write("Downloading jQuery Mobile... ")
        sys.stdout.flush()
        jqm_url = ("http://code.jquery.com/mobile/%(ver)s/jquery.mobile-%(ver)s.zip"
                   % {"ver": JQM_VERSION})
        jq_url = "http://code.jquery.com/jquery-%s.min.js" % JQ_VERSION
        try:
            jqm_fn, _headers = urllib.urlretrieve(jqm_url)
            try:
                with ZipFile(jqm_fn, "r") as jqm_zip:
                    jqm_zip.extractall(self.path)
            finally:
                # Do not leave the temporary archive behind on failure
                os.remove(jqm_fn)
            os.rename(
                os.path.join(self.path, "jquery.mobile-%s" % JQM_VERSION),
                os.path.join(self.path, "_jqm"))
            urllib.urlretrieve(jq_url, os.path.join(self.path, "_jqm",
                                                    os.path.basename(jq_url)))
        except (IOError, OSError, BadZipfile):
            print("FAILED.")
            sys.stderr.write("WARNING, could not download or unzip "
                             "jQuery Mobile.\n")
            self._explain_manual_install(
                "You must download jQuery Mobile from "
                "%(jqmurl)s, unpack it, rename the folder to \"_jqm\", "
                "put it in the destination directory, then download "
                "jQuery from %(jqurl)s, and put it in the same folder."
                % {"jqmurl": jqm_url, "jqurl": jq_url})
            return
        print("done.")
| 533 |
|
| 534 |
|
| 535 |
def parse_opts():
    """
    Command-line options

    Parses ``sys.argv`` and returns the ``optparse`` options object. Exits
    after listing the engines when ``--list-engines`` is given, and exits
    with a usage error when the configuration file cannot be found or when
    positional arguments are given.
    """
    usage = "usage: %prog -c <config file>"
    parser = optparse.OptionParser(usage)
    parser.add_option("-c", "--config", help="Configuration file")
    parser.add_option("-o", "--output", dest="output", metavar="DIR",
                      help="Output directory (will be purged !)")
    parser.add_option("--list-engines", dest="lse", action="store_true",
                      help="List available engines and exit")
    # No optparse default here: an unset option must not override the value
    # of the configuration file, so the default is spelled out in the text.
    parser.add_option("-r", "--recursive", dest="recursive",
                      type="int", metavar="DEPTH", help="Download linked "
                      "pages until this depth. Be careful with that. "
                      "(default: 0, or the value of the configuration file)")
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
                      help="Debug mode")
    options, args = parser.parse_args()
    if options.lse:
        print("\n".join(get_engines()))
        sys.exit()
    if not options.config:
        if os.path.exists(os.path.expanduser(CONFIG_PATH)):
            options.config = CONFIG_PATH
        else:
            parser.error("You must provide a configuration file (or put it "
                         "in %s)" % CONFIG_PATH)
    if not os.path.exists(os.path.expanduser(options.config)):
        parser.error("Unable to find the configuration file: %s"
                     % options.config)
    if args:
        parser.error("No arguments allowed")
    return options
| 567 |
|
| 568 |
|
| 569 |
def get_feeds(config):
    """Build a ``Feed`` for each configuration section that has a ``url`` key"""
    return [Feed(section, config.get(section, "url"))
            for section in config.sections()
            if config.has_option(section, "url")]
| 577 |
|
| 578 |
|
| 579 |
def choose_engine(config):
    """
    Instantiate the downloader named by the ``engine`` setting.

    Exits with an error message when no downloader has that name.
    """
    engines = get_engines()
    name = config.get("DEFAULT", "engine")
    if name not in engines:
        # Same reporting style as get_config() instead of a KeyError traceback
        sys.stderr.write("Unknown engine '%s' (available: %s)\n"
                         % (name, ", ".join(sorted(engines))))
        sys.exit(1)
    return engines[name]()
| 583 |
|
| 584 |
|
| 585 |
def get_config(options):
    """
    Read the configuration file and merge the command-line options in it.

    The ``output``, ``debug`` and ``recursive`` command-line options are
    stored in the ``DEFAULT`` section. Exits with an error message when no
    output directory is set or when it is not a directory.

    :param options: the object returned by ``parse_opts()``
    :returns: the configuration parser
    """
    # TODO: create config
    config = SafeConfigParser({"title_size": "50", "engine": "wget",
                               "recursive": "0"})
    config.read(os.path.expanduser(options.config))
    if options.output is not None:
        config.set("DEFAULT", "output", options.output)
    if not config.has_option("DEFAULT", "output"):
        sys.stderr.write("Config file should have an 'output' variable\n")
        sys.exit(1)
    if not os.path.isdir(os.path.expanduser(config.get("DEFAULT", "output"))):
        sys.stderr.write("The output path must be a directory\n")
        sys.exit(1)
    config.set("DEFAULT", "debug", str(bool(options.debug)))
    if options.recursive is not None:
        # The parser only accepts string values, and the option is an int
        config.set("DEFAULT", "recursive", str(options.recursive))
    return config
| 602 |
|
| 603 |
|
| 604 |
def main():
    """
    The fun starts here

    Reads the options and the configuration, downloads the pages of every
    feed that could be fetched, then rebuilds the index and cleans up the
    repository. The configuration is stored in the module-level ``config``
    name, which the other classes read.
    """
    global config
    options = parse_opts()
    config = get_config(options)
    downloader = choose_engine(config)

    feeds = []
    for feed in get_feeds(config):
        try:
            feed.parse()
        except urllib2.URLError as e:
            # URLError is the parent of HTTPError: unreachable hosts are
            # reported the same way as HTTP errors instead of aborting.
            sys.stderr.write("Failed downloading %s: %s\n" % (feed.url, e))
        else:
            feeds.append(feed)

    # NOTE(review): a feed that could not be fetched is left out of the
    # repository, so cleanup() below handles its folder like the folder of
    # a feed removed from the configuration -- confirm this is intended.
    repo = Repository(config.get("DEFAULT", "output"), feeds)

    for feed in feeds:
        outdir = os.path.join(repo.path, feed.name)
        for page in feed.pages:
            page.download(outdir, downloader)

    repo.make_index(downloader)
    repo.cleanup()


if __name__ == "__main__":
    main()