a18bfc9 by Aurélien Bompard at 2010-04-11 1
#!/usr/bin/env python
2
# vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent:
3
u"""
7581434 by Aurélien Bompard at 2011-04-03 4
5
RSS Mirror
6
----------
7
a18bfc9 by Aurélien Bompard at 2010-04-11 8
Mirrors on the local disk the pages listed in an RSS feed, using Wget or
9
HTTrack.
10
df41b86 by Aurélien Bompard at 2011-10-14 11
Requires Python >= 2.6
12
c19ca33 by Aurélien Bompard at 2012-01-04 13
14
Configuration file
15
~~~~~~~~~~~~~~~~~~
16
17
RSS-mirror uses a configuration file to list the RSS feeds that should be
18
downloaded. This file must be placed in ``~/.config/rss-mirror.conf`` and is in
19
INI format. Example::
20
21
    [DEFAULT]
22
    output = ~/pda/webpages
23
24
    [owni]
25
    url = http://owni.fr/feed
26
27
    [zenhabits]
28
    url = http://zenhabits.net/feed
29
30
    [rue89-ecologie]
31
    url = http://www.rue89.com/tag/ecologie/feed
32
    title = Rue89 - Ecologie
33
34
The ``DEFAULT`` section has an ``output`` key pointing to the output directory
35
where the webpages will be downloaded.
36
37
Each section (except DEFAULT) is a feed to download. It has a ``url`` key which
38
is self-explanatory and an optional ``title`` key which will be used as a title
39
for the feed in the summary page.
40
41
42
Credits
43
~~~~~~~
44
7581434 by Aurélien Bompard at 2011-04-03 45
.. :Authors:
46
       Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org>
47
48
.. :License:
49
       GNU GPL v3 or later
a18bfc9 by Aurélien Bompard at 2010-04-11 50
51
"""
52
53
import os
54
import sys
f31a5e8 by Aurélien Bompard at 2010-05-09 55
import urllib
a18bfc9 by Aurélien Bompard at 2010-04-11 56
import urllib2
57
import re
58
import shutil
59
import optparse
60
import datetime
61
import time
df41b86 by Aurélien Bompard at 2011-10-14 62
import itertools
984214f by Aurélien Bompard at 2011-04-13 63
import xml.etree.ElementTree as etree
a18bfc9 by Aurélien Bompard at 2010-04-11 64
from urlparse import urlparse
ec63db5 by Aurélien Bompard at 2011-10-02 65
from subprocess import call
df41b86 by Aurélien Bompard at 2011-10-14 66
from zipfile import ZipFile, BadZipfile
67
from ConfigParser import SafeConfigParser
a18bfc9 by Aurélien Bompard at 2010-04-11 68
7581434 by Aurélien Bompard at 2011-04-03 69
6a3dead by Aurélien Bompard at 2011-10-14 70
# Version of the iUI archive fetched by Repository.download_iui().
IUI_VERSION = "0.40-alpha1"
df41b86 by Aurélien Bompard at 2011-10-14 71
# Configuration file used by parse_opts() when no -c/--config option is given.
CONFIG_PATH = "~/.config/rss-mirror.conf"
a18bfc9 by Aurélien Bompard at 2010-04-11 72
7581434 by Aurélien Bompard at 2011-04-03 73
a18bfc9 by Aurélien Bompard at 2010-04-11 74
class Downloader(object):
    """
    Base class for the download engines.

    Concrete engines subclass it and override both methods; they are looked
    up through ``Downloader.__subclasses__()`` and identified by their
    ``name`` class attribute.

    :cvar return_codes_ok: non-zero exit codes of the external command that
        must not be treated as a failure
    :type return_codes_ok: ``list``
    """

    return_codes_ok = list()

    def get_command(self, destdir, url, options=None):
        """
        Build the command line (a list of arguments) that mirrors ``url``
        into ``destdir``; ``options`` are extra arguments for this page.
        """
        raise NotImplementedError()

    def get_start_path(self, basedir, page):
        """
        Give the path of the start file of a mirrored page, relative to the
        page directory ``basedir/page``.
        """
        raise NotImplementedError()
a18bfc9 by Aurélien Bompard at 2010-04-11 91
7581434 by Aurélien Bompard at 2011-04-03 92
a18bfc9 by Aurélien Bompard at 2010-04-11 93
class HttrackDownloader(Downloader):
    """
    Download using httrack. More features than wget, but it has some bugs, like
    downloading CSS stylesheets in ``@import`` constructs.
    """

    name = "httrack"
    opts = [
        "-%l", "fr", # language
        "-Y", # mirror links
        "-C0", # no cache
        "-b0", # no cookies
        "-n", # download "near" files
        "-L0", # DOS-compatible file names
        "-d", # stay on the same domain
        "-x", # replace external links by error page
        "-%u", #url hacks: various hacks to limit duplicate URLs
        "-F", "rss-mirror (allow like Gecko)", # user-agent
    ]

    def __init__(self):
        """
        Read the recursion depth from the global ``config`` (``recursive``
        option of the ``DEFAULT`` section) and add it to this instance's
        options when it is non-zero.
        """
        super(HttrackDownloader, self).__init__()
        # Work on a private copy: appending to the class-level list would
        # add one more "-rN" for every instance that gets created.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.append("-r%d" % recursive)

    def get_command(self, destdir, url, options=None):
        """
        Return the httrack command line mirroring ``url`` into ``destdir``.

        :param options: extra command-line arguments for this page, inserted
            after the default ones
        """
        command = ["httrack"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-O", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the start page of a mirrored page, relative to
        ``basedir/title``.

        The target is read from the ``Refresh`` meta tag of the
        ``index.html`` file found in the page directory. When that file
        holds no such tag, ``index.html`` itself is returned.

        :raises IOError: if ``index.html`` cannot be opened
        """
        index_path = os.path.join(basedir, title, "index.html")
        with open(index_path) as indexfile:
            content = indexfile.read()
        mo = re.search('<meta HTTP-EQUIV="Refresh" CONTENT="0; URL=(.*)">',
                       content)
        if mo is None:
            # No redirection found: the index file is the best start page
            # we know of.
            return "index.html"
        return mo.group(1)
133
7581434 by Aurélien Bompard at 2011-04-03 134
a18bfc9 by Aurélien Bompard at 2010-04-11 135
class WgetDownloader(Downloader):
    """
    Download using wget. Simple and fast.

    I use the ``-nd`` switch to avoid creating the whole directory structure
    mirroring the website structure, because the FAT32 filesystem does not like
    very very long names.
    """

    name = "wget"
    opts = [
        "-nv", # non verbose
        "-k", # convert links
        "-p", # download needed files for the page
        "-N", # timestamping
        "--restrict-file-names=windows,ascii,lowercase",
        "-E", # adjust extension
        "-H", # allow going on a different domain
        "--timeout=15", # it's 900 by default...
        "--tries=2", # it's 20 by default...
        "-nd", # avoid having 255+ chars paths
        "--no-check-certificate", # SSL
        # User-agent: try to get the mobile version of the page
        "-U", ("Mozilla/5.0 (Linux; U; Android 2.2; en-us; rss-mirror) "
               "AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile "
               "Safari/533.1"),
    ]
    return_codes_ok = [3, 4, 8]
    # 3: I/O error, usually because the filename is too long
    # 4: Network error (broken link on the page)
    # 8: Server issued error response (broken link on the page)

    def __init__(self):
        """
        Read the recursion depth from the global ``config`` (``recursive``
        option of the ``DEFAULT`` section) and add the matching switches to
        this instance's options when it is non-zero.
        """
        super(WgetDownloader, self).__init__()
        # Work on a private copy: extending the class-level list would add
        # the recursion switches again for every instance that gets created.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.extend(["-r", "-l%d" % recursive])

    def get_command(self, destdir, url, options=None):
        """
        Return the wget command line mirroring ``url`` into ``destdir``.

        :param options: extra command-line arguments for this page, inserted
            after the default ones
        """
        command = ["wget", ]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-P", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the start page of a mirrored page, relative to
        ``basedir/title``.

        The original URL is read back from the ``url.txt`` file of the page
        directory and converted to the file name wget is expected to have
        used. If neither that name nor that name plus ``.html`` exists, a
        message is written to stderr and ``"."`` is returned.

        :raises IOError: if ``url.txt`` cannot be opened
        """
        pagedir = os.path.join(basedir, title)
        with open(os.path.join(pagedir, "url.txt")) as urlfile:
            url = urlfile.read().strip()
        if url.endswith("/"):
            url += "index.html"
        # "-nd" (no directories) is the switch that decides the on-disk
        # layout, not "-nv" (non verbose).
        if "-nd" in self.opts:
            local_path = self.get_start_path_nodirs(basedir, url)
        else:
            if os.path.exists(os.path.join(pagedir, "index.html")):
                # downloaded with httrack
                httrack_dl = HttrackDownloader()
                return httrack_dl.get_start_path(basedir, title)
            local_path = self.get_start_path_dirs(basedir, url)
        candidate = os.path.join(pagedir, local_path)
        if os.path.isfile(candidate):
            return local_path
        if os.path.isfile(candidate + ".html"):
            # wget's -E switch may have appended the extension
            return local_path + ".html"
        sys.stderr.write("Can't find the start page:  %s\n" % local_path)
        sys.stderr.write("Tried: %s %s\n" % (candidate, candidate + ".html"))
        return "."

    def get_start_path_nodirs(self, basedir, url):
        """
        Return the expected file name for ``url`` when wget stores every
        file flat in the page directory: the lowercased last path component
        (``index.html`` if empty), followed by ``@`` and the lowercased
        query string when there is one.
        """
        url_parsed = urlparse(url)
        local_path = os.path.basename(url_parsed[2]).lower()
        if not local_path:
            local_path = "index.html"
        query = url_parsed[4]
        if query:
            local_path += "@" + query.lower().replace("/", "%2f")
        return local_path

    def get_start_path_dirs(self, basedir, url):
        """
        Return the expected relative path for ``url`` when wget recreates
        the directory structure of the site: the lowercased URL without its
        scheme, with ``?`` replaced by ``@``.
        """
        # Only strip the leading scheme, and accept https as well as http.
        without_scheme = re.sub(r"^https?://", "", url)
        return without_scheme.lower().replace("?", "@")
a18bfc9 by Aurélien Bompard at 2010-04-11 217
218
fa99e26 by Aurélien Bompard at 2010-04-13 219
def get_engines():
    """
    Return a dictionary mapping the ``name`` of every direct subclass of
    ``Downloader`` to the subclass itself.
    """
    return dict((engine.name, engine)
                for engine in Downloader.__subclasses__())
224
df41b86 by Aurélien Bompard at 2011-10-14 225
984214f by Aurélien Bompard at 2011-04-13 226
def extract_options(desc):
    """
    Extract extra downloader options from a feed item description.

    Options are written in the description as ``{options: -a -b ...}``;
    several such groups may be present. At least one whitespace character
    is required after the colon.

    NOTE(review): the description comes from the RSS feed and the returned
    words are added as-is to the downloader command line, so whoever can
    edit the feed can pass arbitrary switches to the downloader.

    :param desc: the description text
    :returns: the list of whitespace-separated options, in order of
        appearance (empty if there are none)
    """
    opts = []
    # The capturing group holds everything between the marker and the
    # closing brace; split() already drops empty and padded words.
    for inner_options in re.findall(r"\{options:\s+([^}]+)\}", desc):
        opts.extend(inner_options.split())
    return opts
a18bfc9 by Aurélien Bompard at 2010-04-11 237
238
df41b86 by Aurélien Bompard at 2011-10-14 239
class Page(object):
    """
    A page listed in an RSS feed. Instances are built by :meth:`parse` and
    have the following attributes:

        - ``title`` is a shortened title derived from the page title,
        - ``link`` is the URL,
        - ``title_full`` is the HTML page title,
        - ``timestamp`` is the UNIX timestamp of the page in the RSS feed, which is
          probably the moment you bookmarked it,
        - ``description`` is the item description with the tags removed,
        - ``options`` is the list of extra downloader options found in the
          description.
    """

    allowed_chars = re.compile("[^a-zA-Z0-9_-]")
    desc_cleanup = re.compile("<[^>]+>")

    @staticmethod
    def _parse_timestamp(pubdate):
        """
        Convert the text of a ``pubDate`` element to a UNIX timestamp,
        honouring the time zone it mentions. The current time is returned
        when the date is missing or cannot be parsed.
        """
        from email.utils import parsedate_tz, mktime_tz
        if pubdate:
            try:
                parsed = parsedate_tz(pubdate)
                if parsed is not None:
                    return int(mktime_tz(parsed))
            except (TypeError, ValueError, IndexError, OverflowError):
                pass # unusable date, fall back to the current time
        return int(time.time())

    @classmethod
    def parse(cls, item):
        """
        Build a page from an ``item`` element of the feed.

        The short title is the full title cut to ``title_size`` characters
        (``DEFAULT`` section of the global ``config``), lowercased, with
        spaces turned into underscores and every character outside
        ``a-zA-Z0-9_-`` removed. Missing ``title`` or ``link`` elements give
        empty strings.
        """
        titlesize = config.getint("DEFAULT", "title_size")
        page = cls()
        page.title_full = item.findtext("title", "").strip()
        short_title = page.title_full[:titlesize].strip().lower()
        page.title = cls.allowed_chars.sub("", short_title.replace(" ", "_"))
        page.link = item.findtext("link", "").strip()
        page.timestamp = cls._parse_timestamp(item.findtext("pubDate"))
        page.description = item.findtext("description", "")
        page.description = cls.desc_cleanup.sub("", page.description)
        page.options = extract_options(page.description)
        return page

    def download(self, outdir, downloader):
        """
        Use the downloader to mirror the page in ``outdir/<title>``.

        Nothing is done when that directory already exists. After a
        successful download the URL, the full title and the timestamp are
        saved in the ``url.txt``, ``title.txt`` and ``timestamp.txt`` files
        of the page directory. Failures are reported on stderr and make the
        method return early; they are not raised.
        """
        if not self.title or not self.link:
            # With an empty title the page directory would be the feed
            # directory itself.
            sys.stderr.write("Skipping a feed entry without a usable title "
                             "or link (%r, %r)\n"
                             % (self.title_full, self.link))
            return
        destdir = os.path.join(outdir, self.title)
        if os.path.exists(destdir):
            feedname = os.path.basename(outdir)
            print("Already downloaded: %s"
                  % os.path.join(feedname, self.title))
            return
        print("Downloading %s %s" % (self.title, self.link))
        try:
            command = downloader.get_command(destdir, self.link,
                                             options=self.options)
            print(" ".join(command))
            if config.getboolean("DEFAULT", "debug"):
                retcode = 0 # debug mode: the command is only displayed
            else:
                retcode = call(command)
            if retcode < 0:
                print("")
                sys.stderr.write("Child was terminated by signal %s\n"
                                 % (-retcode))
                return
            if retcode != 0 and retcode not in downloader.return_codes_ok:
                print("")
                sys.stderr.write("Something went wrong while downloading "
                                 + self.title + "(%s)" % self.link + "\n")
                sys.stderr.write("Return code: %s\n" % retcode)
                return
        except OSError as e:
            print("")
            sys.stderr.write("Execution failed: %s\n" % (e,))
            return
        except KeyboardInterrupt:
            print("Removing downloaded dir in 1 sec...") # to avoid partial downloads
            time.sleep(1)
            # The directory may not have been created yet
            shutil.rmtree(destdir, ignore_errors=True)
            return
        if not os.path.isdir(destdir):
            # Debug mode, or the downloader did not write anything: there is
            # no directory to store the backup files in.
            sys.stderr.write("Nothing was downloaded for %s\n" % self.title)
            return
        # Backup the URL in the url.txt file
        with open(os.path.join(destdir, "url.txt"), "w") as link_file:
            link_file.write(self.link)
        # Backup the HTML title in the title.txt file
        with open(os.path.join(destdir, "title.txt"), "w") as title_file:
            try:
                title_file.write(unicode(self.title_full).encode("utf-8"))
            except UnicodeError:
                title_file.write(self.title)
        # Backup the timestamp in the timestamp.txt file
        with open(os.path.join(destdir, "timestamp.txt"), "w") as timestamp_file:
            timestamp_file.write(str(self.timestamp))
        print("")
        time.sleep(1) # Can't remember why this was necessary... FIXME
325
326
327
328
class Feed(object):
    """
    An RSS feed declared in the configuration file.

    ``name`` is the name of the configuration section, ``url`` the address
    of the feed. ``title`` and ``pages`` are filled in by :meth:`parse`.
    """

    def __init__(self, name, url):
        self.name = name
        self.url = url
        self.title = None
        self.pages = []

    def parse(self):
        """
        Download and read the RSS feed, then store its title in ``title``
        and the pages to mirror (one ``Page`` per ``item`` element) in
        ``pages``.
        """
        content = urllib2.urlopen(self.url)
        try:
            feed = etree.parse(content)
        finally:
            # Release the connection whether or not the XML could be parsed
            content.close()
        self.title = self.get_title(feed)
        self.pages = [Page.parse(item) for item in feed.findall(".//item")]

    def get_title(self, feed):
        """
        Return the title to display for this feed: the ``title`` option of
        its configuration section if there is one, else the text of the
        ``channel/title`` element of the parsed ``feed``, else the feed
        name.
        """
        if config.has_option(self.name, "title"):
            return config.get(self.name, "title")
        feed_title = feed.findtext("channel/title")
        if not feed_title:
            return self.name
        return feed_title
355
356
357
class Repository(object):
    """
    A folder containing mirrored pages.

    Each feed has its own sub-directory named after the feed, containing one
    directory per page named after the short title of the page. The ``_iui``
    directory holds the iUI files used by the index page.
    """

    def __init__(self, path, feeds):
        """
        :param path: the output directory (``~`` is expanded)
        :param feeds: the list of parsed ``Feed`` objects to keep
        """
        self.path = os.path.expanduser(path)
        self.feeds = feeds

    def make_index(self, downloader):
        """
        Build the HTML index of the mirrored pages (``index.html`` in the
        repository folder).

        Pages whose directory does not exist, or whose start file cannot be
        determined by ``downloader`` (``IOError``), are left out.
        """
        from xml.sax.saxutils import escape

        def as_utf8(text):
            # Encode unicode objects so that writing to the byte-oriented
            # index file cannot fail on non-ASCII text.
            if isinstance(text, unicode):
                return text.encode("utf-8")
            return text

        def as_html(text):
            # Titles and names come from the feeds and the configuration:
            # escape them for use in text nodes and in quoted attributes.
            return escape(as_utf8(text), {'"': "&quot;"})

        startfiles = {}
        for feed in self.feeds:
            startfiles[feed] = []
            destdir = os.path.join(self.path, feed.name)
            for page in feed.pages:
                if not os.path.exists(os.path.join(destdir, page.title)):
                    continue
                try:
                    local_path = downloader.get_start_path(destdir, page.title)
                except IOError:
                    sys.stderr.write("Can't find the url.txt file for %s\n"
                                     % page.title)
                    continue # no url.txt file, something went wrong
                index = "/".join(as_utf8(part) for part in
                                 (feed.name, page.title, local_path))
                startfiles[feed].append(
                        (as_utf8(page.title_full), page.description, index))
        sorted_feeds = sorted(startfiles, key=lambda f: f.name)
        with open(os.path.join(self.path, "index.html"), "w") as mainindex:
            mainindex.write("""<!DOCTYPE html>
<html>
<head>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
  <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/>
  <title>Mirrored pages</title>
  <link rel="stylesheet" href="_iui/iui.css" type="text/css" />
  <link rel="stylesheet" href="_iui/t/default/default-theme.css" type="text/css"/>
  <script type="application/x-javascript" src="_iui/iui.js"></script>
</head>
<body>

<div class="toolbar">
  <h1 id="pageTitle"></h1>
    <a id="backButton" class="button" href="#"></a>
</div>
""")
            if len(startfiles) > 1:
                # Several feeds: start with a list of the feeds
                mainindex.write("""<ul id="index" title="Mirrored pages" selected="true">\n""")
                for feed in sorted_feeds:
                    mainindex.write("""  <li><a href="#%(name)s">%(title)s</a></li>\n"""
                                    % {"name": as_html(feed.name),
                                       "title": as_html(feed.title or feed.name)})
                mainindex.write("</ul>\n\n")

            for feed in sorted_feeds:
                mainindex.write("""<ul id="%(name)s" title="%(title)s">\n"""
                                % {"name": as_html(feed.name),
                                   "title": as_html(feed.title or feed.name)})
                for title, _description, index in startfiles[feed]:
                    mainindex.write(
                        """  <li><a href="%s" target="_webapp">%s</a></li>\n""" %
                        (urllib.quote(index), as_html(title)))
                mainindex.write("</ul>\n\n")
            mainindex.write("""
</body>
</html>
""")

    def cleanup(self):
        """
        Remove mirrored pages which are not in the feed anymore (probably
        because you read them), and the directories of feeds which are not
        known anymore. Then lowercase the directory names and download iUI
        if it is missing.

        When the ``debug`` option of the global ``config`` is set, the
        removals are only displayed.
        """
        dirs_to_remove = self._get_old_feed_dirs()
        for feed in self.feeds:
            dirs_to_remove.extend(self._get_old_page_dirs(feed))
        for dirname in dirs_to_remove:
            print("Removing %s" % dirname)
            if not config.getboolean("DEFAULT", "debug"):
                target = os.path.join(self.path, dirname)
                if os.path.isdir(target):
                    shutil.rmtree(target)
                else:
                    # Entries of a feed directory may be plain files
                    os.remove(target)
        self.lowercase_dirs()
        if "_iui" not in os.listdir(self.path):
            self.download_iui()

    def _get_old_feed_dirs(self):
        """
        Return the names of the directories of the repository which do not
        match any feed. Hidden entries, ``_iui`` and plain files are
        ignored.
        """
        known = set(f.name for f in self.feeds)
        dirs = []
        for feeddirname in os.listdir(self.path):
            if feeddirname.startswith(".") or feeddirname == "_iui":
                continue
            if not os.path.isdir(os.path.join(self.path, feeddirname)):
                continue # like "index.html" for example
            if feeddirname not in known:
                dirs.append(feeddirname)
        return dirs

    def _get_old_page_dirs(self, feed):
        """
        Return the entries of the directory of ``feed`` which do not match
        any of its pages, as paths relative to the repository. Hidden
        entries are ignored. The list is empty if the feed has no directory.
        """
        feeddir = os.path.join(self.path, feed.name)
        if not os.path.isdir(feeddir):
            return []
        known = set(p.title for p in feed.pages)
        dirs = []
        for dirname in os.listdir(feeddir):
            if dirname.startswith("."):
                continue
            if dirname not in known:
                dirs.append(os.path.join(feed.name, dirname))
        return dirs

    def lowercase_dirs(self):
        """For FAT32 transparency: lowercase every directory name found
        below the feed directories. A directory is left alone when its
        lowercase name already exists."""
        for feed in self.feeds:
            feeddir = os.path.join(self.path, feed.name)
            # Bottom-up, so that renaming a directory does not invalidate
            # the paths of the directories still to be visited.
            for root, dirs, _files in os.walk(feeddir, topdown=False):
                for name in dirs:
                    newname = name.lower()
                    if name == newname:
                        continue
                    # "root" already starts with the feed directory
                    source = os.path.join(root, name)
                    dest = os.path.join(root, newname)
                    if os.path.exists(dest):
                        continue
                    os.rename(source, dest)

    def download_iui(self):
        """
        Download the iUI archive, and install its ``web-app/iui`` folder as
        ``_iui`` in the repository. On failure, instructions for a manual
        installation are written to stderr.
        """
        # The two trailing spaces keep the output identical to the
        # historical one.
        sys.stdout.write("Downloading iUI...  ")
        sys.stdout.flush()
        iui_url = "http://iui.googlecode.com/files/iui-%s.zip" % IUI_VERSION
        fn = None
        try:
            fn, _headers = urllib.urlretrieve(iui_url)
            # ZipFile is only a context manager from Python 2.7 on
            archive = ZipFile(fn, "r")
            try:
                archive.extractall(self.path)
            finally:
                archive.close()
            os.rename(os.path.join(self.path, "iui-%s/web-app/iui" % IUI_VERSION),
                      os.path.join(self.path, "_iui"))
            shutil.rmtree(os.path.join(self.path, "iui-%s" % IUI_VERSION))
        except (IOError, OSError, BadZipfile):
            print("FAILED.")
            sys.stderr.write("WARNING, could not download or unzip iUI\n")
            from formatter import DumbWriter
            dw = DumbWriter(sys.stderr) # not really necessary, but fun :)
            dw.send_flowing_data("You must download iUI from %s, unpack it, "
                        "rename the folder to \"_iui\", and put it in the "
                        "destination directory." % iui_url)
            print("")
            return
        finally:
            # Never leave the temporary archive behind
            if fn is not None and os.path.exists(fn):
                try:
                    os.remove(fn)
                except OSError:
                    pass # best effort only
        print("done.")

    def download_jqm(self):
        """
        Download jQuery Mobile and jQuery, and install them in the ``_jqm``
        folder of the repository. On failure, instructions for a manual
        installation are written to stderr.

        NOTE(review): ``JQM_VERSION`` and ``JQ_VERSION`` are not defined
        next to ``IUI_VERSION``; unless they are defined elsewhere, calling
        this method raises ``NameError``.
        """
        # The two trailing spaces keep the output identical to the
        # historical one.
        sys.stdout.write("Downloading jQuery Mobile...  ")
        sys.stdout.flush()
        jqm_url = ("http://code.jquery.com/mobile/%(ver)s/jquery.mobile-%(ver)s.zip"
                   % {"ver": JQM_VERSION})
        jq_url = "http://code.jquery.com/jquery-%s.min.js" % JQ_VERSION
        jq_fn = None
        try:
            jq_fn, _headers = urllib.urlretrieve(jqm_url)
            # ZipFile is only a context manager from Python 2.7 on
            jq_zip = ZipFile(jq_fn, "r")
            try:
                jq_zip.extractall(self.path)
            finally:
                jq_zip.close()
            os.rename(os.path.join(self.path, "jquery.mobile-%s" % JQM_VERSION),
                      os.path.join(self.path, "_jqm"))
            urllib.urlretrieve(jq_url, os.path.join(self.path, "_jqm",
                                                    os.path.basename(jq_url)))
        except (IOError, OSError, BadZipfile):
            print("FAILED.")
            sys.stderr.write("WARNING, could not download or unzip "
                             "jQuery Mobile.\n")
            from formatter import DumbWriter
            dw = DumbWriter(sys.stderr) # not really necessary, but fun :)
            dw.send_flowing_data("You must download jQuery Mobile from "
                      "%(jqmurl)s, unpack it, rename the folder to \"_jqm\", "
                      "put it in the destination directory, then download "
                      "jQuery from %(jqurl)s, and put it in the same folder."
                      % { "jqmurl": jqm_url, "jqurl": jq_url } )
            print("")
            return
        finally:
            # Never leave the temporary archive behind
            if jq_fn is not None and os.path.exists(jq_fn):
                try:
                    os.remove(jq_fn)
                except OSError:
                    pass # best effort only
        print("done.")
533
6d7a295 by Aurélien Bompard at 2010-05-14 534
a18bfc9 by Aurélien Bompard at 2010-04-11 535
def parse_opts():
    """
    Parse the command-line options.

    With ``--list-engines``, the names of the available engines are
    displayed and the program exits. Otherwise the configuration file must
    exist: it is either given with ``-c`` or found in ``CONFIG_PATH``.
    Positional arguments are refused.

    :returns: the options object built by ``optparse``; its ``output``,
        ``recursive`` and ``debug`` attributes are ``None`` when the
        corresponding option was not given
    """
    usage = "usage: %prog -c <config file>"
    parser = optparse.OptionParser(usage)
    parser.add_option("-c", "--config", help="Configuration file")
    parser.add_option("-o", "--output", dest="output", metavar="DIR",
                      help="Output directory (will be purged !)")
    parser.add_option("--list-engines", dest="lse", action="store_true",
                      help="List available engines and exit")
    # No optparse default here: get_config() relies on None to know that
    # the value of the configuration file must be kept.
    parser.add_option("-r", "--recursive", dest="recursive",
                      type="int", metavar="DEPTH", help="Download linked "
                      "pages until this depth. Be careful with that. "
                      "(Default: the value of the configuration file, "
                      "or 0 for no recursion)")
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
                      help="Debug mode")
    options, args = parser.parse_args()
    if options.lse:
        engines = get_engines()
        print("\n".join(engines.keys()))
        sys.exit()
    if not options.config:
        if os.path.exists(os.path.expanduser(CONFIG_PATH)):
            options.config = CONFIG_PATH
        else:
            parser.error("You must provide a configuration file (or put it "
                         "in %s)" % CONFIG_PATH)
    if not os.path.exists(os.path.expanduser(options.config)):
        parser.error("Unable to find the configuration file: %s"
                     % options.config)
    if args:
        parser.error("No arguments allowed")
    return options
567
568
569
def get_feeds(config):
    """
    Build one ``Feed`` per section of ``config`` which has a ``url`` option,
    in the order of the sections. The feed is named after its section.
    """
    return [Feed(section, config.get(section, "url"))
            for section in config.sections()
            if config.has_option(section, "url")]
577
578
579
def choose_engine(config):
    """
    Return a new instance of the downloader named by the ``engine`` option
    of the ``DEFAULT`` section of ``config``. An unknown name raises
    ``KeyError``.
    """
    engine_class = get_engines()[config.get("DEFAULT", "engine")]
    return engine_class()
583
584
585
def get_config(options):
    """
    Read the configuration file and merge the command-line options in it.

    The ``title_size`` (50), ``engine`` (wget) and ``recursive`` (0)
    options have default values. The ``output``, ``recursive`` and
    ``debug`` options of the ``DEFAULT`` section are overridden by the
    command line when given there (``debug`` is always set, to ``True`` or
    ``False``).

    The program exits with status 1 when no output directory is configured
    or when it is not an existing directory.

    :param options: the options object returned by ``parse_opts()``
    :returns: the configuration parser
    """
    # TODO: create config
    parser = SafeConfigParser({"title_size": "50", "engine": "wget",
                               "recursive": "0"})
    parser.read(os.path.expanduser(options.config))
    if options.output is not None:
        parser.set("DEFAULT", "output", options.output)
    if not parser.has_option("DEFAULT", "output"):
        sys.stderr.write("Config file should have an 'output' variable\n")
        sys.exit(1)
    if not os.path.isdir(os.path.expanduser(parser.get("DEFAULT", "output"))):
        sys.stderr.write("The output path must be a directory\n")
        sys.exit(1)
    # The parser only accepts string values
    parser.set("DEFAULT", "debug", str(bool(options.debug)))
    if options.recursive is not None:
        parser.set("DEFAULT", "recursive", str(options.recursive))
    return parser
602
a18bfc9 by Aurélien Bompard at 2010-04-11 603
604
def main():
    """
    Entry point: read the options and the configuration, parse the feeds,
    mirror their pages, then rebuild the index and clean the repository up.

    The configuration is stored in the ``config`` global, which is read by
    the other classes of this module.

    NOTE(review): a feed whose download fails with an HTTP error is left
    out of the repository, so ``Repository.cleanup()`` considers its
    directory as an old one and removes the pages mirrored previously.
    """
    global config
    options = parse_opts()
    config = get_config(options)
    downloader = choose_engine(config)

    # Keep only the feeds which could be downloaded and parsed
    feeds = []
    for feed in get_feeds(config):
        try:
            feed.parse()
        except urllib2.HTTPError as e:
            sys.stderr.write("Failed downloading %s: %s\n" % (feed.url, e))
        else:
            feeds.append(feed)

    repo = Repository(config.get("DEFAULT", "output"), feeds)

    for feed in feeds:
        outdir = os.path.join(repo.path, feed.name)
        for page in feed.pages:
            page.download(outdir, downloader)

    repo.make_index(downloader)
    repo.cleanup()
628
a18bfc9 by Aurélien Bompard at 2010-04-11 629
630
631
# Run the mirroring only when the file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()