#!/usr/bin/env python
# vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent:
u"""

RSS Mirror
----------

Mirrors on the local disk the pages listed in an RSS feed, using Wget or
HTTrack.

Requires Python >= 2.6


Configuration file
~~~~~~~~~~~~~~~~~~

RSS-mirror uses a configuration file to list the RSS feeds that should be
downloaded. This file must be placed in ``~/.config/rss-mirror.conf`` and is in
INI format. Example::

    [DEFAULT]
    output = ~/pda/webpages

    [owni]
    url = http://owni.fr/feed

    [zenhabits]
    url = http://zenhabits.net/feed

    [rue89-ecologie]
    url = http://www.rue89.com/tag/ecologie/feed
    title = Rue89 - Ecologie

The ``DEFAULT`` section has an ``output`` key pointing to the output directory
where the webpages will be downloaded.

Each section (except DEFAULT) is a feed to download. It has a ``url`` key which
is self-explanatory and an optional ``title`` key which will be used as a title
for the feed in the summary page.


Credits
~~~~~~~

.. :Authors:
       Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org>

.. :License:
       GNU GPL v3 or later

"""
52
53
import os
54
import sys
55
import urllib
56
import urllib2
57
import re
58
import shutil
59
import optparse
60
import datetime
61
import time
62
import itertools
63
import xml.etree.ElementTree as etree
64
from urlparse import urlparse
65
from subprocess import call
66
from zipfile import ZipFile, BadZipfile
67
from ConfigParser import SafeConfigParser
68
69
70
# Release of the iUI toolkit fetched by Repository.download_iui().
IUI_VERSION = "0.40-alpha1"
71
# Configuration file used when none is given with -c ("~" is expanded by
# the callers with os.path.expanduser).
CONFIG_PATH = "~/.config/rss-mirror.conf"
72
73
74
class Downloader(object):
    """
    Common interface of the download engines.

    A concrete engine builds the command line of an external mirroring
    program and tells where the page it saved can be found on the disk.

    :cvar return_codes_ok: non-zero return codes of the external program
                           which must not be treated as failures
    :type return_codes_ok: ``list``
    """

    return_codes_ok = list()

    def get_start_path(self, basedir, page):
        """Give the on-disk path of the downloaded page (abstract)"""
        raise NotImplementedError()

    def get_command(self, destdir, url, options=None):
        """Give the system command to execute (abstract)"""
        raise NotImplementedError()
91
92
93
class HttrackDownloader(Downloader):
    """
    Download using httrack. More features than wget, but it has some bugs, like
    downloading CSS stylesheets in ``@import`` constructs.

    :cvar name: engine name, as selected by the ``engine`` configuration key
    :cvar opts: base httrack command-line switches shared by all instances
    """

    name = "httrack"
    opts = [
        "-%l", "fr", # language
        "-Y", # mirror links
        "-C0", # no cache
        "-b0", # no cookies
        "-n", # download "near" files
        "-L0", # DOS-compatible file names
        "-d", # stay on the same domain
        "-x", # replace external links by error page
        "-%u", #url hacks: various hacks to limit duplicate URLs
        "-F", "rss-mirror (allow like Gecko)", # user-agent
    ]

    def __init__(self):
        super(HttrackDownloader, self).__init__()
        # Work on a per-instance copy: appending to the class-level list would
        # add one more "-rN" switch every time an instance is created.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.append("-r%d" % recursive)

    def get_command(self, destdir, url, options=None):
        """
        Build the httrack command line.

        :param destdir: directory given to httrack with ``-O``
        :param url: address of the page to mirror
        :param options: optional list of extra switches, inserted before
                        ``-O``
        :return: the command as a list of arguments
        """
        command = ["httrack"]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-O", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the target of the ``Refresh`` meta tag found in the
        ``index.html`` file of the ``basedir/title`` directory.

        :raise IOError: if ``index.html`` cannot be read, or does not contain
                        the expected redirection
        """
        index_path = os.path.join(basedir, title, "index.html")
        with open(index_path) as indexfile:
            content = indexfile.read()
        mo = re.search('<meta HTTP-EQUIV="Refresh" CONTENT="0; URL=(.*)">',
                       content)
        if mo is None:
            # Same exception type as a missing file, which callers handle.
            raise IOError("No redirection found in %s" % index_path)
        return mo.group(1)
133
134
135
class WgetDownloader(Downloader):
    """
    Download using wget. Simple and fast.

    I use the ``-nd`` switch to avoid creating the whole directory structure
    mirroring the website structure, because the FAT32 filesystem does not like
    very very long names.

    :cvar name: engine name, as selected by the ``engine`` configuration key
    :cvar opts: base wget command-line switches shared by all instances
    """

    name = "wget"
    opts = [
        "-nv", # non verbose
        "-k", # convert links
        "-p", # download needed files for the page
        "-N", # timestamping
        "--restrict-file-names=windows,ascii,lowercase",
        "-E", # adjust extension
        "-H", # allow going on a different domain
        "--timeout=15", # it's 900 by default...
        "--tries=2", # it's 20 by default...
        "-nd", # avoid having 255+ chars paths
        "--no-check-certificate", # SSL
        # User-agent: try to get the mobile version of the page
        "-U", ("Mozilla/5.0 (Linux; U; Android 2.2; en-us; rss-mirror) "
               "AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile "
               "Safari/533.1"),
    ]
    return_codes_ok = [3, 4, 8]
    # 3: I/O error, usually because the filename is too long
    # 4: Network error (broken link on the page)
    # 8: Server issued error response (broken link on the page)

    def __init__(self):
        super(WgetDownloader, self).__init__()
        # Work on a per-instance copy: extending the class-level list would
        # add the recursion switches again for every new instance.
        self.opts = list(self.opts)
        recursive = config.getint("DEFAULT", "recursive")
        if recursive:
            self.opts.extend(["-r", "-l%d" % recursive])

    def get_command(self, destdir, url, options=None):
        """
        Build the wget command line.

        :param destdir: directory given to wget with ``-P``
        :param url: address of the page to mirror
        :param options: optional list of extra switches, inserted before
                        ``-P``
        :return: the command as a list of arguments
        """
        command = ["wget", ]
        command.extend(self.opts)
        if options:
            command.extend(options)
        command.extend(["-P", destdir, url])
        return command

    def get_start_path(self, basedir, title):
        """
        Return the path of the start page, relative to ``basedir/title``.

        The page URL is read from the ``url.txt`` file of that directory.
        ``"."`` is returned (and a message printed on stderr) when the
        expected file cannot be found.

        :raise IOError: if ``url.txt`` cannot be read
        """
        with open(os.path.join(basedir, title, "url.txt")) as urlfile:
            url = urlfile.read().strip()
        if url.endswith("/"):
            url += "index.html"
        # "-nd" (no directories) is the switch which decides of the layout on
        # the disk; "-nv" only makes wget less verbose.
        if "-nd" in self.opts:
            local_path = self.get_start_path_nodirs(basedir, url)
        else:
            if os.path.exists(os.path.join(basedir, title, "index.html")):
                # downloaded with httrack
                httrack_dl = HttrackDownloader()
                return httrack_dl.get_start_path(basedir, title)
            local_path = self.get_start_path_dirs(basedir, url)
        full_path = os.path.join(basedir, title, local_path)
        if os.path.isfile(full_path):
            return local_path
        if os.path.isfile(full_path + ".html"):
            return local_path + ".html"
        print >>sys.stderr, "Can't find the start page: ", local_path
        print >>sys.stderr, "Tried:", full_path, full_path + ".html"
        return "."

    def get_start_path_nodirs(self, basedir, url):
        """
        Guess the local file name when no directory structure is created:
        the lowercased last component of the URL path (``index.html`` if it
        is empty), followed by ``@`` and the lowercased query string, if any.
        """
        url_parsed = urlparse(url)
        local_path = os.path.basename(url_parsed[2]).lower()
        if not local_path:
            local_path = "index.html"
        if url_parsed[4]:
            local_path += "@" + url_parsed[4].lower().replace("/", "%2f")
        return local_path

    def get_start_path_dirs(self, basedir, url):
        """
        Guess the local file name when the directory structure is kept: the
        lowercased URL without its scheme, with ``?`` replaced by ``@``.
        """
        # The scheme is not part of the local path, whether http or https.
        local_path = url.replace("http://", "").replace("https://", "")
        return local_path.lower().replace("?", "@")
217
218
219
def get_engines():
    """Map the name of each available engine to its downloader class"""
    return dict((engine.name, engine)
                for engine in Downloader.__subclasses__())
224
225
226
def extract_options(desc):
    """
    Collect the switches written as ``{options: -a -b}`` in a description.

    :param desc: text to search
    :return: the whitespace-separated words of every ``{options: ...}``
             block, in order of appearance
    """
    # Skip "{options:" and the first whitespace character which follows it.
    prefix_len = len("{options:") + 1
    collected = []
    for block in re.findall(r"\{options:\s+[^}]+\}", desc):
        # split() already discards surrounding and repeated whitespace
        collected.extend(block[prefix_len:-1].split())
    return collected
237
238
239
class Page(object):
240
    """
241
        - ``title`` is a shortned title derived from the page title,
242
        - ``link`` is the URL,
243
        - ``title_full`` is the HTML page title,
244
        - ``timestamp`` is the UNIX timestamp of the page in the RSS feed, which is
245
          probably the moment you bookmarked it.
246
    """
247
248
249
    allowed_chars = re.compile("[^a-zA-Z0-9_-]")
250
    desc_cleanup = re.compile("<[^>]+>")
251
252
    @classmethod
253
    def parse(cls, item):
254
        titlesize = config.getint("DEFAULT", "title_size",)
255
        page = cls()
256
        page.title_full = item.findtext("title").strip()
257
        page.title = page.title_full[:titlesize].strip().lower()
258
        page.title = cls.allowed_chars.sub("", page.title.replace(" ","_"))
259
        page.link = item.findtext("link").strip()
260
        timestamp = item.findtext("pubDate")
261
        try:
262
            timestamp = datetime.datetime.strptime(timestamp,
263
                            "%a, %d %b %Y %H:%M:%S EDT")
264
            timestamp = int(time.mktime(timestamp.timetuple()))
265
        except ValueError:
266
            timestamp = int(time.time())
267
        page.timestamp = timestamp
268
        page.description = item.findtext("description", "")
269
        page.description = cls.desc_cleanup.sub("", page.description)
270
        page.options = extract_options(page.description)
271
        return page
272
273
    def download(self, outdir, downloader):
274
        """Use the downloader to mirror the page"""
275
        destdir = os.path.join(outdir, self.title)
276
        if os.path.exists(destdir):
277
            feedname = os.path.basename(outdir)
278
            print "Already downloaded:", os.path.join(feedname, self.title)
279
            return
280
        print "Downloading", self.title, self.link
281
        try:
282
            command = downloader.get_command(destdir, self.link,
283
                                             options=self.options)
284
            print " ".join(command)
285
            if config.getboolean("DEFAULT", "debug"):
286
                retcode = 0
287
            else:
288
                retcode = call(command)
289
            if retcode < 0:
290
                print
291
                print >> sys.stderr, "Child was terminated by signal", -retcode
292
                return
293
            if retcode != 0 and retcode not in downloader.return_codes_ok:
294
                print
295
                print >> sys.stderr, "Something went wrong while downloading " \
296
                                    + self.title + "(%s)" % self.link
297
                print >> sys.stderr, "Return code: %s" % retcode
298
                return
299
        except OSError, e:
300
            print
301
            print >> sys.stderr, "Execution failed:", e
302
            return
303
        except KeyboardInterrupt, e:
304
            print "Removing downloaded dir in 1 sec..." # to avoid partial downloads
305
            time.sleep(1)
306
            shutil.rmtree(destdir)
307
            return
308
        # Backup the URL in the url.txt file
309
        link_file = open(os.path.join(destdir, "url.txt"),"w")
310
        link_file.write(self.link)
311
        link_file.close()
312
        # Backup the HTML title in the title.txt file
313
        title_file = open(os.path.join(destdir, "title.txt"),"w")
314
        try:
315
            title_file.write(unicode(self.title_full).encode("utf-8"))
316
        except UnicodeEncodeError:
317
            title_file.write(self.title)
318
        title_file.close()
319
        # Backup the timestamp in the timestamp.txt file
320
        timestamp_file = open(os.path.join(destdir, "timestamp.txt"),"w")
321
        timestamp_file.write(str(self.timestamp))
322
        timestamp_file.close()
323
        print
324
        time.sleep(1) # Can't remember why this was necessary... FIXME
325
326
327
328
class Feed(object):
    """
    An RSS feed to mirror.

    :ivar name: name of the configuration section of the feed
    :ivar url: address of the RSS feed
    :ivar title: title of the feed, set by :meth:`parse`
    :ivar pages: pages listed in the feed, set by :meth:`parse`
    """

    def __init__(self, name, url):
        self.name = name
        self.url = url
        self.title = None
        self.pages = []

    def parse(self):
        """
        Read the RSS feed and store its title and the list of pages to
        mirror in the ``title`` and ``pages`` attributes.
        """
        content = urllib2.urlopen(self.url)
        try:
            feed = etree.parse(content)
        finally:
            # Do not keep the connection open once the feed is read
            content.close()
        self.title = self.get_title(feed)
        self.pages = [Page.parse(item) for item in feed.findall(".//item")]

    def get_title(self, feed):
        """
        Return the title of the feed: the ``title`` key of the configuration
        if there is one, else the channel title of the feed, else the feed
        name.
        """
        if config.has_option(self.name, "title"):
            return config.get(self.name, "title")
        feed_title = feed.findtext("channel/title")
        if not feed_title:
            return self.name
        return feed_title
355
356
357
class Repository(object):
358
    """
359
    A folder containing mirrored pages
360
    """
361
362
    def __init__(self, path, feeds):
363
        self.path = os.path.expanduser(path)
364
        self.feeds = feeds
365
366
    def make_index(self, downloader):
367
        """Build the HTML index of the mirrored pages"""
368
        startfiles = {}
369
        for feed in self.feeds:
370
            startfiles[feed] = []
371
            destdir = os.path.join(self.path, feed.name)
372
            for page in feed.pages:
373
                if not os.path.exists(os.path.join(destdir, page.title)):
374
                    continue
375
                try:
376
                    local_path = downloader.get_start_path(destdir, page.title)
377
                except IOError:
378
                    print >> sys.stderr, "Can't find the url.txt file for %s" \
379
                                         % page.title
380
                    continue # no url.txt file, something went wrong
381
                startfiles[feed].append(
382
                        ( unicode(page.title_full).encode("utf-8"),
383
                          page.description,
384
                          "/".join([feed.name, page.title, local_path]) ) )
385
        mainindex = open(os.path.join(self.path, "index.html"), "w")
386
        mainindex.write("""<!DOCTYPE html>
387
<html>
388
<head>
389
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
390
  <meta name="viewport" content="width=device-width; initial-scale=1.0; maximum-scale=1.0; user-scalable=0;"/>
391
  <title>Mirrored pages</title>
392
  <link rel="stylesheet" href="_iui/iui.css" type="text/css" />
393
  <link rel="stylesheet" href="_iui/t/default/default-theme.css" type="text/css"/>
394
  <script type="application/x-javascript" src="_iui/iui.js"></script>
395
</head>
396
<body>
397
398
<div class="toolbar">
399
  <h1 id="pageTitle"></h1>
400
    <a id="backButton" class="button" href="#"></a>
401
</div>
402
""")
403
        if len(startfiles) > 1:
404
            mainindex.write("""<ul id="index" title="Mirrored pages" selected="true">\n""")
405
            for feed in sorted(startfiles, key=lambda f: f.name):
406
                mainindex.write("""  <li><a href="#%(name)s">%(title)s</a></li>\n"""
407
                                % {"name": feed.name, "title": feed.title} )
408
            mainindex.write("</ul>\n\n")
409
410
        for feed in sorted(startfiles, key=lambda f: f.name):
411
            mainindex.write("""<ul id="%(name)s" title="%(title)s">\n"""
412
                            % {"name": feed.name, "title": feed.title})
413
            for title, description, index in startfiles[feed]:
414
                mainindex.write(
415
                    """  <li><a href="%s" target="_webapp">%s</a></li>\n""" %
416
                    (urllib.quote(index.encode("utf-8")), title))
417
            mainindex.write("</ul>\n\n")
418
        mainindex.write("""
419
</body>
420
</html>
421
""")
422
423
424
    def cleanup(self):
425
        """
426
        Remove mirrored pages which are not in the feed anymore (probably
427
        because you read them)
428
        """
429
        dirs_to_remove = self._get_old_feed_dirs()
430
        for feed in self.feeds:
431
            dirs_to_remove.extend(self._get_old_page_dirs(feed))
432
        for dirname in dirs_to_remove:
433
            print "Removing", dirname
434
            if not config.getboolean("DEFAULT", "debug"):
435
                shutil.rmtree(os.path.join(self.path, dirname))
436
        self.lowercase_dirs()
437
        if "_iui" not in os.listdir(self.path):
438
            self.download_iui()
439
440
    def _get_old_feed_dirs(self):
441
        dirs = []
442
        for feeddirname in os.listdir(self.path):
443
            if feeddirname.startswith("."):
444
                continue
445
            if feeddirname == "_iui":
446
                continue
447
            if not os.path.isdir(os.path.join(self.path, feeddirname)):
448
                continue # like "index.html" for example
449
            if feeddirname not in [ f.name for f in self.feeds ]:
450
                dirs.append(feeddirname)
451
        return dirs
452
453
    def _get_old_page_dirs(self, feed):
454
        if not os.path.isdir(os.path.join(self.path, feed.name)):
455
            return []
456
        dirs = []
457
        for dirname in os.listdir(os.path.join(self.path, feed.name)):
458
            if dirname.startswith("."):
459
                continue
460
            if dirname not in [ p.title for p in feed.pages ]:
461
                dirs.append(os.path.join(feed.name, dirname))
462
        return dirs
463
464
    def lowercase_dirs(self):
465
        """For FAT32 transparency"""
466
        for feed in self.feeds:
467
            for root, dirs, files in os.walk(
468
                        os.path.join(self.path, feed.name), topdown=False):
469
                for name in dirs:
470
                    newname = name.lower()
471
                    if name != newname:
472
                        source = os.path.join(self.path, feed.name, root, name)
473
                        dest = os.path.join(self.path, feed.name, root, newname)
474
                        if os.path.exists(dest):
475
                            continue
476
                        os.rename(source, dest)
477
478
479
    def download_iui(self):
480
        print "Downloading iUI... ",
481
        sys.stdout.flush()
482
        iui_url = "http://iui.googlecode.com/files/iui-%s.zip" % IUI_VERSION
483
        try:
484
            fn, _headers = urllib.urlretrieve(iui_url)
485
            with ZipFile(fn, "r") as archive:
486
                archive.extractall(self.path)
487
            os.remove(fn)
488
            os.rename(os.path.join(self.path, "iui-%s/web-app/iui" % IUI_VERSION),
489
                      os.path.join(self.path, "_iui"))
490
            shutil.rmtree(os.path.join(self.path, "iui-%s" % IUI_VERSION))
491
        except (IOError, BadZipfile):
492
            print "FAILED."
493
            print >>sys.stderr, "WARNING, could not download or unzip iUI"
494
            from formatter import DumbWriter
495
            dw = DumbWriter(sys.stderr) # not really necessary, but fun :)
496
            dw.send_flowing_data("You must download iUI from %s, unpack it, "
497
                        "rename the folder to \"_iui\", and put it in the "
498
                        "destination directory." % iui_url)
499
            print
500
            return
501
        print "done."
502
503
504
    def download_jqm(self):
505
        print "Downloading jQuery Mobile... ",
506
        sys.stdout.flush()
507
        jqm_url = ("http://code.jquery.com/mobile/%(ver)s/jquery.mobile-%(ver)s.zip"
508
                   % {"ver": JQM_VERSION})
509
        jq_url = "http://code.jquery.com/jquery-%s.min.js" % JQ_VERSION
510
        try:
511
            jq_fn, _headers = urllib.urlretrieve(jqm_url)
512
            with ZipFile(jq_fn, "r") as jq_zip:
513
                jq_zip.extractall(self.path)
514
            os.remove(jq_fn)
515
            os.rename(os.path.join(self.path, "jquery.mobile-%s" % JQM_VERSION),
516
                      os.path.join(self.path, "_jqm"))
517
            urllib.urlretrieve(jq_url, os.path.join(self.path, "_jqm",
518
                                                    os.path.basename(jq_url)))
519
        except (IOError, BadZipfile):
520
            print "FAILED."
521
            print >>sys.stderr, ("WARNING, could not download or unzip "
522
                                "jQuery Mobile.")
523
            from formatter import DumbWriter
524
            dw = DumbWriter(sys.stderr) # not really necessary, but fun :)
525
            dw.send_flowing_data("You must download jQuery Mobile from "
526
                      "%(jqmurl)s, unpack it, rename the folder to \"_jqm\", "
527
                      "put it in the destination directory, then download "
528
                      "jQuery from %(jqurl)s, and put it in the same folder."
529
                      % { "jqmurl": jqm_url, "jqurl": jq_url } )
530
            print
531
            return
532
        print "done."
533
534
535
def parse_opts():
536
    """Command-line options"""
537
    usage = "usage: %prog -c <config file>"
538
    parser = optparse.OptionParser(usage)
539
    parser.add_option("-c", "--config", help="Configuration file")
540
    parser.add_option("-o", "--output", dest="output", metavar="DIR",
541
                      help="Output directory (will be purged !)")
542
    parser.add_option("--list-engines", dest="lse", action="store_true",
543
                      help="List available engines and exit")
544
    parser.add_option("-r", "--recursive", dest="recursive",
545
                      type="int", metavar="DEPTH", help="Download linked "
546
                      "pages until this depth. Be careful with that. "
547
                      "Default: %default)")
548
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
549
                      help="Debug mode")
550
    options, args = parser.parse_args()
551
    if (options.lse):
552
        engines = get_engines()
553
        print "\n".join(engines.keys())
554
        sys.exit()
555
    if not options.config:
556
        if os.path.exists(os.path.expanduser(CONFIG_PATH)):
557
            options.config = CONFIG_PATH
558
        else:
559
            parser.error("You must provide a configuration file (or put it "
560
                         "in %s)" % CONFIG_PATH)
561
    if not os.path.exists(os.path.expanduser(options.config)):
562
        parser.error("Unable to find the configuration file: %s"
563
                     % options.config)
564
    if args:
565
        parser.error("No arguments allowed")
566
    return options
567
568
569
def get_feeds(config):
    """Build a feed for every configuration section which has a ``url`` key"""
    return [Feed(section, config.get(section, "url"))
            for section in config.sections()
            if config.has_option(section, "url")]
577
578
579
def choose_engine(config):
    """
    Instantiate the downloader named by the ``engine`` configuration key.

    Exits with an error message when no engine has this name.
    """
    engines = get_engines()
    name = config.get("DEFAULT", "engine")
    if name not in engines:
        print >> sys.stderr, "Unknown engine '%s', available engines: %s" \
                             % (name, ", ".join(sorted(engines)))
        sys.exit(1)
    return engines[name]()
583
584
585
def get_config(options):
    """
    Read the configuration file and apply the command-line options on it.

    Exits with an error message when the output directory is not set or is
    not a directory.

    :param options: parsed options, with the ``config``, ``output``,
                    ``debug`` and ``recursive`` attributes
    :return: the configuration parser
    """
    # TODO: create config
    config = SafeConfigParser({"title_size": "50", "engine": "wget",
                               "recursive": "0"})
    config.read(os.path.expanduser(options.config))
    if options.output is not None:
        config.set("DEFAULT", "output", options.output)
    if not config.has_option("DEFAULT", "output"):
        print >> sys.stderr, "Config file should have an 'output' variable"
        sys.exit(1)
    if not os.path.isdir(os.path.expanduser(config.get("DEFAULT", "output"))):
        print >> sys.stderr, "The output path must be a directory"
        sys.exit(1)
    config.set("DEFAULT", "debug", str(bool(options.debug)))
    if options.recursive is not None:
        # The option is parsed as an int, but only strings can be stored
        config.set("DEFAULT", "recursive", str(options.recursive))
    return config
602
603
604
def main():
    """Read the configuration, mirror every feed, then index and clean up"""
    global config
    options = parse_opts()
    config = get_config(options)
    downloader = choose_engine(config)

    # Only keep the feeds which could be fetched.
    # NOTE(review): a feed dropped here is unknown to the repository, so
    # cleanup() removes its whole directory -- confirm this is intended.
    feeds = []
    for feed in get_feeds(config):
        try:
            feed.parse()
        except urllib2.HTTPError as e:
            print >>sys.stderr, "Failed downloading %s: %s" % (feed.url, e)
        else:
            feeds.append(feed)

    repo = Repository(config.get("DEFAULT", "output"), feeds)

    for feed in feeds:
        feed_dir = os.path.join(repo.path, feed.name)
        for page in feed.pages:
            page.download(feed_dir, downloader)

    repo.make_index(downloader)
    repo.cleanup()
628
629
630
631
# Only run the mirroring when executed as a script, not when imported.
if __name__ == "__main__":
    main()