#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent:
u"""
Build an RSS feed from the news page of the http://manicore.com website.

:Authors:
    Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org>

:License:
    GNU GPL v3 or later
"""
13
14
import os
15
import sys
16
import urllib2
17
import re
18
import datetime
19
import locale
073eb59 by Aurélien Bompard at 2011-04-23 20
from urlparse import urljoin
21
from optparse import OptionParser
a5c46b2 by Aurélien Bompard at 2010-06-28 22
23
import BeautifulSoup
24
25
073eb59 by Aurélien Bompard at 2011-04-23 26
# Site root; substituted as "base_url" into the channel header.
BASE_URL = "http://manicore.com"
# Maximum number of matching paragraphs fetched from a page per run.
MAX_ITEMS = 10

# Leading ":" separator (with surrounding whitespace) stripped from the start
# of item titles and descriptions.  Raw string so the backslashes always reach
# the regex engine untouched.
TEXT_START_RE = re.compile(r"^\s*:\s*")

# Opening of the RSS 2.0 document.  Expects the keys: title, desc, base_url,
# now, feed_url and logo.
FEED_HEAD = u"""<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
    <channel>
        <title>%(title)s</title>
        <description>%(desc)s</description>
        <link>%(base_url)s</link>
        <language>fr</language>
        <lastBuildDate>%(now)s GMT</lastBuildDate>
        <pubDate>%(now)s GMT</pubDate>
        <atom:link href="%(feed_url)s" rel="self" type="application/rss+xml" />
        <image>
            <title>%(title)s</title>
            <link>%(base_url)s</link>
            <url>%(logo)s</url>
        </image>
"""

# Default template for one <item>.  Expects the keys: title, desc, link, guid
# and date.
# NOTE(review): the "+0100" UTC offset is hard-coded, so summer-time dates are
# published one hour off -- confirm whether that matters for this feed.
FEED_ENTRY = u"""
        <item>
            <title>%(title)s</title>
            <description>
                <![CDATA[
                    %(desc)s
                ]]>
            </description>
            <link>%(link)s</link>
            <guid isPermaLink="false">%(guid)s</guid>
            <pubDate>%(date)s +0100</pubDate>
        </item>
"""

# Closing of the RSS document.
FEED_TAIL = u"""
    </channel>
</rss>
"""
66
67
073eb59 by Aurélien Bompard at 2011-04-23 68
class FeedItem(object):
    """Accumulate the fields of one RSS ``<item>`` from a scraped paragraph.

    Parsed values live in ``self.vars`` under the keys ``date``, ``desc``,
    ``link`` and ``title`` and are substituted into ``feed_entry`` by
    :meth:`render`.  Subclasses must provide ``page_url``, which is used to
    resolve relative links; this base class does not define it.
    """

    # Known misspellings of month names, mapped to the spelling that
    # strptime accepts.
    fixes = {
        "decembre": u"décembre",
    }
    # "<day> <month name> <year>", optionally surrounded by whitespace.
    date_re = re.compile(r"\s*(\d\d?)\s+(\S+)\s+(\d\d\d\d)\s*")
    feed_entry = FEED_ENTRY
    # Longest title emitted by cleanup(), trailing ellipsis included.
    max_title_length = 100

    # English names required by RFC 822 dates; strftime("%a"/"%b") would
    # follow the process locale instead.
    _DAY_NAMES = ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    _MONTH_NAMES = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    def __init__(self):
        self.vars = {
            "date": None,
            "desc": u"",
            "link": None,
            "title": u"",
        }

    def _rfc822(self, date):
        """Format *date* as "Mon, 28 Jun 2010 00:00:00", whatever the locale."""
        return "%s, %02d %s %04d %02d:%02d:%02d" % (
            self._DAY_NAMES[date.weekday()], date.day,
            self._MONTH_NAMES[date.month - 1], date.year,
            date.hour, date.minute, date.second)

    def render(self):
        """Return this item as a unicode RSS ``<item>`` fragment.

        Without a date, the date field is rendered empty and the guid is
        derived from the link alone; otherwise the guid combines the ISO date
        with the basename of the link.
        """
        values = self.vars.copy()
        date = self.vars["date"]
        if not date:
            values["date"] = ""
            values["guid"] = "urn:www:manicore.com:%s" % self.vars["link"]
        else:
            values["date"] = self._rfc822(date)
            values["guid"] = ("urn:www:manicore.com:%s:%s"
                              % (date.strftime("%Y-%m-%d"),
                                 os.path.basename(self.vars["link"])))
        return unicode(self.feed_entry % values)

    def __unicode__(self):
        return self.render()

    def __str__(self):
        # A Python 2 __str__ must return bytes; returning the unicode
        # rendering made str(item) fail on any non-ASCII character.
        return self.render().encode("utf-8")

    def check(self, item):
        """Tell whether *item* should be turned into a feed entry (always)."""
        return True

    def parse(self, item):
        """Walk the children of *item* and fill ``self.vars``.

        Text nodes are appended to both the description and the title.
        ``<font>`` and ``<p>`` are recursed into, ``<strong>`` and ``<b>``
        are handed to :meth:`parse_date`, the first ``<a>`` carrying an
        ``href`` becomes the item link, and anything else is appended to the
        description.
        """
        for subitem in item:
            if isinstance(subitem, BeautifulSoup.NavigableString):
                self.vars["desc"] += unicode(subitem)
                self.vars["title"] += unicode(subitem)
                continue
            if not isinstance(subitem, BeautifulSoup.Tag):
                continue
            # Anchors without an href (e.g. named anchors) are not links and
            # must not raise KeyError; they are treated as plain content.
            is_link = subitem.name == "a" and subitem.get("href") is not None
            if is_link:
                # Make the URL absolute so it still works from the feed.
                subitem["href"] = urljoin(self.page_url, subitem["href"])
            if subitem.name in ("font", "p"):
                self.parse(subitem)
            elif subitem.name in ("strong", "b"):
                self.parse_date(subitem)
            elif is_link and not self.vars["link"]:
                self.parse_link(subitem)
            else:
                self.add_to_desc(subitem)

    def add_to_desc(self, item):
        """Append the markup of *item* to the description, and its single
        string (when it has one) to the title."""
        self.vars["desc"] += unicode(item)
        if item.string:
            self.vars["title"] += item.string

    def _strptime_with_fixes(self, text):
        """Parse "<day> <month name> <year>", retrying once with ``fixes``.

        Raises ValueError when the text cannot be parsed even after the
        known misspellings have been corrected.
        """
        date_format = "%d %B %Y"
        try:
            return datetime.datetime.strptime(text.encode("utf-8"),
                                              date_format)
        except ValueError:
            corrected = text
            for wrong, right in self.fixes.items():
                corrected = corrected.replace(wrong, right)
            if corrected == text:
                # No known fix applies: propagate the original error.
                raise
            return datetime.datetime.strptime(corrected.encode("utf-8"),
                                              date_format)

    def parse_date(self, item):
        """Try to read the item date from *item*.

        Once a date is known, further candidates are appended to the
        description instead.  Elements without a single string are searched
        recursively, and text that does not look like a date goes to the
        description.
        """
        if self.vars["date"]:
            self.add_to_desc(item)
            return
        if not item.string:
            for subitem in item:
                self.parse_date(subitem)
            return
        # "1er" (French for "1st") is not understood by strptime.
        candidate = item.string.replace("1er", "1")
        date_mo = self.date_re.match(candidate)
        if not date_mo:
            self.vars["desc"] += unicode(item)
            return
        self.vars["date"] = self._strptime_with_fixes(
            "%s %s %s" % (date_mo.group(1), date_mo.group(2),
                          date_mo.group(3)))

    def parse_link(self, item):
        """Record the ``href`` of *item* as the item link (``&`` escaped for
        XML) and add the anchor to the description."""
        self.vars["link"] = item["href"].replace("&", "&amp;")
        self.add_to_desc(item)

    def cleanup(self):
        """Strip the leading ":" separator from title and description, and
        cap the title at ``max_title_length`` characters."""
        self.vars["desc"] = TEXT_START_RE.sub("", self.vars["desc"])
        title = TEXT_START_RE.sub("", self.vars["title"])
        if len(title) > self.max_title_length:
            ellipsis = "..."
            # Reserve room for the ellipsis so the result never exceeds the
            # limit (the previous [:98] slice produced 101 characters).
            title = title[:self.max_title_length - len(ellipsis)] + ellipsis
        self.vars["title"] = title
159
160
161
class NewsItem(FeedItem):
    """Feed definition for the site's news page (``actualites.html``)."""

    # Channel metadata substituted into the feed header.
    feed_attrs = {
        "feed_url": "http://feeds.feedburner.com/manicore-actu",
        "logo": "http://manicore.com/ressources/moon.JPG",
        "desc": (u"Bienvenue chez Jean-Marc Jancovici, le seul consultant qui "
                 u"vous offre la Lune sans que vous ayez à la demander..."),
        "title": "Manicore",
    }
    # Only <p> elements carrying these attributes are treated as entries.
    page_item_attrs = {"align": "justify"}
    # Page that is downloaded, and base for resolving relative links.
    page_url = "http://manicore.com/actualites.html"
171
172
class FranceInfoItem(FeedItem):
    """Feed item for one entry of the France Info chronicles page.

    Each entry is rendered as a podcast item: title, link, date and an
    ``<enclosure>`` pointing at the ``.mp3`` file linked from the paragraph.
    """

    # Same "<day> <month name> <year>" shape as the base class, but tolerant
    # of leading non-digits and of trailing text.
    date_re = re.compile(r"[^\d]*(\d\d?)\s+(\S+)\s+(\d\d\d\d).*")
    page_url = "http://manicore.com/documentation/articles/France_info_chroniques.html"
    page_item_attrs = {"class": "normal"}
    feed_attrs = {
        "title": "Le Regard de Jean-Marc Jancovici",
        "desc": u"L'analyse de l'actualité de la semaine par Jean-Marc "
                u"Jancovici",
        "logo": "http://manicore.com/ressources/moon.JPG",
        "feed_url": "http://feeds.feedburner.com/franceinfo-jancovici",
    }
    # NOTE(review): when no .mp3 link is found, "enclosure" stays None and
    # this template renders url="None" -- confirm every entry has a podcast.
    feed_entry = u"""
        <item>
            <title>%(title)s</title>
            <link>%(link)s</link>
            <guid isPermaLink="false">%(guid)s</guid>
            <pubDate>%(date)s +0100</pubDate>
            <enclosure url="%(enclosure)s" length="%(length)d" type="audio/mpeg" />
        </item>
"""

    def __init__(self):
        super(FranceInfoItem, self).__init__()
        self.vars["enclosure"] = None
        self.vars["length"] = 0

    def check(self, item):
        """Return a truthy value only for paragraphs that have a ``<font>``
        containing an ``<img>``, and an ``<a>``."""
        return (item.font and item.font.img and item.a)

    def parse(self, item):
        """Fill ``self.vars`` from *item*: date, title and link come from the
        first ``<a>``, the enclosure from the first link ending in ".mp3"."""
        self.parse_date(item)
        anchor = item.a
        # .string is None when the anchor wraps nested markup; fall back to
        # its concatenated text so cleanup() always receives a string.
        self.vars["title"] = (anchor.string
                              or u"".join(anchor.findAll(text=True)))
        # Escape "&" for XML, as FeedItem.parse_link does for its links.
        self.vars["link"] = urljoin(self.page_url,
                                    anchor["href"]).replace("&", "&amp;")
        # The matcher also receives None for anchors that have no href.
        podcast = item.find(
            "a", href=lambda h: h is not None and h.endswith(".mp3"))
        if podcast is not None:
            self.vars["enclosure"] = urljoin(self.page_url, podcast["href"])
            self.get_podcast_length()

    def parse_date(self, item):
        """Read the date from the direct text children of *item*.

        The last text node matching ``date_re`` wins.  Nothing is stored when
        no text node matches.  Raises ValueError when the matched text cannot
        be parsed, even after applying the known spelling ``fixes``.
        """
        date = None
        for item_content in item.contents:
            if not isinstance(item_content, BeautifulSoup.NavigableString):
                continue
            match = self.date_re.match(item_content)
            if match is None:
                continue
            date = "%s %s %s" % (match.group(1), match.group(2),
                                 match.group(3))
        if not date:
            return
        date_format = "%d %B %Y"
        try:
            self.vars["date"] = datetime.datetime.strptime(
                                    date.encode("utf-8"), date_format)
        except ValueError:
            # Second attempt after correcting known misspellings.  The fixes
            # are applied to the extracted text: item.string is normally None
            # for a paragraph with several children, so the previous retry
            # could never succeed here.
            corrected = date
            for wrong, right in self.fixes.items():
                corrected = corrected.replace(wrong, right)
            if corrected == date:
                raise
            self.vars["date"] = datetime.datetime.strptime(
                                    corrected.encode("utf-8"), date_format)

    def get_podcast_length(self):
        """Store the size of the enclosure, read from its Content-Length
        header, in ``self.vars["length"]``.

        On a network error, or when the header is missing or malformed, a
        message is written to stderr and the length is left unchanged.
        """
        url = self.vars["enclosure"]
        try:
            response = urllib2.urlopen(url)
        except urllib2.URLError as e:
            sys.stderr.write("Error getting podcast info from %s: %s\n" %
                             (url, e))
            return
        try:
            content_length = response.info().getheader("Content-Length")
        finally:
            # Only the headers are needed; do not keep the connection open.
            response.close()
        try:
            self.vars["length"] = int(content_length)
        except (TypeError, ValueError):
            sys.stderr.write("No usable Content-Length for %s\n" % url)
242
243
244
# Command-line page name -> item class describing how to scrape that page.
PAGES = dict((
    ("actualites", NewsItem),
    ("franceinfo", FranceInfoItem),
))
248
249
250
def get_items(page_class):
    """Download ``page_class.page_url`` and return its parsed feed items.

    At most MAX_ITEMS ``<p>`` elements matching ``page_class.page_item_attrs``
    are taken from the ``<div id="contenu">`` of the page.  Each one accepted
    by ``check()`` is parsed and cleaned up into a ``page_class`` instance.

    The French locale is active only while the items are parsed, because the
    month names on the page are French.  The previous locale is restored
    afterwards, even if parsing fails, so later date formatting is unaffected.
    """
    response = urllib2.urlopen(page_class.page_url)
    try:
        page_text = response.read()
    finally:
        response.close()
    page_soup = BeautifulSoup.BeautifulSoup(page_text,
                                            convertEntities="html")
    page_content = page_soup.find("div", id="contenu")
    paragraphs = page_content.findAll("p", limit=MAX_ITEMS,
                                      attrs=page_class.page_item_attrs)

    items = []
    # Calling setlocale() without a value only queries the current setting.
    previous_locale = locale.setlocale(locale.LC_ALL)
    locale.setlocale(locale.LC_ALL, 'fr_FR.UTF-8')
    try:
        for page_item in paragraphs:
            item = page_class()
            if not item.check(page_item):
                continue
            item.parse(page_item)
            item.cleanup()
            items.append(item)
    finally:
        locale.setlocale(locale.LC_ALL, previous_locale)
    return items
273
274
def build_feed(page_class, items):
    """Assemble the complete RSS document and return it as UTF-8 bytes.

    The channel header is filled from ``page_class.feed_attrs``, BASE_URL and
    the current UTC time.  Items without a date are reported on stderr and
    left out of the feed.
    """
    from email.utils import formatdate

    feed_info = page_class.feed_attrs.copy()
    feed_info["base_url"] = BASE_URL
    # formatdate() always uses English day and month names, unlike strftime()
    # which follows the process locale.  FEED_HEAD appends " GMT" itself, so
    # the suffix is dropped here.
    feed_info["now"] = formatdate(usegmt=True)[:-len(" GMT")]

    parts = [FEED_HEAD % feed_info]
    for item in items:
        if not item.vars["date"]:
            message = (u"Could not parse date for item: %s\n"
                       % item.render())
            # Encode explicitly: writing unicode to a redirected stderr would
            # fall back to ASCII and fail on accented characters.
            sys.stderr.write(message.encode("utf-8"))
            continue
        parts.append(unicode(item.render()))
    parts.append(FEED_TAIL)

    return u"".join(parts).encode("utf-8")
290
291
def parse_opts():
    """Parse the command line and return ``(options, page_name)``.

    Exactly one positional argument is required and it must be a key of
    PAGES; otherwise the parser prints an error and exits.
    """
    parser = OptionParser()
    parser.add_option("-o", "--output", help="Write the feed to this file")
    opts, args = parser.parse_args()
    # Sorted so that the message is the same on every run.
    available = ", ".join(sorted(PAGES))
    if len(args) != 1:
        message = ("You must provide the page name as argument. "
                   "Available pages: %s" % available)
        parser.error(message)
    page_name = args[0]
    if page_name not in PAGES:
        # Fail with a usage error instead of a KeyError traceback later on.
        parser.error("Unknown page: %s. Available pages: %s"
                     % (page_name, available))
    return opts, page_name
300
301
def main():
    """Build the feed for the page named on the command line and write it to
    the ``--output`` file, or to standard output when none is given."""
    opts, page_name = parse_opts()
    page_class = PAGES[page_name]
    items = get_items(page_class)
    feed = build_feed(page_class, items)
    if opts.output is None:
        sys.stdout.write(feed)
        sys.stdout.write("\n")
        return
    # The feed is already UTF-8 encoded, so it is written in binary mode.
    # The with-block closes the file even if the write fails.
    with open(opts.output, "wb") as out:
        out.write(feed)
a5c46b2 by Aurélien Bompard at 2010-06-28 313
314
073eb59 by Aurélien Bompard at 2011-04-23 315
# Generate the feed only when run as a script, not when imported as a module.
if __name__ == "__main__":
    main()