a5c46b2 by Aurélien Bompard at 2010-06-28 1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
# vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent:
4
u"""
5
Crée un flux RSS avec les interventions de Jean-Marc Jancovici sur France
ddb077e by Aurélien Bompard at 2010-06-28 6
Info.
a5c46b2 by Aurélien Bompard at 2010-06-28 7
8
Les fichiers sonores sont placés dans des balises ``<enclosure/>`` pour faciliter le podcasting.
9
10
:Authors:
11
    Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org>
12
13
:License:
14
    GNU GPL v3 or later
15
"""
16
17
import os
18
import sys
19
import urllib2
2ec4c8c by Aurélien Bompard at 2010-10-11 20
import urlparse
a5c46b2 by Aurélien Bompard at 2010-06-28 21
import re
22
import BeautifulSoup
23
from pprint import pprint
24
25
# RSS feed that lists the broadcasts; it is downloaded and rewritten below.
list_url = "http://www.france-info.com/rss/Le_regard_de_Jean-Marc_Jancovici.xml"
# Base URL against which the MP3 paths scraped from item pages are resolved.
base_url = "http://www.france-info.com"

# Matches the ``jstoflash('play', ...)`` JavaScript call found in the
# ``onclick`` attribute of the play link; group 1 captures the MP3 path.
# A raw string is used so that the regex escapes (``\(``, ``\.``, ``\)``)
# reach the ``re`` module as written instead of relying on Python leaving
# unrecognised string escapes untouched.
js_re = re.compile(r"jstoflash\('play','.*','','','([^']*\.mp3)'\);")
a5c46b2 by Aurélien Bompard at 2010-06-28 29
30
def _read_url(url):
    """Return the body served at *url*, closing the HTTP response afterwards."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


def _fetch_headers(url):
    """Return the response headers for *url* without downloading the body.

    The connection is closed as soon as the headers are available so the
    (potentially large) audio file is not left open on the socket.
    """
    response = urllib2.urlopen(url)
    try:
        return response.info()
    finally:
        response.close()


def _find_podcast_url(page_html):
    """Return the absolute MP3 URL advertised on an item page, or None.

    The URL is taken from the ``onclick`` attribute of the
    ``<a class="p_ficheEcouter">`` link, parsed with ``js_re`` and resolved
    against ``base_url``.  None is returned when the link, its ``onclick``
    attribute or the expected JavaScript call is missing.
    """
    page = BeautifulSoup.BeautifulSoup(page_html)
    play_link = page.find("a", {"class": "p_ficheEcouter"})
    if play_link is None:
        return None
    match = js_re.search(play_link.get("onclick", ""))
    if match is None:
        return None
    return urlparse.urljoin(base_url, match.group(1))


list_page = _read_url(list_url)
list_content = BeautifulSoup.BeautifulStoneSoup(list_page)

# In the parsed tree the elements that should be direct children of
# <channel> are found nested inside <atom:link> (presumably because the
# parser does not treat <atom:link/> as self-closing -- TODO confirm).
# Move them back into <channel>, then drop the emptied <atom:link>.
atom_link = list_content.find("atom:link")
if atom_link is not None:
    for child in atom_link.findAll(recursive=False):
        list_content.channel.insert(-1, child)
    atom_link.extract()

for item in list_content.channel.findAll("item"):
    link = item.link.string
    try:
        item_page = _read_url(link)
    except urllib2.URLError as e:
        # One unreachable page should not prevent the feed from being built.
        sys.stderr.write("Error getting item page %s: %s\n" % (link, e))
        continue

    podcast_url = _find_podcast_url(item_page)
    if podcast_url is None:
        # Page layout differs from what is expected: leave the item as is.
        sys.stderr.write("No podcast link found on %s\n" % link)
        continue

    enclosure = item.enclosure
    enclosure["url"] = podcast_url
    try:
        podcast_info = _fetch_headers(podcast_url)
    except urllib2.URLError as e:
        sys.stderr.write("Error getting podcast info from %s: %s\n" %
                         (podcast_url, e))
        enclosure["type"] = "audio/mpeg"  # reasonable fallback
        # The length is unknown, so remove the attribute.
        del enclosure["length"]
        continue
    enclosure["type"] = podcast_info.getheader("Content-Type")
    enclosure["length"] = podcast_info.getheader("Content-Length")

# Parenthesised form prints the same text under the Python 2 print statement.
print(list_content.prettify())