| a5c46b2 by Aurélien Bompard at 2010-06-28 |
1 |
#!/usr/bin/env python |
|
2 |
# -*- coding: utf-8 -*- |
|
3 |
# vim: set fileencoding=utf-8 tabstop=4 shiftwidth=4 expandtab smartindent: |
|
4 |
u""" |
|
5 |
Crée un flux RSS avec les interventions de Jean-Marc Jancovici sur France |
| ddb077e by Aurélien Bompard at 2010-06-28 |
6 |
Info. |
| a5c46b2 by Aurélien Bompard at 2010-06-28 |
7 |
|
|
8 |
Les fichiers sonores sont placés dans des balises ``<enclosure/>`` pour faciliter le podcasting. |
|
9 |
|
|
10 |
:Authors: |
|
11 |
Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org> |
|
12 |
|
|
13 |
:License: |
|
14 |
GNU GPL v3 or later |
|
15 |
""" |
|
16 |
|
|
17 |
import os |
|
18 |
import sys |
|
19 |
import urllib2 |
| 2ec4c8c by Aurélien Bompard at 2010-10-11 |
20 |
import urlparse |
| a5c46b2 by Aurélien Bompard at 2010-06-28 |
21 |
import re |
|
22 |
import BeautifulSoup |
|
23 |
from pprint import pprint |
|
24 |
|
|
25 |
# URL of the RSS feed that lists the episodes, and the site root against
# which the audio paths extracted from the episode pages are resolved.
list_url = "http://www.france-info.com/rss/Le_regard_de_Jean-Marc_Jancovici.xml"
base_url = "http://www.france-info.com"

# Matches the JavaScript ``jstoflash('play', ...)`` call and captures the
# path of the MP3 file (group 1). Written as a raw string so that the regex
# escapes (``\(``, ``\.``, ``\)``) are not treated as string escapes.
js_re = re.compile(r"jstoflash\('play','.*','','','([^']*\.mp3)'\);")
| a5c46b2 by Aurélien Bompard at 2010-06-28 |
29 |
|
|
30 |
list_page = urllib2.urlopen(list_url).read() |
|
31 |
list_content = BeautifulSoup.BeautifulStoneSoup(list_page) |
|
32 |
for item in list_content.find("atom:link").findAll(recursive=False): |
|
33 |
list_content.channel.insert(-1, item) |
|
34 |
list_content.find("atom:link").extract() |
|
35 |
for item in list_content.channel.findAll("item"): |
|
36 |
#print item.prettify() |
|
37 |
link = item.link.string |
|
38 |
#print link |
|
39 |
item_page = urllib2.urlopen(link).read() |
|
40 |
item_content = BeautifulSoup.BeautifulSoup(item_page) |
|
41 |
play_links = item_content.find("a", {"class": "p_ficheEcouter"}) |
|
42 |
js_link = play_links["onclick"] |
|
43 |
js_mo = js_re.search(js_link) |
|
44 |
podcast_rel = js_mo.group(1) |
| 2ec4c8c by Aurélien Bompard at 2010-10-11 |
45 |
podcast_url = urlparse.urljoin(base_url, podcast_rel) |
|
46 |
#print "URL:", podcast_url |
| a5c46b2 by Aurélien Bompard at 2010-06-28 |
47 |
item.enclosure["url"] = podcast_url |
| 2ec4c8c by Aurélien Bompard at 2010-10-11 |
48 |
try: |
|
49 |
podcast_info = urllib2.urlopen(podcast_url).info() |
|
50 |
except urllib2.URLError, e: |
|
51 |
sys.stderr.write("Error getting podcast info from %s: %s\n" % |
|
52 |
(podcast_url, e)) |
|
53 |
item.enclosure["type"] = "audio/mpeg" # reasonable fallback |
|
54 |
del item.enclosure["length"] |
|
55 |
continue |
| a5c46b2 by Aurélien Bompard at 2010-06-28 |
56 |
item.enclosure["type"] = podcast_info.getheader("Content-Type") |
|
57 |
item.enclosure["length"] = podcast_info.getheader("Content-Length") |
|
58 |
|
|
59 |
print list_content.prettify() |