| 1 |
#!/bin/python |
| 2 |
|
| 3 |
#TODO: This needs to be resilient so when one subclass returns an error, the |
| 4 |
# whole thing doesn't crash. |
| 5 |
#TODO: what about http:// vs. http://www ... must be careful |
| 6 |
#TODO: time is time committed, not time the feed was grabbed... |
| 7 |
#TODO: put database table names in a variable or something |
| 8 |
#TODO: need to get rid of latin-1 encoding for SQL queries |
| 9 |
#TODO: don't check feeds that haven't been updated since we last checked them |
| 10 |
# -- can only do this on feeds whose contents won't change |
| 11 |
# (LastFM, Microblogs), but... what about deleted content?? |
| 12 |
#TODO: tags and categories, whitelist through subclasses for services that support it |
| 13 |
#TODO: I'm ignoring the published and published_parsed dates entirely |
| 14 |
#TODO: make sure the user defined classes and wonky feeds don't BREAK the whole script! |
| 15 |
|
| 16 |
import feedparser |
| 17 |
import entry |
| 18 |
import time |
| 19 |
import re |
| 20 |
|
| 21 |
"""An object representing an RSS/Atom feed in a unified, simplified standard |
| 22 |
format. This object is the universal mold for which feeds from various services |
| 23 |
can be understood. It can be subclassed in order to handle the nuances or |
| 24 |
quirks of any particular web service. |
| 25 |
""" |
| 26 |
class Feed:
    """Unified, simplified representation of a single RSS/Atom feed.

    The URL is handed to ``feedparser.parse`` when the object is created.
    ``parse()`` then copies the interesting fields onto the instance and wraps
    every raw entry in ``entryClass``.  Subclasses override ``entryClass``
    (and, where needed, ``parse`` or ``get_timestamp``) to deal with the
    quirks of a particular web service.
    """

    # Class used to wrap each raw feedparser entry; subclasses override it.
    entryClass = entry.Entry

    def __init__(self, url):
        """Store the feed URL and parse it with feedparser.

        url - address of the feed
        """
        self.href = url
        self.feed = feedparser.parse(self.href)

    #TODO: what happens on failure?
    def parse(self):
        """Copy the desired data from the feedparser result onto this object.

        Sets ``version``, ``encoding``, ``title``, ``subtitle``, ``link``,
        ``updated_ts`` and ``entries``.

        Returns True once the data has been stored, or None (nothing is set)
        when the feedparser result is empty.
        """
        if not self.feed:
            return

        # Attributes found directly on the parse result
        for attr in ('version', 'encoding'):
            setattr(self, attr, getattr(self.feed, attr, ""))

        # Other attributes buried deeper, on the nested feed object
        for attr in ('title', 'subtitle', 'link'):
            setattr(self, attr, getattr(self.feed.feed, attr, ""))

        # Strip HTML tags from the subtitle
        self.subtitle = re.sub(r'<[^>]*?>', '', self.subtitle or "")

        # Timestamp for last update
        self.updated_ts = self.get_timestamp()

        # Wrap every raw entry in the service-specific entry class
        self.entries = [self.entryClass(raw) for raw in self.feed.entries]
        return True

    def get_timestamp(self):
        """Return the feed's 'updated' time as an integer Unix timestamp.

        Falls back to the current time when the parse result carries no value
        that ``time.mktime`` accepts.
        """
        # NOTE(review): depending on the feedparser release, the parsed time
        # tuple lives under 'updated_parsed' or directly under 'updated'
        # (which may instead be a raw string) -- confirm against the
        # installed version.  Anything mktime rejects is skipped.
        for name in ('updated_parsed', 'updated'):
            stamp = getattr(self.feed, name, None)
            if stamp is None:
                continue
            try:
                return int(time.mktime(stamp))
            except (TypeError, ValueError, OverflowError):
                continue
        return int(time.time())

    def toXML(self, doc):
        """Output the feed to XML using a DOM object.

        doc - DOM document used to create the nodes

        Returns a populated 'feed' element.
        """
        xml = doc.createElement('feed')

        # 'updated' is not set anywhere in this class, so every attribute is
        # read with a default instead of raising AttributeError.
        for attr in ('version', 'encoding', 'href', 'link', 'updated'):
            value = getattr(self, attr, None)
            if value:
                xml.setAttribute(attr, value)

        for element in ('title', 'subtitle'):
            text = getattr(self, element, None)
            if text:
                node = doc.createElement(element)
                node.appendChild(doc.createTextNode(text))
                xml.appendChild(node)

        # 'entries' only exists after a successful parse()
        for item in getattr(self, 'entries', ()):
            xml.appendChild(item.toXML(doc))

        return xml

    def db_check(self, cursor, verbose=False, force=False):
        """Ensure this feed and its contents are in the database and
        up-to-date.

        cursor  - database cursor
        verbose - print progress messages
        force   - update all entries regardless of whether or not an update
                  seems to be necessary
        """
        if self.db_has(cursor):
            self.db_update(cursor, verbose, force)
        else:
            self.db_insert(cursor, verbose, force)

    def db_get_feed_id(self, cursor):
        """Return the ID of this feed in the database.

        pre-condition: feed must exist in the database
        """
        # Parameters are passed as a one-element tuple: DB-API drivers expect
        # a sequence, and some iterate over a bare string character by
        # character.
        cursor.execute("SELECT `id` FROM `feeds` WHERE `href` = %s",
                       (self.href,))
        return cursor.fetchone()[0]

    def db_has(self, cursor):
        """Check to see if this feed exists in the database."""
        cursor.execute("SELECT 1 FROM feeds WHERE href = %s LIMIT 1",
                       (self.href,))
        # fetchone() is used rather than rowcount, which drivers may report
        # as -1 after a SELECT.
        return cursor.fetchone() is not None

    def _db_check_entries(self, cursor, verbose, force):
        """Run db_check on every entry, looking the feed ID up only once."""
        if not self.entries:
            return
        feed_id = self.db_get_feed_id(cursor)
        for item in self.entries:
            item.db_check(cursor, feed_id, verbose, force)

    def db_insert(self, cursor, verbose=False, force=False):
        """Insert feed information into the database, then check its entries.

        pre-condition: db_has must be called before to avoid duplicate entries
        """
        cursor.execute(
            """INSERT INTO feeds (href, title, subtitle, link, encoding,
            version, rec_created, rec_updated, type)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (self.href, self.title, self.subtitle, self.link, self.encoding,
             self.version, int(time.time()), self.updated_ts,
             self.__class__.__name__))

        if verbose:
            # Single-argument call form prints the same on Python 2 and 3
            print("\tFeed inserted: %s" % self.title)

        #TODO: raise error upon failure

        # Check entries (even new entries could be duplicates if href changes)
        self._db_check_entries(cursor, verbose, force)

    def db_needs_update(self, cursor):
        """Return True if the feed timestamp is more recent than the
        timestamp in the database (i.e. the database is out of date); False
        otherwise.
        """
        cursor.execute(
            """SELECT COUNT(*) FROM feeds
            WHERE href=%s AND rec_updated < %s""",
            (self.href, self.updated_ts))
        return cursor.fetchone()[0] > 0

    def db_update(self, cursor, verbose=False, force=False):
        """Update feed information in the database, then check its entries.

        If force is true, do the update regardless, otherwise check
        db_needs_update() to see if it's necessary.

        pre-condition: db_has must be called first to avoid errors
        """
        if not (force or self.db_needs_update(cursor)):
            if verbose:
                print("\tNo update necessary: %s" % self.title)
            return

        cursor.execute(
            """UPDATE feeds SET title=%s, subtitle=%s, link=%s, encoding=%s,
            version=%s, rec_updated=%s, type=%s WHERE href=%s""",
            (self.title, self.subtitle, self.link, self.encoding,
             self.version, self.updated_ts, self.__class__.__name__,
             self.href))
        if verbose:
            print("\tFeed updated: %s" % self.title)

        # Check entries (some might be new, some might be old)
        self._db_check_entries(cursor, verbose, force)
| 168 |
|
| 169 |
"""These are essentially abstract classes.""" |
| 170 |
class Blog(Feed):
    """Feed whose entries are wrapped in entry.BlogEntry.

    Overrides get_timestamp() to try several places where the feed's last
    update time may be stored.
    """

    entryClass = entry.BlogEntry

    def get_timestamp(self):
        """Return the feed's last-update time as an integer Unix timestamp.

        Sources are tried in order: the parse result's 'updated', the nested
        feed's 'lastbuilddate' string, the nested feed's 'updated_parsed',
        and finally the current time.  A source whose value cannot be
        converted is skipped instead of aborting the lookup.
        """
        # Local import keeps this block self-contained
        import email.utils

        def to_stamp(value):
            # mktime interprets the tuple as local time; None means "unusable"
            try:
                return int(time.mktime(value))
            except (TypeError, ValueError, OverflowError):
                return None

        inner = self.feed.feed

        if hasattr(self.feed, 'updated'):
            stamp = to_stamp(self.feed.updated)
            if stamp is not None:
                return stamp

        built = getattr(inner, 'lastbuilddate', None)
        if built:
            try:
                parsed = time.strptime(built, '%a, %d %b %Y %H:%M:%S %Z')
            except (TypeError, ValueError):
                # %Z rejects numeric offsets such as "+0000"; fall back to
                # the RFC 2822 date parser, which returns None on failure.
                parsed = email.utils.parsedate(built)
            stamp = to_stamp(parsed)
            if stamp is not None:
                return stamp

        stamp = to_stamp(getattr(inner, 'updated_parsed', None))
        if stamp is not None:
            return stamp

        return int(time.time())
| 181 |
|
| 182 |
class Comments(Feed):
    """Feed variant that uses entry.Comment as its entry class."""

    entryClass = entry.Comment
| 184 |
|
| 185 |
class MicroBlog(Feed):
    """Feed variant that uses entry.MicroBlogEntry as its entry class."""

    entryClass = entry.MicroBlogEntry
| 187 |
|
| 188 |
class SharedItems(Feed):
    """Feed variant that uses entry.SharedItem as its entry class."""

    entryClass = entry.SharedItem
| 190 |
|
| 191 |
class Favorites(SharedItems):
    """SharedItems variant that uses entry.Favorite as its entry class."""

    entryClass = entry.Favorite
| 193 |
|
| 194 |
class Multimedia(SharedItems):
    """SharedItems variant that uses entry.MultimediaContent as its entry
    class."""

    entryClass = entry.MultimediaContent
| 196 |
|
| 197 |
class StatusUpdates(Feed):
    """Feed variant that uses entry.StatusUpdate as its entry class."""

    entryClass = entry.StatusUpdate
| 199 |
|
| 200 |
|
| 201 |
"""These classes should be used directly.""" |
| 202 |
class BackType(Comments):
    """Comments feed that uses entry.BackTypeComment as its entry class."""

    entryClass = entry.BackTypeComment
| 204 |
|
| 205 |
class BlipFM(MicroBlog):
    """MicroBlog feed that uses entry.BlipFMUpdate as its entry class."""

    entryClass = entry.BlipFMUpdate
| 207 |
|
| 208 |
class Delicious(SharedItems):
    """SharedItems feed that uses entry.DeliciousBookmark as its entry
    class."""

    entryClass = entry.DeliciousBookmark
| 210 |
|
| 211 |
class Digg(SharedItems):
    """SharedItems feed that uses entry.DuggItem as its entry class."""

    entryClass = entry.DuggItem
| 213 |
|
| 214 |
class FacebookPostedItems(SharedItems):
    """SharedItems feed that uses entry.FacebookPostedItem as its entry
    class."""

    entryClass = entry.FacebookPostedItem
| 216 |
|
| 217 |
class FacebookStatusUpdates(StatusUpdates):
    """StatusUpdates feed that uses entry.FacebookStatusUpdate as its entry
    class."""

    entryClass = entry.FacebookStatusUpdate
| 219 |
|
| 220 |
class Flickr(Multimedia):
    """Multimedia feed that uses entry.FlickrPhoto as its entry class and
    prefixes the feed title with 'Flickr '."""

    entryClass = entry.FlickrPhoto

    def parse(self):
        """Parse the feed, then make the title more descriptive.

        Flickr's feed title isn't very descriptive, so 'Flickr ' is put in
        front of it.

        Returns whatever Multimedia.parse returned.
        """
        success = Multimedia.parse(self)
        # Only touch the title when the parent actually stored one; on a
        # failed parse there is no self.title to extend.
        if success:
            self.title = 'Flickr ' + self.title
        return success
| 227 |
|
| 228 |
class Gitorious(Feed):
    """Feed variant that uses entry.GitoriousUpdate as its entry class."""

    entryClass = entry.GitoriousUpdate
| 230 |
|
| 231 |
class GoogleReaderSharedItems(SharedItems):
    """SharedItems feed that uses entry.GoogleReaderSharedItem as its entry
    class and reads its timestamp from the nested feed object."""

    entryClass = entry.GoogleReaderSharedItem

    def get_timestamp(self):
        """Return the nested feed's 'updated_parsed' time as an integer Unix
        timestamp.

        Falls back to the inherited get_timestamp() when that field is
        missing, rather than raising.
        """
        #TODO: what's the pattern?
        parsed = getattr(self.feed.feed, 'updated_parsed', None)
        if parsed is None:
            return SharedItems.get_timestamp(self)
        return int(time.mktime(parsed))
| 235 |
|
| 236 |
class Identica(MicroBlog):
    """MicroBlog feed that uses entry.IdenticaUpdate as its entry class and
    prefixes the feed title with 'Identi.ca - '."""

    entryClass = entry.IdenticaUpdate

    def parse(self):
        """Parse the feed, then make the title more descriptive.

        Identica's feed title isn't very descriptive, so 'Identi.ca - ' is
        put in front of it.

        Returns whatever MicroBlog.parse returned.
        """
        success = MicroBlog.parse(self)
        # Only touch the title when the parent actually stored one; on a
        # failed parse there is no self.title to extend.
        if success:
            self.title = 'Identi.ca - ' + self.title
        return success
| 244 |
|
| 245 |
class IdenticaFavorites(Favorites):
    """Favorites feed that uses entry.IdenticaFavorite as its entry class."""

    entryClass = entry.IdenticaFavorite
| 247 |
|
| 248 |
class LastFM(SharedItems):
    """SharedItems feed that uses entry.LastFMPlay as its entry class."""

    entryClass = entry.LastFMPlay
| 250 |
|
| 251 |
class LibreFM(SharedItems):
    """SharedItems feed that uses entry.LibreFMPlay as its entry class and
    takes its link from the first entry of the feed's 'links' list."""

    entryClass = entry.LibreFMPlay

    def parse(self):
        """Parse the feed, then replace the link with the RDF resource.

        Libre.fm is doing some RDF things that are different: the link is
        read from the 'rdf:resource' key of the first item in the nested
        feed's 'links'.

        Returns whatever SharedItems.parse returned.
        """
        success = SharedItems.parse(self)
        if success:
            try:
                self.link = self.feed.feed.links[0]['rdf:resource']
            except (AttributeError, IndexError, KeyError):
                # No RDF link available: keep the link stored by the generic
                # parse instead of aborting on an unexpected feed shape.
                pass
        return success
| 259 |
|
| 260 |
class MySpaceBlog(Blog):
    """Blog feed that uses entry.MySpaceBlogEntry as its entry class."""

    entryClass = entry.MySpaceBlogEntry
| 262 |
|
| 263 |
class TTRSSPublishedItems(SharedItems):
    """SharedItems feed that uses entry.TTRSSPublishedItem as its entry
    class."""

    entryClass = entry.TTRSSPublishedItem
| 265 |
|
| 266 |
class Twitter(MicroBlog):
    """MicroBlog feed that uses entry.TwitterUpdate as its entry class."""

    entryClass = entry.TwitterUpdate
| 268 |
|
| 269 |
class TwitterFavorites(Favorites):
    """Favorites feed that uses entry.TwitterFavorite as its entry class."""

    entryClass = entry.TwitterFavorite
| 271 |
|
| 272 |
class TwitterSearch(Twitter):
    """Twitter feed that uses entry.TwitterSearchResult as its entry class."""

    entryClass = entry.TwitterSearchResult
| 274 |
|
| 275 |
class WordPress(Blog):
    """Blog feed that uses entry.WordPressEntry as its entry class."""

    entryClass = entry.WordPressEntry
| 277 |
|
| 278 |
class YouTube(Multimedia):
    """Multimedia feed that uses entry.YouTubeVideo as its entry class."""

    entryClass = entry.YouTubeVideo
| 280 |
|
| 281 |
class YouTubeFavorites(Favorites):
    """Favorites feed that uses entry.YouTubeFavorite as its entry class."""

    entryClass = entry.YouTubeFavorite