| 1 |
#!/bin/python |
| 2 |
# coding: utf-8 |
| 3 |
|
| 4 |
#TODO: Check number of comments, if a comments feed exists |
| 5 |
|
| 6 |
import time |
| 7 |
import re |
| 8 |
import unicodedata |
| 9 |
from xml.dom import minidom |
| 10 |
|
| 11 |
"""An object representing an entry in an RSS/Atom feed. This is the universal |
| 12 |
object for representing any entry in a feed from any service. It can be |
| 13 |
subclassed in order to handle the nuances or quirks of any particular web |
| 14 |
service. |
| 15 |
""" |
| 16 |
class Entry:
    """Universal representation of one entry in an RSS/Atom feed.

    Wraps a feedparser entry, copies the interesting fields onto the
    instance, and knows how to serialise itself to XML and how to keep the
    ``entries``, ``tags`` and ``tag_relationships`` tables in sync with the
    feed.  Subclass it to handle the nuances or quirks of a particular web
    service.
    """

    def __init__(self, entry):
        """Store the entry from feedparser and parse it."""
        self.entry = entry
        self.parse()

    def parse(self):
        """Parse the entry and store the desired data on this object."""
        attrs = ['author', 'link', 'comments_feed', 'comments', 'title',
                 'updated', 'updated_parsed', 'summary']
        for attr in attrs:
            # Missing attributes default to the empty string.
            setattr(self, attr, getattr(self.entry, attr, ""))

        self.content = self.get_content()
        self.updated_ts = self.get_timestamp()

        # this will save so many headaches TODO: more of this?
        self.title = self._to_ascii(self.title)
        self.summary = self._to_ascii(self.summary)

        # For consistency, ignore case
        self.tags = [tag.lower() for tag in self.get_tags()]

    def _to_ascii(self, value):
        """Return unicode text as ASCII bytes using XML character references.

        Anything that is not unicode text (for instance the "" default used
        for a missing attribute) is returned untouched, because
        ``unicodedata.normalize`` only accepts unicode text.
        """
        if isinstance(value, type(u"")):
            decomposed = unicodedata.normalize('NFKD', value)
            return decomposed.encode('ASCII', 'xmlcharrefreplace')
        return value

    def get_content(self):
        """Retrieve the main content of the story ("" if there is none)."""
        if hasattr(self.entry, 'content'):
            return self.entry.content[0].value
        return ""

    def get_timestamp(self):
        """Retrieve the 'updated' timestamp as integer seconds since epoch.

        The time tuple is interpreted as UTC (which is what the previous
        ``mktime(...) - time.timezone`` arithmetic was approximating);
        ``calendar.timegm`` does that conversion without involving the local
        timezone or DST.

        Raises AttributeError if the entry has no ``updated_parsed``.
        """
        # Local import: the module header does not import calendar.
        import calendar
        return int(calendar.timegm(self.entry.updated_parsed))

    def get_tags(self):
        """Retrieve a list of tags on this entry from the feed.

        Tags whose ``term`` is None or empty are skipped so that callers can
        safely treat every returned tag as a non-empty string.
        """
        if not hasattr(self.entry, 'tags'):
            return []
        return [tag['term'] for tag in self.entry.tags if tag['term']]

    def toXML(self, doc):
        """Output the entry as an <entry> element created from DOM ``doc``.

        Non-empty simple fields become attributes; title, summary and
        content become child elements holding a text node.
        """
        xml = doc.createElement('entry')

        for attr in ['author', 'link', 'comments_feed', 'comments', 'updated']:
            value = getattr(self, attr)
            if value:
                xml.setAttribute(attr, value)

        for element in ['title', 'summary', 'content']:
            value = getattr(self, element)
            if value:
                child = doc.createElement(element)
                child.appendChild(doc.createTextNode(value))
                xml.appendChild(child)

        return xml

    def db_check(self, cursor, feed_id, verbose=False, force=False):
        """Ensure the entry is in the database and up-to-date.

        cursor  - database cursor for performing queries
        feed_id - the ID of the feed from which the entry hails
        verbose - output more information if set to True
        force   - update regardless if set to True
        """
        self.entry_id = self.db_get_entry_id(cursor)
        if self.entry_id:
            self.db_update(cursor, feed_id, verbose, force)
        else:
            self.db_insert(cursor, feed_id, verbose)

    def db_get_entry_id(self, cursor):
        """Return the unique ID of this entry in the database, or None.

        cursor - database cursor for performing queries

        Raises Exception if more than one row matches the entry's link.
        """
        # Parameters are passed as a sequence, as the DB-API specifies.
        cursor.execute("SELECT id FROM entries WHERE link=%s", (self.link,))
        if cursor.rowcount == 0:
            return None
        if cursor.rowcount == 1:
            return cursor.fetchone()[0]
        raise Exception("Cannot determine unqiue entry_id")

    def db_insert(self, cursor, feed_id, verbose=False):
        """Insert the entry and its tags into the database.

        pre-condition: the entry must not be in the database yet (db_check
        takes care of this via db_get_entry_id)

        cursor  - database cursor for performing queries
        feed_id - the ID of the feed from which the entry hails
        verbose - output more information if set to True
        """
        cursor.execute(
            """INSERT INTO entries (link, title, author, summary, content,
                                    rec_created, rec_updated, comments,
                                    comments_feed, feed_id)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (self.link, self.title, self.author, self.summary, self.content,
             int(time.time()), self.updated_ts,
             self.comments, self.comments_feed, feed_id))
        self.entry_id = cursor.lastrowid

        # if verbose -- nah, this is important
        print("\t\tEntry inserted: %s" % self.title)
        #TODO: raise error upon failure

        # Add tags
        for tag in self.tags:
            self.db_add_tag(cursor, tag, verbose)

    def db_needs_update(self, cursor):
        """Return True if the feed's updated timestamp is more recent than
        the timestamp in the database (i.e. the database is out of date);
        False otherwise.
        """
        cursor.execute(
            """SELECT COUNT(*) FROM entries
               WHERE link=%s AND rec_updated < %s""",
            (self.link, self.updated_ts))
        return cursor.fetchone()[0] == 1

    def db_update(self, cursor, feed_id, verbose=False, force=False):
        """Update the entry and its tag links in the database.

        pre-condition: self.entry_id must identify the existing row
        (db_check sets it before calling this)

        cursor  - database cursor for performing queries
        feed_id - the ID of the feed from which the entry hails
        verbose - output more information if set to True
        force   - update regardless if set to True
        """
        if not (force or self.db_needs_update(cursor)):
            if verbose:
                print("\t\tNo update necessary: %s" % self.title)
            return

        # Only updating fields which are expected to change (e.g. not links)
        cursor.execute(
            """UPDATE entries SET title=%s, author=%s, summary=%s,
                      content=%s, rec_updated=%s, comments=%s,
                      comments_feed=%s
               WHERE id=%s""",
            (self.title, self.author, self.summary, self.content,
             self.updated_ts, self.comments, self.comments_feed,
             self.entry_id))

        # Check tags
        current_tags = self.db_get_tags(cursor)

        # Tag entry if not already tagged
        for tag in self.tags:
            if tag not in current_tags:
                self.db_add_tag(cursor, tag, verbose)

        # Remove tags which no longer exist in the feed
        for tag in current_tags:
            if tag not in self.tags:
                self.db_remove_tag(cursor, tag, verbose)

        if verbose:
            # One line, matching the other status messages of this class.
            print("\t\tEntry updated: %s" % self.title)

        #TODO: raise error upon failure

    def db_add_tag(self, cursor, tag, verbose=False):
        """Add tag to entry, if not tagged as such already.  If the tag
        doesn't exist in the database, create it.

        Precondition: self.entry_id must be set

        cursor  - database cursor for performing queries
        tag     - the tag to add (string)
        verbose - output more information if set to True
        """
        if not hasattr(self, 'entry_id'):
            raise Exception('entry_id must be set')

        added = False
        tag_id = self.db_get_tag_id(cursor, tag)
        if not tag_id:
            tag_id = self.db_insert_tag(cursor, tag)
            added = True

        if not self.db_tagged_with(cursor, tag_id):
            cursor.execute(
                """INSERT INTO tag_relationships (entry_id, tag_id)
                   VALUES (%s,%s) """, (self.entry_id, tag_id))
            #TODO: error checking

        if verbose:
            if added:
                print("\t\t\tTagged with %s [new]" % (tag))
            else:
                print("\t\t\tTagged with %s" % (tag))

    def db_remove_tag(self, cursor, tag, verbose=False):
        """Untag the entry.  Does not delete the tag from the database.

        Raises Exception if entry_id is not set or the tag is unknown.
        """
        if not hasattr(self, 'entry_id'):
            raise Exception('entry_id must be set')

        tag_id = self.db_get_tag_id(cursor, tag)
        if not tag_id:
            raise Exception('cannot find tag')

        cursor.execute(
            """DELETE FROM tag_relationships
               WHERE tag_id=%s AND entry_id=%s""", (tag_id, self.entry_id))
        #TODO: error checking

        if verbose:
            print("\t\t\tUntagged with %s" % (tag))

    def db_get_tag_id(self, cursor, tag):
        """Return the tag's ID from the database, or None if it is unknown."""
        cursor.execute("SELECT id FROM tags WHERE name=%s", (tag,))
        row = cursor.fetchone()
        if row is None:
            return None
        return row[0]

    def db_get_tags(self, cursor):
        """Return a list of the names of all tags linked to this entry.

        Raises Exception if entry_id is not set.
        """
        if not hasattr(self, 'entry_id'):
            raise Exception('entry_id must be set')

        #TODO: error checking
        cursor.execute(
            """SELECT t.name
               FROM tags t
               LEFT OUTER JOIN tag_relationships r ON t.id = r.tag_id
               WHERE r.entry_id=%s""", (self.entry_id,))

        return [row[0] for row in cursor.fetchall()]

    def db_insert_tag(self, cursor, tag):
        """Insert tag into the database.  Returns the insert id."""
        cursor.execute("INSERT INTO tags SET name=%s", (tag,))
        return cursor.lastrowid

    def db_tagged_with(self, cursor, tag_id):
        """Return True if the entry is linked to tag_id, False otherwise."""
        cursor.execute(
            """SELECT * FROM tag_relationships
               WHERE entry_id=%s AND tag_id=%s""", (self.entry_id, tag_id))
        return cursor.fetchone() is not None
| 267 |
|
| 268 |
"""Abstract classes.""" |
| 269 |
class BlogEntry(Entry):
    """Category base class for blog entries; adds nothing to Entry itself."""
| 271 |
|
| 272 |
class Comment(Entry):
    """Category base class for comments; adds nothing to Entry itself."""
| 274 |
|
| 275 |
class MicroBlogEntry(Entry):
    """Category base class for microblog entries; adds nothing to Entry."""
| 277 |
|
| 278 |
class SharedItem(Entry):
    """Category base class for shared items; adds nothing to Entry itself."""
| 280 |
|
| 281 |
class Favorite(SharedItem):
    """Category base class for favorites; behaves exactly like SharedItem."""
| 283 |
|
| 284 |
class MultimediaContent(SharedItem):
    """Shared item whose own link doubles as its comments link."""

    def parse(self):
        """Run the SharedItem parse, then point ``comments`` at ``link``."""
        SharedItem.parse(self)

        # Comments live on the same page as the item itself.
        self.comments = self.link
| 290 |
|
| 291 |
class StatusUpdate(Entry):
    """Category base class for status updates; adds nothing to Entry."""
| 293 |
|
| 294 |
|
| 295 |
"""These classes can be used directly.""" |
| 296 |
class BackTypeComment(Comment):
    """Comment from a BackType feed; needs no special handling."""
| 298 |
|
| 299 |
class BlipFMUpdate(MicroBlogEntry):
    """Blip.fm update; always carries the extra tag 'music'."""

    def parse(self):
        """Run the MicroBlogEntry parse and append the 'music' tag."""
        MicroBlogEntry.parse(self)
        self.tags.append('music')  # should I do this?
| 303 |
|
| 304 |
|
| 305 |
class DeliciousBookmark(SharedItem):
    """Delicious puts tags together, space separated."""

    def get_tags(self):
        """Return the bookmark's tags as a list of strings.

        The ``term`` of the first tag element is split on whitespace.  An
        entry without ``tags``, with an empty tag list, or whose first term
        is None yields an empty list instead of raising.
        """
        entry_tags = getattr(self.entry, 'tags', None)
        if not entry_tags:
            return []
        # ``or ""`` guards against a term that is present but None.
        return (entry_tags[0]['term'] or "").split()
| 315 |
|
| 316 |
class DuggItem(SharedItem):
    """Dugg item; its link is also used as its comments link."""

    def parse(self):
        """Run the SharedItem parse, then copy ``link`` into ``comments``."""
        SharedItem.parse(self)
        link = self.link
        self.comments = link
| 320 |
|
| 321 |
class FacebookPostedItem(SharedItem):
    """Item posted on Facebook; currently handled exactly like a SharedItem."""

    #TODO: this needs to be smarter - input is a mess
    # Disabled summary parser, kept for reference only (never executed):
    #
    # def parse_other(self):
    #     # Need to catch errors, so this doesn't crash whole script
    #     try:
    #         x = minidom.parseString(self.summary)
    #         self.content = "%s %s %s" % (x.childNodes[0].childNodes[0].childNodes[0].childNodes[0].toxml(),
    #             x.childNodes[0].childNodes[1].childNodes[0].childNodes[0].childNodes[0].toxml(),
    #             x.childNodes[0].childNodes[1].childNodes[0].childNodes[1].childNodes[0].toxml())
    #         self.summary = x.childNodes[0].childNodes[1].childNodes[1].childNodes[0].childNodes[0].toxml()
    #     except Exception, e:
    #         print "Can't do it", e
| 334 |
|
| 335 |
class FacebookStatusUpdate(StatusUpdate):
    """Facebook status update; the title is used as the content."""

    def get_content(self):
        """Return the entry's title as its main content."""
        title = self.title
        return title
| 338 |
|
| 339 |
class FlickrPhoto(MultimediaContent):
    """Flickr dumps all sorts of different information into the content field.

    The first paragraph is redundant (username posted a photo);
    the second paragraph contains the thumbnail photo we want;
    if there's a third paragraph, it's a description -- this should be summary.
    """

    def parse(self):
        """Parse as MultimediaContent, then pick the thumbnail and the
        optional description out of the content markup.

        Extraction is best effort: when the markup is not well-formed XML or
        does not have the expected shape, whatever could not be extracted
        keeps the value set by the base parse.
        """
        # Local import: the module header only imports minidom.
        from xml.parsers.expat import ExpatError

        MultimediaContent.parse(self)

        # Get rid of line breaks, because that makes parsing more difficult
        flattened = "".join(self.content.splitlines())

        # Wrap in a single root element so it can be parsed as XML
        markup = '<div>' + flattened + '</div>'
        if isinstance(markup, type(u'')):
            # Hand the parser UTF-8 bytes so non-ASCII text does not depend
            # on an implicit ASCII conversion.
            markup = markup.encode('utf-8')

        try:
            root = minidom.parseString(markup).childNodes[0]

            # Second paragraph is thumbnail (#TODO: not anymore? whitespace nonsense?)
            self.content = root.childNodes[2].childNodes[0].toxml()

            # Third paragraph is description (optional)
            if len(root.childNodes) >= 4:
                self.summary = root.childNodes[3].childNodes[0].toxml()
        except (IndexError, ExpatError):
            # Unexpected or malformed markup must not abort the whole run.
            pass
| 364 |
|
| 365 |
class GitoriousUpdate(Entry):
    """Gitorious update; tagged 'code' and identified by link + timestamp."""

    def parse(self):
        """Run the Entry parse and append the 'code' tag."""
        Entry.parse(self)
        self.tags.append('code')  #TODO: should I hardcode this?

    def db_get_entry_id(self, cursor):
        """Return the database ID of this entry, or None if it is not stored.

        No unique links, so we need a stronger criteria: the row must match
        both the link and the updated timestamp.

        Raises Exception if more than one row matches.
        """
        cursor.execute(
            """SELECT id FROM entries
               WHERE link=%s AND rec_updated=%s""",
            (self.link, self.updated_ts))
        if cursor.rowcount == 0:
            return None
        if cursor.rowcount == 1:
            # Public DB-API call instead of the cursor's private row cache.
            return cursor.fetchone()[0]

        #TODO: need to output warning and move on, not raise exception
        print(cursor.fetchall())
        raise Exception("Cannot determine unqiue entry_id %d" % cursor.rowcount)
| 383 |
|
| 384 |
class GoogleReaderSharedItem(SharedItem):
    """Don't want tags because I didn't create them."""

    def get_tags(self):
        """Return an empty list, ignoring any tags present in the feed."""
        no_tags = []
        return no_tags
| 388 |
|
| 389 |
#TODO: db_has uniquely determined? what if I share something twice? |
| 390 |
|
| 391 |
class IdenticaUpdate(MicroBlogEntry):
    """Identi.ca update; the title is used as the content."""

    def get_content(self):
        """Return the entry's title as its main content."""
        title = self.title
        return title
| 394 |
|
| 395 |
class IdenticaFavorite(IdenticaUpdate):
    """Identi.ca favorite; handled exactly like an IdenticaUpdate."""
| 397 |
|
| 398 |
class LastFMPlay(SharedItem):
    """Summary info isn't relevant at all here."""

    def parse(self):
        """Run the SharedItem parse, blank the summary, add the 'music' tag."""
        SharedItem.parse(self)
        self.summary = ""
        self.tags.append('music')  # should I do this?

    def db_get_entry_id(self, cursor):
        """Return the database ID of this entry, or None if it is not stored.

        Last.fm doesn't use unique links, so we need a stronger criteria:
        the row must match both the link and the updated timestamp.

        Raises Exception if more than one row matches.
        """
        cursor.execute(
            """SELECT id FROM entries
               WHERE link=%s AND rec_updated=%s""",
            (self.link, self.updated_ts))
        if cursor.rowcount == 0:
            return None
        if cursor.rowcount == 1:
            # Public DB-API call instead of the cursor's private row cache.
            return cursor.fetchone()[0]

        #TODO: need to output warning and move on, not raise exception
        print(cursor.fetchall())
        raise Exception("Cannot determine unqiue entry_id %d" % cursor.rowcount)
| 418 |
|
| 419 |
class LibreFMPlay(LastFMPlay):
    """Libre.fm is using Dublin Core or something?."""

    def get_timestamp(self):
        """Return the entry's ``dcterms_date`` as integer seconds since epoch.

        The date must look like ``YYYY-MM-DDTHH:MM:SS+00:00``; the literal
        ``+00:00`` suffix means the value is UTC, so it is converted with
        ``calendar.timegm`` rather than through local time (which is an hour
        off while DST is in effect).

        Raises ValueError if the date does not match that format.
        """
        # Local import: the module header does not import calendar.
        import calendar
        # Explicit %H:%M:%S instead of the locale-dependent %X.
        parsed = time.strptime(self.entry.dcterms_date,
                               '%Y-%m-%dT%H:%M:%S+00:00')
        return int(calendar.timegm(parsed))
| 423 |
|
| 424 |
class MySpaceBlogEntry(BlogEntry):
    """MySpace blog entry; the summary is used as the content."""

    def get_content(self):
        """Return the entry's summary as its main content."""
        summary = self.summary
        return summary
| 427 |
|
| 428 |
class GoogleReaderSharedItem(SharedItem):
    """Google Reader shared item; feed tags are deliberately ignored.

    This definition rebinds the name already bound by the earlier
    ``GoogleReaderSharedItem`` class in this module, so it has to carry the
    same ``get_tags`` override.  As an empty class it silently brought back
    the inherited tag handling.
    """

    def get_tags(self):
        """Return an empty list, ignoring any tags present in the feed."""
        return []
| 430 |
|
| 431 |
class TTRSSPublishedItem(SharedItem):
    """Item published from TT-RSS; handled exactly like a SharedItem."""
| 433 |
|
| 434 |
class TwitterUpdate(MicroBlogEntry):
    """Twitter update whose title has the form ``author: text``."""

    def get_content(self):
        """Return the entry's title as its main content."""
        return self.title

    def parse(self):
        """Run the MicroBlogEntry parse, then split ``author: text``.

        When the content matches that form, the part before the first
        ``": "`` becomes the author and the rest becomes both content and
        summary.  Content that does not match is left untouched.
        """
        MicroBlogEntry.parse(self)
        # DOTALL so that text spanning several lines is kept whole instead
        # of being cut off at the first newline.
        m = re.search(r'([^:]+): (.+)', self.content, re.DOTALL)
        if m:
            self.author = m.group(1)
            self.content = m.group(2)
            self.summary = m.group(2)
| 445 |
|
| 446 |
class TwitterFavorite(TwitterUpdate):
    """Twitter favorite; handled exactly like a TwitterUpdate."""
| 448 |
|
| 449 |
class TwitterSearchResult(TwitterUpdate):
    """Twitter search result; handled exactly like a TwitterUpdate."""
| 451 |
|
| 452 |
class WordPressEntry(BlogEntry):
    """WordPress blog entry; takes its comments feed from ``wfw_commentrss``."""

    def parse(self):
        """Run the BlogEntry parse, then apply the WordPress-specific step.

        Nothing in this module calls ``parse_other``, so it is invoked
        explicitly here; otherwise the comments feed would never be set.
        """
        BlogEntry.parse(self)
        self.parse_other()

    def parse_other(self):
        """Set ``comments_feed`` from the entry's ``wfw_commentrss`` value.

        Entries without that attribute (or with an empty value) keep
        whatever ``comments_feed`` already holds.
        """
        comments_feed = getattr(self.entry, 'wfw_commentrss', None)
        if comments_feed:
            self.comments_feed = comments_feed
| 455 |
|
| 456 |
class YouTubeVideo(MultimediaContent):
    """YouTube video; tags come from the entry's ``media_category``."""

    def get_tags(self):
        """Return ``media_category`` split on whitespace, or [] if absent."""
        if not hasattr(self.entry, 'media_category'):
            return []
        return list(self.entry.media_category.split())
| 465 |
|
| 466 |
class YouTubeFavorite(Favorite):
    """YouTube favorite; handled exactly like a Favorite."""