1
#!/bin/python
2
# coding: utf-8
3
4
#TODO: Check number of comments, if a comments feed exists
5
6
import time
7
import re
8
import unicodedata
9
from xml.dom import minidom
10
11
"""An object representing an entry in an RSS/Atom feed. This is the universal
12
object for representing any entry in a feed from any service. It can be
13
subclassed in order to handle the nuances or quirks of any particular web
14
service.
15
"""
16
class Entry:
  """Universal representation of one entry in an RSS/Atom feed.

  The raw entry object (from feedparser) is kept on self.entry and the
  fields of interest are copied onto the instance by parse(). Subclasses
  override the small hooks (parse, get_content, get_timestamp, get_tags,
  db_get_entry_id) to handle the quirks of a particular web service.
  """

  def __init__(self, entry):
    """Store the raw feed entry and parse it straight away."""
    self.entry = entry
    self.parse()

  def parse(self):
    """Parse the raw entry and store the desired data on this instance.

    Sets author, link, comments_feed, comments, title, updated,
    updated_parsed and summary (each "" when the raw entry lacks it), then
    content, updated_ts and the lower-cased tags list.
    """
    attrs = ['author', 'link', 'comments_feed', 'comments', 'title',
      'updated', 'updated_parsed', 'summary']
    for attr in attrs:
      setattr(self, attr, getattr(self.entry, attr, ""))

    # These hooks run before the ASCII conversion below, so subclasses whose
    # get_content() returns the title or summary see the raw values.
    self.content = self.get_content()
    self.updated_ts = self.get_timestamp()

    #this will save so many headaches TODO: more of this?
    self.title = self._ascii_safe(self.title)
    self.summary = self._ascii_safe(self.summary)

    # For consistency, ignore case
    self.tags = [tag.lower() for tag in self.get_tags()]

  def _ascii_safe(self, text):
    """Return text as ASCII, with non-ASCII characters as XML char refs.

    Only unicode strings are converted. Anything else (such as the ""
    default used for a missing field) is returned unchanged, because
    unicodedata.normalize() raises TypeError for non-unicode input.
    """
    # type(u"") is the `unicode` type on Python 2.
    if isinstance(text, type(u"")):
      return unicodedata.normalize('NFKD', text).encode('ASCII', 'xmlcharrefreplace')
    return text

  def get_content(self):
    """Retrieve the main content of the story ("" when there is none)."""
    if hasattr(self.entry, 'content'):
      return self.entry.content[0].value
    return ""

  def get_timestamp(self):
    """Retrieve the 'updated' time as integer seconds since the epoch.

    updated_parsed is treated as a UTC time tuple. calendar.timegm converts
    it directly, so the result does not depend on the local timezone or on
    whether daylight saving time is in effect.
    """
    import calendar  # local import: not among this file's top-level imports

    return int(calendar.timegm(self.entry.updated_parsed))

  def get_tags(self):
    """Retrieve a list of tag terms on this entry from the feed."""
    if not hasattr(self.entry, 'tags'):
      return []
    return [tag['term'] for tag in self.entry.tags]

  def toXML(self, doc):
    """Output the entry as an 'entry' element created from DOM object doc.

    Non-empty author, link, comments_feed, comments and updated become
    attributes; non-empty title, summary and content become child elements
    holding one text node each. Returns the element; it is not attached to
    doc here.
    """
    xml = doc.createElement('entry')

    for attr in ('author', 'link', 'comments_feed', 'comments', 'updated'):
      value = getattr(self, attr)
      if value:
        xml.setAttribute(attr, value)

    for element in ('title', 'summary', 'content'):
      value = getattr(self, element)
      if value:
        child = doc.createElement(element)
        child.appendChild(doc.createTextNode(value))
        xml.appendChild(child)

    return xml

  def db_check(self, cursor, feed_id, verbose=False, force=False):
    """Ensure the entry is in the database and up-to-date.

    cursor - database cursor for performing queries
    feed_id - the ID of the feed from which the entry hails
    verbose - output more information if set to True
    force - update regardless if set to True
    """
    self.entry_id = self.db_get_entry_id(cursor)
    if self.entry_id:
      self.db_update(cursor, feed_id, verbose, force)
    else:
      self.db_insert(cursor, feed_id, verbose)

  def db_get_entry_id(self, cursor):
    """Return the unique ID of this entry in the database, or None.

    cursor - database cursor for performing queries

    Raises Exception when more than one row matches this entry's link.
    """
    cursor.execute("SELECT id FROM entries WHERE link=%s", (self.link,))
    if cursor.rowcount == 0:
      return None
    elif cursor.rowcount == 1:
      # Public fetch API instead of the driver-private cursor._rows.
      return cursor.fetchone()[0]
    else:
      raise Exception("Cannot determine unqiue entry_id")

  def db_insert(self, cursor, feed_id, verbose=False):
    """Insert the entry into the database and tag it.

    pre-condition: the entry must not already be in the database (db_check
    establishes this through db_get_entry_id) to avoid duplicate entries

    cursor - database cursor for performing queries
    feed_id - the ID of the feed from which the entry hails
    verbose - output more information if set to True
    """
    cursor.execute(
      """INSERT INTO entries (link, title, author, summary, content,
        rec_created, rec_updated, comments, comments_feed,
        feed_id)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
      (self.link, self.title, self.author, self.summary, self.content,
        int(time.time()), self.updated_ts,
        self.comments, self.comments_feed, feed_id))
    self.entry_id = cursor.lastrowid

    #if verbose -- nah, this is important
    print("\t\tEntry inserted: %s" % self.title)
    #TODO: raise error upon failure

    # Add tags
    for tag in self.tags:
      self.db_add_tag(cursor, tag, verbose)

  def db_needs_update(self, cursor):
    """Return True if the feed's updated timestamp is more recent than the
    timestamp in the database (i.e. the database is out of date); False
    otherwise.
    """
    cursor.execute(
      """SELECT COUNT(*) FROM entries
        WHERE link=%s AND rec_updated < %s""",
      (self.link, self.updated_ts))
    return cursor.fetchone()[0] == 1

  def db_update(self, cursor, feed_id, verbose=False, force=False):
    """Update the entry in the database and synchronise its tags.

    pre-condition: self.entry_id must identify the existing row (db_check
    sets it before calling this)

    cursor - database cursor for performing queries
    feed_id - the ID of the feed from which the entry hails (not used by
      the UPDATE itself)
    verbose - output more information if set to True
    force - update regardless if set to True
    """
    if not (force or self.db_needs_update(cursor)):
      if verbose:
        print("\t\tNo update necessary: %s" % self.title)
      return

    # Only updating fields which are expected to change (e.g. not links)
    cursor.execute(
      """UPDATE entries SET title=%s, author=%s, summary=%s, 
        content=%s, rec_updated=%s, comments=%s, comments_feed=%s
        WHERE id=%s""",
      (self.title, self.author, self.summary, self.content,
        self.updated_ts, self.comments, self.comments_feed,
        self.entry_id))

    # Check tags
    current_tags = self.db_get_tags(cursor)

    # Tag entry if not already tagged
    for tag in self.tags:
      if tag not in current_tags:
        self.db_add_tag(cursor, tag, verbose)

    # Remove tags which no longer exist in the feed
    for tag in current_tags:
      if tag not in self.tags:
        self.db_remove_tag(cursor, tag, verbose)

    # In verbose mode the message is indented like the other per-entry
    # messages; the indent goes on the same line as the message.
    if verbose:
      print("\t\tEntry updated: %s" % self.title)
    else:
      print("Entry updated: %s" % self.title)

    #TODO: raise error upon failure

  def db_add_tag(self, cursor, tag, verbose=False):
    """Add tag to entry, if not tagged as such already. If the tag doesn't
    exist in the database, create it.
    Precondition: self.entry_id must be set

    cursor - database cursor for performing queries
    tag - the tag to add (string)
    verbose - output more information if set to True
    """
    if not hasattr(self, 'entry_id'):
      raise Exception('entry_id must be set')

    added = False
    tag_id = self.db_get_tag_id(cursor, tag)
    if not tag_id:
      tag_id = self.db_insert_tag(cursor, tag)
      added = True

    if not self.db_tagged_with(cursor, tag_id):
      cursor.execute("""
        INSERT INTO tag_relationships (entry_id, tag_id)
        VALUES (%s,%s) """, (self.entry_id, tag_id))
      #TODO: error checking

      if verbose:
        if added:
          print("\t\t\tTagged with %s [new]" % (tag))
        else:
          print("\t\t\tTagged with %s" % (tag))

  def db_remove_tag(self, cursor, tag, verbose=False):
    """Untag the entry. Does not delete the tag itself from the database.

    Raises Exception if entry_id is not set or the tag cannot be found.
    """
    if not hasattr(self, 'entry_id'):
      raise Exception('entry_id must be set')

    tag_id = self.db_get_tag_id(cursor, tag)
    if not tag_id:
      raise Exception('cannot find tag')

    cursor.execute("""
      DELETE FROM tag_relationships
      WHERE tag_id=%s AND entry_id=%s""", (tag_id, self.entry_id))
    #TODO: error checking

    if verbose:
      print("\t\t\tUntagged with %s" % (tag))

  def db_get_tag_id(self, cursor, tag):
    """Return the tag's ID from the database, or None if it is unknown."""
    cursor.execute("SELECT id FROM tags WHERE name=%s", (tag,))
    row = cursor.fetchone()
    if row is None:
      return None
    return row[0]

  def db_get_tags(self, cursor):
    """Return a list of the names of all tags linked to this entry.

    Raises Exception if entry_id is not set.
    """
    if not hasattr(self, 'entry_id'):
      raise Exception('entry_id must be set')

    #TODO: error checking
    cursor.execute("""
      SELECT t.name
      FROM tags t
      LEFT OUTER JOIN tag_relationships r ON t.id = r.tag_id
      WHERE r.entry_id=%s""", (self.entry_id,))

    return [row[0] for row in cursor.fetchall()]

  def db_insert_tag(self, cursor, tag):
    """Insert tag into the database. Returns the new row's insert id."""
    cursor.execute("INSERT INTO tags SET name=%s", (tag,))
    return cursor.lastrowid

  def db_tagged_with(self, cursor, tag_id):
    """Return True if the entry is tagged with tag_id, False otherwise."""
    cursor.execute("""
      SELECT * FROM tag_relationships
      WHERE entry_id=%s AND tag_id=%s""", (self.entry_id, tag_id))
    return cursor.fetchone() is not None
267
268
"""Abstract classes."""
269
class BlogEntry(Entry):
  """Category base class for blog entries; adds nothing to Entry."""
271
272
class Comment(Entry):
  """Category base class for comments; adds nothing to Entry."""
274
275
class MicroBlogEntry(Entry):
  """Category base class for microblog entries; adds nothing to Entry."""
277
  
278
class SharedItem(Entry):
  """Category base class for shared items; adds nothing to Entry."""
280
281
class Favorite(SharedItem):
  """Category base class for favorites; adds nothing to SharedItem."""
283
284
class MultimediaContent(SharedItem):
  """Shared item whose main link is also the link to its comments."""

  def parse(self):
    """Run the SharedItem parse, then reuse the entry link for comments."""
    SharedItem.parse(self)

    # The main link is the link to comments too
    self.comments = self.link
290
291
class StatusUpdate(Entry):
  """Category base class for status updates; adds nothing to Entry."""
293
294
295
"""These classes can be used directly."""
296
class BackTypeComment(Comment):
  """Comment subclass for BackType; adds nothing to Comment."""
298
299
class BlipFMUpdate(MicroBlogEntry):
  """Microblog entry that is always tagged 'music' in addition to feed tags."""

  def parse(self):
    """Run the MicroBlogEntry parse, then append the fixed 'music' tag."""
    MicroBlogEntry.parse(self)
    self.tags.append('music') #should I do this?
303
    
304
305
class DeliciousBookmark(SharedItem):
  """Shared item whose feed puts tags together, space separated."""

  def get_tags(self):
    """Return the individual tags, splitting each tag term on whitespace.

    Every element of entry.tags is split, so an empty tags list yields []
    instead of raising IndexError, and a term that is empty or None
    contributes nothing.
    """
    tags = []

    if hasattr(self.entry, 'tags'):
      for tag in self.entry.tags:
        tags.extend((tag['term'] or "").split())

    return tags
315
316
class DuggItem(SharedItem):
  """Shared item whose entry link is also used as its comments link."""

  def parse(self):
    """Run the SharedItem parse, then reuse the entry link for comments."""
    SharedItem.parse(self)
    self.comments = self.link
320
  
321
class FacebookPostedItem(SharedItem):
  """Shared item subclass for Facebook posted items; adds no behaviour yet."""

  #TODO: this needs to be smarter - input is a mess
  # Disabled draft of a summary parser, kept for reference:
  #
  # def parse_other(self):
  #   # Need to catch errors, so this doesn't crash whole script
  #   try:
  #     x = minidom.parseString(self.summary)
  #     self.content = "%s %s %s" % (x.childNodes[0].childNodes[0].childNodes[0].childNodes[0].toxml(),
  #       x.childNodes[0].childNodes[1].childNodes[0].childNodes[0].childNodes[0].toxml(),
  #       x.childNodes[0].childNodes[1].childNodes[0].childNodes[1].childNodes[0].toxml())
  #     self.summary = x.childNodes[0].childNodes[1].childNodes[1].childNodes[0].childNodes[0].toxml()
  #   except Exception, e:
  #     print "Can't do it", e
334
335
class FacebookStatusUpdate(StatusUpdate):
  """Status update whose content is simply its title."""

  def get_content(self):
    """Return the entry title as the content."""
    return self.title
338
339
class FlickrPhoto(MultimediaContent):
  """Flickr photo entry.

  Flickr dumps all sorts of different information into the content field.
  The first paragraph is redundant (username posted a photo);
  the second paragraph contains the thumbnail photo we want;
  if there's a third paragraph, it's a description -- this should be summary.
  """

  def parse(self):
    """Run the standard parse, then pull thumbnail and description out of
    the content markup.

    This is best effort: if the markup is not well-formed XML or lacks the
    expected nodes, content and summary keep whatever was set before the
    failure, and no exception escapes.
    """
    # Local import: the file's top-level imports only bring in minidom.
    from xml.parsers.expat import ExpatError

    MultimediaContent.parse(self)

    # Get rid of line breaks, because that makes parsing more difficult,
    # and wrap in a single root element so it can be parsed as XML.
    markup = '<div>' + "".join(self.content.splitlines()) + '</div>'
    # Hand the parser UTF-8 bytes so non-ASCII unicode text does not trip
    # an implicit ASCII encode on Python 2.
    if isinstance(markup, type(u"")):
      markup = markup.encode('utf-8')

    try:
      paragraphs = minidom.parseString(markup).childNodes[0].childNodes

      # Second paragraph is thumbnail (#TODO: not anymore? whitespace nonsense?)
      self.content = paragraphs[2].childNodes[0].toxml()

      # Third paragraph is description (optional)
      if len(paragraphs) >= 4:
        self.summary = paragraphs[3].childNodes[0].toxml()
    except (IndexError, ExpatError):
      # Unexpected layout or malformed markup: keep what we already have.
      pass
364
365
class GitoriousUpdate(Entry):
  """Entry always tagged 'code'; identified by link plus updated time."""

  def parse(self):
    """Run the Entry parse, then append the fixed 'code' tag."""
    Entry.parse(self)
    self.tags += ['code'] #TODO: should I hardcode this?

  def db_get_entry_id(self, cursor):
    """Return the database ID matching this entry's link and timestamp.

    No unique links, so we need a stronger criteria. Returns None when no
    row matches; raises Exception when more than one row matches.
    """
    cursor.execute("""SELECT id FROM entries 
      WHERE link=%s AND rec_updated=%s""", 
      (self.link, self.updated_ts))
    matches = cursor.rowcount
    if matches == 0:
      return None
    if matches == 1:
      # Public fetch API instead of the driver-private cursor._rows.
      return cursor.fetchone()[0]

    #TODO: need to output warning and move on, not raise exception
    print(cursor.fetchall())
    raise Exception("Cannot determine unqiue entry_id %d" % matches)
383
  
384
class GoogleReaderSharedItem(SharedItem):
  """Shared item that never carries tags.

  NOTE(review): this class name is defined a second time further down the
  file; the later definition is the one in effect after import.
  """

  def get_tags(self):
    """Return no tags: don't want them because I didn't create them."""
    return list()

  #TODO: db_has uniquely determined? what if I share something twice?
390
391
class IdenticaUpdate(MicroBlogEntry):
  """Microblog entry whose content is simply its title."""

  def get_content(self):
    """Return the entry title as the content."""
    return self.title
394
395
class IdenticaFavorite(IdenticaUpdate):
  """Favorite variant of IdenticaUpdate; adds nothing of its own."""
397
398
class LastFMPlay(SharedItem):
  """Shared item with no summary, always tagged 'music'; identified by
  link plus updated time."""

  def parse(self):
    """Run the SharedItem parse, blank the summary and add the 'music' tag.

    Summary info isn't relevant at all here.
    """
    SharedItem.parse(self)
    self.summary = ""
    self.tags += ['music'] #should I do this?

  def db_get_entry_id(self, cursor):
    """Return the database ID matching this entry's link and timestamp.

    Last.fm doesn't use unique links, so we need a stronger criteria.
    Returns None when no row matches; raises Exception when more than one
    row matches.
    """
    cursor.execute("""SELECT id FROM entries 
      WHERE link=%s AND rec_updated=%s""", 
      (self.link, self.updated_ts))
    matches = cursor.rowcount
    if matches == 0:
      return None
    if matches == 1:
      # Public fetch API instead of the driver-private cursor._rows.
      return cursor.fetchone()[0]

    #TODO: need to output warning and move on, not raise exception
    print(cursor.fetchall())
    raise Exception("Cannot determine unqiue entry_id %d" % matches)
418
      
419
class LibreFMPlay(LastFMPlay):
  """LastFMPlay variant that reads its date from the dcterms_date field.

  Libre.fm is using Dublin Core or something?
  """

  def get_timestamp(self):
    """Return dcterms_date as integer seconds since the epoch (UTC).

    The value must look like YYYY-MM-DDTHH:MM:SS+00:00. strptime leaves the
    DST flag unknown, so converting through the local timezone would shift
    dates that fall in local daylight saving time; calendar.timegm treats
    the parsed tuple as UTC directly.

    NOTE(review): for such dates the result differs by the DST offset from
    the earlier mktime-based value; rows stored with the old value will not
    match on rec_updated.
    """
    import calendar  # local import: not among this file's top-level imports

    # Explicit %H:%M:%S rather than the locale-dependent %X.
    parsed = time.strptime(self.entry.dcterms_date, '%Y-%m-%dT%H:%M:%S+00:00')
    return int(calendar.timegm(parsed))
423
424
class MySpaceBlogEntry(BlogEntry):
  """Blog entry whose content is taken from its summary."""

  def get_content(self):
    """Return the entry summary as the content."""
    return self.summary
427
428
class GoogleReaderSharedItem(SharedItem):
  """Shared item that never carries tags.

  This definition shadows an earlier class of the same name in this file,
  so it has to carry the get_tags override itself; otherwise the feed's
  tags would be stored again.
  """

  def get_tags(self):
    """Return no tags: don't want them because I didn't create them."""
    return []
430
431
class TTRSSPublishedItem(SharedItem):
  """Shared item subclass for TTRSS published items; adds nothing."""
433
434
class TwitterUpdate(MicroBlogEntry):
  """Microblog entry whose title has the form 'author: text'."""

  def get_content(self):
    """Return the entry title as the content."""
    return self.title

  def parse(self):
    """Run the standard parse, then split the content at the first ': '.

    When the content matches, the part before the separator becomes the
    author and the rest becomes both content and summary; otherwise the
    parsed values are left alone.
    """
    MicroBlogEntry.parse(self)
    match = re.search('([^:]+): (.+)', self.content)
    if match is None:
      return

    who, text = match.group(1), match.group(2)
    self.author = who
    self.content = text
    self.summary = text
445
446
class TwitterFavorite(TwitterUpdate):
  """Favorite variant of TwitterUpdate; adds nothing of its own."""
448
449
class TwitterSearchResult(TwitterUpdate):
  """Search-result variant of TwitterUpdate; adds nothing of its own."""
451
452
class WordPressEntry(BlogEntry):
  """Blog entry whose comments feed comes from the wfw_commentrss field."""

  def parse(self):
    """Run the standard parse, then apply the WordPress-specific fields.

    Nothing else in this file calls parse_other, so it is invoked here to
    make the comments feed actually get stored.
    """
    BlogEntry.parse(self)
    self.parse_other()

  def parse_other(self):
    """Set comments_feed from the entry's wfw_commentrss, when present.

    An entry without that field keeps whatever comments_feed it already has.
    """
    comments_feed = getattr(self.entry, 'wfw_commentrss', None)
    if comments_feed is not None:
      self.comments_feed = comments_feed
455
456
class YouTubeVideo(MultimediaContent):
  """Multimedia entry whose tags come from the media_category field."""

  def get_tags(self):
    """Return media_category split on whitespace, or [] when it is absent."""
    if not hasattr(self.entry, 'media_category'):
      return []

    return list(self.entry.media_category.split())
465
466
class YouTubeFavorite(Favorite):
  """Favorite subclass for YouTube favorites; adds nothing to Favorite."""