1
#!/bin/python
2
3
#TODO: This needs to be resilient so when one subclass returns an error, the
4
# whole thing doesn't crash.
5
#TODO: what about http:// vs. http://www ... must be careful
6
#TODO: time is time committed, not time the feed was grabbed...
7
#TODO: put database table names in a variable or something
8
#TODO: need to get rid of latin-1 encoding for SQL queries
9
#TODO: don't check feeds that haven't been updated since we last checked them 
10
# -- can only do this on feeds whose contents won't change 
11
# (LastFM, Microblogs), but... what about deleted content??
12
#TODO: tags and categories, whitelist through subclasses for services that support it
13
#TODO: I'm ignoring the published and published_parsed dates entirely
14
#TODO: make sure the user defined classes and wonky feeds don't BREAK the whole script!
15
16
import feedparser
17
import entry
18
import time
19
import re
20
21
"""An object representing an RSS/Atom feed in a unified, simplified standard
22
format. This object is the universal mold for which feeds from various services
23
can be understood. It can be subclassed in order to handle the nuances or
24
quirks of any particular web service.
25
"""
26
class Feed:
  """An RSS/Atom feed reduced to a unified, simplified format.

  The feed is fetched and parsed with feedparser when the object is created;
  parse() then copies the fields of interest onto the object. Subclasses
  override entryClass, parse() or get_timestamp() to handle the quirks of a
  particular web service.
  """

  # Class used to wrap each raw feedparser entry; subclasses override this.
  entryClass = entry.Entry

  def __init__(self, url):
    """Initialize the object.

    url - address of the feed; stored as self.href and handed to feedparser
    """
    self.href = url
    self.feed = feedparser.parse(self.href)

  #TODO: what happens on failure?
  def parse(self):
    """Parse the feed and store the desired data.

    Sets version, encoding, title, subtitle, link, updated_ts and entries
    on this object.

    Returns True once the data has been stored, or None (without setting
    anything) when the parsed feed object is empty.
    """
    if not self.feed:
      return None

    # Collect data from the feed object's direct attributes
    for attr in ('version', 'encoding'):
      setattr(self, attr, getattr(self.feed, attr, ""))

    # Other attributes buried deeper, on the parsed result's `feed` member
    for attr in ('title', 'subtitle', 'link'):
      setattr(self, attr, getattr(self.feed.feed, attr, ""))

    # Strip HTML from subtitle
    self.subtitle = re.sub(r'<[^>]*?>', '', self.subtitle)

    # Timestamp for last update
    self.updated_ts = self.get_timestamp()

    # Wrap every raw entry in this feed type's entry class
    self.entries = [self.entryClass(e) for e in self.feed.entries]
    return True

  def get_timestamp(self):
    """Retrieve the 'updated' timestamp.

    Returns the parsed feed's `updated` value converted to integer seconds
    since the epoch, or the current time when that value is absent.
    """
    # NOTE(review): time.mktime() needs a time tuple and reads it as local
    # time -- confirm that the feedparser version in use provides one here.
    if hasattr(self.feed, 'updated'):
      return int(time.mktime(self.feed.updated))
    return int(time.time())

  def toXML(self, doc):
    """Output the feed to xml using a DOM object.

    doc - DOM object

    Returns a populated DOM object.
    """
    xml = doc.createElement('feed')

    # parse() never assigns `updated` (it stores `updated_ts`), so every
    # attribute is looked up with a default instead of raising
    # AttributeError; attributes that are missing or empty are skipped.
    for attr in ('version', 'encoding', 'href', 'link', 'updated'):
      value = getattr(self, attr, None)
      if value:
        xml.setAttribute(attr, value)

    for element in ('title', 'subtitle'):
      text = getattr(self, element, None)
      if text:
        node = doc.createElement(element)
        node.appendChild(doc.createTextNode(text))
        xml.appendChild(node)

    # `item`, not `entry`, so the imported entry module is not shadowed
    for item in self.entries:
      xml.appendChild(item.toXML(doc))

    return xml

  def db_check(self, cursor, verbose=False, force=False):
    """Ensure this feed and its contents are in the database and up-to-date.

    cursor  - database cursor used for all queries
    verbose - print a progress line for the feed
    force   - update all entries regardless of whether or not an update
              seems to be necessary
    """
    if self.db_has(cursor):
      self.db_update(cursor, verbose, force)
    else:
      self.db_insert(cursor, verbose, force)

  def db_get_feed_id(self, cursor):
    """Return the ID of this feed in the database.

    pre-condition: feed must exist in the database
    """
    # Parameters go in a sequence, as the DB-API expects
    cursor.execute("SELECT `id` FROM `feeds` WHERE `href` = %s",
      (self.href,))
    return cursor.fetchone()[0]

  def db_has(self, cursor):
    """Check to see if this feed exists in the database."""
    # Count the rows instead of trusting cursor.rowcount, which is not
    # guaranteed to be meaningful after a SELECT.
    cursor.execute("SELECT COUNT(*) FROM feeds WHERE href = %s",
      (self.href,))
    return cursor.fetchone()[0] > 0

  def db_insert(self, cursor, verbose=False, force=False):
    """Insert feed information into the database, then check its entries.

    pre-condition: db_has must be called before to avoid duplicate entries
    """
    cursor.execute(
      """INSERT INTO feeds (href, title, subtitle, link, encoding, 
        version, rec_created, rec_updated, type)
      VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)""",
      (self.href, self.title, self.subtitle, self.link, self.encoding,
        self.version, int(time.time()), self.updated_ts,
        self.__class__.__name__))

    if verbose:
      print("\tFeed inserted: %s" % self.title)

    #TODO: raise error upon failure

    # Check entries (even new entries could be duplicates if href changes)
    self._db_check_entries(cursor, verbose, force)

  def db_needs_update(self, cursor):
    """Return true if the feed timestamp is more recent than the timestamp
    in the database (i.e. the database is out of date); false otherwise.
    """
    cursor.execute(
      """SELECT COUNT(*) FROM feeds
        WHERE href=%s AND rec_updated < %s""",
      (self.href, self.updated_ts))
    # "> 0" so that duplicate rows for one href still count as stale
    return cursor.fetchone()[0] > 0

  def db_update(self, cursor, verbose=False, force=False):
    """Update feed information in the database.

    If force is true, do the update regardless, otherwise check
    db_needs_update() to see if it's necessary.

    pre-condition: db_has must be called first to avoid errors
    """
    if force or self.db_needs_update(cursor):
      cursor.execute(
        """UPDATE feeds SET title=%s, subtitle=%s, link=%s, encoding=%s,
        version=%s, rec_updated=%s, type=%s WHERE href=%s""",
        (self.title, self.subtitle, self.link, self.encoding,
          self.version, self.updated_ts, self.__class__.__name__,
          self.href))
      if verbose:
        print("\tFeed updated: %s" % self.title)

      # Check entries (some might be new, some might be old)
      self._db_check_entries(cursor, verbose, force)
    elif verbose:
      print("\tNo update necessary: %s" % self.title)

  def _db_check_entries(self, cursor, verbose, force):
    """Run db_check on every entry, looking the feed's row id up only once."""
    if not self.entries:
      return
    feed_id = self.db_get_feed_id(cursor)
    for item in self.entries:
      item.db_check(cursor, feed_id, verbose, force)
168
169
"""These are essentially abstract classes."""
170
class Blog(Feed):
  """Feed variant for blogs; entries are built with entry.BlogEntry."""
  entryClass = entry.BlogEntry

  def get_timestamp(self):
    """Retrieve the 'updated' timestamp.

    Sources are tried in order: the parsed result's `updated` value, the
    feed's `lastbuilddate` string, the feed's `updated_parsed` time tuple,
    and finally the current time.

    Returns integer seconds since the epoch.
    """
    if hasattr(self.feed, 'updated'):
      return int(time.mktime(self.feed.updated))

    channel = self.feed.feed
    if hasattr(channel, 'lastbuilddate'):
      try:
        return int(time.mktime(time.strptime(
          channel.lastbuilddate, '%a, %d %b %Y %H:%M:%S %Z')))
      except ValueError:
        # The date is not in the expected layout (%Z only matches zone
        # names, not numeric offsets); fall through to the next source
        # rather than aborting the whole run.
        pass

    # Also skips an `updated_parsed` that is present but None
    if getattr(channel, 'updated_parsed', None):
      return int(time.mktime(channel.updated_parsed))

    return int(time.time())
181
182
class Comments(Feed):
  """Feed variant whose entries are built with entry.Comment."""
  entryClass = entry.Comment
184
  
185
class MicroBlog(Feed):
  """Feed variant whose entries are built with entry.MicroBlogEntry."""
  entryClass = entry.MicroBlogEntry
187
188
class SharedItems(Feed):
  """Feed variant whose entries are built with entry.SharedItem."""
  entryClass = entry.SharedItem
190
191
class Favorites(SharedItems):
  """SharedItems variant whose entries are built with entry.Favorite."""
  entryClass = entry.Favorite
193
194
class Multimedia(SharedItems):
  """SharedItems variant whose entries are built with
  entry.MultimediaContent."""
  entryClass = entry.MultimediaContent
196
  
197
class StatusUpdates(Feed):
  """Feed variant whose entries are built with entry.StatusUpdate."""
  entryClass = entry.StatusUpdate
199
200
201
"""These classes should be used directly."""
202
class BackType(Comments):
  """Comments feed whose entries are built with entry.BackTypeComment."""
  entryClass = entry.BackTypeComment
204
205
class BlipFM(MicroBlog):
  """MicroBlog feed whose entries are built with entry.BlipFMUpdate."""
  entryClass = entry.BlipFMUpdate
207
208
class Delicious(SharedItems):
  """SharedItems feed whose entries are built with
  entry.DeliciousBookmark."""
  entryClass = entry.DeliciousBookmark
210
211
class Digg(SharedItems):
  """SharedItems feed whose entries are built with entry.DuggItem."""
  entryClass = entry.DuggItem
213
214
class FacebookPostedItems(SharedItems):
  """SharedItems feed whose entries are built with
  entry.FacebookPostedItem."""
  entryClass = entry.FacebookPostedItem
216
217
class FacebookStatusUpdates(StatusUpdates):
  """StatusUpdates feed whose entries are built with
  entry.FacebookStatusUpdate."""
  entryClass = entry.FacebookStatusUpdate
219
  
220
class Flickr(Multimedia):
  """Multimedia feed whose entries are built with entry.FlickrPhoto."""
  entryClass = entry.FlickrPhoto

  def parse(self):
    """Parse the feed, then prefix the title with 'Flickr '.

    Flickr's feed title isn't very descriptive.

    Returns whatever the inherited parse() returned.
    """
    success = Multimedia.parse(self)
    # When the inherited parse() bails out it never sets self.title, so the
    # prefix is only applied after a successful parse.
    if success:
      self.title = 'Flickr ' + self.title
    return success
227
228
class Gitorious(Feed):
  """Feed whose entries are built with entry.GitoriousUpdate."""
  entryClass = entry.GitoriousUpdate
230
231
class GoogleReaderSharedItems(SharedItems):
  """SharedItems feed whose entries are built with
  entry.GoogleReaderSharedItem."""
  entryClass = entry.GoogleReaderSharedItem

  def get_timestamp(self):
    """Retrieve the 'updated' timestamp from the feed's `updated_parsed`.

    Falls back to the inherited lookup when `updated_parsed` is missing or
    None, instead of raising.

    Returns integer seconds since the epoch.
    """
    #TODO: what's the pattern?
    parsed = getattr(self.feed.feed, 'updated_parsed', None)
    if parsed:
      return int(time.mktime(parsed))
    return SharedItems.get_timestamp(self)
235
236
class Identica(MicroBlog):
  """MicroBlog feed whose entries are built with entry.IdenticaUpdate."""
  entryClass = entry.IdenticaUpdate

  def parse(self):
    """Parse the feed, then prefix the title with 'Identi.ca - '.

    Identica's feed title isn't very descriptive.

    Returns whatever the inherited parse() returned.
    """
    success = MicroBlog.parse(self)
    # When the inherited parse() bails out it never sets self.title, so the
    # prefix is only applied after a successful parse.
    if success:
      self.title = 'Identi.ca - ' + self.title
    return success
244
245
class IdenticaFavorites(Favorites):
  """Favorites feed whose entries are built with entry.IdenticaFavorite."""
  entryClass = entry.IdenticaFavorite
247
  
248
class LastFM(SharedItems):
  """SharedItems feed whose entries are built with entry.LastFMPlay."""
  entryClass = entry.LastFMPlay
250
251
class LibreFM(SharedItems):
  """SharedItems feed whose entries are built with entry.LibreFMPlay."""
  entryClass = entry.LibreFMPlay

  def parse(self):
    """Parse the feed, then take the link from the first link's
    'rdf:resource' value.

    Libre.fm is doing some RDF things that are different.

    Returns whatever the inherited parse() returned.
    """
    success = SharedItems.parse(self)
    if success:
      try:
        self.link = self.feed.feed.links[0]['rdf:resource']
      except (AttributeError, IndexError, KeyError):
        # No usable RDF link in this feed: keep the link that the
        # inherited parse() already stored.
        pass
    return success
259
260
class MySpaceBlog(Blog):
  """Blog feed whose entries are built with entry.MySpaceBlogEntry."""
  entryClass = entry.MySpaceBlogEntry
262
263
class TTRSSPublishedItems(SharedItems):
  """SharedItems feed whose entries are built with
  entry.TTRSSPublishedItem."""
  entryClass = entry.TTRSSPublishedItem
265
266
class Twitter(MicroBlog):
  """MicroBlog feed whose entries are built with entry.TwitterUpdate."""
  entryClass = entry.TwitterUpdate
268
269
class TwitterFavorites(Favorites):
  """Favorites feed whose entries are built with entry.TwitterFavorite."""
  entryClass = entry.TwitterFavorite
271
272
class TwitterSearch(Twitter):
  """Twitter feed whose entries are built with entry.TwitterSearchResult."""
  entryClass = entry.TwitterSearchResult
274
  
275
class WordPress(Blog):
  """Blog feed whose entries are built with entry.WordPressEntry."""
  entryClass = entry.WordPressEntry
277
  
278
class YouTube(Multimedia):
  """Multimedia feed whose entries are built with entry.YouTubeVideo."""
  entryClass = entry.YouTubeVideo
280
281
class YouTubeFavorites(Favorites):
  """Favorites feed whose entries are built with entry.YouTubeFavorite."""
  entryClass = entry.YouTubeFavorite