#!/usr/bin/env python
# -*- coding: utf-8 -*-
u"""
Reads an RSS/Atom feed and converts the enclosures to AVI.

Dependencies:

- ``flvstreamer`` for the RTMP streams
- ``mimms`` for the MMS streams
- ``ffmpeg`` to get the sizes of the videos
- ``mencoder`` to do the conversion
- ``tedtalksubs.py`` to download TED talks subtitles (in this repo)

:Authors:
    Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org>

:License:
    GNU GPL v3 or later
"""
20
21
import os
22
import sys
23
import urllib2
24
import httplib
25
import glob
26
import subprocess
27
import re
28
import tempfile
29
import atexit
30
from urlparse import urlparse
31
from optparse import OptionParser
32
from stat import S_IRUSR, S_IWUSR, S_IROTH, S_IRGRP
33
#from xml.etree import ElementTree as ET
34
from pprint import pprint
35
36
from lxml import etree as ET
37
import urlgrabber
38
import urlgrabber.progress
39
40
# Comma-separated list of feed categories to drop
# (default value of --exclude-tags).
EXCLUDE_TAGS = ""

# Largest frame size allowed for the encoded video
# (default values of --width and --height).
WIDTH, HEIGHT = 800, 480

# MIME type assumed when nothing better is known, and the file extension
# given to the transcoded videos.
MIME_DEFAULT = "video/x-msvideo"
EXTENSION = "avi"
48
49
50
def get_options():
    """Parse and validate the command line.

    Returns:
        The ``(options, args)`` tuple produced by ``OptionParser``.
        ``args`` is always empty because positional arguments are rejected,
        and ``options.exclude_tags`` is converted to a list of tag names.

    Every validation failure goes through ``parser.error()``, which prints
    the message and exits.
    """
    usage = "usage: %prog -i input_feed -u URL -o output_feed [-d directory]"
    parser = OptionParser(usage=usage)
    parser.add_option("-i", "--input", dest="input",
                      help="Process this file")
    parser.add_option("-o", "--output", dest="output",
                      help="Write the RSS in this file")
    parser.add_option("-d", "--directory", dest="directory",
                      help="Write the converted videos in this directory")
    parser.add_option("-u", "--url", dest="url",
                      help="The external URL of the video folder")
    parser.add_option("-W", "--width", dest="width", type="int",
                      default=WIDTH,
                      help="Width of the converted video [default: %default]")
    parser.add_option("-H", "--height", dest="height", type="int",
                      default=HEIGHT,
                      help="Height of the converted video "
                           "[default: %default]")
    parser.add_option("-m", "--max", dest="max", type="int", default=10,
                      help="Only convert that many videos, drop the rest "
                           "[default: %default]")
    parser.add_option("-q", "--quiet", dest="quiet", action="store_true",
                      default=False, help="Don't show progress bars")
    parser.add_option("-k", "--keep", dest="keep", action="store_true",
                      default=False, help="Don't remove original files")
    parser.add_option("--exclude-tags", dest="exclude_tags",
                      default=EXCLUDE_TAGS, help="Drop videos tagged with a "
                      "tag in this comma-separated list [default: %default]")
    parser.add_option("--subtitles", dest="subtitles", metavar="LANG",
                      help="Download subtitles in this language")
    parser.add_option("--old-ffmpeg", dest="oldffmpeg", action="store_true",
                      help="FFMpeg is old (like on Debian Lenny)")
    options, args = parser.parse_args()

    if args:
        parser.error("illegal arguments: %s" % ", ".join(args))
    if not options.input:
        parser.error("I need a file to process")
    # "-" stands for standard input (main() swaps it for sys.stdin), so
    # there is no file to look for in that case.
    if options.input != "-" and not os.path.exists(options.input):
        parser.error("The file to process does not exist")
    if not options.output:
        parser.error("I need a file to write to")
    if not options.url:
        parser.error("I need an external URL")
    if not options.directory and options.input != "-":
        # Default to the directory holding the input feed.
        options.directory = os.path.abspath(os.path.dirname(options.input))
    if not options.directory:
        parser.error("I need a directory for the videos")
    if not options.quiet and "TERM" not in os.environ:
        options.quiet = True  # Not in a terminal, be quiet anyway
    if not isinstance(options.exclude_tags, (list, tuple)):
        # Split the comma-separated value and drop empty entries, so that
        # the default "" yields an empty list rather than [""].
        stripped = [tag.strip()
                    for tag in (options.exclude_tags or "").split(",")]
        options.exclude_tags = [tag for tag in stripped if tag]
    return options, args
99
100
101
class PodcastError(Exception):
    """Base class of the errors raised while handling a feed item."""


class NotAPodcastError(PodcastError):
    """The feed item carries no enclosure to work on."""


class TranscodingError(PodcastError):
    """The encoder failed to convert a downloaded file."""


class DownloadingError(PodcastError):
    """The enclosure could not be fetched."""
105
106
107
class Podcast(object):
    """A feed item with an enclosure, and the operations applied to it.

    The methods read the module-level ``options`` and ``flux_xml`` globals,
    which ``main()`` sets before any instance is created.
    """

    # Cache of {extension: MIME type}; shared by all the instances and
    # filled on the first access to the ``mimetypes`` property.
    _mimetypes = {}

    def __init__(self, item):
        """Collect what is known about the enclosure of ``item``.

        Raises:
            NotAPodcastError: ``item`` has no ``enclosure`` child, or the
                enclosure has no URL.
        """
        self.item = item
        self.enclosure = item.find("enclosure")
        if not ET.iselement(self.enclosure):
            raise NotAPodcastError()
        # The order matters: _get_url() may set self.content_type, and the
        # later lookups build on the earlier attributes.
        self.url = self._get_url()
        self.path_downloaded = self._get_downloaded_path()
        self.content_type = self._get_content_type()
        self.path_encoded = self._get_encoded_path()
        self.subs = self._get_subtitles()
        self.video_info = None
        self.size = None

    @staticmethod
    def _rm_if_exists(path):
        """Delete ``path`` unless it is already gone."""
        if os.path.exists(path):
            os.remove(path)

    def _get_url(self):
        """Return the enclosure URL, following HTTP redirects.

        URLs already under ``options.url`` are returned untouched. When the
        remote file can be opened, its ``Content-Type`` header is stored in
        ``self.content_type`` as a side effect. Network errors are printed
        and the unresolved URL is returned.
        """
        url = self.enclosure.get("url")
        if not url:
            # An enclosure without a URL cannot be downloaded.
            raise NotAPodcastError()
        if url.startswith(options.url):
            return url
        # Resolve redirects
        try:
            remote_file = urllib2.urlopen(url)
            try:
                url = remote_file.geturl()
                self.content_type = remote_file.info().get("Content-Type")
            finally:
                remote_file.close()
        except (urllib2.HTTPError, httplib.HTTPException) as e:
            print("Failed downloading %s" % url)
            print(e)
        except urllib2.URLError:
            print("Probably RTMP or MMS: %s" % url)
        return url

    def _get_content_type(self):
        """Return the MIME type of the enclosure.

        Tried in order: the type already found by ``_get_url()``, the
        ``content-type`` attribute of the enclosure, the extension of the
        downloaded file, and finally ``MIME_DEFAULT``.
        """
        if getattr(self, "content_type", None):
            return self.content_type  # already set by _get_url()
        # NOTE(review): RSS 2.0 enclosures name this attribute "type" (the
        # one update_item() writes); "content-type" is kept as found here.
        if "content-type" in self.enclosure.attrib:
            return self.enclosure.attrib.get("content-type")
        extension = os.path.splitext(self.path_downloaded)[1][1:]
        return self.mimetypes.get(extension, MIME_DEFAULT)

    def _get_downloaded_path(self):
        """Return where the original file is stored in the video directory."""
        filename = os.path.basename(urlparse(self.url).path)
        # Drop any query string left in the last path component.
        filename = filename.split("?", 1)[0]
        return os.path.join(options.directory, filename)

    def _get_encoded_path(self):
        """Return where the transcoded file is stored.

        Audio files are not transcoded, so their encoded path is the
        downloaded path.
        """
        if self.content_type.startswith("audio/"):
            # the file won't be transcoded
            return self.path_downloaded
        # path_downloaded already lies inside the video directory; joining
        # the directory a second time would duplicate a relative one.
        return "%s.%s" % (os.path.splitext(self.path_downloaded)[0],
                          EXTENSION)

    def _get_subtitles(self):
        """Download the subtitles of a TED talk.

        Returns:
            The path of a temporary ``.srt`` file (removed at exit), or
            ``None`` when subtitles were not requested, the feed is not a
            TED talks feed, the item has no usable guid, or the download
            failed.
        """
        if not options.subtitles:
            return None
        feed_title = flux_xml.findtext("channel/title")
        if feed_title not in ("TEDTalks (video)", "TEDTalks (hd)"):
            return None
        # The talk ID is the second colon-separated field of the guid.
        guid_fields = (self.item.findtext("guid") or "").split(":")
        if len(guid_fields) < 2:
            return None
        talkid = guid_fields[1]
        subfile, subfile_path = tempfile.mkstemp(
                prefix="podcast-transcode-sub-", suffix=".srt")
        os.close(subfile)
        atexit.register(self._rm_if_exists, subfile_path)
        subdl = subprocess.Popen(["tedtalksubs.py", "-l", options.subtitles,
                                  "-o", subfile_path, talkid],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
        out = subdl.communicate()[0]
        if subdl.returncode != 0:
            sys.stderr.write("Failed to download subtitles. Message:\n")
            sys.stderr.write("%s\n" % out)
            return None
        return subfile_path

    @staticmethod
    def _parse_mimetypes(lines):
        """Map extensions to audio/video MIME types.

        Args:
            lines: an iterable of lines in the ``mime.types`` format, i.e.
                a MIME type followed by its whitespace-separated
                extensions, with ``#`` starting a comment.

        Returns:
            A ``{extension: MIME type}`` dict restricted to the ``audio/``
            and ``video/`` types. Every extension of a line is recorded.
        """
        found = {}
        for line in lines:
            fields = line.split("#", 1)[0].split()
            if len(fields) < 2:
                continue  # blank, comment, or type without extension
            mimetype = fields[0]
            if not mimetype.startswith(("video/", "audio/")):
                continue
            for extension in fields[1:]:
                found[extension] = mimetype
        return found

    def _get_mimetypes(self):
        """Return the audio/video MIME types listed in /etc/mime.types.

        The result is cached in the class-level ``_mimetypes`` dict.
        """
        if self._mimetypes:
            return self._mimetypes
        try:
            with open("/etc/mime.types") as mime_file:
                self._mimetypes.update(self._parse_mimetypes(mime_file))
        except IOError:
            # No readable system MIME database: leave the cache empty so
            # that _get_content_type() falls back on MIME_DEFAULT.
            return self._mimetypes
        return self._mimetypes
    mimetypes = property(_get_mimetypes)

    def is_already_transcoded(self):
        """Tell whether the enclosure already points under ``options.url``."""
        return self.url.startswith(options.url)

    def process(self):
        """Download and encode the enclosure, then update the feed item.

        Nothing is fetched when the encoded file already exists.

        Raises:
            DownloadingError: the download failed.
            TranscodingError: the encoder failed.
        """
        if not os.path.exists(self.path_encoded):
            self.download()
            self.encode_video()
        else:
            print("Already downloaded/encoded: %s" % self.path_encoded)
        self.url = "%s/%s" % (options.url,
                              os.path.basename(self.path_encoded))
        self.size = int(os.stat(self.path_encoded).st_size)
        self.update_item()

    def update_item(self):
        """Write the new URL, size and type into the feed item.

        Besides the enclosure itself, the FeedBurner original-enclosure
        link and the Media RSS ``content`` element are updated when the
        item has them.
        """
        self.enclosure.set("url", self.url)
        self.enclosure.set("length", str(self.size))
        self.enclosure.set("type", self.content_type)
        fb = self.item.find(
            "{http://rssnamespace.org/feedburner/ext/1.0}origEnclosureLink")
        if ET.iselement(fb):
            fb.text = self.url
        mediacontent = self.item.find(
            "{http://search.yahoo.com/mrss/}content")
        if ET.iselement(mediacontent):
            mediacontent.set("url", self.url)
            mediacontent.set("fileSize", str(self.size))
            mediacontent.set("type", self.content_type)

    def download(self):
        """Fetch the enclosure into ``self.path_downloaded``.

        The transfer method depends on the URL scheme (RTMP, MMS) or on the
        content type (ASF playlists); anything else goes through
        urlgrabber. An existing file is not fetched again.

        Raises:
            DownloadingError: the transfer failed.
        """
        if os.path.exists(self.path_downloaded):
            print("Already downloaded: %s" % self.path_downloaded)
            return
        if self.url.startswith("rtmp://"):
            self.download_rtmp()
        elif self.url.startswith("mms://"):
            self.download_mms()
        elif self.content_type == "video/x-ms-asf":
            self.download_asf()
        else:
            if options.quiet:
                progress = urlgrabber.progress.BaseMeter()
            else:
                progress = urlgrabber.progress.TextMeter(fo=sys.stdout)
            print("Downloading %s to %s" % (self.url, self.path_downloaded))
            try:
                urlgrabber.urlgrab(self.url, filename=self.path_downloaded,
                                   reget='simple', progress_obj=progress)
            except urlgrabber.grabber.URLGrabError as e:
                raise DownloadingError("Error downloading %s: %s"
                                       % (self.url, e))

    def download_rtmp(self):
        """Fetch an RTMP stream with ``flvstreamer``, resuming if needed.

        Raises:
            DownloadingError: ``flvstreamer`` failed, or the stream was
                still incomplete after ``MAX_TRIES`` attempts. The partial
                file is removed.
        """
        MAX_TRIES = 10

        def download_rtmp_unit(url, path):
            command = ["flvstreamer", "-r", url, "-o", path]
            if options.quiet:
                command.append("-q")
            if os.path.exists(path):
                command.insert(1, "--resume")
            print("Streaming %s to %s" % (url, path))
            try:
                return subprocess.call(command)
            except KeyboardInterrupt:
                # An interrupted transfer counts as a failed one.
                return 1

        retcode = download_rtmp_unit(self.url, self.path_downloaded)
        # flvstreamer returns 2 if the download is incomplete
        current_try = 1
        while retcode == 2:
            print("Trying again...")
            retcode = download_rtmp_unit(self.url, self.path_downloaded)
            current_try += 1
            if current_try > MAX_TRIES:
                print("Too many tries, aborting.")
                break
        if retcode != 0:
            self._rm_if_exists(self.path_downloaded)
            raise DownloadingError("Error code: %s" % retcode)

    def download_mms(self, url=None):
        """Fetch an MMS stream with ``mimms``.

        Args:
            url: the stream to fetch; defaults to ``self.url``.

        Raises:
            DownloadingError: ``mimms`` failed. Any partial file is
                removed first.
        """
        if url is None:
            url = self.url
        command = ["mimms", url, self.path_downloaded]
        if options.quiet:
            command.append("-q")
        print("Streaming %s to %s" % (url, self.path_downloaded))
        try:
            retcode = subprocess.call(command)
        except KeyboardInterrupt:
            # An interrupted transfer counts as a failed one.
            retcode = 1
        if retcode != 0:
            # Report the failure whether or not a partial file was left.
            self._rm_if_exists(self.path_downloaded)
            raise DownloadingError("Error code: %s" % retcode)

    def download_asf(self):
        """Fetch the MMS stream referenced by an ASF playlist.

        The playlist at ``self.url`` is read and the first quoted
        ``mms://`` URL found in it is handed to ``download_mms()``.

        Raises:
            DownloadingError: the playlist cannot be read, holds no MMS
                URL, or the stream download failed.
        """
        try:
            playlist = urllib2.urlopen(self.url)
            try:
                mms_xml = playlist.read()
            finally:
                playlist.close()
        except (urllib2.URLError, httplib.HTTPException) as e:
            raise DownloadingError("Error downloading %s: %s"
                                   % (self.url, e))
        mms_match = re.search(r'"(mms://[^"]+)"', mms_xml)
        if mms_match is None:
            raise DownloadingError("No MMS stream found in %s" % self.url)
        # The URL comes out of XML, so its ampersands are escaped.
        return self.download_mms(mms_match.group(1).replace("&amp;", "&"))

    def encode_video(self):
        """Transcode the downloaded file into ``self.path_encoded``.

        Does nothing when both paths are the same. The result is made
        world-readable.

        Raises:
            TranscodingError: the encoder failed.
        """
        if self.path_encoded == self.path_downloaded:
            print("No transcoding required")
            return
        transcoded_video = self._transcode_video()
        os.rename(transcoded_video, self.path_encoded)
        os.chmod(self.path_encoded,
                 S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)  # 644

    def _transcode_video(self):
        """Run ``mencoder`` on the downloaded file.

        The video is scaled down when it exceeds ``options.height`` or
        ``options.width``, and subtitles are burnt in when available.

        Returns:
            The path of a temporary AVI file in the video directory.

        Raises:
            TranscodingError: ``mencoder`` failed. The temporary file is
                removed.
        """
        width, height = self.get_video_info()
        transcoded_video_file, transcoded_video = tempfile.mkstemp(
                prefix="podcast-transcode-", suffix=".avi",
                dir=options.directory)
        os.close(transcoded_video_file)
        if not options.keep:
            atexit.register(self._rm_if_exists, transcoded_video)
        command = ["mencoder", "-oac", "mp3lame",
                   "-ovc", "lavc", "-lavcopts", "vbitrate=600",
                   "-of", "avi", "-mc", "0", self.path_downloaded,
                   "-o", transcoded_video]
        if height and width:
            if int(height) > options.height:
                command.extend(["-vf", "scale=-3:%d" % options.height])
            elif int(width) > options.width:
                command.extend(["-vf", "scale=%d:-3" % options.width])
        if options.quiet:
            command.append("-quiet")
        if self.subs:
            command.extend(["-sub", self.subs, "-subfont-text-scale", "4"])
            if not os.path.exists(
                    os.path.expanduser("~/.mplayer/subfont.ttf")):
                command.extend(["-fontconfig", "-font", "DejaVu Sans"])
        print(" ".join(command))
        print("Encoding %s to %s" % (self.path_downloaded, transcoded_video))
        try:
            retcode = subprocess.call(command)
        except KeyboardInterrupt:
            # An interrupted encoding counts as a failed one.
            retcode = 1
        if retcode != 0:
            self._rm_if_exists(transcoded_video)
            raise TranscodingError("Error code: %s" % retcode)
        self.content_type = "video/x-msvideo"
        return transcoded_video

    def get_video_info(self):
        """Return the frame size of the downloaded video.

        The size is parsed from the output of ``ffmpeg -i`` and cached in
        ``self.video_info``.

        Returns:
            A ``(width, height)`` tuple of digit strings, or
            ``(None, None)`` when the size cannot be found in the output.
        """
        if self.video_info is not None:
            return self.video_info
        ffmpeg_cmd = subprocess.Popen(
                ["ffmpeg", "-i", self.path_downloaded],
                stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # communicate() reads the output and also waits for the child, so
        # that no zombie process is left behind.
        output = ffmpeg_cmd.communicate()[0]
        info_match = re.search(
                r"Stream .*: Video: (\w+), \w+, (\d+)x(\d+)", output)
        if info_match:
            self.video_info = (info_match.group(2), info_match.group(3))
        else:
            self.video_info = (None, None)
        return self.video_info
367
368
369
def cleanup(items):
    """Remove from the video directory the files the feed no longer uses.

    Args:
        items: the feed items whose files must be kept. For each of them
            both the downloaded and the encoded file names are preserved.

    Nothing is removed when ``options.keep`` is set. Files ending in
    ``.xml``, the input and output feeds, and anything that is not a
    regular file are always left alone.
    """
    if options.keep:
        # Checked first: building the Podcast objects below may hit the
        # network, which is pointless when nothing will be removed.
        return

    feed_podcasts = set()
    for item in items:
        try:
            podcast = Podcast(item)
        except NotAPodcastError:
            continue
        feed_podcasts.add(os.path.basename(podcast.path_downloaded))
        feed_podcasts.add(os.path.basename(podcast.path_encoded))

    # Never delete the feeds themselves, whatever their extension. The
    # input may have been replaced by a file object (standard input).
    protected = set()
    for feed_path in (options.input, options.output):
        if feed_path and not hasattr(feed_path, "read"):
            protected.add(os.path.abspath(feed_path))

    for filepath in glob.glob(os.path.join(options.directory, "*")):
        if filepath.endswith(".xml"):
            continue  # keep the RSS feed
        if not os.path.isfile(filepath):
            continue  # os.remove() cannot delete directories
        if os.path.abspath(filepath) in protected:
            continue
        filename = os.path.basename(filepath)
        if filename not in feed_podcasts:
            print("Removing old file %s" % filename)
            os.remove(filepath)
390
391
392
def handle_item(item):
    """Download and convert the enclosure of one feed item.

    Items without an enclosure and items already converted are left
    untouched. When subtitles are requested but unavailable, the item is
    removed from the feed. Download and transcoding errors are printed and
    the item is left as it was.
    """
    try:
        podcast = Podcast(item)
    except NotAPodcastError:
        return

    if podcast.is_already_transcoded():
        print("Already converted: %s" % podcast.url)
        return
    if options.subtitles and not podcast.subs:
        print("No subtitles for %s, skipping." % item.findtext("guid"))
        flux_xml.find("channel").remove(item)
        return

    try:
        podcast.process()
    except (DownloadingError, TranscodingError) as e:
        print(e)
414
415
416
def to_skip(item):
    """Tell whether ``item`` carries one of the excluded tags.

    The text of each ``category`` child is compared, stripped of its
    surrounding whitespace, with the entries of ``options.exclude_tags``.
    Categories without text never match.
    """
    for tag in item.findall("category"):
        text = (tag.text or "").strip()
        if text and text in options.exclude_tags:
            return True
    return False
422
423
def main():
    """Convert the enclosures of the input feed and write the new feed.

    Sets the module-level ``options`` and ``flux_xml`` globals that the
    other functions read. Items with an excluded tag, and items beyond the
    first ``options.max`` remaining ones, are dropped from the feed.
    """
    global options, flux_xml
    options, args = get_options()
    if options.input == "-":
        options.input = sys.stdin
    flux_xml = ET.parse(options.input)
    channel = flux_xml.find("channel")

    # tag skipping
    items = []
    for item in flux_xml.findall("channel/item"):
        if to_skip(item):
            channel.remove(item)
        else:
            items.append(item)

    for i, item in enumerate(items):
        if i < options.max:
            handle_item(item)
        else:
            channel.remove(item)

    flux_xml.write(options.output)
    cleanup(items[:options.max])
444
445
446
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()