a18bfc9 by Aurélien Bompard at 2010-04-11 1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
u"""
4
Reads an RSS/Atom feed and converts the enclosures to AVI.
5
6
Dependencies:
7
8
- ``flvstreamer`` for the RTMP streams
9
- ``mimms`` for the MMS streams
10
- ``ffmpeg`` to get the dimensions of the videos
11
- ``mencoder`` to do the conversion
fcbb169 by Aurélien Bompard at 2010-05-09 12
- ``tedtalksubs.py`` to download TED talks subtitles (in this repo)
a18bfc9 by Aurélien Bompard at 2010-04-11 13
3ecf069 by Aurélien Bompard at 2011-11-05 14
:Authors:
15
    Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org>
a18bfc9 by Aurélien Bompard at 2010-04-11 16
3ecf069 by Aurélien Bompard at 2011-11-05 17
:License:
18
    GNU GPL v3 or later
a18bfc9 by Aurélien Bompard at 2010-04-11 19
"""
20
21
import os
22
import sys
23
import urllib2
45222ed by Aurélien Bompard at 2012-01-07 24
import httplib
a18bfc9 by Aurélien Bompard at 2010-04-11 25
import glob
26
import subprocess
27
import re
2536794 by Aurélien Bompard at 2010-05-23 28
import tempfile
29
import atexit
a18bfc9 by Aurélien Bompard at 2010-04-11 30
from urlparse import urlparse
31
from optparse import OptionParser
26929f9 by Aurélien Bompard at 2011-11-06 32
from stat import S_IRUSR, S_IWUSR, S_IROTH, S_IRGRP
a18bfc9 by Aurélien Bompard at 2010-04-11 33
#from xml.etree import ElementTree as ET
34
from pprint import pprint
35
fcbb169 by Aurélien Bompard at 2010-05-09 36
from lxml import etree as ET
a18bfc9 by Aurélien Bompard at 2010-04-11 37
import urlgrabber
38
import urlgrabber.progress
39
079fdc8 by Aurélien Bompard at 2010-05-09 40
# Default for --exclude-tags: comma-separated tags whose items are dropped.
EXCLUDE_TAGS = ""
# Defaults for --width / --height: videos larger than this are scaled down.
WIDTH, HEIGHT = 800, 480
# Fallback MIME type when none can be determined for an enclosure, and the
# file extension given to encoded videos.
MIME_DEFAULT = "video/x-msvideo"
EXTENSION = "avi"
3ecf069 by Aurélien Bompard at 2011-11-05 48
a18bfc9 by Aurélien Bompard at 2010-04-11 49
50
def get_options():
    """Parse and validate the command line.

    Returns the ``(options, args)`` pair produced by
    ``OptionParser.parse_args()``; ``args`` is always empty because stray
    positional arguments are rejected. Every validation failure goes
    through ``parser.error()``, which prints the usage message and exits.

    Post-processing applied to ``options``:

    - ``input`` may be ``-`` (the caller reads the feed from stdin); any
      other value must name an existing file.
    - ``directory`` defaults to the directory of the input file and must be
      given explicitly when the input is ``-``.
    - ``quiet`` is forced on when ``TERM`` is not set in the environment.
    - ``exclude_tags`` is converted from a comma-separated string to a list
      of stripped, non-empty tag names.
    """
    usage = "usage: %prog -i input_feed -u URL -o output_feed [-d directory]"
    parser = OptionParser(usage=usage)
    parser.add_option("-i", "--input", dest="input",
                      help="Process this file")
    parser.add_option("-o", "--output", dest="output",
                      help="Write the RSS in this file")
    parser.add_option("-d", "--directory", dest="directory",
                      help="Write the converted videos in this directory")
    parser.add_option("-u", "--url", dest="url",
                      help="The external URL of the video folder")
    parser.add_option("-W", "--width", dest="width", type="int", default=WIDTH,
          help="Width of the converted video [default: %default]")
    parser.add_option("-H", "--height", dest="height", type="int", default=HEIGHT,
          help="Height of the converted video [default: %default]")
    parser.add_option("-m", "--max", dest="max", type="int", default=10,
          help="Only convert that many videos, drop the rest [default: %default]")
    parser.add_option("-q", "--quiet", dest="quiet", action="store_true",
                      default=False, help="Don't show progress bars")
    parser.add_option("-k", "--keep", dest="keep", action="store_true",
                      default=False, help="Don't remove original files")
    parser.add_option("--exclude-tags", dest="exclude_tags",
                      default=EXCLUDE_TAGS, help="Drop videos tagged with a "
                      "tag in this comma-sparated list [default: %default]")
    parser.add_option("--subtitles", dest="subtitles", metavar="LANG",
                      help="Download subtitles in this language")
    parser.add_option("--old-ffmpeg", dest="oldffmpeg", action="store_true",
                      help="FFMpeg is old (like on Debian Lenny)")
    options, args = parser.parse_args()

    if args:
        parser.error("illegal arguments: %s" % ", ".join(args))
    if not options.input:
        parser.error("I need a file to process")
    # "-" means stdin and is not a path, so it must bypass the existence
    # check (it used to be rejected here).
    if options.input != "-" and not os.path.exists(options.input):
        parser.error("The file to process does not exist")
    if not options.output:
        parser.error("I need a file to write to")
    if not options.url:
        parser.error("I need an external URL")
    if not options.directory and options.input != "-":
        options.directory = os.path.abspath(os.path.dirname(options.input))
    if not options.directory:
        parser.error("I need a directory for the videos")
    if not options.quiet and "TERM" not in os.environ:
        options.quiet = True # Not in a terminal, be quiet anyway
    if isinstance(options.exclude_tags, basestring):
        # Drop blank entries so an empty option value yields an empty list
        # instead of a list holding one empty tag.
        options.exclude_tags = [ t.strip() for t in
                                 options.exclude_tags.split(",")
                                 if t.strip() ]
    return options, args
99
f571082 by Aurélien Bompard at 2010-07-22 100
101
class PodcastError(Exception):
    """Base class of the errors raised while handling a feed item."""


class NotAPodcastError(PodcastError):
    """The feed item carries no usable enclosure."""


class TranscodingError(PodcastError):
    """Transcoding a downloaded file failed."""


class DownloadingError(PodcastError):
    """Fetching an enclosure failed."""
105
106
107
class Podcast(object):
    """A feed ``<item>`` whose enclosure is downloaded and transcoded.

    The methods read the module-level ``options`` (parsed command line) and
    ``flux_xml`` (parsed feed) globals, which ``main()`` sets before any
    instance is created.

    Exceptions are fetched with ``sys.exc_info()`` rather than bound in the
    ``except`` clause so that the syntax stays valid on old Python 2
    interpreters as well as on Python 3.
    """

    # Extension -> MIME type map shared by all instances; filled the first
    # time the ``mimetypes`` property is read.
    _mimetypes = {}

    def __init__(self, item):
        """Gather everything needed to process ``item``.

        May hit the network (redirect resolution) and, when subtitles were
        requested, run the subtitle downloader.

        Raises NotAPodcastError when the item has no enclosure or the
        enclosure has no URL.
        """
        self.item = item
        self.enclosure = item.find("enclosure")
        if not ET.iselement(self.enclosure):
            raise NotAPodcastError()
        # Order matters: _get_url() may set self.content_type as a side
        # effect, and each later lookup builds on the earlier attributes.
        self.url = self._get_url()
        self.path_downloaded = self._get_downloaded_path()
        self.content_type = self._get_content_type()
        self.path_encoded = self._get_encoded_path()
        self.subs = self._get_subtitles()
        self.video_info = None
        self.size = None

    def _get_url(self):
        """Return the enclosure URL after following HTTP redirects.

        URLs already under ``options.url`` are returned untouched. When the
        URL can be opened, ``self.content_type`` is set from the response's
        ``Content-Type`` header. Failures to open are reported on stdout and
        the unresolved URL is returned.
        """
        url = self.enclosure.get("url")
        if not url:
            # An enclosure without a URL cannot be processed at all.
            raise NotAPodcastError()
        if url.startswith(options.url):
            return url
        # Resolve redirects
        try:
            remote_file = urllib2.urlopen(url)
            try:
                url = remote_file.geturl()
                self.content_type = remote_file.info().get("Content-Type")
            finally:
                remote_file.close()
        except (urllib2.HTTPError, httplib.HTTPException):
            print("Failed downloading %s" % url)
            print(sys.exc_info()[1])
        except urllib2.URLError:
            print("Probably RTMP or MMS: %s" % url)
        return url

    def _get_content_type(self):
        """Return the MIME type of the enclosure.

        Sources, in order: the value already stored by ``_get_url()``, the
        enclosure's ``content-type`` attribute, the extension of the
        downloaded file, and finally ``MIME_DEFAULT``.
        """
        if getattr(self, "content_type", None):
            return self.content_type # already set by _get_url()
        # NOTE(review): RSS enclosures usually carry their MIME type in a
        # "type" attribute (which update_item() writes) -- confirm that
        # "content-type" is really the attribute meant here.
        if "content-type" in self.enclosure.attrib:
            return self.enclosure.attrib.get("content-type")
        for extension, mimetype in self.mimetypes.items():
            if self.path_downloaded.endswith("." + extension):
                return mimetype
        return MIME_DEFAULT

    def _get_downloaded_path(self):
        """Return where the original file is stored in ``options.directory``."""
        url_obj = urlparse(self.url)
        filename = os.path.basename(url_obj.path)
        # Cut any query string that is still attached to the file name.
        if filename.count("?"):
            filename = filename[:filename.index("?")]
        return os.path.join(options.directory, filename)

    def _get_encoded_path(self):
        """Return where the final (possibly transcoded) file is stored.

        Audio files are not transcoded, so their encoded path is the
        downloaded path. Otherwise the extension is swapped for
        ``EXTENSION``.
        """
        if self.content_type.startswith("audio/"):
            # the file won't be transcoded
            return self.path_downloaded
        # path_downloaded already includes options.directory; joining the
        # directory a second time duplicated it whenever it was relative.
        return "%s.%s" % (os.path.splitext(self.path_downloaded)[0],
                          EXTENSION)

    def _get_subtitles(self):
        """Download the subtitles and return the path of the ``.srt`` file.

        Returns None when no subtitle language was requested, when the feed
        is not one of the TED Talks feeds, when the talk id cannot be read
        from the item's ``guid``, or when ``tedtalksubs.py`` fails. The
        temporary file is removed at interpreter exit.
        """
        if not options.subtitles:
            return None
        # Check the feed first so that no temporary file is created for
        # feeds that have no subtitle source.
        channel_title = flux_xml.findtext("channel/title")
        if channel_title not in ("TEDTalks (video)", "TEDTalks (hd)"):
            return None
        # The talk id is the second ":"-separated field of the guid.
        guid = self.item.findtext("guid")
        if not guid or ":" not in guid:
            return None
        talkid = guid.split(":")[1]
        subfile, subfile_path = tempfile.mkstemp(
                prefix="podcast-transcode-sub-", suffix=".srt")
        os.close(subfile)
        atexit.register(os.remove, subfile_path)
        subdl = subprocess.Popen(["tedtalksubs.py", "-l", options.subtitles,
                                  "-o", subfile_path, talkid],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
        out = subdl.communicate()[0]  # stderr is merged into stdout
        if subdl.returncode != 0:
            sys.stderr.write("Failed to download subtitles. Message:\n")
            sys.stderr.write("%s\n" % out)
            return None
        return subfile_path

    @staticmethod
    def _parse_mimetypes(lines):
        """Build an extension -> MIME type dict from mime.types-style lines.

        Each line is ``<mimetype> <ext> [<ext> ...]``; text after ``#`` is
        ignored. Only ``video/`` and ``audio/`` types are kept, and every
        extension listed on a line is mapped, not just the first one.
        """
        mapping = {}
        for line in lines:
            fields = line.split("#", 1)[0].split()
            if len(fields) < 2:
                continue  # blank line, comment, or type without extension
            mimetype = fields[0]
            if not mimetype.startswith(("video/", "audio/")):
                continue
            for extension in fields[1:]:
                mapping[extension] = mimetype
        return mapping

    def _get_mimetypes(self):
        """Return the extension -> MIME type map read from /etc/mime.types.

        The map is cached on the class. When the file cannot be opened the
        map stays empty, so callers fall back to their own default.
        """
        if self._mimetypes:
            return self._mimetypes
        try:
            mimetypes = open("/etc/mime.types")
        except IOError:
            return self._mimetypes
        try:
            # update() mutates the shared class-level dict in place.
            self._mimetypes.update(self._parse_mimetypes(mimetypes))
        finally:
            mimetypes.close()
        return self._mimetypes
    mimetypes = property(_get_mimetypes)

    def is_already_transcoded(self):
        """Tell whether the enclosure already points under ``options.url``."""
        return self.url.startswith(options.url)

    def process(self):
        """Download and encode the file if needed, then update the item.

        Afterwards ``self.url`` points to the encoded file under
        ``options.url`` and ``self.size`` holds its size in bytes.
        """
        if not os.path.exists(self.path_encoded):
            self.download()
            self.encode_video()
        else:
            print("Already downloaded/encoded: %s" % self.path_encoded)
        self.url = "%s/%s" % (options.url, os.path.basename(self.path_encoded))
        self.size = int(os.stat(self.path_encoded).st_size)
        self.update_item()

    def update_item(self):
        """Write the new URL, size and MIME type into the feed item.

        Besides the enclosure, the FeedBurner ``origEnclosureLink`` and the
        Media RSS ``content`` elements are updated when present.
        """
        self.enclosure.set("url", self.url)
        self.enclosure.set("length", str(self.size))
        self.enclosure.set("type", self.content_type)
        fb = self.item.find("{http://rssnamespace.org/feedburner/ext/1.0}origEnclosureLink")
        if ET.iselement(fb):
            fb.text = self.url
        mediacontent = self.item.find("{http://search.yahoo.com/mrss/}content")
        if ET.iselement(mediacontent):
            mediacontent.set("url", self.url)
            mediacontent.set("fileSize", str(self.size))
            mediacontent.set("type", self.content_type)

    def download(self):
        """Fetch the enclosure into ``self.path_downloaded``.

        Nothing is done when the file already exists. RTMP and MMS URLs and
        ASF playlists go to their dedicated methods; everything else is
        fetched with urlgrabber.

        Raises DownloadingError when the transfer fails.
        """
        if os.path.exists(self.path_downloaded):
            print("Already downloaded: %s" % self.path_downloaded)
            return
        if self.url.startswith("rtmp://"):
            self.download_rtmp()
        elif self.url.startswith("mms://"):
            self.download_mms()
        elif self.content_type == "video/x-ms-asf":
            self.download_asf()
        else:
            if options.quiet:
                progress = urlgrabber.progress.BaseMeter()
            else:
                progress = urlgrabber.progress.TextMeter(fo=sys.stdout)
            print("Downloading %s to %s" % (self.url, self.path_downloaded))
            try:
                urlgrabber.urlgrab(self.url, filename=self.path_downloaded,
                                   reget='simple', progress_obj=progress)
            except urlgrabber.grabber.URLGrabError:
                raise DownloadingError("Error downloading %s: %s"
                                       % (self.url, sys.exc_info()[1]))

    def download_rtmp(self):
        """Stream an RTMP URL with ``flvstreamer``, resuming on partial runs.

        Raises DownloadingError (after removing the partial file) when the
        command ends with a non-zero exit code.
        """
        MAX_TRIES = 10

        def download_rtmp_unit(url, path):
            """Run flvstreamer once and return its exit code."""
            command = ["flvstreamer", "-r", url, "-o", path]
            if options.quiet:
                command.append("-q")
            if os.path.exists(path):
                command.insert(1, "--resume")
            print("Streaming %s to %s" % (url, path))
            try:
                return subprocess.call(command)
            except KeyboardInterrupt:
                # An interrupted download is handled like a failed one.
                return 1

        retcode = download_rtmp_unit(self.url, self.path_downloaded)
        # flvstreamer returns 2 if the download is incomplete
        current_try = 1
        while retcode == 2:
            # Give up only while still failing, so the message is not
            # printed after a retry that succeeded.
            if current_try > MAX_TRIES:
                print("Too many tries, aborting.")
                break
            print("Trying again...")
            retcode = download_rtmp_unit(self.url, self.path_downloaded)
            current_try += 1
        if retcode != 0:
            if os.path.exists(self.path_downloaded):
                os.remove(self.path_downloaded)
            raise DownloadingError("Error code: %s" % retcode)

    def download_mms(self, url=None, path=None):
        """Stream an MMS URL with ``mimms``.

        ``url`` and ``path`` default to ``self.url`` and
        ``self.path_downloaded``.

        Raises DownloadingError when the command fails, whether or not a
        partial file was left behind (any such file is removed first).
        """
        if url is None:
            url = self.url
        if path is None:
            path = self.path_downloaded
        #command = ["mplayer", "-dumpstream", "-dumpfile", path, url]
        command = ["mimms", url, path]
        if options.quiet:
            command.append("-q")
        print("Streaming %s to %s" % (url, path))
        try:
            retcode = subprocess.call(command)
        except KeyboardInterrupt:
            retcode = 1
        if retcode != 0:
            if os.path.exists(path):
                os.remove(path)
            raise DownloadingError("Error code: %s" % retcode)

    def download_asf(self):
        """Fetch an ASF playlist and stream the MMS URL it refers to.

        Raises DownloadingError when the playlist cannot be fetched, holds
        no quoted ``mms://`` URL, or the MMS download fails.
        """
        try:
            remote_file = urllib2.urlopen(self.url)
            try:
                mms_xml = remote_file.read()
            finally:
                remote_file.close()
        except (urllib2.URLError, httplib.HTTPException):
            raise DownloadingError("Error downloading %s: %s"
                                   % (self.url, sys.exc_info()[1]))
        # Stop at the first closing quote: a greedy ".*" would also take in
        # whatever follows on the same line.
        mms_match = re.search(r'"(mms://[^"]*)"', mms_xml)
        if not mms_match:
            raise DownloadingError("No MMS URL found in %s" % self.url)
        mms_url = mms_match.group(1).replace("&amp;", "&")
        return self.download_mms(mms_url, self.path_downloaded)

    def encode_video(self):
        """Transcode the downloaded file and move it to ``path_encoded``.

        Does nothing when both paths are the same. The final file gets mode
        644.
        """
        if self.path_encoded == self.path_downloaded:
            print("No transcoding required")
            return
        transcoded_video = self._transcode_video()
        os.rename(transcoded_video, self.path_encoded)
        os.chmod(self.path_encoded, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH) # 644

    def _transcode_video(self):
        """Run ``mencoder`` and return the path of the temporary AVI file.

        The file is created in ``options.directory``. The video is scaled
        down when it exceeds ``options.height`` or ``options.width``, and
        subtitles are added when available. On success ``self.content_type``
        becomes ``video/x-msvideo``.

        Raises TranscodingError when mencoder ends with a non-zero exit code.
        """
        width, height = self.get_video_info()
        transcoded_video_file, transcoded_video = tempfile.mkstemp(
                prefix="podcast-transcode-", suffix=".avi", dir=options.directory)
        os.close(transcoded_video_file)

        def rm_if_exists(f):
            if os.path.exists(f):
                os.remove(f)
        if not options.keep:
            atexit.register(rm_if_exists, transcoded_video)
        command = ["mencoder", "-oac", "mp3lame",
                   "-ovc", "lavc", "-lavcopts", "vbitrate=600",
                   "-of", "avi", "-mc", "0", self.path_downloaded,
                   "-o", transcoded_video]
        if height and width:
            # Only one scale filter is added; the height limit wins.
            if int(height) > options.height:
                command.extend(["-vf", "scale=-3:%d" % options.height])
            elif int(width) > options.width:
                command.extend(["-vf", "scale=%d:-3" % options.width])
        if options.quiet:
            command.append("-quiet")
        if self.subs:
            command.extend(["-sub", self.subs, "-subfont-text-scale", "4"])
            if not os.path.exists(os.path.expanduser("~/.mplayer/subfont.ttf")):
                command.extend(["-fontconfig", "-font", "DejaVu Sans"])
        print(" ".join(command))
        print("Encoding %s to %s" % (self.path_downloaded, transcoded_video))
        try:
            retcode = subprocess.call(command)
        except KeyboardInterrupt:
            retcode = 1
        if retcode != 0:
            if os.path.exists(transcoded_video):
                os.remove(transcoded_video)
            raise TranscodingError("Error code: %s" % retcode)
        self.content_type = "video/x-msvideo"
        return transcoded_video

    def get_video_info(self):
        """Return ``(width, height)`` of the downloaded video, as strings.

        The values are parsed from the output of ``ffmpeg -i`` and cached in
        ``self.video_info``; ``(None, None)`` is returned when no video
        stream line is found.
        """
        if self.video_info is not None:
            return self.video_info
        ffmpeg_cmd = subprocess.Popen(["ffmpeg", "-i", self.path_downloaded],
                        stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        # communicate() also waits for the child, so it does not linger as
        # a zombie the way a bare stdout.read() left it.
        output = ffmpeg_cmd.communicate()[0]
        info_match = re.search(r"Stream .*: Video: (\w+), \w+, (\d+)x(\d+)",
                               output)
        if info_match:
            self.video_info = (info_match.group(2), info_match.group(3))
        else:
            self.video_info = (None, None)
        return self.video_info
367
a18bfc9 by Aurélien Bompard at 2010-04-11 368
369
def cleanup(items):
    """Remove files from the video directory that the feed no longer uses.

    ``items`` are the feed items that were kept. A file in
    ``options.directory`` survives when its name matches the downloaded or
    encoded file of one of them, or when it ends in ``.xml`` (the feed
    itself). Directories are left alone. Nothing is done when
    ``options.keep`` is set.
    """
    # Bail out before building any Podcast: construction can trigger
    # network requests and subtitle downloads that --keep makes pointless.
    if options.keep:
        return

    feed_podcasts = set()
    for item in items:
        try:
            podcast = Podcast(item)
        except NotAPodcastError:
            continue
        feed_podcasts.add(os.path.basename(podcast.path_downloaded))
        feed_podcasts.add(os.path.basename(podcast.path_encoded))

    for filepath in glob.glob(os.path.join(options.directory, "*")):
        if filepath.endswith(".xml"):
            continue # keep the RSS feed
        if os.path.isdir(filepath) and not os.path.islink(filepath):
            continue # os.remove() fails on directories
        filename = os.path.basename(filepath)
        if filename not in feed_podcasts:
            print("Removing old file %s" % filename)
            os.remove(filepath)
390
391
392
def handle_item(item):
    """Download and transcode the enclosure of one feed item.

    Items without an enclosure and items that already point at the
    converted file are left untouched. When subtitles were requested but
    none were obtained, the item is removed from the feed. Download and
    transcoding errors are printed and otherwise ignored.
    """
    try:
        podcast = Podcast(item)
    except NotAPodcastError:
        return

    if podcast.is_already_transcoded():
        print("Already converted: %s" % podcast.url)
    elif options.subtitles and not podcast.subs:
        print("No subtitles for %s, skipping." % item.findtext("guid"))
        flux_xml.find("channel").remove(item)
    else:
        try:
            podcast.process()
        except (DownloadingError, TranscodingError):
            # The message is all that gets reported; the item stays as is.
            print(sys.exc_info()[1])
f571082 by Aurélien Bompard at 2010-07-22 414
a18bfc9 by Aurélien Bompard at 2010-04-11 415
416
def to_skip(item):
    """Tell whether ``item`` has a ``category`` listed in the excluded tags."""
    return any(category.text in options.exclude_tags
               for category in item.findall("category"))
422
423
def main():
    """Convert the feed: filter, process and prune items, then write it out.

    Sets the module-level ``options`` and ``flux_xml`` globals used by the
    rest of the module.
    """
    global options, flux_xml
    options = get_options()[0]
    if options.input == "-":
        options.input = sys.stdin
    flux_xml = ET.parse(options.input)
    channel = flux_xml.find("channel")

    # Drop the items carrying an excluded tag, remember the others.
    kept = []
    for item in flux_xml.findall("channel/item"):
        if to_skip(item):
            channel.remove(item)
        else:
            kept.append(item)

    # Process the first options.max items and drop the rest from the feed.
    for position, item in enumerate(kept):
        if position >= options.max:
            channel.remove(item)
            continue
        handle_item(item)

    #flux_xml.write(options.output, "utf-8")
    flux_xml.write(options.output)
    cleanup(kept[:options.max])
7581434 by Aurélien Bompard at 2011-04-03 444
a18bfc9 by Aurélien Bompard at 2010-04-11 445
446
if __name__ == "__main__":
    # Run the conversion only when executed as a script, not on import.
    main()
448