| 1 |
#!/usr/bin/env python |
| 2 |
# -*- coding: utf-8 -*- |
| 3 |
u""" |
| 4 |
Reads an RSS/Atom feed and converts the enclosures to AVI. |
| 5 |
|
| 6 |
Dependencies: |
| 7 |
|
| 8 |
- ``flvstreamer`` for the RTMP streams |
| 9 |
- ``mimms`` for the MMS streams |
| 10 |
- ``ffmpeg`` to get the sizes of the videos |
| 11 |
- ``mencoder`` to do the conversion |
| 12 |
- ``tedtalksubs.py`` to download TED talks subtitles (in this repo) |
| 13 |
|
| 14 |
:Authors: |
| 15 |
Aurélien Bompard <aurelien@bompard.org> <http://aurelien.bompard.org> |
| 16 |
|
| 17 |
:License: |
| 18 |
GNU GPL v3 or later |
| 19 |
""" |
| 20 |
|
| 21 |
import os |
| 22 |
import sys |
| 23 |
import urllib2 |
| 24 |
import httplib |
| 25 |
import glob |
| 26 |
import subprocess |
| 27 |
import re |
| 28 |
import tempfile |
| 29 |
import atexit |
| 30 |
from urlparse import urlparse |
| 31 |
from optparse import OptionParser |
| 32 |
from stat import S_IRUSR, S_IWUSR, S_IROTH, S_IRGRP |
| 33 |
#from xml.etree import ElementTree as ET |
| 34 |
from pprint import pprint |
| 35 |
|
| 36 |
from lxml import etree as ET |
| 37 |
import urlgrabber |
| 38 |
import urlgrabber.progress |
| 39 |
|
| 40 |
# Default value of --exclude-tags: a comma-separated list of <category>
# values whose items are dropped from the feed (empty: drop nothing).
EXCLUDE_TAGS = ""

# Default upper bounds, in pixels, for the encoded video
# (overridable with --width / --height).
WIDTH = 800
HEIGHT = 480

# MIME type used when nothing better can be determined for an enclosure,
# and the file extension given to transcoded videos.
MIME_DEFAULT = "video/x-msvideo"
EXTENSION = "avi"
| 48 |
|
| 49 |
|
| 50 |
def get_options():
    """Parse and validate the command line.

    Returns an ``(options, args)`` tuple as produced by ``optparse``; ``args``
    is always empty because positional arguments are rejected.  Any
    validation failure is reported through ``parser.error()``, which prints
    the message and exits.

    Post-processing done on the parsed options:

    - ``directory`` defaults to the directory of the input feed (unless the
      feed is read from stdin, in which case ``-d`` is mandatory);
    - ``quiet`` is forced on when no ``TERM`` environment variable is set;
    - ``exclude_tags`` is converted from a comma-separated string to a list
      of stripped tag names.
    """
    usage = "usage: %prog -i input_feed -u URL -o output_feed [-d directory]"
    parser = OptionParser(usage=usage)
    parser.add_option("-i", "--input", dest="input",
                      help="Process this file")
    parser.add_option("-o", "--output", dest="output",
                      help="Write the RSS in this file")
    parser.add_option("-d", "--directory", dest="directory",
                      help="Write the converted videos in this directory")
    parser.add_option("-u", "--url", dest="url",
                      help="The external URL of the video folder")
    parser.add_option("-W", "--width", dest="width", type="int", default=WIDTH,
                      help="Width of the converted video [default: %default]")
    parser.add_option("-H", "--height", dest="height", type="int", default=HEIGHT,
                      help="Height of the converted video [default: %default]")
    parser.add_option("-m", "--max", dest="max", type="int", default=10,
                      help="Only convert that many videos, drop the rest [default: %default]")
    parser.add_option("-q", "--quiet", dest="quiet", action="store_true",
                      default=False, help="Don't show progress bars")
    parser.add_option("-k", "--keep", dest="keep", action="store_true",
                      default=False, help="Don't remove original files")
    parser.add_option("--exclude-tags", dest="exclude_tags",
                      default=EXCLUDE_TAGS, help="Drop videos tagged with a "
                      "tag in this comma-separated list [default: %default]")
    parser.add_option("--subtitles", dest="subtitles", metavar="LANG",
                      help="Download subtitles in this language")
    parser.add_option("--old-ffmpeg", dest="oldffmpeg", action="store_true",
                      help="FFMpeg is old (like on Debian Lenny)")
    options, args = parser.parse_args()

    if args:
        parser.error("illegal arguments: %s" % ", ".join(args))
    if not options.input:
        parser.error("I need a file to process")
    # "-" means "read the feed from stdin" (see main()), so it must not be
    # subjected to the on-disk existence check.
    if options.input != "-" and not os.path.exists(options.input):
        parser.error("The file to process does not exist")
    if not options.output:
        parser.error("I need a file to write to")
    if not options.url:
        parser.error("I need an external URL")
    if not options.directory and options.input != "-":
        # Default: store the videos next to the input feed.
        options.directory = os.path.abspath(os.path.dirname(options.input))
    if not options.directory:
        parser.error("I need a directory for the videos")
    if not options.quiet and "TERM" not in os.environ:
        options.quiet = True  # Not in a terminal, be quiet anyway
    if isinstance(options.exclude_tags, basestring):
        options.exclude_tags = [t.strip() for t in
                                options.exclude_tags.split(",")]
    return options, args
| 99 |
|
| 100 |
|
| 101 |
class PodcastError(Exception):
    """Base class of all the errors raised by this script."""


class NotAPodcastError(PodcastError):
    """The feed item has no enclosure element."""


class TranscodingError(PodcastError):
    """The video could not be transcoded."""


class DownloadingError(PodcastError):
    """The enclosure could not be downloaded."""
| 105 |
|
| 106 |
|
| 107 |
class Podcast(object):
    """A feed item with an enclosure, and the logic to mirror it locally.

    Building an instance resolves the enclosure URL, works out where the
    downloaded and the encoded files live inside ``options.directory``, and
    (when ``--subtitles`` is given) tries to fetch subtitles.  ``process()``
    then downloads and transcodes the media if needed and rewrites the feed
    item so that it points at the local copy.

    The class relies on the module-level ``options`` and ``flux_xml`` globals
    set by ``main()``.

    :raises NotAPodcastError: when the item has no usable enclosure.
    """

    # Extension -> MIME type cache, filled lazily and shared by all instances.
    _mimetypes = {}
    # MIME type advertised for the files produced by mencoder.
    _TRANSCODED_MIME_TYPE = "video/x-msvideo"

    def __init__(self, item):
        self.item = item
        self.enclosure = item.find("enclosure")
        if not ET.iselement(self.enclosure):
            raise NotAPodcastError()
        # Order matters: _get_url() may set self.content_type as a side
        # effect, and each later step reads attributes set by earlier ones.
        self.url = self._get_url()
        self.path_downloaded = self._get_downloaded_path()
        self.content_type = self._get_content_type()
        self.path_encoded = self._get_encoded_path()
        self.subs = self._get_subtitles()
        self.video_info = None
        self.size = None

    def _get_url(self):
        """Return the enclosure URL with HTTP redirects resolved.

        URLs already pointing below ``options.url`` are returned untouched.
        When the remote server answers, its ``Content-Type`` header is stored
        in ``self.content_type``.  Network failures are reported on stdout
        and the unresolved URL is returned.
        """
        url = self.enclosure.get("url")
        if not url:
            # An enclosure without a URL cannot be downloaded.
            raise NotAPodcastError()
        if url.startswith(options.url):
            return url
        # Resolve redirects
        try:
            remote_file = urllib2.urlopen(url)
            try:
                url = remote_file.geturl()
                self.content_type = remote_file.info().get("Content-Type")
            finally:
                remote_file.close()
        except (urllib2.HTTPError, httplib.HTTPException) as e:
            print("Failed downloading %s" % url)
            print(e)
        except urllib2.URLError:
            print("Probably RTMP or MMS: %s" % url)
        return url

    def _get_content_type(self):
        """Return the MIME type of the enclosure.

        Sources, by decreasing priority: the HTTP header captured by
        ``_get_url()``, the enclosure's ``content-type`` or ``type``
        attribute, the extension of the downloaded file, ``MIME_DEFAULT``.
        """
        if getattr(self, "content_type", None):
            return self.content_type  # already set by _get_url()
        # "type" is the attribute actually defined for RSS enclosures;
        # "content-type" is still honoured for backward compatibility.
        for attribute in ("content-type", "type"):
            declared = self.enclosure.attrib.get(attribute)
            if declared:
                return declared
        for extension, mimetype in self.mimetypes.items():
            if self.path_downloaded.endswith("." + extension):
                return mimetype
        return MIME_DEFAULT

    def _get_downloaded_path(self):
        """Return the local path where the original media file is stored."""
        filename = os.path.basename(urlparse(self.url).path)
        # The query string may be left in the path for some URL schemes.
        filename = filename.split("?", 1)[0]
        return os.path.join(options.directory, filename)

    def _get_encoded_path(self):
        """Return the local path of the file that will be published.

        Audio files are published as downloaded; anything else gets the
        ``EXTENSION`` suffix of the transcoded video.
        """
        if self.content_type.startswith("audio/"):
            # the file won't be transcoded
            return self.path_downloaded
        # path_downloaded already includes options.directory: joining the
        # directory again would duplicate it when it is a relative path.
        return "%s.%s" % (os.path.splitext(self.path_downloaded)[0], EXTENSION)

    def _get_subtitles(self):
        """Download the subtitles and return the path of the ``.srt`` file.

        Subtitles are only available for the TED talks feeds, through the
        ``tedtalksubs.py`` helper.  Returns ``None`` when subtitles were not
        requested, are not supported for this feed, or failed to download.
        """
        if not options.subtitles:
            return None
        if flux_xml.findtext("channel/title") not in ("TEDTalks (video)",
                                                      "TEDTalks (hd)"):
            return None
        # The talk id is the second colon-separated field of the guid.
        guid = self.item.findtext("guid") or ""
        guid_fields = guid.split(":")
        if len(guid_fields) < 2:
            sys.stderr.write("Failed to download subtitles. Message:\n")
            sys.stderr.write("Can't find the talk id in guid %r\n" % guid)
            return None
        talkid = guid_fields[1]
        subfile, subfile_path = tempfile.mkstemp(
            prefix="podcast-transcode-sub-", suffix=".srt")
        os.close(subfile)
        atexit.register(os.remove, subfile_path)
        subdl = subprocess.Popen(["tedtalksubs.py", "-l", options.subtitles,
                                  "-o", subfile_path, talkid],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.STDOUT)
        out = subdl.communicate()[0]
        if subdl.returncode != 0:
            sys.stderr.write("Failed to download subtitles. Message:\n")
            sys.stderr.write("%s\n" % out)
            return None
        return subfile_path

    @staticmethod
    def _parse_mime_types(lines):
        """Build an extension -> MIME type dict from ``mime.types`` lines.

        Only ``video/*`` and ``audio/*`` types are kept.  Every extension
        listed on a line is registered; comments and lines without any
        extension are ignored.
        """
        mapping = {}
        for line in lines:
            fields = line.split("#", 1)[0].split()
            if len(fields) < 2:
                continue
            mimetype = fields[0]
            if not mimetype.startswith(("video/", "audio/")):
                continue
            for extension in fields[1:]:
                mapping[extension] = mimetype
        return mapping

    def _get_mimetypes(self):
        """Return the audio/video extension -> MIME type mapping.

        The mapping is read from ``/etc/mime.types`` on first use and cached
        on the class.  It is empty when that file can't be read.
        """
        if self._mimetypes:
            return self._mimetypes
        try:
            with open("/etc/mime.types") as mimetypes:
                parsed = self._parse_mime_types(mimetypes)
        except IOError:
            parsed = {}
        # Update in place: the dict is the cache shared through the class.
        self._mimetypes.update(parsed)
        return self._mimetypes
    mimetypes = property(_get_mimetypes)

    def is_already_transcoded(self):
        """Tell whether the enclosure already points below ``options.url``."""
        return self.url.startswith(options.url)

    def process(self):
        """Download and encode the media if needed, then update the item.

        :raises DownloadingError: when the download fails.
        :raises TranscodingError: when the encoding fails.
        """
        if not os.path.exists(self.path_encoded):
            self.download()
            self.encode_video()
        else:
            print("Already downloaded/encoded: %s" % self.path_encoded)
            if self.path_encoded != self.path_downloaded:
                # The existing file is the product of a previous encoding:
                # advertise its type, not the type of the original media.
                self.content_type = self._TRANSCODED_MIME_TYPE
        self.url = "%s/%s" % (options.url, os.path.basename(self.path_encoded))
        self.size = int(os.stat(self.path_encoded).st_size)
        self.update_item()

    def update_item(self):
        """Write the new URL, size and type in the feed item.

        Besides the enclosure, the FeedBurner ``origEnclosureLink`` and the
        Media RSS ``content`` elements are updated when present.
        """
        self.enclosure.set("url", self.url)
        self.enclosure.set("length", str(self.size))
        self.enclosure.set("type", self.content_type)
        fb = self.item.find("{http://rssnamespace.org/feedburner/ext/1.0}origEnclosureLink")
        if ET.iselement(fb):
            fb.text = self.url
        mediacontent = self.item.find("{http://search.yahoo.com/mrss/}content")
        if ET.iselement(mediacontent):
            mediacontent.set("url", self.url)
            mediacontent.set("fileSize", str(self.size))
            mediacontent.set("type", self.content_type)

    def download(self):
        """Fetch the media file, choosing the tool from the URL and type.

        Nothing is done when the file is already present on disk.

        :raises DownloadingError: when the download fails.
        """
        if os.path.exists(self.path_downloaded):
            print("Already downloaded: %s" % self.path_downloaded)
            return
        if self.url.startswith("rtmp://"):
            self.download_rtmp()
        elif self.url.startswith("mms://"):
            self.download_mms()
        elif self.content_type and self.content_type == "video/x-ms-asf":
            self.download_asf()
        else:
            if options.quiet:
                progress = urlgrabber.progress.BaseMeter()
            else:
                progress = urlgrabber.progress.TextMeter(fo=sys.stdout)
            print("Downloading %s to %s" % (self.url, self.path_downloaded))
            try:
                urlgrabber.urlgrab(self.url, filename=self.path_downloaded,
                                   reget='simple', progress_obj=progress)
            except urlgrabber.grabber.URLGrabError as e:
                raise DownloadingError("Error downloading %s: %s"
                                       % (self.url, e))

    def download_rtmp(self):
        """Fetch an RTMP stream with ``flvstreamer``, resuming if incomplete.

        :raises DownloadingError: when flvstreamer fails, is interrupted, or
            still reports an incomplete download after the retries.
        """
        MAX_TRIES = 10

        def download_rtmp_unit(url, path):
            command = ["flvstreamer", "-r", url, "-o", path]
            if options.quiet:
                command.append("-q")
            if os.path.exists(path):
                command.insert(1, "--resume")
            print("Streaming %s to %s" % (url, path))
            try:
                return subprocess.call(command)
            except KeyboardInterrupt:
                return 1

        retcode = download_rtmp_unit(self.url, self.path_downloaded)
        # flvstreamer returns 2 if the download is incomplete
        current_try = 1
        while retcode == 2:
            print("Trying again...")
            retcode = download_rtmp_unit(self.url, self.path_downloaded)
            current_try += 1
            if current_try > MAX_TRIES:
                print("Too many tries, aborting.")
                break
        if retcode != 0:
            if os.path.exists(self.path_downloaded):
                os.remove(self.path_downloaded)
            raise DownloadingError("Error code: %s" % retcode)

    def download_mms(self, url=None):
        """Fetch an MMS stream with ``mimms``.

        :param url: the stream to fetch; defaults to ``self.url``.
        :raises DownloadingError: when mimms fails or is interrupted.  The
            partial file, if any, is removed first.
        """
        if url is None:
            url = self.url
        #command = ["mplayer", "-dumpstream", "-dumpfile", self.path_downloaded, url]
        command = ["mimms", url, self.path_downloaded]
        if options.quiet:
            command.append("-q")
        print("Streaming %s to %s" % (url, self.path_downloaded))
        try:
            retcode = subprocess.call(command)
        except KeyboardInterrupt:
            retcode = 1
        if retcode != 0:
            # A failure must be reported even when nothing was written.
            if os.path.exists(self.path_downloaded):
                os.remove(self.path_downloaded)
            raise DownloadingError("Error code: %s" % retcode)

    def download_asf(self):
        """Fetch the MMS stream referenced by the document at ``self.url``.

        The document is searched for the first double-quoted ``mms://`` URL.

        :raises DownloadingError: when the document can't be fetched, holds
            no MMS URL, or when the stream download fails.
        """
        try:
            playlist = urllib2.urlopen(self.url)
            try:
                mms_xml = playlist.read()
            finally:
                playlist.close()
        except (urllib2.URLError, httplib.HTTPException) as e:
            raise DownloadingError("Error downloading %s: %s" % (self.url, e))
        # Stop at the closing quote: a greedy match would swallow any other
        # quoted attribute found on the same line.
        mms_match = re.search('"(mms://[^"]*)"', mms_xml)
        if mms_match is None:
            raise DownloadingError("No MMS stream found in %s" % self.url)
        # The URL comes from markup: unescape the ampersands.
        mms_url = mms_match.group(1).replace("&amp;", "&")
        return self.download_mms(mms_url)

    def encode_video(self):
        """Transcode the downloaded file and move the result in place.

        Nothing is done when the encoded and downloaded paths are the same
        (audio files).  The resulting file is made world-readable.

        :raises TranscodingError: when the encoding fails.
        """
        if self.path_encoded == self.path_downloaded:
            print("No transcoding required")
            return
        transcoded_video = self._transcode_video()
        os.rename(transcoded_video, self.path_encoded)
        os.chmod(self.path_encoded, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)  # 644

    @staticmethod
    def _get_scale_filter(width, height, max_width, max_height):
        """Return the mencoder ``-vf`` value fitting the video in the limits.

        Returns ``None`` when the dimensions are unknown or already within
        the limits.  Otherwise the video is scaled along the dimension that
        overflows the most, so that both limits end up being respected.
        """
        if not (width and height):
            return None
        width, height = int(width), int(height)
        if width <= max_width and height <= max_height:
            return None
        # Compare width/max_width to height/max_height, in integers.
        if width * max_height > height * max_width:
            return "scale=%d:-3" % max_width
        return "scale=-3:%d" % max_height

    def _transcode_video(self):
        """Encode the downloaded file to AVI with ``mencoder``.

        The video is written to a temporary file inside
        ``options.directory``, whose path is returned.  On success
        ``self.content_type`` is set to the AVI MIME type.

        :raises TranscodingError: when mencoder fails or is interrupted.
        """
        width, height = self.get_video_info()
        transcoded_video_file, transcoded_video = tempfile.mkstemp(
            prefix="podcast-transcode-", suffix=".avi", dir=options.directory)
        os.close(transcoded_video_file)

        def rm_if_exists(f):
            if os.path.exists(f):
                os.remove(f)
        if not options.keep:
            atexit.register(rm_if_exists, transcoded_video)
        command = ["mencoder", "-oac", "mp3lame",
                   "-ovc", "lavc", "-lavcopts", "vbitrate=600",
                   "-of", "avi", "-mc", "0", self.path_downloaded,
                   "-o", transcoded_video]
        scale_filter = self._get_scale_filter(width, height,
                                              options.width, options.height)
        if scale_filter:
            command.extend(["-vf", scale_filter])
        if options.quiet:
            command.append("-quiet")
        if self.subs:
            command.extend(["-sub", self.subs, "-subfont-text-scale", "4"])
            if not os.path.exists(os.path.expanduser("~/.mplayer/subfont.ttf")):
                command.extend(["-fontconfig", "-font", "DejaVu Sans"])
        print(" ".join(command))
        print("Encoding %s to %s" % (self.path_downloaded, transcoded_video))
        try:
            retcode = subprocess.call(command)
        except KeyboardInterrupt:
            retcode = 1
        if retcode != 0:
            if os.path.exists(transcoded_video):
                os.remove(transcoded_video)
            raise TranscodingError("Error code: %s" % retcode)
        self.content_type = self._TRANSCODED_MIME_TYPE
        return transcoded_video

    def get_video_info(self):
        """Return the ``(width, height)`` of the downloaded video.

        The values are the strings found in the output of ``ffmpeg -i``, or
        ``(None, None)`` when that output holds no recognizable video
        stream.  The result is cached in ``self.video_info``.
        """
        if self.video_info is not None:
            return self.video_info
        ffmpeg_cmd = subprocess.Popen(["ffmpeg", "-i", self.path_downloaded],
                                      stdout=subprocess.PIPE,
                                      stderr=subprocess.STDOUT)
        # communicate() also waits for the process, so it gets reaped.
        output = ffmpeg_cmd.communicate()[0]
        info_match = re.search(r"Stream .*: Video: (\w+), \w+, (\d+)x(\d+)",
                               output)
        if info_match:
            self.video_info = (info_match.group(2), info_match.group(3))
        else:
            self.video_info = (None, None)
        return self.video_info
| 367 |
|
| 368 |
|
| 369 |
def cleanup(items):
    """Remove from the video directory the files the feed no longer uses.

    Every file of ``options.directory`` which is neither an ``.xml`` file nor
    the downloaded or encoded file of one of ``items`` is deleted.  Nothing
    is done when ``--keep`` was given.

    :param items: the feed items whose files must be preserved.
    """
    if options.keep:
        # Check first: building the Podcast objects below is not free (it
        # may hit the network and download subtitles again).
        return

    feed_podcasts = set()
    for item in items:
        try:
            podcast = Podcast(item)
        except NotAPodcastError:
            continue
        feed_podcasts.add(os.path.basename(podcast.path_downloaded))
        feed_podcasts.add(os.path.basename(podcast.path_encoded))

    for filepath in glob.glob(os.path.join(options.directory, "*")):
        if filepath.endswith(".xml"):
            continue  # keep the RSS feed
        if os.path.isdir(filepath) and not os.path.islink(filepath):
            continue  # os.remove() can't delete directories
        filename = os.path.basename(filepath)
        if filename not in feed_podcasts:
            print("Removing old file %s" % filename)
            os.remove(filepath)
| 390 |
|
| 391 |
|
| 392 |
def handle_item(item):
    """Download and convert the enclosure of a single feed item.

    Items without an enclosure and items already converted are left alone.
    When subtitles are requested but unavailable, the item is removed from
    the feed.  Download and transcoding errors are printed, not propagated.
    """
    try:
        podcast = Podcast(item)
    except NotAPodcastError:
        return

    if podcast.is_already_transcoded():
        print("Already converted: %s" % podcast.url)
        return

    subtitles_missing = options.subtitles and not podcast.subs
    if subtitles_missing:
        print("No subtitles for %s, skipping." % item.findtext("guid"))
        flux_xml.find("channel").remove(item)
        return

    try:
        podcast.process()
    except (DownloadingError, TranscodingError) as error:
        print(error)
| 414 |
|
| 415 |
|
| 416 |
def to_skip(item):
    """Tell whether one of the item's categories is in the excluded tags."""
    return any(category.text in options.exclude_tags
               for category in item.findall("category"))
| 422 |
|
| 423 |
def main():
    """Convert the enclosures of the input feed and write the new feed.

    Items carrying an excluded tag are dropped, the first ``options.max``
    remaining items are processed and the following ones are dropped.  The
    resulting feed is written to ``options.output``, then the files which
    are no longer referenced are cleaned up.
    """
    global options, flux_xml
    options, _ = get_options()
    if options.input == "-":
        options.input = sys.stdin
    flux_xml = ET.parse(options.input)
    channel = flux_xml.find("channel")

    # tag skipping
    items = []
    for item in flux_xml.findall("channel/item"):
        if to_skip(item):
            channel.remove(item)
        else:
            items.append(item)

    for position, item in enumerate(list(items)):
        if position >= options.max:
            channel.remove(item)
            continue
        handle_item(item)

    #flux_xml.write(options.output, "utf-8")
    flux_xml.write(options.output)
    cleanup(items[:options.max])


if __name__ == "__main__":
    main()