#!/usr/bin/php
<?php
# vim: set expandtab tabstop=4 shiftwidth=4:
/**
 * xhtml2odt - XHTML to ODT XML transformation
 *
 * This script can convert a wiki page to the OpenDocument Text (ODT) format,
 * standardized as ISO/IEC 26300:2006, and the native format of office suites
 * such as OpenOffice.org, KOffice, and others.
 *
 * It uses a template ODT file which will be filled with the converted
 * content of the exported Wiki page.
 *
 * Inspired by the work on {@link http://open.comsultia.com/docbook2odf/
 * docbook2odf}, by Roman Fordinal
 *
 * @link http://xhtml2odt.org xhtml2odt project
 * @author Aurélien Bompard <aurelien@bompard.org>
 * @copyright Aurélien Bompard <aurelien@bompard.org> 2009-2010
 * @license http://www.gnu.org/licenses/lgpl-2.1.html LGPLv2+
 * @package xhtml2odt
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 */


/**
 * Conversion failure
 * @package xhtml2odt
 */
class ODTException extends Exception
{
    // Deliberately empty: the class only gives conversion failures a
    // distinct type, so that callers can catch them separately from other
    // exceptions.
}


/**
 * Handling of an ODT file based on a template (another ODT file)
 *
 * The template ODT file is given to the constructor. Then, you must:
 * - set the XSLT parameters ({@link $xslparams}),
 * - call the {@link compile} method,
 * - use either the {@link saveToFile} method or the {@link
 *   exportAsAttachedFile} method, depending on whether you want to save the
 *   file on disk or to push the result to the browser.
 *
 * This implementation is tied to the command-line script: the XHTML to
 * convert and the command-line options are read from the global variables
 * <samp>$html</samp> and <samp>$options</samp>.
 * @package xhtml2odt
 */
class ODTFile {
    /** @var ZipArchive archive handle, used to read the template and to write the result */
    protected $odtfile;
    /** @var string path to the temporary working copy of the template */
    protected $odtfilepath;
    /** @var array paths of the temporary files holding downloaded images */
    protected $tmpfiles = array();
    /** @var string contents of content.xml */
    protected $contentXml;
    /** @var string contents of styles.xml */
    protected $stylesXml;
    /** @var array XML snippets to inject into office:automatic-styles (content.xml) */
    protected $autostyles = array();
    /** @var array XML snippets to inject into office:styles (styles.xml) */
    protected $styles = array();
    /** @var array XML snippets to inject into office:font-face-decls (content.xml) */
    protected $fonts = array();
    /** @var array images to embed: path on disk => file name inside Pictures/ */
    protected $images = array();
    /** @var string path to the template ODT file */
    public $template;
    /** @var array parameters handed to the XSLT stylesheets (name => value) */
    public $xslparams = array();
    /** @var bool whether images referenced by http(s) URLs are downloaded and embedded */
    public $get_remote_images = true;
    /** Pixels to centimeters: 2.54 cm per inch at 96 pixels per inch */
    const PIXEL_TO_CM = 0.026458333;

    /**
     * Constructor
     *
     * Reads content.xml and styles.xml from the template and creates a
     * temporary working copy of the template, which is removed when the
     * object is destroyed.
     *
     * @param string $template the path to the template ODT file
     * @throws ODTException if a required extension is missing, or if the
     *                      template can't be read or copied
     */
    public function __construct($template) {
        $this->template = $template;
        if (! class_exists('ZipArchive')) {
            throw new ODTException('Zip extension not loaded - check your php '
                .'settings, PHP 5.2 minimum with zip and XSL extensions is '
                .'required.');
        }
        if (! class_exists('XSLTProcessor')) {
            throw new ODTException('XSL extension not loaded - check your php '
                .'settings, PHP 5.2 minimum with zip and XSL extensions is '
                .'required.');
        }
        // Loading content.xml and styles.xml from the template
        $this->odtfile = new ZipArchive();
        if ($this->odtfile->open($template) !== true) {
            throw new ODTException("Error while Opening the file '$template' - "
                ."Check your odt file");
        }
        $this->contentXml = $this->odtfile->getFromName('content.xml');
        $this->stylesXml = $this->odtfile->getFromName('styles.xml');
        // Close before checking, so that the archive is released on failure
        $this->odtfile->close();
        if ($this->contentXml === false) {
            throw new ODTException("Nothing to parse - check that the "
                ."content.xml file is correctly formed");
        }
        if ($this->stylesXml === false) {
            throw new ODTException("Nothing to parse - check that the "
                ."styles.xml file is correctly formed");
        }
        // Use your app's cache directory here instead of the system one:
        $tmp = tempnam(sys_get_temp_dir(), "xhtml2odt-");
        if ($tmp === false) {
            throw new ODTException("Could not create a temporary file");
        }
        if (! copy($template, $tmp)) {
            if (is_file($tmp)) {
                unlink($tmp);
            }
            throw new ODTException("Could not copy the template '$template'");
        }
        $this->odtfilepath = $tmp;
    }

    /**
     * Removes the working copy of the template and the downloaded images
     */
    public function __destruct() {
        $paths = $this->tmpfiles;
        if ($this->odtfilepath) {
            // not set when the constructor did not complete
            $paths[] = $this->odtfilepath;
        }
        foreach ($paths as $path) {
            if (is_file($path)) {
                unlink($path);
            }
        }
    }

    /**
     * @return string the current contents of content.xml
     */
    public function __toString() {
        return (string) $this->contentXml;
    }

    /**
     * Main function which runs the other
     *
     * Converts the global <samp>$html</samp> to ODT XML, inserts it in the
     * template (see {@link insertContent}) and adds the missing styles.
     *
     * If your app has a templating engine, you may want to use the template
     * ODT file as one of you app's templates. You would then do the following
     * steps:
     * - run it here through your template engine, which would produce a mix
     *   of ODT XML and XHTML.
     * - pass the result to the {@link xhtml2odt} method, which would only
     *   convert the XHTML to ODT, and leave the ODT untouched
     * - the rest of the function is identical
     *
     * @throws ODTException
     */
    public function compile() {
        //$html = YourAppsTemplatingEngine($this->template);
        // here we'll just use the global $html variable.
        global $html, $options;
        $odt = $this->xhtml2odt($html);
        // Drop the XML declaration, whatever encoding name it carries: the
        // result is inserted in the middle of content.xml.
        $odt = preg_replace('/^\s*<\?xml\b[^>]*\?>\s*/i', '', $odt);
        // If you're using the ODT file as a template in a templating engine,
        // you can just set $this->contentXml to the output of xhtml2odt()
        $keyword = isset($options["r"]) ? (string) $options["r"] : "";
        $this->contentXml = $this->insertContent($odt, $keyword);
        // Add the missing styles (used in content.xml but not defined in
        // styles.xml or automatic styles)
        $this->addStyles();
    }

    /**
     * Inserts the converted ODT XML in the template's content.xml
     *
     * Every paragraph which only contains the keyword is replaced with the
     * converted content. Without a keyword, or when no such paragraph is
     * found, the content is appended at the end of the document text.
     *
     * @param string $odt ODT XML to insert
     * @param string $keyword keyword to replace, or an empty string
     * @return string the new contents of content.xml
     */
    protected function insertContent($odt, $keyword) {
        if ($keyword !== "") {
            $pattern = "/<text:p[^>]*>".preg_quote($keyword, "/")."<\/text:p>/";
            // Split and join rather than preg_replace(): the converted text
            // may contain "$1" or "\1", which preg_replace() would interpret
            // as backreferences.
            $parts = preg_split($pattern, $this->contentXml);
            if (is_array($parts) and count($parts) > 1) {
                return implode($odt, $parts);
            }
        }
        return str_replace("</office:text>", "$odt</office:text>",
                           $this->contentXml);
    }

    /**
     * Clean up the HTML we get in input
     *
     * Because the stylesheets will only accept well-formed (and if possible
     * valid) XHTML. The input is run through "tidy" when the extension is
     * loaded, and the &amp;nbsp; entities are replaced with their numerical
     * equivalent.
     *
     * If you have XHTML *and* ODT mixed up in input, because you used
     * the ODT file as a template in your templating engine, then you
     * *can't* run it through "tidy". Or else you'd have to use the
     * input-xml option, and it does strange things like removing the
     * white space after links. I didn't find a way around this.
     *
     * @param string $xhtml XHTML to clean up
     * @return string cleaned up XHTML
     */
    public function cleanupInput($xhtml) {
        // add namespace if you used the ODT file as a template
        //$xhtml = str_replace("<office:document-content", '<office:document-content xmlns="http://www.w3.org/1999/xhtml"', $xhtml);

        /* Won't work if you have ODT XML *and* XHTML as input */
        if (extension_loaded('tidy')) {
            $tidy_config = array(
                    'output-xhtml' => true,
                    'add-xml-decl' => false,
                    'indent' => false,
                    'tidy-mark' => false,
                    //'input-encoding' => "latin1",
                    'output-encoding' => "utf8",
                    'doctype' => "auto",
                    'wrap' => 0,
                    'char-encoding' => "utf8",
                );
            $tidy = new tidy;
            $tidy->parseString($xhtml, $tidy_config, 'utf8');
            $tidy->cleanRepair();
            $xhtml = "$tidy";
        }

        // replace html codes with unicode
        // http://www.mail-archive.com/analog-help@lists.meer.net/msg03670.html
        $xhtml = str_replace("&nbsp;", "&#160;", $xhtml);
        //$xhtml = html_entity_decode($xhtml, ENT_COMPAT, "UTF-8");

        return $xhtml;
    }

    /**
     * Convert from XHTML to ODT using the stylesheets
     *
     * @param string $xhtml XHTML to convert
     * @return string resulting ODT XML
     * @throws ODTException if the input is not well-formed, or if the
     *                      stylesheet can't be loaded or applied
     */
    public function xhtml2odt($xhtml) {
        $xhtml = $this->cleanupInput($xhtml);
        $xhtml = $this->handleImages($xhtml);
        // run the stylesheets
        $xmldoc = $this->parseXml($xhtml, "the input XHTML");
        $proc = $this->loadStylesheet(dirname(__FILE__)."/xsl/xhtml2odt.xsl");
        foreach ($this->xslparams as $pkey => $pval) {
            $proc->setParameter("", $pkey, $pval);
        }
        $output = $proc->transformToXML($xmldoc);
        if ($output === false) {
            throw new ODTException('XSLT transformation failed');
        }
        return $output;
    }

    /**
     * Parses a string as an XML document
     *
     * @param string $xml the XML source
     * @param string $what description of the source, for the error message
     * @return DOMDocument
     * @throws ODTException if the source is empty or not well-formed
     */
    protected function parseXml($xml, $what) {
        $doc = new DOMDocument();
        // The emptiness test comes first: loadXML() refuses an empty string
        if (!is_string($xml) or trim($xml) === "" or !$doc->loadXML($xml)) {
            throw new ODTException("Could not parse $what as XML");
        }
        return $doc;
    }

    /**
     * Creates an XSLT processor for the given stylesheet
     *
     * @param string $path path to the XSL file
     * @return XSLTProcessor
     * @throws ODTException if the stylesheet can't be loaded
     */
    protected function loadStylesheet($path) {
        $xsldoc = new DOMDocument();
        if (!$xsldoc->load($path)) {
            throw new ODTException("Could not load the stylesheet '$path'");
        }
        $proc = new XSLTProcessor();
        if ($proc->importStylesheet($xsldoc) === false) {
            throw new ODTException("Could not import the stylesheet '$path'");
        }
        return $proc;
    }

    /**
     * Handle images.
     *
     * Download and include them when possible. Local and remote images are
     * handled differently.
     *
     * @param string $xhtml XHTML to look for images in
     * @return string XHTML with normalized img tags
     */
    protected function handleImages($xhtml) {
        global $options;
        // Turn false absolute URLs into relative ones. Useful for a webapp.
        // The base URL may or may not carry a scheme. Without a base URL
        // there is nothing to strip: an empty one would match every image.
        $base = isset($options["u"]) ? (string) $options["u"] : "";
        $host = rtrim(preg_replace('#^[a-z][a-z0-9+.-]*://#i', '', $base), "/");
        if ($host !== "") {
            $xhtml = preg_replace(
                        '#<img ([^>]*)src="https?://'.preg_quote($host, '#').'/#',
                        '<img \1src="', $xhtml);
        }
        /* Since we're a command-line script, there is no notion of a "local
           image". Our handleLocalImg function will just convert the source
           to absolute URLs. See the top of the function for an example of
           what you could do in a webapp (2 lines !)
         */
        $xhtml = preg_replace_callback('#<img [^>]*src="([^"]+)"[^>]*>#',
                                       array($this, "handleLocalImg"), $xhtml);
        if ($this->get_remote_images) {
            $xhtml = preg_replace_callback(
                        '#<img [^>]*src="(https?://[^"]+)"[^>]*#',
                        array($this, "handleRemoteImg"), $xhtml);
        }
        return $xhtml;
    }

    /**
     * Replaces the src attribute of an img tag
     *
     * Only the attribute itself is replaced: the same text appearing in
     * another attribute (alt, title...) is left alone.
     *
     * @param string $tag the img tag
     * @param string $src the current value of the src attribute
     * @param string $attributes text to put in place of the src attribute
     * @return string the modified tag
     */
    protected function replaceSrc($tag, $src, $attributes) {
        $needle = 'src="'.$src.'"';
        // The regexps in handleImages() are greedy and thus capture the last
        // src attribute of the tag, so search from the end too.
        $pos = strrpos($tag, $needle);
        if ($pos === false) {
            return $tag;
        }
        return substr_replace($tag, $attributes, $pos, strlen($needle));
    }

    /**
     * Handling of local images (on this server)
     *
     * Must be called as a regexp callback. Outsources all the hard work to
     * the {@link handleImg} method.
     *
     * This implementation includes the files which exist on the local disk,
     * and turns the other relative links into absolute URLs based on the
     * "-u" option, to be downloaded later. Server-based export plugins can
     * just retrieve the files from the local disk, using either the
     * <samp>DOCUMENT_ROOT</samp> or any appropriate method (depending on the
     * web application you're writing an export plugin for).
     *
     * @param array $matches regexp matches
     * @return string regexp replacement
     * @throws ODTException
     */
    protected function handleLocalImg($matches) {
        global $options;
        $verbose = isset($options["v"]);
        $src = $matches[1];
        /* Example for a webapp:
        $file = $_SERVER["DOCUMENT_ROOT"].$src;
        return $this->handleImg($file, $matches);
        What follows is more complicated because we're a command-line script:
        - if the image is really local, include it
        - else, turn it into an absolute URL which will be downloaded later
        */
        $isFileUrl = (strpos($src, "file://") === 0);
        if (strpos($src, "://") !== false and !$isFileUrl) {
            // This is an absolute link, don't touch it
            if ($verbose) {
                print "Local image: $src is an absolute link\n";
            }
            return $matches[0];
        }
        // strpos() returns false when there is no match, and false == 0:
        // the position must be compared with ===.
        if ($isFileUrl) {
            $file = substr($src, 7);
        } elseif (strpos($src, "/") === 0) {
            $file = $src;
        } else {
            // relative link
            $input = isset($options["i"]) ? (string) $options["i"] : "";
            $file = ($input === "" ? "." : dirname($input))."/".$src;
        }
        $real = realpath($file);
        if ($real !== false and is_file($real)) {
            if ($verbose) {
                print "Local image: $src is actually local !\n";
            }
            return $this->handleImg($real, $matches);
        }
        $base = isset($options["u"]) ? (string) $options["u"] : "";
        if ($base === "" or $isFileUrl) {
            // There's nothing we can do here
            if ($verbose) {
                print "Local image: $src not local, can't download\n";
            }
            return $matches[0];
        }
        if (function_exists("http_build_url")) {
            // NOTE(review): http_build_url() comes from the legacy pecl_http
            // 1.x extension; confirm that it resolves relative paths as
            // intended.
            $newsrc = http_build_url($base, $src);
        } elseif (strpos($src, "/") === 0) {
            // Root-relative link: resolve it against the host, not against
            // the directory of the page.
            $parts = parse_url($base);
            if (is_array($parts) and isset($parts["scheme"], $parts["host"])) {
                $newsrc = $parts["scheme"]."://".$parts["host"]
                         .(isset($parts["port"]) ? ":".$parts["port"] : "")
                         .$src;
            } else {
                $newsrc = rtrim($base, "/").$src;
            }
        } else {
            $newsrc = rtrim($base, "/")."/".$src;
        }
        if ($verbose) {
            print "Local image: $src -> $newsrc\n";
        }
        return $this->replaceSrc($matches[0], $src, 'src="'.$newsrc.'"');
    }

    /**
     * Download remote images with cURL
     *
     * Must be called as a regexp callback. Outsources all the hard work to
     * the {@link handleImg} method. The tag is returned unchanged when the
     * image can't be downloaded.
     *
     * @param array $matches regexp matches
     * @return string regexp replacement
     * @throws ODTException
     */
    protected function handleRemoteImg($matches) {
        global $options;
        if (!function_exists("curl_init")) {
            return $matches[0]; // abort
        }
        // Attribute values are XML-escaped ("&amp;"), URLs must not be
        $url = html_entity_decode($matches[1], ENT_QUOTES, "UTF-8");
        if (isset($options["v"])) {
            print "Downloading image from: $url\n";
        }
        // Use your app's cache directory here instead of the system one:
        $tempfilename = tempnam(sys_get_temp_dir(), "xhtml2odt-");
        if ($tempfilename === false) {
            return $matches[0];
        }
        $this->tmpfiles []= $tempfilename;
        $tempfile = fopen($tempfilename, "w");
        if ($tempfile === false) {
            return $matches[0];
        }
        $ch = curl_init();
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_FILE, $tempfile);
        // Don't store the error page of a 404 as if it were the image
        curl_setopt($ch, CURLOPT_FAILONERROR, true);
        $result = curl_exec($ch);
        // Release the handle and the file whatever the outcome
        if (PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }
        fclose($tempfile);
        if ($result === false) {
            return $matches[0];
        }
        return $this->handleImg($tempfilename, $matches);
    }

    /**
     * Insertion of the image in the ODT file and the content.xml file
     *
     * The image is registered to be added in the Pictures/ folder of the
     * document, and the tag is rewritten to point to it, with a size in
     * centimeters. The size is, in this order: the size in pixels given in
     * the tag, the real size of the image, the default size from
     * {@link $xslparams}.
     *
     * @param string $file the path to the image
     * @param array $matches regexp matches
     * @return string regexp replacement
     * @throws ODTException if the image is not readable
     */
    protected function handleImg($file, $matches) {
        if (!is_readable($file)) {
            throw new ODTException("Image $file is not readable or does "
                                  ."not exist");
        }
        $tag = $matches[0];
        $width = 0;
        $height = 0;
        // Size specified in the HTML: keep it, but only if both dimensions
        // are in pixels ("50%" can't be converted).
        if (preg_match('/\swidth="(\d+)(?:px)?"/', $tag, $wmatch)
                and preg_match('/\sheight="(\d+)(?:px)?"/', $tag, $hmatch)) {
            $width = (int) $wmatch[1];
            $height = (int) $hmatch[1];
        }
        // Remove any previous size specification
        $tag = preg_replace('/\s+width="[^"]*"/', '', $tag);
        $tag = preg_replace('/\s+height="[^"]*"/', '', $tag);
        if ($width <= 0 or $height <= 0) {
            // Could not find or extract the wanted size, use the real size.
            // Warnings are silenced: the file may not be an image at all.
            $size = @getimagesize($file);
            if ($size !== false and $size[0] > 0 and $size[1] > 0) {
                $width = $size[0];
                $height = $size[1];
            }
        }
        if ($width > 0 and $height > 0) {
            $odtwidth = $this->pixelsToCm($width);
            $odtheight = $this->pixelsToCm($height);
        } else {
            $odtwidth = $this->defaultDimension("img_default_width", "8cm");
            $odtheight = $this->defaultDimension("img_default_height", "6cm");
        }
        if (isset($this->images[$file])) {
            $name = $this->images[$file];
        } else {
            $name = basename($file);
            if (in_array($name, $this->images, true)) {
                // Another image with the same file name is already there
                $name = substr(md5($file), 0, 8)."-".$name;
            }
            $this->images[$file] = $name;
        }
        return $this->replaceSrc($tag, $matches[1],
                    'src="Pictures/'.$name.'" width="'.$odtwidth
                    .'" height="'.$odtheight.'"');
    }

    /**
     * Converts a length in pixels to a length in centimeters
     *
     * @param int|float|string $pixels length in pixels
     * @return string length with its unit, like "2.6458cm"
     */
    protected function pixelsToCm($pixels) {
        // %F is not locale-aware: the decimal separator is always a dot
        return sprintf("%.4Fcm", $pixels * self::PIXEL_TO_CM);
    }

    /**
     * Returns a default image dimension from the XSLT parameters
     *
     * @param string $param name of the XSLT parameter
     * @param string $fallback length to use if the parameter is not set
     * @return string length with its unit
     */
    protected function defaultDimension($param, $fallback) {
        $value = isset($this->xslparams[$param])
               ? trim((string) $this->xslparams[$param]) : "";
        if ($value === "") {
            return $fallback;
        }
        if (is_numeric($value)) {
            // no unit: these are pixels
            return $this->pixelsToCm($value);
        }
        // already has a unit ("8cm"), must not be converted
        return $value;
    }

    /**
     * Inserts the generated ODT XML code into the content.xml and styles.xml
     * files
     *
     * The pending snippets are discarded once inserted, so that saving
     * twice does not insert them twice.
     */
    protected function _parse() {
        // automatic styles
        if ($this->autostyles) {
            $autostyles = implode("\n", $this->autostyles);
            if (strpos($this->contentXml, '<office:automatic-styles/>') !== false) {
                $this->contentXml = str_replace('<office:automatic-styles/>',
                        '<office:automatic-styles>'.$autostyles.'</office:automatic-styles>',
                        $this->contentXml);
            } else {
                $this->contentXml = str_replace('</office:automatic-styles>',
                        $autostyles.'</office:automatic-styles>',
                        $this->contentXml);
            }
            $this->autostyles = array();
        }
        // regular styles
        if ($this->styles) {
            $styles = implode("\n", $this->styles);
            $this->stylesXml = str_replace('</office:styles>',
                    $styles.'</office:styles>', $this->stylesXml);
            $this->styles = array();
        }
        // fonts
        if ($this->fonts) {
            $fonts = implode("\n", $this->fonts);
            $this->contentXml = str_replace('</office:font-face-decls>',
                    $fonts.'</office:font-face-decls>', $this->contentXml);
            $this->fonts = array();
        }
    }

    /**
     * Internal save
     *
     * Writes content.xml, styles.xml and the images in the working copy of
     * the template.
     *
     * @throws ODTException
     */
    protected function _save() {
        if ($this->odtfile->open($this->odtfilepath, ZipArchive::CREATE) !== true) {
            throw new ODTException('Error during file export');
        }
        $this->_parse();
        if (! $this->odtfile->addFromString('content.xml', $this->contentXml)) {
            throw new ODTException('Error during file export');
        }
        if (! $this->odtfile->addFromString('styles.xml', $this->stylesXml)) {
            throw new ODTException('Error during file export');
        }
        foreach ($this->images as $imageKey => $imageValue) {
            if (! $this->odtfile->addFile($imageKey, 'Pictures/' . $imageValue)) {
                throw new ODTException("Error during file export (image $imageKey)");
            }
        }
        // The archive is actually written on close()
        if (! $this->odtfile->close()) {
            throw new ODTException('Error during file export');
        }
    }

    /**
     * Exports the file as an HTTP attachment.
     *
     * If you're a web app, you'll probably want this.
     *
     * @param string $name name of the file to download (optional)
     * @throws ODTException
     */
    public function exportAsAttachedFile($name="") {
        $this->_save();
        if (headers_sent($filename, $linenum)) {
            throw new ODTException("headers already sent ($filename at $linenum)");
        }
        // Quotes and line breaks would corrupt the header
        $name = str_replace(array('"', "\r", "\n"), "", $name);
        if( $name == "" ) {
            $name = md5(uniqid()) . ".odt";
        }
        header('Content-type: application/vnd.oasis.opendocument.text');
        header('Content-Disposition: attachment; filename="'.$name.'"');
        readfile($this->odtfilepath);
    }

    /**
     * Saves the file to the disk
     *
     * Mainly useful for the command-line app, see {@link
     * exportAsAttachedFile} to have the browser download the file.
     *
     * @param string $name path to the file on the disk
     * @throws ODTException
     */
    public function saveToFile($name="") {
        $this->_save();
        if( $name == "" ) {
            $name = md5(uniqid()) . ".odt";
        }
        if (! copy($this->odtfilepath, $name)) {
            throw new ODTException("Could not write the document to '$name'");
        }
    }

    /**
     * Adds all missing styles and fonts in the document
     *
     * Runs both content.xml and styles.xml through the styles.xsl stylesheet.
     *
     * @throws ODTException
     */
    protected function addStyles() {
        $proc = $this->loadStylesheet(dirname(__FILE__)."/xsl/styles.xsl");
        $contentxml = $this->parseXml($this->contentXml, "content.xml");
        $stylesxml = $this->parseXml($this->stylesXml, "styles.xml");
        $newcontent = $proc->transformToXML($contentxml);
        $newstyles = $proc->transformToXML($stylesxml);
        if ($newcontent === false or $newstyles === false) {
            throw new ODTException('Adding of styles failed');
        }
        // Only assigned on success, so that a failure leaves the document
        // untouched
        $this->contentXml = $newcontent;
        $this->stylesXml = $newstyles;
    }

}


/**
 * Print a usage message and exit
 *
 * The message lists every option accepted by {@link parseOpts}. The script
 * is terminated by this function, which thus never returns.
 */
function usage() {
    // argv is not available in every context
    $script = isset($GLOBALS["argv"][0]) ? $GLOBALS["argv"][0] : "xhtml2odt.php";
    $message = sprintf("Usage: %s [options] -i input.html -o output.odt -t template.odt\n", $script);
    $message .= "Options:
    -u <URL> : the remote URL you downloaded the page from. This is required to include remote images.
    -r <KEYWORD> : a keyword in the template document to replace with the converted text.
    -v : verbose mode, print how each image is handled.
    -h, --help : print this message and exit.
    --top-header-level <LEVEL> : the maximum header level used in your HTML page (1 for <h1>, 2 for <h2> etc.).
    --img-default-width <SIZE> : the default width for images.
    --img-default-height <SIZE> : the default height for images.
";
    die($message);
}


/**
 * Parse the command line options
 *
 * Prints the usage message and exits if the help is requested or if one of
 * the required options ("-i", "-o", "-t") is missing.
 *
 * In the returned array, the following keys are always set:
 * - "u": the base URL for the images (scheme, host, port, and directory of
 *   the page), or an empty string. It is computed from the "-u" option, or
 *   from the "-i" option when the input itself is a URL.
 * - "r": the keyword to replace, or an empty string
 * - "top-header-level": an integer, 1 by default
 * - "img-default-width" and "img-default-height": "8cm" and "6cm" by default
 *
 * @return array the options, indexed by name
 */
function parseOpts() {
    $shortopts = "i:o:t:u:r:vh";
    $longopts = array(
        "help",
        "top-header-level:",
        "img-default-width:",
        "img-default-height:",
    );
    $options = getopt($shortopts, $longopts);
    if (!is_array($options)) {
        // getopt() could not parse the command line
        $options = array();
    }
    if (array_key_exists("h", $options) or
        array_key_exists("help", $options)) {
        usage();
    }
    foreach (array("i", "o", "t") as $reqopt) {
        if (!array_key_exists($reqopt, $options)) {
            print "Missing '-$reqopt' option.\n";
            usage();
        }
    }
    // The input is a URL: use it as the base URL. The host is required too,
    // or else a Windows path like "C:\page.html" would look like a URL.
    $input_url = parse_url($options["i"]);
    if (is_array($input_url)
            and isset($input_url["scheme"], $input_url["host"])) {
        $options["u"] = $options["i"];
    }
    if (isset($options["u"])) {
        $base_url = "";
        $input_url = parse_url($options["u"]);
        if (is_array($input_url)
                and isset($input_url["scheme"], $input_url["host"])) {
            $directory = "";
            if (isset($input_url["path"])) {
                $path = $input_url["path"];
                if (strlen($path) > 1 and substr($path, -1) == "/") {
                    // The URL is a directory already: dirname() would
                    // remove its last component.
                    $directory = rtrim($path, "/");
                } else {
                    $directory = dirname($path);
                }
            }
            $base_url = sprintf("%s://%s%s%s",
                            $input_url["scheme"], $input_url["host"],
                            isset($input_url["port"]) ?
                                ":".$input_url["port"] : "",
                            $directory);
        } else {
            print "Warning: the '-u' option is not a valid URL, "
                 ."the images in the page will not be included.\n";
        }
        $options["u"] = $base_url;
    } else {
        print "Warning: you did not supply the '-u' option, "
             ."the images in the page will not be included.\n";
        $options["u"] = "";
    }
    if (!extension_loaded('curl')) {
        print "Warning: you did not install the 'curl' PHP extension, "
             ."the images in the page will not be included.\n";
    }
    if (!extension_loaded('tidy')) {
        print "Warning: you should install the 'tidy' PHP extension to ensure "
             ."a good conversion (or else your HTML must be valid already !)\n";
    }
    if (isset($options["top-header-level"])) {
        $options["top-header-level"] = (int) $options["top-header-level"];
    } else {
        $options["top-header-level"] = 1;
    }
    if (!isset($options["img-default-width"])) {
        $options["img-default-width"] = "8cm";
    }
    if (!isset($options["img-default-height"])) {
        $options["img-default-height"] = "6cm";
    }
    if (!isset($options["r"])) {
        $options["r"] = "";
    }
    return $options;
}


/**
 * This function runs the whole conversion process:
 * - read command line options
 * - read the input file
 * - create the {@link ODTFile} instance and set the stylesheet parameters
 * - run the {@link ODTFile::compile()} method
 * - save the resulting file with the {@link ODTFile::saveToFile()} method
 *
 * The options and the input are stored in the global variables
 * <samp>$options</samp> and <samp>$html</samp>, where {@link ODTFile} reads
 * them. If the input can't be read or if the conversion fails, the error is
 * printed on the standard error output and the script exits with status 1.
 */
function main() {
    global $html, $options;

    $options = parseOpts();

    $html = file_get_contents($options["i"]);
    if ($html === false) {
        fwrite(STDERR, "Error: could not read '".$options["i"]."'\n");
        exit(1);
    }

    try {
        $odf = new ODTFile($options["t"]);

        $odf->xslparams["url"] = $options["u"]; // this would be your app's URL
        // the following setting depends on how <h> tags are used in you app
        $odf->xslparams["heading_minus_level"] = $options["top-header-level"];
        // set the following values from your config
        $odf->get_remote_images = ($options["u"] != "");
        $odf->xslparams["img_default_width"] = $options["img-default-width"];
        $odf->xslparams["img_default_height"] = $options["img-default-height"];

        $odf->compile();

        $odf->saveToFile($options["o"]);
    } catch (ODTException $e) {
        // A conversion failure is an expected error: no stack trace
        fwrite(STDERR, "Error: ".$e->getMessage()."\n");
        exit(1);
    }
    print "Wrote document to: ".$options["o"]."\n";
}

// Entry point: run the conversion. The closing PHP tag is left out on
// purpose, so that nothing after it can end up in the output.
main();