1
/*
2
 * HTMLtree.c : implementation of access function for an HTML tree.
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
10
#define IN_LIBXML
11
#include "libxml.h"
12
#ifdef LIBXML_HTML_ENABLED
13
14
#include <string.h> /* for memset() only ! */
15
16
#ifdef HAVE_CTYPE_H
17
#include <ctype.h>
18
#endif
19
#ifdef HAVE_STDLIB_H
20
#include <stdlib.h>
21
#endif
22
23
#include <libxml/xmlmemory.h>
24
#include <libxml/HTMLparser.h>
25
#include <libxml/HTMLtree.h>
26
#include <libxml/entities.h>
27
#include <libxml/valid.h>
28
#include <libxml/xmlerror.h>
29
#include <libxml/parserInternals.h>
30
#include <libxml/globals.h>
31
#include <libxml/uri.h>
32
33
/************************************************************************
34
 *									*
35
 *   		Getting/Setting encoding meta tags			*
36
 *									*
37
 ************************************************************************/
38
39
/**
40
 * htmlGetMetaEncoding:
41
 * @doc:  the document
42
 * 
43
 * Encoding definition lookup in the Meta tags
44
 *
45
 * Returns the current encoding as flagged in the HTML source
46
 */
47
const xmlChar *
48
htmlGetMetaEncoding(htmlDocPtr doc) {
49
    htmlNodePtr cur;
50
    const xmlChar *content;
51
    const xmlChar *encoding;
52
53
    if (doc == NULL)
54
	return(NULL);
55
    cur = doc->children;
56
57
    /*
58
     * Search the html
59
     */
60
    while (cur != NULL) {
61
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
62
	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
63
		break;
64
	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
65
		goto found_head;
66
	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
67
		goto found_meta;
68
	}
69
	cur = cur->next;
70
    }
71
    if (cur == NULL)
72
	return(NULL);
73
    cur = cur->children;
74
75
    /*
76
     * Search the head
77
     */
78
    while (cur != NULL) {
79
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
80
	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
81
		break;
82
	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
83
		goto found_meta;
84
	}
85
	cur = cur->next;
86
    }
87
    if (cur == NULL)
88
	return(NULL);
89
found_head:
90
    cur = cur->children;
91
92
    /*
93
     * Search the meta elements
94
     */
95
found_meta:
96
    while (cur != NULL) {
97
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
98
	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
99
		xmlAttrPtr attr = cur->properties;
100
		int http;
101
		const xmlChar *value;
102
103
		content = NULL;
104
		http = 0;
105
		while (attr != NULL) {
106
		    if ((attr->children != NULL) &&
107
		        (attr->children->type == XML_TEXT_NODE) &&
108
		        (attr->children->next == NULL)) {
109
			value = attr->children->content;
110
			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
111
			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
112
			    http = 1;
113
			else if ((value != NULL)
114
			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
115
			    content = value;
116
			if ((http != 0) && (content != NULL))
117
			    goto found_content;
118
		    }
119
		    attr = attr->next;
120
		}
121
	    }
122
	}
123
	cur = cur->next;
124
    }
125
    return(NULL);
126
127
found_content:
128
    encoding = xmlStrstr(content, BAD_CAST"charset=");
129
    if (encoding == NULL) 
130
	encoding = xmlStrstr(content, BAD_CAST"Charset=");
131
    if (encoding == NULL) 
132
	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
133
    if (encoding != NULL) {
134
	encoding += 8;
135
    } else {
136
	encoding = xmlStrstr(content, BAD_CAST"charset =");
137
	if (encoding == NULL) 
138
	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
139
	if (encoding == NULL) 
140
	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
141
	if (encoding != NULL)
142
	    encoding += 9;
143
    }
144
    if (encoding != NULL) {
145
	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
146
    }
147
    return(encoding);
148
}
149
150
/**
151
 * htmlSetMetaEncoding:
152
 * @doc:  the document
153
 * @encoding:  the encoding string
154
 * 
155
 * Sets the current encoding in the Meta tags
156
 * NOTE: this will not change the document content encoding, just
157
 * the META flag associated.
158
 *
159
 * Returns 0 in case of success and -1 in case of error
160
 */
161
int
162
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
163
    htmlNodePtr cur, meta;
164
    const xmlChar *content;
165
    char newcontent[100];
166
167
168
    if (doc == NULL)
169
	return(-1);
170
171
    if (encoding != NULL) {
172
	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
173
                (char *)encoding);
174
	newcontent[sizeof(newcontent) - 1] = 0;
175
    }
176
177
    cur = doc->children;
178
179
    /*
180
     * Search the html
181
     */
182
    while (cur != NULL) {
183
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
184
	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
185
		break;
186
	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
187
		goto found_head;
188
	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
189
		goto found_meta;
190
	}
191
	cur = cur->next;
192
    }
193
    if (cur == NULL)
194
	return(-1);
195
    cur = cur->children;
196
197
    /*
198
     * Search the head
199
     */
200
    while (cur != NULL) {
201
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
202
	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
203
		break;
204
	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
205
		goto found_meta;
206
	}
207
	cur = cur->next;
208
    }
209
    if (cur == NULL)
210
	return(-1);
211
found_head:
212
    if (cur->children == NULL) {
213
	if (encoding == NULL)
214
	    return(0);
215
	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
216
	xmlAddChild(cur, meta);
217
	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
218
	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
219
	return(0);
220
    }
221
    cur = cur->children;
222
223
found_meta:
224
    if (encoding != NULL) {
225
	/*
226
	 * Create a new Meta element with the right attributes
227
	 */
228
229
	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
230
	xmlAddPrevSibling(cur, meta);
231
	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
232
	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
233
    }
234
235
    /*
236
     * Search and destroy all the remaining the meta elements carrying
237
     * encoding informations
238
     */
239
    while (cur != NULL) {
240
	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
241
	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
242
		xmlAttrPtr attr = cur->properties;
243
		int http;
244
		const xmlChar *value;
245
246
		content = NULL;
247
		http = 0;
248
		while (attr != NULL) {
249
		    if ((attr->children != NULL) &&
250
		        (attr->children->type == XML_TEXT_NODE) &&
251
		        (attr->children->next == NULL)) {
252
			value = attr->children->content;
253
			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
254
			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
255
			    http = 1;
256
			else 
257
                        {
258
                           if ((value != NULL) && 
259
				(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
260
			      content = value;
261
                        }
262
		        if ((http != 0) && (content != NULL))
263
			    break;
264
		    }
265
		    attr = attr->next;
266
		}
267
		if ((http != 0) && (content != NULL)) {
268
		    meta = cur;
269
		    cur = cur->next;
270
		    xmlUnlinkNode(meta);
271
                    xmlFreeNode(meta);
272
		    continue;
273
		}
274
275
	    }
276
	}
277
	cur = cur->next;
278
    }
279
    return(0);
280
}
281
282
/**
 * htmlBooleanAttrs:
 *
 * NULL terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
295
296
297
/**
298
 * htmlIsBooleanAttr:
299
 * @name:  the name of the attribute to check
300
 *
301
 * Determine if a given attribute is a boolean attribute.
302
 * 
303
 * returns: false if the attribute is not boolean, true otherwise.
304
 */
305
int
306
htmlIsBooleanAttr(const xmlChar *name)
307
{
308
    int i = 0;
309
310
    while (htmlBooleanAttrs[i] != NULL) {
311
        if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
312
            return 1;
313
        i++;
314
    }
315
    return 0;
316
}
317
318
#ifdef LIBXML_OUTPUT_ENABLED
319
/*
320
 * private routine exported from xmlIO.c
321
 */
322
xmlOutputBufferPtr
323
xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
324
/************************************************************************
325
 *									*
326
 * 			Output error handlers				*
327
 *									*
328
 ************************************************************************/
329
/**
330
 * htmlSaveErrMemory:
331
 * @extra:  extra informations
332
 *
333
 * Handle an out of memory condition
334
 */
335
static void
336
htmlSaveErrMemory(const char *extra)
337
{
338
    __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
339
}
340
341
/**
342
 * htmlSaveErr:
343
 * @code:  the error number
344
 * @node:  the location of the error.
345
 * @extra:  extra informations
346
 *
347
 * Handle an out of memory condition
348
 */
349
static void
350
htmlSaveErr(int code, xmlNodePtr node, const char *extra)
351
{
352
    const char *msg = NULL;
353
354
    switch(code) {
355
        case XML_SAVE_NOT_UTF8:
356
	    msg = "string is not in UTF-8\n";
357
	    break;
358
	case XML_SAVE_CHAR_INVALID:
359
	    msg = "invalid character value\n";
360
	    break;
361
	case XML_SAVE_UNKNOWN_ENCODING:
362
	    msg = "unknown encoding %s\n";
363
	    break;
364
	case XML_SAVE_NO_DOCTYPE:
365
	    msg = "HTML has no DOCTYPE\n";
366
	    break;
367
	default:
368
	    msg = "unexpected error number\n";
369
    }
370
    __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
371
}
372
373
/************************************************************************
374
 *									*
375
 *   		Dumping HTML tree content to a simple buffer		*
376
 *									*
377
 ************************************************************************/
378
379
static int
380
htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
381
	           int format);
382
383
/**
384
 * htmlNodeDumpFormat:
385
 * @buf:  the HTML buffer output
386
 * @doc:  the document
387
 * @cur:  the current node
388
 * @format:  should formatting spaces been added
389
 *
390
 * Dump an HTML node, recursive behaviour,children are printed too.
391
 *
392
 * Returns the number of byte written or -1 in case of error
393
 */
394
static int
395
htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
396
	           int format) {
397
    unsigned int use;
398
    int ret;
399
    xmlOutputBufferPtr outbuf;
400
401
    if (cur == NULL) {
402
	return (-1);
403
    }
404
    if (buf == NULL) {
405
	return (-1);
406
    }
407
    outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
408
    if (outbuf == NULL) {
409
        htmlSaveErrMemory("allocating HTML output buffer");
410
	return (-1);
411
    }
412
    memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
413
    outbuf->buffer = buf;
414
    outbuf->encoder = NULL;
415
    outbuf->writecallback = NULL;
416
    outbuf->closecallback = NULL;
417
    outbuf->context = NULL;
418
    outbuf->written = 0;
419
420
    use = buf->use;
421
    htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
422
    xmlFree(outbuf);
423
    ret = buf->use - use;
424
    return (ret);
425
}
426
427
/**
428
 * htmlNodeDump:
429
 * @buf:  the HTML buffer output
430
 * @doc:  the document
431
 * @cur:  the current node
432
 *
433
 * Dump an HTML node, recursive behaviour,children are printed too,
434
 * and formatting returns are added.
435
 *
436
 * Returns the number of byte written or -1 in case of error
437
 */
438
int
439
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
440
    xmlInitParser();
441
442
    return(htmlNodeDumpFormat(buf, doc, cur, 1));
443
}
444
445
/**
446
 * htmlNodeDumpFileFormat:
447
 * @out:  the FILE pointer
448
 * @doc:  the document
449
 * @cur:  the current node
450
 * @encoding: the document encoding
451
 * @format:  should formatting spaces been added
452
 *
453
 * Dump an HTML node, recursive behaviour,children are printed too.
454
 *
455
 * TODO: if encoding == NULL try to save in the doc encoding
456
 *
457
 * returns: the number of byte written or -1 in case of failure.
458
 */
459
int
460
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
461
	               xmlNodePtr cur, const char *encoding, int format) {
462
    xmlOutputBufferPtr buf;
463
    xmlCharEncodingHandlerPtr handler = NULL;
464
    int ret;
465
466
    xmlInitParser();
467
468
    if (encoding != NULL) {
469
	xmlCharEncoding enc;
470
471
	enc = xmlParseCharEncoding(encoding);
472
	if (enc != XML_CHAR_ENCODING_UTF8) {
473
	    handler = xmlFindCharEncodingHandler(encoding);
474
	    if (handler == NULL)
475
		return(-1);
476
	}
477
    }
478
479
    /*
480
     * Fallback to HTML or ASCII when the encoding is unspecified
481
     */
482
    if (handler == NULL)
483
	handler = xmlFindCharEncodingHandler("HTML");
484
    if (handler == NULL)
485
	handler = xmlFindCharEncodingHandler("ascii");
486
487
    /* 
488
     * save the content to a temp buffer.
489
     */
490
    buf = xmlOutputBufferCreateFile(out, handler);
491
    if (buf == NULL) return(0);
492
493
    htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
494
495
    ret = xmlOutputBufferClose(buf);
496
    return(ret);
497
}
498
499
/**
500
 * htmlNodeDumpFile:
501
 * @out:  the FILE pointer
502
 * @doc:  the document
503
 * @cur:  the current node
504
 *
505
 * Dump an HTML node, recursive behaviour,children are printed too,
506
 * and formatting returns are added.
507
 */
508
void
509
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
510
    htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
511
}
512
513
/**
514
 * htmlDocDumpMemoryFormat:
515
 * @cur:  the document
516
 * @mem:  OUT: the memory pointer
517
 * @size:  OUT: the memory length
518
 * @format:  should formatting spaces been added
519
 *
520
 * Dump an HTML document in memory and return the xmlChar * and it's size.
521
 * It's up to the caller to free the memory.
522
 */
523
void
524
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
525
    xmlOutputBufferPtr buf;
526
    xmlCharEncodingHandlerPtr handler = NULL;
527
    const char *encoding;
528
529
    xmlInitParser();
530
531
    if ((mem == NULL) || (size == NULL))
532
        return;
533
    if (cur == NULL) {
534
	*mem = NULL;
535
	*size = 0;
536
	return;
537
    }
538
539
    encoding = (const char *) htmlGetMetaEncoding(cur);
540
541
    if (encoding != NULL) {
542
	xmlCharEncoding enc;
543
544
	enc = xmlParseCharEncoding(encoding);
545
	if (enc != cur->charset) {
546
	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
547
		/*
548
		 * Not supported yet
549
		 */
550
		*mem = NULL;
551
		*size = 0;
552
		return;
553
	    }
554
555
	    handler = xmlFindCharEncodingHandler(encoding);
556
	    if (handler == NULL) {
557
		*mem = NULL;
558
		*size = 0;
559
		return;
560
	    }
561
	} else {
562
	    handler = xmlFindCharEncodingHandler(encoding);
563
	}
564
    }
565
566
    /*
567
     * Fallback to HTML or ASCII when the encoding is unspecified
568
     */
569
    if (handler == NULL)
570
	handler = xmlFindCharEncodingHandler("HTML");
571
    if (handler == NULL)
572
	handler = xmlFindCharEncodingHandler("ascii");
573
574
    buf = xmlAllocOutputBufferInternal(handler);
575
    if (buf == NULL) {
576
	*mem = NULL;
577
	*size = 0;
578
	return;
579
    }
580
581
	htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
582
583
    xmlOutputBufferFlush(buf);
584
    if (buf->conv != NULL) {
585
	*size = buf->conv->use;
586
	*mem = xmlStrndup(buf->conv->content, *size);
587
    } else {
588
	*size = buf->buffer->use;
589
	*mem = xmlStrndup(buf->buffer->content, *size);
590
    }
591
    (void)xmlOutputBufferClose(buf);
592
}
593
594
/**
595
 * htmlDocDumpMemory:
596
 * @cur:  the document
597
 * @mem:  OUT: the memory pointer
598
 * @size:  OUT: the memory length
599
 *
600
 * Dump an HTML document in memory and return the xmlChar * and it's size.
601
 * It's up to the caller to free the memory.
602
 */
603
void
604
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
605
	htmlDocDumpMemoryFormat(cur, mem, size, 1);
606
}
607
608
609
/************************************************************************
610
 *									*
611
 *   		Dumping HTML tree content to an I/O output buffer	*
612
 *									*
613
 ************************************************************************/
614
615
void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
616
617
/**
618
 * htmlDtdDumpOutput:
619
 * @buf:  the HTML buffer output
620
 * @doc:  the document
621
 * @encoding:  the encoding string
622
 * 
623
 * TODO: check whether encoding is needed
624
 *
625
 * Dump the HTML document DTD, if any.
626
 */
627
static void
628
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
629
	          const char *encoding ATTRIBUTE_UNUSED) {
630
    xmlDtdPtr cur = doc->intSubset;
631
632
    if (cur == NULL) {
633
	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
634
	return;
635
    }
636
    xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
637
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
638
    if (cur->ExternalID != NULL) {
639
	xmlOutputBufferWriteString(buf, " PUBLIC ");
640
	xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID);
641
	if (cur->SystemID != NULL) {
642
	    xmlOutputBufferWriteString(buf, " ");
643
	    xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
644
	} 
645
    }  else if (cur->SystemID != NULL) {
646
	xmlOutputBufferWriteString(buf, " SYSTEM ");
647
	xmlBufferWriteQuotedString(buf->buffer, cur->SystemID);
648
    }
649
    xmlOutputBufferWriteString(buf, ">\n");
650
}
651
652
/**
653
 * htmlAttrDumpOutput:
654
 * @buf:  the HTML buffer output
655
 * @doc:  the document
656
 * @cur:  the attribute pointer
657
 * @encoding:  the encoding string
658
 *
659
 * Dump an HTML attribute
660
 */
661
static void
662
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
663
	           const char *encoding ATTRIBUTE_UNUSED) {
664
    xmlChar *value;
665
666
    /*
667
     * TODO: The html output method should not escape a & character
668
     *       occurring in an attribute value immediately followed by
669
     *       a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
670
     */
671
672
    if (cur == NULL) {
673
	return;
674
    }
675
    xmlOutputBufferWriteString(buf, " ");
676
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
677
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
678
	xmlOutputBufferWriteString(buf, ":");
679
    }
680
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
681
    if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
682
	value = xmlNodeListGetString(doc, cur->children, 0);
683
	if (value) {
684
	    xmlOutputBufferWriteString(buf, "=");
685
	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
686
		(cur->parent->ns == NULL) &&
687
		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
688
	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
689
		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
690
		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
691
		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
692
		xmlChar *escaped;
693
		xmlChar *tmp = value;
694
695
		while (IS_BLANK_CH(*tmp)) tmp++;
696
697
		escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
698
		if (escaped != NULL) {
699
		    xmlBufferWriteQuotedString(buf->buffer, escaped);
700
		    xmlFree(escaped);
701
		} else {
702
		    xmlBufferWriteQuotedString(buf->buffer, value);
703
		}
704
	    } else {
705
		xmlBufferWriteQuotedString(buf->buffer, value);
706
	    }
707
	    xmlFree(value);
708
	} else  {
709
	    xmlOutputBufferWriteString(buf, "=\"\"");
710
	}
711
    }
712
}
713
714
/**
715
 * htmlAttrListDumpOutput:
716
 * @buf:  the HTML buffer output
717
 * @doc:  the document
718
 * @cur:  the first attribute pointer
719
 * @encoding:  the encoding string
720
 *
721
 * Dump a list of HTML attributes
722
 */
723
static void
724
htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
725
    if (cur == NULL) {
726
	return;
727
    }
728
    while (cur != NULL) {
729
        htmlAttrDumpOutput(buf, doc, cur, encoding);
730
	cur = cur->next;
731
    }
732
}
733
734
735
736
/**
737
 * htmlNodeListDumpOutput:
738
 * @buf:  the HTML buffer output
739
 * @doc:  the document
740
 * @cur:  the first node
741
 * @encoding:  the encoding string
742
 * @format:  should formatting spaces been added
743
 *
744
 * Dump an HTML node list, recursive behaviour,children are printed too.
745
 */
746
static void
747
htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
748
	               xmlNodePtr cur, const char *encoding, int format) {
749
    if (cur == NULL) {
750
	return;
751
    }
752
    while (cur != NULL) {
753
        htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
754
	cur = cur->next;
755
    }
756
}
757
758
/**
759
 * htmlNodeDumpFormatOutput:
760
 * @buf:  the HTML buffer output
761
 * @doc:  the document
762
 * @cur:  the current node
763
 * @encoding:  the encoding string
764
 * @format:  should formatting spaces been added
765
 *
766
 * Dump an HTML node, recursive behaviour,children are printed too.
767
 */
768
void
769
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
770
	                 xmlNodePtr cur, const char *encoding, int format) {
771
    const htmlElemDesc * info;
772
773
    xmlInitParser();
774
775
    if ((cur == NULL) || (buf == NULL)) {
776
	return;
777
    }
778
    /*
779
     * Special cases.
780
     */
781
    if (cur->type == XML_DTD_NODE)
782
	return;
783
    if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
784
        (cur->type == XML_DOCUMENT_NODE)){
785
	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding);
786
	return;
787
    }
788
    if (cur->type == XML_ATTRIBUTE_NODE) {
789
        htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
790
	return;
791
    }
792
    if (cur->type == HTML_TEXT_NODE) {
793
	if (cur->content != NULL) {
794
	    if (((cur->name == (const xmlChar *)xmlStringText) ||
795
		 (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
796
		((cur->parent == NULL) ||
797
		 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
798
		  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) {
799
		xmlChar *buffer;
800
801
		buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
802
		if (buffer != NULL) {
803
		    xmlOutputBufferWriteString(buf, (const char *)buffer);
804
		    xmlFree(buffer);
805
		}
806
	    } else {
807
		xmlOutputBufferWriteString(buf, (const char *)cur->content);
808
	    }
809
	}
810
	return;
811
    }
812
    if (cur->type == HTML_COMMENT_NODE) {
813
	if (cur->content != NULL) {
814
	    xmlOutputBufferWriteString(buf, "<!--");
815
	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
816
	    xmlOutputBufferWriteString(buf, "-->");
817
	}
818
	return;
819
    }
820
    if (cur->type == HTML_PI_NODE) {
821
	if (cur->name == NULL)
822
	    return;
823
	xmlOutputBufferWriteString(buf, "<?");
824
	xmlOutputBufferWriteString(buf, (const char *)cur->name);
825
	if (cur->content != NULL) {
826
	    xmlOutputBufferWriteString(buf, " ");
827
	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
828
	}
829
	xmlOutputBufferWriteString(buf, ">");
830
	return;
831
    }
832
    if (cur->type == HTML_ENTITY_REF_NODE) {
833
        xmlOutputBufferWriteString(buf, "&");
834
	xmlOutputBufferWriteString(buf, (const char *)cur->name);
835
        xmlOutputBufferWriteString(buf, ";");
836
	return;
837
    }
838
    if (cur->type == HTML_PRESERVE_NODE) {
839
	if (cur->content != NULL) {
840
	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
841
	}
842
	return;
843
    }
844
845
    /*
846
     * Get specific HTML info for that node.
847
     */
848
    if (cur->ns == NULL)
849
	info = htmlTagLookup(cur->name);
850
    else
851
	info = NULL;
852
853
    xmlOutputBufferWriteString(buf, "<");
854
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
855
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
856
	xmlOutputBufferWriteString(buf, ":");
857
    }
858
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
859
    if (cur->nsDef)
860
	xmlNsListDumpOutput(buf, cur->nsDef);
861
    if (cur->properties != NULL)
862
        htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
863
864
    if ((info != NULL) && (info->empty)) {
865
        xmlOutputBufferWriteString(buf, ">");
866
	if ((format) && (!info->isinline) && (cur->next != NULL)) {
867
	    if ((cur->next->type != HTML_TEXT_NODE) &&
868
		(cur->next->type != HTML_ENTITY_REF_NODE) &&
869
		(cur->parent != NULL) &&
870
		(cur->parent->name != NULL) &&
871
		(cur->parent->name[0] != 'p')) /* p, pre, param */
872
		xmlOutputBufferWriteString(buf, "\n");
873
	}
874
	return;
875
    }
876
    if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
877
	(cur->children == NULL)) {
878
        if ((info != NULL) && (info->saveEndTag != 0) &&
879
	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
880
	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
881
	    xmlOutputBufferWriteString(buf, ">");
882
	} else {
883
	    xmlOutputBufferWriteString(buf, "></");
884
            if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
885
                xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
886
                xmlOutputBufferWriteString(buf, ":");
887
            }
888
	    xmlOutputBufferWriteString(buf, (const char *)cur->name);
889
	    xmlOutputBufferWriteString(buf, ">");
890
	}
891
	if ((format) && (cur->next != NULL) &&
892
            (info != NULL) && (!info->isinline)) {
893
	    if ((cur->next->type != HTML_TEXT_NODE) &&
894
		(cur->next->type != HTML_ENTITY_REF_NODE) &&
895
		(cur->parent != NULL) &&
896
		(cur->parent->name != NULL) &&
897
		(cur->parent->name[0] != 'p')) /* p, pre, param */
898
		xmlOutputBufferWriteString(buf, "\n");
899
	}
900
	return;
901
    }
902
    xmlOutputBufferWriteString(buf, ">");
903
    if ((cur->type != XML_ELEMENT_NODE) &&
904
	(cur->content != NULL)) {
905
	    /*
906
	     * Uses the OutputBuffer property to automatically convert
907
	     * invalids to charrefs
908
	     */
909
910
            xmlOutputBufferWriteString(buf, (const char *) cur->content);
911
    }
912
    if (cur->children != NULL) {
913
        if ((format) && (info != NULL) && (!info->isinline) &&
914
	    (cur->children->type != HTML_TEXT_NODE) &&
915
	    (cur->children->type != HTML_ENTITY_REF_NODE) &&
916
	    (cur->children != cur->last) &&
917
	    (cur->name != NULL) &&
918
	    (cur->name[0] != 'p')) /* p, pre, param */
919
	    xmlOutputBufferWriteString(buf, "\n");
920
	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format);
921
        if ((format) && (info != NULL) && (!info->isinline) &&
922
	    (cur->last->type != HTML_TEXT_NODE) &&
923
	    (cur->last->type != HTML_ENTITY_REF_NODE) &&
924
	    (cur->children != cur->last) &&
925
	    (cur->name != NULL) &&
926
	    (cur->name[0] != 'p')) /* p, pre, param */
927
	    xmlOutputBufferWriteString(buf, "\n");
928
    }
929
    xmlOutputBufferWriteString(buf, "</");
930
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
931
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
932
	xmlOutputBufferWriteString(buf, ":");
933
    }
934
    xmlOutputBufferWriteString(buf, (const char *)cur->name);
935
    xmlOutputBufferWriteString(buf, ">");
936
    if ((format) && (info != NULL) && (!info->isinline) &&
937
	(cur->next != NULL)) {
938
        if ((cur->next->type != HTML_TEXT_NODE) &&
939
	    (cur->next->type != HTML_ENTITY_REF_NODE) &&
940
	    (cur->parent != NULL) &&
941
	    (cur->parent->name != NULL) &&
942
	    (cur->parent->name[0] != 'p')) /* p, pre, param */
943
	    xmlOutputBufferWriteString(buf, "\n");
944
    }
945
}
946
947
/**
948
 * htmlNodeDumpOutput:
949
 * @buf:  the HTML buffer output
950
 * @doc:  the document
951
 * @cur:  the current node
952
 * @encoding:  the encoding string
953
 *
954
 * Dump an HTML node, recursive behaviour,children are printed too,
955
 * and formatting returns/spaces are added.
956
 */
957
void
958
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
959
	           xmlNodePtr cur, const char *encoding) {
960
    htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
961
}
962
963
/**
964
 * htmlDocContentDumpFormatOutput:
965
 * @buf:  the HTML buffer output
966
 * @cur:  the document
967
 * @encoding:  the encoding string
968
 * @format:  should formatting spaces been added
969
 *
970
 * Dump an HTML document.
971
 */
972
void
973
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
974
	                       const char *encoding, int format) {
975
    int type;
976
977
    xmlInitParser();
978
979
    if ((buf == NULL) || (cur == NULL))
980
        return;
981
982
    /*
983
     * force to output the stuff as HTML, especially for entities
984
     */
985
    type = cur->type;
986
    cur->type = XML_HTML_DOCUMENT_NODE;
987
    if (cur->intSubset != NULL) {
988
        htmlDtdDumpOutput(buf, cur, NULL);
989
    }
990
    if (cur->children != NULL) {
991
        htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
992
    }
993
    xmlOutputBufferWriteString(buf, "\n");
994
    cur->type = (xmlElementType) type;
995
}
996
997
/**
998
 * htmlDocContentDumpOutput:
999
 * @buf:  the HTML buffer output
1000
 * @cur:  the document
1001
 * @encoding:  the encoding string
1002
 *
1003
 * Dump an HTML document. Formating return/spaces are added.
1004
 */
1005
void
1006
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1007
	                 const char *encoding) {
1008
    htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1009
}
1010
1011
/************************************************************************
1012
 *									*
1013
 *		Saving functions front-ends				*
1014
 *									*
1015
 ************************************************************************/
1016
1017
/**
1018
 * htmlDocDump:
1019
 * @f:  the FILE*
1020
 * @cur:  the document
1021
 *
1022
 * Dump an HTML document to an open FILE.
1023
 *
1024
 * returns: the number of byte written or -1 in case of failure.
1025
 */
1026
int
1027
htmlDocDump(FILE *f, xmlDocPtr cur) {
1028
    xmlOutputBufferPtr buf;
1029
    xmlCharEncodingHandlerPtr handler = NULL;
1030
    const char *encoding;
1031
    int ret;
1032
1033
    xmlInitParser();
1034
1035
    if ((cur == NULL) || (f == NULL)) {
1036
	return(-1);
1037
    }
1038
1039
    encoding = (const char *) htmlGetMetaEncoding(cur);
1040
1041
    if (encoding != NULL) {
1042
	xmlCharEncoding enc;
1043
1044
	enc = xmlParseCharEncoding(encoding);
1045
	if (enc != cur->charset) {
1046
	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1047
		/*
1048
		 * Not supported yet
1049
		 */
1050
		return(-1);
1051
	    }
1052
1053
	    handler = xmlFindCharEncodingHandler(encoding);
1054
	    if (handler == NULL)
1055
		return(-1);
1056
	} else {
1057
	    handler = xmlFindCharEncodingHandler(encoding);
1058
	}
1059
    }
1060
1061
    /*
1062
     * Fallback to HTML or ASCII when the encoding is unspecified
1063
     */
1064
    if (handler == NULL)
1065
	handler = xmlFindCharEncodingHandler("HTML");
1066
    if (handler == NULL)
1067
	handler = xmlFindCharEncodingHandler("ascii");
1068
1069
    buf = xmlOutputBufferCreateFile(f, handler);
1070
    if (buf == NULL) return(-1);
1071
    htmlDocContentDumpOutput(buf, cur, NULL);
1072
1073
    ret = xmlOutputBufferClose(buf);
1074
    return(ret);
1075
}
1076
1077
/**
1078
 * htmlSaveFile:
1079
 * @filename:  the filename (or URL)
1080
 * @cur:  the document
1081
 *
1082
 * Dump an HTML document to a file. If @filename is "-" the stdout file is
1083
 * used.
1084
 * returns: the number of byte written or -1 in case of failure.
1085
 */
1086
int
1087
htmlSaveFile(const char *filename, xmlDocPtr cur) {
1088
    xmlOutputBufferPtr buf;
1089
    xmlCharEncodingHandlerPtr handler = NULL;
1090
    const char *encoding;
1091
    int ret;
1092
1093
    if ((cur == NULL) || (filename == NULL))
1094
        return(-1);
1095
       
1096
    xmlInitParser();
1097
1098
    encoding = (const char *) htmlGetMetaEncoding(cur);
1099
1100
    if (encoding != NULL) {
1101
	xmlCharEncoding enc;
1102
1103
	enc = xmlParseCharEncoding(encoding);
1104
	if (enc != cur->charset) {
1105
	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1106
		/*
1107
		 * Not supported yet
1108
		 */
1109
		return(-1);
1110
	    }
1111
1112
	    handler = xmlFindCharEncodingHandler(encoding);
1113
	    if (handler == NULL)
1114
		return(-1);
1115
	}
1116
    }
1117
1118
    /*
1119
     * Fallback to HTML or ASCII when the encoding is unspecified
1120
     */
1121
    if (handler == NULL)
1122
	handler = xmlFindCharEncodingHandler("HTML");
1123
    if (handler == NULL)
1124
	handler = xmlFindCharEncodingHandler("ascii");
1125
1126
    /* 
1127
     * save the content to a temp buffer.
1128
     */
1129
    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1130
    if (buf == NULL) return(0);
1131
1132
    htmlDocContentDumpOutput(buf, cur, NULL);
1133
1134
    ret = xmlOutputBufferClose(buf);
1135
    return(ret);
1136
}
1137
1138
/**
1139
 * htmlSaveFileFormat:
1140
 * @filename:  the filename
1141
 * @cur:  the document
1142
 * @format:  should formatting spaces been added
1143
 * @encoding: the document encoding
1144
 *
1145
 * Dump an HTML document to a file using a given encoding.
1146
 * 
1147
 * returns: the number of byte written or -1 in case of failure.
1148
 */
1149
int
1150
htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1151
	           const char *encoding, int format) {
1152
    xmlOutputBufferPtr buf;
1153
    xmlCharEncodingHandlerPtr handler = NULL;
1154
    int ret;
1155
1156
    if ((cur == NULL) || (filename == NULL))
1157
        return(-1);
1158
       
1159
    xmlInitParser();
1160
1161
    if (encoding != NULL) {
1162
	xmlCharEncoding enc;
1163
1164
	enc = xmlParseCharEncoding(encoding);
1165
	if (enc != cur->charset) {
1166
	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1167
		/*
1168
		 * Not supported yet
1169
		 */
1170
		return(-1);
1171
	    }
1172
1173
	    handler = xmlFindCharEncodingHandler(encoding);
1174
	    if (handler == NULL)
1175
		return(-1);
1176
            htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1177
	}
1178
    } else {
1179
	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1180
    }
1181
1182
    /*
1183
     * Fallback to HTML or ASCII when the encoding is unspecified
1184
     */
1185
    if (handler == NULL)
1186
	handler = xmlFindCharEncodingHandler("HTML");
1187
    if (handler == NULL)
1188
	handler = xmlFindCharEncodingHandler("ascii");
1189
1190
    /* 
1191
     * save the content to a temp buffer.
1192
     */
1193
    buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1194
    if (buf == NULL) return(0);
1195
1196
    htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1197
1198
    ret = xmlOutputBufferClose(buf);
1199
    return(ret);
1200
}
1201
1202
/**
1203
 * htmlSaveFileEnc:
1204
 * @filename:  the filename
1205
 * @cur:  the document
1206
 * @encoding: the document encoding
1207
 *
1208
 * Dump an HTML document to a file using a given encoding
1209
 * and formatting returns/spaces are added.
1210
 * 
1211
 * returns: the number of byte written or -1 in case of failure.
1212
 */
1213
int
1214
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1215
    return(htmlSaveFileFormat(filename, cur, encoding, 1));
1216
}
1217
1218
#endif /* LIBXML_OUTPUT_ENABLED */
1219
1220
#define bottom_HTMLtree
1221
#include "elfgcchack.h"
1222
#endif /* LIBXML_HTML_ENABLED */