1
/*
2
 * HTMLparser.c : an HTML 4.0 non-verifying parser
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
#define IN_LIBXML
10
#include "libxml.h"
11
#ifdef LIBXML_HTML_ENABLED
12
13
#include <string.h>
14
#ifdef HAVE_CTYPE_H
15
#include <ctype.h>
16
#endif
17
#ifdef HAVE_STDLIB_H
18
#include <stdlib.h>
19
#endif
20
#ifdef HAVE_SYS_STAT_H
21
#include <sys/stat.h>
22
#endif
23
#ifdef HAVE_FCNTL_H
24
#include <fcntl.h>
25
#endif
26
#ifdef HAVE_UNISTD_H
27
#include <unistd.h>
28
#endif
29
#ifdef HAVE_ZLIB_H
30
#include <zlib.h>
31
#endif
32
33
#include <libxml/xmlmemory.h>
34
#include <libxml/tree.h>
35
#include <libxml/parser.h>
36
#include <libxml/parserInternals.h>
37
#include <libxml/xmlerror.h>
38
#include <libxml/HTMLparser.h>
39
#include <libxml/HTMLtree.h>
40
#include <libxml/entities.h>
41
#include <libxml/encoding.h>
42
#include <libxml/valid.h>
43
#include <libxml/xmlIO.h>
44
#include <libxml/globals.h>
45
#include <libxml/uri.h>
46
47
#define HTML_MAX_NAMELEN 1000
48
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49
#define HTML_PARSER_BUFFER_SIZE 100
50
51
/* #define DEBUG */
52
/* #define DEBUG_PUSH */
53
54
static int htmlOmittedDefaultValue = 1;
55
56
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57
			     xmlChar end, xmlChar  end2, xmlChar end3);
58
static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60
/************************************************************************
61
 *									*
62
 * 		Some factorized error routines				*
63
 *									*
64
 ************************************************************************/
65
66
/**
67
 * htmlErrMemory:
68
 * @ctxt:  an HTML parser context
69
 * @extra:  extra informations
70
 *
71
 * Handle a redefinition of attribute error
72
 */
73
static void
74
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75
{
76
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77
        (ctxt->instate == XML_PARSER_EOF))
78
	return;
79
    if (ctxt != NULL) {
80
        ctxt->errNo = XML_ERR_NO_MEMORY;
81
        ctxt->instate = XML_PARSER_EOF;
82
        ctxt->disableSAX = 1;
83
    }
84
    if (extra)
85
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87
                        NULL, NULL, 0, 0,
88
                        "Memory allocation failed : %s\n", extra);
89
    else
90
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
93
}
94
95
/**
96
 * htmlParseErr:
97
 * @ctxt:  an HTML parser context
98
 * @error:  the error number
99
 * @msg:  the error message
100
 * @str1:  string infor
101
 * @str2:  string infor
102
 *
103
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104
 */
105
static void
106
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107
             const char *msg, const xmlChar *str1, const xmlChar *str2)
108
{
109
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110
        (ctxt->instate == XML_PARSER_EOF))
111
	return;
112
    if (ctxt != NULL)
113
	ctxt->errNo = error;
114
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115
                    XML_ERR_ERROR, NULL, 0,
116
		    (const char *) str1, (const char *) str2,
117
		    NULL, 0, 0,
118
		    msg, str1, str2);
119
    if (ctxt != NULL)
120
	ctxt->wellFormed = 0;
121
}
122
123
/**
124
 * htmlParseErrInt:
125
 * @ctxt:  an HTML parser context
126
 * @error:  the error number
127
 * @msg:  the error message
128
 * @val:  integer info
129
 *
130
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131
 */
132
static void
133
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134
             const char *msg, int val)
135
{
136
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137
        (ctxt->instate == XML_PARSER_EOF))
138
	return;
139
    if (ctxt != NULL)
140
	ctxt->errNo = error;
141
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142
                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
143
		    NULL, val, 0, msg, val);
144
    if (ctxt != NULL)
145
	ctxt->wellFormed = 0;
146
}
147
148
/************************************************************************
149
 *									*
150
 * 		Parser stacks related functions and macros		*
151
 *									*
152
 ************************************************************************/
153
154
/**
155
 * htmlnamePush:
156
 * @ctxt:  an HTML parser context
157
 * @value:  the element name
158
 *
159
 * Pushes a new element name on top of the name stack
160
 *
161
 * Returns 0 in case of error, the index in the stack otherwise
162
 */
163
static int
164
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165
{
166
    if (ctxt->nameNr >= ctxt->nameMax) {
167
        ctxt->nameMax *= 2;
168
        ctxt->nameTab = (const xmlChar * *)
169
                         xmlRealloc((xmlChar * *)ctxt->nameTab,
170
                                    ctxt->nameMax *
171
                                    sizeof(ctxt->nameTab[0]));
172
        if (ctxt->nameTab == NULL) {
173
            htmlErrMemory(ctxt, NULL);
174
            return (0);
175
        }
176
    }
177
    ctxt->nameTab[ctxt->nameNr] = value;
178
    ctxt->name = value;
179
    return (ctxt->nameNr++);
180
}
181
/**
182
 * htmlnamePop:
183
 * @ctxt: an HTML parser context
184
 *
185
 * Pops the top element name from the name stack
186
 *
187
 * Returns the name just removed
188
 */
189
static const xmlChar *
190
htmlnamePop(htmlParserCtxtPtr ctxt)
191
{
192
    const xmlChar *ret;
193
194
    if (ctxt->nameNr <= 0)
195
        return (NULL);
196
    ctxt->nameNr--;
197
    if (ctxt->nameNr < 0)
198
        return (NULL);
199
    if (ctxt->nameNr > 0)
200
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201
    else
202
        ctxt->name = NULL;
203
    ret = ctxt->nameTab[ctxt->nameNr];
204
    ctxt->nameTab[ctxt->nameNr] = NULL;
205
    return (ret);
206
}
207
208
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. Every macro expects a parser context pointer named
 * "ctxt" to be in scope at the point of use.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR the current pointer to the xmlChar to be parsed.
 *   CUR     the xmlChar at the current position, as an int. It looks at a
 *           single xmlChar only, so it should be used only to compare
 *           against ASCII values.
 *   RAW     -1 while ctxt->token is set, the current xmlChar otherwise.
 *   NXT(n)  the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPPER   the current xmlChar converted to uppercase.
 *   UPP(n)  the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) skip n xmlChar, updating nbChars and the column only; it must
 *           be used only to skip ASCII defined strings without newlines.
 *   SHRINK  call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *           xmlChar lie before the cursor and fewer than that remain.
 *   GROW    when not in progressive mode, call xmlParserInputGrow() if
 *           fewer than INPUT_CHUNK xmlChar remain after the cursor.
 *   SKIP_BLANKS skip blank characters through htmlSkipBlankChars().
 *
 * Macros going through the character-level routines
 *
 *   CURRENT same expansion as CUR.
 *   NEXT    skip to the next character by calling xmlNextChar().
 *   NEXTL(l) skip the current character, which is l xmlChars long, keeping
 *           line/column up to date and clearing ctxt->token.
 *   CUR_CHAR(l) current character through htmlCurrentChar(); its length
 *           is stored in l.
 *   CUR_SCHAR(s, l) character at string position s through
 *           xmlStringCurrentChar(); its length is stored in l.
 *   COPY_BUF(l,b,i,v) append character v, which is l xmlChars long, to
 *           buffer b at index i and advance i accordingly.
 *
 * SHRINK, GROW, NEXTL and COPY_BUF expand to a single do { } while (0)
 * statement and must be followed by a semicolon.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->nbChars += (val), ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserInputShrink(ctxt->input);				\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
289
290
/**
291
 * htmlCurrentChar:
292
 * @ctxt:  the HTML parser context
293
 * @len:  pointer to the length of the char read
294
 *
295
 * The current char value, if using UTF-8 this may actually span multiple
296
 * bytes in the input buffer. Implement the end of line normalization:
297
 * 2.11 End-of-Line Handling
298
 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299
 * char, then the encoding converter is plugged in automatically.
300
 *
301
 * Returns the current char value and its length
302
 */
303
304
static int
305
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306
    if (ctxt->instate == XML_PARSER_EOF)
307
	return(0);
308
309
    if (ctxt->token != 0) {
310
	*len = 0;
311
	return(ctxt->token);
312
    }	
313
    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314
	/*
315
	 * We are supposed to handle UTF8, check it's valid
316
	 * From rfc2044: encoding of the Unicode values on UTF-8:
317
	 *
318
	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
319
	 * 0000 0000-0000 007F   0xxxxxxx
320
	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
321
	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
322
	 *
323
	 * Check for the 0x110000 limit too
324
	 */
325
	const unsigned char *cur = ctxt->input->cur;
326
	unsigned char c;
327
	unsigned int val;
328
329
	c = *cur;
330
	if (c & 0x80) {
331
	    if (cur[1] == 0)
332
		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333
	    if ((cur[1] & 0xc0) != 0x80)
334
		goto encoding_error;
335
	    if ((c & 0xe0) == 0xe0) {
336
337
		if (cur[2] == 0)
338
		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339
		if ((cur[2] & 0xc0) != 0x80)
340
		    goto encoding_error;
341
		if ((c & 0xf0) == 0xf0) {
342
		    if (cur[3] == 0)
343
			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344
		    if (((c & 0xf8) != 0xf0) ||
345
			((cur[3] & 0xc0) != 0x80))
346
			goto encoding_error;
347
		    /* 4-byte code */
348
		    *len = 4;
349
		    val = (cur[0] & 0x7) << 18;
350
		    val |= (cur[1] & 0x3f) << 12;
351
		    val |= (cur[2] & 0x3f) << 6;
352
		    val |= cur[3] & 0x3f;
353
		} else {
354
		  /* 3-byte code */
355
		    *len = 3;
356
		    val = (cur[0] & 0xf) << 12;
357
		    val |= (cur[1] & 0x3f) << 6;
358
		    val |= cur[2] & 0x3f;
359
		}
360
	    } else {
361
	      /* 2-byte code */
362
		*len = 2;
363
		val = (cur[0] & 0x1f) << 6;
364
		val |= cur[1] & 0x3f;
365
	    }
366
	    if (!IS_CHAR(val)) {
367
	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368
				"Char 0x%X out of allowed range\n", val);
369
	    }    
370
	    return(val);
371
	} else {
372
	    /* 1-byte code */
373
	    *len = 1;
374
	    return((int) *ctxt->input->cur);
375
	}
376
    }
377
    /*
378
     * Assume it's a fixed length encoding (1) with
379
     * a compatible encoding for the ASCII set, since
380
     * XML constructs only use < 128 chars
381
     */
382
    *len = 1;
383
    if ((int) *ctxt->input->cur < 0x80)
384
	return((int) *ctxt->input->cur);
385
386
    /*
387
     * Humm this is bad, do an automatic flow conversion
388
     */
389
    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390
    ctxt->charset = XML_CHAR_ENCODING_UTF8;
391
    return(xmlCurrentChar(ctxt, len));
392
393
encoding_error:
394
    /*
395
     * If we detect an UTF8 error that probably mean that the
396
     * input encoding didn't get properly advertized in the
397
     * declaration header. Report the error and switch the encoding
398
     * to ISO-Latin-1 (if you don't like this policy, just declare the
399
     * encoding !)
400
     */
401
    {
402
        char buffer[150];
403
404
	if (ctxt->input->end - ctxt->input->cur >= 4) {
405
	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
406
			    ctxt->input->cur[0], ctxt->input->cur[1],
407
			    ctxt->input->cur[2], ctxt->input->cur[3]);
408
	} else {
409
	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
410
	}
411
	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
412
		     "Input is not proper UTF-8, indicate encoding !\n",
413
		     BAD_CAST buffer, NULL);
414
    }
415
416
    ctxt->charset = XML_CHAR_ENCODING_8859_1; 
417
    *len = 1;
418
    return((int) *ctxt->input->cur);
419
}
420
421
/**
422
 * htmlSkipBlankChars:
423
 * @ctxt:  the HTML parser context
424
 *
425
 * skip all blanks character found at that point in the input streams.
426
 *
427
 * Returns the number of space chars skipped
428
 */
429
430
static int
431
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
432
    int res = 0;
433
434
    while (IS_BLANK_CH(*(ctxt->input->cur))) {
435
	if ((*ctxt->input->cur == 0) &&
436
	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
437
		xmlPopInput(ctxt);
438
	} else {
439
	    if (*(ctxt->input->cur) == '\n') {
440
		ctxt->input->line++; ctxt->input->col = 1;
441
	    } else ctxt->input->col++;
442
	    ctxt->input->cur++;
443
	    ctxt->nbChars++;
444
	    if (*ctxt->input->cur == 0)
445
		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
446
	}
447
	res++;
448
    }
449
    return(res);
450
}
451
452
453
454
/************************************************************************
455
 *									*
456
 * 		The list of HTML elements and their properties		*
457
 *									*
458
 ************************************************************************/
459
460
/*
461
 *  Start Tag: 1 means the start tag can be omitted
462
 *  End Tag:   1 means the end tag can be omitted
463
 *             2 means it's forbidden (empty elements)
464
 *             3 means the tag is stylistic and should be closed easily
465
 *  Depr:      this element is deprecated
466
 *  DTD:       1 means that this element is valid only in the Loose DTD
467
 *             2 means that this element is valid only in the Frameset DTD
468
 *
469
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
470
	, subElements , impliedsubelt , Attributes, userdata
471
 */
472
473
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each list macro expands to a comma-separated run of element names and has
 * no trailing comma; the matching NB_ macro is the number of names in it.
 * PCDATA and MODIFIER are deliberately empty, which is why they are written
 * without a separating comma wherever they are used.
 *
 * The sub-lists composing INLINE, BLOCK and FLOW must be separated by
 * commas: two adjacent string literals would otherwise be concatenated by
 * the compiler into a single bogus name (e.g. "small" "em" -> "smallem").
 */
#define PCDATA
#define NB_PCDATA 0
#define MODIFIER
#define NB_MODIFIER 0

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)

#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)

#define FLOW BLOCK, INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated element name lists */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
506
507
508
/* ... and for HTML Attributes */

/*
 * Each list macro expands to a comma-separated run of attribute names and
 * has no trailing comma; the matching NB_ macro is the number of names.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
527
528
529
/* Other declarations that should go inline ... */
530
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
531
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
532
	"tabindex", "onfocus", "onblur", NULL } ;
533
static const char* const target_attr[] = { "target", NULL } ;
534
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
535
static const char* const alt_attr[] = { "alt", NULL } ;
536
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
537
static const char* const href_attrs[] = { "href", NULL } ;
538
static const char* const clear_attrs[] = { "clear", NULL } ;
539
static const char* const inline_p[] = { INLINE, "p", NULL } ;
540
541
static const char* const flow_param[] = { FLOW, "param", NULL } ;
542
static const char* const applet_attrs[] = { COREATTRS , "codebase",
543
		"archive", "alt", "name", "height", "width", "align",
544
		"hspace", "vspace", NULL } ;
545
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
546
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
547
static const char* const basefont_attrs[] =
548
	{ "id", "size", "color", "face", NULL } ;
549
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
550
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
551
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
552
static const char* const body_depr[] = { "background", "bgcolor", "text",
553
	"link", "vlink", "alink", NULL } ;
554
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
555
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
556
557
558
static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
559
static const char* const col_elt[] = { "col", NULL } ;
560
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
561
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
562
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
563
static const char* const compact_attr[] = { "compact", NULL } ;
564
static const char* const label_attr[] = { "label", NULL } ;
565
static const char* const fieldset_contents[] = { FLOW, "legend" } ;
566
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
567
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
568
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
569
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
570
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
571
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
572
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
573
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
574
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
575
static const char* const version_attr[] = { "version", NULL } ;
576
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
577
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
578
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
579
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
580
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
581
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
582
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
583
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
584
static const char* const align_attr[] = { "align", NULL } ;
585
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
586
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
587
static const char* const name_attr[] = { "name", NULL } ;
588
static const char* const action_attr[] = { "action", NULL } ;
589
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
590
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
591
static const char* const content_attr[] = { "content", NULL } ;
592
static const char* const type_attr[] = { "type", NULL } ;
593
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
594
static const char* const object_contents[] = { FLOW, "param", NULL } ;
595
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
596
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
597
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
598
static const char* const option_elt[] = { "option", NULL } ;
599
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
600
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
601
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
602
static const char* const width_attr[] = { "width", NULL } ;
603
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
604
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
605
static const char* const language_attr[] = { "language", NULL } ;
606
static const char* const select_content[] = { "optgroup", "option", NULL } ;
607
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
608
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
609
static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
610
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
611
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
612
static const char* const tr_elt[] = { "tr", NULL } ;
613
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
614
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
615
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
616
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
617
static const char* const tr_contents[] = { "th", "td", NULL } ;
618
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
619
static const char* const li_elt[] = { "li", NULL } ;
620
static const char* const ul_depr[] = { "type", "compact", NULL} ;
621
static const char* const dir_attr[] = { "dir", NULL} ;
622
623
/*
 * Cast placed in front of the "static const char* const x[]" name lists
 * when they are used in the html40ElementTable initializers; it converts
 * them to plain "const char **", dropping the const on the array elements.
 */
#define DECL (const char**)
624
625
static const htmlElemDesc
626
html40ElementTable[] = {
627
{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
628
	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
629
},
630
{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
631
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
632
},
633
{ "acronym",	0, 0, 0, 0, 0, 0, 1, "",
634
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
635
},
636
{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
637
	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
638
},
639
{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
640
	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
641
},
642
{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
643
	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
644
},
645
{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
646
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
647
},
648
{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
649
	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
650
},
651
{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
652
	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
653
},
654
{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
655
	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
656
},
657
{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
658
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
659
},
660
{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
661
	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
662
},
663
{ "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
664
	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
665
},
666
{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
667
	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
668
},
669
{ "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
670
	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
671
},
672
{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
673
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674
},
675
{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
676
	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
677
},
678
{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
679
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
680
},
681
{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
682
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
683
},
684
{ "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
685
	EMPTY , NULL , DECL col_attrs , NULL, NULL
686
},
687
{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
688
	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
689
},
690
{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
691
	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
692
},
693
{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
694
	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
695
},
696
{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
697
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
698
},
699
{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
700
	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
701
},
702
{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
703
	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
704
},
705
{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
706
	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
707
},
708
{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
709
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
710
},
711
{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
712
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
713
},
714
{ "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
715
	EMPTY, NULL, DECL embed_attrs, NULL, NULL
716
},
717
{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
718
	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
719
},
720
{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
721
	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
722
},
723
{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
724
	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
725
},
726
{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
727
	EMPTY, NULL, NULL, DECL frame_attrs, NULL
728
},
729
{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
730
	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
731
},
732
{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
733
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734
},
735
{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
736
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737
},
738
{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
739
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740
},
741
{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
742
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
743
},
744
{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
745
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
746
},
747
{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
748
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
749
},
750
{ "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
751
	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
752
},
753
{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
754
	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
755
},
756
{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
757
	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
758
},
759
{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
760
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
761
},
762
{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
763
	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
764
},
765
{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
766
	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
767
},
768
{ "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
769
	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
770
},
771
{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
772
	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
773
},
774
{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
775
	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
776
},
777
{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
778
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
779
},
780
{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
781
	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
782
},
783
{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
784
	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
785
},
786
{ "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
787
	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
788
},
789
{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
790
	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
791
},
792
{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
793
	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
794
},
795
{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
796
	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
797
},
798
{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
799
	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
800
},
801
{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
802
	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
803
},
804
{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
805
	DECL html_flow, "div", DECL html_attrs, NULL, NULL
806
},
807
{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
808
	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
809
},
810
{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
811
	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
812
},
813
{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
814
	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
815
},
816
{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
817
	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
818
},
819
{ "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
820
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821
},
822
{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
823
	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
824
},
825
{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
826
	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
827
},
828
{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
829
	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
830
},
831
{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
832
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
833
},
834
{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
835
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836
},
837
{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
838
	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
839
},
840
{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
841
	DECL select_content, NULL, DECL select_attrs, NULL, NULL
842
},
843
{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
844
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845
},
846
{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
847
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
848
},
849
{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
850
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
851
},
852
{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
853
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854
},
855
{ "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
856
	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
857
},
858
{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
859
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
860
},
861
{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
862
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
863
},
864
{ "table",	0, 0, 0, 0, 0, 0, 0, "",
865
	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
866
},
867
{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
868
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869
},
870
{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
871
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872
},
873
{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
874
	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
875
},
876
{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
877
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
878
},
879
{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
880
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
881
},
882
{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
883
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
884
},
885
{ "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
886
	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
887
},
888
{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
889
	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
890
},
891
{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
892
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893
},
894
{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
895
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
896
},
897
{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
898
	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
899
},
900
{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
901
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
902
}
903
};
904
905
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups.  The first
 * string of a group is the tag being opened; the strings that follow it
 * are the open elements that this start tag closes implicitly.  An
 * additional NULL after the last group terminates the whole table.
 */
static const char * const htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head",
        NULL,
    "head",
        "p",
        NULL,
    "title",
        "p",
        NULL,
    "body",
        "head", "style", "link", "title", "p",
        NULL,
    "frameset",
        "head", "style", "link", "title", "p",
        NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li",
        NULL,
    "hr",
        "p", "head",
        NULL,
    "h1",
        "p", "head",
        NULL,
    "h2",
        "p", "head",
        NULL,
    "h3",
        "p", "head",
        NULL,
    "h4",
        "p", "head",
        NULL,
    "h5",
        "p", "head",
        NULL,
    "h6",
        "p", "head",
        NULL,
    "dir",
        "p", "head",
        NULL,
    "address",
        "p", "head", "ul",
        NULL,
    "pre",
        "p", "head", "ul",
        NULL,
    "listing",
        "p", "head",
        NULL,
    "xmp",
        "p", "head",
        NULL,
    "blockquote",
        "p", "head",
        NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head",
        NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd",
        NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt",
        NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp",
        NULL,
    "ol",
        "p", "head", "ul",
        NULL,
    "menu",
        "p", "head", "ul",
        NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        NULL,
    "div",
        "p", "head",
        NULL,
    "noscript",
        "p", "head",
        NULL,
    "center",
        "font", "b", "i", "p", "head",
        NULL,
    "a",
        "a",
        NULL,
    "caption",
        "p",
        NULL,
    "colgroup",
        "caption", "colgroup", "col", "p",
        NULL,
    "col",
        "caption", "col", "p",
        NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a",
        NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u",
        NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u",
        NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p",
        NULL,
    "thead",
        "caption", "col", "colgroup",
        NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p",
        NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p",
        NULL,
    "optgroup",
        "option",
        NULL,
    "option",
        "option",
        NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a",
        NULL,
    NULL
};
965
966
/*
 * HTML elements which are not supposed to hold character data
 * directly; when characters show up inside one of them a p element
 * is implied first.  The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL
};
978
979
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 *
 * The array has no NULL terminator: users must bound their iteration
 * with sizeof(htmlScriptAttributes) / sizeof(htmlScriptAttributes[0]).
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    /* keyboard events */
    "onkeypress",
    "onkeydown",
    "onkeyup",
    /* document and form events */
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",		/* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
1004
1005
/*
 * End-tag priorities, used by the HTML parser to cope with broken
 * pages.  Every element gets a priority; an extra end tag is only
 * allowed to close open elements whose priority is lower than or
 * equal to its own.
 */

/* One (element name, end-tag priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The table is scanned linearly and stops on the entry whose name is
 * NULL; that last entry carries the priority used for every element
 * not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
1032
1033
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group, unused slots are NULL.  Filled by
 * htmlInitAutoClose(), which sets the flag below once done.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1035
1036
/************************************************************************
1037
 *									*
1038
 * 		functions to handle HTML specific data			*
1039
 *									*
1040
 ************************************************************************/
1041
1042
/**
1043
 * htmlInitAutoClose:
1044
 *
1045
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1046
 * This is not reentrant. Call xmlInitParser() once before processing in
1047
 * case of use in multithreaded programs.
1048
 */
1049
void
1050
htmlInitAutoClose(void) {
1051
    int indx, i = 0;
1052
1053
    if (htmlStartCloseIndexinitialized) return;
1054
1055
    for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1056
    indx = 0;
1057
    while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1058
        htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1059
	while (htmlStartClose[i] != NULL) i++;
1060
	i++;
1061
    }
1062
    htmlStartCloseIndexinitialized = 1;
1063
}
1064
1065
/**
1066
 * htmlTagLookup:
1067
 * @tag:  The tag name in lowercase
1068
 *
1069
 * Lookup the HTML tag in the ElementTable
1070
 *
1071
 * Returns the related htmlElemDescPtr or NULL if not found.
1072
 */
1073
const htmlElemDesc *
1074
htmlTagLookup(const xmlChar *tag) {
1075
    unsigned int i;
1076
1077
    for (i = 0; i < (sizeof(html40ElementTable) /
1078
                     sizeof(html40ElementTable[0]));i++) {
1079
        if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1080
	    return((htmlElemDescPtr) &html40ElementTable[i]);
1081
    }
1082
    return(NULL);
1083
}
1084
1085
/**
1086
 * htmlGetEndPriority:
1087
 * @name: The name of the element to look up the priority for.
1088
 * 
1089
 * Return value: The "endtag" priority.
1090
 **/
1091
static int
1092
htmlGetEndPriority (const xmlChar *name) {
1093
    int i = 0;
1094
1095
    while ((htmlEndPriority[i].name != NULL) &&
1096
	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1097
	i++;
1098
1099
    return(htmlEndPriority[i].priority);
1100
}
1101
1102
1103
/**
1104
 * htmlCheckAutoClose:
1105
 * @newtag:  The new tag name
1106
 * @oldtag:  The old tag name
1107
 *
1108
 * Checks whether the new tag is one of the registered valid tags for
1109
 * closing old.
1110
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1111
 *
1112
 * Returns 0 if no, 1 if yes.
1113
 */
1114
static int
1115
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1116
{
1117
    int i, indx;
1118
    const char **closed = NULL;
1119
1120
    if (htmlStartCloseIndexinitialized == 0)
1121
        htmlInitAutoClose();
1122
1123
    /* inefficient, but not a big deal */
1124
    for (indx = 0; indx < 100; indx++) {
1125
        closed = htmlStartCloseIndex[indx];
1126
        if (closed == NULL)
1127
            return (0);
1128
        if (xmlStrEqual(BAD_CAST * closed, newtag))
1129
            break;
1130
    }
1131
1132
    i = closed - htmlStartClose;
1133
    i++;
1134
    while (htmlStartClose[i] != NULL) {
1135
        if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1136
            return (1);
1137
        }
1138
        i++;
1139
    }
1140
    return (0);
1141
}
1142
1143
/**
1144
 * htmlAutoCloseOnClose:
1145
 * @ctxt:  an HTML parser context
1146
 * @newtag:  The new tag name
1147
 * @force:  force the tag closure
1148
 *
1149
 * The HTML DTD allows an ending tag to implicitly close other tags.
1150
 */
1151
static void
1152
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1153
{
1154
    const htmlElemDesc *info;
1155
    int i, priority;
1156
1157
    priority = htmlGetEndPriority(newtag);
1158
1159
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1160
1161
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1162
            break;
1163
        /*
1164
         * A missplaced endtag can only close elements with lower
1165
         * or equal priority, so if we find an element with higher
1166
         * priority before we find an element with
1167
         * matching name, we just ignore this endtag 
1168
         */
1169
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1170
            return;
1171
    }
1172
    if (i < 0)
1173
        return;
1174
1175
    while (!xmlStrEqual(newtag, ctxt->name)) {
1176
        info = htmlTagLookup(ctxt->name);
1177
        if ((info != NULL) && (info->endTag == 3)) {
1178
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1179
	                 "Opening and ending tag mismatch: %s and %s\n",
1180
			 newtag, ctxt->name);
1181
        }
1182
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1183
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1184
	htmlnamePop(ctxt);
1185
    }
1186
}
1187
1188
/**
1189
 * htmlAutoCloseOnEnd:
1190
 * @ctxt:  an HTML parser context
1191
 *
1192
 * Close all remaining tags at the end of the stream
1193
 */
1194
static void
1195
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1196
{
1197
    int i;
1198
1199
    if (ctxt->nameNr == 0)
1200
        return;
1201
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1202
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1203
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1204
	htmlnamePop(ctxt);
1205
    }
1206
}
1207
1208
/**
1209
 * htmlAutoClose:
1210
 * @ctxt:  an HTML parser context
1211
 * @newtag:  The new tag name or NULL
1212
 *
1213
 * The HTML DTD allows a tag to implicitly close other tags.
1214
 * The list is kept in htmlStartClose array. This function is
1215
 * called when a new tag has been detected and generates the
1216
 * appropriates closes if possible/needed.
1217
 * If newtag is NULL this mean we are at the end of the resource
1218
 * and we should check 
1219
 */
1220
static void
1221
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1222
{
1223
    while ((newtag != NULL) && (ctxt->name != NULL) &&
1224
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1225
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1226
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1227
	htmlnamePop(ctxt);
1228
    }
1229
    if (newtag == NULL) {
1230
        htmlAutoCloseOnEnd(ctxt);
1231
        return;
1232
    }
1233
    while ((newtag == NULL) && (ctxt->name != NULL) &&
1234
           ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1235
            (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1236
            (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1237
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1238
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1239
	htmlnamePop(ctxt);
1240
    }
1241
}
1242
1243
/**
1244
 * htmlAutoCloseTag:
1245
 * @doc:  the HTML document
1246
 * @name:  The tag name
1247
 * @elem:  the HTML element
1248
 *
1249
 * The HTML DTD allows a tag to implicitly close other tags.
1250
 * The list is kept in htmlStartClose array. This function checks
1251
 * if the element or one of it's children would autoclose the
1252
 * given tag.
1253
 *
1254
 * Returns 1 if autoclose, 0 otherwise
1255
 */
1256
int
1257
htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1258
    htmlNodePtr child;
1259
1260
    if (elem == NULL) return(1);
1261
    if (xmlStrEqual(name, elem->name)) return(0);
1262
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1263
    child = elem->children;
1264
    while (child != NULL) {
1265
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1266
	child = child->next;
1267
    }
1268
    return(0);
1269
}
1270
1271
/**
1272
 * htmlIsAutoClosed:
1273
 * @doc:  the HTML document
1274
 * @elem:  the HTML element
1275
 *
1276
 * The HTML DTD allows a tag to implicitly close other tags.
1277
 * The list is kept in htmlStartClose array. This function checks
1278
 * if a tag is autoclosed by one of it's child
1279
 *
1280
 * Returns 1 if autoclosed, 0 otherwise
1281
 */
1282
int
1283
htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1284
    htmlNodePtr child;
1285
1286
    if (elem == NULL) return(1);
1287
    child = elem->children;
1288
    while (child != NULL) {
1289
	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1290
	child = child->next;
1291
    }
1292
    return(0);
1293
}
1294
1295
/**
1296
 * htmlCheckImplied:
1297
 * @ctxt:  an HTML parser context
1298
 * @newtag:  The new tag name
1299
 *
1300
 * The HTML DTD allows a tag to exists only implicitly
1301
 * called when a new tag has been detected and generates the
1302
 * appropriates implicit tags if missing
1303
 */
1304
static void
1305
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1306
    if (!htmlOmittedDefaultValue)
1307
	return;
1308
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1309
	return;
1310
    if (ctxt->nameNr <= 0) {
1311
	htmlnamePush(ctxt, BAD_CAST"html");
1312
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1313
	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1314
    }
1315
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1316
        return;
1317
    if ((ctxt->nameNr <= 1) && 
1318
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1319
	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1320
	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1321
	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1322
	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1323
	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1324
	    /* 
1325
	     * dropped OBJECT ... i you put it first BODY will be
1326
	     * assumed !
1327
	     */
1328
	    htmlnamePush(ctxt, BAD_CAST"head");
1329
	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1330
		ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1331
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1332
	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1333
	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1334
	int i;
1335
	for (i = 0;i < ctxt->nameNr;i++) {
1336
	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1337
		return;
1338
	    }
1339
	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1340
		return;
1341
	    }
1342
	}
1343
	    
1344
	htmlnamePush(ctxt, BAD_CAST"body");
1345
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1346
	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1347
    }
1348
}
1349
1350
/**
1351
 * htmlCheckParagraph
1352
 * @ctxt:  an HTML parser context
1353
 *
1354
 * Check whether a p element need to be implied before inserting
1355
 * characters in the current element.
1356
 *
1357
 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1358
 *         in case of error.
1359
 */
1360
1361
static int
1362
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1363
    const xmlChar *tag;
1364
    int i;
1365
1366
    if (ctxt == NULL)
1367
	return(-1);
1368
    tag = ctxt->name;
1369
    if (tag == NULL) {
1370
	htmlAutoClose(ctxt, BAD_CAST"p");
1371
	htmlCheckImplied(ctxt, BAD_CAST"p");
1372
	htmlnamePush(ctxt, BAD_CAST"p");
1373
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1374
	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1375
	return(1);
1376
    }
1377
    if (!htmlOmittedDefaultValue)
1378
	return(0);
1379
    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1380
	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1381
	    htmlAutoClose(ctxt, BAD_CAST"p");
1382
	    htmlCheckImplied(ctxt, BAD_CAST"p");
1383
	    htmlnamePush(ctxt, BAD_CAST"p");
1384
	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385
		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1386
	    return(1);
1387
	}
1388
    }
1389
    return(0);
1390
}
1391
1392
/**
1393
 * htmlIsScriptAttribute:
1394
 * @name:  an attribute name
1395
 *
1396
 * Check if an attribute is of content type Script
1397
 *
1398
 * Returns 1 is the attribute is a script 0 otherwise
1399
 */
1400
int
1401
htmlIsScriptAttribute(const xmlChar *name) {
1402
    unsigned int i;
1403
1404
    if (name == NULL)
1405
       	return(0);
1406
    /*
1407
     * all script attributes start with 'on'
1408
     */
1409
    if ((name[0] != 'o') || (name[1] != 'n'))
1410
       	return(0);
1411
    for (i = 0;
1412
	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1413
	 i++) {
1414
	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1415
	    return(1);
1416
    }
1417
    return(0);
1418
}
1419
1420
/************************************************************************
1421
 *									*
1422
 * 		The list of HTML predefined entities			*
1423
 *									*
1424
 ************************************************************************/
1425
1426
1427
/*
 * Each entry is { code point value, entity name, description }.
 * The entries are listed by ascending value; htmlEntityValueLookup()
 * stops scanning as soon as it meets a larger value, so this ordering
 * must be preserved when the table is edited.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
    /* The four markup-significant characters, plus the apostrophe. */
    { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
    { 38, "amp", "ampersand, U+0026 ISOnum" },
    { 39, "apos", "single quote" },
    { 60, "lt", "less-than sign, U+003C ISOnum" },
    { 62, "gt", "greater-than sign, U+003E ISOnum" },

    /*
     * Values in the 128-255 range.
     * Whether they should be replaced really depends on the charset used.
     */
    { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
    { 161, "iexcl", "inverted exclamation mark, U+00A1 ISOnum" },
    { 162, "cent", "cent sign, U+00A2 ISOnum" },
    { 163, "pound", "pound sign, U+00A3 ISOnum" },
    { 164, "curren", "currency sign, U+00A4 ISOnum" },
    { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
    { 166, "brvbar", "broken bar = broken vertical bar, U+00A6 ISOnum" },
    { 167, "sect", "section sign, U+00A7 ISOnum" },
    { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
    { 169, "copy", "copyright sign, U+00A9 ISOnum" },
    { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
    { 171, "laquo", "left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
    { 172, "not", "not sign, U+00AC ISOnum" },
    { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
    { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
    { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
    { 176, "deg", "degree sign, U+00B0 ISOnum" },
    { 177, "plusmn", "plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
    { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
    { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
    { 180, "acute", "acute accent = spacing acute, U+00B4 ISOdia" },
    { 181, "micro", "micro sign, U+00B5 ISOnum" },
    { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
    { 183, "middot", "middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
    { 184, "cedil", "cedilla = spacing cedilla, U+00B8 ISOdia" },
    { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
    { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
    { 187, "raquo", "right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
    { 188, "frac14", "vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
    { 189, "frac12", "vulgar fraction one half = fraction one half, U+00BD ISOnum" },
    { 190, "frac34", "vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
    { 191, "iquest", "inverted question mark = turned question mark, U+00BF ISOnum" },
    { 192, "Agrave", "latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
    { 193, "Aacute", "latin capital letter A with acute, U+00C1 ISOlat1" },
    { 194, "Acirc", "latin capital letter A with circumflex, U+00C2 ISOlat1" },
    { 195, "Atilde", "latin capital letter A with tilde, U+00C3 ISOlat1" },
    { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
    { 197, "Aring", "latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
    { 198, "AElig", "latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
    { 199, "Ccedil", "latin capital letter C with cedilla, U+00C7 ISOlat1" },
    { 200, "Egrave", "latin capital letter E with grave, U+00C8 ISOlat1" },
    { 201, "Eacute", "latin capital letter E with acute, U+00C9 ISOlat1" },
    { 202, "Ecirc", "latin capital letter E with circumflex, U+00CA ISOlat1" },
    { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
    { 204, "Igrave", "latin capital letter I with grave, U+00CC ISOlat1" },
    { 205, "Iacute", "latin capital letter I with acute, U+00CD ISOlat1" },
    { 206, "Icirc", "latin capital letter I with circumflex, U+00CE ISOlat1" },
    { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
    { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
    { 209, "Ntilde", "latin capital letter N with tilde, U+00D1 ISOlat1" },
    { 210, "Ograve", "latin capital letter O with grave, U+00D2 ISOlat1" },
    { 211, "Oacute", "latin capital letter O with acute, U+00D3 ISOlat1" },
    { 212, "Ocirc", "latin capital letter O with circumflex, U+00D4 ISOlat1" },
    { 213, "Otilde", "latin capital letter O with tilde, U+00D5 ISOlat1" },
    { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
    { 215, "times", "multiplication sign, U+00D7 ISOnum" },
    { 216, "Oslash", "latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
    { 217, "Ugrave", "latin capital letter U with grave, U+00D9 ISOlat1" },
    { 218, "Uacute", "latin capital letter U with acute, U+00DA ISOlat1" },
    { 219, "Ucirc", "latin capital letter U with circumflex, U+00DB ISOlat1" },
    { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
    { 221, "Yacute", "latin capital letter Y with acute, U+00DD ISOlat1" },
    { 222, "THORN", "latin capital letter THORN, U+00DE ISOlat1" },
    { 223, "szlig", "latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
    { 224, "agrave", "latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
    { 225, "aacute", "latin small letter a with acute, U+00E1 ISOlat1" },
    { 226, "acirc", "latin small letter a with circumflex, U+00E2 ISOlat1" },
    { 227, "atilde", "latin small letter a with tilde, U+00E3 ISOlat1" },
    { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
    { 229, "aring", "latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
    { 230, "aelig", "latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
    { 231, "ccedil", "latin small letter c with cedilla, U+00E7 ISOlat1" },
    { 232, "egrave", "latin small letter e with grave, U+00E8 ISOlat1" },
    { 233, "eacute", "latin small letter e with acute, U+00E9 ISOlat1" },
    { 234, "ecirc", "latin small letter e with circumflex, U+00EA ISOlat1" },
    { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
    { 236, "igrave", "latin small letter i with grave, U+00EC ISOlat1" },
    { 237, "iacute", "latin small letter i with acute, U+00ED ISOlat1" },
    { 238, "icirc", "latin small letter i with circumflex, U+00EE ISOlat1" },
    { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
    { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
    { 241, "ntilde", "latin small letter n with tilde, U+00F1 ISOlat1" },
    { 242, "ograve", "latin small letter o with grave, U+00F2 ISOlat1" },
    { 243, "oacute", "latin small letter o with acute, U+00F3 ISOlat1" },
    { 244, "ocirc", "latin small letter o with circumflex, U+00F4 ISOlat1" },
    { 245, "otilde", "latin small letter o with tilde, U+00F5 ISOlat1" },
    { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
    { 247, "divide", "division sign, U+00F7 ISOnum" },
    { 248, "oslash", "latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
    { 249, "ugrave", "latin small letter u with grave, U+00F9 ISOlat1" },
    { 250, "uacute", "latin small letter u with acute, U+00FA ISOlat1" },
    { 251, "ucirc", "latin small letter u with circumflex, U+00FB ISOlat1" },
    { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
    { 253, "yacute", "latin small letter y with acute, U+00FD ISOlat1" },
    { 254, "thorn", "latin small letter thorn with, U+00FE ISOlat1" },
    { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

    { 338, "OElig", "latin capital ligature OE, U+0152 ISOlat2" },
    { 339, "oelig", "latin small ligature oe, U+0153 ISOlat2" },
    { 352, "Scaron", "latin capital letter S with caron, U+0160 ISOlat2" },
    { 353, "scaron", "latin small letter s with caron, U+0161 ISOlat2" },
    { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

    /* Everything from here on should really be kept as entity references. */
    { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

    { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
    { 732, "tilde", "small tilde, U+02DC ISOdia" },

    { 913, "Alpha", "greek capital letter alpha, U+0391" },
    { 914, "Beta", "greek capital letter beta, U+0392" },
    { 915, "Gamma", "greek capital letter gamma, U+0393 ISOgrk3" },
    { 916, "Delta", "greek capital letter delta, U+0394 ISOgrk3" },
    { 917, "Epsilon", "greek capital letter epsilon, U+0395" },
    { 918, "Zeta", "greek capital letter zeta, U+0396" },
    { 919, "Eta", "greek capital letter eta, U+0397" },
    { 920, "Theta", "greek capital letter theta, U+0398 ISOgrk3" },
    { 921, "Iota", "greek capital letter iota, U+0399" },
    { 922, "Kappa", "greek capital letter kappa, U+039A" },
    { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
    { 924, "Mu", "greek capital letter mu, U+039C" },
    { 925, "Nu", "greek capital letter nu, U+039D" },
    { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
    { 927, "Omicron", "greek capital letter omicron, U+039F" },
    { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
    { 929, "Rho", "greek capital letter rho, U+03A1" },
    { 931, "Sigma", "greek capital letter sigma, U+03A3 ISOgrk3" },
    { 932, "Tau", "greek capital letter tau, U+03A4" },
    { 933, "Upsilon", "greek capital letter upsilon, U+03A5 ISOgrk3" },
    { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
    { 935, "Chi", "greek capital letter chi, U+03A7" },
    { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
    { 937, "Omega", "greek capital letter omega, U+03A9 ISOgrk3" },

    { 945, "alpha", "greek small letter alpha, U+03B1 ISOgrk3" },
    { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
    { 947, "gamma", "greek small letter gamma, U+03B3 ISOgrk3" },
    { 948, "delta", "greek small letter delta, U+03B4 ISOgrk3" },
    { 949, "epsilon", "greek small letter epsilon, U+03B5 ISOgrk3" },
    { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
    { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
    { 952, "theta", "greek small letter theta, U+03B8 ISOgrk3" },
    { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
    { 954, "kappa", "greek small letter kappa, U+03BA ISOgrk3" },
    { 955, "lambda", "greek small letter lambda, U+03BB ISOgrk3" },
    { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
    { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
    { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
    { 959, "omicron", "greek small letter omicron, U+03BF NEW" },
    { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
    { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
    { 962, "sigmaf", "greek small letter final sigma, U+03C2 ISOgrk3" },
    { 963, "sigma", "greek small letter sigma, U+03C3 ISOgrk3" },
    { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
    { 965, "upsilon", "greek small letter upsilon, U+03C5 ISOgrk3" },
    { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
    { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
    { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
    { 969, "omega", "greek small letter omega, U+03C9 ISOgrk3" },
    { 977, "thetasym", "greek small letter theta symbol, U+03D1 NEW" },
    { 978, "upsih", "greek upsilon with hook symbol, U+03D2 NEW" },
    { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

    { 8194, "ensp", "en space, U+2002 ISOpub" },
    { 8195, "emsp", "em space, U+2003 ISOpub" },
    { 8201, "thinsp", "thin space, U+2009 ISOpub" },
    { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
    { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
    { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
    { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
    { 8211, "ndash", "en dash, U+2013 ISOpub" },
    { 8212, "mdash", "em dash, U+2014 ISOpub" },
    { 8216, "lsquo", "left single quotation mark, U+2018 ISOnum" },
    { 8217, "rsquo", "right single quotation mark, U+2019 ISOnum" },
    { 8218, "sbquo", "single low-9 quotation mark, U+201A NEW" },
    { 8220, "ldquo", "left double quotation mark, U+201C ISOnum" },
    { 8221, "rdquo", "right double quotation mark, U+201D ISOnum" },
    { 8222, "bdquo", "double low-9 quotation mark, U+201E NEW" },
    { 8224, "dagger", "dagger, U+2020 ISOpub" },
    { 8225, "Dagger", "double dagger, U+2021 ISOpub" },

    { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
    { 8230, "hellip", "horizontal ellipsis = three dot leader, U+2026 ISOpub" },

    { 8240, "permil", "per mille sign, U+2030 ISOtech" },

    { 8242, "prime", "prime = minutes = feet, U+2032 ISOtech" },
    { 8243, "Prime", "double prime = seconds = inches, U+2033 ISOtech" },

    { 8249, "lsaquo", "single left-pointing angle quotation mark, U+2039 ISO proposed" },
    { 8250, "rsaquo", "single right-pointing angle quotation mark, U+203A ISO proposed" },

    { 8254, "oline", "overline = spacing overscore, U+203E NEW" },
    { 8260, "frasl", "fraction slash, U+2044 NEW" },

    { 8364, "euro", "euro sign, U+20AC NEW" },

    { 8465, "image", "blackletter capital I = imaginary part, U+2111 ISOamso" },
    { 8472, "weierp", "script capital P = power set = Weierstrass p, U+2118 ISOamso" },
    { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
    { 8482, "trade", "trade mark sign, U+2122 ISOnum" },
    { 8501, "alefsym", "alef symbol = first transfinite cardinal, U+2135 NEW" },
    { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
    { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
    { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
    { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
    { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
    { 8629, "crarr", "downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
    { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
    { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
    { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
    { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
    { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

    { 8704, "forall", "for all, U+2200 ISOtech" },
    { 8706, "part", "partial differential, U+2202 ISOtech" },
    { 8707, "exist", "there exists, U+2203 ISOtech" },
    { 8709, "empty", "empty set = null set = diameter, U+2205 ISOamso" },
    { 8711, "nabla", "nabla = backward difference, U+2207 ISOtech" },
    { 8712, "isin", "element of, U+2208 ISOtech" },
    { 8713, "notin", "not an element of, U+2209 ISOtech" },
    { 8715, "ni", "contains as member, U+220B ISOtech" },
    { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
    { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
    { 8722, "minus", "minus sign, U+2212 ISOtech" },
    { 8727, "lowast", "asterisk operator, U+2217 ISOtech" },
    { 8730, "radic", "square root = radical sign, U+221A ISOtech" },
    { 8733, "prop", "proportional to, U+221D ISOtech" },
    { 8734, "infin", "infinity, U+221E ISOtech" },
    { 8736, "ang", "angle, U+2220 ISOamso" },
    { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
    { 8744, "or", "logical or = vee, U+2228 ISOtech" },
    { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
    { 8746, "cup", "union = cup, U+222A ISOtech" },
    { 8747, "int", "integral, U+222B ISOtech" },
    { 8756, "there4", "therefore, U+2234 ISOtech" },
    { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
    { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
    { 8776, "asymp", "almost equal to = asymptotic to, U+2248 ISOamsr" },
    { 8800, "ne", "not equal to, U+2260 ISOtech" },
    { 8801, "equiv", "identical to, U+2261 ISOtech" },
    { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
    { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
    { 8834, "sub", "subset of, U+2282 ISOtech" },
    { 8835, "sup", "superset of, U+2283 ISOtech" },
    { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
    { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
    { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
    { 8853, "oplus", "circled plus = direct sum, U+2295 ISOamsb" },
    { 8855, "otimes", "circled times = vector product, U+2297 ISOamsb" },
    { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
    { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
    { 8968, "lceil", "left ceiling = apl upstile, U+2308 ISOamsc" },
    { 8969, "rceil", "right ceiling, U+2309 ISOamsc" },
    { 8970, "lfloor", "left floor = apl downstile, U+230A ISOamsc" },
    { 8971, "rfloor", "right floor, U+230B ISOamsc" },
    { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
    { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
    { 9674, "loz", "lozenge, U+25CA ISOpub" },

    { 9824, "spades", "black spade suit, U+2660 ISOpub" },
    { 9827, "clubs", "black club suit = shamrock, U+2663 ISOpub" },
    { 9829, "hearts", "black heart suit = valentine, U+2665 ISOpub" },
    { 9830, "diams", "black diamond suit, U+2666 ISOpub" },

};
1709
1710
/************************************************************************
1711
 *									*
1712
 *		Commodity functions to handle entities			*
1713
 *									*
1714
 ************************************************************************/
1715
1716
/*
 * growBuffer:
 * Double the capacity stored in <buffer>_size and reallocate <buffer>
 * accordingly.  When the reallocation fails the error is reported with
 * htmlErrMemory(), the old buffer is released and the *enclosing*
 * function returns NULL.  The expansion refers to a variable named
 * ctxt, which must therefore be in scope at the point of use.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {							\
	buffer = tmp;							\
    } else {								\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
}
1730
1731
/**
1732
 * htmlEntityLookup:
1733
 * @name: the entity name
1734
 *
1735
 * Lookup the given entity in EntitiesTable
1736
 *
1737
 * TODO: the linear scan is really ugly, an hash table is really needed.
1738
 *
1739
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1740
 */
1741
const htmlEntityDesc *
1742
htmlEntityLookup(const xmlChar *name) {
1743
    unsigned int i;
1744
1745
    for (i = 0;i < (sizeof(html40EntitiesTable)/
1746
                    sizeof(html40EntitiesTable[0]));i++) {
1747
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1748
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1749
	}
1750
    }
1751
    return(NULL);
1752
}
1753
1754
/**
1755
 * htmlEntityValueLookup:
1756
 * @value: the entity's unicode value
1757
 *
1758
 * Lookup the given entity in EntitiesTable
1759
 *
1760
 * TODO: the linear scan is really ugly, an hash table is really needed.
1761
 *
1762
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1763
 */
1764
const htmlEntityDesc *
1765
htmlEntityValueLookup(unsigned int value) {
1766
    unsigned int i;
1767
1768
    for (i = 0;i < (sizeof(html40EntitiesTable)/
1769
                    sizeof(html40EntitiesTable[0]));i++) {
1770
        if (html40EntitiesTable[i].value >= value) {
1771
	    if (html40EntitiesTable[i].value > value)
1772
		break;
1773
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1774
	}
1775
    }
1776
    return(NULL);
1777
}
1778
1779
/**
1780
 * UTF8ToHtml:
1781
 * @out:  a pointer to an array of bytes to store the result
1782
 * @outlen:  the length of @out
1783
 * @in:  a pointer to an array of UTF-8 chars
1784
 * @inlen:  the length of @in
1785
 *
1786
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1787
 * plus HTML entities block of chars out.
1788
 *
1789
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1790
 * The value of @inlen after return is the number of octets consumed
1791
 *     as the return value is positive, else unpredictable.
1792
 * The value of @outlen after return is the number of octets consumed.
1793
 */
1794
int
1795
UTF8ToHtml(unsigned char* out, int *outlen,
1796
              const unsigned char* in, int *inlen) {
1797
    const unsigned char* processed = in;
1798
    const unsigned char* outend;
1799
    const unsigned char* outstart = out;
1800
    const unsigned char* instart = in;
1801
    const unsigned char* inend;
1802
    unsigned int c, d;
1803
    int trailing;
1804
1805
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1806
    if (in == NULL) {
1807
        /*
1808
	 * initialization nothing to do
1809
	 */
1810
	*outlen = 0;
1811
	*inlen = 0;
1812
	return(0);
1813
    }
1814
    inend = in + (*inlen);
1815
    outend = out + (*outlen);
1816
    while (in < inend) {
1817
	d = *in++;
1818
	if      (d < 0x80)  { c= d; trailing= 0; }
1819
	else if (d < 0xC0) {
1820
	    /* trailing byte in leading position */
1821
	    *outlen = out - outstart;
1822
	    *inlen = processed - instart;
1823
	    return(-2);
1824
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1825
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1826
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1827
	else {
1828
	    /* no chance for this in Ascii */
1829
	    *outlen = out - outstart;
1830
	    *inlen = processed - instart;
1831
	    return(-2);
1832
	}
1833
1834
	if (inend - in < trailing) {
1835
	    break;
1836
	} 
1837
1838
	for ( ; trailing; trailing--) {
1839
	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1840
		break;
1841
	    c <<= 6;
1842
	    c |= d & 0x3F;
1843
	}
1844
1845
	/* assertion: c is a single UTF-4 value */
1846
	if (c < 0x80) {
1847
	    if (out + 1 >= outend)
1848
		break;
1849
	    *out++ = c;
1850
	} else {
1851
	    int len;
1852
	    const htmlEntityDesc * ent;
1853
	    const char *cp;
1854
	    char nbuf[16];
1855
1856
	    /*
1857
	     * Try to lookup a predefined HTML entity for it
1858
	     */
1859
1860
	    ent = htmlEntityValueLookup(c);
1861
	    if (ent == NULL) {
1862
	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
1863
	      cp = nbuf;
1864
	    }
1865
	    else
1866
	      cp = ent->name;
1867
	    len = strlen(cp);
1868
	    if (out + 2 + len >= outend)
1869
		break;
1870
	    *out++ = '&';
1871
	    memcpy(out, cp, len);
1872
	    out += len;
1873
	    *out++ = ';';
1874
	}
1875
	processed = in;
1876
    }
1877
    *outlen = out - outstart;
1878
    *inlen = processed - instart;
1879
    return(0);
1880
}
1881
1882
/**
1883
 * htmlEncodeEntities:
1884
 * @out:  a pointer to an array of bytes to store the result
1885
 * @outlen:  the length of @out
1886
 * @in:  a pointer to an array of UTF-8 chars
1887
 * @inlen:  the length of @in
1888
 * @quoteChar: the quote character to escape (' or ") or zero.
1889
 *
1890
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1891
 * plus HTML entities block of chars out.
1892
 *
1893
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1894
 * The value of @inlen after return is the number of octets consumed
1895
 *     as the return value is positive, else unpredictable.
1896
 * The value of @outlen after return is the number of octets consumed.
1897
 */
1898
int
1899
htmlEncodeEntities(unsigned char* out, int *outlen,
1900
		   const unsigned char* in, int *inlen, int quoteChar) {
1901
    const unsigned char* processed = in;
1902
    const unsigned char* outend;
1903
    const unsigned char* outstart = out;
1904
    const unsigned char* instart = in;
1905
    const unsigned char* inend;
1906
    unsigned int c, d;
1907
    int trailing;
1908
1909
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1910
        return(-1);
1911
    outend = out + (*outlen);
1912
    inend = in + (*inlen);
1913
    while (in < inend) {
1914
	d = *in++;
1915
	if      (d < 0x80)  { c= d; trailing= 0; }
1916
	else if (d < 0xC0) {
1917
	    /* trailing byte in leading position */
1918
	    *outlen = out - outstart;
1919
	    *inlen = processed - instart;
1920
	    return(-2);
1921
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1922
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1923
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1924
	else {
1925
	    /* no chance for this in Ascii */
1926
	    *outlen = out - outstart;
1927
	    *inlen = processed - instart;
1928
	    return(-2);
1929
	}
1930
1931
	if (inend - in < trailing)
1932
	    break;
1933
1934
	while (trailing--) {
1935
	    if (((d= *in++) & 0xC0) != 0x80) {
1936
		*outlen = out - outstart;
1937
		*inlen = processed - instart;
1938
		return(-2);
1939
	    }
1940
	    c <<= 6;
1941
	    c |= d & 0x3F;
1942
	}
1943
1944
	/* assertion: c is a single UTF-4 value */
1945
	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1946
	    (c != '&') && (c != '<') && (c != '>')) {
1947
	    if (out >= outend)
1948
		break;
1949
	    *out++ = c;
1950
	} else {
1951
	    const htmlEntityDesc * ent;
1952
	    const char *cp;
1953
	    char nbuf[16];
1954
	    int len;
1955
1956
	    /*
1957
	     * Try to lookup a predefined HTML entity for it
1958
	     */
1959
	    ent = htmlEntityValueLookup(c);
1960
	    if (ent == NULL) {
1961
		snprintf(nbuf, sizeof(nbuf), "#%u", c);
1962
		cp = nbuf;
1963
	    }
1964
	    else
1965
		cp = ent->name;
1966
	    len = strlen(cp);
1967
	    if (out + 2 + len > outend)
1968
		break;
1969
	    *out++ = '&';
1970
	    memcpy(out, cp, len);
1971
	    out += len;
1972
	    *out++ = ';';
1973
	}
1974
	processed = in;
1975
    }
1976
    *outlen = out - outstart;
1977
    *inlen = processed - instart;
1978
    return(0);
1979
}
1980
1981
/************************************************************************
1982
 *									*
1983
 *		Commodity functions to handle streams			*
1984
 *									*
1985
 ************************************************************************/
1986
1987
/**
1988
 * htmlNewInputStream:
1989
 * @ctxt:  an HTML parser context
1990
 *
1991
 * Create a new input stream structure
1992
 * Returns the new input stream or NULL
1993
 */
1994
static htmlParserInputPtr
1995
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1996
    htmlParserInputPtr input;
1997
1998
    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1999
    if (input == NULL) {
2000
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2001
	return(NULL);
2002
    }
2003
    memset(input, 0, sizeof(htmlParserInput));
2004
    input->filename = NULL;
2005
    input->directory = NULL;
2006
    input->base = NULL;
2007
    input->cur = NULL;
2008
    input->buf = NULL;
2009
    input->line = 1;
2010
    input->col = 1;
2011
    input->buf = NULL;
2012
    input->free = NULL;
2013
    input->version = NULL;
2014
    input->consumed = 0;
2015
    input->length = 0;
2016
    return(input);
2017
}
2018
2019
2020
/************************************************************************
2021
 *									*
2022
 *		Commodity functions, cleanup needed ?			*
2023
 *									*
2024
 ************************************************************************/
2025
/*
 * All tags allowing PCDATA, from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but I don't want to risk any
 * binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2039
2040
/**
2041
 * areBlanks:
2042
 * @ctxt:  an HTML parser context
2043
 * @str:  a xmlChar *
2044
 * @len:  the size of @str
2045
 *
2046
 * Is this a sequence of blank chars that one can ignore ?
2047
 *
2048
 * Returns 1 if ignorable 0 otherwise.
2049
 */
2050
2051
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2052
    unsigned int i;
2053
    int j;
2054
    xmlNodePtr lastChild;
2055
    xmlDtdPtr dtd;
2056
2057
    for (j = 0;j < len;j++)
2058
        if (!(IS_BLANK_CH(str[j]))) return(0);
2059
2060
    if (CUR == 0) return(1);
2061
    if (CUR != '<') return(0);
2062
    if (ctxt->name == NULL)
2063
	return(1);
2064
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2065
	return(1);
2066
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2067
	return(1);
2068
2069
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2070
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2071
        dtd = xmlGetIntSubset(ctxt->myDoc);
2072
        if (dtd != NULL && dtd->ExternalID != NULL) {
2073
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2074
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2075
                return(1);
2076
        }
2077
    }
2078
2079
    if (ctxt->node == NULL) return(0);
2080
    lastChild = xmlGetLastChild(ctxt->node);
2081
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2082
	lastChild = lastChild->prev;
2083
    if (lastChild == NULL) {
2084
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2085
            (ctxt->node->content != NULL)) return(0);
2086
	/* keep ws in constructs like ...<b> </b>... 
2087
	   for all tags "b" allowing PCDATA */
2088
	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089
	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2090
		return(0);
2091
	    }
2092
	}
2093
    } else if (xmlNodeIsText(lastChild)) {
2094
        return(0);
2095
    } else {
2096
	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p> 
2097
	   for all tags "p" allowing PCDATA */
2098
	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2099
	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2100
		return(0);
2101
	    }
2102
	}
2103
    }
2104
    return(1);
2105
}
2106
2107
/**
2108
 * htmlNewDocNoDtD:
2109
 * @URI:  URI for the dtd, or NULL
2110
 * @ExternalID:  the external ID of the DTD, or NULL
2111
 *
2112
 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2113
 * are NULL
2114
 *
2115
 * Returns a new document, do not initialize the DTD if not provided
2116
 */
2117
htmlDocPtr
2118
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2119
    xmlDocPtr cur;
2120
2121
    /*
2122
     * Allocate a new document and fill the fields.
2123
     */
2124
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2125
    if (cur == NULL) {
2126
	htmlErrMemory(NULL, "HTML document creation failed\n");
2127
	return(NULL);
2128
    }
2129
    memset(cur, 0, sizeof(xmlDoc));
2130
2131
    cur->type = XML_HTML_DOCUMENT_NODE;
2132
    cur->version = NULL;
2133
    cur->intSubset = NULL;
2134
    cur->doc = cur;
2135
    cur->name = NULL;
2136
    cur->children = NULL; 
2137
    cur->extSubset = NULL;
2138
    cur->oldNs = NULL;
2139
    cur->encoding = NULL;
2140
    cur->standalone = 1;
2141
    cur->compression = 0;
2142
    cur->ids = NULL;
2143
    cur->refs = NULL;
2144
    cur->_private = NULL;
2145
    cur->charset = XML_CHAR_ENCODING_UTF8;
2146
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2147
    if ((ExternalID != NULL) ||
2148
	(URI != NULL))
2149
	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2150
    return(cur);
2151
}
2152
2153
/**
2154
 * htmlNewDoc:
2155
 * @URI:  URI for the dtd, or NULL
2156
 * @ExternalID:  the external ID of the DTD, or NULL
2157
 *
2158
 * Creates a new HTML document
2159
 *
2160
 * Returns a new document
2161
 */
2162
htmlDocPtr
2163
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2164
    if ((URI == NULL) && (ExternalID == NULL))
2165
	return(htmlNewDocNoDtD(
2166
		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2167
		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2168
2169
    return(htmlNewDocNoDtD(URI, ExternalID));
2170
}
2171
2172
2173
/************************************************************************
2174
 *									*
2175
 *			The parser itself				*
2176
 *	Relates to http://www.w3.org/TR/html40				*
2177
 *									*
2178
 ************************************************************************/
2179
2180
/************************************************************************
2181
 *									*
2182
 *			The parser itself				*
2183
 *									*
2184
 ************************************************************************/
2185
2186
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2187
2188
/**
2189
 * htmlParseHTMLName:
2190
 * @ctxt:  an HTML parser context
2191
 *
2192
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2193
 * since HTML names are not case-sensitive.
2194
 *
2195
 * Returns the Tag Name parsed or NULL
2196
 */
2197
2198
static const xmlChar *
2199
htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2200
    int i = 0;
2201
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2202
2203
    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2204
        (CUR != ':')) return(NULL);
2205
2206
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2207
           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2208
	   (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2209
	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2210
        else loc[i] = CUR;
2211
	i++;
2212
	
2213
	NEXT;
2214
    }
2215
    
2216
    return(xmlDictLookup(ctxt->dict, loc, i));
2217
}
2218
2219
2220
/**
2221
 * htmlParseHTMLName_nonInvasive:
2222
 * @ctxt:  an HTML parser context
2223
 *
2224
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2225
 * since HTML names are not case-sensitive, this doesn't consume the data
2226
 * from the stream, it's a look-ahead
2227
 *
2228
 * Returns the Tag Name parsed or NULL
2229
 */
2230
2231
static const xmlChar *
2232
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2233
    int i = 0;
2234
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2235
2236
    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2237
        (NXT(1) != ':')) return(NULL);
2238
 
2239
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2240
           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2241
	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2242
	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2243
        else loc[i] = NXT(1+i);
2244
	i++;
2245
    }
2246
    
2247
    return(xmlDictLookup(ctxt->dict, loc, i));
2248
}
2249
2250
2251
/**
2252
 * htmlParseName:
2253
 * @ctxt:  an HTML parser context
2254
 *
2255
 * parse an HTML name, this routine is case sensitive.
2256
 *
2257
 * Returns the Name parsed or NULL
2258
 */
2259
2260
static const xmlChar *
2261
htmlParseName(htmlParserCtxtPtr ctxt) {
2262
    const xmlChar *in;
2263
    const xmlChar *ret;
2264
    int count = 0;
2265
2266
    GROW;
2267
2268
    /*
2269
     * Accelerator for simple ASCII names
2270
     */
2271
    in = ctxt->input->cur;
2272
    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2273
	((*in >= 0x41) && (*in <= 0x5A)) ||
2274
	(*in == '_') || (*in == ':')) {
2275
	in++;
2276
	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2277
	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2278
	       ((*in >= 0x30) && (*in <= 0x39)) ||
2279
	       (*in == '_') || (*in == '-') ||
2280
	       (*in == ':') || (*in == '.'))
2281
	    in++;
2282
	if ((*in > 0) && (*in < 0x80)) {
2283
	    count = in - ctxt->input->cur;
2284
	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2285
	    ctxt->input->cur = in;
2286
	    ctxt->nbChars += count;
2287
	    ctxt->input->col += count;
2288
	    return(ret);
2289
	}
2290
    }
2291
    return(htmlParseNameComplex(ctxt));
2292
}
2293
2294
static const xmlChar *
2295
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2296
    int len = 0, l;
2297
    int c;
2298
    int count = 0;
2299
2300
    /*
2301
     * Handler for more complex cases
2302
     */
2303
    GROW;
2304
    c = CUR_CHAR(l);
2305
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2306
	(!IS_LETTER(c) && (c != '_') &&
2307
         (c != ':'))) {
2308
	return(NULL);
2309
    }
2310
2311
    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2312
	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2313
            (c == '.') || (c == '-') ||
2314
	    (c == '_') || (c == ':') || 
2315
	    (IS_COMBINING(c)) ||
2316
	    (IS_EXTENDER(c)))) {
2317
	if (count++ > 100) {
2318
	    count = 0;
2319
	    GROW;
2320
	}
2321
	len += l;
2322
	NEXTL(l);
2323
	c = CUR_CHAR(l);
2324
    }
2325
    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2326
}
2327
2328
2329
/**
2330
 * htmlParseHTMLAttribute:
2331
 * @ctxt:  an HTML parser context
2332
 * @stop:  a char stop value
2333
 * 
2334
 * parse an HTML attribute value till the stop (quote), if
2335
 * stop is 0 then it stops at the first space
2336
 *
2337
 * Returns the attribute parsed or NULL
2338
 */
2339
2340
static xmlChar *
2341
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2342
    xmlChar *buffer = NULL;
2343
    int buffer_size = 0;
2344
    xmlChar *out = NULL;
2345
    const xmlChar *name = NULL;
2346
    const xmlChar *cur = NULL;
2347
    const htmlEntityDesc * ent;
2348
2349
    /*
2350
     * allocate a translation buffer.
2351
     */
2352
    buffer_size = HTML_PARSER_BUFFER_SIZE;
2353
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2354
    if (buffer == NULL) {
2355
	htmlErrMemory(ctxt, "buffer allocation failed\n");
2356
	return(NULL);
2357
    }
2358
    out = buffer;
2359
2360
    /*
2361
     * Ok loop until we reach one of the ending chars
2362
     */
2363
    while ((CUR != 0) && (CUR != stop)) {
2364
	if ((stop == 0) && (CUR == '>')) break;
2365
	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2366
        if (CUR == '&') {
2367
	    if (NXT(1) == '#') {
2368
		unsigned int c;
2369
		int bits;
2370
2371
		c = htmlParseCharRef(ctxt);
2372
		if      (c <    0x80)
2373
		        { *out++  = c;                bits= -6; }
2374
		else if (c <   0x800)
2375
		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2376
		else if (c < 0x10000)
2377
		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2378
		else                 
2379
		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2380
	 
2381
		for ( ; bits >= 0; bits-= 6) {
2382
		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2383
		}
2384
		
2385
		if (out - buffer > buffer_size - 100) {
2386
			int indx = out - buffer;
2387
2388
			growBuffer(buffer);
2389
			out = &buffer[indx];
2390
		}
2391
	    } else {
2392
		ent = htmlParseEntityRef(ctxt, &name);
2393
		if (name == NULL) {
2394
		    *out++ = '&';
2395
		    if (out - buffer > buffer_size - 100) {
2396
			int indx = out - buffer;
2397
2398
			growBuffer(buffer);
2399
			out = &buffer[indx];
2400
		    }
2401
		} else if (ent == NULL) {
2402
		    *out++ = '&';
2403
		    cur = name;
2404
		    while (*cur != 0) {
2405
			if (out - buffer > buffer_size - 100) {
2406
			    int indx = out - buffer;
2407
2408
			    growBuffer(buffer);
2409
			    out = &buffer[indx];
2410
			}
2411
			*out++ = *cur++;
2412
		    }
2413
		} else {
2414
		    unsigned int c;
2415
		    int bits;
2416
2417
		    if (out - buffer > buffer_size - 100) {
2418
			int indx = out - buffer;
2419
2420
			growBuffer(buffer);
2421
			out = &buffer[indx];
2422
		    }
2423
		    c = ent->value;
2424
		    if      (c <    0x80)
2425
			{ *out++  = c;                bits= -6; }
2426
		    else if (c <   0x800)
2427
			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2428
		    else if (c < 0x10000)
2429
			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2430
		    else                 
2431
			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2432
	     
2433
		    for ( ; bits >= 0; bits-= 6) {
2434
			*out++  = ((c >> bits) & 0x3F) | 0x80;
2435
		    }
2436
		}
2437
	    }
2438
	} else {
2439
	    unsigned int c;
2440
	    int bits, l;
2441
2442
	    if (out - buffer > buffer_size - 100) {
2443
		int indx = out - buffer;
2444
2445
		growBuffer(buffer);
2446
		out = &buffer[indx];
2447
	    }
2448
	    c = CUR_CHAR(l);
2449
	    if      (c <    0x80)
2450
		    { *out++  = c;                bits= -6; }
2451
	    else if (c <   0x800)
2452
		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2453
	    else if (c < 0x10000)
2454
		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2455
	    else                 
2456
		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2457
     
2458
	    for ( ; bits >= 0; bits-= 6) {
2459
		*out++  = ((c >> bits) & 0x3F) | 0x80;
2460
	    }
2461
	    NEXT;
2462
	}
2463
    }
2464
    *out++ = 0;
2465
    return(buffer);
2466
}
2467
2468
/**
2469
 * htmlParseEntityRef:
2470
 * @ctxt:  an HTML parser context
2471
 * @str:  location to store the entity name
2472
 *
2473
 * parse an HTML ENTITY references
2474
 *
2475
 * [68] EntityRef ::= '&' Name ';'
2476
 *
2477
 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2478
 *         if non-NULL *str will have to be freed by the caller.
2479
 */
2480
const htmlEntityDesc *
2481
htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2482
    const xmlChar *name;
2483
    const htmlEntityDesc * ent = NULL;
2484
2485
    if (str != NULL) *str = NULL;
2486
    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2487
2488
    if (CUR == '&') {
2489
        NEXT;
2490
        name = htmlParseName(ctxt);
2491
	if (name == NULL) {
2492
	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2493
	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2494
	} else {
2495
	    GROW;
2496
	    if (CUR == ';') {
2497
	        if (str != NULL)
2498
		    *str = name;
2499
2500
		/*
2501
		 * Lookup the entity in the table.
2502
		 */
2503
		ent = htmlEntityLookup(name);
2504
		if (ent != NULL) /* OK that's ugly !!! */
2505
		    NEXT;
2506
	    } else {
2507
		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2508
		             "htmlParseEntityRef: expecting ';'\n",
2509
			     NULL, NULL);
2510
	        if (str != NULL)
2511
		    *str = name;
2512
	    }
2513
	}
2514
    }
2515
    return(ent);
2516
}
2517
2518
/**
2519
 * htmlParseAttValue:
2520
 * @ctxt:  an HTML parser context
2521
 *
2522
 * parse a value for an attribute
2523
 * Note: the parser won't do substitution of entities here, this
2524
 * will be handled later in xmlStringGetNodeList, unless it was
2525
 * asked for ctxt->replaceEntities != 0 
2526
 *
2527
 * Returns the AttValue parsed or NULL.
2528
 */
2529
2530
static xmlChar *
2531
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2532
    xmlChar *ret = NULL;
2533
2534
    if (CUR == '"') {
2535
        NEXT;
2536
	ret = htmlParseHTMLAttribute(ctxt, '"');
2537
        if (CUR != '"') {
2538
	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2539
	                 "AttValue: \" expected\n", NULL, NULL);
2540
	} else
2541
	    NEXT;
2542
    } else if (CUR == '\'') {
2543
        NEXT;
2544
	ret = htmlParseHTMLAttribute(ctxt, '\'');
2545
        if (CUR != '\'') {
2546
	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2547
	                 "AttValue: ' expected\n", NULL, NULL);
2548
	} else
2549
	    NEXT;
2550
    } else {
2551
        /*
2552
	 * That's an HTMLism, the attribute value may not be quoted
2553
	 */
2554
	ret = htmlParseHTMLAttribute(ctxt, 0);
2555
	if (ret == NULL) {
2556
	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2557
	                 "AttValue: no value found\n", NULL, NULL);
2558
	}
2559
    }
2560
    return(ret);
2561
}
2562
2563
/**
2564
 * htmlParseSystemLiteral:
2565
 * @ctxt:  an HTML parser context
2566
 * 
2567
 * parse an HTML Literal
2568
 *
2569
 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2570
 *
2571
 * Returns the SystemLiteral parsed or NULL
2572
 */
2573
2574
static xmlChar *
2575
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2576
    const xmlChar *q;
2577
    xmlChar *ret = NULL;
2578
2579
    if (CUR == '"') {
2580
        NEXT;
2581
	q = CUR_PTR;
2582
	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2583
	    NEXT;
2584
	if (!IS_CHAR_CH(CUR)) {
2585
	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2586
			 "Unfinished SystemLiteral\n", NULL, NULL);
2587
	} else {
2588
	    ret = xmlStrndup(q, CUR_PTR - q);
2589
	    NEXT;
2590
        }
2591
    } else if (CUR == '\'') {
2592
        NEXT;
2593
	q = CUR_PTR;
2594
	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2595
	    NEXT;
2596
	if (!IS_CHAR_CH(CUR)) {
2597
	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2598
			 "Unfinished SystemLiteral\n", NULL, NULL);
2599
	} else {
2600
	    ret = xmlStrndup(q, CUR_PTR - q);
2601
	    NEXT;
2602
        }
2603
    } else {
2604
	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2605
	             " or ' expected\n", NULL, NULL);
2606
    }
2607
    
2608
    return(ret);
2609
}
2610
2611
/**
2612
 * htmlParsePubidLiteral:
2613
 * @ctxt:  an HTML parser context
2614
 *
2615
 * parse an HTML public literal
2616
 *
2617
 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2618
 *
2619
 * Returns the PubidLiteral parsed or NULL.
2620
 */
2621
2622
static xmlChar *
2623
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2624
    const xmlChar *q;
2625
    xmlChar *ret = NULL;
2626
    /*
2627
     * Name ::= (Letter | '_') (NameChar)*
2628
     */
2629
    if (CUR == '"') {
2630
        NEXT;
2631
	q = CUR_PTR;
2632
	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2633
	if (CUR != '"') {
2634
	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2635
	                 "Unfinished PubidLiteral\n", NULL, NULL);
2636
	} else {
2637
	    ret = xmlStrndup(q, CUR_PTR - q);
2638
	    NEXT;
2639
	}
2640
    } else if (CUR == '\'') {
2641
        NEXT;
2642
	q = CUR_PTR;
2643
	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2644
	    NEXT;
2645
	if (CUR != '\'') {
2646
	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2647
	                 "Unfinished PubidLiteral\n", NULL, NULL);
2648
	} else {
2649
	    ret = xmlStrndup(q, CUR_PTR - q);
2650
	    NEXT;
2651
	}
2652
    } else {
2653
	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2654
	             "PubidLiteral \" or ' expected\n", NULL, NULL);
2655
    }
2656
    
2657
    return(ret);
2658
}
2659
2660
/**
2661
 * htmlParseScript:
2662
 * @ctxt:  an HTML parser context
2663
 *
2664
 * parse the content of an HTML SCRIPT or STYLE element
2665
 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2666
 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2667
 * http://www.w3.org/TR/html4/types.html#type-script
2668
 * http://www.w3.org/TR/html4/types.html#h-6.15
2669
 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2670
 *
2671
 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2672
 * element and the value of intrinsic event attributes. User agents must
2673
 * not evaluate script data as HTML markup but instead must pass it on as
2674
 * data to a script engine.
2675
 * NOTES:
2676
 * - The content is passed like CDATA
2677
 * - the attributes for style and scripting "onXXX" are also described
2678
 *   as CDATA but SGML allows entities references in attributes so their
2679
 *   processing is identical as other attributes
2680
 */
2681
static void
2682
htmlParseScript(htmlParserCtxtPtr ctxt) {
2683
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2684
    int nbchar = 0;
2685
    int cur,l;
2686
2687
    SHRINK;
2688
    cur = CUR_CHAR(l);
2689
    while (IS_CHAR_CH(cur)) {
2690
	if ((cur == '<') && (NXT(1) == '/')) {
2691
            /*
2692
             * One should break here, the specification is clear:
2693
             * Authors should therefore escape "</" within the content.
2694
             * Escape mechanisms are specific to each scripting or
2695
             * style sheet language.
2696
             *
2697
             * In recovery mode, only break if end tag match the
2698
             * current tag, effectively ignoring all tags inside the
2699
             * script/style block and treating the entire block as
2700
             * CDATA.
2701
             */
2702
            if (ctxt->recovery) {
2703
                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 
2704
				   xmlStrlen(ctxt->name)) == 0) 
2705
                {
2706
                    break; /* while */
2707
                } else {
2708
		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2709
				 "Element %s embeds close tag\n",
2710
		                 ctxt->name, NULL);
2711
		}
2712
            } else {
2713
                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2714
                    ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 
2715
                {
2716
                    break; /* while */
2717
                }
2718
            }
2719
	}
2720
	COPY_BUF(l,buf,nbchar,cur);
2721
	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2722
	    if (ctxt->sax->cdataBlock!= NULL) {
2723
		/*
2724
		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2725
		 */
2726
		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2727
	    } else if (ctxt->sax->characters != NULL) {
2728
		ctxt->sax->characters(ctxt->userData, buf, nbchar);
2729
	    }
2730
	    nbchar = 0;
2731
	}
2732
	GROW;
2733
	NEXTL(l);
2734
	cur = CUR_CHAR(l);
2735
    }
2736
2737
    if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2738
	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2739
	                "Invalid char in CDATA 0x%X\n", cur);
2740
	NEXT;
2741
    }
2742
2743
    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2744
	if (ctxt->sax->cdataBlock!= NULL) {
2745
	    /*
2746
	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2747
	     */
2748
	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2749
	} else if (ctxt->sax->characters != NULL) {
2750
	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2751
	}
2752
    }
2753
}
2754
2755
2756
/**
2757
 * htmlParseCharData:
2758
 * @ctxt:  an HTML parser context
2759
 *
2760
 * parse a CharData section.
2761
 * if we are within a CDATA section ']]>' marks an end of section.
2762
 *
2763
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2764
 */
2765
2766
static void
2767
htmlParseCharData(htmlParserCtxtPtr ctxt) {
2768
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2769
    int nbchar = 0;
2770
    int cur, l;
2771
    int chunk = 0;
2772
2773
    SHRINK;
2774
    cur = CUR_CHAR(l);
2775
    while (((cur != '<') || (ctxt->token == '<')) &&
2776
           ((cur != '&') || (ctxt->token == '&')) && 
2777
	   (cur != 0)) {
2778
	if (!(IS_CHAR(cur))) {
2779
	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2780
	                "Invalid char in CDATA 0x%X\n", cur);
2781
	} else {
2782
	    COPY_BUF(l,buf,nbchar,cur);
2783
	}
2784
	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2785
	    /*
2786
	     * Ok the segment is to be consumed as chars.
2787
	     */
2788
	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2789
		if (areBlanks(ctxt, buf, nbchar)) {
2790
		    if (ctxt->sax->ignorableWhitespace != NULL)
2791
			ctxt->sax->ignorableWhitespace(ctxt->userData,
2792
			                               buf, nbchar);
2793
		} else {
2794
		    htmlCheckParagraph(ctxt);
2795
		    if (ctxt->sax->characters != NULL)
2796
			ctxt->sax->characters(ctxt->userData, buf, nbchar);
2797
		}
2798
	    }
2799
	    nbchar = 0;
2800
	}
2801
	NEXTL(l);
2802
        chunk++;
2803
        if (chunk > HTML_PARSER_BUFFER_SIZE) {
2804
            chunk = 0;
2805
            SHRINK;
2806
            GROW;
2807
        }
2808
	cur = CUR_CHAR(l);
2809
	if (cur == 0) {
2810
	    SHRINK;
2811
	    GROW;
2812
	    cur = CUR_CHAR(l);
2813
	}
2814
    }
2815
    if (nbchar != 0) {
2816
        buf[nbchar] = 0;
2817
2818
	/*
2819
	 * Ok the segment is to be consumed as chars.
2820
	 */
2821
	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2822
	    if (areBlanks(ctxt, buf, nbchar)) {
2823
		if (ctxt->sax->ignorableWhitespace != NULL)
2824
		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2825
	    } else {
2826
		htmlCheckParagraph(ctxt);
2827
		if (ctxt->sax->characters != NULL)
2828
		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2829
	    }
2830
	}
2831
    } else {
2832
	/*
2833
	 * Loop detection
2834
	 */
2835
	if (cur == 0)
2836
	    ctxt->instate = XML_PARSER_EOF;
2837
    }
2838
}
2839
2840
/**
2841
 * htmlParseExternalID:
2842
 * @ctxt:  an HTML parser context
2843
 * @publicID:  a xmlChar** receiving PubidLiteral
2844
 *
2845
 * Parse an External ID or a Public ID
2846
 *
2847
 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2848
 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
2849
 *
2850
 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2851
 *
2852
 * Returns the function returns SystemLiteral and in the second
2853
 *                case publicID receives PubidLiteral, is strict is off
2854
 *                it is possible to return NULL and have publicID set.
2855
 */
2856
2857
static xmlChar *
2858
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2859
    xmlChar *URI = NULL;
2860
2861
    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2862
         (UPP(2) == 'S') && (UPP(3) == 'T') &&
2863
	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2864
        SKIP(6);
2865
	if (!IS_BLANK_CH(CUR)) {
2866
	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2867
	                 "Space required after 'SYSTEM'\n", NULL, NULL);
2868
	}
2869
        SKIP_BLANKS;
2870
	URI = htmlParseSystemLiteral(ctxt);
2871
	if (URI == NULL) {
2872
	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2873
	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2874
        }
2875
    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2876
	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
2877
	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
2878
        SKIP(6);
2879
	if (!IS_BLANK_CH(CUR)) {
2880
	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2881
	                 "Space required after 'PUBLIC'\n", NULL, NULL);
2882
	}
2883
        SKIP_BLANKS;
2884
	*publicID = htmlParsePubidLiteral(ctxt);
2885
	if (*publicID == NULL) {
2886
	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2887
	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2888
			 NULL, NULL);
2889
	}
2890
        SKIP_BLANKS;
2891
        if ((CUR == '"') || (CUR == '\'')) {
2892
	    URI = htmlParseSystemLiteral(ctxt);
2893
	}
2894
    }
2895
    return(URI);
2896
}
2897
2898
/**
2899
 * xmlParsePI:
2900
 * @ctxt:  an XML parser context
2901
 *
2902
 * parse an XML Processing Instruction.
2903
 *
2904
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2905
 */
2906
static void
2907
htmlParsePI(htmlParserCtxtPtr ctxt) {
2908
    xmlChar *buf = NULL;
2909
    int len = 0;
2910
    int size = HTML_PARSER_BUFFER_SIZE;
2911
    int cur, l;
2912
    const xmlChar *target;
2913
    xmlParserInputState state;
2914
    int count = 0;
2915
2916
    if ((RAW == '<') && (NXT(1) == '?')) {
2917
	state = ctxt->instate;
2918
        ctxt->instate = XML_PARSER_PI;
2919
	/*
2920
	 * this is a Processing Instruction.
2921
	 */
2922
	SKIP(2);
2923
	SHRINK;
2924
2925
	/*
2926
	 * Parse the target name and check for special support like
2927
	 * namespace.
2928
	 */
2929
        target = htmlParseName(ctxt);
2930
	if (target != NULL) {
2931
	    if (RAW == '>') {
2932
		SKIP(1);
2933
2934
		/*
2935
		 * SAX: PI detected.
2936
		 */
2937
		if ((ctxt->sax) && (!ctxt->disableSAX) &&
2938
		    (ctxt->sax->processingInstruction != NULL))
2939
		    ctxt->sax->processingInstruction(ctxt->userData,
2940
		                                     target, NULL);
2941
		ctxt->instate = state;
2942
		return;
2943
	    }
2944
	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2945
	    if (buf == NULL) {
2946
		htmlErrMemory(ctxt, NULL);
2947
		ctxt->instate = state;
2948
		return;
2949
	    }
2950
	    cur = CUR;
2951
	    if (!IS_BLANK(cur)) {
2952
		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2953
			  "ParsePI: PI %s space expected\n", target, NULL);
2954
	    }
2955
            SKIP_BLANKS;
2956
	    cur = CUR_CHAR(l);
2957
	    while (IS_CHAR(cur) && (cur != '>')) {
2958
		if (len + 5 >= size) {
2959
		    xmlChar *tmp;
2960
2961
		    size *= 2;
2962
		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2963
		    if (tmp == NULL) {
2964
			htmlErrMemory(ctxt, NULL);
2965
			xmlFree(buf);
2966
			ctxt->instate = state;
2967
			return;
2968
		    }
2969
		    buf = tmp;
2970
		}
2971
		count++;
2972
		if (count > 50) {
2973
		    GROW;
2974
		    count = 0;
2975
		}
2976
		COPY_BUF(l,buf,len,cur);
2977
		NEXTL(l);
2978
		cur = CUR_CHAR(l);
2979
		if (cur == 0) {
2980
		    SHRINK;
2981
		    GROW;
2982
		    cur = CUR_CHAR(l);
2983
		}
2984
	    }
2985
	    buf[len] = 0;
2986
	    if (cur != '>') {
2987
		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2988
		      "ParsePI: PI %s never end ...\n", target, NULL);
2989
	    } else {
2990
		SKIP(1);
2991
2992
		/*
2993
		 * SAX: PI detected.
2994
		 */
2995
		if ((ctxt->sax) && (!ctxt->disableSAX) &&
2996
		    (ctxt->sax->processingInstruction != NULL))
2997
		    ctxt->sax->processingInstruction(ctxt->userData,
2998
		                                     target, buf);
2999
	    }
3000
	    xmlFree(buf);
3001
	} else {
3002
	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED, 
3003
                         "PI is not started correctly", NULL, NULL);
3004
	}
3005
	ctxt->instate = state;
3006
    }
3007
}
3008
3009
/**
3010
 * htmlParseComment:
3011
 * @ctxt:  an HTML parser context
3012
 *
3013
 * Parse an XML (SGML) comment <!-- .... -->
3014
 *
3015
 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3016
 */
3017
static void
3018
htmlParseComment(htmlParserCtxtPtr ctxt) {
3019
    xmlChar *buf = NULL;
3020
    int len;
3021
    int size = HTML_PARSER_BUFFER_SIZE;
3022
    int q, ql;
3023
    int r, rl;
3024
    int cur, l;
3025
    xmlParserInputState state;
3026
3027
    /*
3028
     * Check that there is a comment right here.
3029
     */
3030
    if ((RAW != '<') || (NXT(1) != '!') ||
3031
        (NXT(2) != '-') || (NXT(3) != '-')) return;
3032
3033
    state = ctxt->instate;
3034
    ctxt->instate = XML_PARSER_COMMENT;
3035
    SHRINK;
3036
    SKIP(4);
3037
    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3038
    if (buf == NULL) {
3039
        htmlErrMemory(ctxt, "buffer allocation failed\n");
3040
	ctxt->instate = state;
3041
	return;
3042
    }
3043
    q = CUR_CHAR(ql);
3044
    NEXTL(ql);
3045
    r = CUR_CHAR(rl);
3046
    NEXTL(rl);
3047
    cur = CUR_CHAR(l);
3048
    len = 0;
3049
    while (IS_CHAR(cur) &&
3050
           ((cur != '>') ||
3051
	    (r != '-') || (q != '-'))) {
3052
	if (len + 5 >= size) {
3053
	    xmlChar *tmp;
3054
3055
	    size *= 2;
3056
	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3057
	    if (tmp == NULL) {
3058
	        xmlFree(buf);
3059
	        htmlErrMemory(ctxt, "growing buffer failed\n");
3060
		ctxt->instate = state;
3061
		return;
3062
	    }
3063
	    buf = tmp;
3064
	}
3065
	COPY_BUF(ql,buf,len,q);
3066
	q = r;
3067
	ql = rl;
3068
	r = cur;
3069
	rl = l;
3070
	NEXTL(l);
3071
	cur = CUR_CHAR(l);
3072
	if (cur == 0) {
3073
	    SHRINK;
3074
	    GROW;
3075
	    cur = CUR_CHAR(l);
3076
	}
3077
    }
3078
    buf[len] = 0;
3079
    if (!IS_CHAR(cur)) {
3080
	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3081
	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
3082
	xmlFree(buf);
3083
    } else {
3084
        NEXT;
3085
	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3086
	    (!ctxt->disableSAX))
3087
	    ctxt->sax->comment(ctxt->userData, buf);
3088
	xmlFree(buf);
3089
    }
3090
    ctxt->instate = state;
3091
}
3092
3093
/**
3094
 * htmlParseCharRef:
3095
 * @ctxt:  an HTML parser context
3096
 *
3097
 * parse Reference declarations
3098
 *
3099
 * [66] CharRef ::= '&#' [0-9]+ ';' |
3100
 *                  '&#x' [0-9a-fA-F]+ ';'
3101
 *
3102
 * Returns the value parsed (as an int)
3103
 */
3104
int
3105
htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3106
    int val = 0;
3107
3108
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3109
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3110
		     "htmlParseCharRef: context error\n",
3111
		     NULL, NULL);
3112
        return(0);
3113
    }
3114
    if ((CUR == '&') && (NXT(1) == '#') &&
3115
        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3116
	SKIP(3);
3117
	while (CUR != ';') {
3118
	    if ((CUR >= '0') && (CUR <= '9')) 
3119
	        val = val * 16 + (CUR - '0');
3120
	    else if ((CUR >= 'a') && (CUR <= 'f'))
3121
	        val = val * 16 + (CUR - 'a') + 10;
3122
	    else if ((CUR >= 'A') && (CUR <= 'F'))
3123
	        val = val * 16 + (CUR - 'A') + 10;
3124
	    else {
3125
	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3126
		             "htmlParseCharRef: missing semicolumn\n",
3127
			     NULL, NULL);
3128
		break;
3129
	    }
3130
	    NEXT;
3131
	}
3132
	if (CUR == ';')
3133
	    NEXT;
3134
    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3135
	SKIP(2);
3136
	while (CUR != ';') {
3137
	    if ((CUR >= '0') && (CUR <= '9')) 
3138
	        val = val * 10 + (CUR - '0');
3139
	    else {
3140
	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3141
		             "htmlParseCharRef: missing semicolumn\n",
3142
			     NULL, NULL);
3143
		break;
3144
	    }
3145
	    NEXT;
3146
	}
3147
	if (CUR == ';')
3148
	    NEXT;
3149
    } else {
3150
	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3151
	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3152
    }
3153
    /*
3154
     * Check the value IS_CHAR ...
3155
     */
3156
    if (IS_CHAR(val)) {
3157
        return(val);
3158
    } else {
3159
	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3160
			"htmlParseCharRef: invalid xmlChar value %d\n",
3161
			val);
3162
    }
3163
    return(0);
3164
}
3165
3166
3167
/**
3168
 * htmlParseDocTypeDecl:
3169
 * @ctxt:  an HTML parser context
3170
 *
3171
 * parse a DOCTYPE declaration
3172
 *
3173
 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S? 
3174
 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3175
 */
3176
3177
static void
3178
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3179
    const xmlChar *name;
3180
    xmlChar *ExternalID = NULL;
3181
    xmlChar *URI = NULL;
3182
3183
    /*
3184
     * We know that '<!DOCTYPE' has been detected.
3185
     */
3186
    SKIP(9);
3187
3188
    SKIP_BLANKS;
3189
3190
    /*
3191
     * Parse the DOCTYPE name.
3192
     */
3193
    name = htmlParseName(ctxt);
3194
    if (name == NULL) {
3195
	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3196
	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3197
		     NULL, NULL);
3198
    }
3199
    /*
3200
     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3201
     */
3202
3203
    SKIP_BLANKS;
3204
3205
    /*
3206
     * Check for SystemID and ExternalID
3207
     */
3208
    URI = htmlParseExternalID(ctxt, &ExternalID);
3209
    SKIP_BLANKS;
3210
3211
    /*
3212
     * We should be at the end of the DOCTYPE declaration.
3213
     */
3214
    if (CUR != '>') {
3215
	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3216
	             "DOCTYPE improperly terminated\n", NULL, NULL);
3217
        /* We shouldn't try to resynchronize ... */
3218
    }
3219
    NEXT;
3220
3221
    /*
3222
     * Create or update the document accordingly to the DOCTYPE
3223
     */
3224
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3225
	(!ctxt->disableSAX))
3226
	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3227
3228
    /*
3229
     * Cleanup, since we don't use all those identifiers
3230
     */
3231
    if (URI != NULL) xmlFree(URI);
3232
    if (ExternalID != NULL) xmlFree(ExternalID);
3233
}
3234
3235
/**
3236
 * htmlParseAttribute:
3237
 * @ctxt:  an HTML parser context
3238
 * @value:  a xmlChar ** used to store the value of the attribute
3239
 *
3240
 * parse an attribute
3241
 *
3242
 * [41] Attribute ::= Name Eq AttValue
3243
 *
3244
 * [25] Eq ::= S? '=' S?
3245
 *
3246
 * With namespace:
3247
 *
3248
 * [NS 11] Attribute ::= QName Eq AttValue
3249
 *
3250
 * Also the case QName == xmlns:??? is handled independently as a namespace
3251
 * definition.
3252
 *
3253
 * Returns the attribute name, and the value in *value.
3254
 */
3255
3256
static const xmlChar *
3257
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3258
    const xmlChar *name;
3259
    xmlChar *val = NULL;
3260
3261
    *value = NULL;
3262
    name = htmlParseHTMLName(ctxt);
3263
    if (name == NULL) {
3264
	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3265
	             "error parsing attribute name\n", NULL, NULL);
3266
        return(NULL);
3267
    }
3268
3269
    /*
3270
     * read the value
3271
     */
3272
    SKIP_BLANKS;
3273
    if (CUR == '=') {
3274
        NEXT;
3275
	SKIP_BLANKS;
3276
	val = htmlParseAttValue(ctxt);
3277
    } else if (htmlIsBooleanAttr(name)) {
3278
        /*
3279
	 * assume a minimized attribute
3280
	 */
3281
	val = xmlStrdup(name);
3282
    }
3283
3284
    *value = val;
3285
    return(name);
3286
}
3287
3288
/**
3289
 * htmlCheckEncoding:
3290
 * @ctxt:  an HTML parser context
3291
 * @attvalue: the attribute value
3292
 *
3293
 * Checks an http-equiv attribute from a Meta tag to detect
3294
 * the encoding
3295
 * If a new encoding is detected the parser is switched to decode
3296
 * it and pass UTF8
3297
 */
3298
static void
3299
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3300
    const xmlChar *encoding;
3301
3302
    if ((ctxt == NULL) || (attvalue == NULL))
3303
	return;
3304
3305
    /* do not change encoding */	
3306
    if (ctxt->input->encoding != NULL)
3307
        return;
3308
3309
    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3310
    if (encoding != NULL) {
3311
	encoding += 8;
3312
    } else {
3313
	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3314
	if (encoding != NULL)
3315
	    encoding += 9;
3316
    }
3317
    if (encoding != NULL) {
3318
	xmlCharEncoding enc;
3319
	xmlCharEncodingHandlerPtr handler;
3320
3321
	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3322
3323
	if (ctxt->input->encoding != NULL)
3324
	    xmlFree((xmlChar *) ctxt->input->encoding);
3325
	ctxt->input->encoding = xmlStrdup(encoding);
3326
3327
	enc = xmlParseCharEncoding((const char *) encoding);
3328
	/*
3329
	 * registered set of known encodings
3330
	 */
3331
	if (enc != XML_CHAR_ENCODING_ERROR) {
3332
	    if (((enc == XML_CHAR_ENCODING_UTF16LE) || 
3333
	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3334
		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3335
		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3336
		(ctxt->input->buf != NULL) &&
3337
		(ctxt->input->buf->encoder == NULL)) {
3338
		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3339
		             "htmlCheckEncoding: wrong encoding meta\n",
3340
			     NULL, NULL);
3341
	    } else {
3342
		xmlSwitchEncoding(ctxt, enc);
3343
	    }
3344
	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3345
	} else {
3346
	    /*
3347
	     * fallback for unknown encodings
3348
	     */
3349
	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3350
	    if (handler != NULL) {
3351
		xmlSwitchToEncoding(ctxt, handler);
3352
		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3353
	    } else {
3354
		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3355
	    }
3356
	}
3357
3358
	if ((ctxt->input->buf != NULL) &&
3359
	    (ctxt->input->buf->encoder != NULL) &&
3360
	    (ctxt->input->buf->raw != NULL) &&
3361
	    (ctxt->input->buf->buffer != NULL)) {
3362
	    int nbchars;
3363
	    int processed;
3364
3365
	    /*
3366
	     * convert as much as possible to the parser reading buffer.
3367
	     */
3368
	    processed = ctxt->input->cur - ctxt->input->base;
3369
	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
3370
	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3371
		                       ctxt->input->buf->buffer,
3372
				       ctxt->input->buf->raw);
3373
	    if (nbchars < 0) {
3374
		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3375
		             "htmlCheckEncoding: encoder error\n",
3376
			     NULL, NULL);
3377
	    }
3378
	    ctxt->input->base =
3379
	    ctxt->input->cur = ctxt->input->buf->buffer->content;
3380
	}
3381
    }
3382
}
3383
3384
/**
3385
 * htmlCheckMeta:
3386
 * @ctxt:  an HTML parser context
3387
 * @atts:  the attributes values
3388
 *
3389
 * Checks an attributes from a Meta tag
3390
 */
3391
static void
3392
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3393
    int i;
3394
    const xmlChar *att, *value;
3395
    int http = 0;
3396
    const xmlChar *content = NULL;
3397
3398
    if ((ctxt == NULL) || (atts == NULL))
3399
	return;
3400
3401
    i = 0;
3402
    att = atts[i++];
3403
    while (att != NULL) {
3404
	value = atts[i++];
3405
	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3406
	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3407
	    http = 1;
3408
	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3409
	    content = value;
3410
	att = atts[i++];
3411
    }
3412
    if ((http) && (content != NULL))
3413
	htmlCheckEncoding(ctxt, content);
3414
3415
}
3416
3417
/**
3418
 * htmlParseStartTag:
3419
 * @ctxt:  an HTML parser context
3420
 * 
3421
 * parse a start of tag either for rule element or
3422
 * EmptyElement. In both case we don't parse the tag closing chars.
3423
 *
3424
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3425
 *
3426
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3427
 *
3428
 * With namespace:
3429
 *
3430
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3431
 *
3432
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3433
 *
3434
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3435
 */
3436
3437
static int
3438
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3439
    const xmlChar *name;
3440
    const xmlChar *attname;
3441
    xmlChar *attvalue;
3442
    const xmlChar **atts;
3443
    int nbatts = 0;
3444
    int maxatts;
3445
    int meta = 0;
3446
    int i;
3447
    int discardtag = 0;
3448
3449
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3450
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3451
		     "htmlParseStartTag: context error\n", NULL, NULL);
3452
	return -1;
3453
    }
3454
    if (CUR != '<') return -1;
3455
    NEXT;
3456
3457
    atts = ctxt->atts;
3458
    maxatts = ctxt->maxatts;
3459
3460
    GROW;
3461
    name = htmlParseHTMLName(ctxt);
3462
    if (name == NULL) {
3463
	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3464
	             "htmlParseStartTag: invalid element name\n",
3465
		     NULL, NULL);
3466
	/* Dump the bogus tag like browsers do */
3467
	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3468
	    NEXT;
3469
        return -1;
3470
    }
3471
    if (xmlStrEqual(name, BAD_CAST"meta"))
3472
	meta = 1;
3473
3474
    /*
3475
     * Check for auto-closure of HTML elements.
3476
     */
3477
    htmlAutoClose(ctxt, name);
3478
3479
    /*
3480
     * Check for implied HTML elements.
3481
     */
3482
    htmlCheckImplied(ctxt, name);
3483
3484
    /*
3485
     * Avoid html at any level > 0, head at any level != 1
3486
     * or any attempt to recurse body
3487
     */
3488
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3489
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3490
	             "htmlParseStartTag: misplaced <html> tag\n",
3491
		     name, NULL);
3492
	discardtag = 1;
3493
	ctxt->depth++;
3494
    }
3495
    if ((ctxt->nameNr != 1) && 
3496
	(xmlStrEqual(name, BAD_CAST"head"))) {
3497
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3498
	             "htmlParseStartTag: misplaced <head> tag\n",
3499
		     name, NULL);
3500
	discardtag = 1;
3501
	ctxt->depth++;
3502
    }
3503
    if (xmlStrEqual(name, BAD_CAST"body")) {
3504
	int indx;
3505
	for (indx = 0;indx < ctxt->nameNr;indx++) {
3506
	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3507
		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3508
		             "htmlParseStartTag: misplaced <body> tag\n",
3509
			     name, NULL);
3510
		discardtag = 1;
3511
		ctxt->depth++;
3512
	    }
3513
	}
3514
    }
3515
3516
    /*
3517
     * Now parse the attributes, it ends up with the ending
3518
     *
3519
     * (S Attribute)* S?
3520
     */
3521
    SKIP_BLANKS;
3522
    while ((IS_CHAR_CH(CUR)) &&
3523
           (CUR != '>') && 
3524
	   ((CUR != '/') || (NXT(1) != '>'))) {
3525
	long cons = ctxt->nbChars;
3526
3527
	GROW;
3528
	attname = htmlParseAttribute(ctxt, &attvalue);
3529
        if (attname != NULL) {
3530
3531
	    /*
3532
	     * Well formedness requires at most one declaration of an attribute
3533
	     */
3534
	    for (i = 0; i < nbatts;i += 2) {
3535
	        if (xmlStrEqual(atts[i], attname)) {
3536
		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3537
		                 "Attribute %s redefined\n", attname, NULL);
3538
		    if (attvalue != NULL)
3539
			xmlFree(attvalue);
3540
		    goto failed;
3541
		}
3542
	    }
3543
3544
	    /*
3545
	     * Add the pair to atts
3546
	     */
3547
	    if (atts == NULL) {
3548
	        maxatts = 22; /* allow for 10 attrs by default */
3549
	        atts = (const xmlChar **)
3550
		       xmlMalloc(maxatts * sizeof(xmlChar *));
3551
		if (atts == NULL) {
3552
		    htmlErrMemory(ctxt, NULL);
3553
		    if (attvalue != NULL)
3554
			xmlFree(attvalue);
3555
		    goto failed;
3556
		}
3557
		ctxt->atts = atts;
3558
		ctxt->maxatts = maxatts;
3559
	    } else if (nbatts + 4 > maxatts) {
3560
	        const xmlChar **n;
3561
3562
	        maxatts *= 2;
3563
	        n = (const xmlChar **) xmlRealloc((void *) atts,
3564
					     maxatts * sizeof(const xmlChar *));
3565
		if (n == NULL) {
3566
		    htmlErrMemory(ctxt, NULL);
3567
		    if (attvalue != NULL)
3568
			xmlFree(attvalue);
3569
		    goto failed;
3570
		}
3571
		atts = n;
3572
		ctxt->atts = atts;
3573
		ctxt->maxatts = maxatts;
3574
	    }
3575
	    atts[nbatts++] = attname;
3576
	    atts[nbatts++] = attvalue;
3577
	    atts[nbatts] = NULL;
3578
	    atts[nbatts + 1] = NULL;
3579
	}
3580
	else {
3581
	    if (attvalue != NULL)
3582
	        xmlFree(attvalue);
3583
	    /* Dump the bogus attribute string up to the next blank or
3584
	     * the end of the tag. */
3585
	    while ((IS_CHAR_CH(CUR)) &&
3586
	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3587
		   ((CUR != '/') || (NXT(1) != '>')))
3588
		NEXT;
3589
	}
3590
3591
failed:
3592
	SKIP_BLANKS;
3593
        if (cons == ctxt->nbChars) {
3594
	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3595
	                 "htmlParseStartTag: problem parsing attributes\n",
3596
			 NULL, NULL);
3597
	    break;
3598
	}
3599
    }
3600
3601
    /*
3602
     * Handle specific association to the META tag
3603
     */
3604
    if (meta && (nbatts != 0))
3605
	htmlCheckMeta(ctxt, atts);
3606
3607
    /*
3608
     * SAX: Start of Element !
3609
     */
3610
    if (!discardtag) {
3611
	htmlnamePush(ctxt, name);
3612
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3613
	    if (nbatts != 0)
3614
		ctxt->sax->startElement(ctxt->userData, name, atts);
3615
	    else
3616
		ctxt->sax->startElement(ctxt->userData, name, NULL);
3617
	}
3618
    }
3619
3620
    if (atts != NULL) {
3621
        for (i = 1;i < nbatts;i += 2) {
3622
	    if (atts[i] != NULL)
3623
		xmlFree((xmlChar *) atts[i]);
3624
	}
3625
    }
3626
3627
    return(discardtag);
3628
}
3629
3630
/**
3631
 * htmlParseEndTag:
3632
 * @ctxt:  an HTML parser context
3633
 *
3634
 * parse an end of tag
3635
 *
3636
 * [42] ETag ::= '</' Name S? '>'
3637
 *
3638
 * With namespace
3639
 *
3640
 * [NS 9] ETag ::= '</' QName S? '>'
3641
 *
3642
 * Returns 1 if the current level should be closed.
3643
 */
3644
3645
static int
3646
htmlParseEndTag(htmlParserCtxtPtr ctxt)
3647
{
3648
    const xmlChar *name;
3649
    const xmlChar *oldname;
3650
    int i, ret;
3651
3652
    if ((CUR != '<') || (NXT(1) != '/')) {
3653
        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3654
	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
3655
        return (0);
3656
    }
3657
    SKIP(2);
3658
3659
    name = htmlParseHTMLName(ctxt);
3660
    if (name == NULL)
3661
        return (0);
3662
    /*
3663
     * We should definitely be at the ending "S? '>'" part
3664
     */
3665
    SKIP_BLANKS;
3666
    if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3667
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3668
	             "End tag : expected '>'\n", NULL, NULL);
3669
	if (ctxt->recovery) {
3670
	    /*
3671
	     * We're not at the ending > !!
3672
	     * Error, unless in recover mode where we search forwards
3673
	     * until we find a >
3674
	     */
3675
	    while (CUR != '\0' && CUR != '>') NEXT;
3676
	    NEXT;
3677
	}
3678
    } else
3679
        NEXT;
3680
3681
    /*
3682
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
3683
     * out now.
3684
     */
3685
    if ((ctxt->depth > 0) &&
3686
        (xmlStrEqual(name, BAD_CAST "html") ||
3687
         xmlStrEqual(name, BAD_CAST "body") ||
3688
	 xmlStrEqual(name, BAD_CAST "head"))) {
3689
	ctxt->depth--;
3690
	return (0);
3691
    }
3692
3693
    /*
3694
     * If the name read is not one of the element in the parsing stack
3695
     * then return, it's just an error.
3696
     */
3697
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3698
        if (xmlStrEqual(name, ctxt->nameTab[i]))
3699
            break;
3700
    }
3701
    if (i < 0) {
3702
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3703
	             "Unexpected end tag : %s\n", name, NULL);
3704
        return (0);
3705
    }
3706
3707
3708
    /*
3709
     * Check for auto-closure of HTML elements.
3710
     */
3711
3712
    htmlAutoCloseOnClose(ctxt, name);
3713
3714
    /*
3715
     * Well formedness constraints, opening and closing must match.
3716
     * With the exception that the autoclose may have popped stuff out
3717
     * of the stack.
3718
     */
3719
    if (!xmlStrEqual(name, ctxt->name)) {
3720
        if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3721
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3722
	                 "Opening and ending tag mismatch: %s and %s\n",
3723
			 name, ctxt->name);
3724
        }
3725
    }
3726
3727
    /*
3728
     * SAX: End of Tag
3729
     */
3730
    oldname = ctxt->name;
3731
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3732
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3733
            ctxt->sax->endElement(ctxt->userData, name);
3734
        htmlnamePop(ctxt);
3735
        ret = 1;
3736
    } else {
3737
        ret = 0;
3738
    }
3739
3740
    return (ret);
3741
}
3742
3743
3744
/**
3745
 * htmlParseReference:
3746
 * @ctxt:  an HTML parser context
3747
 * 
3748
 * parse and handle entity references in content,
3749
 * this will end-up in a call to character() since this is either a
3750
 * CharRef, or a predefined entity.
3751
 */
3752
static void
3753
htmlParseReference(htmlParserCtxtPtr ctxt) {
3754
    const htmlEntityDesc * ent;
3755
    xmlChar out[6];
3756
    const xmlChar *name;
3757
    if (CUR != '&') return;
3758
3759
    if (NXT(1) == '#') {
3760
	unsigned int c;
3761
	int bits, i = 0;
3762
3763
	c = htmlParseCharRef(ctxt);
3764
	if (c == 0)
3765
	    return;
3766
3767
        if      (c <    0x80) { out[i++]= c;                bits= -6; }
3768
        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3769
        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3770
        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3771
 
3772
        for ( ; bits >= 0; bits-= 6) {
3773
            out[i++]= ((c >> bits) & 0x3F) | 0x80;
3774
        }
3775
	out[i] = 0;
3776
3777
	htmlCheckParagraph(ctxt);
3778
	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3779
	    ctxt->sax->characters(ctxt->userData, out, i);
3780
    } else {
3781
	ent = htmlParseEntityRef(ctxt, &name);
3782
	if (name == NULL) {
3783
	    htmlCheckParagraph(ctxt);
3784
	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3785
	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3786
	    return;
3787
	}
3788
	if ((ent == NULL) || !(ent->value > 0)) {
3789
	    htmlCheckParagraph(ctxt);
3790
	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3791
		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3792
		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3793
		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3794
	    }
3795
	} else {
3796
	    unsigned int c;
3797
	    int bits, i = 0;
3798
3799
	    c = ent->value;
3800
	    if      (c <    0x80)
3801
	            { out[i++]= c;                bits= -6; }
3802
	    else if (c <   0x800)
3803
	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3804
	    else if (c < 0x10000)
3805
	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3806
	    else                 
3807
	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3808
     
3809
	    for ( ; bits >= 0; bits-= 6) {
3810
		out[i++]= ((c >> bits) & 0x3F) | 0x80;
3811
	    }
3812
	    out[i] = 0;
3813
3814
	    htmlCheckParagraph(ctxt);
3815
	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3816
		ctxt->sax->characters(ctxt->userData, out, i);
3817
	}
3818
    }
3819
}
3820
3821
/**
3822
 * htmlParseContent:
3823
 * @ctxt:  an HTML parser context
3824
 *
3825
 * Parse a content: comment, sub-element, reference or text.
3826
 */
3827
3828
static void
3829
htmlParseContent(htmlParserCtxtPtr ctxt) {
3830
    xmlChar *currentNode;
3831
    int depth;
3832
    const xmlChar *name;
3833
3834
    currentNode = xmlStrdup(ctxt->name);
3835
    depth = ctxt->nameNr;
3836
    while (1) {
3837
	long cons = ctxt->nbChars;
3838
3839
        GROW;
3840
	/*
3841
	 * Our tag or one of it's parent or children is ending.
3842
	 */
3843
        if ((CUR == '<') && (NXT(1) == '/')) {
3844
	    if (htmlParseEndTag(ctxt) &&
3845
		((currentNode != NULL) || (ctxt->nameNr == 0))) {
3846
		if (currentNode != NULL)
3847
		    xmlFree(currentNode);
3848
		return;
3849
	    }
3850
	    continue; /* while */
3851
        }
3852
3853
	else if ((CUR == '<') &&
3854
	         ((IS_ASCII_LETTER(NXT(1))) ||
3855
		  (NXT(1) == '_') || (NXT(1) == ':'))) {
3856
	    name = htmlParseHTMLName_nonInvasive(ctxt);
3857
	    if (name == NULL) {
3858
	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3859
			 "htmlParseStartTag: invalid element name\n",
3860
			 NULL, NULL);
3861
	        /* Dump the bogus tag like browsers do */
3862
 	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3863
	            NEXT;
3864
3865
	        if (currentNode != NULL)
3866
	            xmlFree(currentNode);
3867
	        return;
3868
	    }
3869
3870
	    if (ctxt->name != NULL) {
3871
	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3872
	            htmlAutoClose(ctxt, name);
3873
	            continue;
3874
	        }
3875
	    }	  
3876
	}
3877
3878
	/*
3879
	 * Has this node been popped out during parsing of
3880
	 * the next element
3881
	 */
3882
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3883
	    (!xmlStrEqual(currentNode, ctxt->name)))
3884
	     {
3885
	    if (currentNode != NULL) xmlFree(currentNode);
3886
	    return;
3887
	}
3888
3889
	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3890
	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3891
	    /*
3892
	     * Handle SCRIPT/STYLE separately
3893
	     */
3894
	    htmlParseScript(ctxt);
3895
	} else {
3896
	    /*
3897
	     * Sometimes DOCTYPE arrives in the middle of the document
3898
	     */
3899
	    if ((CUR == '<') && (NXT(1) == '!') &&
3900
		(UPP(2) == 'D') && (UPP(3) == 'O') &&
3901
		(UPP(4) == 'C') && (UPP(5) == 'T') &&
3902
		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
3903
		(UPP(8) == 'E')) {
3904
		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3905
		             "Misplaced DOCTYPE declaration\n",
3906
			     BAD_CAST "DOCTYPE" , NULL);
3907
		htmlParseDocTypeDecl(ctxt);
3908
	    }
3909
3910
	    /*
3911
	     * First case :  a comment
3912
	     */
3913
	    if ((CUR == '<') && (NXT(1) == '!') &&
3914
		(NXT(2) == '-') && (NXT(3) == '-')) {
3915
		htmlParseComment(ctxt);
3916
	    }
3917
3918
	    /*
3919
	     * Second case : a Processing Instruction.
3920
	     */
3921
	    else if ((CUR == '<') && (NXT(1) == '?')) {
3922
		htmlParsePI(ctxt);
3923
	    }
3924
3925
	    /*
3926
	     * Third case :  a sub-element.
3927
	     */
3928
	    else if (CUR == '<') {
3929
		htmlParseElement(ctxt);
3930
	    }
3931
3932
	    /*
3933
	     * Fourth case : a reference. If if has not been resolved,
3934
	     *    parsing returns it's Name, create the node 
3935
	     */
3936
	    else if (CUR == '&') {
3937
		htmlParseReference(ctxt);
3938
	    }
3939
3940
	    /*
3941
	     * Fifth case : end of the resource
3942
	     */
3943
	    else if (CUR == 0) {
3944
		htmlAutoCloseOnEnd(ctxt);
3945
		break;
3946
	    }
3947
3948
	    /*
3949
	     * Last case, text. Note that References are handled directly.
3950
	     */
3951
	    else {
3952
		htmlParseCharData(ctxt);
3953
	    }
3954
3955
	    if (cons == ctxt->nbChars) {
3956
		if (ctxt->node != NULL) {
3957
		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3958
		                 "detected an error in element content\n",
3959
				 NULL, NULL);
3960
		}
3961
		break;
3962
	    }
3963
	}
3964
        GROW;
3965
    }
3966
    if (currentNode != NULL) xmlFree(currentNode);
3967
}
3968
3969
/**
3970
 * htmlParseContent:
3971
 * @ctxt:  an HTML parser context
3972
 *
3973
 * Parse a content: comment, sub-element, reference or text.
3974
 */
3975
3976
void
3977
__htmlParseContent(void *ctxt) {
3978
    if (ctxt != NULL)
3979
	htmlParseContent((htmlParserCtxtPtr) ctxt);
3980
}
3981
3982
/**
3983
 * htmlParseElement:
3984
 * @ctxt:  an HTML parser context
3985
 *
3986
 * parse an HTML element, this is highly recursive
3987
 *
3988
 * [39] element ::= EmptyElemTag | STag content ETag
3989
 *
3990
 * [41] Attribute ::= Name Eq AttValue
3991
 */
3992
3993
void
3994
htmlParseElement(htmlParserCtxtPtr ctxt) {
3995
    const xmlChar *name;
3996
    xmlChar *currentNode = NULL;
3997
    const htmlElemDesc * info;
3998
    htmlParserNodeInfo node_info;
3999
    int failed;
4000
    int depth;
4001
    const xmlChar *oldptr;
4002
4003
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4004
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4005
		     "htmlParseElement: context error\n", NULL, NULL);
4006
	return;
4007
    }
4008
    /* Capture start position */
4009
    if (ctxt->record_info) {
4010
        node_info.begin_pos = ctxt->input->consumed +
4011
                          (CUR_PTR - ctxt->input->base);
4012
	node_info.begin_line = ctxt->input->line;
4013
    }
4014
4015
    failed = htmlParseStartTag(ctxt);
4016
    name = ctxt->name;
4017
    if ((failed == -1) || (name == NULL)) {
4018
	if (CUR == '>')
4019
	    NEXT;
4020
        return;
4021
    }
4022
4023
    /*
4024
     * Lookup the info for that element.
4025
     */
4026
    info = htmlTagLookup(name);
4027
    if (info == NULL) {
4028
	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4029
	             "Tag %s invalid\n", name, NULL);
4030
    }
4031
4032
    /*
4033
     * Check for an Empty Element labeled the XML/SGML way
4034
     */
4035
    if ((CUR == '/') && (NXT(1) == '>')) {
4036
        SKIP(2);
4037
	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4038
	    ctxt->sax->endElement(ctxt->userData, name);
4039
	htmlnamePop(ctxt);
4040
	return;
4041
    }
4042
4043
    if (CUR == '>') {
4044
        NEXT;
4045
    } else {
4046
	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4047
	             "Couldn't find end of Start Tag %s\n", name, NULL);
4048
4049
	/*
4050
	 * end of parsing of this node.
4051
	 */
4052
	if (xmlStrEqual(name, ctxt->name)) { 
4053
	    nodePop(ctxt);
4054
	    htmlnamePop(ctxt);
4055
	}    
4056
4057
	/*
4058
	 * Capture end position and add node
4059
	 */
4060
	if (ctxt->record_info) {
4061
	   node_info.end_pos = ctxt->input->consumed +
4062
			      (CUR_PTR - ctxt->input->base);
4063
	   node_info.end_line = ctxt->input->line;
4064
	   node_info.node = ctxt->node;
4065
	   xmlParserAddNodeInfo(ctxt, &node_info);
4066
	}
4067
	return;
4068
    }
4069
4070
    /*
4071
     * Check for an Empty Element from DTD definition
4072
     */
4073
    if ((info != NULL) && (info->empty)) {
4074
	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4075
	    ctxt->sax->endElement(ctxt->userData, name);
4076
	htmlnamePop(ctxt);
4077
	return;
4078
    }
4079
4080
    /*
4081
     * Parse the content of the element:
4082
     */
4083
    currentNode = xmlStrdup(ctxt->name);
4084
    depth = ctxt->nameNr;
4085
    while (IS_CHAR_CH(CUR)) {
4086
	oldptr = ctxt->input->cur;
4087
	htmlParseContent(ctxt);
4088
	if (oldptr==ctxt->input->cur) break;
4089
	if (ctxt->nameNr < depth) break; 
4090
    }	
4091
4092
    /*
4093
     * Capture end position and add node
4094
     */
4095
    if ( currentNode != NULL && ctxt->record_info ) {
4096
       node_info.end_pos = ctxt->input->consumed +
4097
                          (CUR_PTR - ctxt->input->base);
4098
       node_info.end_line = ctxt->input->line;
4099
       node_info.node = ctxt->node;
4100
       xmlParserAddNodeInfo(ctxt, &node_info);
4101
    }
4102
    if (!IS_CHAR_CH(CUR)) {
4103
	htmlAutoCloseOnEnd(ctxt);
4104
    }
4105
4106
    if (currentNode != NULL)
4107
	xmlFree(currentNode);
4108
}
4109
4110
/**
4111
 * htmlParseDocument:
4112
 * @ctxt:  an HTML parser context
4113
 * 
4114
 * parse an HTML document (and build a tree if using the standard SAX
4115
 * interface).
4116
 *
4117
 * Returns 0, -1 in case of error. the parser context is augmented
4118
 *                as a result of the parsing.
4119
 */
4120
4121
int
4122
htmlParseDocument(htmlParserCtxtPtr ctxt) {
4123
    xmlChar start[4];
4124
    xmlCharEncoding enc;
4125
    xmlDtdPtr dtd;
4126
4127
    xmlInitParser();
4128
4129
    htmlDefaultSAXHandlerInit();
4130
4131
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4132
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4133
		     "htmlParseDocument: context error\n", NULL, NULL);
4134
	return(XML_ERR_INTERNAL_ERROR);
4135
    }
4136
    ctxt->html = 1;
4137
    GROW;
4138
    /*
4139
     * SAX: beginning of the document processing.
4140
     */
4141
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4142
        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4143
4144
    if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4145
        ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4146
	/*
4147
	 * Get the 4 first bytes and decode the charset
4148
	 * if enc != XML_CHAR_ENCODING_NONE
4149
	 * plug some encoding conversion routines.
4150
	 */
4151
	start[0] = RAW;
4152
	start[1] = NXT(1);
4153
	start[2] = NXT(2);
4154
	start[3] = NXT(3);
4155
	enc = xmlDetectCharEncoding(&start[0], 4);
4156
	if (enc != XML_CHAR_ENCODING_NONE) {
4157
	    xmlSwitchEncoding(ctxt, enc);
4158
	}
4159
    }
4160
4161
    /*
4162
     * Wipe out everything which is before the first '<'
4163
     */
4164
    SKIP_BLANKS;
4165
    if (CUR == 0) {
4166
	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY, 
4167
	             "Document is empty\n", NULL, NULL);
4168
    }
4169
4170
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4171
	ctxt->sax->startDocument(ctxt->userData);
4172
4173
4174
    /*
4175
     * Parse possible comments and PIs before any content
4176
     */
4177
    while (((CUR == '<') && (NXT(1) == '!') &&
4178
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4179
	   ((CUR == '<') && (NXT(1) == '?'))) {
4180
        htmlParseComment(ctxt);
4181
        htmlParsePI(ctxt);
4182
	SKIP_BLANKS;
4183
    }
4184
4185
4186
    /*
4187
     * Then possibly doc type declaration(s) and more Misc
4188
     * (doctypedecl Misc*)?
4189
     */
4190
    if ((CUR == '<') && (NXT(1) == '!') &&
4191
	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4192
	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4193
	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4194
	(UPP(8) == 'E')) {
4195
	htmlParseDocTypeDecl(ctxt);
4196
    }
4197
    SKIP_BLANKS;
4198
4199
    /*
4200
     * Parse possible comments and PIs before any content
4201
     */
4202
    while (((CUR == '<') && (NXT(1) == '!') &&
4203
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4204
	   ((CUR == '<') && (NXT(1) == '?'))) {
4205
        htmlParseComment(ctxt);	   
4206
        htmlParsePI(ctxt);	   
4207
	SKIP_BLANKS;
4208
    }	   
4209
4210
    /*
4211
     * Time to start parsing the tree itself
4212
     */
4213
    htmlParseContent(ctxt);
4214
4215
    /*
4216
     * autoclose
4217
     */
4218
    if (CUR == 0)
4219
	htmlAutoCloseOnEnd(ctxt);
4220
4221
4222
    /*
4223
     * SAX: end of the document processing.
4224
     */
4225
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4226
        ctxt->sax->endDocument(ctxt->userData);
4227
4228
    if (ctxt->myDoc != NULL) {
4229
	dtd = xmlGetIntSubset(ctxt->myDoc);
4230
	if (dtd == NULL)
4231
	    ctxt->myDoc->intSubset = 
4232
		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 
4233
		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4234
		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4235
    }
4236
    if (! ctxt->wellFormed) return(-1);
4237
    return(0);
4238
}
4239
4240
4241
/************************************************************************
4242
 *									*
4243
 *			Parser contexts handling			*
4244
 *									*
4245
 ************************************************************************/
4246
4247
/**
4248
 * htmlInitParserCtxt:
4249
 * @ctxt:  an HTML parser context
4250
 *
4251
 * Initialize a parser context
4252
 *
4253
 * Returns 0 in case of success and -1 in case of error
4254
 */
4255
4256
static int
4257
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4258
{
4259
    htmlSAXHandler *sax;
4260
4261
    if (ctxt == NULL) return(-1);
4262
    memset(ctxt, 0, sizeof(htmlParserCtxt));
4263
4264
    ctxt->dict = xmlDictCreate();
4265
    if (ctxt->dict == NULL) {
4266
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4267
	return(-1);
4268
    }
4269
    sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4270
    if (sax == NULL) {
4271
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4272
	return(-1);
4273
    }
4274
    else
4275
        memset(sax, 0, sizeof(htmlSAXHandler));
4276
4277
    /* Allocate the Input stack */
4278
    ctxt->inputTab = (htmlParserInputPtr *) 
4279
                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
4280
    if (ctxt->inputTab == NULL) {
4281
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4282
	ctxt->inputNr = 0;
4283
	ctxt->inputMax = 0;
4284
	ctxt->input = NULL;
4285
	return(-1);
4286
    }
4287
    ctxt->inputNr = 0;
4288
    ctxt->inputMax = 5;
4289
    ctxt->input = NULL;
4290
    ctxt->version = NULL;
4291
    ctxt->encoding = NULL;
4292
    ctxt->standalone = -1;
4293
    ctxt->instate = XML_PARSER_START;
4294
4295
    /* Allocate the Node stack */
4296
    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4297
    if (ctxt->nodeTab == NULL) {
4298
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4299
	ctxt->nodeNr = 0;
4300
	ctxt->nodeMax = 0;
4301
	ctxt->node = NULL;
4302
	ctxt->inputNr = 0;
4303
	ctxt->inputMax = 0;
4304
	ctxt->input = NULL;
4305
	return(-1);
4306
    }
4307
    ctxt->nodeNr = 0;
4308
    ctxt->nodeMax = 10;
4309
    ctxt->node = NULL;
4310
4311
    /* Allocate the Name stack */
4312
    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4313
    if (ctxt->nameTab == NULL) {
4314
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4315
	ctxt->nameNr = 0;
4316
	ctxt->nameMax = 10;
4317
	ctxt->name = NULL;
4318
	ctxt->nodeNr = 0;
4319
	ctxt->nodeMax = 0;
4320
	ctxt->node = NULL;
4321
	ctxt->inputNr = 0;
4322
	ctxt->inputMax = 0;
4323
	ctxt->input = NULL;
4324
	return(-1);
4325
    }
4326
    ctxt->nameNr = 0;
4327
    ctxt->nameMax = 10;
4328
    ctxt->name = NULL;
4329
4330
    if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4331
    else {
4332
        ctxt->sax = sax;
4333
	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4334
    }
4335
    ctxt->userData = ctxt;
4336
    ctxt->myDoc = NULL;
4337
    ctxt->wellFormed = 1;
4338
    ctxt->replaceEntities = 0;
4339
    ctxt->linenumbers = xmlLineNumbersDefaultValue;
4340
    ctxt->html = 1;
4341
    ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4342
    ctxt->vctxt.userData = ctxt;
4343
    ctxt->vctxt.error = xmlParserValidityError;
4344
    ctxt->vctxt.warning = xmlParserValidityWarning;
4345
    ctxt->record_info = 0;
4346
    ctxt->validate = 0;
4347
    ctxt->nbChars = 0;
4348
    ctxt->checkIndex = 0;
4349
    ctxt->catalogs = NULL;
4350
    xmlInitNodeInfoSeq(&ctxt->node_seq);
4351
    return(0);
4352
}
4353
4354
/**
4355
 * htmlFreeParserCtxt:
4356
 * @ctxt:  an HTML parser context
4357
 *
4358
 * Free all the memory used by a parser context. However the parsed
4359
 * document in ctxt->myDoc is not freed.
4360
 */
4361
4362
void
4363
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4364
{
4365
    xmlFreeParserCtxt(ctxt);
4366
}
4367
4368
/**
4369
 * htmlNewParserCtxt:
4370
 *
4371
 * Allocate and initialize a new parser context.
4372
 *
4373
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4374
 */
4375
4376
htmlParserCtxtPtr
4377
htmlNewParserCtxt(void)
4378
{
4379
    xmlParserCtxtPtr ctxt;
4380
4381
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4382
    if (ctxt == NULL) {
4383
        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4384
	return(NULL);
4385
    }
4386
    memset(ctxt, 0, sizeof(xmlParserCtxt));
4387
    if (htmlInitParserCtxt(ctxt) < 0) {
4388
        htmlFreeParserCtxt(ctxt);
4389
	return(NULL);
4390
    }
4391
    return(ctxt);
4392
}
4393
4394
/**
4395
 * htmlCreateMemoryParserCtxt:
4396
 * @buffer:  a pointer to a char array
4397
 * @size:  the size of the array
4398
 *
4399
 * Create a parser context for an HTML in-memory document.
4400
 *
4401
 * Returns the new parser context or NULL
4402
 */
4403
htmlParserCtxtPtr
4404
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4405
    xmlParserCtxtPtr ctxt;
4406
    xmlParserInputPtr input;
4407
    xmlParserInputBufferPtr buf;
4408
4409
    if (buffer == NULL)
4410
	return(NULL);
4411
    if (size <= 0)
4412
	return(NULL);
4413
4414
    ctxt = htmlNewParserCtxt();
4415
    if (ctxt == NULL)
4416
	return(NULL);
4417
4418
    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4419
    if (buf == NULL) return(NULL);
4420
4421
    input = xmlNewInputStream(ctxt);
4422
    if (input == NULL) {
4423
	xmlFreeParserCtxt(ctxt);
4424
	return(NULL);
4425
    }
4426
4427
    input->filename = NULL;
4428
    input->buf = buf;
4429
    input->base = input->buf->buffer->content;
4430
    input->cur = input->buf->buffer->content;
4431
    input->end = &input->buf->buffer->content[input->buf->buffer->use];
4432
4433
    inputPush(ctxt, input);
4434
    return(ctxt);
4435
}
4436
4437
/**
4438
 * htmlCreateDocParserCtxt:
4439
 * @cur:  a pointer to an array of xmlChar
4440
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
4441
 *
4442
 * Create a parser context for an HTML document.
4443
 *
4444
 * TODO: check the need to add encoding handling there
4445
 *
4446
 * Returns the new parser context or NULL
4447
 */
4448
static htmlParserCtxtPtr
4449
htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
4450
    int len;
4451
    htmlParserCtxtPtr ctxt;
4452
4453
    if (cur == NULL)
4454
	return(NULL);
4455
    len = xmlStrlen(cur);
4456
    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4457
    if (ctxt == NULL)
4458
	return(NULL);
4459
4460
    if (encoding != NULL) {
4461
	xmlCharEncoding enc;
4462
	xmlCharEncodingHandlerPtr handler;
4463
4464
	if (ctxt->input->encoding != NULL)
4465
	    xmlFree((xmlChar *) ctxt->input->encoding);
4466
	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4467
4468
	enc = xmlParseCharEncoding(encoding);
4469
	/*
4470
	 * registered set of known encodings
4471
	 */
4472
	if (enc != XML_CHAR_ENCODING_ERROR) {
4473
	    xmlSwitchEncoding(ctxt, enc);
4474
	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4475
		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4476
		             "Unsupported encoding %s\n", 
4477
			     (const xmlChar *) encoding, NULL);
4478
	    }
4479
	} else {
4480
	    /*
4481
	     * fallback for unknown encodings
4482
	     */
4483
	    handler = xmlFindCharEncodingHandler((const char *) encoding);
4484
	    if (handler != NULL) {
4485
		xmlSwitchToEncoding(ctxt, handler);
4486
	    } else {
4487
		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4488
		             "Unsupported encoding %s\n",
4489
			     (const xmlChar *) encoding, NULL);
4490
	    }
4491
	}
4492
    }
4493
    return(ctxt);
4494
}
4495
4496
#ifdef LIBXML_PUSH_ENABLED
4497
/************************************************************************
4498
 *									*
4499
 * 		Progressive parsing interfaces				*
4500
 *									*
4501
 ************************************************************************/
4502
4503
/**
4504
 * htmlParseLookupSequence:
4505
 * @ctxt:  an HTML parser context
4506
 * @first:  the first char to lookup
4507
 * @next:  the next char to lookup or zero
4508
 * @third:  the next char to lookup or zero
4509
 * @comment: flag to force checking inside comments
4510
 *
4511
 * Try to find if a sequence (first, next, third) or  just (first next) or
4512
 * (first) is available in the input stream.
4513
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4514
 * to avoid rescanning sequences of bytes, it DOES change the state of the
4515
 * parser, do not use liberally.
4516
 * This is basically similar to xmlParseLookupSequence()
4517
 *
4518
 * Returns the index to the current parsing point if the full sequence
4519
 *      is available, -1 otherwise.
4520
 */
4521
static int
4522
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4523
                        xmlChar next, xmlChar third, int iscomment) {
4524
    int base, len;
4525
    htmlParserInputPtr in;
4526
    const xmlChar *buf;
4527
    int incomment = 0;
4528
4529
    in = ctxt->input;
4530
    if (in == NULL) return(-1);
4531
    base = in->cur - in->base;
4532
    if (base < 0) return(-1);
4533
    if (ctxt->checkIndex > base)
4534
        base = ctxt->checkIndex;
4535
    if (in->buf == NULL) {
4536
	buf = in->base;
4537
	len = in->length;
4538
    } else {
4539
	buf = in->buf->buffer->content;
4540
	len = in->buf->buffer->use;
4541
    }
4542
    /* take into account the sequence length */
4543
    if (third) len -= 2;
4544
    else if (next) len --;
4545
    for (;base < len;base++) {
4546
	if (!incomment && (base + 4 < len) && !iscomment) {
4547
	    if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4548
		(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4549
		incomment = 1;
4550
		/* do not increment past <! - some people use <!--> */
4551
		base += 2;
4552
	    }
4553
	}
4554
	if (incomment) {
4555
	    if (base + 3 > len)
4556
		return(-1);
4557
	    if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4558
		(buf[base + 2] == '>')) {
4559
		incomment = 0;
4560
		base += 2;
4561
	    }
4562
	    continue;
4563
	}
4564
        if (buf[base] == first) {
4565
	    if (third != 0) {
4566
		if ((buf[base + 1] != next) ||
4567
		    (buf[base + 2] != third)) continue;
4568
	    } else if (next != 0) {
4569
		if (buf[base + 1] != next) continue;
4570
	    }
4571
	    ctxt->checkIndex = 0;
4572
#ifdef DEBUG_PUSH
4573
	    if (next == 0)
4574
		xmlGenericError(xmlGenericErrorContext,
4575
			"HPP: lookup '%c' found at %d\n",
4576
			first, base);
4577
	    else if (third == 0)
4578
		xmlGenericError(xmlGenericErrorContext,
4579
			"HPP: lookup '%c%c' found at %d\n",
4580
			first, next, base);
4581
	    else 
4582
		xmlGenericError(xmlGenericErrorContext,
4583
			"HPP: lookup '%c%c%c' found at %d\n",
4584
			first, next, third, base);
4585
#endif
4586
	    return(base - (in->cur - in->base));
4587
	}
4588
    }
4589
    ctxt->checkIndex = base;
4590
#ifdef DEBUG_PUSH
4591
    if (next == 0)
4592
	xmlGenericError(xmlGenericErrorContext,
4593
		"HPP: lookup '%c' failed\n", first);
4594
    else if (third == 0)
4595
	xmlGenericError(xmlGenericErrorContext,
4596
		"HPP: lookup '%c%c' failed\n", first, next);
4597
    else	
4598
	xmlGenericError(xmlGenericErrorContext,
4599
		"HPP: lookup '%c%c%c' failed\n", first, next, third);
4600
#endif
4601
    return(-1);
4602
}
4603
4604
/**
4605
 * htmlParseTryOrFinish:
4606
 * @ctxt:  an HTML parser context
4607
 * @terminate:  last chunk indicator
4608
 *
4609
 * Try to progress on parsing
4610
 *
4611
 * Returns zero if no parsing was possible
4612
 */
4613
static int
4614
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4615
    int ret = 0;
4616
    htmlParserInputPtr in;
4617
    int avail = 0;
4618
    xmlChar cur, next;
4619
4620
#ifdef DEBUG_PUSH
4621
    switch (ctxt->instate) {
4622
	case XML_PARSER_EOF:
4623
	    xmlGenericError(xmlGenericErrorContext,
4624
		    "HPP: try EOF\n"); break;
4625
	case XML_PARSER_START:
4626
	    xmlGenericError(xmlGenericErrorContext,
4627
		    "HPP: try START\n"); break;
4628
	case XML_PARSER_MISC:
4629
	    xmlGenericError(xmlGenericErrorContext,
4630
		    "HPP: try MISC\n");break;
4631
	case XML_PARSER_COMMENT:
4632
	    xmlGenericError(xmlGenericErrorContext,
4633
		    "HPP: try COMMENT\n");break;
4634
	case XML_PARSER_PROLOG:
4635
	    xmlGenericError(xmlGenericErrorContext,
4636
		    "HPP: try PROLOG\n");break;
4637
	case XML_PARSER_START_TAG:
4638
	    xmlGenericError(xmlGenericErrorContext,
4639
		    "HPP: try START_TAG\n");break;
4640
	case XML_PARSER_CONTENT:
4641
	    xmlGenericError(xmlGenericErrorContext,
4642
		    "HPP: try CONTENT\n");break;
4643
	case XML_PARSER_CDATA_SECTION:
4644
	    xmlGenericError(xmlGenericErrorContext,
4645
		    "HPP: try CDATA_SECTION\n");break;
4646
	case XML_PARSER_END_TAG:
4647
	    xmlGenericError(xmlGenericErrorContext,
4648
		    "HPP: try END_TAG\n");break;
4649
	case XML_PARSER_ENTITY_DECL:
4650
	    xmlGenericError(xmlGenericErrorContext,
4651
		    "HPP: try ENTITY_DECL\n");break;
4652
	case XML_PARSER_ENTITY_VALUE:
4653
	    xmlGenericError(xmlGenericErrorContext,
4654
		    "HPP: try ENTITY_VALUE\n");break;
4655
	case XML_PARSER_ATTRIBUTE_VALUE:
4656
	    xmlGenericError(xmlGenericErrorContext,
4657
		    "HPP: try ATTRIBUTE_VALUE\n");break;
4658
	case XML_PARSER_DTD:
4659
	    xmlGenericError(xmlGenericErrorContext,
4660
		    "HPP: try DTD\n");break;
4661
	case XML_PARSER_EPILOG:
4662
	    xmlGenericError(xmlGenericErrorContext,
4663
		    "HPP: try EPILOG\n");break;
4664
	case XML_PARSER_PI:
4665
	    xmlGenericError(xmlGenericErrorContext,
4666
		    "HPP: try PI\n");break;
4667
	case XML_PARSER_SYSTEM_LITERAL:
4668
	    xmlGenericError(xmlGenericErrorContext,
4669
		    "HPP: try SYSTEM_LITERAL\n");break;
4670
    }
4671
#endif
4672
4673
    while (1) {
4674
4675
	in = ctxt->input;
4676
	if (in == NULL) break;
4677
	if (in->buf == NULL)
4678
	    avail = in->length - (in->cur - in->base);
4679
	else
4680
	    avail = in->buf->buffer->use - (in->cur - in->base);
4681
	if ((avail == 0) && (terminate)) {
4682
	    htmlAutoCloseOnEnd(ctxt);
4683
	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 
4684
		/*
4685
		 * SAX: end of the document processing.
4686
		 */
4687
		ctxt->instate = XML_PARSER_EOF;
4688
		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4689
		    ctxt->sax->endDocument(ctxt->userData);
4690
	    }
4691
	}
4692
        if (avail < 1)
4693
	    goto done;
4694
	cur = in->cur[0];
4695
	if (cur == 0) {
4696
	    SKIP(1);
4697
	    continue;
4698
	}
4699
4700
        switch (ctxt->instate) {
4701
            case XML_PARSER_EOF:
4702
	        /*
4703
		 * Document parsing is done !
4704
		 */
4705
	        goto done;
4706
            case XML_PARSER_START:
4707
	        /*
4708
		 * Very first chars read from the document flow.
4709
		 */
4710
		cur = in->cur[0];
4711
		if (IS_BLANK_CH(cur)) {
4712
		    SKIP_BLANKS;
4713
		    if (in->buf == NULL)
4714
			avail = in->length - (in->cur - in->base);
4715
		    else
4716
			avail = in->buf->buffer->use - (in->cur - in->base);
4717
		}
4718
		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4719
		    ctxt->sax->setDocumentLocator(ctxt->userData,
4720
						  &xmlDefaultSAXLocator);
4721
		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4722
	            (!ctxt->disableSAX))
4723
		    ctxt->sax->startDocument(ctxt->userData);
4724
4725
		cur = in->cur[0];
4726
		next = in->cur[1];
4727
		if ((cur == '<') && (next == '!') &&
4728
		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4729
		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4730
		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4731
		    (UPP(8) == 'E')) {
4732
		    if ((!terminate) &&
4733
		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4734
			goto done;
4735
#ifdef DEBUG_PUSH
4736
		    xmlGenericError(xmlGenericErrorContext,
4737
			    "HPP: Parsing internal subset\n");
4738
#endif
4739
		    htmlParseDocTypeDecl(ctxt);
4740
		    ctxt->instate = XML_PARSER_PROLOG;
4741
#ifdef DEBUG_PUSH
4742
		    xmlGenericError(xmlGenericErrorContext,
4743
			    "HPP: entering PROLOG\n");
4744
#endif
4745
                } else {
4746
		    ctxt->instate = XML_PARSER_MISC;
4747
#ifdef DEBUG_PUSH
4748
		    xmlGenericError(xmlGenericErrorContext,
4749
			    "HPP: entering MISC\n");
4750
#endif
4751
		}
4752
		break;
4753
            case XML_PARSER_MISC:
4754
		SKIP_BLANKS;
4755
		if (in->buf == NULL)
4756
		    avail = in->length - (in->cur - in->base);
4757
		else
4758
		    avail = in->buf->buffer->use - (in->cur - in->base);
4759
		if (avail < 2)
4760
		    goto done;
4761
		cur = in->cur[0];
4762
		next = in->cur[1];
4763
	        if ((cur == '<') && (next == '!') &&
4764
		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4765
		    if ((!terminate) &&
4766
		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4767
			goto done;
4768
#ifdef DEBUG_PUSH
4769
		    xmlGenericError(xmlGenericErrorContext,
4770
			    "HPP: Parsing Comment\n");
4771
#endif
4772
		    htmlParseComment(ctxt);
4773
		    ctxt->instate = XML_PARSER_MISC;
4774
	        } else if ((cur == '<') && (next == '?')) {
4775
		    if ((!terminate) &&
4776
		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4777
			goto done;
4778
#ifdef DEBUG_PUSH
4779
		    xmlGenericError(xmlGenericErrorContext,
4780
			    "HPP: Parsing PI\n");
4781
#endif
4782
		    htmlParsePI(ctxt);
4783
		    ctxt->instate = XML_PARSER_MISC;
4784
		} else if ((cur == '<') && (next == '!') &&
4785
		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4786
		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4787
		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4788
		    (UPP(8) == 'E')) {
4789
		    if ((!terminate) &&
4790
		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4791
			goto done;
4792
#ifdef DEBUG_PUSH
4793
		    xmlGenericError(xmlGenericErrorContext,
4794
			    "HPP: Parsing internal subset\n");
4795
#endif
4796
		    htmlParseDocTypeDecl(ctxt);
4797
		    ctxt->instate = XML_PARSER_PROLOG;
4798
#ifdef DEBUG_PUSH
4799
		    xmlGenericError(xmlGenericErrorContext,
4800
			    "HPP: entering PROLOG\n");
4801
#endif
4802
		} else if ((cur == '<') && (next == '!') &&
4803
		           (avail < 9)) {
4804
		    goto done;
4805
		} else {
4806
		    ctxt->instate = XML_PARSER_START_TAG;
4807
#ifdef DEBUG_PUSH
4808
		    xmlGenericError(xmlGenericErrorContext,
4809
			    "HPP: entering START_TAG\n");
4810
#endif
4811
		}
4812
		break;
4813
            case XML_PARSER_PROLOG:
4814
		SKIP_BLANKS;
4815
		if (in->buf == NULL)
4816
		    avail = in->length - (in->cur - in->base);
4817
		else
4818
		    avail = in->buf->buffer->use - (in->cur - in->base);
4819
		if (avail < 2) 
4820
		    goto done;
4821
		cur = in->cur[0];
4822
		next = in->cur[1];
4823
		if ((cur == '<') && (next == '!') &&
4824
		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4825
		    if ((!terminate) &&
4826
		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4827
			goto done;
4828
#ifdef DEBUG_PUSH
4829
		    xmlGenericError(xmlGenericErrorContext,
4830
			    "HPP: Parsing Comment\n");
4831
#endif
4832
		    htmlParseComment(ctxt);
4833
		    ctxt->instate = XML_PARSER_PROLOG;
4834
	        } else if ((cur == '<') && (next == '?')) {
4835
		    if ((!terminate) &&
4836
		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4837
			goto done;
4838
#ifdef DEBUG_PUSH
4839
		    xmlGenericError(xmlGenericErrorContext,
4840
			    "HPP: Parsing PI\n");
4841
#endif
4842
		    htmlParsePI(ctxt);
4843
		    ctxt->instate = XML_PARSER_PROLOG;
4844
		} else if ((cur == '<') && (next == '!') &&
4845
		           (avail < 4)) {
4846
		    goto done;
4847
		} else {
4848
		    ctxt->instate = XML_PARSER_START_TAG;
4849
#ifdef DEBUG_PUSH
4850
		    xmlGenericError(xmlGenericErrorContext,
4851
			    "HPP: entering START_TAG\n");
4852
#endif
4853
		}
4854
		break;
4855
            case XML_PARSER_EPILOG:
4856
		if (in->buf == NULL)
4857
		    avail = in->length - (in->cur - in->base);
4858
		else
4859
		    avail = in->buf->buffer->use - (in->cur - in->base);
4860
		if (avail < 1)
4861
		    goto done;
4862
		cur = in->cur[0];
4863
		if (IS_BLANK_CH(cur)) {
4864
		    htmlParseCharData(ctxt);
4865
		    goto done;
4866
		}
4867
		if (avail < 2)
4868
		    goto done;
4869
		next = in->cur[1];
4870
	        if ((cur == '<') && (next == '!') &&
4871
		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4872
		    if ((!terminate) &&
4873
		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4874
			goto done;
4875
#ifdef DEBUG_PUSH
4876
		    xmlGenericError(xmlGenericErrorContext,
4877
			    "HPP: Parsing Comment\n");
4878
#endif
4879
		    htmlParseComment(ctxt);
4880
		    ctxt->instate = XML_PARSER_EPILOG;
4881
	        } else if ((cur == '<') && (next == '?')) {
4882
		    if ((!terminate) &&
4883
		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4884
			goto done;
4885
#ifdef DEBUG_PUSH
4886
		    xmlGenericError(xmlGenericErrorContext,
4887
			    "HPP: Parsing PI\n");
4888
#endif
4889
		    htmlParsePI(ctxt);
4890
		    ctxt->instate = XML_PARSER_EPILOG;
4891
		} else if ((cur == '<') && (next == '!') &&
4892
		           (avail < 4)) {
4893
		    goto done;
4894
		} else {
4895
		    ctxt->errNo = XML_ERR_DOCUMENT_END;
4896
		    ctxt->wellFormed = 0;
4897
		    ctxt->instate = XML_PARSER_EOF;
4898
#ifdef DEBUG_PUSH
4899
		    xmlGenericError(xmlGenericErrorContext,
4900
			    "HPP: entering EOF\n");
4901
#endif
4902
		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4903
			ctxt->sax->endDocument(ctxt->userData);
4904
		    goto done;
4905
		}
4906
		break;
4907
            case XML_PARSER_START_TAG: {
4908
	        const xmlChar *name;
4909
		int failed;
4910
		const htmlElemDesc * info;
4911
4912
		if (avail < 2)
4913
		    goto done;
4914
		cur = in->cur[0];
4915
	        if (cur != '<') {
4916
		    ctxt->instate = XML_PARSER_CONTENT;
4917
#ifdef DEBUG_PUSH
4918
		    xmlGenericError(xmlGenericErrorContext,
4919
			    "HPP: entering CONTENT\n");
4920
#endif
4921
		    break;
4922
		}
4923
		if (in->cur[1] == '/') {
4924
		    ctxt->instate = XML_PARSER_END_TAG;
4925
		    ctxt->checkIndex = 0;
4926
#ifdef DEBUG_PUSH
4927
		    xmlGenericError(xmlGenericErrorContext,
4928
			    "HPP: entering END_TAG\n");
4929
#endif
4930
		    break;
4931
		}
4932
		if ((!terminate) &&
4933
		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4934
		    goto done;
4935
4936
		failed = htmlParseStartTag(ctxt);
4937
		name = ctxt->name;
4938
		if ((failed == -1) ||
4939
		    (name == NULL)) {
4940
		    if (CUR == '>')
4941
			NEXT;
4942
		    break;
4943
		}
4944
4945
		/*
4946
		 * Lookup the info for that element.
4947
		 */
4948
		info = htmlTagLookup(name);
4949
		if (info == NULL) {
4950
		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4951
		                 "Tag %s invalid\n", name, NULL);
4952
		}
4953
4954
		/*
4955
		 * Check for an Empty Element labeled the XML/SGML way
4956
		 */
4957
		if ((CUR == '/') && (NXT(1) == '>')) {
4958
		    SKIP(2);
4959
		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4960
			ctxt->sax->endElement(ctxt->userData, name);
4961
		    htmlnamePop(ctxt);
4962
		    ctxt->instate = XML_PARSER_CONTENT;
4963
#ifdef DEBUG_PUSH
4964
		    xmlGenericError(xmlGenericErrorContext,
4965
			    "HPP: entering CONTENT\n");
4966
#endif
4967
		    break;
4968
		}
4969
4970
		if (CUR == '>') {
4971
		    NEXT;
4972
		} else {
4973
		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4974
		                 "Couldn't find end of Start Tag %s\n",
4975
				 name, NULL);
4976
4977
		    /*
4978
		     * end of parsing of this node.
4979
		     */
4980
		    if (xmlStrEqual(name, ctxt->name)) { 
4981
			nodePop(ctxt);
4982
			htmlnamePop(ctxt);
4983
		    }    
4984
4985
		    ctxt->instate = XML_PARSER_CONTENT;
4986
#ifdef DEBUG_PUSH
4987
		    xmlGenericError(xmlGenericErrorContext,
4988
			    "HPP: entering CONTENT\n");
4989
#endif
4990
		    break;
4991
		}
4992
4993
		/*
4994
		 * Check for an Empty Element from DTD definition
4995
		 */
4996
		if ((info != NULL) && (info->empty)) {
4997
		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4998
			ctxt->sax->endElement(ctxt->userData, name);
4999
		    htmlnamePop(ctxt);
5000
		}
5001
		ctxt->instate = XML_PARSER_CONTENT;
5002
#ifdef DEBUG_PUSH
5003
		xmlGenericError(xmlGenericErrorContext,
5004
			"HPP: entering CONTENT\n");
5005
#endif
5006
                break;
5007
	    }
5008
            case XML_PARSER_CONTENT: {
5009
		long cons;
5010
                /*
5011
		 * Handle preparsed entities and charRef
5012
		 */
5013
		if (ctxt->token != 0) {
5014
		    xmlChar chr[2] = { 0 , 0 } ;
5015
5016
		    chr[0] = (xmlChar) ctxt->token;
5017
		    htmlCheckParagraph(ctxt);
5018
		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5019
			ctxt->sax->characters(ctxt->userData, chr, 1);
5020
		    ctxt->token = 0;
5021
		    ctxt->checkIndex = 0;
5022
		}
5023
		if ((avail == 1) && (terminate)) {
5024
		    cur = in->cur[0];
5025
		    if ((cur != '<') && (cur != '&')) {
5026
			if (ctxt->sax != NULL) {
5027
			    if (IS_BLANK_CH(cur)) {
5028
				if (ctxt->sax->ignorableWhitespace != NULL)
5029
				    ctxt->sax->ignorableWhitespace(
5030
					    ctxt->userData, &cur, 1);
5031
			    } else {
5032
				htmlCheckParagraph(ctxt);
5033
				if (ctxt->sax->characters != NULL)
5034
				    ctxt->sax->characters(
5035
					    ctxt->userData, &cur, 1);
5036
			    }
5037
			}
5038
			ctxt->token = 0;
5039
			ctxt->checkIndex = 0;
5040
			in->cur++;
5041
			break;
5042
		    }
5043
		}
5044
		if (avail < 2)
5045
		    goto done;
5046
		cur = in->cur[0];
5047
		next = in->cur[1];
5048
		cons = ctxt->nbChars;
5049
		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5050
		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5051
		    /*
5052
		     * Handle SCRIPT/STYLE separately
5053
		     */
5054
		    if (!terminate) {
5055
		        int idx;
5056
			xmlChar val;
5057
5058
			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5059
			if (idx < 0)
5060
			    goto done;
5061
		        val = in->cur[idx + 2];
5062
			if (val == 0) /* bad cut of input */
5063
			    goto done;
5064
		    }
5065
		    htmlParseScript(ctxt);
5066
		    if ((cur == '<') && (next == '/')) {
5067
			ctxt->instate = XML_PARSER_END_TAG;
5068
			ctxt->checkIndex = 0;
5069
#ifdef DEBUG_PUSH
5070
			xmlGenericError(xmlGenericErrorContext,
5071
				"HPP: entering END_TAG\n");
5072
#endif
5073
			break;
5074
		    }
5075
		} else {
5076
		    /*
5077
		     * Sometimes DOCTYPE arrives in the middle of the document
5078
		     */
5079
		    if ((cur == '<') && (next == '!') &&
5080
			(UPP(2) == 'D') && (UPP(3) == 'O') &&
5081
			(UPP(4) == 'C') && (UPP(5) == 'T') &&
5082
			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5083
			(UPP(8) == 'E')) {
5084
			if ((!terminate) &&
5085
			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5086
			    goto done;
5087
			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5088
			             "Misplaced DOCTYPE declaration\n",
5089
				     BAD_CAST "DOCTYPE" , NULL);
5090
			htmlParseDocTypeDecl(ctxt);
5091
		    } else if ((cur == '<') && (next == '!') &&
5092
			(in->cur[2] == '-') && (in->cur[3] == '-')) {
5093
			if ((!terminate) &&
5094
			    (htmlParseLookupSequence(
5095
			    		ctxt, '-', '-', '>', 1) < 0))
5096
			    goto done;
5097
#ifdef DEBUG_PUSH
5098
			xmlGenericError(xmlGenericErrorContext,
5099
				"HPP: Parsing Comment\n");
5100
#endif
5101
			htmlParseComment(ctxt);
5102
			ctxt->instate = XML_PARSER_CONTENT;
5103
		    } else if ((cur == '<') && (next == '?')) {
5104
			if ((!terminate) &&
5105
			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5106
			    goto done;
5107
#ifdef DEBUG_PUSH
5108
			xmlGenericError(xmlGenericErrorContext,
5109
				"HPP: Parsing PI\n");
5110
#endif
5111
			htmlParsePI(ctxt);
5112
			ctxt->instate = XML_PARSER_CONTENT;
5113
		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5114
			goto done;
5115
		    } else if ((cur == '<') && (next == '/')) {
5116
			ctxt->instate = XML_PARSER_END_TAG;
5117
			ctxt->checkIndex = 0;
5118
#ifdef DEBUG_PUSH
5119
			xmlGenericError(xmlGenericErrorContext,
5120
				"HPP: entering END_TAG\n");
5121
#endif
5122
			break;
5123
		    } else if (cur == '<') {
5124
			ctxt->instate = XML_PARSER_START_TAG;
5125
			ctxt->checkIndex = 0;
5126
#ifdef DEBUG_PUSH
5127
			xmlGenericError(xmlGenericErrorContext,
5128
				"HPP: entering START_TAG\n");
5129
#endif
5130
			break;
5131
		    } else if (cur == '&') {
5132
			if ((!terminate) &&
5133
			    (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
5134
			    goto done;
5135
#ifdef DEBUG_PUSH
5136
			xmlGenericError(xmlGenericErrorContext,
5137
				"HPP: Parsing Reference\n");
5138
#endif
5139
			/* TODO: check generation of subtrees if noent !!! */
5140
			htmlParseReference(ctxt);
5141
		    } else {
5142
		        /*
5143
			 * check that the text sequence is complete
5144
			 * before handing out the data to the parser
5145
			 * to avoid problems with erroneous end of
5146
			 * data detection.
5147
			 */
5148
			if ((!terminate) &&
5149
			    (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5150
			    goto done;
5151
			ctxt->checkIndex = 0;
5152
#ifdef DEBUG_PUSH
5153
			xmlGenericError(xmlGenericErrorContext,
5154
				"HPP: Parsing char data\n");
5155
#endif
5156
			htmlParseCharData(ctxt);
5157
		    }
5158
		}
5159
		if (cons == ctxt->nbChars) {
5160
		    if (ctxt->node != NULL) {
5161
			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5162
			             "detected an error in element content\n",
5163
				     NULL, NULL);
5164
		    }
5165
		    NEXT;
5166
		    break;
5167
		}
5168
5169
		break;
5170
	    }
5171
            case XML_PARSER_END_TAG:
5172
		if (avail < 2)
5173
		    goto done;
5174
		if ((!terminate) &&
5175
		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5176
		    goto done;
5177
		htmlParseEndTag(ctxt);
5178
		if (ctxt->nameNr == 0) {
5179
		    ctxt->instate = XML_PARSER_EPILOG;
5180
		} else {
5181
		    ctxt->instate = XML_PARSER_CONTENT;
5182
		}
5183
		ctxt->checkIndex = 0;
5184
#ifdef DEBUG_PUSH
5185
		xmlGenericError(xmlGenericErrorContext,
5186
			"HPP: entering CONTENT\n");
5187
#endif
5188
	        break;
5189
            case XML_PARSER_CDATA_SECTION:
5190
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5191
			"HPP: internal error, state == CDATA\n",
5192
			     NULL, NULL);
5193
		ctxt->instate = XML_PARSER_CONTENT;
5194
		ctxt->checkIndex = 0;
5195
#ifdef DEBUG_PUSH
5196
		xmlGenericError(xmlGenericErrorContext,
5197
			"HPP: entering CONTENT\n");
5198
#endif
5199
		break;
5200
            case XML_PARSER_DTD:
5201
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5202
			"HPP: internal error, state == DTD\n",
5203
			     NULL, NULL);
5204
		ctxt->instate = XML_PARSER_CONTENT;
5205
		ctxt->checkIndex = 0;
5206
#ifdef DEBUG_PUSH
5207
		xmlGenericError(xmlGenericErrorContext,
5208
			"HPP: entering CONTENT\n");
5209
#endif
5210
		break;
5211
            case XML_PARSER_COMMENT:
5212
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5213
			"HPP: internal error, state == COMMENT\n",
5214
			     NULL, NULL);
5215
		ctxt->instate = XML_PARSER_CONTENT;
5216
		ctxt->checkIndex = 0;
5217
#ifdef DEBUG_PUSH
5218
		xmlGenericError(xmlGenericErrorContext,
5219
			"HPP: entering CONTENT\n");
5220
#endif
5221
		break;
5222
            case XML_PARSER_PI:
5223
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5224
			"HPP: internal error, state == PI\n",
5225
			     NULL, NULL);
5226
		ctxt->instate = XML_PARSER_CONTENT;
5227
		ctxt->checkIndex = 0;
5228
#ifdef DEBUG_PUSH
5229
		xmlGenericError(xmlGenericErrorContext,
5230
			"HPP: entering CONTENT\n");
5231
#endif
5232
		break;
5233
            case XML_PARSER_ENTITY_DECL:
5234
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5235
			"HPP: internal error, state == ENTITY_DECL\n",
5236
			     NULL, NULL);
5237
		ctxt->instate = XML_PARSER_CONTENT;
5238
		ctxt->checkIndex = 0;
5239
#ifdef DEBUG_PUSH
5240
		xmlGenericError(xmlGenericErrorContext,
5241
			"HPP: entering CONTENT\n");
5242
#endif
5243
		break;
5244
            case XML_PARSER_ENTITY_VALUE:
5245
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5246
			"HPP: internal error, state == ENTITY_VALUE\n",
5247
			     NULL, NULL);
5248
		ctxt->instate = XML_PARSER_CONTENT;
5249
		ctxt->checkIndex = 0;
5250
#ifdef DEBUG_PUSH
5251
		xmlGenericError(xmlGenericErrorContext,
5252
			"HPP: entering DTD\n");
5253
#endif
5254
		break;
5255
            case XML_PARSER_ATTRIBUTE_VALUE:
5256
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5257
			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5258
			     NULL, NULL);
5259
		ctxt->instate = XML_PARSER_START_TAG;
5260
		ctxt->checkIndex = 0;
5261
#ifdef DEBUG_PUSH
5262
		xmlGenericError(xmlGenericErrorContext,
5263
			"HPP: entering START_TAG\n");
5264
#endif
5265
		break;
5266
	    case XML_PARSER_SYSTEM_LITERAL:
5267
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5268
		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5269
			     NULL, NULL);
5270
		ctxt->instate = XML_PARSER_CONTENT;
5271
		ctxt->checkIndex = 0;
5272
#ifdef DEBUG_PUSH
5273
		xmlGenericError(xmlGenericErrorContext,
5274
			"HPP: entering CONTENT\n");
5275
#endif
5276
		break;
5277
	    case XML_PARSER_IGNORE:
5278
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5279
			"HPP: internal error, state == XML_PARSER_IGNORE\n",
5280
			     NULL, NULL);
5281
		ctxt->instate = XML_PARSER_CONTENT;
5282
		ctxt->checkIndex = 0;
5283
#ifdef DEBUG_PUSH
5284
		xmlGenericError(xmlGenericErrorContext,
5285
			"HPP: entering CONTENT\n");
5286
#endif
5287
		break;
5288
	    case XML_PARSER_PUBLIC_LITERAL:
5289
		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5290
			"HPP: internal error, state == XML_PARSER_LITERAL\n",
5291
			     NULL, NULL);
5292
		ctxt->instate = XML_PARSER_CONTENT;
5293
		ctxt->checkIndex = 0;
5294
#ifdef DEBUG_PUSH
5295
		xmlGenericError(xmlGenericErrorContext,
5296
			"HPP: entering CONTENT\n");
5297
#endif
5298
		break;
5299
5300
	}
5301
    }
5302
done:    
5303
    if ((avail == 0) && (terminate)) {
5304
	htmlAutoCloseOnEnd(ctxt);
5305
	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { 
5306
	    /*
5307
	     * SAX: end of the document processing.
5308
	     */
5309
	    ctxt->instate = XML_PARSER_EOF;
5310
	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5311
		ctxt->sax->endDocument(ctxt->userData);
5312
	}
5313
    }
5314
    if ((ctxt->myDoc != NULL) &&
5315
	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5316
	 (ctxt->instate == XML_PARSER_EPILOG))) {
5317
	xmlDtdPtr dtd;
5318
	dtd = xmlGetIntSubset(ctxt->myDoc);
5319
	if (dtd == NULL)
5320
	    ctxt->myDoc->intSubset = 
5321
		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html", 
5322
		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5323
		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5324
    }
5325
#ifdef DEBUG_PUSH
5326
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5327
#endif
5328
    return(ret);
5329
}
5330
5331
/**
5332
 * htmlParseChunk:
5333
 * @ctxt:  an HTML parser context
5334
 * @chunk:  an char array
5335
 * @size:  the size in byte of the chunk
5336
 * @terminate:  last chunk indicator
5337
 *
5338
 * Parse a Chunk of memory
5339
 *
5340
 * Returns zero if no error, the xmlParserErrors otherwise.
5341
 */
5342
int
5343
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5344
              int terminate) {
5345
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
5346
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5347
		     "htmlParseChunk: context error\n", NULL, NULL);
5348
	return(XML_ERR_INTERNAL_ERROR);
5349
    }
5350
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5351
        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5352
	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5353
	int cur = ctxt->input->cur - ctxt->input->base;
5354
	int res;
5355
	
5356
	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);	      
5357
	if (res < 0) {
5358
	    ctxt->errNo = XML_PARSER_EOF;
5359
	    ctxt->disableSAX = 1;
5360
	    return (XML_PARSER_EOF);
5361
	}
5362
	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5363
	ctxt->input->cur = ctxt->input->base + cur;
5364
	ctxt->input->end =
5365
	  &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5366
#ifdef DEBUG_PUSH
5367
	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5368
#endif
5369
5370
#if 0
5371
	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5372
	    htmlParseTryOrFinish(ctxt, terminate);
5373
#endif
5374
    } else if (ctxt->instate != XML_PARSER_EOF) {
5375
	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5376
	    xmlParserInputBufferPtr in = ctxt->input->buf;
5377
	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
5378
		    (in->raw != NULL)) {
5379
		int nbchars;
5380
		    
5381
		nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5382
		if (nbchars < 0) {
5383
		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5384
			         "encoder error\n", NULL, NULL);
5385
		    return(XML_ERR_INVALID_ENCODING);
5386
		}
5387
	    }
5388
	}
5389
    }
5390
    htmlParseTryOrFinish(ctxt, terminate);
5391
    if (terminate) {
5392
	if ((ctxt->instate != XML_PARSER_EOF) &&
5393
	    (ctxt->instate != XML_PARSER_EPILOG) &&
5394
	    (ctxt->instate != XML_PARSER_MISC)) {
5395
	    ctxt->errNo = XML_ERR_DOCUMENT_END;
5396
	    ctxt->wellFormed = 0;
5397
	} 
5398
	if (ctxt->instate != XML_PARSER_EOF) {
5399
	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5400
		ctxt->sax->endDocument(ctxt->userData);
5401
	}
5402
	ctxt->instate = XML_PARSER_EOF;
5403
    }
5404
    return((xmlParserErrors) ctxt->errNo);	      
5405
}
5406
5407
/************************************************************************
5408
 *									*
5409
 *			User entry points				*
5410
 *									*
5411
 ************************************************************************/
5412
5413
/**
5414
 * htmlCreatePushParserCtxt:
5415
 * @sax:  a SAX handler
5416
 * @user_data:  The user data returned on SAX callbacks
5417
 * @chunk:  a pointer to an array of chars
5418
 * @size:  number of chars in the array
5419
 * @filename:  an optional file name or URI
5420
 * @enc:  an optional encoding
5421
 *
5422
 * Create a parser context for using the HTML parser in push mode
5423
 * The value of @filename is used for fetching external entities
5424
 * and error/warning reports.
5425
 *
5426
 * Returns the new parser context or NULL
5427
 */
5428
htmlParserCtxtPtr
5429
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, 
5430
                         const char *chunk, int size, const char *filename,
5431
			 xmlCharEncoding enc) {
5432
    htmlParserCtxtPtr ctxt;
5433
    htmlParserInputPtr inputStream;
5434
    xmlParserInputBufferPtr buf;
5435
5436
    xmlInitParser();
5437
5438
    buf = xmlAllocParserInputBuffer(enc);
5439
    if (buf == NULL) return(NULL);
5440
5441
    ctxt = htmlNewParserCtxt();
5442
    if (ctxt == NULL) {
5443
	xmlFreeParserInputBuffer(buf);
5444
	return(NULL);
5445
    }
5446
    if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5447
	ctxt->charset=XML_CHAR_ENCODING_UTF8;
5448
    if (sax != NULL) {
5449
	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5450
	    xmlFree(ctxt->sax);
5451
	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5452
	if (ctxt->sax == NULL) {
5453
	    xmlFree(buf);
5454
	    xmlFree(ctxt);
5455
	    return(NULL);
5456
	}
5457
	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5458
	if (user_data != NULL)
5459
	    ctxt->userData = user_data;
5460
    }	
5461
    if (filename == NULL) {
5462
	ctxt->directory = NULL;
5463
    } else {
5464
        ctxt->directory = xmlParserGetDirectory(filename);
5465
    }
5466
5467
    inputStream = htmlNewInputStream(ctxt);
5468
    if (inputStream == NULL) {
5469
	xmlFreeParserCtxt(ctxt);
5470
	xmlFree(buf);
5471
	return(NULL);
5472
    }
5473
5474
    if (filename == NULL)
5475
	inputStream->filename = NULL;
5476
    else
5477
	inputStream->filename = (char *)
5478
	    xmlCanonicPath((const xmlChar *) filename);
5479
    inputStream->buf = buf;
5480
    inputStream->base = inputStream->buf->buffer->content;
5481
    inputStream->cur = inputStream->buf->buffer->content;
5482
    inputStream->end = 
5483
	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5484
5485
    inputPush(ctxt, inputStream);
5486
5487
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5488
        (ctxt->input->buf != NULL))  {	      
5489
	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5490
	int cur = ctxt->input->cur - ctxt->input->base;
5491
5492
	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);	      
5493
5494
	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5495
	ctxt->input->cur = ctxt->input->base + cur;
5496
	ctxt->input->end =
5497
	    &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5498
#ifdef DEBUG_PUSH
5499
	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5500
#endif
5501
    }
5502
    ctxt->progressive = 1;
5503
5504
    return(ctxt);
5505
}
5506
#endif /* LIBXML_PUSH_ENABLED */
5507
5508
/**
5509
 * htmlSAXParseDoc:
5510
 * @cur:  a pointer to an array of xmlChar
5511
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5512
 * @sax:  the SAX handler block
5513
 * @userData: if using SAX, this pointer will be provided on callbacks. 
5514
 *
5515
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5516
 * to handle parse events. If sax is NULL, fallback to the default DOM
5517
 * behavior and return a tree.
5518
 * 
5519
 * Returns the resulting document tree unless SAX is NULL or the document is
5520
 *     not well formed.
5521
 */
5522
5523
htmlDocPtr
5524
htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5525
    htmlDocPtr ret;
5526
    htmlParserCtxtPtr ctxt;
5527
5528
    xmlInitParser();
5529
5530
    if (cur == NULL) return(NULL);
5531
5532
5533
    ctxt = htmlCreateDocParserCtxt(cur, encoding);
5534
    if (ctxt == NULL) return(NULL);
5535
    if (sax != NULL) { 
5536
        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5537
        ctxt->sax = sax;
5538
        ctxt->userData = userData;
5539
    }
5540
5541
    htmlParseDocument(ctxt);
5542
    ret = ctxt->myDoc;
5543
    if (sax != NULL) {
5544
	ctxt->sax = NULL;
5545
	ctxt->userData = NULL;
5546
    }
5547
    htmlFreeParserCtxt(ctxt);
5548
    
5549
    return(ret);
5550
}
5551
5552
/**
5553
 * htmlParseDoc:
5554
 * @cur:  a pointer to an array of xmlChar
5555
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5556
 *
5557
 * parse an HTML in-memory document and build a tree.
5558
 * 
5559
 * Returns the resulting document tree
5560
 */
5561
5562
htmlDocPtr
5563
htmlParseDoc(xmlChar *cur, const char *encoding) {
5564
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5565
}
5566
5567
5568
/**
5569
 * htmlCreateFileParserCtxt:
5570
 * @filename:  the filename
5571
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5572
 *
5573
 * Create a parser context for a file content. 
5574
 * Automatic support for ZLIB/Compress compressed document is provided
5575
 * by default if found at compile-time.
5576
 *
5577
 * Returns the new parser context or NULL
5578
 */
5579
htmlParserCtxtPtr
5580
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5581
{
5582
    htmlParserCtxtPtr ctxt;
5583
    htmlParserInputPtr inputStream;
5584
    char *canonicFilename;
5585
    /* htmlCharEncoding enc; */
5586
    xmlChar *content, *content_line = (xmlChar *) "charset=";
5587
5588
    if (filename == NULL)
5589
        return(NULL);
5590
5591
    ctxt = htmlNewParserCtxt();
5592
    if (ctxt == NULL) {
5593
	return(NULL);
5594
    }
5595
    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5596
    if (canonicFilename == NULL) {
5597
#ifdef LIBXML_SAX1_ENABLED
5598
	if (xmlDefaultSAXHandler.error != NULL) {
5599
	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5600
	}
5601
#endif
5602
	xmlFreeParserCtxt(ctxt);
5603
	return(NULL);
5604
    }
5605
    
5606
    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5607
    xmlFree(canonicFilename);
5608
    if (inputStream == NULL) {
5609
	xmlFreeParserCtxt(ctxt);
5610
	return(NULL);
5611
    }
5612
5613
    inputPush(ctxt, inputStream);
5614
5615
    /* set encoding */
5616
    if (encoding) {
5617
        content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5618
	if (content) {  
5619
	    strcpy ((char *)content, (char *)content_line);
5620
            strcat ((char *)content, (char *)encoding);
5621
            htmlCheckEncoding (ctxt, content);
5622
	    xmlFree (content);
5623
	}
5624
    }
5625
    
5626
    return(ctxt);
5627
}
5628
5629
/**
5630
 * htmlSAXParseFile:
5631
 * @filename:  the filename
5632
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5633
 * @sax:  the SAX handler block
5634
 * @userData: if using SAX, this pointer will be provided on callbacks. 
5635
 *
5636
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5637
 * compressed document is provided by default if found at compile-time.
5638
 * It use the given SAX function block to handle the parsing callback.
5639
 * If sax is NULL, fallback to the default DOM tree building routines.
5640
 *
5641
 * Returns the resulting document tree unless SAX is NULL or the document is
5642
 *     not well formed.
5643
 */
5644
5645
htmlDocPtr
5646
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, 
5647
                 void *userData) {
5648
    htmlDocPtr ret;
5649
    htmlParserCtxtPtr ctxt;
5650
    htmlSAXHandlerPtr oldsax = NULL;
5651
5652
    xmlInitParser();
5653
5654
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
5655
    if (ctxt == NULL) return(NULL);
5656
    if (sax != NULL) {
5657
	oldsax = ctxt->sax;
5658
        ctxt->sax = sax;
5659
        ctxt->userData = userData;
5660
    }
5661
5662
    htmlParseDocument(ctxt);
5663
5664
    ret = ctxt->myDoc;
5665
    if (sax != NULL) {
5666
        ctxt->sax = oldsax;
5667
        ctxt->userData = NULL;
5668
    }
5669
    htmlFreeParserCtxt(ctxt);
5670
    
5671
    return(ret);
5672
}
5673
5674
/**
5675
 * htmlParseFile:
5676
 * @filename:  the filename
5677
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5678
 *
5679
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5680
 * compressed document is provided by default if found at compile-time.
5681
 *
5682
 * Returns the resulting document tree
5683
 */
5684
5685
htmlDocPtr
5686
htmlParseFile(const char *filename, const char *encoding) {
5687
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5688
}
5689
5690
/**
5691
 * htmlHandleOmittedElem:
5692
 * @val:  int 0 or 1 
5693
 *
5694
 * Set and return the previous value for handling HTML omitted tags.
5695
 *
5696
 * Returns the last value for 0 for no handling, 1 for auto insertion.
5697
 */
5698
5699
int
5700
htmlHandleOmittedElem(int val) {
5701
    int old = htmlOmittedDefaultValue;
5702
5703
    htmlOmittedDefaultValue = val;
5704
    return(old);
5705
}
5706
5707
/**
5708
 * htmlElementAllowedHere:
5709
 * @parent: HTML parent element
5710
 * @elt: HTML element
5711
 *
5712
 * Checks whether an HTML element may be a direct child of a parent element.
5713
 * Note - doesn't check for deprecated elements
5714
 *
5715
 * Returns 1 if allowed; 0 otherwise.
5716
 */
5717
int
5718
htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5719
  const char** p ;
5720
5721
  if ( ! elt || ! parent || ! parent->subelts )
5722
	return 0 ;
5723
5724
  for ( p = parent->subelts; *p; ++p )
5725
    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5726
      return 1 ;
5727
5728
  return 0 ;
5729
}
5730
/**
5731
 * htmlElementStatusHere:
5732
 * @parent: HTML parent element
5733
 * @elt: HTML element
5734
 *
5735
 * Checks whether an HTML element may be a direct child of a parent element.
5736
 * and if so whether it is valid or deprecated.
5737
 *
5738
 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5739
 */
5740
htmlStatus
5741
htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5742
  if ( ! parent || ! elt )
5743
    return HTML_INVALID ;
5744
  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5745
    return HTML_INVALID ;
5746
5747
  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5748
}
5749
/**
5750
 * htmlAttrAllowed:
5751
 * @elt: HTML element
5752
 * @attr: HTML attribute
5753
 * @legacy: whether to allow deprecated attributes
5754
 *
5755
 * Checks whether an attribute is valid for an element
5756
 * Has full knowledge of Required and Deprecated attributes
5757
 *
5758
 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5759
 */
5760
htmlStatus
5761
htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5762
  const char** p ;
5763
5764
  if ( !elt || ! attr )
5765
	return HTML_INVALID ;
5766
5767
  if ( elt->attrs_req )
5768
    for ( p = elt->attrs_req; *p; ++p)
5769
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5770
        return HTML_REQUIRED ;
5771
5772
  if ( elt->attrs_opt )
5773
    for ( p = elt->attrs_opt; *p; ++p)
5774
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5775
        return HTML_VALID ;
5776
5777
  if ( legacy && elt->attrs_depr )
5778
    for ( p = elt->attrs_depr; *p; ++p)
5779
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5780
        return HTML_DEPRECATED ;
5781
5782
  return HTML_INVALID ;
5783
}
5784
/**
5785
 * htmlNodeStatus:
5786
 * @node: an htmlNodePtr in a tree
5787
 * @legacy: whether to allow deprecated elements (YES is faster here
5788
 *	for Element nodes)
5789
 *
5790
 * Checks whether the tree node is valid.  Experimental (the author
5791
 *     only uses the HTML enhancements in a SAX parser)
5792
 *
5793
 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5794
 *	legacy allowed) or htmlElementStatusHere (otherwise).
5795
 *	for Attribute nodes, a return from htmlAttrAllowed
5796
 *	for other nodes, HTML_NA (no checks performed)
5797
 */
5798
htmlStatus
5799
htmlNodeStatus(const htmlNodePtr node, int legacy) {
5800
  if ( ! node )
5801
    return HTML_INVALID ;
5802
5803
  switch ( node->type ) {
5804
    case XML_ELEMENT_NODE:
5805
      return legacy
5806
	? ( htmlElementAllowedHere (
5807
		htmlTagLookup(node->parent->name) , node->name
5808
		) ? HTML_VALID : HTML_INVALID )
5809
	: htmlElementStatusHere(
5810
		htmlTagLookup(node->parent->name) ,
5811
		htmlTagLookup(node->name) )
5812
	;
5813
    case XML_ATTRIBUTE_NODE:
5814
      return htmlAttrAllowed(
5815
	htmlTagLookup(node->parent->name) , node->name, legacy) ;
5816
    default: return HTML_NA ;
5817
  }
5818
}
5819
/************************************************************************
5820
 *									*
5821
 *	New set (2.6.0) of simpler and more flexible APIs		*
5822
 *									*
5823
 ************************************************************************/
5824
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used. The expansion is a single statement, so the macro call must
 * be followed by a semicolon and is safe in an unbraced if/else.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
	        (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	        xmlFree((char *)(str));				\
	} while (0)
5835
5836
/**
5837
 * htmlCtxtReset:
5838
 * @ctxt: an HTML parser context
5839
 *
5840
 * Reset a parser context
5841
 */
5842
void
5843
htmlCtxtReset(htmlParserCtxtPtr ctxt)
5844
{
5845
    xmlParserInputPtr input;
5846
    xmlDictPtr dict;
5847
    
5848
    if (ctxt == NULL)
5849
        return;
5850
5851
    xmlInitParser();
5852
    dict = ctxt->dict;
5853
5854
    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5855
        xmlFreeInputStream(input);
5856
    }
5857
    ctxt->inputNr = 0;
5858
    ctxt->input = NULL;
5859
5860
    ctxt->spaceNr = 0;
5861
    if (ctxt->spaceTab != NULL) {
5862
	ctxt->spaceTab[0] = -1;
5863
	ctxt->space = &ctxt->spaceTab[0];
5864
    } else {
5865
	ctxt->space = NULL;
5866
    }
5867
5868
5869
    ctxt->nodeNr = 0;
5870
    ctxt->node = NULL;
5871
5872
    ctxt->nameNr = 0;
5873
    ctxt->name = NULL;
5874
5875
    DICT_FREE(ctxt->version);
5876
    ctxt->version = NULL;
5877
    DICT_FREE(ctxt->encoding);
5878
    ctxt->encoding = NULL;
5879
    DICT_FREE(ctxt->directory);
5880
    ctxt->directory = NULL;
5881
    DICT_FREE(ctxt->extSubURI);
5882
    ctxt->extSubURI = NULL;
5883
    DICT_FREE(ctxt->extSubSystem);
5884
    ctxt->extSubSystem = NULL;
5885
    if (ctxt->myDoc != NULL)
5886
        xmlFreeDoc(ctxt->myDoc);
5887
    ctxt->myDoc = NULL;
5888
5889
    ctxt->standalone = -1;
5890
    ctxt->hasExternalSubset = 0;
5891
    ctxt->hasPErefs = 0;
5892
    ctxt->html = 1;
5893
    ctxt->external = 0;
5894
    ctxt->instate = XML_PARSER_START;
5895
    ctxt->token = 0;
5896
5897
    ctxt->wellFormed = 1;
5898
    ctxt->nsWellFormed = 1;
5899
    ctxt->valid = 1;
5900
    ctxt->vctxt.userData = ctxt;
5901
    ctxt->vctxt.error = xmlParserValidityError;
5902
    ctxt->vctxt.warning = xmlParserValidityWarning;
5903
    ctxt->record_info = 0;
5904
    ctxt->nbChars = 0;
5905
    ctxt->checkIndex = 0;
5906
    ctxt->inSubset = 0;
5907
    ctxt->errNo = XML_ERR_OK;
5908
    ctxt->depth = 0;
5909
    ctxt->charset = XML_CHAR_ENCODING_NONE;
5910
    ctxt->catalogs = NULL;
5911
    xmlInitNodeInfoSeq(&ctxt->node_seq);
5912
5913
    if (ctxt->attsDefault != NULL) {
5914
        xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5915
        ctxt->attsDefault = NULL;
5916
    }
5917
    if (ctxt->attsSpecial != NULL) {
5918
        xmlHashFree(ctxt->attsSpecial, NULL);
5919
        ctxt->attsSpecial = NULL;
5920
    }
5921
}
5922
5923
/**
5924
 * htmlCtxtUseOptions:
5925
 * @ctxt: an HTML parser context
5926
 * @options:  a combination of htmlParserOption(s)
5927
 *
5928
 * Applies the options to the parser context
5929
 *
5930
 * Returns 0 in case of success, the set of unknown or unimplemented options
5931
 *         in case of error.
5932
 */
5933
int
5934
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5935
{
5936
    if (ctxt == NULL)
5937
        return(-1);
5938
5939
    if (options & HTML_PARSE_NOWARNING) {
5940
        ctxt->sax->warning = NULL;
5941
        ctxt->vctxt.warning = NULL;
5942
        options -= XML_PARSE_NOWARNING;
5943
	ctxt->options |= XML_PARSE_NOWARNING;
5944
    }
5945
    if (options & HTML_PARSE_NOERROR) {
5946
        ctxt->sax->error = NULL;
5947
        ctxt->vctxt.error = NULL;
5948
        ctxt->sax->fatalError = NULL;
5949
        options -= XML_PARSE_NOERROR;
5950
	ctxt->options |= XML_PARSE_NOERROR;
5951
    }
5952
    if (options & HTML_PARSE_PEDANTIC) {
5953
        ctxt->pedantic = 1;
5954
        options -= XML_PARSE_PEDANTIC;
5955
	ctxt->options |= XML_PARSE_PEDANTIC;
5956
    } else
5957
        ctxt->pedantic = 0;
5958
    if (options & XML_PARSE_NOBLANKS) {
5959
        ctxt->keepBlanks = 0;
5960
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5961
        options -= XML_PARSE_NOBLANKS;
5962
	ctxt->options |= XML_PARSE_NOBLANKS;
5963
    } else
5964
        ctxt->keepBlanks = 1;
5965
    if (options & HTML_PARSE_RECOVER) {
5966
        ctxt->recovery = 1;
5967
	options -= HTML_PARSE_RECOVER;
5968
    } else
5969
        ctxt->recovery = 0;
5970
    if (options & HTML_PARSE_COMPACT) {
5971
	ctxt->options |= HTML_PARSE_COMPACT;
5972
        options -= HTML_PARSE_COMPACT;
5973
    }
5974
    ctxt->dictNames = 0;
5975
    return (options);
5976
}
5977
5978
/**
5979
 * htmlDoRead:
5980
 * @ctxt:  an HTML parser context
5981
 * @URL:  the base URL to use for the document
5982
 * @encoding:  the document encoding, or NULL
5983
 * @options:  a combination of htmlParserOption(s)
5984
 * @reuse:  keep the context for reuse
5985
 *
5986
 * Common front-end for the htmlRead functions
5987
 * 
5988
 * Returns the resulting document tree or NULL
5989
 */
5990
static htmlDocPtr
5991
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5992
          int options, int reuse)
5993
{
5994
    htmlDocPtr ret;
5995
    
5996
    htmlCtxtUseOptions(ctxt, options);
5997
    ctxt->html = 1;
5998
    if (encoding != NULL) {
5999
        xmlCharEncodingHandlerPtr hdlr;
6000
6001
	hdlr = xmlFindCharEncodingHandler(encoding);
6002
	if (hdlr != NULL) {
6003
	    xmlSwitchToEncoding(ctxt, hdlr);
6004
	    if (ctxt->input->encoding != NULL)
6005
	      xmlFree((xmlChar *) ctxt->input->encoding);
6006
            ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6007
        }
6008
    }
6009
    if ((URL != NULL) && (ctxt->input != NULL) &&
6010
        (ctxt->input->filename == NULL))
6011
        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6012
    htmlParseDocument(ctxt);
6013
    ret = ctxt->myDoc;
6014
    ctxt->myDoc = NULL;
6015
    if (!reuse) {
6016
        if ((ctxt->dictNames) &&
6017
	    (ret != NULL) &&
6018
	    (ret->dict == ctxt->dict))
6019
	    ctxt->dict = NULL;
6020
	xmlFreeParserCtxt(ctxt);
6021
    }
6022
    return (ret);
6023
}
6024
6025
/**
6026
 * htmlReadDoc:
6027
 * @cur:  a pointer to a zero terminated string
6028
 * @URL:  the base URL to use for the document
6029
 * @encoding:  the document encoding, or NULL
6030
 * @options:  a combination of htmlParserOption(s)
6031
 *
6032
 * parse an XML in-memory document and build a tree.
6033
 * 
6034
 * Returns the resulting document tree
6035
 */
6036
htmlDocPtr
6037
htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6038
{
6039
    htmlParserCtxtPtr ctxt;
6040
6041
    if (cur == NULL)
6042
        return (NULL);
6043
6044
    xmlInitParser();
6045
    ctxt = htmlCreateDocParserCtxt(cur, NULL);
6046
    if (ctxt == NULL)
6047
        return (NULL);
6048
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6049
}
6050
6051
/**
6052
 * htmlReadFile:
6053
 * @filename:  a file or URL
6054
 * @encoding:  the document encoding, or NULL
6055
 * @options:  a combination of htmlParserOption(s)
6056
 *
6057
 * parse an XML file from the filesystem or the network.
6058
 * 
6059
 * Returns the resulting document tree
6060
 */
6061
htmlDocPtr
6062
htmlReadFile(const char *filename, const char *encoding, int options)
6063
{
6064
    htmlParserCtxtPtr ctxt;
6065
6066
    xmlInitParser();
6067
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6068
    if (ctxt == NULL)
6069
        return (NULL);
6070
    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6071
}
6072
6073
/**
6074
 * htmlReadMemory:
6075
 * @buffer:  a pointer to a char array
6076
 * @size:  the size of the array
6077
 * @URL:  the base URL to use for the document
6078
 * @encoding:  the document encoding, or NULL
6079
 * @options:  a combination of htmlParserOption(s)
6080
 *
6081
 * parse an XML in-memory document and build a tree.
6082
 * 
6083
 * Returns the resulting document tree
6084
 */
6085
htmlDocPtr
6086
htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6087
{
6088
    htmlParserCtxtPtr ctxt;
6089
6090
    xmlInitParser();
6091
    ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6092
    if (ctxt == NULL)
6093
        return (NULL);
6094
    htmlDefaultSAXHandlerInit();
6095
    if (ctxt->sax != NULL)
6096
        memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6097
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6098
}
6099
6100
/**
6101
 * htmlReadFd:
6102
 * @fd:  an open file descriptor
6103
 * @URL:  the base URL to use for the document
6104
 * @encoding:  the document encoding, or NULL
6105
 * @options:  a combination of htmlParserOption(s)
6106
 *
6107
 * parse an XML from a file descriptor and build a tree.
6108
 * 
6109
 * Returns the resulting document tree
6110
 */
6111
htmlDocPtr
6112
htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6113
{
6114
    htmlParserCtxtPtr ctxt;
6115
    xmlParserInputBufferPtr input;
6116
    xmlParserInputPtr stream;
6117
6118
    if (fd < 0)
6119
        return (NULL);
6120
6121
    xmlInitParser();
6122
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6123
    if (input == NULL)
6124
        return (NULL);
6125
    ctxt = xmlNewParserCtxt();
6126
    if (ctxt == NULL) {
6127
        xmlFreeParserInputBuffer(input);
6128
        return (NULL);
6129
    }
6130
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6131
    if (stream == NULL) {
6132
        xmlFreeParserInputBuffer(input);
6133
	xmlFreeParserCtxt(ctxt);
6134
        return (NULL);
6135
    }
6136
    inputPush(ctxt, stream);
6137
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6138
}
6139
6140
/**
6141
 * htmlReadIO:
6142
 * @ioread:  an I/O read function
6143
 * @ioclose:  an I/O close function
6144
 * @ioctx:  an I/O handler
6145
 * @URL:  the base URL to use for the document
6146
 * @encoding:  the document encoding, or NULL
6147
 * @options:  a combination of htmlParserOption(s)
6148
 *
6149
 * parse an HTML document from I/O functions and source and build a tree.
6150
 * 
6151
 * Returns the resulting document tree
6152
 */
6153
htmlDocPtr
6154
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6155
          void *ioctx, const char *URL, const char *encoding, int options)
6156
{
6157
    htmlParserCtxtPtr ctxt;
6158
    xmlParserInputBufferPtr input;
6159
    xmlParserInputPtr stream;
6160
6161
    if (ioread == NULL)
6162
        return (NULL);
6163
    xmlInitParser();
6164
6165
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6166
                                         XML_CHAR_ENCODING_NONE);
6167
    if (input == NULL)
6168
        return (NULL);
6169
    ctxt = htmlNewParserCtxt();
6170
    if (ctxt == NULL) {
6171
        xmlFreeParserInputBuffer(input);
6172
        return (NULL);
6173
    }
6174
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6175
    if (stream == NULL) {
6176
        xmlFreeParserInputBuffer(input);
6177
	xmlFreeParserCtxt(ctxt);
6178
        return (NULL);
6179
    }
6180
    inputPush(ctxt, stream);
6181
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6182
}
6183
6184
/**
6185
 * htmlCtxtReadDoc:
6186
 * @ctxt:  an HTML parser context
6187
 * @cur:  a pointer to a zero terminated string
6188
 * @URL:  the base URL to use for the document
6189
 * @encoding:  the document encoding, or NULL
6190
 * @options:  a combination of htmlParserOption(s)
6191
 *
6192
 * parse an XML in-memory document and build a tree.
6193
 * This reuses the existing @ctxt parser context
6194
 * 
6195
 * Returns the resulting document tree
6196
 */
6197
htmlDocPtr
6198
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6199
               const char *URL, const char *encoding, int options)
6200
{
6201
    xmlParserInputPtr stream;
6202
6203
    if (cur == NULL)
6204
        return (NULL);
6205
    if (ctxt == NULL)
6206
        return (NULL);
6207
6208
    htmlCtxtReset(ctxt);
6209
6210
    stream = xmlNewStringInputStream(ctxt, cur);
6211
    if (stream == NULL) {
6212
        return (NULL);
6213
    }
6214
    inputPush(ctxt, stream);
6215
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6216
}
6217
6218
/**
6219
 * htmlCtxtReadFile:
6220
 * @ctxt:  an HTML parser context
6221
 * @filename:  a file or URL
6222
 * @encoding:  the document encoding, or NULL
6223
 * @options:  a combination of htmlParserOption(s)
6224
 *
6225
 * parse an XML file from the filesystem or the network.
6226
 * This reuses the existing @ctxt parser context
6227
 * 
6228
 * Returns the resulting document tree
6229
 */
6230
htmlDocPtr
6231
htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6232
                const char *encoding, int options)
6233
{
6234
    xmlParserInputPtr stream;
6235
6236
    if (filename == NULL)
6237
        return (NULL);
6238
    if (ctxt == NULL)
6239
        return (NULL);
6240
6241
    htmlCtxtReset(ctxt);
6242
6243
    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6244
    if (stream == NULL) {
6245
        return (NULL);
6246
    }
6247
    inputPush(ctxt, stream);
6248
    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6249
}
6250
6251
/**
6252
 * htmlCtxtReadMemory:
6253
 * @ctxt:  an HTML parser context
6254
 * @buffer:  a pointer to a char array
6255
 * @size:  the size of the array
6256
 * @URL:  the base URL to use for the document
6257
 * @encoding:  the document encoding, or NULL
6258
 * @options:  a combination of htmlParserOption(s)
6259
 *
6260
 * parse an XML in-memory document and build a tree.
6261
 * This reuses the existing @ctxt parser context
6262
 * 
6263
 * Returns the resulting document tree
6264
 */
6265
htmlDocPtr
6266
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6267
                  const char *URL, const char *encoding, int options)
6268
{
6269
    xmlParserInputBufferPtr input;
6270
    xmlParserInputPtr stream;
6271
6272
    if (ctxt == NULL)
6273
        return (NULL);
6274
    if (buffer == NULL)
6275
        return (NULL);
6276
6277
    htmlCtxtReset(ctxt);
6278
6279
    input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6280
    if (input == NULL) {
6281
	return(NULL);
6282
    }
6283
6284
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6285
    if (stream == NULL) {
6286
	xmlFreeParserInputBuffer(input);
6287
	return(NULL);
6288
    }
6289
6290
    inputPush(ctxt, stream);
6291
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6292
}
6293
6294
/**
6295
 * htmlCtxtReadFd:
6296
 * @ctxt:  an HTML parser context
6297
 * @fd:  an open file descriptor
6298
 * @URL:  the base URL to use for the document
6299
 * @encoding:  the document encoding, or NULL
6300
 * @options:  a combination of htmlParserOption(s)
6301
 *
6302
 * parse an XML from a file descriptor and build a tree.
6303
 * This reuses the existing @ctxt parser context
6304
 * 
6305
 * Returns the resulting document tree
6306
 */
6307
htmlDocPtr
6308
htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6309
              const char *URL, const char *encoding, int options)
6310
{
6311
    xmlParserInputBufferPtr input;
6312
    xmlParserInputPtr stream;
6313
6314
    if (fd < 0)
6315
        return (NULL);
6316
    if (ctxt == NULL)
6317
        return (NULL);
6318
6319
    htmlCtxtReset(ctxt);
6320
6321
6322
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6323
    if (input == NULL)
6324
        return (NULL);
6325
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6326
    if (stream == NULL) {
6327
        xmlFreeParserInputBuffer(input);
6328
        return (NULL);
6329
    }
6330
    inputPush(ctxt, stream);
6331
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6332
}
6333
6334
/**
6335
 * htmlCtxtReadIO:
6336
 * @ctxt:  an HTML parser context
6337
 * @ioread:  an I/O read function
6338
 * @ioclose:  an I/O close function
6339
 * @ioctx:  an I/O handler
6340
 * @URL:  the base URL to use for the document
6341
 * @encoding:  the document encoding, or NULL
6342
 * @options:  a combination of htmlParserOption(s)
6343
 *
6344
 * parse an HTML document from I/O functions and source and build a tree.
6345
 * This reuses the existing @ctxt parser context
6346
 * 
6347
 * Returns the resulting document tree
6348
 */
6349
htmlDocPtr
6350
htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6351
              xmlInputCloseCallback ioclose, void *ioctx,
6352
	      const char *URL,
6353
              const char *encoding, int options)
6354
{
6355
    xmlParserInputBufferPtr input;
6356
    xmlParserInputPtr stream;
6357
6358
    if (ioread == NULL)
6359
        return (NULL);
6360
    if (ctxt == NULL)
6361
        return (NULL);
6362
6363
    htmlCtxtReset(ctxt);
6364
6365
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6366
                                         XML_CHAR_ENCODING_NONE);
6367
    if (input == NULL)
6368
        return (NULL);
6369
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6370
    if (stream == NULL) {
6371
        xmlFreeParserInputBuffer(input);
6372
        return (NULL);
6373
    }
6374
    inputPush(ctxt, stream);
6375
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6376
}
6377
6378
#define bottom_HTMLparser
6379
#include "elfgcchack.h"
6380
#endif /* LIBXML_HTML_ENABLED */