mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-29 11:41:22 +03:00
html: Add HTML5 parser option
This option passes tokenizer output directly to the SAX callbacks, making it possible to test the tokenizer against the html5lib test suite. This will produce unbalanced calls to the startElement and endElement callbacks, but it's the only way to support a SAX-like interface for HTML5. It can be used for filtering or rewriting HTML5, for example. An HTML5 tree builder could then be implemented on top of the SAX callbacks.
This commit is contained in:
55
HTMLparser.c
55
HTMLparser.c
@ -1522,6 +1522,9 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
|
|||||||
const htmlElemDesc *info;
|
const htmlElemDesc *info;
|
||||||
int i, priority;
|
int i, priority;
|
||||||
|
|
||||||
|
if (ctxt->options & HTML_PARSE_HTML5)
|
||||||
|
return;
|
||||||
|
|
||||||
priority = htmlGetEndPriority(newtag);
|
priority = htmlGetEndPriority(newtag);
|
||||||
|
|
||||||
for (i = (ctxt->nameNr - 1); i >= 0; i--) {
|
for (i = (ctxt->nameNr - 1); i >= 0; i--) {
|
||||||
@ -1565,6 +1568,9 @@ htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
|
|||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
|
if (ctxt->options & HTML_PARSE_HTML5)
|
||||||
|
return;
|
||||||
|
|
||||||
if (ctxt->nameNr == 0)
|
if (ctxt->nameNr == 0)
|
||||||
return;
|
return;
|
||||||
for (i = (ctxt->nameNr - 1); i >= 0; i--) {
|
for (i = (ctxt->nameNr - 1); i >= 0; i--) {
|
||||||
@ -1590,6 +1596,9 @@ htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
|
|||||||
static void
|
static void
|
||||||
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
|
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
|
||||||
{
|
{
|
||||||
|
if (ctxt->options & HTML_PARSE_HTML5)
|
||||||
|
return;
|
||||||
|
|
||||||
if (newtag == NULL)
|
if (newtag == NULL)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@ -1667,7 +1676,7 @@ static void
|
|||||||
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
|
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
|
||||||
int i;
|
int i;
|
||||||
|
|
||||||
if (ctxt->options & HTML_PARSE_NOIMPLIED)
|
if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
|
||||||
return;
|
return;
|
||||||
if (!htmlOmittedDefaultValue)
|
if (!htmlOmittedDefaultValue)
|
||||||
return;
|
return;
|
||||||
@ -1738,6 +1747,9 @@ htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
|
|||||||
|
|
||||||
if (ctxt == NULL)
|
if (ctxt == NULL)
|
||||||
return(-1);
|
return(-1);
|
||||||
|
if (ctxt->options & HTML_PARSE_HTML5)
|
||||||
|
return(0);
|
||||||
|
|
||||||
tag = ctxt->name;
|
tag = ctxt->name;
|
||||||
if (tag == NULL) {
|
if (tag == NULL) {
|
||||||
htmlAutoClose(ctxt, BAD_CAST"p");
|
htmlAutoClose(ctxt, BAD_CAST"p");
|
||||||
@ -3893,6 +3905,11 @@ failed:
|
|||||||
* SAX: Start of Element !
|
* SAX: Start of Element !
|
||||||
*/
|
*/
|
||||||
if (!discardtag) {
|
if (!discardtag) {
|
||||||
|
if (ctxt->options & HTML_PARSE_HTML5) {
|
||||||
|
if (ctxt->nameNr > 0)
|
||||||
|
htmlnamePop(ctxt);
|
||||||
|
}
|
||||||
|
|
||||||
htmlnamePush(ctxt, name);
|
htmlnamePush(ctxt, name);
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
|
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
|
||||||
if (nbatts != 0)
|
if (nbatts != 0)
|
||||||
@ -3978,6 +3995,12 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
|
|||||||
"End tag : expected '>'\n", NULL, NULL);
|
"End tag : expected '>'\n", NULL, NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (ctxt->options & HTML_PARSE_HTML5) {
|
||||||
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||||
|
ctxt->sax->endElement(ctxt->userData, name);
|
||||||
|
return(0);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* if we ignored misplaced tags in htmlParseStartTag don't pop them
|
* if we ignored misplaced tags in htmlParseStartTag don't pop them
|
||||||
* out now.
|
* out now.
|
||||||
@ -4217,8 +4240,10 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
|
|||||||
if ((CUR == '/') && (NXT(1) == '>')) {
|
if ((CUR == '/') && (NXT(1) == '>')) {
|
||||||
SKIP(2);
|
SKIP(2);
|
||||||
htmlParserFinishElementParsing(ctxt);
|
htmlParserFinishElementParsing(ctxt);
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
|
||||||
ctxt->sax->endElement(ctxt->userData, name);
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||||
|
ctxt->sax->endElement(ctxt->userData, name);
|
||||||
|
}
|
||||||
htmlnamePop(ctxt);
|
htmlnamePop(ctxt);
|
||||||
return(0);
|
return(0);
|
||||||
}
|
}
|
||||||
@ -4245,8 +4270,10 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
|
|||||||
*/
|
*/
|
||||||
if ((info != NULL) && (info->empty)) {
|
if ((info != NULL) && (info->empty)) {
|
||||||
htmlParserFinishElementParsing(ctxt);
|
htmlParserFinishElementParsing(ctxt);
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
|
||||||
ctxt->sax->endElement(ctxt->userData, name);
|
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
||||||
|
ctxt->sax->endElement(ctxt->userData, name);
|
||||||
|
}
|
||||||
htmlnamePop(ctxt);
|
htmlnamePop(ctxt);
|
||||||
return(0);
|
return(0);
|
||||||
}
|
}
|
||||||
@ -5208,8 +5235,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|||||||
if ((CUR == '/') && (NXT(1) == '>')) {
|
if ((CUR == '/') && (NXT(1) == '>')) {
|
||||||
SKIP(2);
|
SKIP(2);
|
||||||
htmlParserFinishElementParsing(ctxt);
|
htmlParserFinishElementParsing(ctxt);
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
|
||||||
ctxt->sax->endElement(ctxt->userData, name);
|
if ((ctxt->sax != NULL) &&
|
||||||
|
(ctxt->sax->endElement != NULL))
|
||||||
|
ctxt->sax->endElement(ctxt->userData, name);
|
||||||
|
}
|
||||||
htmlnamePop(ctxt);
|
htmlnamePop(ctxt);
|
||||||
ctxt->instate = XML_PARSER_CONTENT;
|
ctxt->instate = XML_PARSER_CONTENT;
|
||||||
break;
|
break;
|
||||||
@ -5243,8 +5273,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|||||||
*/
|
*/
|
||||||
if ((info != NULL) && (info->empty)) {
|
if ((info != NULL) && (info->empty)) {
|
||||||
htmlParserFinishElementParsing(ctxt);
|
htmlParserFinishElementParsing(ctxt);
|
||||||
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
|
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
|
||||||
ctxt->sax->endElement(ctxt->userData, name);
|
if ((ctxt->sax != NULL) &&
|
||||||
|
(ctxt->sax->endElement != NULL))
|
||||||
|
ctxt->sax->endElement(ctxt->userData, name);
|
||||||
|
}
|
||||||
htmlnamePop(ctxt);
|
htmlnamePop(ctxt);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -6032,6 +6065,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
|
|||||||
ctxt->options |= HTML_PARSE_NOIMPLIED;
|
ctxt->options |= HTML_PARSE_NOIMPLIED;
|
||||||
options -= HTML_PARSE_NOIMPLIED;
|
options -= HTML_PARSE_NOIMPLIED;
|
||||||
}
|
}
|
||||||
|
if (options & HTML_PARSE_HTML5) {
|
||||||
|
ctxt->options |= HTML_PARSE_HTML5;
|
||||||
|
options -= HTML_PARSE_HTML5;
|
||||||
|
}
|
||||||
ctxt->dictNames = 0;
|
ctxt->dictNames = 0;
|
||||||
ctxt->linenumbers = 1;
|
ctxt->linenumbers = 1;
|
||||||
return (options);
|
return (options);
|
||||||
|
@ -206,6 +206,7 @@ XMLPUBFUN void
|
|||||||
*/
|
*/
|
||||||
typedef enum {
|
typedef enum {
|
||||||
HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */
|
HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */
|
||||||
|
HTML_PARSE_HTML5 = 1<<1, /* HTML5 support */
|
||||||
HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */
|
HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */
|
||||||
HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
|
HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
|
||||||
HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
|
HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
|
||||||
|
Reference in New Issue
Block a user