From e062a4a9b3a8ebd3be35c8582c2eb4b6c161d61c Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Sun, 8 Sep 2024 20:40:36 +0200 Subject: [PATCH] html: Add HTML5 parser option This option passes tokenizer output directly to the SAX callbacks, making it possible to test the tokenizer against the html5lib test suite. This will produce unbalanced calls to the startElement and endElement callbacks, but it's the only way to support a SAX-like interface for HTML5. It can be used for filtering or rewriting HTML5, for example. An HTML5 tree builder could then be implemented on top of the SAX callbacks. --- HTMLparser.c | 55 +++++++++++++++++++++++++++++++------ include/libxml/HTMLparser.h | 1 + 2 files changed, 47 insertions(+), 9 deletions(-) diff --git a/HTMLparser.c b/HTMLparser.c index 58d8ff92..7b407803 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -1522,6 +1522,9 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) const htmlElemDesc *info; int i, priority; + if (ctxt->options & HTML_PARSE_HTML5) + return; + priority = htmlGetEndPriority(newtag); for (i = (ctxt->nameNr - 1); i >= 0; i--) { @@ -1565,6 +1568,9 @@ htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) { int i; + if (ctxt->options & HTML_PARSE_HTML5) + return; + if (ctxt->nameNr == 0) return; for (i = (ctxt->nameNr - 1); i >= 0; i--) { @@ -1590,6 +1596,9 @@ htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt) static void htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag) { + if (ctxt->options & HTML_PARSE_HTML5) + return; + if (newtag == NULL) return; @@ -1667,7 +1676,7 @@ static void htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) { int i; - if (ctxt->options & HTML_PARSE_NOIMPLIED) + if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5)) return; if (!htmlOmittedDefaultValue) return; @@ -1738,6 +1747,9 @@ htmlCheckParagraph(htmlParserCtxtPtr ctxt) { if (ctxt == NULL) return(-1); + if (ctxt->options & HTML_PARSE_HTML5) + return(0); + tag = ctxt->name; if (tag == NULL) { 
htmlAutoClose(ctxt, BAD_CAST"p"); @@ -3893,6 +3905,11 @@ failed: * SAX: Start of Element ! */ if (!discardtag) { + if (ctxt->options & HTML_PARSE_HTML5) { + if (ctxt->nameNr > 0) + htmlnamePop(ctxt); + } + htmlnamePush(ctxt, name); if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) { if (nbatts != 0) @@ -3978,6 +3995,12 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) "End tag : expected '>'\n", NULL, NULL); } + if (ctxt->options & HTML_PARSE_HTML5) { + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, name); + return(0); + } + /* * if we ignored misplaced tags in htmlParseStartTag don't pop them * out now. @@ -4217,8 +4240,10 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) { if ((CUR == '/') && (NXT(1) == '>')) { SKIP(2); htmlParserFinishElementParsing(ctxt); - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, name); + if ((ctxt->options & HTML_PARSE_HTML5) == 0) { + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, name); + } htmlnamePop(ctxt); return(0); } @@ -4245,8 +4270,10 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) { */ if ((info != NULL) && (info->empty)) { htmlParserFinishElementParsing(ctxt); - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, name); + if ((ctxt->options & HTML_PARSE_HTML5) == 0) { + if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, name); + } htmlnamePop(ctxt); return(0); } @@ -5208,8 +5235,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if ((CUR == '/') && (NXT(1) == '>')) { SKIP(2); htmlParserFinishElementParsing(ctxt); - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, name); + if ((ctxt->options & HTML_PARSE_HTML5) == 0) { + if ((ctxt->sax != NULL) && + (ctxt->sax->endElement != NULL)) + 
ctxt->sax->endElement(ctxt->userData, name); + } htmlnamePop(ctxt); ctxt->instate = XML_PARSER_CONTENT; break; @@ -5243,8 +5273,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { */ if ((info != NULL) && (info->empty)) { htmlParserFinishElementParsing(ctxt); - if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL)) - ctxt->sax->endElement(ctxt->userData, name); + if ((ctxt->options & HTML_PARSE_HTML5) == 0) { + if ((ctxt->sax != NULL) && + (ctxt->sax->endElement != NULL)) + ctxt->sax->endElement(ctxt->userData, name); + } htmlnamePop(ctxt); } @@ -6032,6 +6065,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) ctxt->options |= HTML_PARSE_NOIMPLIED; options -= HTML_PARSE_NOIMPLIED; } + if (options & HTML_PARSE_HTML5) { + ctxt->options |= HTML_PARSE_HTML5; + options -= HTML_PARSE_HTML5; + } ctxt->dictNames = 0; ctxt->linenumbers = 1; return (options); diff --git a/include/libxml/HTMLparser.h b/include/libxml/HTMLparser.h index 08d7d0ae..4e73e5ef 100644 --- a/include/libxml/HTMLparser.h +++ b/include/libxml/HTMLparser.h @@ -206,6 +206,7 @@ XMLPUBFUN void */ typedef enum { HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ + HTML_PARSE_HTML5 = 1<<1, /* HTML5 support */ HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */ HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */