1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-08 23:22:04 +03:00

html: Add HTML5 parser option

This option passes tokenizer output directly to the SAX callbacks,
making it possible to test the tokenizer against the html5lib test
suite.

This will produce unbalanced calls to the startElement and endElement
callbacks, but it's the only way to support a SAX-like interface for
HTML5. It can be used for filtering or rewriting HTML5, for example.

An HTML5 tree builder could then be implemented on top of the SAX
callbacks.
This commit is contained in:
Nick Wellnhofer
2024-09-08 20:40:36 +02:00
parent 17da54c522
commit e062a4a9b3
2 changed files with 47 additions and 9 deletions

View File

@ -1522,6 +1522,9 @@ htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
const htmlElemDesc *info;
int i, priority;
if (ctxt->options & HTML_PARSE_HTML5)
return;
priority = htmlGetEndPriority(newtag);
for (i = (ctxt->nameNr - 1); i >= 0; i--) {
@ -1565,6 +1568,9 @@ htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
{
int i;
if (ctxt->options & HTML_PARSE_HTML5)
return;
if (ctxt->nameNr == 0)
return;
for (i = (ctxt->nameNr - 1); i >= 0; i--) {
@ -1590,6 +1596,9 @@ htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
static void
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
{
if (ctxt->options & HTML_PARSE_HTML5)
return;
if (newtag == NULL)
return;
@ -1667,7 +1676,7 @@ static void
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
int i;
if (ctxt->options & HTML_PARSE_NOIMPLIED)
if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
return;
if (!htmlOmittedDefaultValue)
return;
@ -1738,6 +1747,9 @@ htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
if (ctxt == NULL)
return(-1);
if (ctxt->options & HTML_PARSE_HTML5)
return(0);
tag = ctxt->name;
if (tag == NULL) {
htmlAutoClose(ctxt, BAD_CAST"p");
@ -3893,6 +3905,11 @@ failed:
* SAX: Start of Element !
*/
if (!discardtag) {
if (ctxt->options & HTML_PARSE_HTML5) {
if (ctxt->nameNr > 0)
htmlnamePop(ctxt);
}
htmlnamePush(ctxt, name);
if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
if (nbatts != 0)
@ -3978,6 +3995,12 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
"End tag : expected '>'\n", NULL, NULL);
}
if (ctxt->options & HTML_PARSE_HTML5) {
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
return(0);
}
/*
* if we ignored misplaced tags in htmlParseStartTag don't pop them
* out now.
@ -4217,8 +4240,10 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
if ((CUR == '/') && (NXT(1) == '>')) {
SKIP(2);
htmlParserFinishElementParsing(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
}
htmlnamePop(ctxt);
return(0);
}
@ -4245,8 +4270,10 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
*/
if ((info != NULL) && (info->empty)) {
htmlParserFinishElementParsing(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
}
htmlnamePop(ctxt);
return(0);
}
@ -5208,8 +5235,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
if ((CUR == '/') && (NXT(1) == '>')) {
SKIP(2);
htmlParserFinishElementParsing(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
if ((ctxt->sax != NULL) &&
(ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
}
htmlnamePop(ctxt);
ctxt->instate = XML_PARSER_CONTENT;
break;
@ -5243,8 +5273,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
*/
if ((info != NULL) && (info->empty)) {
htmlParserFinishElementParsing(ctxt);
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
if ((ctxt->sax != NULL) &&
(ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
}
htmlnamePop(ctxt);
}
@ -6032,6 +6065,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
ctxt->options |= HTML_PARSE_NOIMPLIED;
options -= HTML_PARSE_NOIMPLIED;
}
if (options & HTML_PARSE_HTML5) {
ctxt->options |= HTML_PARSE_HTML5;
options -= HTML_PARSE_HTML5;
}
ctxt->dictNames = 0;
ctxt->linenumbers = 1;
return (options);