1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-29 11:41:22 +03:00

Skip incorrectly opened HTML comments

Commit 4fd69f3e fixed handling of '<' characters not followed by an
ASCII letter. But a '<!' sequence followed by invalid characters should
be treated as a bogus comment and skipped.

Fixes #380.
This commit is contained in:
Nick Wellnhofer
2022-07-15 14:02:26 +02:00
parent 6722d22c88
commit e986d09cf5
6 changed files with 123 additions and 60 deletions

View File

@ -2545,6 +2545,21 @@ htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt); static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
/**
 * htmlSkipBogusComment:
 * @ctxt:  an HTML parser context
 *
 * Report an XML_HTML_INCORRECTLY_OPENED_COMMENT error, then consume
 * input from the current position up to and including the next '>'
 * character. Scanning also stops, without consuming anything further,
 * as soon as the current character is 0 (presumably the end of the
 * available input -- confirm against the CUR macro).
 */
static void
htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
    int ch;

    htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
                 "Incorrectly opened comment\n", NULL, NULL);

    for (;;) {
        ch = CUR;
        if (ch == 0)
            break;
        /* Advance before testing so the terminating '>' is consumed too. */
        NEXT;
        if (ch == '>')
            break;
    }
}
/** /**
* htmlParseHTMLName: * htmlParseHTMLName:
* @ctxt: an HTML parser context * @ctxt: an HTML parser context
@ -4380,26 +4395,28 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
htmlParseScript(ctxt); htmlParseScript(ctxt);
} }
/* else if ((CUR == '<') && (NXT(1) == '!')) {
* Sometimes DOCTYPE arrives in the middle of the document /*
*/ * Sometimes DOCTYPE arrives in the middle of the document
else if ((CUR == '<') && (NXT(1) == '!') && */
(UPP(2) == 'D') && (UPP(3) == 'O') && if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') && (UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) { (UPP(8) == 'E')) {
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"Misplaced DOCTYPE declaration\n", "Misplaced DOCTYPE declaration\n",
BAD_CAST "DOCTYPE" , NULL); BAD_CAST "DOCTYPE" , NULL);
htmlParseDocTypeDecl(ctxt); htmlParseDocTypeDecl(ctxt);
} }
/*
/* * First case : a comment
* First case : a comment */
*/ else if ((NXT(2) == '-') && (NXT(3) == '-')) {
else if ((CUR == '<') && (NXT(1) == '!') && htmlParseComment(ctxt);
(NXT(2) == '-') && (NXT(3) == '-')) { }
htmlParseComment(ctxt); else {
htmlSkipBogusComment(ctxt);
}
} }
/* /*
@ -4785,26 +4802,28 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
htmlParseScript(ctxt); htmlParseScript(ctxt);
} }
/* else if ((CUR == '<') && (NXT(1) == '!')) {
* Sometimes DOCTYPE arrives in the middle of the document /*
*/ * Sometimes DOCTYPE arrives in the middle of the document
else if ((CUR == '<') && (NXT(1) == '!') && */
(UPP(2) == 'D') && (UPP(3) == 'O') && if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') && (UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') && (UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) { (UPP(8) == 'E')) {
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
"Misplaced DOCTYPE declaration\n", "Misplaced DOCTYPE declaration\n",
BAD_CAST "DOCTYPE" , NULL); BAD_CAST "DOCTYPE" , NULL);
htmlParseDocTypeDecl(ctxt); htmlParseDocTypeDecl(ctxt);
} }
/*
/* * First case : a comment
* First case : a comment */
*/ else if ((NXT(2) == '-') && (NXT(3) == '-')) {
else if ((CUR == '<') && (NXT(1) == '!') && htmlParseComment(ctxt);
(NXT(2) == '-') && (NXT(3) == '-')) { }
htmlParseComment(ctxt); else {
htmlSkipBogusComment(ctxt);
}
} }
/* /*
@ -5949,31 +5968,37 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
#endif #endif
break; break;
} }
} else if ((cur == '<') && (next == '!') && } else if ((cur == '<') && (next == '!')) {
(UPP(2) == 'D') && (UPP(3) == 'O') &&
(UPP(4) == 'C') && (UPP(5) == 'T') &&
(UPP(6) == 'Y') && (UPP(7) == 'P') &&
(UPP(8) == 'E')) {
/* /*
* Sometimes DOCTYPE arrives in the middle of the document * Sometimes DOCTYPE arrives in the middle of the document
*/ */
if ((!terminate) && if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0)) (UPP(4) == 'C') && (UPP(5) == 'T') &&
goto done; (UPP(6) == 'Y') && (UPP(7) == 'P') &&
htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR, (UPP(8) == 'E')) {
"Misplaced DOCTYPE declaration\n", if ((!terminate) &&
BAD_CAST "DOCTYPE" , NULL); (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
htmlParseDocTypeDecl(ctxt); goto done;
} else if ((cur == '<') && (next == '!') && htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
(in->cur[2] == '-') && (in->cur[3] == '-')) { "Misplaced DOCTYPE declaration\n",
if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0)) BAD_CAST "DOCTYPE" , NULL);
goto done; htmlParseDocTypeDecl(ctxt);
} else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) &&
(htmlParseLookupCommentEnd(ctxt) < 0))
goto done;
#ifdef DEBUG_PUSH #ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
"HPP: Parsing Comment\n"); "HPP: Parsing Comment\n");
#endif #endif
htmlParseComment(ctxt); htmlParseComment(ctxt);
ctxt->instate = XML_PARSER_CONTENT; ctxt->instate = XML_PARSER_CONTENT;
} else {
if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
goto done;
htmlSkipBogusComment(ctxt);
}
} else if ((cur == '<') && (next == '?')) { } else if ((cur == '<') && (next == '?')) {
if ((!terminate) && if ((!terminate) &&
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0)) (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))

View File

@ -260,6 +260,7 @@ typedef enum {
XML_DTD_DUP_TOKEN, /* 541 */ XML_DTD_DUP_TOKEN, /* 541 */
XML_HTML_STRUCURE_ERROR = 800, XML_HTML_STRUCURE_ERROR = 800,
XML_HTML_UNKNOWN_TAG, /* 801 */ XML_HTML_UNKNOWN_TAG, /* 801 */
XML_HTML_INCORRECTLY_OPENED_COMMENT, /* 802 */
XML_RNGP_ANYNAME_ATTR_ANCESTOR = 1000, XML_RNGP_ANYNAME_ATTR_ANCESTOR = 1000,
XML_RNGP_ATTR_CONFLICT, /* 1001 */ XML_RNGP_ATTR_CONFLICT, /* 1001 */
XML_RNGP_ATTRIBUTE_CHILDREN, /* 1002 */ XML_RNGP_ATTRIBUTE_CHILDREN, /* 1002 */

View File

@ -0,0 +1,6 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<body>
...
</body>
</html>

View File

@ -0,0 +1,6 @@
./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment
<![if !supportLists]>...<![endif]>
^
./test/HTML/issue380.html:3: HTML parser error : Incorrectly opened comment
<![if !supportLists]>...<![endif]>
^

View File

@ -0,0 +1,20 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.startElement(html)
SAX.characters(
, 3)
SAX.startElement(body)
SAX.characters(
, 5)
SAX.error: Incorrectly opened comment
SAX.characters(..., 3)
SAX.error: Incorrectly opened comment
SAX.characters(
, 3)
SAX.endElement(body)
SAX.characters(
, 1)
SAX.endElement(html)
SAX.characters(
, 1)
SAX.endDocument()

5
test/HTML/issue380.html Normal file
View File

@ -0,0 +1,5 @@
<html>
<body>
<![if !supportLists]>...<![endif]>
</body>
</html>