mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-04 08:02:34 +03:00
Fix recovery from invalid HTML start tags
Only try to parse a start tag if there's a '<' followed by an ASCII letter. This is more in line with HTML5 and with the old behavior in recovery mode. Emit a literal '<' as text if the following character cannot start a tag name, i.e. is not an ASCII letter. Fixes #101. Fixes #339.
This commit is contained in:
44
HTMLparser.c
44
HTMLparser.c
@ -3960,26 +3960,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
|||||||
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
|
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
|
||||||
"htmlParseStartTag: invalid element name\n",
|
"htmlParseStartTag: invalid element name\n",
|
||||||
NULL, NULL);
|
NULL, NULL);
|
||||||
/*
|
|
||||||
* The recovery code is disabled for now as it can result in
|
|
||||||
* quadratic behavior with the push parser. htmlParseStartTag
|
|
||||||
* must consume all content up to the final '>' in order to avoid
|
|
||||||
* rescanning for this terminator.
|
|
||||||
*
|
|
||||||
* For a proper fix in line with HTML5, htmlParseStartTag and
|
|
||||||
* htmlParseElement should only be called when there's an ASCII
|
|
||||||
* alpha character following the initial '<'. Otherwise, the '<'
|
|
||||||
* should be emitted as text (unless followed by '!', '/' or '?').
|
|
||||||
*/
|
|
||||||
#if 0
|
|
||||||
/* if recover preserve text on classic misconstructs */
|
|
||||||
if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
|
|
||||||
(CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
|
|
||||||
htmlParseCharDataInternal(ctxt, '<');
|
|
||||||
return(-1);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Dump the bogus tag like browsers do */
|
/* Dump the bogus tag like browsers do */
|
||||||
while ((CUR != 0) && (CUR != '>') &&
|
while ((CUR != 0) && (CUR != '>') &&
|
||||||
(ctxt->instate != XML_PARSER_EOF))
|
(ctxt->instate != XML_PARSER_EOF))
|
||||||
@ -4432,9 +4412,15 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
|
|||||||
/*
|
/*
|
||||||
* Third case : a sub-element.
|
* Third case : a sub-element.
|
||||||
*/
|
*/
|
||||||
else if (CUR == '<') {
|
else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
|
||||||
htmlParseElement(ctxt);
|
htmlParseElement(ctxt);
|
||||||
}
|
}
|
||||||
|
else if (CUR == '<') {
|
||||||
|
if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
|
||||||
|
(ctxt->sax->characters != NULL))
|
||||||
|
ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
|
||||||
|
NEXT;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Fourth case : a reference. If it has not been resolved,
|
* Fourth case : a reference. If it has not been resolved,
|
||||||
@ -4831,13 +4817,19 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
|
|||||||
/*
|
/*
|
||||||
* Third case : a sub-element.
|
* Third case : a sub-element.
|
||||||
*/
|
*/
|
||||||
else if (CUR == '<') {
|
else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
|
||||||
htmlParseElementInternal(ctxt);
|
htmlParseElementInternal(ctxt);
|
||||||
if (currentNode != NULL) xmlFree(currentNode);
|
if (currentNode != NULL) xmlFree(currentNode);
|
||||||
|
|
||||||
currentNode = xmlStrdup(ctxt->name);
|
currentNode = xmlStrdup(ctxt->name);
|
||||||
depth = ctxt->nameNr;
|
depth = ctxt->nameNr;
|
||||||
}
|
}
|
||||||
|
else if (CUR == '<') {
|
||||||
|
if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
|
||||||
|
(ctxt->sax->characters != NULL))
|
||||||
|
ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
|
||||||
|
NEXT;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Fourth case : a reference. If it has not been resolved,
|
* Fourth case : a reference. If it has not been resolved,
|
||||||
@ -6004,7 +5996,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|||||||
"HPP: entering END_TAG\n");
|
"HPP: entering END_TAG\n");
|
||||||
#endif
|
#endif
|
||||||
break;
|
break;
|
||||||
} else if (cur == '<') {
|
} else if ((cur == '<') && IS_ASCII_LETTER(next)) {
|
||||||
if ((!terminate) && (next == 0))
|
if ((!terminate) && (next == 0))
|
||||||
goto done;
|
goto done;
|
||||||
ctxt->instate = XML_PARSER_START_TAG;
|
ctxt->instate = XML_PARSER_START_TAG;
|
||||||
@ -6014,6 +6006,12 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
|
|||||||
"HPP: entering START_TAG\n");
|
"HPP: entering START_TAG\n");
|
||||||
#endif
|
#endif
|
||||||
break;
|
break;
|
||||||
|
} else if (cur == '<') {
|
||||||
|
if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
|
||||||
|
(ctxt->sax->characters != NULL))
|
||||||
|
ctxt->sax->characters(ctxt->userData,
|
||||||
|
BAD_CAST "<", 1);
|
||||||
|
NEXT;
|
||||||
} else {
|
} else {
|
||||||
/*
|
/*
|
||||||
* check that the text sequence is complete
|
* check that the text sequence is complete
|
||||||
|
Reference in New Issue
Block a user