mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-29 11:41:22 +03:00
Different approach to fix quadratic behavior in HTML push parser
The old approach introduced a regression (see issue #312 and the previous commit). Disable code that tries to recover from invalid start tags. This only affects "recovery" mode. Add a comment outlining a better fix in accordance with the HTML5 spec.
This commit is contained in:
14
HTMLparser.c
14
HTMLparser.c
@@ -3958,13 +3958,25 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
||||
htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
|
||||
"htmlParseStartTag: invalid element name\n",
|
||||
NULL, NULL);
|
||||
/*
|
||||
* The recovery code is disabled for now as it can result in
|
||||
* quadratic behavior with the push parser. htmlParseStartTag
|
||||
* must consume all content up to the final '>' in order to avoid
|
||||
* rescanning for this terminator.
|
||||
*
|
||||
* For a proper fix in line with HTML5, htmlParseStartTag and
|
||||
* htmlParseElement should only be called when there's an ASCII
|
||||
* alpha character following the initial '<'. Otherwise, the '<'
|
||||
* should be emitted as text (unless followed by '!', '/' or '?').
|
||||
*/
|
||||
#if 0
|
||||
/* if recover preserve text on classic misconstructs */
|
||||
if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
|
||||
(CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
|
||||
htmlParseCharDataInternal(ctxt, '<');
|
||||
return(-1);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
/* Dump the bogus tag like browsers do */
|
||||
while ((CUR != 0) && (CUR != '>') &&
|
||||
|
Reference in New Issue
Block a user