Fix recovery from invalid HTML start tags

Only try to parse a start tag if there's a '<' followed by an ASCII letter. This is more in line with HTML5 and the old behavior in recovery mode. Emit a literal '<' if the following character is invalid. Fixes #101. Fixes #339.
2025-10-24 13:33:01 +03:00 · 2022-02-22 18:15:53 +01:00
parent b057239b3f
commit 4fd69f3e27
1 changed files with 21 additions and 23 deletions
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -3960,26 +3960,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
 	             "htmlParseStartTag: invalid element name\n",
 		     NULL, NULL);
-        /*
-         * The recovery code is disabled for now as it can result in
-         * quadratic behavior with the push parser. htmlParseStartTag
-         * must consume all content up to the final '>' in order to avoid
-         * rescanning for this terminator.
-         *
-         * For a proper fix in line with HTML5, htmlParseStartTag and
-         * htmlParseElement should only be called when there's an ASCII
-         * alpha character following the initial '<'. Otherwise, the '<'
-         * should be emitted as text (unless followed by '!', '/' or '?').
-         */
-#if 0
-	/* if recover preserve text on classic misconstructs */
-	if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
-	    (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
-	    htmlParseCharDataInternal(ctxt, '<');
-	    return(-1);
-	}
-#endif
-
 	/* Dump the bogus tag like browsers do */
 	while ((CUR != 0) && (CUR != '>') &&
               (ctxt->instate != XML_PARSER_EOF))
@@ -4432,9 +4412,15 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
 	    /*
 	     * Third case :  a sub-element.
 	     */
-	    else if (CUR == '<') {
+	    else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
 		htmlParseElement(ctxt);
 	    }
+	    else if (CUR == '<') {
+                if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
+                    (ctxt->sax->characters != NULL))
+                    ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
+                NEXT;
+	    }

 	    /*
 	     * Fourth case : a reference. If if has not been resolved,
@@ -4831,13 +4817,19 @@ htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
 	    /*
 	     * Third case :  a sub-element.
 	     */
-	    else if (CUR == '<') {
+	    else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
 		htmlParseElementInternal(ctxt);
 		if (currentNode != NULL) xmlFree(currentNode);

 		currentNode = xmlStrdup(ctxt->name);
 		depth = ctxt->nameNr;
 	    }
+	    else if (CUR == '<') {
+                if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
+                    (ctxt->sax->characters != NULL))
+                    ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
+                NEXT;
+            }

 	    /*
 	     * Fourth case : a reference. If if has not been resolved,
@@ -6004,7 +5996,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 				"HPP: entering END_TAG\n");
 #endif
 			break;
-		    } else if (cur == '<') {
+		    } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
                        if ((!terminate) && (next == 0))
                            goto done;
                        ctxt->instate = XML_PARSER_START_TAG;
@@ -6014,6 +6006,12 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                                "HPP: entering START_TAG\n");
 #endif
 			break;
+		    } else if (cur == '<') {
+                        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
+                            (ctxt->sax->characters != NULL))
+			    ctxt->sax->characters(ctxt->userData,
+						  BAD_CAST "<", 1);
+                        NEXT;
 		    } else {
 		        /*
 			 * check that the text sequence is complete