From e179f3ec0ef3238ca2e23693cdbc271c7480998f Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer <wellnhofer@aevum.de>
Date: Wed, 11 Sep 2024 17:29:59 +0200
Subject: [PATCH] html: Stop reporting syntax errors

It doesn't make much sense to keep the old syntax error handling which
doesn't conform to HTML5.

Handling HTML5 parser errors is rather involved and not essential for
parsers.
---
 HTMLparser.c                 | 167 +++++++++++------------------------
 python/tests/pushSAXhtml.py  |   3 +-
 result/HTML/names.html.err   |   3 -
 result/HTML/names.html.sax   |   1 -
 result/HTML/utf8bug.html.err |   3 -
 result/HTML/utf8bug.html.sax |   1 -
 result/HTML/wired.html.err   |   3 -
 result/HTML/wired.html.sax   |   1 -
 8 files changed, 53 insertions(+), 129 deletions(-)
 delete mode 100644 result/HTML/names.html.err
 delete mode 100644 result/HTML/utf8bug.html.err
diff --git a/HTMLparser.c b/HTMLparser.c
index bca44958..c111af5d 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2958,28 +2958,18 @@ htmlParseAttValue(htmlParserCtxtPtr ctxt) {
     if (CUR == '"') {
         SKIP(1);
 	ret = htmlParseHTMLAttribute(ctxt, '"');
-        if (CUR != '"') {
-	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
-	                 "AttValue: \" expected\n", NULL, NULL);
-	} else
+        if (CUR == '"')
 	    SKIP(1);
     } else if (CUR == '\'') {
         SKIP(1);
 	ret = htmlParseHTMLAttribute(ctxt, '\'');
-        if (CUR != '\'') {
-	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
-	                 "AttValue: ' expected\n", NULL, NULL);
-	} else
+        if (CUR == '\'')
 	    SKIP(1);
     } else {
         /*
 	 * That's an HTMLism, the attribute value may not be quoted
 	 */
 	ret = htmlParseHTMLAttribute(ctxt, 0);
-	if (ret == NULL) {
-	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
-	                 "AttValue: no value found\n", NULL, NULL);
-	}
     }
     return(ret);
 }
@@ -3561,11 +3551,8 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
 
     *value = NULL;
     name = htmlParseHTMLName(ctxt, 1);
-    if (name == NULL) {
-	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
-	             "error parsing attribute name\n", NULL, NULL);
+    if (name == NULL)
         return(NULL);
-    }
 
     /*
      * read the value
@@ -3702,55 +3689,53 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 
     GROW;
     name = htmlParseHTMLName(ctxt, 0);
-    if (name == NULL) {
-	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
-	             "htmlParseStartTag: invalid element name\n",
-		     NULL, NULL);
+    if (name == NULL)
         return -1;
-    }
     if (xmlStrEqual(name, BAD_CAST"meta"))
 	meta = 1;
 
-    /*
-     * Check for auto-closure of HTML elements.
-     */
-    htmlAutoClose(ctxt, name);
+    if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
+        /*
+         * Check for auto-closure of HTML elements.
+         */
+        htmlAutoClose(ctxt, name);
 
-    /*
-     * Check for implied HTML elements.
-     */
-    htmlCheckImplied(ctxt, name);
+        /*
+         * Check for implied HTML elements.
+         */
+        htmlCheckImplied(ctxt, name);
 
-    /*
-     * Avoid html at any level > 0, head at any level != 1
-     * or any attempt to recurse body
-     */
-    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
-	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
-	             "htmlParseStartTag: misplaced <html> tag\n",
-		     name, NULL);
-	discardtag = 1;
-	ctxt->depth++;
-    }
-    if ((ctxt->nameNr != 1) &&
-	(xmlStrEqual(name, BAD_CAST"head"))) {
-	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
-	             "htmlParseStartTag: misplaced <head> tag\n",
-		     name, NULL);
-	discardtag = 1;
-	ctxt->depth++;
-    }
-    if (xmlStrEqual(name, BAD_CAST"body")) {
-	int indx;
-	for (indx = 0;indx < ctxt->nameNr;indx++) {
-	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
-		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
-		             "htmlParseStartTag: misplaced <body> tag\n",
-			     name, NULL);
-		discardtag = 1;
-		ctxt->depth++;
-	    }
-	}
+        /*
+         * Avoid html at any level > 0, head at any level != 1
+         * or any attempt to recurse body
+         */
+        if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
+            htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+                         "htmlParseStartTag: misplaced <html> tag\n",
+                         name, NULL);
+            discardtag = 1;
+            ctxt->depth++;
+        }
+        if ((ctxt->nameNr != 1) &&
+            (xmlStrEqual(name, BAD_CAST"head"))) {
+            htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+                         "htmlParseStartTag: misplaced <head> tag\n",
+                         name, NULL);
+            discardtag = 1;
+            ctxt->depth++;
+        }
+        if (xmlStrEqual(name, BAD_CAST"body")) {
+            int indx;
+            for (indx = 0;indx < ctxt->nameNr;indx++) {
+                if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
+                    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
+                                 "htmlParseStartTag: misplaced <body> tag\n",
+                                 name, NULL);
+                    discardtag = 1;
+                    ctxt->depth++;
+                }
+            }
+        }
     }
 
     /*
@@ -3778,8 +3763,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 	     */
 	    for (i = 0; i < nbatts;i += 2) {
 	        if (xmlStrEqual(atts[i], attname)) {
-		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
-		                 "Attribute %s redefined\n", attname, NULL);
 		    if (attvalue != NULL)
 			xmlFree(attvalue);
 		    goto failed;
@@ -3894,8 +3877,6 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
     int i, ret;
 
     if ((CUR != '<') || (NXT(1) != '/')) {
-        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
-	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
         return (0);
     }
     SKIP(2);
@@ -4177,12 +4158,8 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
      * Lookup the info for that element.
      */
     info = htmlTagLookup(name);
-    if (info == NULL) {
-	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
-	             "Tag %s invalid\n", name, NULL);
-    } else {
+    if (info != NULL)
         ctxt->endCheckState = info->dataMode;
-    }
 
     if (ctxt->record_info)
         htmlNodeInfoPush(ctxt, &node_info);
@@ -4201,22 +4178,9 @@ htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
 	return(0);
     }
 
-    if (CUR == '>') {
-        SKIP(1);
-    } else {
-	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
-	             "Couldn't find end of Start Tag %s\n", name, NULL);
-
-	/*
-	 * end of parsing of this node.
-	 */
-	if (xmlStrEqual(name, ctxt->name)) {
-            htmlParserFinishElementParsing(ctxt);
-	    nodePop(ctxt);
-	    htmlnamePop(ctxt);
-	}
-	return(0);
-    }
+    if (CUR != '>')
+        return(0);
+    SKIP(1);
 
     /*
      * Check for an Empty Element from DTD definition
@@ -4358,10 +4322,6 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
      * Wipe out everything which is before the first '<'
      */
     SKIP_BLANKS;
-    if (ctxt->input->cur >= ctxt->input->end) {
-	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
-	             "Document is empty\n", NULL, NULL);
-    }
 
     if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
 	ctxt->sax->startDocument(ctxt->userData);
@@ -5018,12 +4978,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		 * Lookup the info for that element.
 		 */
 		info = htmlTagLookup(name);
-		if (info == NULL) {
-		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
-		                 "Tag %s invalid\n", name, NULL);
-                } else {
+		if (info != NULL)
                     ctxt->endCheckState = info->dataMode;
-		}
 
 		/*
 		 * Check for an Empty Element labeled the XML/SGML way
@@ -5041,28 +4997,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		    break;
 		}
 
-		if (CUR == '>') {
-		    SKIP(1);
-		} else {
-		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
-		                 "Couldn't find end of Start Tag %s\n",
-				 name, NULL);
-
-		    /*
-		     * end of parsing of this node.
-		     */
-		    if (xmlStrEqual(name, ctxt->name)) {
-                        htmlParserFinishElementParsing(ctxt);
-			nodePop(ctxt);
-			htmlnamePop(ctxt);
-		    }
-
-		    if (ctxt->record_info)
-		        htmlNodeInfoPush(ctxt, &node_info);
-
-		    ctxt->instate = XML_PARSER_CONTENT;
-		    break;
-		}
+		if (CUR != '>')
+                    break;
+		SKIP(1);
 
 		/*
 		 * Check for an Empty Element from DTD definition
diff --git a/python/tests/pushSAXhtml.py b/python/tests/pushSAXhtml.py
index c32cd3e7..d9c3cc84 100755
--- a/python/tests/pushSAXhtml.py
+++ b/python/tests/pushSAXhtml.py
@@ -50,8 +50,7 @@ chunk = "ar</foo>"
 ctxt.htmlParseChunk(chunk, len(chunk), 1)
 ctxt=None
 
-reference = """startDocument:startElement html None:startElement body None:startElement foo {'url': 'tst'}:error: Tag foo invalid
-:characters: bar:endElement foo:endElement body:endElement html:endDocument:"""
+reference = """startDocument:startElement html None:startElement body None:startElement foo {'url': 'tst'}:characters: bar:endElement foo:endElement body:endElement html:endDocument:"""
 if log != reference:
     print("Error got: %s" % log)
     print("Exprected: %s" % reference)
diff --git a/result/HTML/names.html.err b/result/HTML/names.html.err
deleted file mode 100644
index 4d91a5d2..00000000
--- a/result/HTML/names.html.err
+++ /dev/null
@@ -1,3 +0,0 @@
-./test/HTML/names.html:3: HTML parser error : Tag o:p invalid
-  <o:p></o:p>
-      ^
diff --git a/result/HTML/names.html.sax b/result/HTML/names.html.sax
index 12a107f8..7810c498 100644
--- a/result/HTML/names.html.sax
+++ b/result/HTML/names.html.sax
@@ -7,7 +7,6 @@ SAX.startElement(body)
 SAX.characters(
   , 3)
 SAX.startElement(o:p)
-SAX.error: Tag o:p invalid
 SAX.endElement(o:p)
 SAX.characters(
 , 1)
diff --git a/result/HTML/utf8bug.html.err b/result/HTML/utf8bug.html.err
deleted file mode 100644
index 55f30ae8..00000000
--- a/result/HTML/utf8bug.html.err
+++ /dev/null
@@ -1,3 +0,0 @@
-./test/HTML/utf8bug.html:121: HTML parser error : Tag s1 invalid
-ز همکاران است. روی آن کلیک کند.</FONT></FONT></STRONG><S1
-                                                                               ^
diff --git a/result/HTML/utf8bug.html.sax b/result/HTML/utf8bug.html.sax
index fb3d3621..a279b14a 100644
--- a/result/HTML/utf8bug.html.sax
+++ b/result/HTML/utf8bug.html.sax
@@ -422,7 +422,6 @@ SAX.endElement(font)
 SAX.endElement(font)
 SAX.endElement(strong)
 SAX.startElement(s1)
-SAX.error: Tag s1 invalid
 SAX.characters(  , 2)
 SAX.endElement(s1)
 SAX.endElement(div)
diff --git a/result/HTML/wired.html.err b/result/HTML/wired.html.err
index b19feb07..5cce2b0e 100644
--- a/result/HTML/wired.html.err
+++ b/result/HTML/wired.html.err
@@ -1,6 +1,3 @@
-./test/HTML/wired.html:25: HTML parser error : Tag nobr invalid
-<td bgcolor="#FF0000" align="left" valign="center"><nobr><img src="http://static
-                                                        ^
 ./test/HTML/wired.html:125: HTML parser error : Unexpected end tag : form
 	</tr>    </form>
 	                ^
diff --git a/result/HTML/wired.html.sax b/result/HTML/wired.html.sax
index 11dcdf11..0b1c0a3f 100644
--- a/result/HTML/wired.html.sax
+++ b/result/HTML/wired.html.sax
@@ -357,7 +357,6 @@ SAX.characters(
 , 3)
 SAX.startElement(td, bgcolor='#FF0000', align='left', valign='center')
 SAX.startElement(nobr)
-SAX.error: Tag nobr invalid
 SAX.startElement(img, src='http://static.wired.com/news/images/spacer.gif', width='344', height='1')
 SAX.endElement(img)
 SAX.startElement(br)