From 4eeac30944461140b61f253e2daa2c6dd9e3171f Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer <wellnhofer@aevum.de>
Date: Sun, 8 Sep 2024 22:20:20 +0200
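Subject: [PATCH] html: Start to fix EOF and U+0000 handling

Stop treating a NUL byte as the end of the input. End of input is now
detected by comparing input->cur against input->end in
htmlParseCharData, htmlParseContent and htmlParseDocument, instead of
testing the current character for 0.

htmlCurrentChar now returns U+FFFD for a NUL byte that lies inside the
input, instead of raising XML_ERR_INVALID_CHAR and returning a space.

In the push parser (htmlParseTryOrFinish), remove the workaround that
skipped NUL bytes, drop the single-character special case on terminate,
and handle the MISC, PROLOG, CONTENT and EPILOG states in one shared
case.

The "Misplaced DOCTYPE declaration" error is no longer reported; the
expected test results for doc2.htm and doc3.htm are updated accordingly.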
---
 HTMLparser.c             | 359 ++++++++++-----------------------------
 result/HTML/doc2.htm.err |   3 -
 result/HTML/doc2.htm.sax |   1 -
 result/HTML/doc3.htm.err |   3 -
 result/HTML/doc3.htm.sax |   1 -
 5 files changed, 93 insertions(+), 274 deletions(-)
 delete mode 100644 result/HTML/doc2.htm.err

diff --git a/HTMLparser.c b/HTMLparser.c
index 7b407803..cdebaf67 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -388,10 +388,8 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
     if (c < 0x80) {
         if (c == 0) {
             if (ctxt->input->cur < ctxt->input->end) {
-                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
-                                "Char 0x%X out of allowed range\n", 0);
                 *len = 1;
-                return(' ');
+                return(0xFFFD);
             } else {
                 *len = 0;
                 return(0);
@@ -3166,19 +3164,16 @@ static int
 htmlParseCharData(htmlParserCtxtPtr ctxt, int terminate) {
     xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
     int nbchar = 0;
-    int stop = 0;
     int complete = 0;
     int res = 0;
     int cur, l, mode;
 
     mode = ctxt->endCheckState;
-    if ((mode == 0) || (mode == DATA_RCDATA))
-        stop = '&';
 
-    cur = CUR_CHAR(l);
-    while ((cur != stop) &&
-	   (cur != 0) &&
-           (!PARSER_STOPPED(ctxt))) {
+    while ((!PARSER_STOPPED(ctxt)) &&
+           (ctxt->input->cur < ctxt->input->end)) {
+        cur = CUR_CHAR(l);
+
         /*
          * Check for end of text data
          */
@@ -3262,6 +3257,9 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int terminate) {
                 res = 1;
                 break;
             }
+        } else if ((cur == '&') &&
+                   ((mode == 0) || (mode == DATA_RCDATA))) {
+            break;
         }
 
 	COPY_BUF(buf,nbchar,cur);
@@ -3273,7 +3271,6 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int terminate) {
 	    nbchar = 0;
             SHRINK;
 	}
-	cur = CUR_CHAR(l);
     }
     if (nbchar != 0) {
         buf[nbchar] = 0;
@@ -4130,7 +4127,8 @@ htmlParseReference(htmlParserCtxtPtr ctxt) {
 
 static void
 htmlParseContent(htmlParserCtxtPtr ctxt) {
-    while (PARSER_STOPPED(ctxt) == 0) {
+    while ((PARSER_STOPPED(ctxt) == 0) &&
+           (ctxt->input->cur < ctxt->input->end)) {
         int mode;
 
         GROW;
@@ -4147,9 +4145,6 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
                     (UPP(4) == 'C') && (UPP(5) == 'T') &&
                     (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                     (UPP(8) == 'E')) {
-                    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
-                                 "Misplaced DOCTYPE declaration\n",
-                                 BAD_CAST "DOCTYPE" , NULL);
                     htmlParseDocTypeDecl(ctxt);
                 } else if ((NXT(2) == '-') && (NXT(3) == '-')) {
                     SKIP(4);
@@ -4172,9 +4167,6 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
             }
         } else if ((CUR == '&') && ((mode == 0) || (mode == DATA_RCDATA))) {
             htmlParseReference(ctxt);
-        } else if (CUR == 0) {
-            htmlAutoCloseOnEnd(ctxt);
-            break;
         } else {
             htmlParseCharData(ctxt, /* terminate */ 1);
         }
@@ -4182,6 +4174,9 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
         SHRINK;
         GROW;
     }
+
+    if (ctxt->input->cur >= ctxt->input->end)
+        htmlAutoCloseOnEnd(ctxt);
 }
 
 /**
@@ -4405,7 +4400,7 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
      * Wipe out everything which is before the first '<'
      */
     SKIP_BLANKS;
-    if (CUR == 0) {
+    if (ctxt->input->cur >= ctxt->input->end) {
 	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
 	             "Document is empty\n", NULL, NULL);
     }
@@ -4966,7 +4961,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
     int ret = 0;
     htmlParserInputPtr in;
     ptrdiff_t avail = 0;
-    xmlChar cur, next;
+    int cur;
 
     htmlParserNodeInfo node_info;
 
@@ -4988,17 +4983,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 	}
         if (avail < 1)
 	    goto done;
-        /*
-         * This is done to make progress and avoid an infinite loop
-         * if a parsing attempt was aborted by hitting a NUL byte. After
-         * changing htmlCurrentChar, this probably isn't necessary anymore.
-         * We should consider removing this check.
-         */
 	cur = in->cur[0];
-	if (cur == 0) {
-	    SKIP(1);
-	    continue;
-	}
 
         switch (ctxt->instate) {
             case XML_PARSER_EOF:
@@ -5017,14 +5002,6 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                     xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
                 }
 
-	        /*
-		 * Very first chars read from the document flow.
-		 */
-		cur = in->cur[0];
-		if (IS_BLANK_CH(cur)) {
-		    SKIP_BLANKS;
-                    avail = in->end - in->cur;
-		}
                 if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
                     ctxt->sax->setDocumentLocator(ctxt->userData,
                             (xmlSAXLocator *) &xmlDefaultSAXLocator);
@@ -5033,161 +5010,22 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 	            (!ctxt->disableSAX))
 		    ctxt->sax->startDocument(ctxt->userData);
 
-		cur = in->cur[0];
-		next = in->cur[1];
-		if ((cur == '<') && (next == '!') &&
-		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
-		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
-		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
-		    (UPP(8) == 'E')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupString(ctxt, 9, ">", 1, 0) < 0))
-			goto done;
-		    htmlParseDocTypeDecl(ctxt);
-		    ctxt->instate = XML_PARSER_PROLOG;
-                } else {
-		    ctxt->instate = XML_PARSER_MISC;
-		}
-		break;
-            case XML_PARSER_MISC:
-		SKIP_BLANKS;
-                avail = in->end - in->cur;
-		/*
-		 * no chars in buffer
-		 */
-		if (avail < 1)
-		    goto done;
-		/*
-		 * not enough chars in buffer
-		 */
-		if (avail < 2) {
-		    if (!terminate)
-			goto done;
-		    else
-			next = ' ';
-		} else {
-		    next = in->cur[1];
-		}
-		cur = in->cur[0];
-	        if ((cur == '<') && (next == '!') &&
-		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
-			goto done;
-                    SKIP(4);
-		    htmlParseComment(ctxt, /* bogus */ 0);
-		    ctxt->instate = XML_PARSER_MISC;
-	        } else if ((cur == '<') && (next == '?')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
-			goto done;
-                    SKIP(1);
-		    htmlParseComment(ctxt, /* bogus */ 1);
-		    ctxt->instate = XML_PARSER_MISC;
-		} else if ((cur == '<') && (next == '!') &&
-		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
-		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
-		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
-		    (UPP(8) == 'E')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupString(ctxt, 9, ">", 1, 0) < 0))
-			goto done;
-		    htmlParseDocTypeDecl(ctxt);
-		    ctxt->instate = XML_PARSER_PROLOG;
-		} else if ((cur == '<') && (next == '!') &&
-		           (avail < 9)) {
-		    goto done;
-		} else {
-		    ctxt->instate = XML_PARSER_CONTENT;
-		}
-		break;
-            case XML_PARSER_PROLOG:
-		SKIP_BLANKS;
-                avail = in->end - in->cur;
-		if (avail < 2)
-		    goto done;
-		cur = in->cur[0];
-		next = in->cur[1];
-		if ((cur == '<') && (next == '!') &&
-		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
-			goto done;
-                    SKIP(4);
-		    htmlParseComment(ctxt, /* bogus */ 0);
-		    ctxt->instate = XML_PARSER_PROLOG;
-	        } else if ((cur == '<') && (next == '?')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
-			goto done;
-                    SKIP(1);
-		    htmlParseComment(ctxt, /* bogus */ 1);
-		    ctxt->instate = XML_PARSER_PROLOG;
-		} else if ((cur == '<') && (next == '!') &&
-		           (avail < 4)) {
-		    goto done;
-		} else {
-		    ctxt->instate = XML_PARSER_CONTENT;
-		}
-		break;
-            case XML_PARSER_EPILOG:
-                avail = in->end - in->cur;
-		if (avail < 1)
-		    goto done;
-		cur = in->cur[0];
-		if (IS_BLANK_CH(cur)) {
-		    htmlParseCharData(ctxt, terminate);
-		    goto done;
-		}
-		if (avail < 2)
-		    goto done;
-		next = in->cur[1];
-	        if ((cur == '<') && (next == '!') &&
-		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
-		    if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
-			goto done;
-                    SKIP(4);
-		    htmlParseComment(ctxt, /* bogus */ 0);
-		    ctxt->instate = XML_PARSER_EPILOG;
-	        } else if ((cur == '<') && (next == '?')) {
-		    if ((!terminate) &&
-		        (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
-			goto done;
-                    SKIP(1);
-		    htmlParseComment(ctxt, /* bogus */ 1);
-		    ctxt->instate = XML_PARSER_EPILOG;
-		} else if ((cur == '<') && (next == '!') &&
-		           (avail < 4)) {
-		    goto done;
-		} else {
-		    ctxt->errNo = XML_ERR_DOCUMENT_END;
-		    ctxt->wellFormed = 0;
-		    ctxt->instate = XML_PARSER_EOF;
-		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
-			ctxt->sax->endDocument(ctxt->userData);
-		    goto done;
-		}
+                /* Allow callback to modify state */
+                if (ctxt->instate == XML_PARSER_START)
+                    ctxt->instate = XML_PARSER_MISC;
 		break;
             case XML_PARSER_START_TAG: {
 	        const xmlChar *name;
-		int failed;
+		int failed, next;
 		const htmlElemDesc * info;
 
-		/*
-		 * no chars in buffer
-		 */
-		if (avail < 1)
-		    goto done;
 		/*
 		 * not enough chars in buffer
 		 */
-		if (avail < 2) {
-		    if (!terminate)
-			goto done;
-		    else
-			next = ' ';
-		} else {
-		    next = in->cur[1];
-		}
+		if (avail < 2)
+		    goto done;
 		cur = in->cur[0];
+		next = in->cur[1];
 	        if (cur != '<') {
 		    ctxt->instate = XML_PARSER_CONTENT;
 		    break;
@@ -5287,44 +5125,21 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		ctxt->instate = XML_PARSER_CONTENT;
                 break;
 	    }
-            case XML_PARSER_CONTENT: {
-		xmlChar chr[2] = { 0, 0 };
+            case XML_PARSER_MISC:
+            case XML_PARSER_PROLOG:
+            case XML_PARSER_CONTENT:
+            case XML_PARSER_EPILOG: {
                 int mode;
 
-                /*
-		 * Handle preparsed entities and charRef
-		 */
-		if ((avail == 1) && (terminate)) {
-		    cur = in->cur[0];
-		    if ((cur != '<') && (cur != '&')) {
-			if (ctxt->sax != NULL) {
-                            chr[0] = cur;
-			    if (IS_BLANK_CH(cur)) {
-				if (ctxt->keepBlanks) {
-				    if (ctxt->sax->characters != NULL)
-					ctxt->sax->characters(
-						ctxt->userData, chr, 1);
-				} else {
-				    if (ctxt->sax->ignorableWhitespace != NULL)
-					ctxt->sax->ignorableWhitespace(
-						ctxt->userData, chr, 1);
-				}
-			    } else {
-				htmlCheckParagraph(ctxt);
-				if (ctxt->sax->characters != NULL)
-				    ctxt->sax->characters(
-					    ctxt->userData, chr, 1);
-			    }
-			}
-			ctxt->checkIndex = 0;
-			in->cur++;
-			break;
-		    }
-		}
-		if (avail < 2)
+                if ((ctxt->instate == XML_PARSER_MISC) ||
+                    (ctxt->instate == XML_PARSER_PROLOG)) {
+                    SKIP_BLANKS;
+                    avail = in->end - in->cur;
+                }
+
+		if (avail < 1)
 		    goto done;
 		cur = in->cur[0];
-		next = in->cur[1];
                 mode = ctxt->endCheckState;
 
                 if (mode != 0) {
@@ -5353,62 +5168,74 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
                     }
 
                     break;
-		} else if ((cur == '<') && (next == '!')) {
-                    if (avail < 4)
-                        goto done;
-                    /*
-                     * Sometimes DOCTYPE arrives in the middle of the document
-                     */
-                    if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
-                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
-                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
-                        (UPP(8) == 'E')) {
-                        if ((!terminate) &&
-                            (htmlParseLookupString(ctxt, 9, ">", 1, 0) < 0))
+		} else if (cur == '<') {
+                    int next;
+
+                    if (avail < 2) {
+                        if (!terminate)
                             goto done;
-                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
-                                     "Misplaced DOCTYPE declaration\n",
-                                     BAD_CAST "DOCTYPE" , NULL);
-                        htmlParseDocTypeDecl(ctxt);
-                    } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
-                        if ((!terminate) &&
-                            (htmlParseLookupCommentEnd(ctxt) < 0))
-                            goto done;
-                        SKIP(4);
-                        htmlParseComment(ctxt, /* bogus */ 0);
-                        ctxt->instate = XML_PARSER_CONTENT;
+                        next = ' ';
                     } else {
+                        next = in->cur[1];
+                    }
+
+                    if (next == '!') {
+                        if ((!terminate) && (avail < 4))
+                            goto done;
+                        if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
+                            if ((!terminate) &&
+                                (htmlParseLookupCommentEnd(ctxt) < 0))
+                                goto done;
+                            SKIP(4);
+                            htmlParseComment(ctxt, /* bogus */ 0);
+                            break;
+                        }
+
+                        if ((!terminate) && (avail < 9))
+                            goto done;
+                        if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
+                            (UPP(4) == 'C') && (UPP(5) == 'T') &&
+                            (UPP(6) == 'Y') && (UPP(7) == 'P') &&
+                            (UPP(8) == 'E')) {
+                            if ((!terminate) &&
+                                (htmlParseLookupString(ctxt, 9, ">", 1,
+                                                       0) < 0))
+                                goto done;
+                            htmlParseDocTypeDecl(ctxt);
+                            if (ctxt->instate == XML_PARSER_MISC)
+                                ctxt->instate = XML_PARSER_PROLOG;
+                        } else {
+                            if ((!terminate) &&
+                                (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
+                                goto done;
+                            SKIP(2);
+                            htmlParseComment(ctxt, /* bogus */ 1);
+                        }
+                    } else if (next == '?') {
                         if ((!terminate) &&
                             (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
                             goto done;
-                        SKIP(2);
+                        SKIP(1);
                         htmlParseComment(ctxt, /* bogus */ 1);
+                    } else if (next == '/') {
+                        ctxt->instate = XML_PARSER_END_TAG;
+                        ctxt->checkIndex = 0;
+                        break;
+                    } else if (IS_ASCII_LETTER(next)) {
+                        if ((!terminate) && (next == 0))
+                            goto done;
+                        ctxt->instate = XML_PARSER_START_TAG;
+                        ctxt->checkIndex = 0;
+                        break;
+                    } else {
                         ctxt->instate = XML_PARSER_CONTENT;
+                        htmlCheckParagraph(ctxt);
+                        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
+                            (ctxt->sax->characters != NULL))
+                            ctxt->sax->characters(ctxt->userData,
+                                                  BAD_CAST "<", 1);
+                        SKIP(1);
                     }
-                } else if ((cur == '<') && (next == '?')) {
-                    if ((!terminate) &&
-                        (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
-                        goto done;
-                    SKIP(1);
-                    htmlParseComment(ctxt, /* bogus */ 1);
-                    ctxt->instate = XML_PARSER_CONTENT;
-                } else if ((cur == '<') && (next == '/')) {
-                    ctxt->instate = XML_PARSER_END_TAG;
-                    ctxt->checkIndex = 0;
-                    break;
-                } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
-                    if ((!terminate) && (next == 0))
-                        goto done;
-                    ctxt->instate = XML_PARSER_START_TAG;
-                    ctxt->checkIndex = 0;
-                    break;
-                } else if (cur == '<') {
-                    htmlCheckParagraph(ctxt);
-                    if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
-                        (ctxt->sax->characters != NULL))
-                        ctxt->sax->characters(ctxt->userData,
-                                              BAD_CAST "<", 1);
-                    SKIP(1);
                 } else {
                     /*
                      * check that the text sequence is complete
diff --git a/result/HTML/doc2.htm.err b/result/HTML/doc2.htm.err
deleted file mode 100644
index 5ac09cd5..00000000
--- a/result/HTML/doc2.htm.err
+++ /dev/null
@@ -1,3 +0,0 @@
-./test/HTML/doc2.htm:10: HTML parser error : Misplaced DOCTYPE declaration
-<!-- END Naviscope Javascript --><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Tra
-                                 ^
diff --git a/result/HTML/doc2.htm.sax b/result/HTML/doc2.htm.sax
index b28d440e..db42c5bd 100644
--- a/result/HTML/doc2.htm.sax
+++ b/result/HTML/doc2.htm.sax
@@ -17,7 +17,6 @@ SAX.endElement(script)
 SAX.characters(
 , 1)
 SAX.comment( END Naviscope Javascript )
-SAX.error: Misplaced DOCTYPE declaration
 SAX.internalSubset(HTML, -//W3C//DTD HTML 4.0 Transitional//EN, )
 SAX.comment( saved from url=(0027)http://www.agents-tech.com/ )
 SAX.characters(
diff --git a/result/HTML/doc3.htm.err b/result/HTML/doc3.htm.err
index 2e79a56a..58404bc8 100644
--- a/result/HTML/doc3.htm.err
+++ b/result/HTML/doc3.htm.err
@@ -1,6 +1,3 @@
-./test/HTML/doc3.htm:3: HTML parser error : Misplaced DOCTYPE declaration
-<!-- END Naviscope Javascript --><!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2//EN
-                                 ^
 ./test/HTML/doc3.htm:81: HTML parser error : Unexpected end tag : p
   </P></TD></TR></TBODY></TABLE></CENTER></TD></TR></TBODY></TABLE></CENTER></P>
                                                                                ^
diff --git a/result/HTML/doc3.htm.sax b/result/HTML/doc3.htm.sax
index 435ab051..c7221e85 100644
--- a/result/HTML/doc3.htm.sax
+++ b/result/HTML/doc3.htm.sax
@@ -17,7 +17,6 @@ SAX.endElement(script)
 SAX.characters(
 , 1)
 SAX.comment( END Naviscope Javascript )
-SAX.error: Misplaced DOCTYPE declaration
 SAX.internalSubset(HTML, -//W3C//DTD HTML 3.2//EN, )
 SAX.comment(last modified on Tuesday, February 22, 2000 11:47 PM )
 SAX.characters(