patch from johan@evenhuis.nl for #107937 fixing some line counting

* HTMLparser.c parser.c parserInternals.c: patch from johan@evenhuis.nl for #107937 fixing some line counting problems, and some other cleanups. * result/HTML/: this results in some line number changes. Daniel
2025-07-29 11:41:22 +03:00 · 2003-03-22 00:04:05 +00:00
parent 580ced8ee2
commit 77a90a7f8e
7 changed files with 186 additions and 154 deletions
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1095,120 +1095,131 @@ xmlParserInputShrink(xmlParserInputPtr in) {
 */

 void
-xmlNextChar(xmlParserCtxtPtr ctxt) {
+xmlNextChar(xmlParserCtxtPtr ctxt)
+{
    if (ctxt->instate == XML_PARSER_EOF)
-	return;
+        return;

-    /*
-     *   2.11 End-of-Line Handling
-     *   the literal two-character sequence "#xD#xA" or a standalone
-     *   literal #xD, an XML processor must pass to the application
-     *   the single character #xA. 
-     */
    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
-	if ((*ctxt->input->cur == 0) &&
-	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) &&
-	    (ctxt->instate != XML_PARSER_COMMENT)) {
-	        /*
-		 * If we are at the end of the current entity and
-		 * the context allows it, we pop consumed entities
-		 * automatically.
-		 * the auto closing should be blocked in other cases
-		 */
-		xmlPopInput(ctxt);
-	} else {
-	    if (*(ctxt->input->cur) == '\n') {
-		ctxt->input->line++; ctxt->input->col = 1;
-	    } else ctxt->input->col++;
-	    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
-		/*
-		 * We are supposed to handle UTF8, check it's valid
-		 * From rfc2044: encoding of the Unicode values on UTF-8:
-		 *
-		 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
-		 * 0000 0000-0000 007F   0xxxxxxx
-		 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
-		 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
-		 *
-		 * Check for the 0x110000 limit too
-		 */
-		const unsigned char *cur = ctxt->input->cur;
-		unsigned char c;
+        if ((*ctxt->input->cur == 0) &&
+            (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0) &&
+            (ctxt->instate != XML_PARSER_COMMENT)) {
+            /*
+             * If we are at the end of the current entity and
+             * the context allows it, we pop consumed entities
+             * automatically.
+             * the auto closing should be blocked in other cases
+             */
+            xmlPopInput(ctxt);
+        } else {
+            const unsigned char *cur;
+            unsigned char c;

-		c = *cur;
-		if (c & 0x80) {
-		    if (cur[1] == 0)
-			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-		    if ((cur[1] & 0xc0) != 0x80)
-			goto encoding_error;
-		    if ((c & 0xe0) == 0xe0) {
-			unsigned int val;
+            /*
+             *   2.11 End-of-Line Handling
+             *   the literal two-character sequence "#xD#xA" or a standalone
+             *   literal #xD, an XML processor must pass to the application
+             *   the single character #xA.
+             */
+            if (*(ctxt->input->cur) == '\n') {
+                ctxt->input->line++;
+                ctxt->input->col = 1;
+            } else
+                ctxt->input->col++;

-			if (cur[2] == 0)
-			    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-			if ((cur[2] & 0xc0) != 0x80)
-			    goto encoding_error;
-			if ((c & 0xf0) == 0xf0) {
-			    if (cur[3] == 0)
-				xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-			    if (((c & 0xf8) != 0xf0) ||
-				((cur[3] & 0xc0) != 0x80))
-				goto encoding_error;
-			    /* 4-byte code */
-			    ctxt->input->cur += 4;
-			    val = (cur[0] & 0x7) << 18;
-			    val |= (cur[1] & 0x3f) << 12;
-			    val |= (cur[2] & 0x3f) << 6;
-			    val |= cur[3] & 0x3f;
-			} else {
-			  /* 3-byte code */
-			    ctxt->input->cur += 3;
-			    val = (cur[0] & 0xf) << 12;
-			    val |= (cur[1] & 0x3f) << 6;
-			    val |= cur[2] & 0x3f;
-			}
-			if (((val > 0xd7ff) && (val < 0xe000)) ||
-			    ((val > 0xfffd) && (val < 0x10000)) ||
-			    (val >= 0x110000)) {
-			    if ((ctxt->sax != NULL) &&
-				(ctxt->sax->error != NULL))
-				ctxt->sax->error(ctxt->userData, 
-				 "Char 0x%X out of allowed range\n", val);
-			    ctxt->errNo = XML_ERR_INVALID_ENCODING;
-			    ctxt->wellFormed = 0;
-			    if (ctxt->recovery == 0) ctxt->disableSAX = 1;
-			}    
-		    } else
-		      /* 2-byte code */
-		        ctxt->input->cur += 2;
-		} else
-		    /* 1-byte code */
-		    ctxt->input->cur++;
-	    } else {
-		/*
-		 * Assume it's a fixed length encoding (1) with
-		 * a compatible encoding for the ASCII set, since
-		 * XML constructs only use < 128 chars
-		 */
-	        ctxt->input->cur++;
-	    }
-	    ctxt->nbChars++;
-	    if (*ctxt->input->cur == 0)
-		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
-	}
+            /*
+             * We are supposed to handle UTF8, check it's valid
+             * From rfc2044: encoding of the Unicode values on UTF-8:
+             *
+             * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
+             * 0000 0000-0000 007F   0xxxxxxx
+             * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
+             * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx 
+             *
+             * Check for the 0x110000 limit too
+             */
+            cur = ctxt->input->cur;
+
+            c = *cur;
+            if (c & 0x80) {
+                if (cur[1] == 0)
+                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+                if ((cur[1] & 0xc0) != 0x80)
+                    goto encoding_error;
+                if ((c & 0xe0) == 0xe0) {
+                    unsigned int val;
+
+                    if (cur[2] == 0)
+                        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+                    if ((cur[2] & 0xc0) != 0x80)
+                        goto encoding_error;
+                    if ((c & 0xf0) == 0xf0) {
+                        if (cur[3] == 0)
+                            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+                        if (((c & 0xf8) != 0xf0) ||
+                            ((cur[3] & 0xc0) != 0x80))
+                            goto encoding_error;
+                        /* 4-byte code */
+                        ctxt->input->cur += 4;
+                        val = (cur[0] & 0x7) << 18;
+                        val |= (cur[1] & 0x3f) << 12;
+                        val |= (cur[2] & 0x3f) << 6;
+                        val |= cur[3] & 0x3f;
+                    } else {
+                        /* 3-byte code */
+                        ctxt->input->cur += 3;
+                        val = (cur[0] & 0xf) << 12;
+                        val |= (cur[1] & 0x3f) << 6;
+                        val |= cur[2] & 0x3f;
+                    }
+                    if (((val > 0xd7ff) && (val < 0xe000)) ||
+                        ((val > 0xfffd) && (val < 0x10000)) ||
+                        (val >= 0x110000)) {
+                        if ((ctxt->sax != NULL) &&
+                            (ctxt->sax->error != NULL))
+                            ctxt->sax->error(ctxt->userData,
+                                             "Char 0x%X out of allowed range\n",
+                                             val);
+                        ctxt->errNo = XML_ERR_INVALID_ENCODING;
+                        ctxt->wellFormed = 0;
+                        if (ctxt->recovery == 0)
+                            ctxt->disableSAX = 1;
+                    }
+                } else
+                    /* 2-byte code */
+                    ctxt->input->cur += 2;
+            } else
+                /* 1-byte code */
+                ctxt->input->cur++;
+
+            ctxt->nbChars++;
+            if (*ctxt->input->cur == 0)
+                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+        }
    } else {
-	ctxt->input->cur++;
-	ctxt->nbChars++;
-	if (*ctxt->input->cur == 0)
-	    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
+        /*
+         * Assume it's a fixed length encoding (1) with
+         * a compatible encoding for the ASCII set, since
+         * XML constructs only use < 128 chars
+         */
+
+        if (*(ctxt->input->cur) == '\n') {
+            ctxt->input->line++;
+            ctxt->input->col = 1;
+        } else
+            ctxt->input->col++;
+        ctxt->input->cur++;
+        ctxt->nbChars++;
+        if (*ctxt->input->cur == 0)
+            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
    }
    if ((*ctxt->input->cur == '%') && (!ctxt->html))
-	xmlParserHandlePEReference(ctxt);
+        xmlParserHandlePEReference(ctxt);
    if ((*ctxt->input->cur == 0) &&
        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0))
-	    xmlPopInput(ctxt);
+        xmlPopInput(ctxt);
    return;
-encoding_error:
+  encoding_error:
    /*
     * If we detect an UTF8 error that probably mean that the
     * input encoding didn't get properly advertised in the
@@ -1217,16 +1228,17 @@ encoding_error:
     * encoding !)
     */
    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
-	ctxt->sax->error(ctxt->userData, 
-			 "Input is not proper UTF-8, indicate encoding !\n");
-	ctxt->sax->error(ctxt->userData, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
-			ctxt->input->cur[0], ctxt->input->cur[1],
-			ctxt->input->cur[2], ctxt->input->cur[3]);
+        ctxt->sax->error(ctxt->userData,
+                         "Input is not proper UTF-8, indicate encoding !\n");
+        ctxt->sax->error(ctxt->userData,
+                         "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+                         ctxt->input->cur[0], ctxt->input->cur[1],
+                         ctxt->input->cur[2], ctxt->input->cur[3]);
    }
    ctxt->wellFormed = 0;
    ctxt->errNo = XML_ERR_INVALID_ENCODING;

-    ctxt->charset = XML_CHAR_ENCODING_8859_1; 
+    ctxt->charset = XML_CHAR_ENCODING_8859_1;
    ctxt->input->cur++;
    return;
 }