Rework control flow in htmlCurrentChar

Don't call xmlCurrentChar after switching encodings. Rearrange code blocks and fall through to normal UTF-8 handling.
2025-10-26 00:37:43 +03:00 · 2020-07-15 14:22:08 +02:00
parent 922bebccdd
commit dfd4e33048
1 changed files with 98 additions and 99 deletions
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -414,6 +414,10 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) {

 static int
 htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
+    const unsigned char *cur;
+    unsigned char c;
+    unsigned int val;
+
    if (ctxt->instate == XML_PARSER_EOF)
 	return(0);

@@ -421,7 +425,41 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 	*len = 0;
 	return(ctxt->token);
    }
-    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
+    if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
+        xmlChar * guess;
+        xmlCharEncodingHandlerPtr handler;
+
+        /*
+         * Assume it's a fixed-length encoding (1 byte per char) that
+         * is ASCII-compatible for the low range, since HTML
+         * constructs only use chars < 128
+         */
+        if ((int) *ctxt->input->cur < 0x80) {
+            *len = 1;
+            return((int) *ctxt->input->cur);
+        }
+
+        /*
+         * Hmm, this is bad: do an automatic conversion of the input stream
+         */
+        guess = htmlFindEncoding(ctxt);
+        if (guess == NULL) {
+            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
+        } else {
+            if (ctxt->input->encoding != NULL)
+                xmlFree((xmlChar *) ctxt->input->encoding);
+            ctxt->input->encoding = guess;
+            handler = xmlFindCharEncodingHandler((const char *) guess);
+            if (handler != NULL) {
+                xmlSwitchToEncoding(ctxt, handler);
+            } else {
+                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
+                             "Unsupported encoding %s", guess, NULL);
+            }
+        }
+        ctxt->charset = XML_CHAR_ENCODING_UTF8;
+    }
+
    /*
     * We are supposed to handle UTF8, check it's valid
     * From rfc2044: encoding of the Unicode values on UTF-8:
@@ -433,10 +471,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
     *
     * Check for the 0x110000 limit too
     */
-	const unsigned char *cur = ctxt->input->cur;
-	unsigned char c;
-	unsigned int val;
-
+    cur = ctxt->input->cur;
    c = *cur;
    if (c & 0x80) {
        if ((c & 0x40) == 0)
@@ -505,42 +540,6 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
        *len = 1;
        return((int) *ctxt->input->cur);
    }
-    }
-    /*
-     * Assume it's a fixed length encoding (1) with
-     * a compatible encoding for the ASCII set, since
-     * XML constructs only use < 128 chars
-     */
-    *len = 1;
-    if ((int) *ctxt->input->cur < 0x80)
-	return((int) *ctxt->input->cur);
-
-    /*
-     * Humm this is bad, do an automatic flow conversion
-     */
-    {
-        xmlChar * guess;
-        xmlCharEncodingHandlerPtr handler;
-
-        guess = htmlFindEncoding(ctxt);
-        if (guess == NULL) {
-            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
-        } else {
-            if (ctxt->input->encoding != NULL)
-                xmlFree((xmlChar *) ctxt->input->encoding);
-            ctxt->input->encoding = guess;
-            handler = xmlFindCharEncodingHandler((const char *) guess);
-            if (handler != NULL) {
-                xmlSwitchToEncoding(ctxt, handler);
-            } else {
-                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
-                             "Unsupported encoding %s", guess, NULL);
-            }
-        }
-        ctxt->charset = XML_CHAR_ENCODING_UTF8;
-    }
-
-    return(xmlCurrentChar(ctxt, len));

 encoding_error:
    /*