html: Deduplicate code in htmlCurrentChar

2025-08-01 10:06:59 +03:00 · 2024-09-08 19:11:14 +02:00
parent 3adb396d87
commit 341dc78f24
1 changed files with 32 additions and 53 deletions
--- a/HTMLparser.c
+++ b/HTMLparser.c
@ -372,43 +372,6 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
    if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)
        xmlParserGrow(ctxt);
    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
        xmlChar * guess;
        /*
         * Assume it's a fixed length encoding (1) with
         * a compatible encoding for the ASCII set, since
         * HTML constructs only use < 128 chars
         */
        if (*ctxt->input->cur < 0x80) {
            if (*ctxt->input->cur == 0) {
                if (ctxt->input->cur < ctxt->input->end) {
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                                    "Char 0x%X out of allowed range\n", 0);
                    *len = 1;
                    return(' ');
                } else {
                    *len = 0;
                    return(0);
                }
            }
            *len = 1;
            return(*ctxt->input->cur);
        }
        /*
         * Humm this is bad, do an automatic flow conversion
         */
        guess = htmlFindEncoding(ctxt);
        if (guess == NULL) {
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
        } else {
            xmlSwitchEncodingName(ctxt, (const char *) guess);
            xmlFree(guess);
        }
        ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
    }
    /*
     * We are supposed to handle UTF8, check it's valid
     * From rfc2044: encoding of the Unicode values on UTF-8:
@ -422,9 +385,40 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
     */
    cur = ctxt->input->cur;
    c = *cur;
-    if (c & 0x80) {
+    if (c < 0x80) {
        if (c == 0) {
            if (ctxt->input->cur < ctxt->input->end) {
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                                "Char 0x%X out of allowed range\n", 0);
                *len = 1;
                return(' ');
            } else {
                *len = 0;
                return(0);
            }
        }
        *len = 1;
        return(c);
    } else {
        size_t avail;
        if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
            xmlChar * guess;
            guess = htmlFindEncoding(ctxt);
            if (guess == NULL) {
                xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
            } else {
                xmlSwitchEncodingName(ctxt, (const char *) guess);
                xmlFree(guess);
            }
            ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
            cur = ctxt->input->cur;
            c = *cur;
        }
        if ((c & 0x40) == 0)
            goto encoding_error;
@ -469,21 +463,6 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
                            "Char 0x%X out of allowed range\n", val);
        }
        return(val);
    } else {
        if (*ctxt->input->cur == 0) {
            if (ctxt->input->cur < ctxt->input->end) {
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
                                "Char 0x%X out of allowed range\n", 0);
                *len = 1;
                return(' ');
            } else {
                *len = 0;
                return(0);
            }
        }
        /* 1-byte code */
        *len = 1;
        return(*ctxt->input->cur);
    }
 encoding_error: