parser: Rework EBCDIC code page detection

To detect EBCDIC code pages, we used to switch the encoding twice and had to be very careful not to decode any data following the XML declaration before the second switch. This relied on a hard-coded expected size of the XML declaration, which was complicated and unreliable. Now we decode the first 200 bytes as EBCDIC-US and parse the encoding declaration manually.
2025-10-24 13:33:01 +03:00 · 2023-03-21 19:07:12 +01:00
parent 3eb9f5ca4e
commit 98840d40da
5 changed files with 93 additions and 314 deletions
--- a/encoding.c
+++ b/encoding.c
@@ -2037,7 +2037,7 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen,
 *     as the return value is 0, else unpredictable.
 * The value of @outlen after return is the number of octets produced.
 */
-static int
+int
 xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
                 int *outlen, const unsigned char *in, int *inlen, int flush) {
    int ret;
@@ -2123,189 +2123,12 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
 * @out:  an xmlBuffer for the output.
 * @in:  an xmlBuffer for the input
 *
- * Front-end for the encoding handler input function, but handle only
- * the very first line, i.e. limit itself to 45 chars.
- *
- * Returns the number of byte written if success, or
- *     -1 general error
- *     -2 if the transcoding fails (for *in is not valid utf8 string or
- *        the result of transformation can't fit into the encoding we want), or
+ * DEPRECATED: Don't use.
 */
 int
 xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                    xmlBufferPtr in) {
-    int ret;
-    int written;
-    int toconv;
-
-    if (handler == NULL) return(-1);
-    if (out == NULL) return(-1);
-    if (in == NULL) return(-1);
-
-    /* calculate space available */
-    written = out->size - out->use - 1; /* count '\0' */
-    toconv = in->use;
-    /*
-     * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
-     * 45 chars should be sufficient to reach the end of the encoding
-     * declaration without going too far inside the document content.
-     * on UTF-16 this means 90bytes, on UCS4 this means 180
-     * The actual value depending on guessed encoding is passed as @len
-     * if provided
-     */
-    if (toconv > 180)
-        toconv = 180;
-    if (toconv * 2 >= written) {
-        xmlBufferGrow(out, toconv * 2);
-	written = out->size - out->use - 1;
-    }
-
-    ret = xmlEncInputChunk(handler, &out->content[out->use], &written,
-                           in->content, &toconv, 0);
-    xmlBufferShrink(in, toconv);
-    out->use += written;
-    out->content[out->use] = 0;
-    if (ret == -1) ret = -3;
-
-#ifdef DEBUG_ENCODING
-    switch (ret) {
-        case 0:
-	    xmlGenericError(xmlGenericErrorContext,
-		    "converted %d bytes to %d bytes of input\n",
-	            toconv, written);
-	    break;
-        case -1:
-	    xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
-	            toconv, written, in->use);
-	    break;
-        case -2:
-	    xmlGenericError(xmlGenericErrorContext,
-		    "input conversion failed due to input error\n");
-	    break;
-        case -3:
-	    xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n",
-	            toconv, written, in->use);
-	    break;
-	default:
-	    xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret);
-    }
-#endif /* DEBUG_ENCODING */
-    /*
-     * Ignore when input buffer is not on a boundary
-     */
-    if (ret == -3) ret = 0;
-    if (ret == -1) ret = 0;
-    return(written ? written : ret);
-}
-
-/**
- * xmlCharEncFirstLineInput:
- * @input: a parser input buffer
- * @len:  number of bytes to convert for the first line, or -1
- *
- * Front-end for the encoding handler input function, but handle only
- * the very first line. Point is that this is based on autodetection
- * of the encoding and once that first line is converted we may find
- * out that a different decoder is needed to process the input.
- *
- * Returns the number of byte written if success, or
- *     -1 general error
- *     -2 if the transcoding fails (for *in is not valid utf8 string or
- *        the result of transformation can't fit into the encoding we want), or
- */
-int
-xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len)
-{
-    int ret;
-    size_t written;
-    size_t toconv;
-    int c_in;
-    int c_out;
-    xmlBufPtr in;
-    xmlBufPtr out;
-
-    if ((input == NULL) || (input->encoder == NULL) ||
-        (input->buffer == NULL) || (input->raw == NULL))
-        return (-1);
-    out = input->buffer;
-    in = input->raw;
-
-    toconv = xmlBufUse(in);
-    if (toconv == 0)
-        return (0);
-    written = xmlBufAvail(out);
-    /*
-     * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
-     * 45 chars should be sufficient to reach the end of the encoding
-     * declaration without going too far inside the document content.
-     * on UTF-16 this means 90bytes, on UCS4 this means 180
-     * The actual value depending on guessed encoding is passed as @len
-     * if provided
-     */
-    if (len >= 0) {
-        if (toconv > (unsigned int) len)
-            toconv = len;
-    } else {
-        if (toconv > 180)
-            toconv = 180;
-    }
-    if (toconv * 2 >= written) {
-        xmlBufGrow(out, toconv * 2);
-        written = xmlBufAvail(out);
-    }
-    if (written > 360)
-        written = 360;
-
-    c_in = toconv;
-    c_out = written;
-    ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out,
-                           xmlBufContent(in), &c_in, 0);
-    xmlBufShrink(in, c_in);
-    xmlBufAddLen(out, c_out);
-    if (ret == -1)
-        ret = -3;
-
-    switch (ret) {
-        case 0:
-#ifdef DEBUG_ENCODING
-            xmlGenericError(xmlGenericErrorContext,
-                            "converted %d bytes to %d bytes of input\n",
-                            c_in, c_out);
-#endif
-            break;
-        case -1:
-#ifdef DEBUG_ENCODING
-            xmlGenericError(xmlGenericErrorContext,
-                         "converted %d bytes to %d bytes of input, %d left\n",
-                            c_in, c_out, (int)xmlBufUse(in));
-#endif
-            break;
-        case -3:
-#ifdef DEBUG_ENCODING
-            xmlGenericError(xmlGenericErrorContext,
-                        "converted %d bytes to %d bytes of input, %d left\n",
-                            c_in, c_out, (int)xmlBufUse(in));
-#endif
-            break;
-        case -2: {
-            char buf[50];
-            const xmlChar *content = xmlBufContent(in);
-
-	    snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X",
-		     content[0], content[1],
-		     content[2], content[3]);
-	    buf[49] = 0;
-	    xmlEncodingErr(XML_I18N_CONV_FAILED,
-		    "input conversion failed due to input error, bytes %s\n",
-		           buf);
-        }
-    }
-    /*
-     * Ignore when input buffer is not on a boundary
-     */
-    if (ret == -3) ret = 0;
-    if (ret == -1) ret = 0;
-    return(c_out ? c_out : ret);
+    return(xmlCharEncInFunc(handler, out, in));
 }

 /**
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -203,6 +203,7 @@ XMLPUBFUN int
 	xmlCharEncInFunc		(xmlCharEncodingHandler *handler,
 					 xmlBufferPtr out,
 					 xmlBufferPtr in);
+XML_DEPRECATED
 XMLPUBFUN int
 	xmlCharEncFirstLine		(xmlCharEncodingHandler *handler,
 					 xmlBufferPtr out,
--- a/include/private/enc.h
+++ b/include/private/enc.h
@@ -8,7 +8,8 @@ XML_HIDDEN void
 xmlInitEncodingInternal(void);

 XML_HIDDEN int
-xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len);
+xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out,
+                 int *outlen, const unsigned char *in, int *inlen, int flush);
 XML_HIDDEN int
 xmlCharEncInput(xmlParserInputBufferPtr input, int flush);
 XML_HIDDEN int
--- a/parser.c
+++ b/parser.c
@@ -10367,6 +10367,7 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
        else if ((encoding != NULL) &&
 	    ((!xmlStrcasecmp(encoding, BAD_CAST "UTF-8")) ||
 	     (!xmlStrcasecmp(encoding, BAD_CAST "UTF8")))) {
+            /* TODO: Check for encoding mismatch. */
 	    if (ctxt->encoding != NULL)
 		xmlFree((xmlChar *) ctxt->encoding);
 	    ctxt->encoding = encoding;
@@ -10692,15 +10693,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
 	return(-1);
    }

-    /*
-     * Check for the XMLDecl in the Prolog.
-     * do not GROW here to avoid the detected encoder to decode more
-     * than just the first line, unless the amount of data is really
-     * too small to hold "<?xml version="1.0" encoding="foo"
-     */
-    if ((ctxt->input->end - ctxt->input->cur) < 35) {
-       GROW;
-    }
+    GROW;
    if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) && (IS_BLANK_CH(NXT(5)))) {

 	/*
@@ -11347,12 +11340,9 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
 	    /*
 	     * If we are operating on converted input, try to flush
 	     * remaining chars to avoid them stalling in the non-converted
-	     * buffer. But do not do this in document start where
-	     * encoding="..." may not have been read and we work on a
-	     * guessed encoding.
+	     * buffer.
 	     */
-	    if ((ctxt->instate != XML_PARSER_START) &&
-	        (ctxt->input->buf->raw != NULL) &&
+	    if ((ctxt->input->buf->raw != NULL) &&
 		(xmlBufIsEmpty(ctxt->input->buf->raw) == 0)) {
                size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer,
                                                 ctxt->input);
@@ -11395,6 +11385,13 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
 		    start[2] = NXT(2);
 		    start[3] = NXT(3);
 		    enc = xmlDetectCharEncoding(start, 4);
+                    /*
+                     * We need more bytes to detect EBCDIC code pages.
+                     * See xmlDetectEBCDIC.
+                     */
+                    if ((enc == XML_CHAR_ENCODING_EBCDIC) &&
+                        (!terminate) && (avail < 200))
+                        goto done;
 		    xmlSwitchEncoding(ctxt, enc);
 		    break;
 		}
@@ -12186,15 +12183,8 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data,
    xmlParserCtxtPtr ctxt;
    xmlParserInputPtr inputStream;
    xmlParserInputBufferPtr buf;
-    xmlCharEncoding enc = XML_CHAR_ENCODING_NONE;

-    /*
-     * plug some encoding conversion routines
-     */
-    if ((chunk != NULL) && (size >= 4))
-	enc = xmlDetectCharEncoding((const xmlChar *) chunk, size);
-
-    buf = xmlAllocParserInputBuffer(enc);
+    buf = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE);
    if (buf == NULL) return(NULL);

    ctxt = xmlNewSAXParserCtxt(sax, user_data);
@@ -12253,10 +12243,6 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data,
 #endif
    }

-    if (enc != XML_CHAR_ENCODING_NONE) {
-        xmlSwitchEncoding(ctxt, enc);
-    }
-
    return(ctxt);
 }
 #endif /* LIBXML_PUSH_ENABLED */
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -990,9 +990,62 @@ xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
 *									*
 ************************************************************************/

-static int
-xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
-                          xmlCharEncodingHandlerPtr handler, int len);
+static xmlCharEncodingHandlerPtr
+xmlDetectEBCDIC(xmlParserInputPtr input) {
+    xmlChar out[200];
+    xmlCharEncodingHandlerPtr handler;
+    int inlen, outlen, res, i;
+
+    /*
+     * Decode the start of the input as EBCDIC-US and scan the result for
+     * the encoding declaration; without one, keep the EBCDIC handler.
+     */
+    handler = xmlGetCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC);
+    if (handler == NULL)
+        return(NULL);
+    outlen = sizeof(out) - 1; /* keep one byte free for the terminator */
+    inlen = input->end - input->cur;
+    res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen, 0);
+    if (res < 0)
+        return(handler);
+    out[outlen] = 0; /* sentinel: every scan below stops inside out[] */
+    for (i = 0; i < outlen; i++) {
+        if (out[i] == '>') /* end of the declaration, nothing found */
+            break;
+        if ((out[i] == 'e') &&
+            (xmlStrncmp(out + i, BAD_CAST "encoding", 8) == 0)) {
+            int start, cur, quote;
+
+            i += 8; /* at most the sentinel, since all 8 bytes matched */
+            while (IS_BLANK_CH(out[i]))
+                i += 1;
+            if (out[i++] != '=')
+                break;
+            while (IS_BLANK_CH(out[i]))
+                i += 1;
+            quote = out[i++];
+            if ((quote != '\'') && (quote != '"'))
+                break;
+            start = i;
+            cur = out[i];
+            while (((cur >= 'a') && (cur <= 'z')) ||
+                   ((cur >= 'A') && (cur <= 'Z')) ||
+                   ((cur >= '0') && (cur <= '9')) ||
+                   (cur == '.') || (cur == '_') ||
+                   (cur == '-'))
+                cur = out[++i];
+            if (cur != quote) /* unterminated or invalid encoding name */
+                break;
+            out[i] = 0; /* terminate the name in place, over the quote */
+            xmlCharEncCloseFunc(handler);
+            handler = xmlFindCharEncodingHandler((char *) out + start);
+            break;
+        }
+    }
+
+    return(handler);
+}
+
 /**
 * xmlSwitchEncoding:
 * @ctxt:  the parser context
@@ -1007,7 +1060,6 @@ int
 xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
 {
    xmlCharEncodingHandlerPtr handler;
-    int len = -1;
    int ret;

    if (ctxt == NULL) return(-1);
@@ -1036,51 +1088,13 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
 		ctxt->input->cur += 3;
 	    }
 	    return(0);
-    case XML_CHAR_ENCODING_UTF16LE:
-    case XML_CHAR_ENCODING_UTF16BE:
-        /*The raw input characters are encoded
-         *in UTF-16. As we expect this function
-         *to be called after xmlCharEncInFunc, we expect
-         *ctxt->input->cur to contain UTF-8 encoded characters.
-         *So the raw UTF16 Byte Order Mark
-         *has also been converted into
-         *an UTF-8 BOM. Let's skip that BOM.
-         */
-        if ((ctxt->input != NULL) && (ctxt->input->cur != NULL) &&
-            (ctxt->input->cur[0] == 0xEF) &&
-            (ctxt->input->cur[1] == 0xBB) &&
-            (ctxt->input->cur[2] == 0xBF)) {
-            ctxt->input->cur += 3;
-        }
-        len = 90;
-	break;
-    case XML_CHAR_ENCODING_UCS2:
-        len = 90;
-	break;
-    case XML_CHAR_ENCODING_UCS4BE:
-    case XML_CHAR_ENCODING_UCS4LE:
-    case XML_CHAR_ENCODING_UCS4_2143:
-    case XML_CHAR_ENCODING_UCS4_3412:
-        len = 180;
-	break;
-    case XML_CHAR_ENCODING_EBCDIC:
-    case XML_CHAR_ENCODING_8859_1:
-    case XML_CHAR_ENCODING_8859_2:
-    case XML_CHAR_ENCODING_8859_3:
-    case XML_CHAR_ENCODING_8859_4:
-    case XML_CHAR_ENCODING_8859_5:
-    case XML_CHAR_ENCODING_8859_6:
-    case XML_CHAR_ENCODING_8859_7:
-    case XML_CHAR_ENCODING_8859_8:
-    case XML_CHAR_ENCODING_8859_9:
-    case XML_CHAR_ENCODING_ASCII:
-    case XML_CHAR_ENCODING_2022_JP:
-    case XML_CHAR_ENCODING_SHIFT_JIS:
-    case XML_CHAR_ENCODING_EUC_JP:
-        len = 45;
-	break;
+        case XML_CHAR_ENCODING_EBCDIC:
+            handler = xmlDetectEBCDIC(ctxt->input);
+            break;
+        default:
+            handler = xmlGetCharEncodingHandler(enc);
+            break;
    }
-    handler = xmlGetCharEncodingHandler(enc);
    if (handler == NULL) {
 	/*
 	 * Default handlers.
@@ -1112,7 +1126,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
                return(-1);
        }
    }
-    ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len);
+    ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler);
    if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) {
        /*
 	 * on encoding conversion errors, stop the parser
@@ -1124,20 +1138,19 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
 }

 /**
- * xmlSwitchInputEncodingInt:
+ * xmlSwitchInputEncoding:
 * @ctxt:  the parser context
 * @input:  the input stream
 * @handler:  the encoding handler
- * @len:  the number of bytes to convert for the first line or -1
 *
 * change the input functions when discovering the character encoding
 * of a given entity.
 *
 * Returns 0 in case of success, -1 otherwise
 */
-static int
-xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
-                          xmlCharEncodingHandlerPtr handler, int len)
+int
+xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
+                       xmlCharEncodingHandlerPtr handler)
 {
    int nbchars;
    xmlParserInputBufferPtr in;
@@ -1159,30 +1172,17 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
 	return (-1);
    }

-    ctxt->charset = XML_CHAR_ENCODING_UTF8;
-
    if (in->encoder != NULL) {
        /*
-         * Check in case the auto encoding detection triggered
-         * in already.
+         * TODO: Detect encoding mismatch. We should start by comparing
+         * in->encoder->name and handler->name, but there are a few
+         * compatible encodings like UTF-16 and UCS-2 or UTF-32 and UCS-4.
         */
-        if (in->encoder == handler)
-            return (0);
-
-        /*
-         * Note: this is a bit dangerous, but that's what it
-         * takes to use nearly compatible signature for different
-         * encodings.
-         *
-         * FIXME: Encoders might buffer partial byte sequences, so
-         * this probably can't work. We should return an error and
-         * make sure that callers never try to switch the encoding
-         * twice.
-         */
-        xmlCharEncCloseFunc(in->encoder);
-        in->encoder = handler;
+        xmlCharEncCloseFunc(handler);
        return (0);
    }
+
+    ctxt->charset = XML_CHAR_ENCODING_UTF8;
    in->encoder = handler;

    /*
@@ -1230,20 +1230,7 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
        in->rawconsumed = processed;
        use = xmlBufUse(in->raw);

-        if (ctxt->html) {
-            /*
-             * convert as much as possible of the buffer
-             */
-            nbchars = xmlCharEncInput(in, 1);
-        } else {
-            /*
-             * convert just enough to get
-             * '<?xml version="1.0" encoding="xxx"?>'
-             * parsed with the autodetected encoding
-             * into the parser reading buffer.
-             */
-            nbchars = xmlCharEncFirstLineInput(in, len);
-        }
+        nbchars = xmlCharEncInput(in, 0);
        xmlBufResetInput(in->buffer, input);
        if (nbchars < 0) {
            xmlErrInternal(ctxt,
@@ -1261,25 +1248,6 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
    return (0);
 }

-/**
- * xmlSwitchInputEncoding:
- * @ctxt:  the parser context
- * @input:  the input stream
- * @handler:  the encoding handler
- *
- * DEPRECATED: Use xmlSwitchToEncoding
- *
- * change the input functions when discovering the character encoding
- * of a given entity.
- *
- * Returns 0 in case of success, -1 otherwise
- */
-int
-xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
-                          xmlCharEncodingHandlerPtr handler) {
-    return(xmlSwitchInputEncodingInt(ctxt, input, handler, -1));
-}
-
 /**
 * xmlSwitchToEncoding:
 * @ctxt:  the parser context
@@ -1295,7 +1263,7 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
 {
    if (ctxt == NULL)
        return(-1);
-    return(xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, -1));
+    return(xmlSwitchInputEncoding(ctxt, ctxt->input, handler));
 }

 /************************************************************************