- Large resync between W3C and Gnome tree

- configure.in: 2.1.0 prerelease - example/Makefile.am example/gjobread.c tree.h: work on libxml1 libxml2 convergence. - nanoftp, nanohttp.c: fixed stalled connections probs - HTMLtree.c SAX.c : support for attribute without values in HTML for andersca - valid.c: Fixed most validation + namespace problems - HTMLparser.c: start document callback for andersca - debugXML.c xpath.c: lots of XPath fixups from Picdar Technology - parser.h, SAX.c: serious speed improvement for large CDATA blocks - encoding.[ch] xmlIO.[ch]: Improved seriously saving to different encoding - config.h.in parser.c xmllint.c: added xmlCheckVersion() and the LIBXML_TEST_VERSION macro Daniel
2025-07-29 11:41:22 +03:00 · 2000-06-28 23:40:59 +00:00
parent c310d56482
commit be803967db
41 changed files with 2877 additions and 1562 deletions
--- a/encoding.c
+++ b/encoding.c
@ -3,6 +3,7 @@
 *
 * Related specs: 
 * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
+ * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
 * [ISO-10646]    UTF-8 and UTF-16 in Annexes
 * [ISO-8859-1]   ISO Latin-1 characters codes.
 * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
@ -65,6 +66,73 @@ static int xmlLittleEndian = 1;
 * I hope we won't use values > 0xFFFF anytime soon !
 */

+/**
+ * xmlGetUTF8Char:
+ * @utf:  a sequence of UTF-8 encoded bytes
+ * @len:  in: the number of bytes available at @utf;
+ *        out: the number of bytes used by the decoded character
+ *
+ * Read one UTF-8 encoded character from @utf.
+ *
+ * The lead byte selects the sequence length (1 to 4 bytes). Every
+ * continuation byte that is read must match the 10xxxxxx pattern and
+ * @len must cover the whole sequence, otherwise the call fails.
+ *
+ * NOTE(review): only (c & 0xe0) == 0xe0 is tested before falling back
+ * to the 2-byte branch, so a lead byte in the 0x80-0xBF range is decoded
+ * as a 2-byte sequence instead of being rejected. No check for overlong
+ * forms is done here either.
+ * NOTE(review): the error path stores 0 through @len, including when
+ * @len itself is NULL.
+ *
+ * Returns the char value and sets @len to the number of bytes used, or
+ *        returns -1 and sets @len to 0 in case of error
+ */
+int
+xmlGetUTF8Char(const unsigned char *utf, int *len) {
+    unsigned int c;
+
+    if (utf == NULL)
+	goto error;
+    if (len == NULL)
+	goto error; /* NOTE(review): the error label still writes *len */
+    if (*len < 1)
+	goto error;
+
+    c = utf[0];
+    if (c & 0x80) { /* high bit set: multi-byte sequence */
+	if (*len < 2)
+	    goto error;
+	if ((utf[1] & 0xc0) != 0x80)
+	    goto error;
+	if ((c & 0xe0) == 0xe0) { /* lead byte 111xxxxx: 3 bytes or more */
+	    if (*len < 3)
+		goto error;
+	    if ((utf[2] & 0xc0) != 0x80)
+		goto error;
+	    if ((c & 0xf0) == 0xf0) { /* lead byte 1111xxxx: 4-byte candidate */
+		if (*len < 4)
+		    goto error;
+		/* lead byte must be exactly 11110xxx, 4th byte a continuation */
+		if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
+		    goto error;
+		*len = 4;
+		/* 4-byte code */
+		c = (utf[0] & 0x7) << 18;
+		c |= (utf[1] & 0x3f) << 12;
+		c |= (utf[2] & 0x3f) << 6;
+		c |= utf[3] & 0x3f;
+	    } else {
+	      /* 3-byte code */
+		*len = 3;
+		c = (utf[0] & 0xf) << 12;
+		c |= (utf[1] & 0x3f) << 6;
+		c |= utf[2] & 0x3f;
+	    }
+	} else {
+	  /* 2-byte code */
+	    *len = 2;
+	    c = (utf[0] & 0x1f) << 6;
+	    c |= utf[1] & 0x3f;
+	}
+    } else {
+	/* 1-byte code */
+	*len = 1;
+    }
+    return(c);
+
+error:
+    *len = 0; /* NOTE(review): reached with len == NULL from the check above */
+    return(-1);
+}
+
 /**
 * xmlCheckUTF8: Check utf-8 string for legality.
 * @utf: Pointer to putative utf-8 encoded string.
@ -236,7 +304,7 @@ UTF16LEToUTF8(unsigned char* out, int *outlen,
        (*inlenb)--;
    inlen = *inlenb / 2;
    inend = in + inlen;
-    while (in < inend) {
+    while ((in < inend) && (out - outstart + 5 < *outlen)) {
        if (xmlLittleEndian) {
 	    c= *in++;
 	} else {
@ -246,9 +314,9 @@ UTF16LEToUTF8(unsigned char* out, int *outlen,
 	    in++;
 	}
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
-            if (in >= inend) {           /* (in > inend) shouldn't happens */
-                break;
-            }
+	    if (in >= inend) {           /* (in > inend) shouldn't happens */
+		break;
+	    }
 	    if (xmlLittleEndian) {
 		d = *in++;
 	    } else {
@ -317,6 +385,24 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
    unsigned char *tmp;
    unsigned short tmp1, tmp2;

+    if (in == NULL) {
+        /*
+	 * initialization, add the Byte Order Mark
+	 */
+        if (*outlen >= 2) {
+	    outb[0] = 0xFF;
+	    outb[1] = 0xFE;
+	    *outlen = 2;
+	    *inlen = 0;
+#ifdef DEBUG_ENCODING
+            fprintf(stderr, "Added FFFE Byte Order Mark\n");
+#endif
+	    return(2);
+	}
+	*outlen = 0;
+	*inlen = 0;
+	return(0);
+    }
    outend = out + (*outlen / 2);
    while (in < inend) {
      d= *in++;
@ -385,7 +471,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
 	    break;
 	processed = in;
    }
-    *outlen = out - outstart;
+    *outlen = (out - outstart) * 2;
    *inlen = processed - in;
    return(0);
 }
@ -509,6 +595,24 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
    unsigned char *tmp;
    unsigned short tmp1, tmp2;

+    if (in == NULL) {
+        /*
+	 * initialization, add the Byte Order Mark
+	 */
+        if (*outlen >= 2) {
+	    outb[0] = 0xFE;
+	    outb[1] = 0xFF;
+	    *outlen = 2;
+	    *inlen = 0;
+#ifdef DEBUG_ENCODING
+            fprintf(stderr, "Added FEFF Byte Order Mark\n");
+#endif
+	    return(2);
+	}
+	*outlen = 0;
+	*inlen = 0;
+	return(0);
+    }
    outend = out + (*outlen / 2);
    while (in < inend) {
      d= *in++;
@ -574,7 +678,7 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
 	    break;
 	processed = in;
    }
-    *outlen = out - outstart;
+    *outlen = (out - outstart) * 2;
    *inlen = processed - in;
    return(0);
 }
@ -695,6 +799,70 @@ xmlParseCharEncoding(const char* name)
    return(XML_CHAR_ENCODING_ERROR);
 }

+/**
+ * xmlGetCharEncodingName:
+ * @enc:  the encoding
+ *
+ * The "canonical" name for XML encoding.
+ * C.f. http://www.w3.org/TR/REC-xml#charencoding
+ * Section 4.3.3  Character Encoding in Entities
+ *
+ * Both UTF-16 byte orders map to "UTF-16" and the four UCS-4 layouts
+ * all map to "ISO-10646-UCS-4", so the returned name alone does not
+ * identify the byte order.
+ * The result is a string literal: it must not be modified or freed.
+ *
+ * Returns the canonical name for the given encoding, or NULL for
+ *         XML_CHAR_ENCODING_ERROR, XML_CHAR_ENCODING_NONE and any value
+ *         not listed in the switch below
+ */
+
+const char*
+xmlGetCharEncodingName(xmlCharEncoding enc) {
+    switch (enc) {
+        case XML_CHAR_ENCODING_ERROR:
+	    return(NULL);
+        case XML_CHAR_ENCODING_NONE:
+	    return(NULL);
+        case XML_CHAR_ENCODING_UTF8:
+	    return("UTF-8");
+        case XML_CHAR_ENCODING_UTF16LE:
+	    return("UTF-16"); /* same name as the big-endian variant */
+        case XML_CHAR_ENCODING_UTF16BE:
+	    return("UTF-16");
+        case XML_CHAR_ENCODING_EBCDIC:
+            return("EBCDIC");
+        /* all four UCS-4 byte layouts share a single name */
+        case XML_CHAR_ENCODING_UCS4LE:
+            return("ISO-10646-UCS-4");
+        case XML_CHAR_ENCODING_UCS4BE:
+            return("ISO-10646-UCS-4");
+        case XML_CHAR_ENCODING_UCS4_2143:
+            return("ISO-10646-UCS-4");
+        case XML_CHAR_ENCODING_UCS4_3412:
+            return("ISO-10646-UCS-4");
+        case XML_CHAR_ENCODING_UCS2:
+            return("ISO-10646-UCS-2");
+        case XML_CHAR_ENCODING_8859_1:
+	    return("ISO-8859-1");
+        case XML_CHAR_ENCODING_8859_2:
+	    return("ISO-8859-2");
+        case XML_CHAR_ENCODING_8859_3:
+	    return("ISO-8859-3");
+        case XML_CHAR_ENCODING_8859_4:
+	    return("ISO-8859-4");
+        case XML_CHAR_ENCODING_8859_5:
+	    return("ISO-8859-5");
+        case XML_CHAR_ENCODING_8859_6:
+	    return("ISO-8859-6");
+        case XML_CHAR_ENCODING_8859_7:
+	    return("ISO-8859-7");
+        case XML_CHAR_ENCODING_8859_8:
+	    return("ISO-8859-8");
+        case XML_CHAR_ENCODING_8859_9:
+	    return("ISO-8859-9");
+        case XML_CHAR_ENCODING_2022_JP:
+            return("ISO-2022-JP");
+        case XML_CHAR_ENCODING_SHIFT_JIS:
+            return("Shift-JIS");
+        case XML_CHAR_ENCODING_EUC_JP:
+            return("EUC-JP");
+    }
+    return(NULL); /* value not handled by the switch */
+}
+
 /****************************************************************
 *								*
 *		Char encoding handlers				*
@ -883,7 +1051,7 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
            handler = xmlFindCharEncodingHandler("ebcdic");
            if (handler != NULL) return(handler);
 	    break;
-        case XML_CHAR_ENCODING_UCS4LE:
+        case XML_CHAR_ENCODING_UCS4BE:
            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
            if (handler != NULL) return(handler);
            handler = xmlFindCharEncodingHandler("UCS-4");
@ -891,8 +1059,12 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
            handler = xmlFindCharEncodingHandler("UCS4");
            if (handler != NULL) return(handler);
 	    break;
-        case XML_CHAR_ENCODING_UCS4BE:
-            handler = xmlFindCharEncodingHandler("UCS4BE");
+        case XML_CHAR_ENCODING_UCS4LE:
+            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("UCS-4");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("UCS4");
            if (handler != NULL) return(handler);
 	    break;
        case XML_CHAR_ENCODING_UCS4_2143:
@ -953,9 +1125,10 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
 */
 xmlCharEncodingHandlerPtr
 xmlFindCharEncodingHandler(const char *name) {
+    xmlCharEncodingHandlerPtr enc;
+    xmlCharEncoding alias;
 #ifdef LIBXML_ICONV_ENABLED
    iconv_t icv_in, icv_out;
-    xmlCharEncodingHandlerPtr enc;
 #endif /* LIBXML_ICONV_ENABLED */
    char upper[100];
    int i;
@ -964,6 +1137,9 @@ xmlFindCharEncodingHandler(const char *name) {
    if (name == NULL) return(xmlDefaultCharEncodingHandler);
    if (name[0] == 0) return(xmlDefaultCharEncodingHandler);

+    /*
+     * Check first for directly registered encoding names
+     */
    for (i = 0;i < 99;i++) {
        upper[i] = toupper(name[i]);
 	if (upper[i] == 0) break;
@ -1002,9 +1178,23 @@ xmlFindCharEncodingHandler(const char *name) {
 	    fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
    }
 #endif /* LIBXML_ICONV_ENABLED */
+
 #ifdef DEBUG_ENCODING
    fprintf(stderr, "No handler found for encoding %s\n", name);
 #endif
+
+    /*
+     * Fallback using the canonical names
+     */
+    alias = xmlParseCharEncoding(name);
+    if (alias != XML_CHAR_ENCODING_ERROR) {
+        const char* canon;
+        canon = xmlGetCharEncodingName(alias);
+        if ((canon != NULL) && (strcmp(name, canon))) {
+	    return(xmlFindCharEncodingHandler(canon));
+        }
+    }
+
    return(NULL);
 }

@ -1040,8 +1230,13 @@ xmlIconvWrapper(iconv_t cd,
 	ret = iconv(cd,
 		&icv_in, &icv_inlen,
 		&icv_out, &icv_outlen);
-	*inlen -= icv_inlen;
-	*outlen -= icv_outlen;
+	if (in != NULL) {
+	    *inlen -= icv_inlen;
+	    *outlen -= icv_outlen;
+	} else {
+	    *inlen = 0;
+	    *outlen = 0;
+	}
 	if (icv_inlen != 0 || ret == (size_t) -1) {
 #ifdef EILSEQ
 		if (errno == EILSEQ) {
@ -1066,6 +1261,91 @@ xmlIconvWrapper(iconv_t cd,
 }
 #endif /* LIBXML_ICONV_ENABLED */

+/**
+ * xmlCharEncFirstLine:
+ * @handler:	char encoding transformation data structure
+ * @out:  an xmlBuffer for the output.
+ * @in:  an xmlBuffer for the input
+ *
+ * Front-end for the encoding handler input function, but handle only
+ * the very first line, i.e. limit the output of the conversion to
+ * 45 bytes.
+ *
+ * The bytes reported as consumed are removed from @in, the bytes
+ * reported as produced are appended to @out, and @out is kept
+ * zero terminated.
+ *
+ * NOTE(review): the original text documents the success value as the
+ * number of bytes written; what the handler actually returns on success
+ * is not visible from this function -- confirm against the handlers.
+ *
+ * Returns the value reported by the conversion routine, where -1 and -3
+ *     (input not ending on a character boundary) are mapped to 0, or
+ *     -1 if @handler, @out or @in is NULL
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we
+ *        want), or if the handler provides no input conversion routine
+ */
+int
+xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out,
+                 xmlBufferPtr in) {
+    int ret = -2; /* stays -2 when no conversion routine is available */
+    int written;
+    int toconv;
+
+    if (handler == NULL) return(-1);
+    if (out == NULL) return(-1);
+    if (in == NULL) return(-1);
+
+    written = out->size - out->use; /* free space left in the output */
+    toconv = in->use;
+    if (toconv * 2 >= written) {
+        xmlBufferGrow(out, toconv);
+	written = out->size - out->use - 1;
+    }
+
+    /*
+     * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38
+     * 45 chars should be sufficient to reach the end of the encoding
+     * declaration without going too far inside the document content.
+     *
+     * This overrides the free-space value computed just above.
+     * NOTE(review): nothing visible here guarantees that @out really has
+     * 45 bytes plus the terminator available -- confirm xmlBufferGrow().
+     */
+    written = 45;
+
+    if (handler->input != NULL) {
+	ret = handler->input(&out->content[out->use], &written,
+	                     in->content, &toconv);
+	xmlBufferShrink(in, toconv); /* drop the consumed input bytes */
+	out->use += written;
+	out->content[out->use] = 0; /* keep the output zero terminated */
+    }
+#ifdef LIBXML_ICONV_ENABLED
+    else if (handler->iconv_in != NULL) {
+	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
+	                      &written, in->content, &toconv);
+	xmlBufferShrink(in, toconv);
+	out->use += written;
+	out->content[out->use] = 0;
+	if (ret == -1) ret = -3; /* iconv -1 is reported as -3 here */
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef DEBUG_ENCODING
+    switch (ret) {
+        case 0:
+	    fprintf(stderr, "converted %d bytes to %d bytes of input\n",
+	            toconv, written);
+	    break;
+        case -1:
+	    fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
+	            toconv, written, in->use);
+	    break;
+        case -2:
+	    fprintf(stderr, "input conversion failed due to input error\n");
+	    break;
+        case -3:
+	    fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
+	            toconv, written, in->use);
+	    break;
+	default:
+	    fprintf(stderr,"Unknown input conversion failed %d\n", ret);
+    }
+#endif
+    /*
+     * Ignore when input buffer is not on a boundary
+     */
+    if (ret == -3) ret = 0;
+    if (ret == -1) ret = 0;
+    return(ret);
+}
+
 /**
 * xmlCharEncInFunc:
 * @handler:	char enconding transformation data structure
@ -1113,8 +1393,8 @@ xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
 	if (ret == -1) ret = -3;
    }
 #endif /* LIBXML_ICONV_ENABLED */
-#ifdef DEBUG_ENCODING
    switch (ret) {
+#ifdef DEBUG_ENCODING
        case 0:
 	    fprintf(stderr, "converted %d bytes to %d bytes of input\n",
 	            toconv, written);
@ -1123,17 +1403,17 @@ xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
 	    fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
 	            toconv, written, in->use);
 	    break;
-        case -2:
-	    fprintf(stderr, "input conversion failed due to input error\n");
-	    break;
        case -3:
 	    fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
 	            toconv, written, in->use);
 	    break;
-	default:
-	    fprintf(stderr,"Unknown input conversion failed %d\n", ret);
-    }
 #endif
+        case -2:
+	    fprintf(stderr, "input conversion failed due to input error\n");
+	    fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+		    in->content[0], in->content[1],
+		    in->content[2], in->content[3]);
+    }
    /*
     * Ignore when input buffer is not on a boundary
     */
@ -1148,6 +1428,11 @@ xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
 * @in:  an xmlBuffer for the input
 *     
 * Generic front-end for the encoding handler output function
+ * A call with @in == NULL has to be made first to initialize the
+ * output, for encodings that are not stateless and need to set up their
+ * state or emit an output prefix (like the BOM in UTF16).
+ * In case of UTF8 sequence conversion errors for the given encoder,
+ * the content will be automatically remapped to a CharRef sequence.
 *     
 * Returns the number of byte written if success, or 
 *     -1 general error
@ -1160,12 +1445,43 @@ xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
    int ret = -2;
    int written;
    int toconv;
+    int output = 0;

    if (handler == NULL) return(-1);
    if (out == NULL) return(-1);
-    if (in == NULL) return(-1);

+retry:
+    
    written = out->size - out->use;
+
+    /*
+     * First specific handling of in = NULL, i.e. the initialization call
+     */
+    if (in == NULL) {
+        toconv = 0;
+	if (handler->output != NULL) {
+	    ret = handler->output(&out->content[out->use], &written,
+				  NULL, &toconv);
+	    out->use += written;
+	    out->content[out->use] = 0;
+	}
+#ifdef LIBXML_ICONV_ENABLED
+	else if (handler->iconv_out != NULL) {
+	    ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
+				  &written, NULL, &toconv);
+	    out->use += written;
+	    out->content[out->use] = 0;
+	}
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef DEBUG_ENCODING
+	fprintf(stderr, "initialized encoder\n");
+#endif
+        return(0);
+    }
+
+    /*
+     * Convertion itself.
+     */
    toconv = in->use;
    if (toconv * 2 >= written) {
        xmlBufferGrow(out, toconv * 2);
@ -1173,7 +1489,7 @@ xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
    }
    if (handler->output != NULL) {
 	ret = handler->output(&out->content[out->use], &written,
-	                     in->content, &toconv);
+	                      in->content, &toconv);
 	xmlBufferShrink(in, toconv);
 	out->use += written;
 	out->content[out->use] = 0;
@ -1188,8 +1504,14 @@ xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
 	if (ret == -1) ret = -3;
    }
 #endif /* LIBXML_ICONV_ENABLED */
-#ifdef DEBUG_ENCODING
+
+    if (ret >= 0) output += ret;
+
+    /*
+     * Attempt to handle error cases
+     */
    switch (ret) {
+#ifdef DEBUG_ENCODING
        case 0:
 	    fprintf(stderr, "converted %d bytes to %d bytes of output\n",
 	            toconv, written);
@ -1197,17 +1519,45 @@ xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
        case -1:
 	    fprintf(stderr, "output conversion failed by lack of space\n");
 	    break;
-        case -2:
-	    fprintf(stderr, "output conversion failed due to output error\n");
-	    break;
        case -3:
 	    fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
 	            toconv, written, in->use);
 	    break;
-	default:
-	    fprintf(stderr,"Unknown output conversion failed %d\n", ret);
-    }
 #endif
+        case -2: {
+	    int len = in->use;
+	    const char *utf = (const char *) in->content;
+	    int cur;
+
+	    cur = xmlGetUTF8Char(utf, &len);
+	    if (cur > 0) {
+		xmlChar charref[20];
+
+#ifdef DEBUG_ENCODING
+		fprintf(stderr, "handling output conversion error\n");
+		fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+			in->content[0], in->content[1],
+			in->content[2], in->content[3]);
+#endif
+		/*
+		 * Removes the UTF8 sequence, and replace it by a charref
+		 * and continue the transcoding phase, hoping the error
+		 * did not mangle the encoder state.
+		 */
+		sprintf(charref, "&#x%X;", cur);
+		xmlBufferShrink(in, len);
+		xmlBufferAddHead(in, charref, -1);
+
+		goto retry;
+	    } else {
+		fprintf(stderr, "output conversion failed due to conv error\n");
+		fprintf(stderr, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
+			in->content[0], in->content[1],
+			in->content[2], in->content[3]);
+	    }
+	    break;
+	}
+    }
    return(ret);
 }