- doc/encoding.html doc/xml.html: added I18N doc

- encoding.[ch] HTMLtree.[ch] parser.c HTMLparser.c: I18N encoding improvements, both parser and filters, added ASCII & HTML, fixed the ISO-Latin-1 one - xmllint.c testHTML.c: added/made visible --encode - debugXML.c : cleanup - most .c files: applied patches due to warning on Windows and when using Sun Pro cc compiler - xpath.c : cleanup memleaks - nanoftp.c : added a TESTING preprocessor flag for standalong compile so that people can report bugs more easilly - nanohttp.c : ditched socklen_t which was a portability mess and replaced it with unsigned int. - tree.[ch]: added xmlHasProp() - TODO: updated - test/ : added more test for entities, NS, encoding, HTML, wap - configure.in: preparing for 2.2.0 release Daniel
2025-07-29 11:41:22 +03:00 · 2000-07-14 14:49:25 +00:00
parent 8d86964a4a
commit 32bc74ef98
41 changed files with 2068 additions and 928 deletions
--- a/encoding.c
+++ b/encoding.c
@ -43,6 +43,9 @@
 #endif
 #include <libxml/encoding.h>
 #include <libxml/xmlmemory.h>
+#ifdef LIBXML_HTML_ENABLED
+#include <libxml/HTMLparser.h>
+#endif

 xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
 xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
@ -177,6 +180,140 @@ xmlCheckUTF8(const unsigned char *utf)
      return(1);
 }

+/**
+ * asciiToUTF8:
+ * @out:  a pointer to an array of bytes to store the result
+ * @outlen:  the length of @out
+ * @in:  a pointer to an array of ASCII chars
+ * @inlen:  the length of @in
+ *
+ * Take a block of ASCII chars in and try to convert it to an UTF-8
+ * block of chars out.
+ * Returns 0 if success, or -1 otherwise
+ * The value of @inlen after return is the number of octets consumed
+ *     as the return value is positive, else unpredictiable.
+ * The value of @outlen after return is the number of ocetes consumed.
+ */
+int
+asciiToUTF8(unsigned char* out, int *outlen,
+              const unsigned char* in, int *inlen) {
+    unsigned char* outstart = out;
+    const unsigned char* base = in;
+    const unsigned char* processed = in;
+    unsigned char* outend = out + *outlen;
+    const unsigned char* inend;
+    unsigned int c;
+    int bits;
+
+    inend = in + (*inlen);
+    while ((in < inend) && (out - outstart + 5 < *outlen)) {
+	c= *in++;
+
+	/* assertion: c is a single UTF-4 value */
+        if (out >= outend)
+	    break;
+        if      (c <    0x80) {  *out++=  c;                bits= -6; }
+        else { 
+	    *outlen = out - outstart;
+	    *inlen = processed - base;
+	    return(-1);
+	}
+ 
+        for ( ; bits >= 0; bits-= 6) {
+            if (out >= outend)
+	        break;
+            *out++= ((c >> bits) & 0x3F) | 0x80;
+        }
+	processed = (const unsigned char*) in;
+    }
+    *outlen = out - outstart;
+    *inlen = processed - base;
+    return(0);
+}
+
+/**
+ * UTF8Toascii:
+ * @out:  a pointer to an array of bytes to store the result
+ * @outlen:  the length of @out
+ * @in:  a pointer to an array of UTF-8 chars
+ * @inlen:  the length of @in
+ *
+ * Take a block of UTF-8 chars in and try to convert it to an ASCII
+ * block of chars out.
+ *
+ * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
+ * The value of @inlen after return is the number of octets consumed
+ *     as the return value is positive, else unpredictiable.
+ * The value of @outlen after return is the number of ocetes consumed.
+ */
+int
+UTF8Toascii(unsigned char* out, int *outlen,
+              const unsigned char* in, int *inlen) {
+    const unsigned char* processed = in;
+    const unsigned char* outend;
+    const unsigned char* outstart = out;
+    const unsigned char* instart = in;
+    const unsigned char* inend;
+    unsigned int c, d;
+    int trailing;
+
+    if (in == NULL) {
+        /*
+	 * initialization nothing to do
+	 */
+	*outlen = 0;
+	*inlen = 0;
+	return(0);
+    }
+    inend = in + (*inlen);
+    outend = out + (*outlen);
+    while (in < inend) {
+	d = *in++;
+	if      (d < 0x80)  { c= d; trailing= 0; }
+	else if (d < 0xC0) {
+	    /* trailing byte in leading position */
+	    *outlen = out - outstart;
+	    *inlen = processed - instart;
+	    return(-2);
+        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
+        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
+        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
+	else {
+	    /* no chance for this in Ascii */
+	    *outlen = out - outstart;
+	    *inlen = processed - instart;
+	    return(-2);
+	}
+
+	if (inend - in < trailing) {
+	    break;
+	} 
+
+	for ( ; trailing; trailing--) {
+	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
+		break;
+	    c <<= 6;
+	    c |= d & 0x3F;
+	}
+
+	/* assertion: c is a single UTF-4 value */
+	if (c < 0x80) {
+	    if (out >= outend)
+		break;
+	    *out++ = c;
+	} else {
+	    /* no chance for this in Ascii */
+	    *outlen = out - outstart;
+	    *inlen = processed - instart;
+	    return(-2);
+	}
+	processed = in;
+    }
+    *outlen = out - outstart;
+    *inlen = processed - instart;
+    return(0);
+}
+
 /**
 * isolat1ToUTF8:
 * @out:  a pointer to an array of bytes to store the result
@ -195,28 +332,32 @@ int
 isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
+    const unsigned char* base = in;
    const unsigned char* processed = in;
    unsigned char* outend = out + *outlen;
-    const unsigned char* inend = in + *inlen;
-    unsigned char c;
+    const unsigned char* inend;
+    unsigned int c;
+    int bits;

-    while (in < inend) {
-        c= *in++;
-        if (c < 0x80) {
+    inend = in + (*inlen);
+    while ((in < inend) && (out - outstart + 5 < *outlen)) {
+	c= *in++;
+
+	/* assertion: c is a single UTF-4 value */
+        if (out >= outend)
+	    break;
+        if      (c <    0x80) {  *out++=  c;                bits= -6; }
+        else                  {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
+ 
+        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend)
-		break;
-            *out++ = c;
+	        break;
+            *out++= ((c >> bits) & 0x3F) | 0x80;
        }
-        else {
-            if (out + 1 >= outend)  break;
-            *out++ = 0xC0 | (c >> 6);
-            *out++ = 0x80 | (0x3F & c);
-        }
-	processed = in;
+	processed = (const unsigned char*) in;
    }
    *outlen = out - outstart;
-    *inlen = processed - in;
-
+    *inlen = processed - base;
    return(0);
 }

@ -229,7 +370,6 @@ isolat1ToUTF8(unsigned char* out, int *outlen,
 *
 * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
 * block of chars out.
- * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
 * The value of @inlen after return is the number of octets consumed
@ -239,34 +379,68 @@ isolat1ToUTF8(unsigned char* out, int *outlen,
 int
 UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
-    unsigned char* outstart = out;
    const unsigned char* processed = in;
-    unsigned char* outend = out + *outlen;
-    const unsigned char* inend = in + *inlen;
-    unsigned char c;
+    const unsigned char* outend;
+    const unsigned char* outstart = out;
+    const unsigned char* instart = in;
+    const unsigned char* inend;
+    unsigned int c, d;
+    int trailing;

+    if (in == NULL) {
+        /*
+	 * initialization nothing to do
+	 */
+	*outlen = 0;
+	*inlen = 0;
+	return(0);
+    }
+    inend = in + (*inlen);
+    outend = out + (*outlen);
    while (in < inend) {
-        c= *in++;
-        if (c < 0x80) {
-            if (out >= outend)  return(-1);
-            *out++= c;
-        }
-	else if (in == inend) {
-            break;
-	}
-	else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
-	    /* a two byte utf-8 and can be encoding as isolate1 */
-            *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
-	}
-	else {
+	d = *in++;
+	if      (d < 0x80)  { c= d; trailing= 0; }
+	else if (d < 0xC0) {
+	    /* trailing byte in leading position */
 	    *outlen = out - outstart;
-	    *inlen = processed - in;
+	    *inlen = processed - instart;
+	    return(-2);
+        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
+        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
+        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
+	else {
+	    /* no chance for this in IsoLat1 */
+	    *outlen = out - outstart;
+	    *inlen = processed - instart;
+	    return(-2);
+	}
+
+	if (inend - in < trailing) {
+	    break;
+	} 
+
+	for ( ; trailing; trailing--) {
+	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
+		break;
+	    c <<= 6;
+	    c |= d & 0x3F;
+	}
+
+	/* assertion: c is a single UTF-4 value */
+	if (c <= 0xFF) {
+	    if (out >= outend)
+		break;
+	    *out++ = c;
+	} else {
+	    /* no chance for this in IsoLat1 */
+	    *outlen = out - outstart;
+	    *inlen = processed - instart;
 	    return(-2);
 	}
 	processed = in;
    }
    *outlen = out - outstart;
-    *inlen = processed - in;
+    *inlen = processed - instart;
    return(0);
 }

@ -367,7 +541,6 @@ UTF16LEToUTF8(unsigned char* out, int *outlen,
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
 * block of chars out.
- * TODO: UTF8ToUTF16LE need a fallback mechanism ...
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding failed. 
@ -410,7 +583,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
      if      (d < 0x80)  { c= d; trailing= 0; }
      else if (d < 0xC0) {
          /* trailing byte in leading position */
-	  *outlen = out - outstart;
+	  *outlen = (out - outstart) * 2;
 	  *inlen = processed - in;
 	  return(-2);
      } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
@ -418,7 +591,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
      else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
      else {
 	/* no chance for this in UTF-16 */
-	*outlen = out - outstart;
+	*outlen = (out - outstart) * 2;
 	*inlen = processed - in;
 	return(-2);
      }
@ -578,7 +751,6 @@ UTF16BEToUTF8(unsigned char* out, int *outlen,
 *
 * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
 * block of chars out.
- * TODO: UTF8ToUTF16BE need a fallback mechanism ...
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding failed. 
@ -861,6 +1033,8 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
            return("Shift-JIS");
        case XML_CHAR_ENCODING_EUC_JP:
            return("EUC-JP");
+        case XML_CHAR_ENCODING_ASCII:
+            return("ASCII");
    }
    return(NULL);
 }
@ -974,6 +1148,10 @@ xmlInitCharEncodingHandlers(void) {
    xmlUTF16BEHandler = 
          xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
    xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
+    xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
+#ifdef LIBXML_HTML_ENABLED
+    xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
+#endif
 }

 /**
@ -1081,16 +1259,51 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
            handler = xmlFindCharEncodingHandler("UCS2");
            if (handler != NULL) return(handler);
 	    break;
+
+	    /*
+	     * We used to keep ISO Latin encodings native in the
+	     * generated data. This led to so many problems that
+	     * this has been removed. One can still change this
+	     * back by registering no-ops encoders for those
+	     */
        case XML_CHAR_ENCODING_8859_1:
+	    handler = xmlFindCharEncodingHandler("ISO-8859-1");
+	    if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_8859_2:
+	    handler = xmlFindCharEncodingHandler("ISO-8859-2");
+	    if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_8859_3:
+	    handler = xmlFindCharEncodingHandler("ISO-8859-3");
+	    if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_8859_4:
+	    handler = xmlFindCharEncodingHandler("ISO-8859-4");
+	    if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_8859_5:
+	    handler = xmlFindCharEncodingHandler("ISO-8859-5");
+	    if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_8859_6:
+	    handler = xmlFindCharEncodingHandler("ISO-8859-6");
+	    if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_8859_7:
+	    handler = xmlFindCharEncodingHandler("ISO-8859-7");
+	    if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_8859_8:
+	    handler = xmlFindCharEncodingHandler("ISO-8859-8");
+	    if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_8859_9:
-	    return(NULL);
+	    handler = xmlFindCharEncodingHandler("ISO-8859-9");
+	    if (handler != NULL) return(handler);
+	    break;
+
+
        case XML_CHAR_ENCODING_2022_JP:
            handler = xmlFindCharEncodingHandler("ISO-2022-JP");
            if (handler != NULL) return(handler);
@ -1161,7 +1374,8 @@ xmlFindCharEncodingHandler(const char *name) {
    icv_in = iconv_open("UTF-8", name);
    icv_out = iconv_open(name, "UTF-8");
    if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
-	    enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
+	    enc = (xmlCharEncodingHandlerPtr)
+	          xmlMalloc(sizeof(xmlCharEncodingHandler));
 	    if (enc == NULL) {
 	        iconv_close(icv_in);
 	        iconv_close(icv_out);
@ -1506,6 +1720,10 @@ retry:
 	if (ret == -1) ret = -3;
    }
 #endif /* LIBXML_ICONV_ENABLED */
+    else {
+	fprintf(stderr, "xmlCharEncOutFunc: no output function !\n");
+	return(-1);
+    }

    if (ret >= 0) output += ret;

@ -1528,7 +1746,7 @@ retry:
 #endif
        case -2: {
 	    int len = in->use;
-	    const char *utf = (const char *) in->content;
+	    const xmlChar *utf = (const xmlChar *) in->content;
 	    int cur;

 	    cur = xmlGetUTF8Char(utf, &len);
@ -1546,7 +1764,7 @@ retry:
 		 * and continue the transcoding phase, hoping the error
 		 * did not mangle the encoder state.
 		 */
-		sprintf(charref, "&#x%X;", cur);
+		sprintf((char *) charref, "&#x%X;", cur);
 		xmlBufferShrink(in, len);
 		xmlBufferAddHead(in, charref, -1);