revamped the encoding support, added iconv support, so now libxml if

* encoding.[ch], xmlIO.[ch], parser.c, configure.in: revamped the encoding support and added iconv support, so libxml, if compiled with iconv, now automatically supports Japanese encodings among others. Work based on an initial patch from Yuan-Chen Cheng. I may have broken binary compat in the encoding handler registration scheme, but that was so utterly broken I don't expect anybody to have used this feature until now. * parserInternals.h: fixup on the CHAR range macro. * xml-error.h, parser.c: catch URL/URI errors using the uri.c code. * tree.[ch]: added xmlBufferGrow(), which was needed for iconv. * uri.c: added xmlParseURI(); I can't believe I forgot to implement this one in 2.0! * SAX.c: moved the doc->encoding update into the endDocument() call. * TODO: updated. Iconv rules :-) Daniel
2025-07-30 22:43:14 +03:00 · 2000-05-03 14:20:55 +00:00
parent 06047432eb
commit 496a1cf592
18 changed files with 1163 additions and 487 deletions
--- a/18
+++ b/18
@ -1,3 +1,21 @@
+Wed May  3 14:21:25 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
+
+	* encoding.[ch], xmlIO.[ch], parser.c, configure.in : revamped
+	  the encoding support, added iconv support, so now libxml if
+	  compiled with iconv automatically support japanese encodings
+	  among others. Work based on initial patch from Yuan-Chen Cheng
+	  I may have broken binary compat in the encoding handler
+	  registration scheme, but that was so utterly broken I don't
+	  expect anybody to have used this feature until now.
+	* parserInternals.h: fixup on the CHAR range macro
+	* xml-error.h, parser.c: catch URL/URI errors using the uri.c
+	  code.
+	* tree.[ch]: added xmlBufferGrow(), was needed for iconv
+	* uri.c: added xmlParseURI() I can't believe I forgot to
+	  implement this one in 2.0 !!!
+	* SAX.c: moved doc->encoding update in the endDocument() call.
+	* TODO: updated.
+
 Mon Apr 24 13:30:13 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>

 	* tree.h: removed extraneous xmlRemoveProp definition
--- a/SAX.c
+++ b/SAX.c
@ -595,6 +595,15 @@ endDocument(void *ctx)
    if (ctxt->validate && ctxt->wellFormed &&
        ctxt->myDoc && ctxt->myDoc->intSubset)
 	ctxt->valid &= xmlValidateDocumentFinal(&ctxt->vctxt, ctxt->myDoc);
+
+    /*
+     * Grab the encoding if it was added on-the-fly
+     */
+    if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) &&
+	(ctxt->myDoc->encoding == NULL)) {
+	ctxt->myDoc->encoding = ctxt->encoding;
+	ctxt->encoding = NULL;
+    }
 }

 /**
--- a/9
+++ b/9
@ -6,6 +6,8 @@
 TODO:
 =====

+- xmlSwitchToEncoding() need a rewrite for correct handling of conversion
+  error code conditions.
 - DOM needs
  xmlAttrPtr xmlNewDocProp(xmlDocPtr doc, const xmlChar *name, const xmlChar *value)
  int xmlPruneProp(xmlNodePtr node, xmlAtttrPtr attr);
@ -14,7 +16,6 @@ TODO:
 - add support for the trick from Henry conf/sun/valid/empty.xml
 - Correct standalone checking/emitting (hard)
  2.9 Standalone Document Declaration
- URI checkings (no fragments) rfc2396.txt
 - Better checking of external parsed entities TAG 1234
 - Find way of representing PERefs in the Dtd so that %entity; can
  be saved back.
@ -22,6 +23,7 @@ TODO:
  http://www.w3.org/XML/xml-19980210-errata ... bummmer 
 - Handle undefined namespaces in entity contents better ... at least
  issue a warning
+- Issue warning when using non-absolute namespaces URI.
 - General checking of DTD validation in presence of namespaces ... hairy
 - fix --disable-corba configure switch handling, and use XML_WITHOUT_CORBA
  not WITHOUT_CORBA flag
@ -30,7 +32,7 @@ TODO:
 =====

 - Get OASIS testsuite to a more friendly result, check all the results
-  once stable.
+  once stable. Current state at:
  http://xmlsoft.org/conf/result.html

 - Optimization of tag strings allocation ?
@ -55,11 +57,13 @@ EXTENSIONS:

 - Add Xlink recognition/API
  => started adding an xlink.[ch] with a unified API for XML and HTML.
+     it's crap :-(

 - Implement XSLT
  => seems that someone volunteered ?!?

 - Implement XSchemas
+  => Really need to be done <grin/>

 - O2K parsing;
  => this is a somewhat ugly mix of HTML and XML, adding a specific
@ -88,6 +92,7 @@ EXTENSIONS:
 Done:
 =====

+- URI checkings (no fragments) rfc2396.txt
 - Added a clean mechanism for overload or added input methods:
  xmlRegisterInputCallbacks()
 - dynamically adapt the alloc entry point to use g_alloc()/g_free()
--- a/configure.in
+++ b/configure.in
@ -4,7 +4,7 @@ AC_INIT(entities.h)
 AM_CONFIG_HEADER(config.h)

 LIBXML_MAJOR_VERSION=2
-LIBXML_MINOR_VERSION=0
+LIBXML_MINOR_VERSION=1
 LIBXML_MICRO_VERSION=0
 LIBXML_VERSION=$LIBXML_MAJOR_VERSION.$LIBXML_MINOR_VERSION.$LIBXML_MICRO_VERSION
 LIBXML_VERSION_INFO=`expr $LIBXML_MAJOR_VERSION + $LIBXML_MINOR_VERSION`:$LIBXML_MICRO_VERSION:$LIBXML_MINOR_VERSION
@ -203,6 +203,20 @@ fi
 AC_SUBST(WITH_XPATH)
 AC_SUBST(XPATH_OBJ)

+AC_ARG_WITH(iconv, [  --with-iconv            Add the ICONV support (on)])
+if test "$with_iconv" = "no" ; then
+    echo Disabling ICONV support
+    WITH_ICONV=0
+else    
+    if test "$have_iconv" != "" ; then
+        echo Iconv support not found
+        WITH_ICONV=0
+    else
+        WITH_ICONV=1
+    fi
+fi  
+AC_SUBST(WITH_ICONV)
+
 AC_ARG_WITH(debug, [  --with-debug            Add the debugging module (on)])
 if test "$with_debug" = "no" ; then
    echo Disabling DEBUG support
--- a/encoding.c
+++ b/encoding.c
@ -34,12 +34,26 @@
 #ifdef HAVE_STDLIB_H
 #include <stdlib.h>
 #endif
+#include <libxml/xmlversion.h>
+#ifdef LIBXML_ICONV_ENABLED
+#ifdef HAVE_ERRNO_H
+#include <errno.h>
+#endif
+#endif
 #include <libxml/encoding.h>
 #include <libxml/xmlmemory.h>

 xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
 xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;

+#ifdef LIBXML_ICONV_ENABLED
+#if 0
+#define DEBUG_ENCODING  /* Define this to get encoding traces */
+#endif
+#endif
+
+static int xmlLittleEndian = 1;
+
 /*
 * From rfc2044: encoding of the Unicode values on UTF-8:
 *
@ -104,30 +118,38 @@ xmlCheckUTF8(const unsigned char *utf)
 *
 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
 * block of chars out.
- * Returns the number of byte written, or -1 by lack of space.
+ * Returns 0 if success, or -1 otherwise
+ * The value of @inlen after return is the number of octets consumed
+ *     if the return value is not negative, else unpredictable.
+ * The value of @outlen after return is the number of octets written.
 */
 int
-isolat1ToUTF8(unsigned char* out, int outlen,
+isolat1ToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
-    unsigned char* outend= out+outlen;
+    const unsigned char* processed = in;
+    unsigned char* outend = out + *outlen;
    const unsigned char* inend = in + *inlen;
    unsigned char c;

    while (in < inend) {
        c= *in++;
        if (c < 0x80) {
-            if (out >= outend)  return(-1);
+            if (out >= outend)
+		break;
            *out++ = c;
        }
        else {
-            if (out >= outend)  return(-1);
+            if (out + 1 >= outend)  break;
            *out++ = 0xC0 | (c >> 6);
-            if (out >= outend)  return(-1);
            *out++ = 0x80 | (0x3F & c);
        }
+	processed = in;
    }
-    return(out-outstart);
+    *outlen = out - outstart;
+    *inlen = processed - in;
+
+    return(0);
 }

 /**
@ -141,17 +163,17 @@ isolat1ToUTF8(unsigned char* out, int outlen,
 * block of chars out.
 * TODO: UTF8Toisolat1 need a fallback mechanism ...
 *
- * Returns the number of byte written, or -1 by lack of space, or -2
- *     if the transcoding fails (for *in is not valid utf8 string or
- *     the result of transformation can't fit into the encoding we want)
+ * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
 * The value of @inlen after return is the number of octets consumed
 *     as the return value is positive, else unpredictiable.
+ * The value of @outlen after return is the number of octets written.
 */
 int
-UTF8Toisolat1(unsigned char* out, int outlen,
+UTF8Toisolat1(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
-    unsigned char* outend= out+outlen;
+    const unsigned char* processed = in;
+    unsigned char* outend = out + *outlen;
    const unsigned char* inend = in + *inlen;
    unsigned char c;

@ -162,18 +184,22 @@ UTF8Toisolat1(unsigned char* out, int outlen,
            *out++= c;
        }
 	else if (in == inend) {
-            *inlen -= 1;
            break;
 	}
 	else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
 	    /* a two byte utf-8 and can be encoding as isolate1 */
            *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
 	}
-	else
+	else {
+	    *outlen = out - outstart;
+	    *inlen = processed - in;
 	    return(-2);
-	/* TODO : some should be represent as "&#x____;" */
 	}
-    return(out-outstart);
+	processed = in;
+    }
+    *outlen = out - outstart;
+    *inlen = processed - in;
+    return(0);
 }

 /**
@ -194,11 +220,12 @@ UTF8Toisolat1(unsigned char* out, int outlen,
 *     as the return value is positive, else unpredictiable.
 */
 int
-UTF16LEToUTF8(unsigned char* out, int outlen,
+UTF16LEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
 {
    unsigned char* outstart = out;
-    unsigned char* outend= out+outlen;
+    const unsigned char* processed = inb;
+    unsigned char* outend = out + *outlen;
    unsigned short* in = (unsigned short*) inb;
    unsigned short* inend;
    unsigned int c, d, inlen;
@ -210,40 +237,42 @@ UTF16LEToUTF8(unsigned char* out, int outlen,
    inlen = *inlenb / 2;
    inend = in + inlen;
    while (in < inend) {
-#ifdef BIG_ENDIAN
+        if (xmlLittleEndian) {
+	    c= *in++;
+	} else {
 	    tmp = (unsigned char *) in;
 	    c = *tmp++;
 	    c = c | (((unsigned int)*tmp) << 8);
 	    in++;
-#else /* BIG_ENDIAN */
-        c= *in++;
-#endif /* BIG_ENDIAN */
+	}
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
            if (in >= inend) {           /* (in > inend) shouldn't happens */
-                (*inlenb) -= 2;
                break;
            }
-#ifdef BIG_ENDIAN
+	    if (xmlLittleEndian) {
+		d = *in++;
+	    } else {
 		tmp = (unsigned char *) in;
 		d = *tmp++;
 		d = d | (((unsigned int)*tmp) << 8);
 		in++;
-#else /* BIG_ENDIAN */
-            d = *in++;
-#endif /* BIG_ENDIAN */
+	    }
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
-            else
+            else {
+		*outlen = out - outstart;
+		*inlenb = processed - inb;
 	        return(-2);
 	    }
+        }

 	/* assertion: c is a single UTF-4 value */
        if (out >= outend)
-	    return(-1);
+	    break;
        if      (c <    0x80) {  *out++=  c;                bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
@ -251,11 +280,14 @@ UTF16LEToUTF8(unsigned char* out, int outlen,
 
        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend)
-	        return(-1);
+	        break;
            *out++= ((c >> bits) & 0x3F) | 0x80;
        }
+	processed = (const unsigned char*) in;
    }
-    return(out-outstart);
+    *outlen = out - outstart;
+    *inlenb = processed - inb;
+    return(0);
 }

 /**
@ -273,40 +305,44 @@ UTF16LEToUTF8(unsigned char* out, int outlen,
 *     if the transcoding failed. 
 */
 int
-UTF8ToUTF16LE(unsigned char* outb, int outlen,
+UTF8ToUTF16LE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
 {
    unsigned short* out = (unsigned short*) outb;
+    const unsigned char* processed = in;
    unsigned short* outstart= out;
    unsigned short* outend;
    const unsigned char* inend= in+*inlen;
    unsigned int c, d, trailing;
-#ifdef BIG_ENDIAN
    unsigned char *tmp;
    unsigned short tmp1, tmp2;
-#endif /* BIG_ENDIAN */

-    outlen /= 2; /* convert in short length */
-    outend = out + outlen;
+    outend = out + (*outlen / 2);
    while (in < inend) {
      d= *in++;
      if      (d < 0x80)  { c= d; trailing= 0; }
-      else if (d < 0xC0)
-          return(-2);    /* trailing byte in leading position */
-      else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
+      else if (d < 0xC0) {
+          /* trailing byte in leading position */
+	  *outlen = out - outstart;
+	  *inlen = processed - in;
+	  return(-2);
+      } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
      else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
      else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
-      else
-          return(-2);    /* no chance for this in UTF-16 */
+      else {
+	/* no chance for this in UTF-16 */
+	*outlen = out - outstart;
+	*inlen = processed - in;
+	return(-2);
+      }

      if (inend - in < trailing) {
-          *inlen -= (inend - in);
          break;
      } 

      for ( ; trailing; trailing--) {
          if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
-	      return(-1);
+	      break;
          c <<= 6;
          c |= d & 0x3F;
      }
@ -314,21 +350,24 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen,
      /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
            if (out >= outend)
-	        return(-1);
-#ifdef BIG_ENDIAN
+	        break;
+	    if (xmlLittleEndian) {
+		*out++ = c;
+	    } else {
 		tmp = (unsigned char *) out;
 		*tmp = c ;
 		*(tmp + 1) = c >> 8 ;
 		out++;
-#else /* BIG_ENDIAN */
-            *out++ = c;
-#endif /* BIG_ENDIAN */
+	    }
        }
        else if (c < 0x110000) {
            if (out+1 >= outend)
-	        return(-1);
+	        break;
            c -= 0x10000;
-#ifdef BIG_ENDIAN
+	    if (xmlLittleEndian) {
+		*out++ = 0xD800 | (c >> 10);
+		*out++ = 0xDC00 | (c & 0x03FF);
+	    } else {
 		tmp1 = 0xD800 | (c >> 10);
 		tmp = (unsigned char *) out;
 		*tmp = tmp1;
@ -340,15 +379,15 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen,
 		*tmp  = tmp2;
 		*(tmp + 1) = tmp2 >> 8;
 		out++;
-#else /* BIG_ENDIAN */
-            *out++ = 0xD800 | (c >> 10);
-            *out++ = 0xDC00 | (c & 0x03FF);
-#endif /* BIG_ENDIAN */
+	    }
        }
        else
-	    return(-1);
+	    break;
+	processed = in;
    }
-    return(out-outstart);
+    *outlen = out - outstart;
+    *inlen = processed - in;
+    return(0);
 }

 /**
@ -369,18 +408,16 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen,
 *     as the return value is positive, else unpredictiable.
 */
 int
-UTF16BEToUTF8(unsigned char* out, int outlen,
+UTF16BEToUTF8(unsigned char* out, int *outlen,
            const unsigned char* inb, int *inlenb)
 {
    unsigned char* outstart = out;
-    unsigned char* outend= out+outlen;
+    const unsigned char* processed = inb;
+    unsigned char* outend = out + *outlen;
    unsigned short* in = (unsigned short*) inb;
    unsigned short* inend;
    unsigned int c, d, inlen;
-#ifdef BIG_ENDIAN
-#else /* BIG_ENDIAN */
    unsigned char *tmp;
-#endif /* BIG_ENDIAN */    
    int bits;

    if ((*inlenb % 2) == 1)
@ -388,43 +425,46 @@ UTF16BEToUTF8(unsigned char* out, int outlen,
    inlen = *inlenb / 2;
    inend= in + inlen;
    while (in < inend) {
-#ifdef BIG_ENDIAN    
-        c= *in++;
-#else
+	if (xmlLittleEndian) {
 	    tmp = (unsigned char *) in;
 	    c = *tmp++;
 	    c = c << 8;
 	    c = c | (unsigned int) *tmp;
 	    in++;
-#endif	
+	} else {
+	    c= *in++;
+	} 
        if ((c & 0xFC00) == 0xD800) {    /* surrogates */
 	    if (in >= inend) {           /* (in > inend) shouldn't happens */
-	        (*inlenb) -= 2;
-		break;
+		*outlen = out - outstart;
+		*inlenb = processed - inb;
+	        return(-2);
 	    }
-
-#ifdef BIG_ENDIAN
-            d= *in++;
-#else
+	    if (xmlLittleEndian) {
 		tmp = (unsigned char *) in;
 		d = *tmp++;
 		d = d << 8;
 		d = d | (unsigned int) *tmp;
 		in++;
-#endif	    
+	    } else {
+		d= *in++;
+	    }
            if ((d & 0xFC00) == 0xDC00) {
                c &= 0x03FF;
                c <<= 10;
                c |= d & 0x03FF;
                c += 0x10000;
            }
-            else 
+            else {
+		*outlen = out - outstart;
+		*inlenb = processed - inb;
 	        return(-2);
 	    }
+        }

 	/* assertion: c is a single UTF-4 value */
        if (out >= outend) 
-	    return(-1);
+	    break;
        if      (c <    0x80) {  *out++=  c;                bits= -6; }
        else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
        else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
@ -432,11 +472,14 @@ UTF16BEToUTF8(unsigned char* out, int outlen,
 
        for ( ; bits >= 0; bits-= 6) {
            if (out >= outend) 
-	        return(-1);
+	        break;
            *out++= ((c >> bits) & 0x3F) | 0x80;
        }
+	processed = (const unsigned char*) in;
    }
-    return(out-outstart);
+    *outlen = out - outstart;
+    *inlenb = processed - inb;
+    return(0);
 }

 /**
@ -454,63 +497,63 @@ UTF16BEToUTF8(unsigned char* out, int outlen,
 *     if the transcoding failed. 
 */
 int
-UTF8ToUTF16BE(unsigned char* outb, int outlen,
+UTF8ToUTF16BE(unsigned char* outb, int *outlen,
            const unsigned char* in, int *inlen)
 {
    unsigned short* out = (unsigned short*) outb;
+    const unsigned char* processed = in;
    unsigned short* outstart= out;
    unsigned short* outend;
    const unsigned char* inend= in+*inlen;
    unsigned int c, d, trailing;
-#ifdef BIG_ENDIAN
-#else
    unsigned char *tmp;
    unsigned short tmp1, tmp2;
-#endif /* BIG_ENDIAN */    

-    outlen /= 2; /* convert in short length */
-    outend = out + outlen;
+    outend = out + (*outlen / 2);
    while (in < inend) {
      d= *in++;
      if      (d < 0x80)  { c= d; trailing= 0; }
-      else if (d < 0xC0)
-          return(-2);    /* trailing byte in leading position */
-      else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
+      else if (d < 0xC0)  {
+          /* trailing byte in leading position */
+	  *outlen = out - outstart;
+	  *inlen = processed - in;
+	  return(-2);
+      } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
      else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
      else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
-      else
-          return(-2);    /* no chance for this in UTF-16 */
+      else {
+          /* no chance for this in UTF-16 */
+	  *outlen = out - outstart;
+	  *inlen = processed - in;
+	  return(-2);
+      }

      if (inend - in < trailing) {
-          *inlen -= (inend - in);
          break;
      } 

      for ( ; trailing; trailing--) {
-          if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  return(-1);
+          if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))  break;
          c <<= 6;
          c |= d & 0x3F;
      }

      /* assertion: c is a single UTF-4 value */
        if (c < 0x10000) {
-            if (out >= outend)  return(-1);
-#ifdef BIG_ENDIAN
-            *out++ = c;
-#else
+            if (out >= outend)  break;
+	    if (xmlLittleEndian) {
 		tmp = (unsigned char *) out;
 		*tmp = c >> 8;
 		*(tmp + 1) = c;
 		out++;
-#endif /* BIG_ENDIAN */
+	    } else {
+		*out++ = c;
+	    }
        }
        else if (c < 0x110000) {
-            if (out+1 >= outend)  return(-1);
+            if (out+1 >= outend)  break;
            c -= 0x10000;
-#ifdef BIG_ENDIAN
-            *out++ = 0xD800 | (c >> 10);
-            *out++ = 0xDC00 | (c & 0x03FF);
-#else
+	    if (xmlLittleEndian) {
 		tmp1 = 0xD800 | (c >> 10);
 		tmp = (unsigned char *) out;
 		*tmp = tmp1 >> 8;
@ -522,11 +565,18 @@ UTF8ToUTF16BE(unsigned char* outb, int outlen,
 		*tmp = tmp2 >> 8;
 		*(tmp + 1) = tmp2;
 		out++;
-#endif
+	    } else {
+		*out++ = 0xD800 | (c >> 10);
+		*out++ = 0xDC00 | (c & 0x03FF);
 	    }
-        else  return(-1);
        }
-    return(out-outstart);
+        else
+	    break;
+	processed = in;
+    }
+    *outlen = out - outstart;
+    *inlen = processed - in;
+    return(0);
 }

 /**
@ -636,8 +686,12 @@ xmlParseCharEncoding(const char* name)
    if (!strcmp(upper,  "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);

    if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
-    if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
+    if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
    if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
+
+#ifdef DEBUG_ENCODING
+    fprintf(stderr, "Unknown encoding %s\n", name);
+#endif
    return(XML_CHAR_ENCODING_ERROR);
 }

@ -712,6 +766,9 @@ xmlNewCharEncodingHandler(const char *name,
     * registers and returns the handler.
     */
    xmlRegisterCharEncodingHandler(handler);
+#ifdef DEBUG_ENCODING
+    fprintf(stderr, "Registered encoding handler for %s\n", name);
+#endif
    return(handler);
 }

@ -725,11 +782,18 @@ xmlNewCharEncodingHandler(const char *name,
 */
 void
 xmlInitCharEncodingHandlers(void) {
+    unsigned short int tst = 0x1234;
+    unsigned char *ptr = (unsigned char *) &tst; 
+
    if (handlers != NULL) return;

    handlers = (xmlCharEncodingHandlerPtr *)
        xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));

+    if (*ptr == 0x12) xmlLittleEndian = 0;
+    else if (*ptr == 0x34) xmlLittleEndian = 1;
+    else fprintf(stderr, "Odd problem at endianness detection\n");
+
    if (handlers == NULL) {
        fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
 	return;
@ -755,6 +819,7 @@ xmlCleanupCharEncodingHandlers(void) {
    for (;nbCharEncodingHandler > 0;) {
        nbCharEncodingHandler--;
 	if (handlers[nbCharEncodingHandler] != NULL) {
+	    if (handlers[nbCharEncodingHandler]->name != NULL)
 		xmlFree(handlers[nbCharEncodingHandler]->name);
 	    xmlFree(handlers[nbCharEncodingHandler]);
 	}
@ -798,6 +863,8 @@ xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
 */
 xmlCharEncodingHandlerPtr
 xmlGetCharEncodingHandler(xmlCharEncoding enc) {
+    xmlCharEncodingHandlerPtr handler;
+
    if (handlers == NULL) xmlInitCharEncodingHandlers();
    switch (enc) {
        case XML_CHAR_ENCODING_ERROR:
@ -811,40 +878,68 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
        case XML_CHAR_ENCODING_UTF16BE:
 	    return(xmlUTF16BEHandler);
        case XML_CHAR_ENCODING_EBCDIC:
-	    return(NULL);
+            handler = xmlFindCharEncodingHandler("EBCDIC");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("ebcdic");
+            if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_UCS4LE:
-	    return(NULL);
+            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("UCS-4");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("UCS4");
+            if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_UCS4BE:
-	    return(NULL);
+            handler = xmlFindCharEncodingHandler("UCS4BE");
+            if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_UCS4_2143:
-	    return(NULL);
+	    break;
        case XML_CHAR_ENCODING_UCS4_3412:
-	    return(NULL);
+	    break;
        case XML_CHAR_ENCODING_UCS2:
-	    return(NULL);
+            handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("UCS-2");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("UCS2");
+            if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_8859_1:
-	    return(NULL);
        case XML_CHAR_ENCODING_8859_2:
-	    return(NULL);
        case XML_CHAR_ENCODING_8859_3:
-	    return(NULL);
        case XML_CHAR_ENCODING_8859_4:
-	    return(NULL);
        case XML_CHAR_ENCODING_8859_5:
-	    return(NULL);
        case XML_CHAR_ENCODING_8859_6:
-	    return(NULL);
        case XML_CHAR_ENCODING_8859_7:
-	    return(NULL);
        case XML_CHAR_ENCODING_8859_8:
-	    return(NULL);
        case XML_CHAR_ENCODING_8859_9:
 	    return(NULL);
        case XML_CHAR_ENCODING_2022_JP:
+            handler = xmlFindCharEncodingHandler("ISO-2022-JP");
+            if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_SHIFT_JIS:
+            handler = xmlFindCharEncodingHandler("SHIFT-JIS");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("SHIFT_JIS");
+            if (handler != NULL) return(handler);
+            handler = xmlFindCharEncodingHandler("Shift_JIS");
+            if (handler != NULL) return(handler);
+	    break;
        case XML_CHAR_ENCODING_EUC_JP:
-	    return(NULL);
+            handler = xmlFindCharEncodingHandler("EUC-JP");
+            if (handler != NULL) return(handler);
+	    break;
+	default: 
+	    break;
    }
+    
+#ifdef DEBUG_ENCODING
+    fprintf(stderr, "No handler found for encoding %d\n", enc);
+#endif
    return(NULL);
 }

@ -858,23 +953,306 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
 */
 xmlCharEncodingHandlerPtr
 xmlFindCharEncodingHandler(const char *name) {
-    char upper[500];
+#ifdef LIBXML_ICONV_ENABLED
+    iconv_t icv_in, icv_out;
+    xmlCharEncodingHandlerPtr enc;
+#endif /* LIBXML_ICONV_ENABLED */
+    char upper[100];
    int i;

    if (handlers == NULL) xmlInitCharEncodingHandlers();
    if (name == NULL) return(xmlDefaultCharEncodingHandler);
    if (name[0] == 0) return(xmlDefaultCharEncodingHandler);

-    for (i = 0;i < 499;i++) {
+    for (i = 0;i < 99;i++) {
        upper[i] = toupper(name[i]);
 	if (upper[i] == 0) break;
    }
    upper[i] = 0;

    for (i = 0;i < nbCharEncodingHandler; i++)
-        if (!strcmp(name, handlers[i]->name))
+        if (!strcmp(upper, handlers[i]->name)) {
+#ifdef DEBUG_ENCODING
+            fprintf(stderr, "Found registered handler for encoding %s\n", name);
+#endif
 	    return(handlers[i]);
+	}

+#ifdef LIBXML_ICONV_ENABLED
+    /* check whether iconv can handle this */
+    icv_in = iconv_open("UTF-8", name);
+    icv_out = iconv_open(name, "UTF-8");
+    if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
+	    enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
+	    if (enc == NULL) {
+	        iconv_close(icv_in);
+	        iconv_close(icv_out);
+		return(NULL);
+	    }
+	    enc->name = NULL;
+	    enc->input = NULL;
+	    enc->output = NULL;
+	    enc->iconv_in = icv_in;
+	    enc->iconv_out = icv_out;
+#ifdef DEBUG_ENCODING
+            fprintf(stderr, "Found iconv handler for encoding %s\n", name);
+#endif
+	    return enc;
+    } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
+	    fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef DEBUG_ENCODING
+    fprintf(stderr, "No handler found for encoding %s\n", name);
+#endif
    return(NULL);
 }

+#ifdef LIBXML_ICONV_ENABLED
+/**
+ * xmlIconvWrapper:
+ * @cd:		iconv converter data structure
+ * @out:  a pointer to an array of bytes to store the result
+ * @outlen:  the length of @out
+ * @in:  a pointer to an array of bytes to convert
+ * @inlen:  the length of @in
+ *
+ * Returns 0 if success, or 
+ *     -1 by lack of space, or
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want), or
+ *     -3 if the last byte can't form a single output char.
+ *     
+ * The value of @inlen after return is the number of octets consumed
+ *     if the return value is not negative, else unpredictable.
+ * The value of @outlen after return is the number of octets written.
+ */
+static int
+xmlIconvWrapper(iconv_t cd,
+	unsigned char *out, int *outlen,
+	const unsigned char *in, int *inlen) {
+
+	size_t icv_inlen = *inlen, icv_outlen = *outlen;
+	const char *icv_in = (const char *) in;
+	char *icv_out = (char *) out;
+	int ret;
+
+	ret = iconv(cd,
+		&icv_in, &icv_inlen,
+		&icv_out, &icv_outlen);
+	*inlen -= icv_inlen;
+	*outlen -= icv_outlen;
+	if (icv_inlen != 0 || ret == (size_t) -1) {
+#ifdef EILSEQ
+		if (errno == EILSEQ) {
+			return -2;
+		} else
+#endif
+#ifdef E2BIG
+		if (errno == E2BIG) {
+			return -1;
+		} else
+#endif
+#ifdef EINVAL
+		if (errno == EINVAL) {
+			return -3;
+		}
+#endif
+		else {
+			return -3;
+		}
+	}
+	return 0;
+}
+#endif /* LIBXML_ICONV_ENABLED */
+
+/**
+ * xmlCharEncInFunc:
+ * @handler:	char enconding transformation data structure
+ * @out:  an xmlBuffer for the output.
+ * @in:  an xmlBuffer for the input
+ *     
+ * Generic front-end for the encoding handler input function
+ *     
+ * Returns the number of byte written if success, or 
+ *     -1 general error
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want), or
+ */
+int
+xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
+                 xmlBufferPtr in) {
+    int ret = -2;
+    int written;
+    int toconv;
+
+    if (handler == NULL) return(-1);
+    if (out == NULL) return(-1);
+    if (in == NULL) return(-1);
+
+    written = out->size - out->use;
+    toconv = in->use;
+    if (toconv * 2 >= written) {
+        xmlBufferGrow(out, toconv * 2);
+	written = out->size - out->use - 1;
+    }
+    if (handler->input != NULL) {
+	ret = handler->input(&out->content[out->use], &written,
+	                     in->content, &toconv);
+	xmlBufferShrink(in, toconv);
+	out->use += written;
+	out->content[out->use] = 0;
+    }
+#ifdef LIBXML_ICONV_ENABLED
+    else if (handler->iconv_in != NULL) {
+	ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
+	                      &written, in->content, &toconv);
+	xmlBufferShrink(in, toconv);
+	out->use += written;
+	out->content[out->use] = 0;
+	if (ret == -1) ret = -3;
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef DEBUG_ENCODING
+    switch (ret) {
+        case 0:
+	    fprintf(stderr, "converted %d bytes to %d bytes of input\n",
+	            toconv, written);
+	    break;
+        case -1:
+	    fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
+	            toconv, written, in->use);
+	    break;
+        case -2:
+	    fprintf(stderr, "input conversion failed due to input error\n");
+	    break;
+        case -3:
+	    fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
+	            toconv, written, in->use);
+	    break;
+	default:
+	    fprintf(stderr,"Unknown input conversion failed %d\n", ret);
+    }
+#endif
+    /*
+     * Ignore when input buffer is not on a boundary
+     */
+    if (ret == -3) ret = 0;
+    return(ret);
+}
+
+/**
+ * xmlCharEncOutFunc:
+ * @handler:	char enconding transformation data structure
+ * @out:  an xmlBuffer for the output.
+ * @in:  an xmlBuffer for the input
+ *     
+ * Generic front-end for the encoding handler output function
+ *     
+ * Returns the number of byte written if success, or 
+ *     -1 general error
+ *     -2 if the transcoding fails (for *in is not valid utf8 string or
+ *        the result of transformation can't fit into the encoding we want), or
+ */
+int
+xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
+                  xmlBufferPtr in) {
+    int ret = -2;
+    int written;
+    int toconv;
+
+    if (handler == NULL) return(-1);
+    if (out == NULL) return(-1);
+    if (in == NULL) return(-1);
+
+    written = out->size - out->use;
+    toconv = in->use;
+    if (toconv * 2 >= written) {
+        xmlBufferGrow(out, toconv * 2);
+	written = out->size - out->use - 1;
+    }
+    if (handler->output != NULL) {
+	ret = handler->output(&out->content[out->use], &written,
+	                     in->content, &toconv);
+	xmlBufferShrink(in, toconv);
+	out->use += written;
+	out->content[out->use] = 0;
+    }
+#ifdef LIBXML_ICONV_ENABLED
+    else if (handler->iconv_out != NULL) {
+	ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
+	                      &written, in->content, &toconv);
+	xmlBufferShrink(in, toconv);
+	out->use += written;
+	out->content[out->use] = 0;
+	if (ret == -1) ret = -3;
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef DEBUG_ENCODING
+    switch (ret) {
+        case 0:
+	    fprintf(stderr, "converted %d bytes to %d bytes of output\n",
+	            toconv, written);
+	    break;
+        case -1:
+	    fprintf(stderr, "output conversion failed by lack of space\n");
+	    break;
+        case -2:
+	    fprintf(stderr, "output conversion failed due to output error\n");
+	    break;
+        case -3:
+	    fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
+	            toconv, written, in->use);
+	    break;
+	default:
+	    fprintf(stderr,"Unknown output conversion failed %d\n", ret);
+    }
+#endif
+    return(ret);
+}
+
+/**
+ * xmlCharEncCloseFunc:
+ * @handler:	char enconding transformation data structure
+ *     
+ * Generic front-end for the encoding handler close function
+ *
+ * Returns 0 if success, or -1 in case of error
+ */
+int
+xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
+    int ret = 0;
+    if (handler == NULL) return(-1);
+    if (handler->name == NULL) return(-1);
+#ifdef LIBXML_ICONV_ENABLED
+    /*
+     * Iconv handlers can be used only once, free the whole block
+     * and the associated iconv resources.
+     */
+    if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
+	if (handler->name != NULL)
+	    xmlFree(handler->name);
+	handler->name = NULL;
+	if (handler->iconv_out != NULL) {
+	    if (iconv_close(handler->iconv_out))
+		ret = -1;
+	    handler->iconv_out = NULL;
+	}
+	if (handler->iconv_in != NULL) {
+	    if (iconv_close(handler->iconv_in))
+		ret = -1;
+	    handler->iconv_in = NULL;
+	}
+	xmlFree(handler);
+    }
+#endif /* LIBXML_ICONV_ENABLED */
+#ifdef DEBUG_ENCODING
+    if (ret)
+        fprintf(stderr, "failed to close the encoding handler\n");
+    else
+        fprintf(stderr, "closed the encoding handler\n");
+
+#endif
+    return(ret);
+}
+
--- a/encoding.h
+++ b/encoding.h
@ -22,12 +22,30 @@
 #define __XML_CHAR_ENCODING_H__

 #include <libxml/xmlversion.h>
+#ifdef LIBXML_ICONV_ENABLED
+#include <iconv.h>
+#endif
+#include <libxml/tree.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif

 /**
 * Predefined values for some standard encodings
+ * Libxml doesn't do beforehand translation on UTF8, ISOLatinX.
+ * It also supports UTF16 (LE and BE) by default.
+ *
+ * Anything else would have to be translated to UTF8 before being
+ * given to the parser itself. The BOM for UTF16 and the encoding
+ * declaration are looked at and a converter is looked for at that
+ * point. If not found the parser stops here as asked by the XML REC.
+ * Converters can be registered by the user using xmlRegisterCharEncodingHandler
+ * but the current form doesn't allow stateful transcoding (a serious
+ * problem agreed !). If iconv has been found it will be used
+ * automatically and allows stateful transcoding; the simplest is then
+ * to be sure to enable iconv and to provide iconv libs for the encoding
+ * support needed.
 */
 typedef enum {
    XML_CHAR_ENCODING_ERROR=   -1, /* No char encoding detected */
@ -65,9 +83,13 @@ typedef enum {
 * Take a block of chars in the original encoding and try to convert
 * it to an UTF-8 block of chars out.
 *
- * Returns the number of byte written, or -1 by lack of space.
+ * Returns the number of byte written, or -1 by lack of space, or -2
+ *     if the transcoding failed.
+ * The value of @inlen after return is the number of octets consumed
+ *     if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
 */
-typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
+typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen,
                                         const unsigned char* in, int *inlen);


@ -83,12 +105,17 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding failed.
+ * The value of @inlen after return is the number of octets consumed
+ *     if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
 */
-typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int outlen,
+typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int *outlen,
                                          const unsigned char* in, int *inlen);

+
 /*
 * Block defining the handlers for non UTF-8 encodings.
+ * If iconv is supported, there are two extra fields.
 */

 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
@ -97,6 +124,10 @@ struct _xmlCharEncodingHandler {
    char                       *name;
    xmlCharEncodingInputFunc   input;
    xmlCharEncodingOutputFunc  output;
+#ifdef LIBXML_ICONV_ENABLED
+    iconv_t                    iconv_in;
+    iconv_t                    iconv_out;
+#endif /* LIBXML_ICONV_ENABLED */
 };

 void	xmlInitCharEncodingHandlers	(void);
@ -109,6 +140,14 @@ xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc);
 xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name);
 int	xmlCheckUTF8			(const unsigned char *utf);

+int	xmlCharEncOutFunc		(xmlCharEncodingHandler *handler,
+					 xmlBufferPtr out,
+					 xmlBufferPtr in);
+
+int	xmlCharEncInFunc		(xmlCharEncodingHandler *handler,
+					 xmlBufferPtr out,
+					 xmlBufferPtr in);
+int	xmlCharEncCloseFunc		(xmlCharEncodingHandler *handler);

 #ifdef __cplusplus
 }
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@ -22,12 +22,30 @@
 #define __XML_CHAR_ENCODING_H__

 #include <libxml/xmlversion.h>
+#ifdef LIBXML_ICONV_ENABLED
+#include <iconv.h>
+#endif
+#include <libxml/tree.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif

 /**
 * Predefined values for some standard encodings
+ * Libxml doesn't do beforehand translation on UTF8, ISOLatinX.
+ * It also supports UTF16 (LE and BE) by default.
+ *
+ * Anything else would have to be translated to UTF8 before being
+ * given to the parser itself. The BOM for UTF16 and the encoding
+ * declaration are looked at and a converter is looked for at that
+ * point. If not found the parser stops here as asked by the XML REC.
+ * Converters can be registered by the user using xmlRegisterCharEncodingHandler
+ * but the current form doesn't allow stateful transcoding (a serious
+ * problem agreed !). If iconv has been found it will be used
+ * automatically and allows stateful transcoding; the simplest is then
+ * to be sure to enable iconv and to provide iconv libs for the encoding
+ * support needed.
 */
 typedef enum {
    XML_CHAR_ENCODING_ERROR=   -1, /* No char encoding detected */
@ -65,9 +83,13 @@ typedef enum {
 * Take a block of chars in the original encoding and try to convert
 * it to an UTF-8 block of chars out.
 *
- * Returns the number of byte written, or -1 by lack of space.
+ * Returns the number of byte written, or -1 by lack of space, or -2
+ *     if the transcoding failed.
+ * The value of @inlen after return is the number of octets consumed
+ *     if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
 */
-typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
+typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen,
                                         const unsigned char* in, int *inlen);


@ -83,12 +105,17 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
 *
 * Returns the number of byte written, or -1 by lack of space, or -2
 *     if the transcoding failed.
+ * The value of @inlen after return is the number of octets consumed
+ *     if the return value is positive, else unpredictable.
+ * The value of @outlen after return is the number of octets consumed.
 */
-typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int outlen,
+typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int *outlen,
                                          const unsigned char* in, int *inlen);

+
 /*
 * Block defining the handlers for non UTF-8 encodings.
+ * If iconv is supported, there are two extra fields.
 */

 typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
@ -97,6 +124,10 @@ struct _xmlCharEncodingHandler {
    char                       *name;
    xmlCharEncodingInputFunc   input;
    xmlCharEncodingOutputFunc  output;
+#ifdef LIBXML_ICONV_ENABLED
+    iconv_t                    iconv_in;
+    iconv_t                    iconv_out;
+#endif /* LIBXML_ICONV_ENABLED */
 };

 void	xmlInitCharEncodingHandlers	(void);
@ -109,6 +140,14 @@ xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc);
 xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name);
 int	xmlCheckUTF8			(const unsigned char *utf);

+int	xmlCharEncOutFunc		(xmlCharEncodingHandler *handler,
+					 xmlBufferPtr out,
+					 xmlBufferPtr in);
+
+int	xmlCharEncInFunc		(xmlCharEncodingHandler *handler,
+					 xmlBufferPtr out,
+					 xmlBufferPtr in);
+int	xmlCharEncCloseFunc		(xmlCharEncodingHandler *handler);

 #ifdef __cplusplus
 }
--- a/include/libxml/parserInternals.h
+++ b/include/libxml/parserInternals.h
@ -28,10 +28,10 @@ extern "C" {
 * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
 */
 #define IS_CHAR(c)							\
-    ((((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) ||		\
-      (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) &&		\
-      (((c) <= 0xD7FF) || ((c) >= 0xE000)) && ((c) >= 0) &&		\
-      ((c) <= 0x10FFFF))
+    (((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) ||			\
+     (((c) >= 0x20) && ((c) <= 0xD7FF)) ||				\
+     (((c) >= 0xE000) && ((c) <= 0xFFFD)) ||				\
+     (((c) >= 0x10000) && ((c) <= 0x10FFFF)))

 /*
 * [3] S ::= (#x20 | #x9 | #xD | #xA)+
@ -442,8 +442,10 @@ xmlParserCtxtPtr	xmlNewParserCtxt	(void);
 xmlParserCtxtPtr	xmlCreateEntityParserCtxt(const xmlChar *URL,
 						 const xmlChar *ID,
 						 const xmlChar *base);
-void			xmlSwitchEncoding	(xmlParserCtxtPtr ctxt,
+int			xmlSwitchEncoding	(xmlParserCtxtPtr ctxt,
 						 xmlCharEncoding enc);
+int			xmlSwitchToEncoding	(xmlParserCtxtPtr ctxt,
+					     xmlCharEncodingHandlerPtr handler);
 void			xmlFreeParserCtxt	(xmlParserCtxtPtr ctxt);

 /**
--- a/include/libxml/tree.h
+++ b/include/libxml/tree.h
@ -380,6 +380,8 @@ void		xmlBufferCCat		(xmlBufferPtr buf,
 					 const char *str);
 int		xmlBufferShrink		(xmlBufferPtr buf,
 					 int len);
+int		xmlBufferGrow		(xmlBufferPtr buf,
+					 int len);
 void		xmlBufferEmpty		(xmlBufferPtr buf);
 const xmlChar*	xmlBufferContent	(const xmlBufferPtr buf);
 int		xmlBufferUse		(const xmlBufferPtr buf);
--- a/include/libxml/xmlIO.h
+++ b/include/libxml/xmlIO.h
@ -33,6 +33,7 @@ struct _xmlParserInputBuffer {
    xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */
    
    xmlBufferPtr buffer;    /* Local buffer encoded in  UTF-8 */
+    xmlBufferPtr raw;       /* if encoder != NULL buffer for raw input */
 };


--- a/parser.c
+++ b/parser.c
@ -41,6 +41,7 @@
 #include <libxml/valid.h>
 #include <libxml/parserInternals.h>
 #include <libxml/xmlIO.h>
+#include <libxml/uri.h>
 #include "xml-error.h"

 #define XML_PARSER_BIG_BUFFER_SIZE 1000
@ -483,7 +484,7 @@ xmlNextChar(xmlParserCtxtPtr ctxt) {
 			    if ((ctxt->sax != NULL) &&
 				(ctxt->sax->error != NULL))
 				ctxt->sax->error(ctxt->userData, 
-				 "Char out of allowed range\n");
+				 "Char 0x%X out of allowed range\n", val);
 			    ctxt->errNo = XML_ERR_INVALID_ENCODING;
 			    ctxt->wellFormed = 0;
 			    ctxt->disableSAX = 1;
@ -612,7 +613,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
 		if ((ctxt->sax != NULL) &&
 		    (ctxt->sax->error != NULL))
 		    ctxt->sax->error(ctxt->userData, 
-				     "Char out of allowed range\n");
+				     "Char 0x%X out of allowed range\n", val);
 		ctxt->errNo = XML_ERR_INVALID_ENCODING;
 		ctxt->wellFormed = 0;
 		ctxt->disableSAX = 1;
@ -727,7 +728,7 @@ xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar *cur, int *len) {
 		if ((ctxt->sax != NULL) &&
 		    (ctxt->sax->error != NULL))
 		    ctxt->sax->error(ctxt->userData, 
-				     "Char out of allowed range\n");
+				     "Char 0x%X out of allowed range\n", val);
 		ctxt->errNo = XML_ERR_INVALID_ENCODING;
 		ctxt->wellFormed = 0;
 		ctxt->disableSAX = 1;
@ -2278,155 +2279,14 @@ xmlCheckLanguageID(const xmlChar *lang) {
 *
 * change the input functions when discovering the character encoding
 * of a given entity.
+ *
+ * Returns 0 in case of success, -1 otherwise
 */
-void
+int
 xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
 {
    xmlCharEncodingHandlerPtr handler;

-    handler = xmlGetCharEncodingHandler(enc);
-    if (handler != NULL) {
-        if (ctxt->input != NULL) {
-	    if (ctxt->input->buf != NULL) {
-	        if (ctxt->input->buf->encoder != NULL) {
-		    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-			ctxt->sax->error(ctxt->userData,
-			     "xmlSwitchEncoding : encoder already regitered\n");
-		    return;
-		}
-		ctxt->input->buf->encoder = handler;
-
-	        /*
-		 * Is there already some content down the pipe to convert
-		 */
-	        if ((ctxt->input->buf->buffer != NULL) &&
-		    (ctxt->input->buf->buffer->use > 0)) {
-		    xmlChar *buf;
-		    int res, len, size;
-		    int processed;
-
-		    /*
-		     * Specific handling of the Byte Order Mark for 
-		     * UTF-16
-		     */
-		    if ((enc == XML_CHAR_ENCODING_UTF16LE) && 
-		        (ctxt->input->cur[0] == 0xFF) &&
-		        (ctxt->input->cur[1] == 0xFE)) {
-			SKIP(2);
-		    }
-		    if ((enc == XML_CHAR_ENCODING_UTF16BE) && 
-		        (ctxt->input->cur[0] == 0xFE) &&
-		        (ctxt->input->cur[1] == 0xFF)) {
-			SKIP(2);
-		    }
-
-		    /*
-		     * convert the non processed part
-		     */
-		    processed = ctxt->input->cur - ctxt->input->base;
-                    len = ctxt->input->buf->buffer->use - processed;
-
-		    if (len <= 0) {
-		        return;
-		    }
-		    size = ctxt->input->buf->buffer->use * 4;
-		    if (size < 4000)
-		        size = 4000;
-retry_larger:			
-		    buf = (xmlChar *) xmlMalloc(size + 1);
-		    if (buf == NULL) {
-			if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-			    ctxt->sax->error(ctxt->userData,
-				 "xmlSwitchEncoding : out of memory\n");
-		        return;
-		    }
-		    /* TODO !!! Handling of buf too small */
-		    res = handler->input(buf, size, ctxt->input->cur, &len);
-		    if (res == -1) {
-		        size *= 2;
-			xmlFree(buf);
-			goto retry_larger;
-		    }
-		    if ((res < 0) ||
-		        (len != ctxt->input->buf->buffer->use - processed)) {
-			if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-			    ctxt->sax->error(ctxt->userData,
-				 "xmlSwitchEncoding : conversion failed\n");
-                        xmlFree(buf);
-		        return;
-		    }
-
-		    /*
-		     * Conversion succeeded, get rid of the old buffer
-		     */
-		    xmlFree(ctxt->input->buf->buffer->content);
-		    ctxt->input->buf->buffer->content = buf;
-		    ctxt->input->base = buf;
-		    ctxt->input->cur = buf;
-		    ctxt->input->buf->buffer->size = size;
-		    ctxt->input->buf->buffer->use = res;
-                    buf[res] = 0;
-		}
-		return;
-	    } else {
-	        if (ctxt->input->length == 0) {
-		    /*
-		     * When parsing a static memory array one must know the
-		     * size to be able to convert the buffer.
-		     */
-		    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-			ctxt->sax->error(ctxt->userData,
-					 "xmlSwitchEncoding : no input\n");
-		    return;
-		} else {
-		    xmlChar *buf;
-		    int res, len;
-		    int processed = ctxt->input->cur - ctxt->input->base;
-
-		    /*
-		     * convert the non processed part
-		     */
-                    len = ctxt->input->length - processed;
-		    if (len <= 0) {
-			if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-			    ctxt->sax->error(ctxt->userData,
-				 "xmlSwitchEncoding : input fully consumed?\n");
-		        return;
-		    }
-		    buf = (xmlChar *) xmlMalloc(ctxt->input->length * 4);
-		    if (buf == NULL) {
-			if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-			    ctxt->sax->error(ctxt->userData,
-				 "xmlSwitchEncoding : out of memory\n");
-		        return;
-		    }
-		    res = handler->input(buf, ctxt->input->length * 4,
-		                         ctxt->input->cur, &len);
-		    if ((res < 0) ||
-		        (len != ctxt->input->length - processed)) {
-			if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-			    ctxt->sax->error(ctxt->userData,
-				 "xmlSwitchEncoding : conversion failed\n");
-                        xmlFree(buf);
-		        return;
-		    }
-		    /*
-		     * Conversion succeeded, get rid of the old buffer
-		     */
-		    if ((ctxt->input->free != NULL) &&
-		        (ctxt->input->base != NULL))
-			ctxt->input->free((xmlChar *) ctxt->input->base);
-		    ctxt->input->base = ctxt->input->cur = buf;
-		    ctxt->input->length = res;
-		}
-	    }
-	} else {
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-	        ctxt->sax->error(ctxt->userData,
-		                 "xmlSwitchEncoding : no input\n");
-	}
-    }
-
    switch (enc) {
 	case XML_CHAR_ENCODING_ERROR:
 	    ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
@ -2437,21 +2297,35 @@ retry_larger:
 	    break;
 	case XML_CHAR_ENCODING_NONE:
 	    /* let's assume it's UTF-8 without the XML decl */
-            return;
+	    return(0);
 	case XML_CHAR_ENCODING_UTF8:
 	    /* default encoding, no conversion should be needed */
-            return;
-        case XML_CHAR_ENCODING_UTF16LE:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+	    return(0);
+	default:
+	    break;
+    }
+    handler = xmlGetCharEncodingHandler(enc);
+    if (handler == NULL) {
+	/*
+	 * Default handlers.
+	 */
+	switch (enc) {
+	    case XML_CHAR_ENCODING_ERROR:
+		ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
 		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding UTF16 little endian not supported\n");
+		    ctxt->sax->error(ctxt->userData, "encoding unknown\n");
+		ctxt->wellFormed = 0;
+		ctxt->disableSAX = 1;
+		break;
+	    case XML_CHAR_ENCODING_NONE:
+		/* let's assume it's UTF-8 without the XML decl */
+		return(0);
+	    case XML_CHAR_ENCODING_UTF8:
+		/* default encoding, no conversion should be needed */
+		return(0);
+	    case XML_CHAR_ENCODING_UTF16LE:
 		break;
 	    case XML_CHAR_ENCODING_UTF16BE:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding UTF16 big endian not supported\n");
 		break;
 	    case XML_CHAR_ENCODING_UCS4LE:
 		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
@ -2490,59 +2364,23 @@ retry_larger:
 		      "char encoding UCS2 not supported\n");
 		break;
 	    case XML_CHAR_ENCODING_8859_1:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
-            break;
 	    case XML_CHAR_ENCODING_8859_2:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
-            break;
 	    case XML_CHAR_ENCODING_8859_3:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_3 not supported\n");
-            break;
 	    case XML_CHAR_ENCODING_8859_4:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_4 not supported\n");
-            break;
 	    case XML_CHAR_ENCODING_8859_5:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_5 not supported\n");
-            break;
 	    case XML_CHAR_ENCODING_8859_6:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_6 not supported\n");
-            break;
 	    case XML_CHAR_ENCODING_8859_7:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_7 not supported\n");
-            break;
 	    case XML_CHAR_ENCODING_8859_8:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_8 not supported\n");
-            break;
 	    case XML_CHAR_ENCODING_8859_9:
-	    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_9 not supported\n");
-            break;
+		/*
+		 * Keep the internal content in the document encoding
+		 */
+		if ((ctxt->inputNr == 1) &&
+		    (ctxt->encoding == NULL) &&
+		    (ctxt->input->encoding != NULL)) {
+		    ctxt->encoding = xmlStrdup(ctxt->input->encoding);
+		}
+		return(0);
 	    case XML_CHAR_ENCODING_2022_JP:
 		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
 		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
@ -2563,6 +2401,150 @@ retry_larger:
 		break;
 	}
    }
+    if (handler == NULL)
+	return(-1);
+    return(xmlSwitchToEncoding(ctxt, handler));
+}
+
+/**
+ * xmlSwitchToEncoding:
+ * @ctxt:  the parser context
+ * @handler:  the encoding handler
+ *
+ * change the input functions when discovering the character encoding
+ * of a given entity: @handler is installed as the encoder of the
+ * current input buffer. Content already buffered but not yet parsed is
+ * moved to the raw buffer and converted into a new parsing buffer.
+ * Asking for the handler already installed does nothing; asking for
+ * another one while an encoder is installed is an error. An input
+ * without input buffer (static memory array) cannot be converted and
+ * is rejected.
+ *
+ * Returns 0 in case of success, -1 otherwise
+ */
+int
+xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) 
+{
+    int processed;
+    int nbchars;
+
+    if ((ctxt == NULL) || (handler == NULL))
+	return(-1);
+    if (ctxt->input == NULL) {
+	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+	    ctxt->sax->error(ctxt->userData,
+			     "xmlSwitchEncoding : no input\n");
+	return(-1);
+    }
+    if (ctxt->input->buf == NULL) {
+	/*
+	 * Static memory array: ctxt->input->buf is NULL, so there is no
+	 * place to store the raw and the converted data and the pointer
+	 * must not be dereferenced. Report the failure.
+	 *
+	 * NOTE(review): the reset of ctxt->encoding used to follow the
+	 * conversion of static memory input only; it is not performed
+	 * for buffered input. Confirm whether it should be.
+	 */
+	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) {
+	    if (ctxt->input->length == 0)
+		ctxt->sax->error(ctxt->userData,
+				 "xmlSwitchEncoding : no input\n");
+	    else
+		ctxt->sax->error(ctxt->userData,
+			     "xmlSwitchToEncoding : no input buffer\n");
+	}
+	return(-1);
+    }
+
+    if (ctxt->input->buf->encoder != NULL) {
+	if (ctxt->input->buf->encoder == handler)
+	    return(0);
+	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+	    ctxt->sax->error(ctxt->userData,
+		 "xmlSwitchEncoding : encoder already regitered\n");
+	return(-1);
+    }
+    ctxt->input->buf->encoder = handler;
+
+    /*
+     * Is there already some content down the pipe to convert ?
+     */
+    if ((ctxt->input->buf->buffer == NULL) ||
+	(ctxt->input->buf->buffer->use <= 0))
+	return(0);
+
+    /*
+     * Specific handling of the Byte Order Mark for UTF-16: skip it
+     * so that it is not handed to the converter.
+     */
+    if (handler->name != NULL) {
+	if ((!strcmp(handler->name, "UTF-16LE")) &&
+	    (ctxt->input->cur[0] == 0xFF) &&
+	    (ctxt->input->cur[1] == 0xFE)) {
+	    ctxt->input->cur += 2;
+	}
+	if ((!strcmp(handler->name, "UTF-16BE")) &&
+	    (ctxt->input->cur[0] == 0xFE) &&
+	    (ctxt->input->cur[1] == 0xFF)) {
+	    ctxt->input->cur += 2;
+	}
+    }
+
+    /*
+     * Shrink the current input buffer.
+     * Move it as the raw buffer and create a new input buffer
+     */
+    processed = ctxt->input->cur - ctxt->input->base;
+    xmlBufferShrink(ctxt->input->buf->buffer, processed);
+    ctxt->input->buf->raw = ctxt->input->buf->buffer;
+    ctxt->input->buf->buffer = xmlBufferCreate();
+
+    /*
+     * convert as much as possible of the raw input
+     * to the parser reading buffer.
+     */
+    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
+			       ctxt->input->buf->buffer,
+			       ctxt->input->buf->raw);
+    if (nbchars < 0) {
+	/*
+	 * NOTE(review): base and cur are left untouched on this path
+	 * and still refer to the buffer that became the raw one;
+	 * confirm that callers stop parsing on failure.
+	 */
+	fprintf(stderr, "xmlSwitchToEncoding: encoder error\n");
+	return(-1);
+    }
+    ctxt->input->base =
+    ctxt->input->cur = ctxt->input->buf->buffer->content;
+    return(0);
+}

 /************************************************************************
 *									*
@ -4253,7 +4235,7 @@ xmlParseExternalID(xmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
 void
 xmlParseComment(xmlParserCtxtPtr ctxt) {
    xmlChar *buf = NULL;
-    int len = 0;
+    int len;
    int size = XML_PARSER_BUFFER_SIZE;
    int q, ql;
    int r, rl;
@ -4282,10 +4264,11 @@ xmlParseComment(xmlParserCtxtPtr ctxt) {
    r = CUR_CHAR(rl);
    NEXTL(rl);
    cur = CUR_CHAR(l);
+    len = 0;
    while (IS_CHAR(cur) &&
           ((cur != '>') ||
 	    (r != '-') || (q != '-'))) {
-	if ((r == '-') && (q == '-')) {
+	if ((r == '-') && (q == '-') && (len > 1)) {
 	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
 	        ctxt->sax->error(ctxt->userData,
 	       "Comment must not contain '--' (double-hyphen)`\n");
@ -4732,12 +4715,37 @@ xmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
 		    ctxt->disableSAX = 1;
 		}
 		if (URI) {
+		    xmlURIPtr uri;
+
+		    uri = xmlParseURI((const char *) URI);
+		    if (uri == NULL) {
 			if ((ctxt->sax != NULL) &&
-			(!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
+			    (!ctxt->disableSAX) &&
+			    (ctxt->sax->error != NULL))
+			    ctxt->sax->error(ctxt->userData,
+				        "Invalid URI: %s\n", URI);
+			ctxt->wellFormed = 0;
+			ctxt->errNo = XML_ERR_INVALID_URI;
+		    } else {
+			if (uri->fragment != NULL) {
+			    if ((ctxt->sax != NULL) &&
+				(!ctxt->disableSAX) &&
+				(ctxt->sax->error != NULL))
+				ctxt->sax->error(ctxt->userData,
+					    "Fragment not allowed: %s\n", URI);
+			    ctxt->wellFormed = 0;
+			    ctxt->errNo = XML_ERR_URI_FRAGMENT;
+			} else {
+			    if ((ctxt->sax != NULL) &&
+				(!ctxt->disableSAX) &&
+				(ctxt->sax->entityDecl != NULL))
 				ctxt->sax->entityDecl(ctxt->userData, name,
 					    XML_EXTERNAL_PARAMETER_ENTITY,
 					    literal, URI, NULL);
 			}
+			xmlFreeURI(uri);
+		    }
+		}
 	    }
 	} else {
 	    if ((RAW == '"') || (RAW == '\'')) {
@ -4757,6 +4765,31 @@ xmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
 		    ctxt->wellFormed = 0;
 		    ctxt->disableSAX = 1;
 		}
+		if (URI) {
+		    xmlURIPtr uri;
+
+		    uri = xmlParseURI((const char *)URI);
+		    if (uri == NULL) {
+			if ((ctxt->sax != NULL) &&
+			    (!ctxt->disableSAX) &&
+			    (ctxt->sax->error != NULL))
+			    ctxt->sax->error(ctxt->userData,
+				        "Invalid URI: %s\n", URI);
+			ctxt->wellFormed = 0;
+			ctxt->errNo = XML_ERR_INVALID_URI;
+		    } else {
+			if (uri->fragment != NULL) {
+			    if ((ctxt->sax != NULL) &&
+				(!ctxt->disableSAX) &&
+				(ctxt->sax->error != NULL))
+				ctxt->sax->error(ctxt->userData,
+					    "Fragment not allowed: %s\n", URI);
+			    ctxt->wellFormed = 0;
+			    ctxt->errNo = XML_ERR_URI_FRAGMENT;
+			}
+			xmlFreeURI(uri);
+		    }
+		}
 		if ((RAW != '>') && (!IS_BLANK(CUR))) {
 		    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
 			ctxt->sax->error(ctxt->userData,
@ -5973,7 +6006,20 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
    /*
     * We know that '<?xml' is here.
     */
+    if ((RAW == '<') && (NXT(1) == '?') &&
+	(NXT(2) == 'x') && (NXT(3) == 'm') &&
+	(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
 	SKIP(5);
+    } else {
+	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+	    ctxt->sax->error(ctxt->userData,
+	                     "Text declaration '<?xml' required\n");
+	ctxt->errNo = XML_ERR_XMLDECL_NOT_STARTED;
+	ctxt->wellFormed = 0;
+	ctxt->disableSAX = 1;
+
+	return;
+    }

    if (!IS_BLANK(CUR)) {
 	if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
@ -6003,7 +6049,13 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
 	ctxt->wellFormed = 0;
 	ctxt->disableSAX = 1;
    }
-    ctxt->input->encoding = xmlParseEncodingDecl(ctxt);
+    xmlParseEncodingDecl(ctxt);
+    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+	/*
+	 * The XML REC instructs us to stop parsing right here
+	 */
+        return;
+    }

    SKIP_BLANKS;
    if ((RAW == '?') && (NXT(1) == '>')) {
@ -6192,6 +6244,13 @@ xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID,
        (NXT(2) == 'x') && (NXT(3) == 'm') &&
 	(NXT(4) == 'l')) {
 	xmlParseTextDecl(ctxt);
+	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+	    /*
+	     * The XML REC instructs us to stop parsing right here
+	     */
+	    ctxt->instate = XML_PARSER_EOF;
+	    return;
+	}
    }
    if (ctxt->myDoc == NULL) {
        ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0");
@ -6441,6 +6500,13 @@ xmlParseReference(xmlParserCtxtPtr ctxt) {
 		    (NXT(2) == 'x') && (NXT(3) == 'm') &&
 		    (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
 		    xmlParseTextDecl(ctxt);
+		    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+			/*
+			 * The XML REC instructs us to stop parsing right here
+			 */
+			ctxt->instate = XML_PARSER_EOF;
+			return;
+		    }
 		    if (input->standalone) {
 			if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
 			    ctxt->sax->error(ctxt->userData,
@ -6947,6 +7013,15 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt) {
 			    (NXT(2) == 'x') && (NXT(3) == 'm') &&
 			    (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
 			    xmlParseTextDecl(ctxt);
+			    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+				/*
+				 * The XML REC instructs us to stop parsing
+				 * right here
+				 */
+				ctxt->instate = XML_PARSER_EOF;
+				xmlFree(name);
+				return;
+			    }
 			}
 			if (ctxt->token == 0)
 			    ctxt->token = ' ';
@ -8197,6 +8272,38 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
 	    ctxt->disableSAX = 1;
 	    ctxt->errNo = XML_ERR_STRING_NOT_STARTED;
 	}
+	if (encoding != NULL) {
+	    xmlCharEncoding enc;
+	    xmlCharEncodingHandlerPtr handler;
+
+	    /*
+	     * The encoding name is stored on the current input,
+	     * replacing and freeing any previous value.
+	     */
+	    if (ctxt->input->encoding != NULL)
+		xmlFree((xmlChar *) ctxt->input->encoding);
+	    ctxt->input->encoding = encoding;
+
+	    enc = xmlParseCharEncoding((const char *) encoding);
+	    /*
+	     * registered set of known encodings
+	     */
+	    if (enc != XML_CHAR_ENCODING_ERROR) {
+		xmlSwitchEncoding(ctxt, enc);
+		if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+		    /*
+		     * The string is about to be freed: clear the
+		     * reference stored on the input first so it does
+		     * not keep a dangling pointer.
+		     */
+		    ctxt->input->encoding = NULL;
+		    xmlFree(encoding);
+		    return(NULL);
+		}
+	    } else {
+	        /*
+		 * fallback for unknown encodings
+		 */
+                handler = xmlFindCharEncodingHandler((const char *) encoding);
+		if (handler != NULL) {
+		    xmlSwitchToEncoding(ctxt, handler);
+		} else {
+		    ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+		    /* same as above: do not leave a dangling pointer */
+		    ctxt->input->encoding = NULL;
+		    xmlFree(encoding);
+		    return(NULL);
+		}
+	    }
+	}
    }
    return(encoding);
 }
@ -8362,7 +8469,13 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
 	ctxt->wellFormed = 0;
 	ctxt->disableSAX = 1;
    }
-    ctxt->input->encoding = xmlParseEncodingDecl(ctxt);
+    xmlParseEncodingDecl(ctxt);
+    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+	/*
+	 * The XML REC instructs us to stop parsing right here
+	 */
+        return;
+    }

    /*
     * We may have the standalone status.
@ -8489,12 +8602,19 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
    if ((RAW == '<') && (NXT(1) == '?') &&
        (NXT(2) == 'x') && (NXT(3) == 'm') &&
 	(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
+
+	/*
+	 * Note that we will switch encoding on the fly.
+	 */
 	xmlParseXMLDecl(ctxt);
+	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+	    /*
+	     * The XML REC instructs us to stop parsing right here
+	     */
+	    return(-1);
+	}
 	ctxt->standalone = ctxt->input->standalone;
 	SKIP_BLANKS;
-	if ((ctxt->encoding == NULL) && (ctxt->input->encoding != NULL))
-	    ctxt->encoding = xmlStrdup(ctxt->input->encoding);
-
    } else {
 	ctxt->version = xmlCharStrdup(XML_DEFAULT_VERSION);
    }
@ -8581,14 +8701,6 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
 	(!ctxt->disableSAX))
        ctxt->sax->endDocument(ctxt->userData);

-    /*
-     * Grab the encoding if it was added on-the-fly
-     */
-    if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) &&
-	(ctxt->myDoc->encoding == NULL)) {
-	ctxt->myDoc->encoding = ctxt->encoding;
-	ctxt->encoding = NULL;
-    }
    if (! ctxt->wellFormed) return(-1);
    return(0);
 }
@ -8805,6 +8917,14 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
 			fprintf(stderr, "PP: Parsing XML Decl\n");
 #endif
 			xmlParseXMLDecl(ctxt);
+			if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
+			    /*
+			     * The XML REC instructs us to stop parsing right
+			     * here
+			     */
+			    ctxt->instate = XML_PARSER_EOF;
+			    return(0);
+			}
 			ctxt->standalone = ctxt->input->standalone;
 			if ((ctxt->encoding == NULL) &&
 			    (ctxt->input->encoding != NULL))
--- a/parserInternals.h
+++ b/parserInternals.h
@ -28,10 +28,10 @@ extern "C" {
 * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
 */
 #define IS_CHAR(c)							\
-    ((((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) ||		\
-      (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) &&		\
-      (((c) <= 0xD7FF) || ((c) >= 0xE000)) && ((c) >= 0) &&		\
-      ((c) <= 0x10FFFF))
+    (((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) ||			\
+     (((c) >= 0x20) && ((c) <= 0xD7FF)) ||				\
+     (((c) >= 0xE000) && ((c) <= 0xFFFD)) ||				\
+     (((c) >= 0x10000) && ((c) <= 0x10FFFF)))

 /*
 * [3] S ::= (#x20 | #x9 | #xD | #xA)+
@ -442,8 +442,10 @@ xmlParserCtxtPtr	xmlNewParserCtxt	(void);
 xmlParserCtxtPtr	xmlCreateEntityParserCtxt(const xmlChar *URL,
 						 const xmlChar *ID,
 						 const xmlChar *base);
-void			xmlSwitchEncoding	(xmlParserCtxtPtr ctxt,
+int			xmlSwitchEncoding	(xmlParserCtxtPtr ctxt,
 						 xmlCharEncoding enc);
+int			xmlSwitchToEncoding	(xmlParserCtxtPtr ctxt,
+					     xmlCharEncodingHandlerPtr handler);
 void			xmlFreeParserCtxt	(xmlParserCtxtPtr ctxt);

 /**
--- a/tree.c
+++ b/tree.c
@ -3771,6 +3771,31 @@ xmlBufferShrink(xmlBufferPtr buf, int len) {
    return(len);
 }

+/**
+ * xmlBufferGrow:
+ * @buf:  the buffer
+ * @len:  the minimum free size to allocate
+ *
+ * Grow the available space of an XML buffer.
+ *
+ * Returns the new available space, 0 if more than @len bytes were
+ *         already free, or -1 in case of error
+ */
+int
+xmlBufferGrow(xmlBufferPtr buf, int len) {
+    int size;
+    xmlChar *newbuf;
+
+    if ((buf == NULL) || (len < 0)) return(-1);
+    if (buf->use + len < buf->size) return(0); /* enough free room */
+    size = buf->size + buf->use + len + 100;
+    newbuf = xmlRealloc(buf->content, size);
+    if (newbuf == NULL) return(-1);
+    buf->content = newbuf;
+    buf->size = size;
+    return(buf->size - buf->use);
+}
+
 /**
 * xmlBufferDump:
 * @file:  the file output
--- a/tree.h
+++ b/tree.h
@ -380,6 +380,8 @@ void		xmlBufferCCat		(xmlBufferPtr buf,
 					 const char *str);
 int		xmlBufferShrink		(xmlBufferPtr buf,
 					 int len);
+int		xmlBufferGrow		(xmlBufferPtr buf,
+					 int len);
 void		xmlBufferEmpty		(xmlBufferPtr buf);
 const xmlChar*	xmlBufferContent	(const xmlBufferPtr buf);
 int		xmlBufferUse		(const xmlBufferPtr buf);
--- a/uri.c
+++ b/uri.c
@ -1283,6 +1283,34 @@ xmlParseURIReference(xmlURIPtr uri, const char *str) {
    return(0);
 }

+/**
+ * xmlParseURI:
+ * @str:  the URI string to analyze
+ *
+ * Parse a URI reference into a newly created structure:
+ *
+ * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
+ *
+ * Returns a newly built xmlURIPtr, or NULL if @str is NULL, if the
+ *         structure cannot be created, or if parsing fails
+ */
+xmlURIPtr
+xmlParseURI(const char *str) {
+    xmlURIPtr parsed;
+
+    if (str == NULL)
+	return(NULL);
+    parsed = xmlCreateURI();
+    if (parsed == NULL)
+	return(NULL);
+    /* any non-zero status from the parser is treated as a failure */
+    if (xmlParseURIReference(parsed, str) != 0) {
+	xmlFreeURI(parsed);
+	parsed = NULL;
+    }
+    return(parsed);
+}
+
 /**
 * xmlNormalizeURIPath:
 * @path:  pointer to the path string
--- a/xml-error.h
+++ b/xml-error.h
@ -130,7 +130,9 @@ typedef enum {
    XML_ERR_ENTITY_CHAR_ERROR, /* 88 */
    XML_ERR_ENTITY_PE_INTERNAL, /* 88 */
    XML_ERR_ENTITY_LOOP, /* 89 */
-    XML_ERR_ENTITY_BOUNDARY /* 90 */
+    XML_ERR_ENTITY_BOUNDARY, /* 90 */
+    XML_ERR_INVALID_URI, /* 91 */
+    XML_ERR_URI_FRAGMENT /* 92 */
 }xmlParserErrors;

 void	xmlParserError		(void *ctx,
--- a/xmlIO.c
+++ b/xmlIO.c
@ -498,6 +498,10 @@ xmlAllocParserInputBuffer(xmlCharEncoding enc) {
    }
    ret->buffer->alloc = XML_BUFFER_ALLOC_DOUBLEIT;
    ret->encoder = xmlGetCharEncodingHandler(enc);
+    if (ret->encoder != NULL)
+        ret->raw = xmlBufferCreate();
+    else
+        ret->raw = NULL;
    ret->readcallback = NULL;
    ret->closecallback = NULL;
    ret->context = NULL;
@ -513,13 +517,20 @@ xmlAllocParserInputBuffer(xmlCharEncoding enc) {
 */
 void
 xmlFreeParserInputBuffer(xmlParserInputBufferPtr in) {
-    if (in->buffer != NULL) {
-        xmlBufferFree(in->buffer);
-	in->buffer = NULL;
+    if (in->raw) {
+        xmlBufferFree(in->raw);
+	in->raw = NULL;
+    }
+    if (in->encoder != NULL) {
+        xmlCharEncCloseFunc(in->encoder);
    }
    if (in->closecallback != NULL) {
 	in->closecallback(in->context);
    }
+    if (in->buffer != NULL) {
+        xmlBufferFree(in->buffer);
+	in->buffer = NULL;
+    }

    memset(in, 0xbe, (size_t) sizeof(xmlParserInputBuffer));
    xmlFree(in);
@ -683,34 +694,22 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, const char *buf) {

    if (len < 0) return(0);
    if (in->encoder != NULL) {
-        xmlChar *buffer;
-	int processed = len;
-
-	buffer = (xmlChar *) xmlMalloc((len + 1) * 2 * sizeof(xmlChar));
-	if (buffer == NULL) {
-	    fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n");
-	    return(-1);
-	}
-	nbchars = in->encoder->input(buffer, (len + 1) * 2 * sizeof(xmlChar),
-	                             (xmlChar *) buf, &processed);
        /*
-	 * TODO : we really need to have something atomic or the 
-	 *        encoder must report the number of bytes read
+	 * Store the data in the incoming raw buffer
 	 */
+        if (in->raw == NULL) {
+	    in->raw = xmlBufferCreate();
+	}
+	xmlBufferAdd(in->raw, (const xmlChar *) buf, len);
+
+	/*
+	 * convert as much as possible to the parser reading buffer.
+	 */
+	nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
 	if (nbchars < 0) {
 	    fprintf(stderr, "xmlParserInputBufferPush: encoder error\n");
-	    xmlFree(buffer);
 	    return(-1);
 	}
-	if (processed  != len) {
-	    fprintf(stderr,
-	            "TODO xmlParserInputBufferPush: processed  != len\n");
-	    xmlFree(buffer);
-	    return(-1);
-	}
-        buffer[nbchars] = 0;
-        xmlBufferAdd(in->buffer, (xmlChar *) buffer, nbchars);
-	xmlFree(buffer);
    } else {
 	nbchars = len;
        xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars);
@ -730,7 +729,9 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, const char *buf) {
 * Grow up the content of the input buffer, the old data are preserved
 * This routine handle the I18N transcoding to internal UTF-8
 * This routine is used when operating the parser in normal (pull) mode
- * TODO: one should be able to remove one extra copy
+ *
+ * TODO: one should be able to remove one extra copy by copying directly
+ *       onto in->buffer or in->raw
 *
 * Returns the number of chars read and stored in the buffer, or -1
 *         in case of error.
@ -779,34 +780,22 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) {
 	return(-1);
    }
    if (in->encoder != NULL) {
-        xmlChar *buf;
-	int wrote = res;
+        /*
+	 * Store the data in the incoming raw buffer
+	 */
+        if (in->raw == NULL) {
+	    in->raw = xmlBufferCreate();
+	}
+	xmlBufferAdd(in->raw, (const xmlChar *) buffer, len);

-	buf = (xmlChar *) xmlMalloc((res + 1) * 2 * sizeof(xmlChar));
-	if (buf == NULL) {
-	    fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n");
-	    xmlFree(buffer);
+	/*
+	 * convert as much as possible to the parser reading buffer.
+	 */
+	nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
+	if (nbchars < 0) {
+	    fprintf(stderr, "xmlParserInputBufferGrow: encoder error\n");
 	    return(-1);
 	}
-	nbchars = in->encoder->input(buf, (res + 1) * 2 * sizeof(xmlChar),
-	                             BAD_CAST buffer, &wrote);
-        buf[nbchars] = 0;
-        xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars);
-	xmlFree(buf);
-
-	/*
-	 * Check that the encoder was able to process the full input
-	 */
-	if (wrote != res) {
-	    fprintf(stderr, 
-	        "TODO : xmlParserInputBufferGrow wrote %d != res %d\n",
-		wrote, res);
-	    /*
-	     * TODO !!!
-	     * Need to keep the unprocessed input in a buffer in->unprocessed
-	     */
-	}
-
    } else {
 	nbchars = res;
        buffer[nbchars] = 0;
--- a/xmlIO.h
+++ b/xmlIO.h
@ -33,6 +33,7 @@ struct _xmlParserInputBuffer {
    xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */
    
    xmlBufferPtr buffer;    /* Local buffer encoded in  UTF-8 */
+    xmlBufferPtr raw;       /* if encoder != NULL buffer for raw input */
 };