1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-08-07 06:43:02 +03:00

revamped the encoding support, added iconv support, so now libxml if

* encoding.[ch], xmlIO.[ch], parser.c, configure.in : revamped
  the encoding support, added iconv support, so now libxml, if
  compiled with iconv, automatically supports Japanese encodings
  among others. Work based on an initial patch from Yuan-Chen Cheng.
  I may have broken binary compat in the encoding handler
  registration scheme, but that was so utterly broken I don't
  expect anybody to have used this feature until now.
* parserInternals.h: fixup on the CHAR range macro
* xml-error.h, parser.c: catch URL/URI errors using the uri.c
  code.
* tree.[ch]: added xmlBufferGrow(), was needed for iconv
* uri.c: added xmlParseURI(); I can't believe I forgot to
  implement this one in 2.0!!!
* SAX.c: moved doc->encoding update in the endDocument() call.
* TODO: updated.

  Iconv rules :-)

Daniel
This commit is contained in:
Daniel Veillard
2000-05-03 14:20:55 +00:00
parent 06047432eb
commit 496a1cf592
18 changed files with 1163 additions and 487 deletions

View File

@@ -1,3 +1,21 @@
Wed May 3 14:21:25 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
* encoding.[ch], xmlIO.[ch], parser.c, configure.in : revamped
the encoding support, added iconv support, so now libxml, if
compiled with iconv, automatically supports Japanese encodings
among others. Work based on an initial patch from Yuan-Chen Cheng.
I may have broken binary compat in the encoding handler
registration scheme, but that was so utterly broken I don't
expect anybody to have used this feature until now.
* parserInternals.h: fixup on the CHAR range macro
* xml-error.h, parser.c: catch URL/URI errors using the uri.c
code.
* tree.[ch]: added xmlBufferGrow(), was needed for iconv
* uri.c: added xmlParseURI() I can't believe I forgot to
implement this one in 2.0 !!!
* SAX.c: moved doc->encoding update in the endDocument() call.
* TODO: updated.
Mon Apr 24 13:30:13 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org> Mon Apr 24 13:30:13 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
* tree.h: removed extraneous xmlRemoveProp definition * tree.h: removed extraneous xmlRemoveProp definition

9
SAX.c
View File

@@ -595,6 +595,15 @@ endDocument(void *ctx)
if (ctxt->validate && ctxt->wellFormed && if (ctxt->validate && ctxt->wellFormed &&
ctxt->myDoc && ctxt->myDoc->intSubset) ctxt->myDoc && ctxt->myDoc->intSubset)
ctxt->valid &= xmlValidateDocumentFinal(&ctxt->vctxt, ctxt->myDoc); ctxt->valid &= xmlValidateDocumentFinal(&ctxt->vctxt, ctxt->myDoc);
/*
* Grab the encoding if it was added on-the-fly
*/
if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) &&
(ctxt->myDoc->encoding == NULL)) {
ctxt->myDoc->encoding = ctxt->encoding;
ctxt->encoding = NULL;
}
} }
/** /**

9
TODO
View File

@@ -6,6 +6,8 @@
TODO: TODO:
===== =====
- xmlSwitchToEncoding() needs a rewrite for correct handling of conversion
error code conditions.
- DOM needs - DOM needs
xmlAttrPtr xmlNewDocProp(xmlDocPtr doc, const xmlChar *name, const xmlChar *value) xmlAttrPtr xmlNewDocProp(xmlDocPtr doc, const xmlChar *name, const xmlChar *value)
int xmlPruneProp(xmlNodePtr node, xmlAtttrPtr attr); int xmlPruneProp(xmlNodePtr node, xmlAtttrPtr attr);
@@ -14,7 +16,6 @@ TODO:
- add support for the trick from Henry conf/sun/valid/empty.xml - add support for the trick from Henry conf/sun/valid/empty.xml
- Correct standalone checking/emitting (hard) - Correct standalone checking/emitting (hard)
2.9 Standalone Document Declaration 2.9 Standalone Document Declaration
- URI checkings (no fragments) rfc2396.txt
- Better checking of external parsed entities TAG 1234 - Better checking of external parsed entities TAG 1234
- Find way of representing PERefs in the Dtd so that %entity; can - Find way of representing PERefs in the Dtd so that %entity; can
be saved back. be saved back.
@@ -22,6 +23,7 @@ TODO:
http://www.w3.org/XML/xml-19980210-errata ... bummmer http://www.w3.org/XML/xml-19980210-errata ... bummmer
- Handle undefined namespaces in entity contents better ... at least - Handle undefined namespaces in entity contents better ... at least
issue a warning issue a warning
- Issue warning when using non-absolute namespaces URI.
- General checking of DTD validation in presence of namespaces ... hairy - General checking of DTD validation in presence of namespaces ... hairy
- fix --disable-corba configure switch handling, and use XML_WITHOUT_CORBA - fix --disable-corba configure switch handling, and use XML_WITHOUT_CORBA
not WITHOUT_CORBA flag not WITHOUT_CORBA flag
@@ -30,7 +32,7 @@ TODO:
===== =====
- Get OASIS testsuite to a more friendly result, check all the results - Get OASIS testsuite to a more friendly result, check all the results
once stable. once stable. Current state at:
http://xmlsoft.org/conf/result.html http://xmlsoft.org/conf/result.html
- Optimization of tag strings allocation ? - Optimization of tag strings allocation ?
@@ -55,11 +57,13 @@ EXTENSIONS:
- Add Xlink recognition/API - Add Xlink recognition/API
=> started adding an xlink.[ch] with a unified API for XML and HTML. => started adding an xlink.[ch] with a unified API for XML and HTML.
it's crap :-(
- Implement XSLT - Implement XSLT
=> seems that someone volunteered ?!? => seems that someone volunteered ?!?
- Implement XSchemas - Implement XSchemas
=> Really need to be done <grin/>
- O2K parsing; - O2K parsing;
=> this is a somewhat ugly mix of HTML and XML, adding a specific => this is a somewhat ugly mix of HTML and XML, adding a specific
@@ -88,6 +92,7 @@ EXTENSIONS:
Done: Done:
===== =====
- URI checkings (no fragments) rfc2396.txt
- Added a clean mechanism for overload or added input methods: - Added a clean mechanism for overload or added input methods:
xmlRegisterInputCallbacks() xmlRegisterInputCallbacks()
- dynamically adapt the alloc entry point to use g_alloc()/g_free() - dynamically adapt the alloc entry point to use g_alloc()/g_free()

View File

@@ -4,7 +4,7 @@ AC_INIT(entities.h)
AM_CONFIG_HEADER(config.h) AM_CONFIG_HEADER(config.h)
LIBXML_MAJOR_VERSION=2 LIBXML_MAJOR_VERSION=2
LIBXML_MINOR_VERSION=0 LIBXML_MINOR_VERSION=1
LIBXML_MICRO_VERSION=0 LIBXML_MICRO_VERSION=0
LIBXML_VERSION=$LIBXML_MAJOR_VERSION.$LIBXML_MINOR_VERSION.$LIBXML_MICRO_VERSION LIBXML_VERSION=$LIBXML_MAJOR_VERSION.$LIBXML_MINOR_VERSION.$LIBXML_MICRO_VERSION
LIBXML_VERSION_INFO=`expr $LIBXML_MAJOR_VERSION + $LIBXML_MINOR_VERSION`:$LIBXML_MICRO_VERSION:$LIBXML_MINOR_VERSION LIBXML_VERSION_INFO=`expr $LIBXML_MAJOR_VERSION + $LIBXML_MINOR_VERSION`:$LIBXML_MICRO_VERSION:$LIBXML_MINOR_VERSION
@@ -203,6 +203,20 @@ fi
AC_SUBST(WITH_XPATH) AC_SUBST(WITH_XPATH)
AC_SUBST(XPATH_OBJ) AC_SUBST(XPATH_OBJ)
dnl Optional iconv support. Sets WITH_ICONV to 0 or 1 and exports it
dnl through AC_SUBST so that generated files can test it.
AC_ARG_WITH(iconv, [ --with-iconv Add the ICONV support (on)])
dnl An explicit "no" from the user always disables iconv.
if test "$with_iconv" = "no" ; then
echo Disabling ICONV support
WITH_ICONV=0
else
dnl NOTE(review): this branch reports "not found" when $have_iconv is
dnl NON-empty, and nothing visible in this hunk assigns have_iconv, so as
dnl written WITH_ICONV ends up 1 whenever the user did not say "no".
dnl The condition looks inverted -- confirm where have_iconv is set.
if test "$have_iconv" != "" ; then
echo Iconv support not found
WITH_ICONV=0
else
WITH_ICONV=1
fi
fi
AC_SUBST(WITH_ICONV)
AC_ARG_WITH(debug, [ --with-debug Add the debugging module (on)]) AC_ARG_WITH(debug, [ --with-debug Add the debugging module (on)])
if test "$with_debug" = "no" ; then if test "$with_debug" = "no" ; then
echo Disabling DEBUG support echo Disabling DEBUG support

View File

@@ -34,12 +34,26 @@
#ifdef HAVE_STDLIB_H #ifdef HAVE_STDLIB_H
#include <stdlib.h> #include <stdlib.h>
#endif #endif
#include <libxml/xmlversion.h>
#ifdef LIBXML_ICONV_ENABLED
#ifdef HAVE_ERRNO_H
#include <errno.h>
#endif
#endif
#include <libxml/encoding.h> #include <libxml/encoding.h>
#include <libxml/xmlmemory.h> #include <libxml/xmlmemory.h>
xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
#ifdef LIBXML_ICONV_ENABLED
#if 0
#define DEBUG_ENCODING /* Define this to get encoding traces */
#endif
#endif
static int xmlLittleEndian = 1;
/* /*
* From rfc2044: encoding of the Unicode values on UTF-8: * From rfc2044: encoding of the Unicode values on UTF-8:
* *
@@ -104,30 +118,38 @@ xmlCheckUTF8(const unsigned char *utf)
* *
* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
* block of chars out. * block of chars out.
* Returns the number of byte written, or -1 by lack of space. * Returns 0 if success, or -1 otherwise
* The value of @inlen after return is the number of octets consumed
* if the return value is 0, else unpredictable.
* The value of @outlen after return is the number of octets produced.
*/ */
int int
isolat1ToUTF8(unsigned char* out, int outlen, isolat1ToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) { const unsigned char* in, int *inlen) {
unsigned char* outstart = out; unsigned char* outstart = out;
unsigned char* outend= out+outlen; const unsigned char* processed = in;
unsigned char* outend = out + *outlen;
const unsigned char* inend = in + *inlen; const unsigned char* inend = in + *inlen;
unsigned char c; unsigned char c;
while (in < inend) { while (in < inend) {
c= *in++; c= *in++;
if (c < 0x80) { if (c < 0x80) {
if (out >= outend) return(-1); if (out >= outend)
break;
*out++ = c; *out++ = c;
} }
else { else {
if (out >= outend) return(-1); if (out + 1 >= outend) break;
*out++ = 0xC0 | (c >> 6); *out++ = 0xC0 | (c >> 6);
if (out >= outend) return(-1);
*out++ = 0x80 | (0x3F & c); *out++ = 0x80 | (0x3F & c);
} }
processed = in;
} }
return(out-outstart); *outlen = out - outstart;
*inlen = processed - in;
return(0);
} }
/** /**
@@ -141,17 +163,17 @@ isolat1ToUTF8(unsigned char* out, int outlen,
* block of chars out. * block of chars out.
* TODO: UTF8Toisolat1 need a fallback mechanism ... * TODO: UTF8Toisolat1 need a fallback mechanism ...
* *
* Returns the number of byte written, or -1 by lack of space, or -2 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
* if the transcoding fails (for *in is not valid utf8 string or
* the result of transformation can't fit into the encoding we want)
* The value of @inlen after return is the number of octets consumed * The value of @inlen after return is the number of octets consumed
* as the return value is positive, else unpredictiable. * as the return value is positive, else unpredictiable.
* The value of @outlen after return is the number of octets produced.
*/ */
int int
UTF8Toisolat1(unsigned char* out, int outlen, UTF8Toisolat1(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) { const unsigned char* in, int *inlen) {
unsigned char* outstart = out; unsigned char* outstart = out;
unsigned char* outend= out+outlen; const unsigned char* processed = in;
unsigned char* outend = out + *outlen;
const unsigned char* inend = in + *inlen; const unsigned char* inend = in + *inlen;
unsigned char c; unsigned char c;
@@ -162,18 +184,22 @@ UTF8Toisolat1(unsigned char* out, int outlen,
*out++= c; *out++= c;
} }
else if (in == inend) { else if (in == inend) {
*inlen -= 1;
break; break;
} }
else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) { else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
/* a two byte utf-8 and can be encoding as isolate1 */ /* a two byte utf-8 and can be encoding as isolate1 */
*out++= ((c & 0x03) << 6) | (*in++ & 0x3F); *out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
} }
else else {
*outlen = out - outstart;
*inlen = processed - in;
return(-2); return(-2);
/* TODO : some should be represent as "&#x____;" */
} }
return(out-outstart); processed = in;
}
*outlen = out - outstart;
*inlen = processed - in;
return(0);
} }
/** /**
@@ -194,11 +220,12 @@ UTF8Toisolat1(unsigned char* out, int outlen,
* as the return value is positive, else unpredictiable. * as the return value is positive, else unpredictiable.
*/ */
int int
UTF16LEToUTF8(unsigned char* out, int outlen, UTF16LEToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb) const unsigned char* inb, int *inlenb)
{ {
unsigned char* outstart = out; unsigned char* outstart = out;
unsigned char* outend= out+outlen; const unsigned char* processed = inb;
unsigned char* outend = out + *outlen;
unsigned short* in = (unsigned short*) inb; unsigned short* in = (unsigned short*) inb;
unsigned short* inend; unsigned short* inend;
unsigned int c, d, inlen; unsigned int c, d, inlen;
@@ -210,40 +237,42 @@ UTF16LEToUTF8(unsigned char* out, int outlen,
inlen = *inlenb / 2; inlen = *inlenb / 2;
inend = in + inlen; inend = in + inlen;
while (in < inend) { while (in < inend) {
#ifdef BIG_ENDIAN if (xmlLittleEndian) {
c= *in++;
} else {
tmp = (unsigned char *) in; tmp = (unsigned char *) in;
c = *tmp++; c = *tmp++;
c = c | (((unsigned int)*tmp) << 8); c = c | (((unsigned int)*tmp) << 8);
in++; in++;
#else /* BIG_ENDIAN */ }
c= *in++;
#endif /* BIG_ENDIAN */
if ((c & 0xFC00) == 0xD800) { /* surrogates */ if ((c & 0xFC00) == 0xD800) { /* surrogates */
if (in >= inend) { /* (in > inend) shouldn't happens */ if (in >= inend) { /* (in > inend) shouldn't happens */
(*inlenb) -= 2;
break; break;
} }
#ifdef BIG_ENDIAN if (xmlLittleEndian) {
d = *in++;
} else {
tmp = (unsigned char *) in; tmp = (unsigned char *) in;
d = *tmp++; d = *tmp++;
d = d | (((unsigned int)*tmp) << 8); d = d | (((unsigned int)*tmp) << 8);
in++; in++;
#else /* BIG_ENDIAN */ }
d = *in++;
#endif /* BIG_ENDIAN */
if ((d & 0xFC00) == 0xDC00) { if ((d & 0xFC00) == 0xDC00) {
c &= 0x03FF; c &= 0x03FF;
c <<= 10; c <<= 10;
c |= d & 0x03FF; c |= d & 0x03FF;
c += 0x10000; c += 0x10000;
} }
else else {
*outlen = out - outstart;
*inlenb = processed - inb;
return(-2); return(-2);
} }
}
/* assertion: c is a single UTF-4 value */ /* assertion: c is a single UTF-4 value */
if (out >= outend) if (out >= outend)
return(-1); break;
if (c < 0x80) { *out++= c; bits= -6; } if (c < 0x80) { *out++= c; bits= -6; }
else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; } else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; } else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; }
@@ -251,11 +280,14 @@ UTF16LEToUTF8(unsigned char* out, int outlen,
for ( ; bits >= 0; bits-= 6) { for ( ; bits >= 0; bits-= 6) {
if (out >= outend) if (out >= outend)
return(-1); break;
*out++= ((c >> bits) & 0x3F) | 0x80; *out++= ((c >> bits) & 0x3F) | 0x80;
} }
processed = (const unsigned char*) in;
} }
return(out-outstart); *outlen = out - outstart;
*inlenb = processed - inb;
return(0);
} }
/** /**
@@ -273,40 +305,44 @@ UTF16LEToUTF8(unsigned char* out, int outlen,
* if the transcoding failed. * if the transcoding failed.
*/ */
int int
UTF8ToUTF16LE(unsigned char* outb, int outlen, UTF8ToUTF16LE(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen) const unsigned char* in, int *inlen)
{ {
unsigned short* out = (unsigned short*) outb; unsigned short* out = (unsigned short*) outb;
const unsigned char* processed = in;
unsigned short* outstart= out; unsigned short* outstart= out;
unsigned short* outend; unsigned short* outend;
const unsigned char* inend= in+*inlen; const unsigned char* inend= in+*inlen;
unsigned int c, d, trailing; unsigned int c, d, trailing;
#ifdef BIG_ENDIAN
unsigned char *tmp; unsigned char *tmp;
unsigned short tmp1, tmp2; unsigned short tmp1, tmp2;
#endif /* BIG_ENDIAN */
outlen /= 2; /* convert in short length */ outend = out + (*outlen / 2);
outend = out + outlen;
while (in < inend) { while (in < inend) {
d= *in++; d= *in++;
if (d < 0x80) { c= d; trailing= 0; } if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) else if (d < 0xC0) {
return(-2); /* trailing byte in leading position */ /* trailing byte in leading position */
else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } *outlen = out - outstart;
*inlen = processed - in;
return(-2);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; } else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else else {
return(-2); /* no chance for this in UTF-16 */ /* no chance for this in UTF-16 */
*outlen = out - outstart;
*inlen = processed - in;
return(-2);
}
if (inend - in < trailing) { if (inend - in < trailing) {
*inlen -= (inend - in);
break; break;
} }
for ( ; trailing; trailing--) { for ( ; trailing; trailing--) {
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
return(-1); break;
c <<= 6; c <<= 6;
c |= d & 0x3F; c |= d & 0x3F;
} }
@@ -314,21 +350,24 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen,
/* assertion: c is a single UTF-4 value */ /* assertion: c is a single UTF-4 value */
if (c < 0x10000) { if (c < 0x10000) {
if (out >= outend) if (out >= outend)
return(-1); break;
#ifdef BIG_ENDIAN if (xmlLittleEndian) {
*out++ = c;
} else {
tmp = (unsigned char *) out; tmp = (unsigned char *) out;
*tmp = c ; *tmp = c ;
*(tmp + 1) = c >> 8 ; *(tmp + 1) = c >> 8 ;
out++; out++;
#else /* BIG_ENDIAN */ }
*out++ = c;
#endif /* BIG_ENDIAN */
} }
else if (c < 0x110000) { else if (c < 0x110000) {
if (out+1 >= outend) if (out+1 >= outend)
return(-1); break;
c -= 0x10000; c -= 0x10000;
#ifdef BIG_ENDIAN if (xmlLittleEndian) {
*out++ = 0xD800 | (c >> 10);
*out++ = 0xDC00 | (c & 0x03FF);
} else {
tmp1 = 0xD800 | (c >> 10); tmp1 = 0xD800 | (c >> 10);
tmp = (unsigned char *) out; tmp = (unsigned char *) out;
*tmp = tmp1; *tmp = tmp1;
@@ -340,15 +379,15 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen,
*tmp = tmp2; *tmp = tmp2;
*(tmp + 1) = tmp2 >> 8; *(tmp + 1) = tmp2 >> 8;
out++; out++;
#else /* BIG_ENDIAN */ }
*out++ = 0xD800 | (c >> 10);
*out++ = 0xDC00 | (c & 0x03FF);
#endif /* BIG_ENDIAN */
} }
else else
return(-1); break;
processed = in;
} }
return(out-outstart); *outlen = out - outstart;
*inlen = processed - in;
return(0);
} }
/** /**
@@ -369,18 +408,16 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen,
* as the return value is positive, else unpredictiable. * as the return value is positive, else unpredictiable.
*/ */
int int
UTF16BEToUTF8(unsigned char* out, int outlen, UTF16BEToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb) const unsigned char* inb, int *inlenb)
{ {
unsigned char* outstart = out; unsigned char* outstart = out;
unsigned char* outend= out+outlen; const unsigned char* processed = inb;
unsigned char* outend = out + *outlen;
unsigned short* in = (unsigned short*) inb; unsigned short* in = (unsigned short*) inb;
unsigned short* inend; unsigned short* inend;
unsigned int c, d, inlen; unsigned int c, d, inlen;
#ifdef BIG_ENDIAN
#else /* BIG_ENDIAN */
unsigned char *tmp; unsigned char *tmp;
#endif /* BIG_ENDIAN */
int bits; int bits;
if ((*inlenb % 2) == 1) if ((*inlenb % 2) == 1)
@@ -388,43 +425,46 @@ UTF16BEToUTF8(unsigned char* out, int outlen,
inlen = *inlenb / 2; inlen = *inlenb / 2;
inend= in + inlen; inend= in + inlen;
while (in < inend) { while (in < inend) {
#ifdef BIG_ENDIAN if (xmlLittleEndian) {
c= *in++;
#else
tmp = (unsigned char *) in; tmp = (unsigned char *) in;
c = *tmp++; c = *tmp++;
c = c << 8; c = c << 8;
c = c | (unsigned int) *tmp; c = c | (unsigned int) *tmp;
in++; in++;
#endif } else {
c= *in++;
}
if ((c & 0xFC00) == 0xD800) { /* surrogates */ if ((c & 0xFC00) == 0xD800) { /* surrogates */
if (in >= inend) { /* (in > inend) shouldn't happens */ if (in >= inend) { /* (in > inend) shouldn't happens */
(*inlenb) -= 2; *outlen = out - outstart;
break; *inlenb = processed - inb;
return(-2);
} }
if (xmlLittleEndian) {
#ifdef BIG_ENDIAN
d= *in++;
#else
tmp = (unsigned char *) in; tmp = (unsigned char *) in;
d = *tmp++; d = *tmp++;
d = d << 8; d = d << 8;
d = d | (unsigned int) *tmp; d = d | (unsigned int) *tmp;
in++; in++;
#endif } else {
d= *in++;
}
if ((d & 0xFC00) == 0xDC00) { if ((d & 0xFC00) == 0xDC00) {
c &= 0x03FF; c &= 0x03FF;
c <<= 10; c <<= 10;
c |= d & 0x03FF; c |= d & 0x03FF;
c += 0x10000; c += 0x10000;
} }
else else {
*outlen = out - outstart;
*inlenb = processed - inb;
return(-2); return(-2);
} }
}
/* assertion: c is a single UTF-4 value */ /* assertion: c is a single UTF-4 value */
if (out >= outend) if (out >= outend)
return(-1); break;
if (c < 0x80) { *out++= c; bits= -6; } if (c < 0x80) { *out++= c; bits= -6; }
else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; } else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; } else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; }
@@ -432,11 +472,14 @@ UTF16BEToUTF8(unsigned char* out, int outlen,
for ( ; bits >= 0; bits-= 6) { for ( ; bits >= 0; bits-= 6) {
if (out >= outend) if (out >= outend)
return(-1); break;
*out++= ((c >> bits) & 0x3F) | 0x80; *out++= ((c >> bits) & 0x3F) | 0x80;
} }
processed = (const unsigned char*) in;
} }
return(out-outstart); *outlen = out - outstart;
*inlenb = processed - inb;
return(0);
} }
/** /**
@@ -454,63 +497,63 @@ UTF16BEToUTF8(unsigned char* out, int outlen,
* if the transcoding failed. * if the transcoding failed.
*/ */
int int
UTF8ToUTF16BE(unsigned char* outb, int outlen, UTF8ToUTF16BE(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen) const unsigned char* in, int *inlen)
{ {
unsigned short* out = (unsigned short*) outb; unsigned short* out = (unsigned short*) outb;
const unsigned char* processed = in;
unsigned short* outstart= out; unsigned short* outstart= out;
unsigned short* outend; unsigned short* outend;
const unsigned char* inend= in+*inlen; const unsigned char* inend= in+*inlen;
unsigned int c, d, trailing; unsigned int c, d, trailing;
#ifdef BIG_ENDIAN
#else
unsigned char *tmp; unsigned char *tmp;
unsigned short tmp1, tmp2; unsigned short tmp1, tmp2;
#endif /* BIG_ENDIAN */
outlen /= 2; /* convert in short length */ outend = out + (*outlen / 2);
outend = out + outlen;
while (in < inend) { while (in < inend) {
d= *in++; d= *in++;
if (d < 0x80) { c= d; trailing= 0; } if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) else if (d < 0xC0) {
return(-2); /* trailing byte in leading position */ /* trailing byte in leading position */
else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } *outlen = out - outstart;
*inlen = processed - in;
return(-2);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; } else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else else {
return(-2); /* no chance for this in UTF-16 */ /* no chance for this in UTF-16 */
*outlen = out - outstart;
*inlen = processed - in;
return(-2);
}
if (inend - in < trailing) { if (inend - in < trailing) {
*inlen -= (inend - in);
break; break;
} }
for ( ; trailing; trailing--) { for ( ; trailing; trailing--) {
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) return(-1); if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) break;
c <<= 6; c <<= 6;
c |= d & 0x3F; c |= d & 0x3F;
} }
/* assertion: c is a single UTF-4 value */ /* assertion: c is a single UTF-4 value */
if (c < 0x10000) { if (c < 0x10000) {
if (out >= outend) return(-1); if (out >= outend) break;
#ifdef BIG_ENDIAN if (xmlLittleEndian) {
*out++ = c;
#else
tmp = (unsigned char *) out; tmp = (unsigned char *) out;
*tmp = c >> 8; *tmp = c >> 8;
*(tmp + 1) = c; *(tmp + 1) = c;
out++; out++;
#endif /* BIG_ENDIAN */ } else {
*out++ = c;
}
} }
else if (c < 0x110000) { else if (c < 0x110000) {
if (out+1 >= outend) return(-1); if (out+1 >= outend) break;
c -= 0x10000; c -= 0x10000;
#ifdef BIG_ENDIAN if (xmlLittleEndian) {
*out++ = 0xD800 | (c >> 10);
*out++ = 0xDC00 | (c & 0x03FF);
#else
tmp1 = 0xD800 | (c >> 10); tmp1 = 0xD800 | (c >> 10);
tmp = (unsigned char *) out; tmp = (unsigned char *) out;
*tmp = tmp1 >> 8; *tmp = tmp1 >> 8;
@@ -522,11 +565,18 @@ UTF8ToUTF16BE(unsigned char* outb, int outlen,
*tmp = tmp2 >> 8; *tmp = tmp2 >> 8;
*(tmp + 1) = tmp2; *(tmp + 1) = tmp2;
out++; out++;
#endif } else {
*out++ = 0xD800 | (c >> 10);
*out++ = 0xDC00 | (c & 0x03FF);
} }
else return(-1);
} }
return(out-outstart); else
break;
processed = in;
}
*outlen = out - outstart;
*inlen = processed - in;
return(0);
} }
/** /**
@@ -636,8 +686,12 @@ xmlParseCharEncoding(const char* name)
if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9); if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP); if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS); if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP); if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
#ifdef DEBUG_ENCODING
fprintf(stderr, "Unknown encoding %s\n", name);
#endif
return(XML_CHAR_ENCODING_ERROR); return(XML_CHAR_ENCODING_ERROR);
} }
@@ -712,6 +766,9 @@ xmlNewCharEncodingHandler(const char *name,
* registers and returns the handler. * registers and returns the handler.
*/ */
xmlRegisterCharEncodingHandler(handler); xmlRegisterCharEncodingHandler(handler);
#ifdef DEBUG_ENCODING
fprintf(stderr, "Registered encoding handler for %s\n", name);
#endif
return(handler); return(handler);
} }
@@ -725,11 +782,18 @@ xmlNewCharEncodingHandler(const char *name,
*/ */
void void
xmlInitCharEncodingHandlers(void) { xmlInitCharEncodingHandlers(void) {
unsigned short int tst = 0x1234;
unsigned char *ptr = (unsigned char *) &tst;
if (handlers != NULL) return; if (handlers != NULL) return;
handlers = (xmlCharEncodingHandlerPtr *) handlers = (xmlCharEncodingHandlerPtr *)
xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr)); xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
if (*ptr == 0x12) xmlLittleEndian = 0;
else if (*ptr == 0x34) xmlLittleEndian = 1;
else fprintf(stderr, "Odd problem at endianness detection\n");
if (handlers == NULL) { if (handlers == NULL) {
fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n"); fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
return; return;
@@ -755,6 +819,7 @@ xmlCleanupCharEncodingHandlers(void) {
for (;nbCharEncodingHandler > 0;) { for (;nbCharEncodingHandler > 0;) {
nbCharEncodingHandler--; nbCharEncodingHandler--;
if (handlers[nbCharEncodingHandler] != NULL) { if (handlers[nbCharEncodingHandler] != NULL) {
if (handlers[nbCharEncodingHandler]->name != NULL)
xmlFree(handlers[nbCharEncodingHandler]->name); xmlFree(handlers[nbCharEncodingHandler]->name);
xmlFree(handlers[nbCharEncodingHandler]); xmlFree(handlers[nbCharEncodingHandler]);
} }
@@ -798,6 +863,8 @@ xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
*/ */
xmlCharEncodingHandlerPtr xmlCharEncodingHandlerPtr
xmlGetCharEncodingHandler(xmlCharEncoding enc) { xmlGetCharEncodingHandler(xmlCharEncoding enc) {
xmlCharEncodingHandlerPtr handler;
if (handlers == NULL) xmlInitCharEncodingHandlers(); if (handlers == NULL) xmlInitCharEncodingHandlers();
switch (enc) { switch (enc) {
case XML_CHAR_ENCODING_ERROR: case XML_CHAR_ENCODING_ERROR:
@@ -811,40 +878,68 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
case XML_CHAR_ENCODING_UTF16BE: case XML_CHAR_ENCODING_UTF16BE:
return(xmlUTF16BEHandler); return(xmlUTF16BEHandler);
case XML_CHAR_ENCODING_EBCDIC: case XML_CHAR_ENCODING_EBCDIC:
return(NULL); handler = xmlFindCharEncodingHandler("EBCDIC");
if (handler != NULL) return(handler);
handler = xmlFindCharEncodingHandler("ebcdic");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_UCS4LE: case XML_CHAR_ENCODING_UCS4LE:
return(NULL); handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
if (handler != NULL) return(handler);
handler = xmlFindCharEncodingHandler("UCS-4");
if (handler != NULL) return(handler);
handler = xmlFindCharEncodingHandler("UCS4");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_UCS4BE: case XML_CHAR_ENCODING_UCS4BE:
return(NULL); handler = xmlFindCharEncodingHandler("UCS4BE");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_UCS4_2143: case XML_CHAR_ENCODING_UCS4_2143:
return(NULL); break;
case XML_CHAR_ENCODING_UCS4_3412: case XML_CHAR_ENCODING_UCS4_3412:
return(NULL); break;
case XML_CHAR_ENCODING_UCS2: case XML_CHAR_ENCODING_UCS2:
return(NULL); handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
if (handler != NULL) return(handler);
handler = xmlFindCharEncodingHandler("UCS-2");
if (handler != NULL) return(handler);
handler = xmlFindCharEncodingHandler("UCS2");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_8859_1: case XML_CHAR_ENCODING_8859_1:
return(NULL);
case XML_CHAR_ENCODING_8859_2: case XML_CHAR_ENCODING_8859_2:
return(NULL);
case XML_CHAR_ENCODING_8859_3: case XML_CHAR_ENCODING_8859_3:
return(NULL);
case XML_CHAR_ENCODING_8859_4: case XML_CHAR_ENCODING_8859_4:
return(NULL);
case XML_CHAR_ENCODING_8859_5: case XML_CHAR_ENCODING_8859_5:
return(NULL);
case XML_CHAR_ENCODING_8859_6: case XML_CHAR_ENCODING_8859_6:
return(NULL);
case XML_CHAR_ENCODING_8859_7: case XML_CHAR_ENCODING_8859_7:
return(NULL);
case XML_CHAR_ENCODING_8859_8: case XML_CHAR_ENCODING_8859_8:
return(NULL);
case XML_CHAR_ENCODING_8859_9: case XML_CHAR_ENCODING_8859_9:
return(NULL); return(NULL);
case XML_CHAR_ENCODING_2022_JP: case XML_CHAR_ENCODING_2022_JP:
handler = xmlFindCharEncodingHandler("ISO-2022-JP");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_SHIFT_JIS: case XML_CHAR_ENCODING_SHIFT_JIS:
handler = xmlFindCharEncodingHandler("SHIFT-JIS");
if (handler != NULL) return(handler);
handler = xmlFindCharEncodingHandler("SHIFT_JIS");
if (handler != NULL) return(handler);
handler = xmlFindCharEncodingHandler("Shift_JIS");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_EUC_JP: case XML_CHAR_ENCODING_EUC_JP:
return(NULL); handler = xmlFindCharEncodingHandler("EUC-JP");
if (handler != NULL) return(handler);
break;
default:
break;
} }
#ifdef DEBUG_ENCODING
fprintf(stderr, "No handler found for encoding %d\n", enc);
#endif
return(NULL); return(NULL);
} }
@@ -858,23 +953,306 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
*/ */
xmlCharEncodingHandlerPtr xmlCharEncodingHandlerPtr
xmlFindCharEncodingHandler(const char *name) { xmlFindCharEncodingHandler(const char *name) {
char upper[500]; #ifdef LIBXML_ICONV_ENABLED
iconv_t icv_in, icv_out;
xmlCharEncodingHandlerPtr enc;
#endif /* LIBXML_ICONV_ENABLED */
char upper[100];
int i; int i;
if (handlers == NULL) xmlInitCharEncodingHandlers(); if (handlers == NULL) xmlInitCharEncodingHandlers();
if (name == NULL) return(xmlDefaultCharEncodingHandler); if (name == NULL) return(xmlDefaultCharEncodingHandler);
if (name[0] == 0) return(xmlDefaultCharEncodingHandler); if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
for (i = 0;i < 499;i++) { for (i = 0;i < 99;i++) {
upper[i] = toupper(name[i]); upper[i] = toupper(name[i]);
if (upper[i] == 0) break; if (upper[i] == 0) break;
} }
upper[i] = 0; upper[i] = 0;
for (i = 0;i < nbCharEncodingHandler; i++) for (i = 0;i < nbCharEncodingHandler; i++)
if (!strcmp(name, handlers[i]->name)) if (!strcmp(upper, handlers[i]->name)) {
#ifdef DEBUG_ENCODING
fprintf(stderr, "Found registered handler for encoding %s\n", name);
#endif
return(handlers[i]); return(handlers[i]);
}
#ifdef LIBXML_ICONV_ENABLED
/* check whether iconv can handle this */
icv_in = iconv_open("UTF-8", name);
icv_out = iconv_open(name, "UTF-8");
if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
if (enc == NULL) {
iconv_close(icv_in);
iconv_close(icv_out);
return(NULL);
}
enc->name = NULL;
enc->input = NULL;
enc->output = NULL;
enc->iconv_in = icv_in;
enc->iconv_out = icv_out;
#ifdef DEBUG_ENCODING
fprintf(stderr, "Found iconv handler for encoding %s\n", name);
#endif
return enc;
} else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
}
#endif /* LIBXML_ICONV_ENABLED */
#ifdef DEBUG_ENCODING
fprintf(stderr, "No handler found for encoding %s\n", name);
#endif
return(NULL); return(NULL);
} }
#ifdef LIBXML_ICONV_ENABLED
/**
 * xmlIconvWrapper:
 * @cd:  iconv conversion descriptor
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of bytes in the source encoding of @cd
 * @inlen:  the length of @in
 *
 * Runs a single iconv() call on the given input block.
 *
 * Returns 0 if success, or
 *     -1 by lack of space (E2BIG) or if a required argument is NULL, or
 *     -2 if the transcoding fails (EILSEQ: the input is not valid in the
 *        source encoding or cannot be represented in the target one), or
 *     -3 if the input ends with an incomplete sequence (EINVAL) or for any
 *        other iconv failure.
 *
 * The value of @inlen after return is the number of octets consumed.
 * The value of @outlen after return is the number of octets produced.
 */
static int
xmlIconvWrapper(iconv_t cd,
                unsigned char *out, int *outlen,
                const unsigned char *in, int *inlen) {
    size_t icv_inlen, icv_outlen;
    const char *icv_in = (const char *) in;
    char *icv_out = (char *) out;
    size_t ret;    /* iconv() returns size_t; (size_t) -1 signals an error */

    if ((out == NULL) || (outlen == NULL) || (in == NULL) || (inlen == NULL)) {
        if (outlen != NULL)
            *outlen = 0;
        return(-1);
    }
    icv_inlen = *inlen;
    icv_outlen = *outlen;

    /*
     * The second parameter of iconv() is declared "char **" by POSIX but
     * "const char **" by some implementations; going through "void *"
     * converts implicitly to either prototype.
     */
    ret = iconv(cd,
                (void *) &icv_in, &icv_inlen,
                &icv_out, &icv_outlen);

    /* turn the "bytes left" counters into "bytes consumed/produced" */
    *inlen -= icv_inlen;
    *outlen -= icv_outlen;

    if ((icv_inlen != 0) || (ret == (size_t) -1)) {
#ifdef EILSEQ
        if (errno == EILSEQ) {
            return -2;
        } else
#endif
#ifdef E2BIG
        if (errno == E2BIG) {
            return -1;
        } else
#endif
#ifdef EINVAL
        if (errno == EINVAL) {
            return -3;
        }
#endif
        else {
            return -3;
        }
    }
    return 0;
}
#endif /* LIBXML_ICONV_ENABLED */
/**
 * xmlCharEncInFunc:
 * @handler:  char encoding transformation data structure
 * @out:  an xmlBuffer for the output.
 * @in:  an xmlBuffer for the input
 *
 * Generic front-end for the encoding handler input function: converts
 * as much as possible of @in, appends the result to @out and removes the
 * consumed bytes from @in. @out is kept zero terminated.
 *
 * Returns the value reported by the conversion function (0 for a
 *     successful iconv conversion, and 0 as well when the input merely
 *     stops in the middle of a sequence or when @in is empty), or
 *     -1 general error
 *     -2 if the transcoding fails (the input is not valid in the source
 *        encoding, or the handler provides no input converter)
 */
int
xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                 xmlBufferPtr in) {
    int ret = -2;
    int written;
    int toconv;

    if (handler == NULL) return(-1);
    if (out == NULL) return(-1);
    if (in == NULL) return(-1);

    toconv = in->use;
    if (toconv == 0)
        return(0);    /* nothing to convert */

    /*
     * Always keep one byte of @out in reserve for the terminating 0
     * stored after the conversion, whether or not the buffer is grown.
     */
    written = out->size - out->use - 1;
    if (toconv * 2 >= written) {
        xmlBufferGrow(out, toconv * 2);
        written = out->size - out->use - 1;
        /* presumably only possible if growing failed -- TODO confirm */
        if (written < 0)
            return(-1);
    }
    if (handler->input != NULL) {
        ret = handler->input(&out->content[out->use], &written,
                             in->content, &toconv);
        xmlBufferShrink(in, toconv);
        out->use += written;
        out->content[out->use] = 0;
    }
#ifdef LIBXML_ICONV_ENABLED
    else if (handler->iconv_in != NULL) {
        ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
                              &written, in->content, &toconv);
        xmlBufferShrink(in, toconv);
        out->use += written;
        out->content[out->use] = 0;
        /* lack of output space is not fatal: the rest stays in @in */
        if (ret == -1) ret = -3;
    }
#endif /* LIBXML_ICONV_ENABLED */
#ifdef DEBUG_ENCODING
    switch (ret) {
        case 0:
            fprintf(stderr, "converted %d bytes to %d bytes of input\n",
                    toconv, written);
            break;
        case -1:
            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                    toconv, written, in->use);
            break;
        case -2:
            fprintf(stderr, "input conversion failed due to input error\n");
            break;
        case -3:
            fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
                    toconv, written, in->use);
            break;
        default:
            fprintf(stderr,"Unknown input conversion failed %d\n", ret);
    }
#endif
    /*
     * Ignore when input buffer is not on a boundary
     */
    if (ret == -3) ret = 0;
    return(ret);
}
/**
 * xmlCharEncOutFunc:
 * @handler:  char encoding transformation data structure
 * @out:  an xmlBuffer for the output.
 * @in:  an xmlBuffer for the input
 *
 * Generic front-end for the encoding handler output function: converts
 * as much as possible of @in, appends the result to @out and removes the
 * consumed bytes from @in. @out is kept zero terminated.
 *
 * Returns the value reported by the conversion function (0 for a
 *     successful iconv conversion or when @in is empty), or
 *     -1 general error
 *     -2 if the transcoding fails (the input is not valid UTF-8, the
 *        result cannot be represented in the target encoding, or the
 *        handler provides no output converter), or
 *     -3 if the iconv conversion stopped before consuming all of @in
 *        (lack of output space or incomplete trailing sequence)
 */
int
xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
                  xmlBufferPtr in) {
    int ret = -2;
    int written;
    int toconv;

    if (handler == NULL) return(-1);
    if (out == NULL) return(-1);
    if (in == NULL) return(-1);

    toconv = in->use;
    if (toconv == 0)
        return(0);    /* nothing to convert */

    /*
     * Always keep one byte of @out in reserve for the terminating 0
     * stored after the conversion, whether or not the buffer is grown.
     */
    written = out->size - out->use - 1;
    if (toconv * 2 >= written) {
        xmlBufferGrow(out, toconv * 2);
        written = out->size - out->use - 1;
        /* presumably only possible if growing failed -- TODO confirm */
        if (written < 0)
            return(-1);
    }
    if (handler->output != NULL) {
        ret = handler->output(&out->content[out->use], &written,
                              in->content, &toconv);
        xmlBufferShrink(in, toconv);
        out->use += written;
        out->content[out->use] = 0;
    }
#ifdef LIBXML_ICONV_ENABLED
    else if (handler->iconv_out != NULL) {
        ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
                              &written, in->content, &toconv);
        xmlBufferShrink(in, toconv);
        out->use += written;
        out->content[out->use] = 0;
        /* lack of output space is reported as a partial conversion */
        if (ret == -1) ret = -3;
    }
#endif /* LIBXML_ICONV_ENABLED */
#ifdef DEBUG_ENCODING
    switch (ret) {
        case 0:
            fprintf(stderr, "converted %d bytes to %d bytes of output\n",
                    toconv, written);
            break;
        case -1:
            fprintf(stderr, "output conversion failed by lack of space\n");
            break;
        case -2:
            fprintf(stderr, "output conversion failed due to output error\n");
            break;
        case -3:
            fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
                    toconv, written, in->use);
            break;
        default:
            fprintf(stderr,"Unknown output conversion failed %d\n", ret);
    }
#endif
    return(ret);
}
/**
 * xmlCharEncCloseFunc:
 * @handler:  char encoding transformation data structure
 *
 * Generic front-end for the encoding handler close function.
 * A handler carrying iconv descriptors is single-use: its descriptors
 * are closed and the handler block itself (and its name, if any) is
 * freed, so @handler must not be used afterwards. Other handlers are
 * left untouched.
 *
 * Returns 0 if success, or -1 in case of error
 */
int
xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
    int ret = 0;
    int is_iconv = 0;

    if (handler == NULL) return(-1);
#ifdef LIBXML_ICONV_ENABLED
    is_iconv = (handler->iconv_out != NULL) || (handler->iconv_in != NULL);
#endif /* LIBXML_ICONV_ENABLED */
    /*
     * Handlers built on the fly for iconv have no name (it is set to NULL
     * when they are created), so a missing name is only an error for the
     * other handlers; rejecting it unconditionally would leak the iconv
     * descriptors and the handler block.
     */
    if ((handler->name == NULL) && (!is_iconv)) return(-1);
#ifdef LIBXML_ICONV_ENABLED
    /*
     * Iconv handlers can be used only once, free the whole block
     * and the associated iconv resources.
     */
    if (is_iconv) {
        if (handler->name != NULL)
            xmlFree(handler->name);
        handler->name = NULL;
        if (handler->iconv_out != NULL) {
            if (iconv_close(handler->iconv_out))
                ret = -1;
            handler->iconv_out = NULL;
        }
        if (handler->iconv_in != NULL) {
            if (iconv_close(handler->iconv_in))
                ret = -1;
            handler->iconv_in = NULL;
        }
        xmlFree(handler);
    }
#endif /* LIBXML_ICONV_ENABLED */
#ifdef DEBUG_ENCODING
    if (ret)
        fprintf(stderr, "failed to close the encoding handler\n");
    else
        fprintf(stderr, "closed the encoding handler\n");
#endif
    return(ret);
}

View File

@@ -22,12 +22,30 @@
#define __XML_CHAR_ENCODING_H__ #define __XML_CHAR_ENCODING_H__
#include <libxml/xmlversion.h> #include <libxml/xmlversion.h>
#ifdef LIBXML_ICONV_ENABLED
#include <iconv.h>
#endif
#include <libxml/tree.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/** /**
* Predefined values for some standard encodings * Predefined values for some standard encodings
* Libxml does not do any beforehand translation on UTF8 and ISOLatinX.
* It also supports UTF16 (LE and BE) by default.
*
* Anything else has to be translated to UTF8 before being
* given to the parser itself. The BOM for UTF16 and the encoding
* declaration are looked at and a converter is looked for at that
* point. If none is found the parser stops there, as asked by the XML REC.
* A converter can be registered by the user using
* xmlRegisterCharEncodingHandler, but the current form doesn't allow
* stateful transcoding (a serious problem, agreed!). If iconv has been
* found it will be used automatically and allows stateful transcoding;
* the simplest is then to be sure to enable iconv and to provide the
* iconv libs for the encoding support needed.
*/ */
typedef enum { typedef enum {
XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */ XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */
@@ -65,9 +83,13 @@ typedef enum {
* Take a block of chars in the original encoding and try to convert * Take a block of chars in the original encoding and try to convert
* it to an UTF-8 block of chars out. * it to an UTF-8 block of chars out.
* *
* Returns the number of byte written, or -1 by lack of space. * Returns the number of byte written, or -1 by lack of space, or -2
* if the transcoding failed.
* The value of @inlen after return is the number of octets consumed
* as the return value is positive, else unpredictiable.
* The value of @outlen after return is the number of ocetes consumed.
*/ */
typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen, typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen); const unsigned char* in, int *inlen);
@@ -83,12 +105,17 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
* *
* Returns the number of byte written, or -1 by lack of space, or -2 * Returns the number of byte written, or -1 by lack of space, or -2
* if the transcoding failed. * if the transcoding failed.
* The value of @inlen after return is the number of octets consumed
* as the return value is positive, else unpredictiable.
* The value of @outlen after return is the number of ocetes consumed.
*/ */
typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int outlen, typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen); const unsigned char* in, int *inlen);
/* /*
* Block defining the handlers for non UTF-8 encodings. * Block defining the handlers for non UTF-8 encodings.
* If iconv is supported, there is two extra fields
*/ */
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
@@ -97,6 +124,10 @@ struct _xmlCharEncodingHandler {
char *name; char *name;
xmlCharEncodingInputFunc input; xmlCharEncodingInputFunc input;
xmlCharEncodingOutputFunc output; xmlCharEncodingOutputFunc output;
#ifdef LIBXML_ICONV_ENABLED
iconv_t iconv_in;
iconv_t iconv_out;
#endif /* LIBXML_ICONV_ENABLED */
}; };
void xmlInitCharEncodingHandlers (void); void xmlInitCharEncodingHandlers (void);
@@ -109,6 +140,14 @@ xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc);
xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name); xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name);
int xmlCheckUTF8 (const unsigned char *utf); int xmlCheckUTF8 (const unsigned char *utf);
int xmlCharEncOutFunc (xmlCharEncodingHandler *handler,
xmlBufferPtr out,
xmlBufferPtr in);
int xmlCharEncInFunc (xmlCharEncodingHandler *handler,
xmlBufferPtr out,
xmlBufferPtr in);
int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@@ -22,12 +22,30 @@
#define __XML_CHAR_ENCODING_H__ #define __XML_CHAR_ENCODING_H__
#include <libxml/xmlversion.h> #include <libxml/xmlversion.h>
#ifdef LIBXML_ICONV_ENABLED
#include <iconv.h>
#endif
#include <libxml/tree.h>
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
#endif #endif
/** /**
* Predefined values for some standard encodings * Predefined values for some standard encodings
* Libxml does not do any beforehand translation on UTF8 and ISOLatinX.
* It also supports UTF16 (LE and BE) by default.
*
* Anything else has to be translated to UTF8 before being
* given to the parser itself. The BOM for UTF16 and the encoding
* declaration are looked at and a converter is looked for at that
* point. If none is found the parser stops there, as asked by the XML REC.
* A converter can be registered by the user using
* xmlRegisterCharEncodingHandler, but the current form doesn't allow
* stateful transcoding (a serious problem, agreed!). If iconv has been
* found it will be used automatically and allows stateful transcoding;
* the simplest is then to be sure to enable iconv and to provide the
* iconv libs for the encoding support needed.
*/ */
typedef enum { typedef enum {
XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */ XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */
@@ -65,9 +83,13 @@ typedef enum {
* Take a block of chars in the original encoding and try to convert * Take a block of chars in the original encoding and try to convert
* it to an UTF-8 block of chars out. * it to an UTF-8 block of chars out.
* *
* Returns the number of byte written, or -1 by lack of space. * Returns the number of byte written, or -1 by lack of space, or -2
* if the transcoding failed.
* The value of @inlen after return is the number of octets consumed
* as the return value is positive, else unpredictiable.
* The value of @outlen after return is the number of ocetes consumed.
*/ */
typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen, typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen); const unsigned char* in, int *inlen);
@@ -83,12 +105,17 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
* *
* Returns the number of byte written, or -1 by lack of space, or -2 * Returns the number of byte written, or -1 by lack of space, or -2
* if the transcoding failed. * if the transcoding failed.
* The value of @inlen after return is the number of octets consumed
* as the return value is positive, else unpredictiable.
* The value of @outlen after return is the number of ocetes consumed.
*/ */
typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int outlen, typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen); const unsigned char* in, int *inlen);
/* /*
* Block defining the handlers for non UTF-8 encodings. * Block defining the handlers for non UTF-8 encodings.
* If iconv is supported, there is two extra fields
*/ */
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
@@ -97,6 +124,10 @@ struct _xmlCharEncodingHandler {
char *name; char *name;
xmlCharEncodingInputFunc input; xmlCharEncodingInputFunc input;
xmlCharEncodingOutputFunc output; xmlCharEncodingOutputFunc output;
#ifdef LIBXML_ICONV_ENABLED
iconv_t iconv_in;
iconv_t iconv_out;
#endif /* LIBXML_ICONV_ENABLED */
}; };
void xmlInitCharEncodingHandlers (void); void xmlInitCharEncodingHandlers (void);
@@ -109,6 +140,14 @@ xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc);
xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name); xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name);
int xmlCheckUTF8 (const unsigned char *utf); int xmlCheckUTF8 (const unsigned char *utf);
int xmlCharEncOutFunc (xmlCharEncodingHandler *handler,
xmlBufferPtr out,
xmlBufferPtr in);
int xmlCharEncInFunc (xmlCharEncodingHandler *handler,
xmlBufferPtr out,
xmlBufferPtr in);
int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@@ -28,10 +28,10 @@ extern "C" {
* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
*/ */
#define IS_CHAR(c) \ #define IS_CHAR(c) \
((((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \ (((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \
(((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) && \ (((c) >= 0x20) && ((c) <= 0xD7FF)) || \
(((c) <= 0xD7FF) || ((c) >= 0xE000)) && ((c) >= 0) && \ (((c) >= 0xE000) && ((c) <= 0xFFFD)) || \
((c) <= 0x10FFFF)) (((c) >= 0x10000) && ((c) <= 0x10FFFF)))
/* /*
* [3] S ::= (#x20 | #x9 | #xD | #xA)+ * [3] S ::= (#x20 | #x9 | #xD | #xA)+
@@ -442,8 +442,10 @@ xmlParserCtxtPtr xmlNewParserCtxt (void);
xmlParserCtxtPtr xmlCreateEntityParserCtxt(const xmlChar *URL, xmlParserCtxtPtr xmlCreateEntityParserCtxt(const xmlChar *URL,
const xmlChar *ID, const xmlChar *ID,
const xmlChar *base); const xmlChar *base);
void xmlSwitchEncoding (xmlParserCtxtPtr ctxt, int xmlSwitchEncoding (xmlParserCtxtPtr ctxt,
xmlCharEncoding enc); xmlCharEncoding enc);
int xmlSwitchToEncoding (xmlParserCtxtPtr ctxt,
xmlCharEncodingHandlerPtr handler);
void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt); void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt);
/** /**

View File

@@ -380,6 +380,8 @@ void xmlBufferCCat (xmlBufferPtr buf,
const char *str); const char *str);
int xmlBufferShrink (xmlBufferPtr buf, int xmlBufferShrink (xmlBufferPtr buf,
int len); int len);
int xmlBufferGrow (xmlBufferPtr buf,
int len);
void xmlBufferEmpty (xmlBufferPtr buf); void xmlBufferEmpty (xmlBufferPtr buf);
const xmlChar* xmlBufferContent (const xmlBufferPtr buf); const xmlChar* xmlBufferContent (const xmlBufferPtr buf);
int xmlBufferUse (const xmlBufferPtr buf); int xmlBufferUse (const xmlBufferPtr buf);

View File

@@ -33,6 +33,7 @@ struct _xmlParserInputBuffer {
xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */ xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */
xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 */ xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 */
xmlBufferPtr raw; /* if encoder != NULL buffer for raw input */
}; };

556
parser.c
View File

@@ -41,6 +41,7 @@
#include <libxml/valid.h> #include <libxml/valid.h>
#include <libxml/parserInternals.h> #include <libxml/parserInternals.h>
#include <libxml/xmlIO.h> #include <libxml/xmlIO.h>
#include <libxml/uri.h>
#include "xml-error.h" #include "xml-error.h"
#define XML_PARSER_BIG_BUFFER_SIZE 1000 #define XML_PARSER_BIG_BUFFER_SIZE 1000
@@ -483,7 +484,7 @@ xmlNextChar(xmlParserCtxtPtr ctxt) {
if ((ctxt->sax != NULL) && if ((ctxt->sax != NULL) &&
(ctxt->sax->error != NULL)) (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, ctxt->sax->error(ctxt->userData,
"Char out of allowed range\n"); "Char 0x%X out of allowed range\n", val);
ctxt->errNo = XML_ERR_INVALID_ENCODING; ctxt->errNo = XML_ERR_INVALID_ENCODING;
ctxt->wellFormed = 0; ctxt->wellFormed = 0;
ctxt->disableSAX = 1; ctxt->disableSAX = 1;
@@ -612,7 +613,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
if ((ctxt->sax != NULL) && if ((ctxt->sax != NULL) &&
(ctxt->sax->error != NULL)) (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, ctxt->sax->error(ctxt->userData,
"Char out of allowed range\n"); "Char 0x%X out of allowed range\n", val);
ctxt->errNo = XML_ERR_INVALID_ENCODING; ctxt->errNo = XML_ERR_INVALID_ENCODING;
ctxt->wellFormed = 0; ctxt->wellFormed = 0;
ctxt->disableSAX = 1; ctxt->disableSAX = 1;
@@ -727,7 +728,7 @@ xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar *cur, int *len) {
if ((ctxt->sax != NULL) && if ((ctxt->sax != NULL) &&
(ctxt->sax->error != NULL)) (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, ctxt->sax->error(ctxt->userData,
"Char out of allowed range\n"); "Char 0x%X out of allowed range\n", val);
ctxt->errNo = XML_ERR_INVALID_ENCODING; ctxt->errNo = XML_ERR_INVALID_ENCODING;
ctxt->wellFormed = 0; ctxt->wellFormed = 0;
ctxt->disableSAX = 1; ctxt->disableSAX = 1;
@@ -2278,155 +2279,14 @@ xmlCheckLanguageID(const xmlChar *lang) {
* *
* change the input functions when discovering the character encoding * change the input functions when discovering the character encoding
* of a given entity. * of a given entity.
*
* Returns 0 in case of success, -1 otherwise
*/ */
void int
xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
{ {
xmlCharEncodingHandlerPtr handler; xmlCharEncodingHandlerPtr handler;
handler = xmlGetCharEncodingHandler(enc);
if (handler != NULL) {
if (ctxt->input != NULL) {
if (ctxt->input->buf != NULL) {
if (ctxt->input->buf->encoder != NULL) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : encoder already regitered\n");
return;
}
ctxt->input->buf->encoder = handler;
/*
* Is there already some content down the pipe to convert
*/
if ((ctxt->input->buf->buffer != NULL) &&
(ctxt->input->buf->buffer->use > 0)) {
xmlChar *buf;
int res, len, size;
int processed;
/*
* Specific handling of the Byte Order Mark for
* UTF-16
*/
if ((enc == XML_CHAR_ENCODING_UTF16LE) &&
(ctxt->input->cur[0] == 0xFF) &&
(ctxt->input->cur[1] == 0xFE)) {
SKIP(2);
}
if ((enc == XML_CHAR_ENCODING_UTF16BE) &&
(ctxt->input->cur[0] == 0xFE) &&
(ctxt->input->cur[1] == 0xFF)) {
SKIP(2);
}
/*
* convert the non processed part
*/
processed = ctxt->input->cur - ctxt->input->base;
len = ctxt->input->buf->buffer->use - processed;
if (len <= 0) {
return;
}
size = ctxt->input->buf->buffer->use * 4;
if (size < 4000)
size = 4000;
retry_larger:
buf = (xmlChar *) xmlMalloc(size + 1);
if (buf == NULL) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : out of memory\n");
return;
}
/* TODO !!! Handling of buf too small */
res = handler->input(buf, size, ctxt->input->cur, &len);
if (res == -1) {
size *= 2;
xmlFree(buf);
goto retry_larger;
}
if ((res < 0) ||
(len != ctxt->input->buf->buffer->use - processed)) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : conversion failed\n");
xmlFree(buf);
return;
}
/*
* Conversion succeeded, get rid of the old buffer
*/
xmlFree(ctxt->input->buf->buffer->content);
ctxt->input->buf->buffer->content = buf;
ctxt->input->base = buf;
ctxt->input->cur = buf;
ctxt->input->buf->buffer->size = size;
ctxt->input->buf->buffer->use = res;
buf[res] = 0;
}
return;
} else {
if (ctxt->input->length == 0) {
/*
* When parsing a static memory array one must know the
* size to be able to convert the buffer.
*/
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : no input\n");
return;
} else {
xmlChar *buf;
int res, len;
int processed = ctxt->input->cur - ctxt->input->base;
/*
* convert the non processed part
*/
len = ctxt->input->length - processed;
if (len <= 0) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : input fully consumed?\n");
return;
}
buf = (xmlChar *) xmlMalloc(ctxt->input->length * 4);
if (buf == NULL) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : out of memory\n");
return;
}
res = handler->input(buf, ctxt->input->length * 4,
ctxt->input->cur, &len);
if ((res < 0) ||
(len != ctxt->input->length - processed)) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : conversion failed\n");
xmlFree(buf);
return;
}
/*
* Conversion succeeded, get rid of the old buffer
*/
if ((ctxt->input->free != NULL) &&
(ctxt->input->base != NULL))
ctxt->input->free((xmlChar *) ctxt->input->base);
ctxt->input->base = ctxt->input->cur = buf;
ctxt->input->length = res;
}
}
} else {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"xmlSwitchEncoding : no input\n");
}
}
switch (enc) { switch (enc) {
case XML_CHAR_ENCODING_ERROR: case XML_CHAR_ENCODING_ERROR:
ctxt->errNo = XML_ERR_UNKNOWN_ENCODING; ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
@@ -2437,21 +2297,35 @@ retry_larger:
break; break;
case XML_CHAR_ENCODING_NONE: case XML_CHAR_ENCODING_NONE:
/* let's assume it's UTF-8 without the XML decl */ /* let's assume it's UTF-8 without the XML decl */
return; return(0);
case XML_CHAR_ENCODING_UTF8: case XML_CHAR_ENCODING_UTF8:
/* default encoding, no conversion should be needed */ /* default encoding, no conversion should be needed */
return; return(0);
case XML_CHAR_ENCODING_UTF16LE: default:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; break;
}
handler = xmlGetCharEncodingHandler(enc);
if (handler == NULL) {
/*
* Default handlers.
*/
switch (enc) {
case XML_CHAR_ENCODING_ERROR:
ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, ctxt->sax->error(ctxt->userData, "encoding unknown\n");
"char encoding UTF16 little endian not supported\n"); ctxt->wellFormed = 0;
ctxt->disableSAX = 1;
break;
case XML_CHAR_ENCODING_NONE:
/* let's assume it's UTF-8 without the XML decl */
return(0);
case XML_CHAR_ENCODING_UTF8:
/* default encoding, no conversion should be needed */
return(0);
case XML_CHAR_ENCODING_UTF16LE:
break; break;
case XML_CHAR_ENCODING_UTF16BE: case XML_CHAR_ENCODING_UTF16BE:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"char encoding UTF16 big endian not supported\n");
break; break;
case XML_CHAR_ENCODING_UCS4LE: case XML_CHAR_ENCODING_UCS4LE:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
@@ -2490,59 +2364,23 @@ retry_larger:
"char encoding UCS2 not supported\n"); "char encoding UCS2 not supported\n");
break; break;
case XML_CHAR_ENCODING_8859_1: case XML_CHAR_ENCODING_8859_1:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"char encoding ISO_8859_1 ISO Latin 1 not supported\n");
break;
case XML_CHAR_ENCODING_8859_2: case XML_CHAR_ENCODING_8859_2:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"char encoding ISO_8859_2 ISO Latin 2 not supported\n");
break;
case XML_CHAR_ENCODING_8859_3: case XML_CHAR_ENCODING_8859_3:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"char encoding ISO_8859_3 not supported\n");
break;
case XML_CHAR_ENCODING_8859_4: case XML_CHAR_ENCODING_8859_4:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"char encoding ISO_8859_4 not supported\n");
break;
case XML_CHAR_ENCODING_8859_5: case XML_CHAR_ENCODING_8859_5:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"char encoding ISO_8859_5 not supported\n");
break;
case XML_CHAR_ENCODING_8859_6: case XML_CHAR_ENCODING_8859_6:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"char encoding ISO_8859_6 not supported\n");
break;
case XML_CHAR_ENCODING_8859_7: case XML_CHAR_ENCODING_8859_7:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"char encoding ISO_8859_7 not supported\n");
break;
case XML_CHAR_ENCODING_8859_8: case XML_CHAR_ENCODING_8859_8:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"char encoding ISO_8859_8 not supported\n");
break;
case XML_CHAR_ENCODING_8859_9: case XML_CHAR_ENCODING_8859_9:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; /*
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) * Keep the internal content in the document encoding
ctxt->sax->error(ctxt->userData, */
"char encoding ISO_8859_9 not supported\n"); if ((ctxt->inputNr == 1) &&
break; (ctxt->encoding == NULL) &&
(ctxt->input->encoding != NULL)) {
ctxt->encoding = xmlStrdup(ctxt->input->encoding);
}
return(0);
case XML_CHAR_ENCODING_2022_JP: case XML_CHAR_ENCODING_2022_JP:
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
@@ -2563,6 +2401,150 @@ retry_larger:
break; break;
} }
} }
if (handler == NULL)
return(-1);
return(xmlSwitchToEncoding(ctxt, handler));
}
/**
 * xmlSwitchToEncoding:
 * @ctxt:  the parser context
 * @handler:  the encoding handler
 *
 * Change the input functions when discovering the character encoding
 * of a given entity: @handler is installed as the encoder of the current
 * input buffer and whatever unparsed content is already in that buffer is
 * moved to the raw buffer and converted into a fresh parsing buffer.
 *
 * An input without an I/O buffer (ctxt->input->buf == NULL) cannot be
 * switched, since there is no buffer structure to hold the encoder and
 * the raw data; this is reported as an error.
 *
 * Returns 0 in case of success, -1 otherwise
 */
int
xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
{
    int nbchars;
    int processed;

    if ((ctxt == NULL) || (handler == NULL))
        return(-1);

    /*
     * Both a missing input and an input lacking an I/O buffer are
     * rejected; the latter must never be dereferenced to store the
     * encoder or the raw data.
     */
    if ((ctxt->input == NULL) || (ctxt->input->buf == NULL)) {
        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
            ctxt->sax->error(ctxt->userData,
                             "xmlSwitchEncoding : no input\n");
        return(-1);
    }

    if (ctxt->input->buf->encoder != NULL) {
        /* switching again to the encoder already in place is a no-op */
        if (ctxt->input->buf->encoder == handler)
            return(0);
        if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
            ctxt->sax->error(ctxt->userData,
                     "xmlSwitchEncoding : encoder already regitered\n");
        return(-1);
    }
    ctxt->input->buf->encoder = handler;

    /*
     * Is there already some content down the pipe to convert ?
     */
    if ((ctxt->input->buf->buffer != NULL) &&
        (ctxt->input->buf->buffer->use > 0)) {
        /*
         * Specific handling of the Byte Order Mark for
         * UTF-16
         */
        if ((handler->name != NULL) &&
            (!strcmp(handler->name, "UTF-16LE")) &&
            (ctxt->input->cur[0] == 0xFF) &&
            (ctxt->input->cur[1] == 0xFE)) {
            ctxt->input->cur += 2;
        }
        if ((handler->name != NULL) &&
            (!strcmp(handler->name, "UTF-16BE")) &&
            (ctxt->input->cur[0] == 0xFE) &&
            (ctxt->input->cur[1] == 0xFF)) {
            ctxt->input->cur += 2;
        }

        /*
         * Shrink the current input buffer.
         * Move it as the raw buffer and create a new input buffer
         */
        processed = ctxt->input->cur - ctxt->input->base;
        xmlBufferShrink(ctxt->input->buf->buffer, processed);
        ctxt->input->buf->raw = ctxt->input->buf->buffer;
        ctxt->input->buf->buffer = xmlBufferCreate();

        /*
         * convert as much as possible of the raw input
         * to the parser reading buffer.
         */
        nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
                                   ctxt->input->buf->buffer,
                                   ctxt->input->buf->raw);
        if (nbchars < 0) {
            fprintf(stderr, "xmlSwitchToEncoding: encoder error\n");
            return(-1);
        }
        /* the parser now reads from the converted buffer */
        ctxt->input->base =
        ctxt->input->cur = ctxt->input->buf->buffer->content;
    }
    return(0);
}
/************************************************************************ /************************************************************************
* * * *
@@ -4253,7 +4235,7 @@ xmlParseExternalID(xmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
void void
xmlParseComment(xmlParserCtxtPtr ctxt) { xmlParseComment(xmlParserCtxtPtr ctxt) {
xmlChar *buf = NULL; xmlChar *buf = NULL;
int len = 0; int len;
int size = XML_PARSER_BUFFER_SIZE; int size = XML_PARSER_BUFFER_SIZE;
int q, ql; int q, ql;
int r, rl; int r, rl;
@@ -4282,10 +4264,11 @@ xmlParseComment(xmlParserCtxtPtr ctxt) {
r = CUR_CHAR(rl); r = CUR_CHAR(rl);
NEXTL(rl); NEXTL(rl);
cur = CUR_CHAR(l); cur = CUR_CHAR(l);
len = 0;
while (IS_CHAR(cur) && while (IS_CHAR(cur) &&
((cur != '>') || ((cur != '>') ||
(r != '-') || (q != '-'))) { (r != '-') || (q != '-'))) {
if ((r == '-') && (q == '-')) { if ((r == '-') && (q == '-') && (len > 1)) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, ctxt->sax->error(ctxt->userData,
"Comment must not contain '--' (double-hyphen)`\n"); "Comment must not contain '--' (double-hyphen)`\n");
@@ -4732,12 +4715,37 @@ xmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
ctxt->disableSAX = 1; ctxt->disableSAX = 1;
} }
if (URI) { if (URI) {
xmlURIPtr uri;
uri = xmlParseURI((const char *) URI);
if (uri == NULL) {
if ((ctxt->sax != NULL) && if ((ctxt->sax != NULL) &&
(!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL)) (!ctxt->disableSAX) &&
(ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Invalid URI: %s\n", URI);
ctxt->wellFormed = 0;
ctxt->errNo = XML_ERR_INVALID_URI;
} else {
if (uri->fragment != NULL) {
if ((ctxt->sax != NULL) &&
(!ctxt->disableSAX) &&
(ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Fragment not allowed: %s\n", URI);
ctxt->wellFormed = 0;
ctxt->errNo = XML_ERR_URI_FRAGMENT;
} else {
if ((ctxt->sax != NULL) &&
(!ctxt->disableSAX) &&
(ctxt->sax->entityDecl != NULL))
ctxt->sax->entityDecl(ctxt->userData, name, ctxt->sax->entityDecl(ctxt->userData, name,
XML_EXTERNAL_PARAMETER_ENTITY, XML_EXTERNAL_PARAMETER_ENTITY,
literal, URI, NULL); literal, URI, NULL);
} }
xmlFreeURI(uri);
}
}
} }
} else { } else {
if ((RAW == '"') || (RAW == '\'')) { if ((RAW == '"') || (RAW == '\'')) {
@@ -4757,6 +4765,31 @@ xmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
ctxt->wellFormed = 0; ctxt->wellFormed = 0;
ctxt->disableSAX = 1; ctxt->disableSAX = 1;
} }
if (URI) {
xmlURIPtr uri;
uri = xmlParseURI((const char *)URI);
if (uri == NULL) {
if ((ctxt->sax != NULL) &&
(!ctxt->disableSAX) &&
(ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Invalid URI: %s\n", URI);
ctxt->wellFormed = 0;
ctxt->errNo = XML_ERR_INVALID_URI;
} else {
if (uri->fragment != NULL) {
if ((ctxt->sax != NULL) &&
(!ctxt->disableSAX) &&
(ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Fragment not allowed: %s\n", URI);
ctxt->wellFormed = 0;
ctxt->errNo = XML_ERR_URI_FRAGMENT;
}
xmlFreeURI(uri);
}
}
if ((RAW != '>') && (!IS_BLANK(CUR))) { if ((RAW != '>') && (!IS_BLANK(CUR))) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, ctxt->sax->error(ctxt->userData,
@@ -5973,7 +6006,20 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
/* /*
* We know that '<?xml' is here. * We know that '<?xml' is here.
*/ */
if ((RAW == '<') && (NXT(1) == '?') &&
(NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
SKIP(5); SKIP(5);
} else {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData,
"Text declaration '<?xml' required\n");
ctxt->errNo = XML_ERR_XMLDECL_NOT_STARTED;
ctxt->wellFormed = 0;
ctxt->disableSAX = 1;
return;
}
if (!IS_BLANK(CUR)) { if (!IS_BLANK(CUR)) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
@@ -6003,7 +6049,13 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
ctxt->wellFormed = 0; ctxt->wellFormed = 0;
ctxt->disableSAX = 1; ctxt->disableSAX = 1;
} }
ctxt->input->encoding = xmlParseEncodingDecl(ctxt); xmlParseEncodingDecl(ctxt);
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
/*
* The XML REC instructs us to stop parsing right here
*/
return;
}
SKIP_BLANKS; SKIP_BLANKS;
if ((RAW == '?') && (NXT(1) == '>')) { if ((RAW == '?') && (NXT(1) == '>')) {
@@ -6192,6 +6244,13 @@ xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID,
(NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l')) { (NXT(4) == 'l')) {
xmlParseTextDecl(ctxt); xmlParseTextDecl(ctxt);
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
/*
* The XML REC instructs us to stop parsing right here
*/
ctxt->instate = XML_PARSER_EOF;
return;
}
} }
if (ctxt->myDoc == NULL) { if (ctxt->myDoc == NULL) {
ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0"); ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0");
@@ -6441,6 +6500,13 @@ xmlParseReference(xmlParserCtxtPtr ctxt) {
(NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) { (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
xmlParseTextDecl(ctxt); xmlParseTextDecl(ctxt);
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
/*
* The XML REC instructs us to stop parsing right here
*/
ctxt->instate = XML_PARSER_EOF;
return;
}
if (input->standalone) { if (input->standalone) {
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
ctxt->sax->error(ctxt->userData, ctxt->sax->error(ctxt->userData,
@@ -6947,6 +7013,15 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt) {
(NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) { (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
xmlParseTextDecl(ctxt); xmlParseTextDecl(ctxt);
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
/*
* The XML REC instructs us to stop parsing
* right here
*/
ctxt->instate = XML_PARSER_EOF;
xmlFree(name);
return;
}
} }
if (ctxt->token == 0) if (ctxt->token == 0)
ctxt->token = ' '; ctxt->token = ' ';
@@ -8197,6 +8272,38 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
ctxt->disableSAX = 1; ctxt->disableSAX = 1;
ctxt->errNo = XML_ERR_STRING_NOT_STARTED; ctxt->errNo = XML_ERR_STRING_NOT_STARTED;
} }
if (encoding != NULL) {
xmlCharEncoding enc;
xmlCharEncodingHandlerPtr handler;
if (ctxt->input->encoding != NULL)
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = encoding;
enc = xmlParseCharEncoding((const char *) encoding);
/*
* registered set of known encodings
*/
if (enc != XML_CHAR_ENCODING_ERROR) {
xmlSwitchEncoding(ctxt, enc);
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
xmlFree(encoding);
return(NULL);
}
} else {
/*
* fallback for unknown encodings
*/
handler = xmlFindCharEncodingHandler((const char *) encoding);
if (handler != NULL) {
xmlSwitchToEncoding(ctxt, handler);
} else {
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
xmlFree(encoding);
return(NULL);
}
}
}
} }
return(encoding); return(encoding);
} }
@@ -8362,7 +8469,13 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
ctxt->wellFormed = 0; ctxt->wellFormed = 0;
ctxt->disableSAX = 1; ctxt->disableSAX = 1;
} }
ctxt->input->encoding = xmlParseEncodingDecl(ctxt); xmlParseEncodingDecl(ctxt);
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
/*
* The XML REC instructs us to stop parsing right here
*/
return;
}
/* /*
* We may have the standalone status. * We may have the standalone status.
@@ -8489,12 +8602,19 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
if ((RAW == '<') && (NXT(1) == '?') && if ((RAW == '<') && (NXT(1) == '?') &&
(NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) { (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
/*
* Note that we will switch encoding on the fly.
*/
xmlParseXMLDecl(ctxt); xmlParseXMLDecl(ctxt);
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
/*
* The XML REC instructs us to stop parsing right here
*/
return(-1);
}
ctxt->standalone = ctxt->input->standalone; ctxt->standalone = ctxt->input->standalone;
SKIP_BLANKS; SKIP_BLANKS;
if ((ctxt->encoding == NULL) && (ctxt->input->encoding != NULL))
ctxt->encoding = xmlStrdup(ctxt->input->encoding);
} else { } else {
ctxt->version = xmlCharStrdup(XML_DEFAULT_VERSION); ctxt->version = xmlCharStrdup(XML_DEFAULT_VERSION);
} }
@@ -8581,14 +8701,6 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
(!ctxt->disableSAX)) (!ctxt->disableSAX))
ctxt->sax->endDocument(ctxt->userData); ctxt->sax->endDocument(ctxt->userData);
/*
* Grab the encoding if it was added on-the-fly
*/
if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) &&
(ctxt->myDoc->encoding == NULL)) {
ctxt->myDoc->encoding = ctxt->encoding;
ctxt->encoding = NULL;
}
if (! ctxt->wellFormed) return(-1); if (! ctxt->wellFormed) return(-1);
return(0); return(0);
} }
@@ -8805,6 +8917,14 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
fprintf(stderr, "PP: Parsing XML Decl\n"); fprintf(stderr, "PP: Parsing XML Decl\n");
#endif #endif
xmlParseXMLDecl(ctxt); xmlParseXMLDecl(ctxt);
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
/*
* The XML REC instructs us to stop parsing right
* here
*/
ctxt->instate = XML_PARSER_EOF;
return(0);
}
ctxt->standalone = ctxt->input->standalone; ctxt->standalone = ctxt->input->standalone;
if ((ctxt->encoding == NULL) && if ((ctxt->encoding == NULL) &&
(ctxt->input->encoding != NULL)) (ctxt->input->encoding != NULL))

View File

@@ -28,10 +28,10 @@ extern "C" {
* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
*/ */
#define IS_CHAR(c) \ #define IS_CHAR(c) \
((((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \ (((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \
(((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) && \ (((c) >= 0x20) && ((c) <= 0xD7FF)) || \
(((c) <= 0xD7FF) || ((c) >= 0xE000)) && ((c) >= 0) && \ (((c) >= 0xE000) && ((c) <= 0xFFFD)) || \
((c) <= 0x10FFFF)) (((c) >= 0x10000) && ((c) <= 0x10FFFF)))
/* /*
* [3] S ::= (#x20 | #x9 | #xD | #xA)+ * [3] S ::= (#x20 | #x9 | #xD | #xA)+
@@ -442,8 +442,10 @@ xmlParserCtxtPtr xmlNewParserCtxt (void);
xmlParserCtxtPtr xmlCreateEntityParserCtxt(const xmlChar *URL, xmlParserCtxtPtr xmlCreateEntityParserCtxt(const xmlChar *URL,
const xmlChar *ID, const xmlChar *ID,
const xmlChar *base); const xmlChar *base);
void xmlSwitchEncoding (xmlParserCtxtPtr ctxt, int xmlSwitchEncoding (xmlParserCtxtPtr ctxt,
xmlCharEncoding enc); xmlCharEncoding enc);
int xmlSwitchToEncoding (xmlParserCtxtPtr ctxt,
xmlCharEncodingHandlerPtr handler);
void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt); void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt);
/** /**

25
tree.c
View File

@@ -3771,6 +3771,31 @@ xmlBufferShrink(xmlBufferPtr buf, int len) {
return(len); return(len);
} }
/**
 * xmlBufferGrow:
 * @buf:  the buffer
 * @len:  the minimum free size to allocate
 *
 * Grow the available space of an XML buffer so that at least @len more
 * bytes can be stored after the content already in use.
 *
 * Returns the new available space after a reallocation, 0 if nothing had
 * to be done (the buffer already has enough room or @len is not positive),
 * or -1 in case of error (NULL buffer or allocation failure).
 */
int
xmlBufferGrow(xmlBufferPtr buf, int len) {
    int size;
    xmlChar *newbuf;

    if (buf == NULL) return(-1);
    if (len <= 0) return(0);

    /*
     * The request is expressed in free bytes, so it has to be checked
     * against the room left behind the used part, not against the
     * amount of data already stored.
     */
    if (buf->use + len < buf->size) return(0);

    size = buf->size + buf->use + len + 100;
    /* keep the old block referenced until the reallocation succeeded */
    newbuf = xmlRealloc(buf->content, size);
    if (newbuf == NULL) return(-1);
    buf->content = newbuf;
    buf->size = size;
    return(buf->size - buf->use);
}
/** /**
* xmlBufferDump: * xmlBufferDump:
* @file: the file output * @file: the file output

2
tree.h
View File

@@ -380,6 +380,8 @@ void xmlBufferCCat (xmlBufferPtr buf,
const char *str); const char *str);
int xmlBufferShrink (xmlBufferPtr buf, int xmlBufferShrink (xmlBufferPtr buf,
int len); int len);
int xmlBufferGrow (xmlBufferPtr buf,
int len);
void xmlBufferEmpty (xmlBufferPtr buf); void xmlBufferEmpty (xmlBufferPtr buf);
const xmlChar* xmlBufferContent (const xmlBufferPtr buf); const xmlChar* xmlBufferContent (const xmlBufferPtr buf);
int xmlBufferUse (const xmlBufferPtr buf); int xmlBufferUse (const xmlBufferPtr buf);

28
uri.c
View File

@@ -1283,6 +1283,34 @@ xmlParseURIReference(xmlURIPtr uri, const char *str) {
return(0); return(0);
} }
/**
 * xmlParseURI:
 * @str:  the URI string to analyze
 *
 * Build a new URI structure and fill it by parsing @str as an
 * URI reference:
 *
 * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
 *
 * Returns the newly allocated xmlURIPtr, or NULL if @str is NULL, if the
 * structure could not be created or if the string failed to parse. The
 * result is released with xmlFreeURI().
 */
xmlURIPtr
xmlParseURI(const char *str) {
    xmlURIPtr parsed;

    if (str == NULL)
        return(NULL);

    parsed = xmlCreateURI();
    if (parsed == NULL)
        return(NULL);

    /* a non-zero status means the string is not a valid URI reference */
    if (xmlParseURIReference(parsed, str)) {
        xmlFreeURI(parsed);
        return(NULL);
    }
    return(parsed);
}
/** /**
* xmlNormalizeURIPath: * xmlNormalizeURIPath:
* @path: pointer to the path string * @path: pointer to the path string

View File

@@ -130,7 +130,9 @@ typedef enum {
XML_ERR_ENTITY_CHAR_ERROR, /* 88 */ XML_ERR_ENTITY_CHAR_ERROR, /* 88 */
XML_ERR_ENTITY_PE_INTERNAL, /* 88 */ XML_ERR_ENTITY_PE_INTERNAL, /* 88 */
XML_ERR_ENTITY_LOOP, /* 89 */ XML_ERR_ENTITY_LOOP, /* 89 */
XML_ERR_ENTITY_BOUNDARY /* 90 */ XML_ERR_ENTITY_BOUNDARY, /* 90 */
XML_ERR_INVALID_URI, /* 91 */
XML_ERR_URI_FRAGMENT /* 92 */
}xmlParserErrors; }xmlParserErrors;
void xmlParserError (void *ctx, void xmlParserError (void *ctx,

91
xmlIO.c
View File

@@ -498,6 +498,10 @@ xmlAllocParserInputBuffer(xmlCharEncoding enc) {
} }
ret->buffer->alloc = XML_BUFFER_ALLOC_DOUBLEIT; ret->buffer->alloc = XML_BUFFER_ALLOC_DOUBLEIT;
ret->encoder = xmlGetCharEncodingHandler(enc); ret->encoder = xmlGetCharEncodingHandler(enc);
if (ret->encoder != NULL)
ret->raw = xmlBufferCreate();
else
ret->raw = NULL;
ret->readcallback = NULL; ret->readcallback = NULL;
ret->closecallback = NULL; ret->closecallback = NULL;
ret->context = NULL; ret->context = NULL;
@@ -513,13 +517,20 @@ xmlAllocParserInputBuffer(xmlCharEncoding enc) {
*/ */
void void
xmlFreeParserInputBuffer(xmlParserInputBufferPtr in) { xmlFreeParserInputBuffer(xmlParserInputBufferPtr in) {
if (in->buffer != NULL) { if (in->raw) {
xmlBufferFree(in->buffer); xmlBufferFree(in->raw);
in->buffer = NULL; in->raw = NULL;
}
if (in->encoder != NULL) {
xmlCharEncCloseFunc(in->encoder);
} }
if (in->closecallback != NULL) { if (in->closecallback != NULL) {
in->closecallback(in->context); in->closecallback(in->context);
} }
if (in->buffer != NULL) {
xmlBufferFree(in->buffer);
in->buffer = NULL;
}
memset(in, 0xbe, (size_t) sizeof(xmlParserInputBuffer)); memset(in, 0xbe, (size_t) sizeof(xmlParserInputBuffer));
xmlFree(in); xmlFree(in);
@@ -683,34 +694,22 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, const char *buf) {
if (len < 0) return(0); if (len < 0) return(0);
if (in->encoder != NULL) { if (in->encoder != NULL) {
xmlChar *buffer;
int processed = len;
buffer = (xmlChar *) xmlMalloc((len + 1) * 2 * sizeof(xmlChar));
if (buffer == NULL) {
fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n");
return(-1);
}
nbchars = in->encoder->input(buffer, (len + 1) * 2 * sizeof(xmlChar),
(xmlChar *) buf, &processed);
/* /*
* TODO : we really need to have something atomic or the * Store the data in the incoming raw buffer
* encoder must report the number of bytes read
*/ */
if (in->raw == NULL) {
in->raw = xmlBufferCreate();
}
xmlBufferAdd(in->raw, (const xmlChar *) buf, len);
/*
* convert as much as possible to the parser reading buffer.
*/
nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
if (nbchars < 0) { if (nbchars < 0) {
fprintf(stderr, "xmlParserInputBufferPush: encoder error\n"); fprintf(stderr, "xmlParserInputBufferPush: encoder error\n");
xmlFree(buffer);
return(-1); return(-1);
} }
if (processed != len) {
fprintf(stderr,
"TODO xmlParserInputBufferPush: processed != len\n");
xmlFree(buffer);
return(-1);
}
buffer[nbchars] = 0;
xmlBufferAdd(in->buffer, (xmlChar *) buffer, nbchars);
xmlFree(buffer);
} else { } else {
nbchars = len; nbchars = len;
xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars); xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars);
@@ -730,7 +729,9 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, const char *buf) {
* Grow up the content of the input buffer, the old data are preserved * Grow up the content of the input buffer, the old data are preserved
* This routine handle the I18N transcoding to internal UTF-8 * This routine handle the I18N transcoding to internal UTF-8
* This routine is used when operating the parser in normal (pull) mode * This routine is used when operating the parser in normal (pull) mode
* TODO: one should be able to remove one extra copy *
* TODO: one should be able to remove one extra copy by copying directly
* onto in->buffer or in->raw
* *
* Returns the number of chars read and stored in the buffer, or -1 * Returns the number of chars read and stored in the buffer, or -1
* in case of error. * in case of error.
@@ -779,34 +780,22 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) {
return(-1); return(-1);
} }
if (in->encoder != NULL) { if (in->encoder != NULL) {
xmlChar *buf; /*
int wrote = res; * Store the data in the incoming raw buffer
*/
if (in->raw == NULL) {
in->raw = xmlBufferCreate();
}
xmlBufferAdd(in->raw, (const xmlChar *) buffer, len);
buf = (xmlChar *) xmlMalloc((res + 1) * 2 * sizeof(xmlChar)); /*
if (buf == NULL) { * convert as much as possible to the parser reading buffer.
fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n"); */
xmlFree(buffer); nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
if (nbchars < 0) {
fprintf(stderr, "xmlParserInputBufferGrow: encoder error\n");
return(-1); return(-1);
} }
nbchars = in->encoder->input(buf, (res + 1) * 2 * sizeof(xmlChar),
BAD_CAST buffer, &wrote);
buf[nbchars] = 0;
xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars);
xmlFree(buf);
/*
* Check that the encoder was able to process the full input
*/
if (wrote != res) {
fprintf(stderr,
"TODO : xmlParserInputBufferGrow wrote %d != res %d\n",
wrote, res);
/*
* TODO !!!
* Need to keep the unprocessed input in a buffer in->unprocessed
*/
}
} else { } else {
nbchars = res; nbchars = res;
buffer[nbchars] = 0; buffer[nbchars] = 0;

View File

@@ -33,6 +33,7 @@ struct _xmlParserInputBuffer {
xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */ xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */
xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 */ xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 */
xmlBufferPtr raw; /* if encoder != NULL buffer for raw input */
}; };