mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-30 22:43:14 +03:00
revamped the encoding support, added iconv support, so now libxml if
* encoding.[ch], xmlIO.[ch], parser.c, configure.in: revamped the encoding support and added iconv support, so libxml, if compiled with iconv, now automatically supports Japanese encodings among others. Work based on an initial patch from Yuan-Chen Cheng. I may have broken binary compatibility in the encoding handler registration scheme, but that was so utterly broken I don't expect anybody to have used this feature until now. * parserInternals.h: fixup on the CHAR range macro. * xml-error.h, parser.c: catch URL/URI errors using the uri.c code. * tree.[ch]: added xmlBufferGrow(), which was needed for iconv. * uri.c: added xmlParseURI(); I can't believe I forgot to implement this one in 2.0 !!! * SAX.c: moved the doc->encoding update into the endDocument() call. * TODO: updated. Iconv rules :-) Daniel
This commit is contained in:
18
ChangeLog
18
ChangeLog
@ -1,3 +1,21 @@
|
||||
Wed May 3 14:21:25 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
|
||||
|
||||
* encoding.[ch], xmlIO.[ch], parser.c, configure.in : revamped
|
||||
the encoding support, added iconv support, so now libxml if
|
||||
compiled with iconv automatically support japanese encodings
|
||||
among others. Work based on initial patch from Yuan-Chen Cheng
|
||||
I may have broken binary compat in the encoding handler
|
||||
registration scheme, but that was so utterly broken I don't
|
||||
expect anybody to have used this feature until now.
|
||||
* parserInternals.h: fixup on the CHAR range macro
|
||||
* xml-error.h, parser.c: catch URL/URI errors using the uri.c
|
||||
code.
|
||||
* tree.[ch]: added xmlBufferGrow(), was needed for iconv
|
||||
* uri.c: added xmlParseURI() I can't believe I forgot to
|
||||
implement this one in 2.0 !!!
|
||||
* SAX.c: moved doc->encoding update in the endDocument() call.
|
||||
* TODO: updated.
|
||||
|
||||
Mon Apr 24 13:30:13 CEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
|
||||
|
||||
* tree.h: removed extraneous xmlRemoveProp definition
|
||||
|
9
SAX.c
9
SAX.c
@ -595,6 +595,15 @@ endDocument(void *ctx)
|
||||
if (ctxt->validate && ctxt->wellFormed &&
|
||||
ctxt->myDoc && ctxt->myDoc->intSubset)
|
||||
ctxt->valid &= xmlValidateDocumentFinal(&ctxt->vctxt, ctxt->myDoc);
|
||||
|
||||
/*
|
||||
* Grab the encoding if it was added on-the-fly
|
||||
*/
|
||||
if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) &&
|
||||
(ctxt->myDoc->encoding == NULL)) {
|
||||
ctxt->myDoc->encoding = ctxt->encoding;
|
||||
ctxt->encoding = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
9
TODO
9
TODO
@ -6,6 +6,8 @@
|
||||
TODO:
|
||||
=====
|
||||
|
||||
- xmlSwitchToEncoding() needs a rewrite for correct handling of conversion
|
||||
error code conditions.
|
||||
- DOM needs
|
||||
xmlAttrPtr xmlNewDocProp(xmlDocPtr doc, const xmlChar *name, const xmlChar *value)
|
||||
int xmlPruneProp(xmlNodePtr node, xmlAttrPtr attr);
|
||||
@ -14,7 +16,6 @@ TODO:
|
||||
- add support for the trick from Henry conf/sun/valid/empty.xml
|
||||
- Correct standalone checking/emitting (hard)
|
||||
2.9 Standalone Document Declaration
|
||||
- URI checkings (no fragments) rfc2396.txt
|
||||
- Better checking of external parsed entities TAG 1234
|
||||
- Find way of representing PERefs in the Dtd so that %entity; can
|
||||
be saved back.
|
||||
@ -22,6 +23,7 @@ TODO:
|
||||
http://www.w3.org/XML/xml-19980210-errata ... bummmer
|
||||
- Handle undefined namespaces in entity contents better ... at least
|
||||
issue a warning
|
||||
- Issue warning when using non-absolute namespaces URI.
|
||||
- General checking of DTD validation in presence of namespaces ... hairy
|
||||
- fix --disable-corba configure switch handling, and use XML_WITHOUT_CORBA
|
||||
not WITHOUT_CORBA flag
|
||||
@ -30,7 +32,7 @@ TODO:
|
||||
=====
|
||||
|
||||
- Get OASIS testsuite to a more friendly result, check all the results
|
||||
once stable.
|
||||
once stable. Current state at:
|
||||
http://xmlsoft.org/conf/result.html
|
||||
|
||||
- Optimization of tag strings allocation ?
|
||||
@ -55,11 +57,13 @@ EXTENSIONS:
|
||||
|
||||
- Add Xlink recognition/API
|
||||
=> started adding an xlink.[ch] with a unified API for XML and HTML.
|
||||
it's crap :-(
|
||||
|
||||
- Implement XSLT
|
||||
=> seems that someone volunteered ?!?
|
||||
|
||||
- Implement XSchemas
|
||||
=> Really needs to be done <grin/>
|
||||
|
||||
- O2K parsing;
|
||||
=> this is a somewhat ugly mix of HTML and XML, adding a specific
|
||||
@ -88,6 +92,7 @@ EXTENSIONS:
|
||||
Done:
|
||||
=====
|
||||
|
||||
- URI checkings (no fragments) rfc2396.txt
|
||||
- Added a clean mechanism for overload or added input methods:
|
||||
xmlRegisterInputCallbacks()
|
||||
- dynamically adapt the alloc entry point to use g_alloc()/g_free()
|
||||
|
16
configure.in
16
configure.in
@ -4,7 +4,7 @@ AC_INIT(entities.h)
|
||||
AM_CONFIG_HEADER(config.h)
|
||||
|
||||
LIBXML_MAJOR_VERSION=2
|
||||
LIBXML_MINOR_VERSION=0
|
||||
LIBXML_MINOR_VERSION=1
|
||||
LIBXML_MICRO_VERSION=0
|
||||
LIBXML_VERSION=$LIBXML_MAJOR_VERSION.$LIBXML_MINOR_VERSION.$LIBXML_MICRO_VERSION
|
||||
LIBXML_VERSION_INFO=`expr $LIBXML_MAJOR_VERSION + $LIBXML_MINOR_VERSION`:$LIBXML_MICRO_VERSION:$LIBXML_MINOR_VERSION
|
||||
@ -203,6 +203,20 @@ fi
|
||||
AC_SUBST(WITH_XPATH)
|
||||
AC_SUBST(XPATH_OBJ)
|
||||
|
||||
AC_ARG_WITH(iconv, [ --with-iconv Add the ICONV support (on)])
|
||||
if test "$with_iconv" = "no" ; then
|
||||
echo Disabling ICONV support
|
||||
WITH_ICONV=0
|
||||
else
|
||||
if test "$have_iconv" != "" ; then
|
||||
echo Iconv support not found
|
||||
WITH_ICONV=0
|
||||
else
|
||||
WITH_ICONV=1
|
||||
fi
|
||||
fi
|
||||
AC_SUBST(WITH_ICONV)
|
||||
|
||||
AC_ARG_WITH(debug, [ --with-debug Add the debugging module (on)])
|
||||
if test "$with_debug" = "no" ; then
|
||||
echo Disabling DEBUG support
|
||||
|
620
encoding.c
620
encoding.c
@ -34,12 +34,26 @@
|
||||
#ifdef HAVE_STDLIB_H
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
#include <libxml/xmlversion.h>
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
#ifdef HAVE_ERRNO_H
|
||||
#include <errno.h>
|
||||
#endif
|
||||
#endif
|
||||
#include <libxml/encoding.h>
|
||||
#include <libxml/xmlmemory.h>
|
||||
|
||||
xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
|
||||
xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
|
||||
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
#if 0
|
||||
#define DEBUG_ENCODING /* Define this to get encoding traces */
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static int xmlLittleEndian = 1;
|
||||
|
||||
/*
|
||||
* From rfc2044: encoding of the Unicode values on UTF-8:
|
||||
*
|
||||
@ -104,30 +118,38 @@ xmlCheckUTF8(const unsigned char *utf)
|
||||
*
|
||||
* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
|
||||
* block of chars out.
|
||||
* Returns the number of byte written, or -1 by lack of space.
|
||||
* Returns 0 if success, or -1 otherwise
|
||||
* The value of @inlen after return is the number of octets consumed
|
||||
* as the return value is positive, else unpredictable.
|
||||
* The value of @outlen after return is the number of octets produced.
|
||||
*/
|
||||
int
|
||||
isolat1ToUTF8(unsigned char* out, int outlen,
|
||||
isolat1ToUTF8(unsigned char* out, int *outlen,
|
||||
const unsigned char* in, int *inlen) {
|
||||
unsigned char* outstart = out;
|
||||
unsigned char* outend= out+outlen;
|
||||
const unsigned char* processed = in;
|
||||
unsigned char* outend = out + *outlen;
|
||||
const unsigned char* inend = in + *inlen;
|
||||
unsigned char c;
|
||||
|
||||
while (in < inend) {
|
||||
c= *in++;
|
||||
if (c < 0x80) {
|
||||
if (out >= outend) return(-1);
|
||||
if (out >= outend)
|
||||
break;
|
||||
*out++ = c;
|
||||
}
|
||||
else {
|
||||
if (out >= outend) return(-1);
|
||||
if (out + 1 >= outend) break;
|
||||
*out++ = 0xC0 | (c >> 6);
|
||||
if (out >= outend) return(-1);
|
||||
*out++ = 0x80 | (0x3F & c);
|
||||
}
|
||||
processed = in;
|
||||
}
|
||||
return(out-outstart);
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -141,17 +163,17 @@ isolat1ToUTF8(unsigned char* out, int outlen,
|
||||
* block of chars out.
|
||||
* TODO: UTF8Toisolat1 needs a fallback mechanism ...
|
||||
*
|
||||
* Returns the number of byte written, or -1 by lack of space, or -2
|
||||
* if the transcoding fails (for *in is not valid utf8 string or
|
||||
* the result of transformation can't fit into the encoding we want)
|
||||
* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
|
||||
* The value of @inlen after return is the number of octets consumed
|
||||
* as the return value is positive, else unpredictable.
|
||||
* The value of @outlen after return is the number of octets produced.
|
||||
*/
|
||||
int
|
||||
UTF8Toisolat1(unsigned char* out, int outlen,
|
||||
UTF8Toisolat1(unsigned char* out, int *outlen,
|
||||
const unsigned char* in, int *inlen) {
|
||||
unsigned char* outstart = out;
|
||||
unsigned char* outend= out+outlen;
|
||||
const unsigned char* processed = in;
|
||||
unsigned char* outend = out + *outlen;
|
||||
const unsigned char* inend = in + *inlen;
|
||||
unsigned char c;
|
||||
|
||||
@ -162,18 +184,22 @@ UTF8Toisolat1(unsigned char* out, int outlen,
|
||||
*out++= c;
|
||||
}
|
||||
else if (in == inend) {
|
||||
*inlen -= 1;
|
||||
break;
|
||||
}
|
||||
else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
|
||||
/* a two-byte UTF-8 sequence that can be encoded as ISO Latin 1 */
|
||||
*out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
|
||||
}
|
||||
else
|
||||
else {
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
return(-2);
|
||||
/* TODO : some should be represent as "&#x____;" */
|
||||
}
|
||||
return(out-outstart);
|
||||
processed = in;
|
||||
}
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
return(0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -194,11 +220,12 @@ UTF8Toisolat1(unsigned char* out, int outlen,
|
||||
* as the return value is positive, else unpredictable.
|
||||
*/
|
||||
int
|
||||
UTF16LEToUTF8(unsigned char* out, int outlen,
|
||||
UTF16LEToUTF8(unsigned char* out, int *outlen,
|
||||
const unsigned char* inb, int *inlenb)
|
||||
{
|
||||
unsigned char* outstart = out;
|
||||
unsigned char* outend= out+outlen;
|
||||
const unsigned char* processed = inb;
|
||||
unsigned char* outend = out + *outlen;
|
||||
unsigned short* in = (unsigned short*) inb;
|
||||
unsigned short* inend;
|
||||
unsigned int c, d, inlen;
|
||||
@ -210,40 +237,42 @@ UTF16LEToUTF8(unsigned char* out, int outlen,
|
||||
inlen = *inlenb / 2;
|
||||
inend = in + inlen;
|
||||
while (in < inend) {
|
||||
#ifdef BIG_ENDIAN
|
||||
if (xmlLittleEndian) {
|
||||
c= *in++;
|
||||
} else {
|
||||
tmp = (unsigned char *) in;
|
||||
c = *tmp++;
|
||||
c = c | (((unsigned int)*tmp) << 8);
|
||||
in++;
|
||||
#else /* BIG_ENDIAN */
|
||||
c= *in++;
|
||||
#endif /* BIG_ENDIAN */
|
||||
}
|
||||
if ((c & 0xFC00) == 0xD800) { /* surrogates */
|
||||
if (in >= inend) { /* (in > inend) shouldn't happen */
|
||||
(*inlenb) -= 2;
|
||||
break;
|
||||
}
|
||||
#ifdef BIG_ENDIAN
|
||||
if (xmlLittleEndian) {
|
||||
d = *in++;
|
||||
} else {
|
||||
tmp = (unsigned char *) in;
|
||||
d = *tmp++;
|
||||
d = d | (((unsigned int)*tmp) << 8);
|
||||
in++;
|
||||
#else /* BIG_ENDIAN */
|
||||
d = *in++;
|
||||
#endif /* BIG_ENDIAN */
|
||||
}
|
||||
if ((d & 0xFC00) == 0xDC00) {
|
||||
c &= 0x03FF;
|
||||
c <<= 10;
|
||||
c |= d & 0x03FF;
|
||||
c += 0x10000;
|
||||
}
|
||||
else
|
||||
else {
|
||||
*outlen = out - outstart;
|
||||
*inlenb = processed - inb;
|
||||
return(-2);
|
||||
}
|
||||
}
|
||||
|
||||
/* assertion: c is a single UCS-4 value */
|
||||
if (out >= outend)
|
||||
return(-1);
|
||||
break;
|
||||
if (c < 0x80) { *out++= c; bits= -6; }
|
||||
else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
|
||||
else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; }
|
||||
@ -251,11 +280,14 @@ UTF16LEToUTF8(unsigned char* out, int outlen,
|
||||
|
||||
for ( ; bits >= 0; bits-= 6) {
|
||||
if (out >= outend)
|
||||
return(-1);
|
||||
break;
|
||||
*out++= ((c >> bits) & 0x3F) | 0x80;
|
||||
}
|
||||
processed = (const unsigned char*) in;
|
||||
}
|
||||
return(out-outstart);
|
||||
*outlen = out - outstart;
|
||||
*inlenb = processed - inb;
|
||||
return(0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -273,40 +305,44 @@ UTF16LEToUTF8(unsigned char* out, int outlen,
|
||||
* if the transcoding failed.
|
||||
*/
|
||||
int
|
||||
UTF8ToUTF16LE(unsigned char* outb, int outlen,
|
||||
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
|
||||
const unsigned char* in, int *inlen)
|
||||
{
|
||||
unsigned short* out = (unsigned short*) outb;
|
||||
const unsigned char* processed = in;
|
||||
unsigned short* outstart= out;
|
||||
unsigned short* outend;
|
||||
const unsigned char* inend= in+*inlen;
|
||||
unsigned int c, d, trailing;
|
||||
#ifdef BIG_ENDIAN
|
||||
unsigned char *tmp;
|
||||
unsigned short tmp1, tmp2;
|
||||
#endif /* BIG_ENDIAN */
|
||||
|
||||
outlen /= 2; /* convert in short length */
|
||||
outend = out + outlen;
|
||||
outend = out + (*outlen / 2);
|
||||
while (in < inend) {
|
||||
d= *in++;
|
||||
if (d < 0x80) { c= d; trailing= 0; }
|
||||
else if (d < 0xC0)
|
||||
return(-2); /* trailing byte in leading position */
|
||||
else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
|
||||
else if (d < 0xC0) {
|
||||
/* trailing byte in leading position */
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
return(-2);
|
||||
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
|
||||
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
|
||||
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
|
||||
else
|
||||
return(-2); /* no chance for this in UTF-16 */
|
||||
else {
|
||||
/* no chance for this in UTF-16 */
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
return(-2);
|
||||
}
|
||||
|
||||
if (inend - in < trailing) {
|
||||
*inlen -= (inend - in);
|
||||
break;
|
||||
}
|
||||
|
||||
for ( ; trailing; trailing--) {
|
||||
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
|
||||
return(-1);
|
||||
break;
|
||||
c <<= 6;
|
||||
c |= d & 0x3F;
|
||||
}
|
||||
@ -314,21 +350,24 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen,
|
||||
/* assertion: c is a single UCS-4 value */
|
||||
if (c < 0x10000) {
|
||||
if (out >= outend)
|
||||
return(-1);
|
||||
#ifdef BIG_ENDIAN
|
||||
break;
|
||||
if (xmlLittleEndian) {
|
||||
*out++ = c;
|
||||
} else {
|
||||
tmp = (unsigned char *) out;
|
||||
*tmp = c ;
|
||||
*(tmp + 1) = c >> 8 ;
|
||||
out++;
|
||||
#else /* BIG_ENDIAN */
|
||||
*out++ = c;
|
||||
#endif /* BIG_ENDIAN */
|
||||
}
|
||||
}
|
||||
else if (c < 0x110000) {
|
||||
if (out+1 >= outend)
|
||||
return(-1);
|
||||
break;
|
||||
c -= 0x10000;
|
||||
#ifdef BIG_ENDIAN
|
||||
if (xmlLittleEndian) {
|
||||
*out++ = 0xD800 | (c >> 10);
|
||||
*out++ = 0xDC00 | (c & 0x03FF);
|
||||
} else {
|
||||
tmp1 = 0xD800 | (c >> 10);
|
||||
tmp = (unsigned char *) out;
|
||||
*tmp = tmp1;
|
||||
@ -340,15 +379,15 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen,
|
||||
*tmp = tmp2;
|
||||
*(tmp + 1) = tmp2 >> 8;
|
||||
out++;
|
||||
#else /* BIG_ENDIAN */
|
||||
*out++ = 0xD800 | (c >> 10);
|
||||
*out++ = 0xDC00 | (c & 0x03FF);
|
||||
#endif /* BIG_ENDIAN */
|
||||
}
|
||||
}
|
||||
else
|
||||
return(-1);
|
||||
break;
|
||||
processed = in;
|
||||
}
|
||||
return(out-outstart);
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
return(0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -369,18 +408,16 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen,
|
||||
* as the return value is positive, else unpredictable.
|
||||
*/
|
||||
int
|
||||
UTF16BEToUTF8(unsigned char* out, int outlen,
|
||||
UTF16BEToUTF8(unsigned char* out, int *outlen,
|
||||
const unsigned char* inb, int *inlenb)
|
||||
{
|
||||
unsigned char* outstart = out;
|
||||
unsigned char* outend= out+outlen;
|
||||
const unsigned char* processed = inb;
|
||||
unsigned char* outend = out + *outlen;
|
||||
unsigned short* in = (unsigned short*) inb;
|
||||
unsigned short* inend;
|
||||
unsigned int c, d, inlen;
|
||||
#ifdef BIG_ENDIAN
|
||||
#else /* BIG_ENDIAN */
|
||||
unsigned char *tmp;
|
||||
#endif /* BIG_ENDIAN */
|
||||
int bits;
|
||||
|
||||
if ((*inlenb % 2) == 1)
|
||||
@ -388,43 +425,46 @@ UTF16BEToUTF8(unsigned char* out, int outlen,
|
||||
inlen = *inlenb / 2;
|
||||
inend= in + inlen;
|
||||
while (in < inend) {
|
||||
#ifdef BIG_ENDIAN
|
||||
c= *in++;
|
||||
#else
|
||||
if (xmlLittleEndian) {
|
||||
tmp = (unsigned char *) in;
|
||||
c = *tmp++;
|
||||
c = c << 8;
|
||||
c = c | (unsigned int) *tmp;
|
||||
in++;
|
||||
#endif
|
||||
} else {
|
||||
c= *in++;
|
||||
}
|
||||
if ((c & 0xFC00) == 0xD800) { /* surrogates */
|
||||
if (in >= inend) { /* (in > inend) shouldn't happens */
|
||||
(*inlenb) -= 2;
|
||||
break;
|
||||
*outlen = out - outstart;
|
||||
*inlenb = processed - inb;
|
||||
return(-2);
|
||||
}
|
||||
|
||||
#ifdef BIG_ENDIAN
|
||||
d= *in++;
|
||||
#else
|
||||
if (xmlLittleEndian) {
|
||||
tmp = (unsigned char *) in;
|
||||
d = *tmp++;
|
||||
d = d << 8;
|
||||
d = d | (unsigned int) *tmp;
|
||||
in++;
|
||||
#endif
|
||||
} else {
|
||||
d= *in++;
|
||||
}
|
||||
if ((d & 0xFC00) == 0xDC00) {
|
||||
c &= 0x03FF;
|
||||
c <<= 10;
|
||||
c |= d & 0x03FF;
|
||||
c += 0x10000;
|
||||
}
|
||||
else
|
||||
else {
|
||||
*outlen = out - outstart;
|
||||
*inlenb = processed - inb;
|
||||
return(-2);
|
||||
}
|
||||
}
|
||||
|
||||
/* assertion: c is a single UCS-4 value */
|
||||
if (out >= outend)
|
||||
return(-1);
|
||||
break;
|
||||
if (c < 0x80) { *out++= c; bits= -6; }
|
||||
else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
|
||||
else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; }
|
||||
@ -432,11 +472,14 @@ UTF16BEToUTF8(unsigned char* out, int outlen,
|
||||
|
||||
for ( ; bits >= 0; bits-= 6) {
|
||||
if (out >= outend)
|
||||
return(-1);
|
||||
break;
|
||||
*out++= ((c >> bits) & 0x3F) | 0x80;
|
||||
}
|
||||
processed = (const unsigned char*) in;
|
||||
}
|
||||
return(out-outstart);
|
||||
*outlen = out - outstart;
|
||||
*inlenb = processed - inb;
|
||||
return(0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -454,63 +497,63 @@ UTF16BEToUTF8(unsigned char* out, int outlen,
|
||||
* if the transcoding failed.
|
||||
*/
|
||||
int
|
||||
UTF8ToUTF16BE(unsigned char* outb, int outlen,
|
||||
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
|
||||
const unsigned char* in, int *inlen)
|
||||
{
|
||||
unsigned short* out = (unsigned short*) outb;
|
||||
const unsigned char* processed = in;
|
||||
unsigned short* outstart= out;
|
||||
unsigned short* outend;
|
||||
const unsigned char* inend= in+*inlen;
|
||||
unsigned int c, d, trailing;
|
||||
#ifdef BIG_ENDIAN
|
||||
#else
|
||||
unsigned char *tmp;
|
||||
unsigned short tmp1, tmp2;
|
||||
#endif /* BIG_ENDIAN */
|
||||
|
||||
outlen /= 2; /* convert in short length */
|
||||
outend = out + outlen;
|
||||
outend = out + (*outlen / 2);
|
||||
while (in < inend) {
|
||||
d= *in++;
|
||||
if (d < 0x80) { c= d; trailing= 0; }
|
||||
else if (d < 0xC0)
|
||||
return(-2); /* trailing byte in leading position */
|
||||
else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
|
||||
else if (d < 0xC0) {
|
||||
/* trailing byte in leading position */
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
return(-2);
|
||||
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
|
||||
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
|
||||
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
|
||||
else
|
||||
return(-2); /* no chance for this in UTF-16 */
|
||||
else {
|
||||
/* no chance for this in UTF-16 */
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
return(-2);
|
||||
}
|
||||
|
||||
if (inend - in < trailing) {
|
||||
*inlen -= (inend - in);
|
||||
break;
|
||||
}
|
||||
|
||||
for ( ; trailing; trailing--) {
|
||||
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) return(-1);
|
||||
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) break;
|
||||
c <<= 6;
|
||||
c |= d & 0x3F;
|
||||
}
|
||||
|
||||
/* assertion: c is a single UCS-4 value */
|
||||
if (c < 0x10000) {
|
||||
if (out >= outend) return(-1);
|
||||
#ifdef BIG_ENDIAN
|
||||
*out++ = c;
|
||||
#else
|
||||
if (out >= outend) break;
|
||||
if (xmlLittleEndian) {
|
||||
tmp = (unsigned char *) out;
|
||||
*tmp = c >> 8;
|
||||
*(tmp + 1) = c;
|
||||
out++;
|
||||
#endif /* BIG_ENDIAN */
|
||||
} else {
|
||||
*out++ = c;
|
||||
}
|
||||
}
|
||||
else if (c < 0x110000) {
|
||||
if (out+1 >= outend) return(-1);
|
||||
if (out+1 >= outend) break;
|
||||
c -= 0x10000;
|
||||
#ifdef BIG_ENDIAN
|
||||
*out++ = 0xD800 | (c >> 10);
|
||||
*out++ = 0xDC00 | (c & 0x03FF);
|
||||
#else
|
||||
if (xmlLittleEndian) {
|
||||
tmp1 = 0xD800 | (c >> 10);
|
||||
tmp = (unsigned char *) out;
|
||||
*tmp = tmp1 >> 8;
|
||||
@ -522,11 +565,18 @@ UTF8ToUTF16BE(unsigned char* outb, int outlen,
|
||||
*tmp = tmp2 >> 8;
|
||||
*(tmp + 1) = tmp2;
|
||||
out++;
|
||||
#endif
|
||||
} else {
|
||||
*out++ = 0xD800 | (c >> 10);
|
||||
*out++ = 0xDC00 | (c & 0x03FF);
|
||||
}
|
||||
else return(-1);
|
||||
}
|
||||
return(out-outstart);
|
||||
else
|
||||
break;
|
||||
processed = in;
|
||||
}
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
return(0);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -636,8 +686,12 @@ xmlParseCharEncoding(const char* name)
|
||||
if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
|
||||
|
||||
if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
|
||||
if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
|
||||
if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
|
||||
if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
|
||||
|
||||
#ifdef DEBUG_ENCODING
|
||||
fprintf(stderr, "Unknown encoding %s\n", name);
|
||||
#endif
|
||||
return(XML_CHAR_ENCODING_ERROR);
|
||||
}
|
||||
|
||||
@ -712,6 +766,9 @@ xmlNewCharEncodingHandler(const char *name,
|
||||
* registers and returns the handler.
|
||||
*/
|
||||
xmlRegisterCharEncodingHandler(handler);
|
||||
#ifdef DEBUG_ENCODING
|
||||
fprintf(stderr, "Registered encoding handler for %s\n", name);
|
||||
#endif
|
||||
return(handler);
|
||||
}
|
||||
|
||||
@ -725,11 +782,18 @@ xmlNewCharEncodingHandler(const char *name,
|
||||
*/
|
||||
void
|
||||
xmlInitCharEncodingHandlers(void) {
|
||||
unsigned short int tst = 0x1234;
|
||||
unsigned char *ptr = (unsigned char *) &tst;
|
||||
|
||||
if (handlers != NULL) return;
|
||||
|
||||
handlers = (xmlCharEncodingHandlerPtr *)
|
||||
xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr));
|
||||
|
||||
if (*ptr == 0x12) xmlLittleEndian = 0;
|
||||
else if (*ptr == 0x34) xmlLittleEndian = 1;
|
||||
else fprintf(stderr, "Odd problem at endianness detection\n");
|
||||
|
||||
if (handlers == NULL) {
|
||||
fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n");
|
||||
return;
|
||||
@ -755,6 +819,7 @@ xmlCleanupCharEncodingHandlers(void) {
|
||||
for (;nbCharEncodingHandler > 0;) {
|
||||
nbCharEncodingHandler--;
|
||||
if (handlers[nbCharEncodingHandler] != NULL) {
|
||||
if (handlers[nbCharEncodingHandler]->name != NULL)
|
||||
xmlFree(handlers[nbCharEncodingHandler]->name);
|
||||
xmlFree(handlers[nbCharEncodingHandler]);
|
||||
}
|
||||
@ -798,6 +863,8 @@ xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) {
|
||||
*/
|
||||
xmlCharEncodingHandlerPtr
|
||||
xmlGetCharEncodingHandler(xmlCharEncoding enc) {
|
||||
xmlCharEncodingHandlerPtr handler;
|
||||
|
||||
if (handlers == NULL) xmlInitCharEncodingHandlers();
|
||||
switch (enc) {
|
||||
case XML_CHAR_ENCODING_ERROR:
|
||||
@ -811,40 +878,68 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
|
||||
case XML_CHAR_ENCODING_UTF16BE:
|
||||
return(xmlUTF16BEHandler);
|
||||
case XML_CHAR_ENCODING_EBCDIC:
|
||||
return(NULL);
|
||||
handler = xmlFindCharEncodingHandler("EBCDIC");
|
||||
if (handler != NULL) return(handler);
|
||||
handler = xmlFindCharEncodingHandler("ebcdic");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_UCS4LE:
|
||||
return(NULL);
|
||||
handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4");
|
||||
if (handler != NULL) return(handler);
|
||||
handler = xmlFindCharEncodingHandler("UCS-4");
|
||||
if (handler != NULL) return(handler);
|
||||
handler = xmlFindCharEncodingHandler("UCS4");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_UCS4BE:
|
||||
return(NULL);
|
||||
handler = xmlFindCharEncodingHandler("UCS4BE");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_UCS4_2143:
|
||||
return(NULL);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_UCS4_3412:
|
||||
return(NULL);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_UCS2:
|
||||
return(NULL);
|
||||
handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2");
|
||||
if (handler != NULL) return(handler);
|
||||
handler = xmlFindCharEncodingHandler("UCS-2");
|
||||
if (handler != NULL) return(handler);
|
||||
handler = xmlFindCharEncodingHandler("UCS2");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_1:
|
||||
return(NULL);
|
||||
case XML_CHAR_ENCODING_8859_2:
|
||||
return(NULL);
|
||||
case XML_CHAR_ENCODING_8859_3:
|
||||
return(NULL);
|
||||
case XML_CHAR_ENCODING_8859_4:
|
||||
return(NULL);
|
||||
case XML_CHAR_ENCODING_8859_5:
|
||||
return(NULL);
|
||||
case XML_CHAR_ENCODING_8859_6:
|
||||
return(NULL);
|
||||
case XML_CHAR_ENCODING_8859_7:
|
||||
return(NULL);
|
||||
case XML_CHAR_ENCODING_8859_8:
|
||||
return(NULL);
|
||||
case XML_CHAR_ENCODING_8859_9:
|
||||
return(NULL);
|
||||
case XML_CHAR_ENCODING_2022_JP:
|
||||
handler = xmlFindCharEncodingHandler("ISO-2022-JP");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_SHIFT_JIS:
|
||||
handler = xmlFindCharEncodingHandler("SHIFT-JIS");
|
||||
if (handler != NULL) return(handler);
|
||||
handler = xmlFindCharEncodingHandler("SHIFT_JIS");
|
||||
if (handler != NULL) return(handler);
|
||||
handler = xmlFindCharEncodingHandler("Shift_JIS");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_EUC_JP:
|
||||
return(NULL);
|
||||
handler = xmlFindCharEncodingHandler("EUC-JP");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
#ifdef DEBUG_ENCODING
|
||||
fprintf(stderr, "No handler found for encoding %d\n", enc);
|
||||
#endif
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
@ -858,23 +953,306 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
|
||||
*/
|
||||
xmlCharEncodingHandlerPtr
|
||||
xmlFindCharEncodingHandler(const char *name) {
|
||||
char upper[500];
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
iconv_t icv_in, icv_out;
|
||||
xmlCharEncodingHandlerPtr enc;
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
char upper[100];
|
||||
int i;
|
||||
|
||||
if (handlers == NULL) xmlInitCharEncodingHandlers();
|
||||
if (name == NULL) return(xmlDefaultCharEncodingHandler);
|
||||
if (name[0] == 0) return(xmlDefaultCharEncodingHandler);
|
||||
|
||||
for (i = 0;i < 499;i++) {
|
||||
for (i = 0;i < 99;i++) {
|
||||
upper[i] = toupper(name[i]);
|
||||
if (upper[i] == 0) break;
|
||||
}
|
||||
upper[i] = 0;
|
||||
|
||||
for (i = 0;i < nbCharEncodingHandler; i++)
|
||||
if (!strcmp(name, handlers[i]->name))
|
||||
if (!strcmp(upper, handlers[i]->name)) {
|
||||
#ifdef DEBUG_ENCODING
|
||||
fprintf(stderr, "Found registered handler for encoding %s\n", name);
|
||||
#endif
|
||||
return(handlers[i]);
|
||||
}
|
||||
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
/* check whether iconv can handle this */
|
||||
icv_in = iconv_open("UTF-8", name);
|
||||
icv_out = iconv_open(name, "UTF-8");
|
||||
if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
|
||||
enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
|
||||
if (enc == NULL) {
|
||||
iconv_close(icv_in);
|
||||
iconv_close(icv_out);
|
||||
return(NULL);
|
||||
}
|
||||
enc->name = NULL;
|
||||
enc->input = NULL;
|
||||
enc->output = NULL;
|
||||
enc->iconv_in = icv_in;
|
||||
enc->iconv_out = icv_out;
|
||||
#ifdef DEBUG_ENCODING
|
||||
fprintf(stderr, "Found iconv handler for encoding %s\n", name);
|
||||
#endif
|
||||
return enc;
|
||||
} else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) {
|
||||
fprintf(stderr, "iconv : problems with filters for '%s'\n", name);
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
#ifdef DEBUG_ENCODING
|
||||
fprintf(stderr, "No handler found for encoding %s\n", name);
|
||||
#endif
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
/**
|
||||
* xmlIconvWrapper:
|
||||
* @cd: iconv converter data structure
|
||||
* @out: a pointer to an array of bytes to store the result
|
||||
* @outlen: the length of @out
|
||||
* @in: a pointer to an array of input bytes in the converter's source encoding
|
||||
* @inlen: the length of @in
|
||||
*
|
||||
* Returns 0 if success, or
|
||||
* -1 by lack of space, or
|
||||
* -2 if the transcoding fails (for *in is not valid utf8 string or
|
||||
* the result of transformation can't fit into the encoding we want), or
|
||||
* -3 if the last byte can't form a single output char.
|
||||
*
|
||||
* The value of @inlen after return is the number of octets consumed
|
||||
* as the return value is positive, else unpredictable.
|
||||
* The value of @outlen after return is the number of octets produced.
|
||||
*/
|
||||
static int
|
||||
xmlIconvWrapper(iconv_t cd,
|
||||
unsigned char *out, int *outlen,
|
||||
const unsigned char *in, int *inlen) {
|
||||
|
||||
size_t icv_inlen = *inlen, icv_outlen = *outlen;
|
||||
const char *icv_in = (const char *) in;
|
||||
char *icv_out = (char *) out;
|
||||
int ret;
|
||||
|
||||
ret = iconv(cd,
|
||||
&icv_in, &icv_inlen,
|
||||
&icv_out, &icv_outlen);
|
||||
*inlen -= icv_inlen;
|
||||
*outlen -= icv_outlen;
|
||||
if (icv_inlen != 0 || ret == (size_t) -1) {
|
||||
#ifdef EILSEQ
|
||||
if (errno == EILSEQ) {
|
||||
return -2;
|
||||
} else
|
||||
#endif
|
||||
#ifdef E2BIG
|
||||
if (errno == E2BIG) {
|
||||
return -1;
|
||||
} else
|
||||
#endif
|
||||
#ifdef EINVAL
|
||||
if (errno == EINVAL) {
|
||||
return -3;
|
||||
}
|
||||
#endif
|
||||
else {
|
||||
return -3;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
|
||||
/**
|
||||
* xmlCharEncInFunc:
|
||||
* @handler: char encoding transformation data structure
|
||||
* @out: an xmlBuffer for the output.
|
||||
* @in: an xmlBuffer for the input
|
||||
*
|
||||
* Generic front-end for the encoding handler input function
|
||||
*
|
||||
* Returns the number of byte written if success, or
|
||||
* -1 general error
|
||||
* -2 if the transcoding fails (for *in is not valid utf8 string or
|
||||
* the result of transformation can't fit into the encoding we want), or
|
||||
*/
|
||||
int
|
||||
xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
||||
xmlBufferPtr in) {
|
||||
int ret = -2;
|
||||
int written;
|
||||
int toconv;
|
||||
|
||||
if (handler == NULL) return(-1);
|
||||
if (out == NULL) return(-1);
|
||||
if (in == NULL) return(-1);
|
||||
|
||||
written = out->size - out->use;
|
||||
toconv = in->use;
|
||||
if (toconv * 2 >= written) {
|
||||
xmlBufferGrow(out, toconv * 2);
|
||||
written = out->size - out->use - 1;
|
||||
}
|
||||
if (handler->input != NULL) {
|
||||
ret = handler->input(&out->content[out->use], &written,
|
||||
in->content, &toconv);
|
||||
xmlBufferShrink(in, toconv);
|
||||
out->use += written;
|
||||
out->content[out->use] = 0;
|
||||
}
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
else if (handler->iconv_in != NULL) {
|
||||
ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use],
|
||||
&written, in->content, &toconv);
|
||||
xmlBufferShrink(in, toconv);
|
||||
out->use += written;
|
||||
out->content[out->use] = 0;
|
||||
if (ret == -1) ret = -3;
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
#ifdef DEBUG_ENCODING
|
||||
switch (ret) {
|
||||
case 0:
|
||||
fprintf(stderr, "converted %d bytes to %d bytes of input\n",
|
||||
toconv, written);
|
||||
break;
|
||||
case -1:
|
||||
fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
|
||||
toconv, written, in->use);
|
||||
break;
|
||||
case -2:
|
||||
fprintf(stderr, "input conversion failed due to input error\n");
|
||||
break;
|
||||
case -3:
|
||||
fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n",
|
||||
toconv, written, in->use);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr,"Unknown input conversion failed %d\n", ret);
|
||||
}
|
||||
#endif
|
||||
/*
|
||||
* Ignore when input buffer is not on a boundary
|
||||
*/
|
||||
if (ret == -3) ret = 0;
|
||||
return(ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlCharEncOutFunc:
|
||||
* @handler: char enconding transformation data structure
|
||||
* @out: an xmlBuffer for the output.
|
||||
* @in: an xmlBuffer for the input
|
||||
*
|
||||
* Generic front-end for the encoding handler output function
|
||||
*
|
||||
* Returns the number of byte written if success, or
|
||||
* -1 general error
|
||||
* -2 if the transcoding fails (for *in is not valid utf8 string or
|
||||
* the result of transformation can't fit into the encoding we want), or
|
||||
*/
|
||||
int
|
||||
xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out,
|
||||
xmlBufferPtr in) {
|
||||
int ret = -2;
|
||||
int written;
|
||||
int toconv;
|
||||
|
||||
if (handler == NULL) return(-1);
|
||||
if (out == NULL) return(-1);
|
||||
if (in == NULL) return(-1);
|
||||
|
||||
written = out->size - out->use;
|
||||
toconv = in->use;
|
||||
if (toconv * 2 >= written) {
|
||||
xmlBufferGrow(out, toconv * 2);
|
||||
written = out->size - out->use - 1;
|
||||
}
|
||||
if (handler->output != NULL) {
|
||||
ret = handler->output(&out->content[out->use], &written,
|
||||
in->content, &toconv);
|
||||
xmlBufferShrink(in, toconv);
|
||||
out->use += written;
|
||||
out->content[out->use] = 0;
|
||||
}
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
else if (handler->iconv_out != NULL) {
|
||||
ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use],
|
||||
&written, in->content, &toconv);
|
||||
xmlBufferShrink(in, toconv);
|
||||
out->use += written;
|
||||
out->content[out->use] = 0;
|
||||
if (ret == -1) ret = -3;
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
#ifdef DEBUG_ENCODING
|
||||
switch (ret) {
|
||||
case 0:
|
||||
fprintf(stderr, "converted %d bytes to %d bytes of output\n",
|
||||
toconv, written);
|
||||
break;
|
||||
case -1:
|
||||
fprintf(stderr, "output conversion failed by lack of space\n");
|
||||
break;
|
||||
case -2:
|
||||
fprintf(stderr, "output conversion failed due to output error\n");
|
||||
break;
|
||||
case -3:
|
||||
fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n",
|
||||
toconv, written, in->use);
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr,"Unknown output conversion failed %d\n", ret);
|
||||
}
|
||||
#endif
|
||||
return(ret);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlCharEncCloseFunc:
|
||||
* @handler: char enconding transformation data structure
|
||||
*
|
||||
* Generic front-end for hencoding handler close function
|
||||
*
|
||||
* Returns 0 if success, or -1 in case of error
|
||||
*/
|
||||
int
|
||||
xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) {
|
||||
int ret = 0;
|
||||
if (handler == NULL) return(-1);
|
||||
if (handler->name == NULL) return(-1);
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
/*
|
||||
* Iconv handlers can be oused only once, free the whole block.
|
||||
* and the associated icon resources.
|
||||
*/
|
||||
if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) {
|
||||
if (handler->name != NULL)
|
||||
xmlFree(handler->name);
|
||||
handler->name = NULL;
|
||||
if (handler->iconv_out != NULL) {
|
||||
if (iconv_close(handler->iconv_out))
|
||||
ret = -1;
|
||||
handler->iconv_out = NULL;
|
||||
}
|
||||
if (handler->iconv_in != NULL) {
|
||||
if (iconv_close(handler->iconv_in))
|
||||
ret = -1;
|
||||
handler->iconv_in = NULL;
|
||||
}
|
||||
xmlFree(handler);
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
#ifdef DEBUG_ENCODING
|
||||
if (ret)
|
||||
fprintf(stderr, "failed to close the encoding handler\n");
|
||||
else
|
||||
fprintf(stderr, "closed the encoding handler\n");
|
||||
|
||||
#endif
|
||||
return(ret);
|
||||
}
|
||||
|
||||
|
45
encoding.h
45
encoding.h
@ -22,12 +22,30 @@
|
||||
#define __XML_CHAR_ENCODING_H__
|
||||
|
||||
#include <libxml/xmlversion.h>
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
#include <iconv.h>
|
||||
#endif
|
||||
#include <libxml/tree.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Predefined values for some standard encodings
|
||||
* Libxml don't do beforehand translation on UTF8, ISOLatinX
|
||||
* It also support UTF16 (LE and BE) by default.
|
||||
*
|
||||
* Anything else would have to be translated to UTF8 before being
|
||||
* given to the parser itself. The BOM for UTF16 and the encoding
|
||||
* declaration are looked at and a converter is looked for at that
|
||||
* point. If not found the parser stops here as asked by the XML REC
|
||||
* Converter can be registered by the user using xmlRegisterCharEncodingHandler
|
||||
* but the current form doesn't allow stateful transcoding (a serious
|
||||
* problem agreed !). If iconv has been found it will be used
|
||||
* automatically and allow stateful transcoding, the simplest is then
|
||||
* to be sure to enable iconv and to provide iconv libs for the encoding
|
||||
* support needed.
|
||||
*/
|
||||
typedef enum {
|
||||
XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */
|
||||
@ -65,9 +83,13 @@ typedef enum {
|
||||
* Take a block of chars in the original encoding and try to convert
|
||||
* it to an UTF-8 block of chars out.
|
||||
*
|
||||
* Returns the number of byte written, or -1 by lack of space.
|
||||
* Returns the number of byte written, or -1 by lack of space, or -2
|
||||
* if the transcoding failed.
|
||||
* The value of @inlen after return is the number of octets consumed
|
||||
* if the return value is positive, else unpredictable.
|
||||
* The value of @outlen after return is the number of octets produced.
|
||||
*/
|
||||
typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
|
||||
typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen,
|
||||
const unsigned char* in, int *inlen);
|
||||
|
||||
|
||||
@ -83,12 +105,17 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
|
||||
*
|
||||
* Returns the number of byte written, or -1 by lack of space, or -2
|
||||
* if the transcoding failed.
|
||||
* The value of @inlen after return is the number of octets consumed
|
||||
* if the return value is positive, else unpredictable.
|
||||
* The value of @outlen after return is the number of octets produced.
|
||||
*/
|
||||
typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int outlen,
|
||||
typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int *outlen,
|
||||
const unsigned char* in, int *inlen);
|
||||
|
||||
|
||||
/*
|
||||
* Block defining the handlers for non UTF-8 encodings.
|
||||
* If iconv is supported, there are two extra fields
|
||||
*/
|
||||
|
||||
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
|
||||
@ -97,6 +124,10 @@ struct _xmlCharEncodingHandler {
|
||||
char *name;
|
||||
xmlCharEncodingInputFunc input;
|
||||
xmlCharEncodingOutputFunc output;
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
iconv_t iconv_in;
|
||||
iconv_t iconv_out;
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
};
|
||||
|
||||
void xmlInitCharEncodingHandlers (void);
|
||||
@ -109,6 +140,14 @@ xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc);
|
||||
xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name);
|
||||
int xmlCheckUTF8 (const unsigned char *utf);
|
||||
|
||||
int xmlCharEncOutFunc (xmlCharEncodingHandler *handler,
|
||||
xmlBufferPtr out,
|
||||
xmlBufferPtr in);
|
||||
|
||||
int xmlCharEncInFunc (xmlCharEncodingHandler *handler,
|
||||
xmlBufferPtr out,
|
||||
xmlBufferPtr in);
|
||||
int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -22,12 +22,30 @@
|
||||
#define __XML_CHAR_ENCODING_H__
|
||||
|
||||
#include <libxml/xmlversion.h>
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
#include <iconv.h>
|
||||
#endif
|
||||
#include <libxml/tree.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Predefined values for some standard encodings
|
||||
* Libxml don't do beforehand translation on UTF8, ISOLatinX
|
||||
* It also support UTF16 (LE and BE) by default.
|
||||
*
|
||||
* Anything else would have to be translated to UTF8 before being
|
||||
* given to the parser itself. The BOM for UTF16 and the encoding
|
||||
* declaration are looked at and a converter is looked for at that
|
||||
* point. If not found the parser stops here as asked by the XML REC
|
||||
* Converter can be registered by the user using xmlRegisterCharEncodingHandler
|
||||
* but the current form doesn't allow stateful transcoding (a serious
|
||||
* problem agreed !). If iconv has been found it will be used
|
||||
* automatically and allow stateful transcoding, the simplest is then
|
||||
* to be sure to enable iconv and to provide iconv libs for the encoding
|
||||
* support needed.
|
||||
*/
|
||||
typedef enum {
|
||||
XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */
|
||||
@ -65,9 +83,13 @@ typedef enum {
|
||||
* Take a block of chars in the original encoding and try to convert
|
||||
* it to an UTF-8 block of chars out.
|
||||
*
|
||||
* Returns the number of byte written, or -1 by lack of space.
|
||||
* Returns the number of byte written, or -1 by lack of space, or -2
|
||||
* if the transcoding failed.
|
||||
* The value of @inlen after return is the number of octets consumed
|
||||
* if the return value is positive, else unpredictable.
|
||||
* The value of @outlen after return is the number of octets produced.
|
||||
*/
|
||||
typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
|
||||
typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen,
|
||||
const unsigned char* in, int *inlen);
|
||||
|
||||
|
||||
@ -83,12 +105,17 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen,
|
||||
*
|
||||
* Returns the number of byte written, or -1 by lack of space, or -2
|
||||
* if the transcoding failed.
|
||||
* The value of @inlen after return is the number of octets consumed
|
||||
* if the return value is positive, else unpredictable.
|
||||
* The value of @outlen after return is the number of octets produced.
|
||||
*/
|
||||
typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int outlen,
|
||||
typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int *outlen,
|
||||
const unsigned char* in, int *inlen);
|
||||
|
||||
|
||||
/*
|
||||
* Block defining the handlers for non UTF-8 encodings.
|
||||
* If iconv is supported, there are two extra fields
|
||||
*/
|
||||
|
||||
typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler;
|
||||
@ -97,6 +124,10 @@ struct _xmlCharEncodingHandler {
|
||||
char *name;
|
||||
xmlCharEncodingInputFunc input;
|
||||
xmlCharEncodingOutputFunc output;
|
||||
#ifdef LIBXML_ICONV_ENABLED
|
||||
iconv_t iconv_in;
|
||||
iconv_t iconv_out;
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
};
|
||||
|
||||
void xmlInitCharEncodingHandlers (void);
|
||||
@ -109,6 +140,14 @@ xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc);
|
||||
xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name);
|
||||
int xmlCheckUTF8 (const unsigned char *utf);
|
||||
|
||||
int xmlCharEncOutFunc (xmlCharEncodingHandler *handler,
|
||||
xmlBufferPtr out,
|
||||
xmlBufferPtr in);
|
||||
|
||||
int xmlCharEncInFunc (xmlCharEncodingHandler *handler,
|
||||
xmlBufferPtr out,
|
||||
xmlBufferPtr in);
|
||||
int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -28,10 +28,10 @@ extern "C" {
|
||||
* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
|
||||
*/
|
||||
#define IS_CHAR(c) \
|
||||
((((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \
|
||||
(((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) && \
|
||||
(((c) <= 0xD7FF) || ((c) >= 0xE000)) && ((c) >= 0) && \
|
||||
((c) <= 0x10FFFF))
|
||||
(((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \
|
||||
(((c) >= 0x20) && ((c) <= 0xD7FF)) || \
|
||||
(((c) >= 0xE000) && ((c) <= 0xFFFD)) || \
|
||||
(((c) >= 0x10000) && ((c) <= 0x10FFFF)))
|
||||
|
||||
/*
|
||||
* [3] S ::= (#x20 | #x9 | #xD | #xA)+
|
||||
@ -442,8 +442,10 @@ xmlParserCtxtPtr xmlNewParserCtxt (void);
|
||||
xmlParserCtxtPtr xmlCreateEntityParserCtxt(const xmlChar *URL,
|
||||
const xmlChar *ID,
|
||||
const xmlChar *base);
|
||||
void xmlSwitchEncoding (xmlParserCtxtPtr ctxt,
|
||||
int xmlSwitchEncoding (xmlParserCtxtPtr ctxt,
|
||||
xmlCharEncoding enc);
|
||||
int xmlSwitchToEncoding (xmlParserCtxtPtr ctxt,
|
||||
xmlCharEncodingHandlerPtr handler);
|
||||
void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt);
|
||||
|
||||
/**
|
||||
|
@ -380,6 +380,8 @@ void xmlBufferCCat (xmlBufferPtr buf,
|
||||
const char *str);
|
||||
int xmlBufferShrink (xmlBufferPtr buf,
|
||||
int len);
|
||||
int xmlBufferGrow (xmlBufferPtr buf,
|
||||
int len);
|
||||
void xmlBufferEmpty (xmlBufferPtr buf);
|
||||
const xmlChar* xmlBufferContent (const xmlBufferPtr buf);
|
||||
int xmlBufferUse (const xmlBufferPtr buf);
|
||||
|
@ -33,6 +33,7 @@ struct _xmlParserInputBuffer {
|
||||
xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */
|
||||
|
||||
xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 */
|
||||
xmlBufferPtr raw; /* if encoder != NULL buffer for raw input */
|
||||
};
|
||||
|
||||
|
||||
|
556
parser.c
556
parser.c
@ -41,6 +41,7 @@
|
||||
#include <libxml/valid.h>
|
||||
#include <libxml/parserInternals.h>
|
||||
#include <libxml/xmlIO.h>
|
||||
#include <libxml/uri.h>
|
||||
#include "xml-error.h"
|
||||
|
||||
#define XML_PARSER_BIG_BUFFER_SIZE 1000
|
||||
@ -483,7 +484,7 @@ xmlNextChar(xmlParserCtxtPtr ctxt) {
|
||||
if ((ctxt->sax != NULL) &&
|
||||
(ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Char out of allowed range\n");
|
||||
"Char 0x%X out of allowed range\n", val);
|
||||
ctxt->errNo = XML_ERR_INVALID_ENCODING;
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->disableSAX = 1;
|
||||
@ -612,7 +613,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
|
||||
if ((ctxt->sax != NULL) &&
|
||||
(ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Char out of allowed range\n");
|
||||
"Char 0x%X out of allowed range\n", val);
|
||||
ctxt->errNo = XML_ERR_INVALID_ENCODING;
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->disableSAX = 1;
|
||||
@ -727,7 +728,7 @@ xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar *cur, int *len) {
|
||||
if ((ctxt->sax != NULL) &&
|
||||
(ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Char out of allowed range\n");
|
||||
"Char 0x%X out of allowed range\n", val);
|
||||
ctxt->errNo = XML_ERR_INVALID_ENCODING;
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->disableSAX = 1;
|
||||
@ -2278,155 +2279,14 @@ xmlCheckLanguageID(const xmlChar *lang) {
|
||||
*
|
||||
* change the input functions when discovering the character encoding
|
||||
* of a given entity.
|
||||
*
|
||||
* Returns 0 in case of success, -1 otherwise
|
||||
*/
|
||||
void
|
||||
int
|
||||
xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
|
||||
{
|
||||
xmlCharEncodingHandlerPtr handler;
|
||||
|
||||
handler = xmlGetCharEncodingHandler(enc);
|
||||
if (handler != NULL) {
|
||||
if (ctxt->input != NULL) {
|
||||
if (ctxt->input->buf != NULL) {
|
||||
if (ctxt->input->buf->encoder != NULL) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : encoder already regitered\n");
|
||||
return;
|
||||
}
|
||||
ctxt->input->buf->encoder = handler;
|
||||
|
||||
/*
|
||||
* Is there already some content down the pipe to convert
|
||||
*/
|
||||
if ((ctxt->input->buf->buffer != NULL) &&
|
||||
(ctxt->input->buf->buffer->use > 0)) {
|
||||
xmlChar *buf;
|
||||
int res, len, size;
|
||||
int processed;
|
||||
|
||||
/*
|
||||
* Specific handling of the Byte Order Mark for
|
||||
* UTF-16
|
||||
*/
|
||||
if ((enc == XML_CHAR_ENCODING_UTF16LE) &&
|
||||
(ctxt->input->cur[0] == 0xFF) &&
|
||||
(ctxt->input->cur[1] == 0xFE)) {
|
||||
SKIP(2);
|
||||
}
|
||||
if ((enc == XML_CHAR_ENCODING_UTF16BE) &&
|
||||
(ctxt->input->cur[0] == 0xFE) &&
|
||||
(ctxt->input->cur[1] == 0xFF)) {
|
||||
SKIP(2);
|
||||
}
|
||||
|
||||
/*
|
||||
* convert the non processed part
|
||||
*/
|
||||
processed = ctxt->input->cur - ctxt->input->base;
|
||||
len = ctxt->input->buf->buffer->use - processed;
|
||||
|
||||
if (len <= 0) {
|
||||
return;
|
||||
}
|
||||
size = ctxt->input->buf->buffer->use * 4;
|
||||
if (size < 4000)
|
||||
size = 4000;
|
||||
retry_larger:
|
||||
buf = (xmlChar *) xmlMalloc(size + 1);
|
||||
if (buf == NULL) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : out of memory\n");
|
||||
return;
|
||||
}
|
||||
/* TODO !!! Handling of buf too small */
|
||||
res = handler->input(buf, size, ctxt->input->cur, &len);
|
||||
if (res == -1) {
|
||||
size *= 2;
|
||||
xmlFree(buf);
|
||||
goto retry_larger;
|
||||
}
|
||||
if ((res < 0) ||
|
||||
(len != ctxt->input->buf->buffer->use - processed)) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : conversion failed\n");
|
||||
xmlFree(buf);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Conversion succeeded, get rid of the old buffer
|
||||
*/
|
||||
xmlFree(ctxt->input->buf->buffer->content);
|
||||
ctxt->input->buf->buffer->content = buf;
|
||||
ctxt->input->base = buf;
|
||||
ctxt->input->cur = buf;
|
||||
ctxt->input->buf->buffer->size = size;
|
||||
ctxt->input->buf->buffer->use = res;
|
||||
buf[res] = 0;
|
||||
}
|
||||
return;
|
||||
} else {
|
||||
if (ctxt->input->length == 0) {
|
||||
/*
|
||||
* When parsing a static memory array one must know the
|
||||
* size to be able to convert the buffer.
|
||||
*/
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : no input\n");
|
||||
return;
|
||||
} else {
|
||||
xmlChar *buf;
|
||||
int res, len;
|
||||
int processed = ctxt->input->cur - ctxt->input->base;
|
||||
|
||||
/*
|
||||
* convert the non processed part
|
||||
*/
|
||||
len = ctxt->input->length - processed;
|
||||
if (len <= 0) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : input fully consumed?\n");
|
||||
return;
|
||||
}
|
||||
buf = (xmlChar *) xmlMalloc(ctxt->input->length * 4);
|
||||
if (buf == NULL) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : out of memory\n");
|
||||
return;
|
||||
}
|
||||
res = handler->input(buf, ctxt->input->length * 4,
|
||||
ctxt->input->cur, &len);
|
||||
if ((res < 0) ||
|
||||
(len != ctxt->input->length - processed)) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : conversion failed\n");
|
||||
xmlFree(buf);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
* Conversion succeeded, get rid of the old buffer
|
||||
*/
|
||||
if ((ctxt->input->free != NULL) &&
|
||||
(ctxt->input->base != NULL))
|
||||
ctxt->input->free((xmlChar *) ctxt->input->base);
|
||||
ctxt->input->base = ctxt->input->cur = buf;
|
||||
ctxt->input->length = res;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : no input\n");
|
||||
}
|
||||
}
|
||||
|
||||
switch (enc) {
|
||||
case XML_CHAR_ENCODING_ERROR:
|
||||
ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
|
||||
@ -2437,21 +2297,35 @@ retry_larger:
|
||||
break;
|
||||
case XML_CHAR_ENCODING_NONE:
|
||||
/* let's assume it's UTF-8 without the XML decl */
|
||||
return;
|
||||
return(0);
|
||||
case XML_CHAR_ENCODING_UTF8:
|
||||
/* default encoding, no conversion should be needed */
|
||||
return;
|
||||
case XML_CHAR_ENCODING_UTF16LE:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
return(0);
|
||||
default:
|
||||
break;
|
||||
}
|
||||
handler = xmlGetCharEncodingHandler(enc);
|
||||
if (handler == NULL) {
|
||||
/*
|
||||
* Default handlers.
|
||||
*/
|
||||
switch (enc) {
|
||||
case XML_CHAR_ENCODING_ERROR:
|
||||
ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding UTF16 little endian not supported\n");
|
||||
ctxt->sax->error(ctxt->userData, "encoding unknown\n");
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->disableSAX = 1;
|
||||
break;
|
||||
case XML_CHAR_ENCODING_NONE:
|
||||
/* let's assume it's UTF-8 without the XML decl */
|
||||
return(0);
|
||||
case XML_CHAR_ENCODING_UTF8:
|
||||
/* default encoding, no conversion should be needed */
|
||||
return(0);
|
||||
case XML_CHAR_ENCODING_UTF16LE:
|
||||
break;
|
||||
case XML_CHAR_ENCODING_UTF16BE:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding UTF16 big endian not supported\n");
|
||||
break;
|
||||
case XML_CHAR_ENCODING_UCS4LE:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
@ -2490,59 +2364,23 @@ retry_larger:
|
||||
"char encoding UCS2 not supported\n");
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_1:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding ISO_8859_1 ISO Latin 1 not supported\n");
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_2:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding ISO_8859_2 ISO Latin 2 not supported\n");
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_3:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding ISO_8859_3 not supported\n");
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_4:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding ISO_8859_4 not supported\n");
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_5:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding ISO_8859_5 not supported\n");
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_6:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding ISO_8859_6 not supported\n");
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_7:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding ISO_8859_7 not supported\n");
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_8:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding ISO_8859_8 not supported\n");
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_9:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"char encoding ISO_8859_9 not supported\n");
|
||||
break;
|
||||
/*
|
||||
* Keep the internal content in the document encoding
|
||||
*/
|
||||
if ((ctxt->inputNr == 1) &&
|
||||
(ctxt->encoding == NULL) &&
|
||||
(ctxt->input->encoding != NULL)) {
|
||||
ctxt->encoding = xmlStrdup(ctxt->input->encoding);
|
||||
}
|
||||
return(0);
|
||||
case XML_CHAR_ENCODING_2022_JP:
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
@ -2563,6 +2401,150 @@ retry_larger:
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (handler == NULL)
|
||||
return(-1);
|
||||
return(xmlSwitchToEncoding(ctxt, handler));
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlSwitchToEncoding:
|
||||
* @ctxt: the parser context
|
||||
* @handler: the encoding handler
|
||||
*
|
||||
* change the input functions when discovering the character encoding
|
||||
* of a given entity.
|
||||
*
|
||||
* Returns 0 in case of success, -1 otherwise
|
||||
*/
|
||||
int
|
||||
xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
|
||||
{
|
||||
int nbchars;
|
||||
|
||||
if (handler != NULL) {
|
||||
if (ctxt->input != NULL) {
|
||||
if (ctxt->input->buf != NULL) {
|
||||
if (ctxt->input->buf->encoder != NULL) {
|
||||
if (ctxt->input->buf->encoder == handler)
|
||||
return(0);
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : encoder already regitered\n");
|
||||
return(-1);
|
||||
}
|
||||
ctxt->input->buf->encoder = handler;
|
||||
|
||||
/*
|
||||
* Is there already some content down the pipe to convert ?
|
||||
*/
|
||||
if ((ctxt->input->buf->buffer != NULL) &&
|
||||
(ctxt->input->buf->buffer->use > 0)) {
|
||||
int processed;
|
||||
|
||||
/*
|
||||
* Specific handling of the Byte Order Mark for
|
||||
* UTF-16
|
||||
*/
|
||||
if ((handler->name != NULL) &&
|
||||
(!strcmp(handler->name, "UTF-16LE")) &&
|
||||
(ctxt->input->cur[0] == 0xFF) &&
|
||||
(ctxt->input->cur[1] == 0xFE)) {
|
||||
ctxt->input->cur += 2;
|
||||
}
|
||||
if ((handler->name != NULL) &&
|
||||
(!strcmp(handler->name, "UTF-16BE")) &&
|
||||
(ctxt->input->cur[0] == 0xFE) &&
|
||||
(ctxt->input->cur[1] == 0xFF)) {
|
||||
ctxt->input->cur += 2;
|
||||
}
|
||||
|
||||
/*
|
||||
* Shring the current input buffer.
|
||||
* Move it as the raw buffer and create a new input buffer
|
||||
*/
|
||||
processed = ctxt->input->cur - ctxt->input->base;
|
||||
xmlBufferShrink(ctxt->input->buf->buffer, processed);
|
||||
ctxt->input->buf->raw = ctxt->input->buf->buffer;
|
||||
ctxt->input->buf->buffer = xmlBufferCreate();
|
||||
|
||||
/*
|
||||
* convert as much as possible of the raw input
|
||||
* to the parser reading buffer.
|
||||
*/
|
||||
nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
|
||||
ctxt->input->buf->buffer,
|
||||
ctxt->input->buf->raw);
|
||||
if (nbchars < 0) {
|
||||
fprintf(stderr, "xmlSwitchToEncoding: encoder error\n");
|
||||
return(-1);
|
||||
}
|
||||
ctxt->input->base =
|
||||
ctxt->input->cur = ctxt->input->buf->buffer->content;
|
||||
}
|
||||
return(0);
|
||||
} else {
|
||||
if (ctxt->input->length == 0) {
|
||||
/*
|
||||
* When parsing a static memory array one must know the
|
||||
* size to be able to convert the buffer.
|
||||
*/
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : no input\n");
|
||||
return(-1);
|
||||
} else {
|
||||
int processed;
|
||||
|
||||
/*
|
||||
* Shring the current input buffer.
|
||||
* Move it as the raw buffer and create a new input buffer
|
||||
*/
|
||||
processed = ctxt->input->cur - ctxt->input->base;
|
||||
ctxt->input->buf->raw = xmlBufferCreate();
|
||||
xmlBufferAdd(ctxt->input->buf->raw, ctxt->input->cur,
|
||||
ctxt->input->length - processed);
|
||||
ctxt->input->buf->buffer = xmlBufferCreate();
|
||||
|
||||
/*
|
||||
* convert as much as possible of the raw input
|
||||
* to the parser reading buffer.
|
||||
*/
|
||||
nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
|
||||
ctxt->input->buf->buffer,
|
||||
ctxt->input->buf->raw);
|
||||
if (nbchars < 0) {
|
||||
fprintf(stderr, "xmlSwitchToEncoding: encoder error\n");
|
||||
return(-1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Conversion succeeded, get rid of the old buffer
|
||||
*/
|
||||
if ((ctxt->input->free != NULL) &&
|
||||
(ctxt->input->base != NULL))
|
||||
ctxt->input->free((xmlChar *) ctxt->input->base);
|
||||
ctxt->input->base =
|
||||
ctxt->input->cur = ctxt->input->buf->buffer->content;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"xmlSwitchEncoding : no input\n");
|
||||
return(-1);
|
||||
}
|
||||
/*
|
||||
* The parsing is now done in UTF8 natively
|
||||
*/
|
||||
if (ctxt->encoding != NULL) {
|
||||
xmlFree((xmlChar *) ctxt->encoding);
|
||||
ctxt->encoding = NULL;
|
||||
}
|
||||
} else
|
||||
return(-1);
|
||||
return(0);
|
||||
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
* *
|
||||
@ -4253,7 +4235,7 @@ xmlParseExternalID(xmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) {
|
||||
void
|
||||
xmlParseComment(xmlParserCtxtPtr ctxt) {
|
||||
xmlChar *buf = NULL;
|
||||
int len = 0;
|
||||
int len;
|
||||
int size = XML_PARSER_BUFFER_SIZE;
|
||||
int q, ql;
|
||||
int r, rl;
|
||||
@ -4282,10 +4264,11 @@ xmlParseComment(xmlParserCtxtPtr ctxt) {
|
||||
r = CUR_CHAR(rl);
|
||||
NEXTL(rl);
|
||||
cur = CUR_CHAR(l);
|
||||
len = 0;
|
||||
while (IS_CHAR(cur) &&
|
||||
((cur != '>') ||
|
||||
(r != '-') || (q != '-'))) {
|
||||
if ((r == '-') && (q == '-')) {
|
||||
if ((r == '-') && (q == '-') && (len > 1)) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Comment must not contain '--' (double-hyphen)`\n");
|
||||
@ -4732,12 +4715,37 @@ xmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
|
||||
ctxt->disableSAX = 1;
|
||||
}
|
||||
if (URI) {
|
||||
xmlURIPtr uri;
|
||||
|
||||
uri = xmlParseURI((const char *) URI);
|
||||
if (uri == NULL) {
|
||||
if ((ctxt->sax != NULL) &&
|
||||
(!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL))
|
||||
(!ctxt->disableSAX) &&
|
||||
(ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Invalid URI: %s\n", URI);
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->errNo = XML_ERR_INVALID_URI;
|
||||
} else {
|
||||
if (uri->fragment != NULL) {
|
||||
if ((ctxt->sax != NULL) &&
|
||||
(!ctxt->disableSAX) &&
|
||||
(ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Fragment not allowed: %s\n", URI);
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->errNo = XML_ERR_URI_FRAGMENT;
|
||||
} else {
|
||||
if ((ctxt->sax != NULL) &&
|
||||
(!ctxt->disableSAX) &&
|
||||
(ctxt->sax->entityDecl != NULL))
|
||||
ctxt->sax->entityDecl(ctxt->userData, name,
|
||||
XML_EXTERNAL_PARAMETER_ENTITY,
|
||||
literal, URI, NULL);
|
||||
}
|
||||
xmlFreeURI(uri);
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if ((RAW == '"') || (RAW == '\'')) {
|
||||
@ -4757,6 +4765,31 @@ xmlParseEntityDecl(xmlParserCtxtPtr ctxt) {
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->disableSAX = 1;
|
||||
}
|
||||
if (URI) {
|
||||
xmlURIPtr uri;
|
||||
|
||||
uri = xmlParseURI((const char *)URI);
|
||||
if (uri == NULL) {
|
||||
if ((ctxt->sax != NULL) &&
|
||||
(!ctxt->disableSAX) &&
|
||||
(ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Invalid URI: %s\n", URI);
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->errNo = XML_ERR_INVALID_URI;
|
||||
} else {
|
||||
if (uri->fragment != NULL) {
|
||||
if ((ctxt->sax != NULL) &&
|
||||
(!ctxt->disableSAX) &&
|
||||
(ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Fragment not allowed: %s\n", URI);
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->errNo = XML_ERR_URI_FRAGMENT;
|
||||
}
|
||||
xmlFreeURI(uri);
|
||||
}
|
||||
}
|
||||
if ((RAW != '>') && (!IS_BLANK(CUR))) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
@ -5973,7 +6006,20 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
|
||||
/*
|
||||
* We know that '<?xml' is here.
|
||||
*/
|
||||
if ((RAW == '<') && (NXT(1) == '?') &&
|
||||
(NXT(2) == 'x') && (NXT(3) == 'm') &&
|
||||
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
|
||||
SKIP(5);
|
||||
} else {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
"Text declaration '<?xml' required\n");
|
||||
ctxt->errNo = XML_ERR_XMLDECL_NOT_STARTED;
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->disableSAX = 1;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (!IS_BLANK(CUR)) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
@ -6003,7 +6049,13 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->disableSAX = 1;
|
||||
}
|
||||
ctxt->input->encoding = xmlParseEncodingDecl(ctxt);
|
||||
xmlParseEncodingDecl(ctxt);
|
||||
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
||||
/*
|
||||
* The XML REC instructs us to stop parsing right here
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
SKIP_BLANKS;
|
||||
if ((RAW == '?') && (NXT(1) == '>')) {
|
||||
@ -6192,6 +6244,13 @@ xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID,
|
||||
(NXT(2) == 'x') && (NXT(3) == 'm') &&
|
||||
(NXT(4) == 'l')) {
|
||||
xmlParseTextDecl(ctxt);
|
||||
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
||||
/*
|
||||
* The XML REC instructs us to stop parsing right here
|
||||
*/
|
||||
ctxt->instate = XML_PARSER_EOF;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (ctxt->myDoc == NULL) {
|
||||
ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0");
|
||||
@ -6441,6 +6500,13 @@ xmlParseReference(xmlParserCtxtPtr ctxt) {
|
||||
(NXT(2) == 'x') && (NXT(3) == 'm') &&
|
||||
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
|
||||
xmlParseTextDecl(ctxt);
|
||||
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
||||
/*
|
||||
* The XML REC instructs us to stop parsing right here
|
||||
*/
|
||||
ctxt->instate = XML_PARSER_EOF;
|
||||
return;
|
||||
}
|
||||
if (input->standalone) {
|
||||
if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
|
||||
ctxt->sax->error(ctxt->userData,
|
||||
@ -6947,6 +7013,15 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt) {
|
||||
(NXT(2) == 'x') && (NXT(3) == 'm') &&
|
||||
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
|
||||
xmlParseTextDecl(ctxt);
|
||||
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
||||
/*
|
||||
* The XML REC instructs us to stop parsing
|
||||
* right here
|
||||
*/
|
||||
ctxt->instate = XML_PARSER_EOF;
|
||||
xmlFree(name);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (ctxt->token == 0)
|
||||
ctxt->token = ' ';
|
||||
@ -8197,6 +8272,38 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
|
||||
ctxt->disableSAX = 1;
|
||||
ctxt->errNo = XML_ERR_STRING_NOT_STARTED;
|
||||
}
|
||||
if (encoding != NULL) {
|
||||
xmlCharEncoding enc;
|
||||
xmlCharEncodingHandlerPtr handler;
|
||||
|
||||
if (ctxt->input->encoding != NULL)
|
||||
xmlFree((xmlChar *) ctxt->input->encoding);
|
||||
ctxt->input->encoding = encoding;
|
||||
|
||||
enc = xmlParseCharEncoding((const char *) encoding);
|
||||
/*
|
||||
* registered set of known encodings
|
||||
*/
|
||||
if (enc != XML_CHAR_ENCODING_ERROR) {
|
||||
xmlSwitchEncoding(ctxt, enc);
|
||||
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
||||
xmlFree(encoding);
|
||||
return(NULL);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* fallback for unknown encodings
|
||||
*/
|
||||
handler = xmlFindCharEncodingHandler((const char *) encoding);
|
||||
if (handler != NULL) {
|
||||
xmlSwitchToEncoding(ctxt, handler);
|
||||
} else {
|
||||
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
|
||||
xmlFree(encoding);
|
||||
return(NULL);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return(encoding);
|
||||
}
|
||||
@ -8362,7 +8469,13 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
|
||||
ctxt->wellFormed = 0;
|
||||
ctxt->disableSAX = 1;
|
||||
}
|
||||
ctxt->input->encoding = xmlParseEncodingDecl(ctxt);
|
||||
xmlParseEncodingDecl(ctxt);
|
||||
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
||||
/*
|
||||
* The XML REC instructs us to stop parsing right here
|
||||
*/
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* We may have the standalone status.
|
||||
@ -8489,12 +8602,19 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
|
||||
if ((RAW == '<') && (NXT(1) == '?') &&
|
||||
(NXT(2) == 'x') && (NXT(3) == 'm') &&
|
||||
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
|
||||
|
||||
/*
|
||||
* Note that we will switch encoding on the fly.
|
||||
*/
|
||||
xmlParseXMLDecl(ctxt);
|
||||
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
||||
/*
|
||||
* The XML REC instructs us to stop parsing right here
|
||||
*/
|
||||
return(-1);
|
||||
}
|
||||
ctxt->standalone = ctxt->input->standalone;
|
||||
SKIP_BLANKS;
|
||||
if ((ctxt->encoding == NULL) && (ctxt->input->encoding != NULL))
|
||||
ctxt->encoding = xmlStrdup(ctxt->input->encoding);
|
||||
|
||||
} else {
|
||||
ctxt->version = xmlCharStrdup(XML_DEFAULT_VERSION);
|
||||
}
|
||||
@ -8581,14 +8701,6 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
|
||||
(!ctxt->disableSAX))
|
||||
ctxt->sax->endDocument(ctxt->userData);
|
||||
|
||||
/*
|
||||
* Grab the encoding if it was added on-the-fly
|
||||
*/
|
||||
if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) &&
|
||||
(ctxt->myDoc->encoding == NULL)) {
|
||||
ctxt->myDoc->encoding = ctxt->encoding;
|
||||
ctxt->encoding = NULL;
|
||||
}
|
||||
if (! ctxt->wellFormed) return(-1);
|
||||
return(0);
|
||||
}
|
||||
@ -8805,6 +8917,14 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
|
||||
fprintf(stderr, "PP: Parsing XML Decl\n");
|
||||
#endif
|
||||
xmlParseXMLDecl(ctxt);
|
||||
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
|
||||
/*
|
||||
* The XML REC instructs us to stop parsing right
|
||||
* here
|
||||
*/
|
||||
ctxt->instate = XML_PARSER_EOF;
|
||||
return(0);
|
||||
}
|
||||
ctxt->standalone = ctxt->input->standalone;
|
||||
if ((ctxt->encoding == NULL) &&
|
||||
(ctxt->input->encoding != NULL))
|
||||
|
@ -28,10 +28,10 @@ extern "C" {
|
||||
* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF.
|
||||
*/
|
||||
#define IS_CHAR(c) \
|
||||
((((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \
|
||||
(((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) && \
|
||||
(((c) <= 0xD7FF) || ((c) >= 0xE000)) && ((c) >= 0) && \
|
||||
((c) <= 0x10FFFF))
|
||||
(((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \
|
||||
(((c) >= 0x20) && ((c) <= 0xD7FF)) || \
|
||||
(((c) >= 0xE000) && ((c) <= 0xFFFD)) || \
|
||||
(((c) >= 0x10000) && ((c) <= 0x10FFFF)))
|
||||
|
||||
/*
|
||||
* [3] S ::= (#x20 | #x9 | #xD | #xA)+
|
||||
@ -442,8 +442,10 @@ xmlParserCtxtPtr xmlNewParserCtxt (void);
|
||||
xmlParserCtxtPtr xmlCreateEntityParserCtxt(const xmlChar *URL,
|
||||
const xmlChar *ID,
|
||||
const xmlChar *base);
|
||||
void xmlSwitchEncoding (xmlParserCtxtPtr ctxt,
|
||||
int xmlSwitchEncoding (xmlParserCtxtPtr ctxt,
|
||||
xmlCharEncoding enc);
|
||||
int xmlSwitchToEncoding (xmlParserCtxtPtr ctxt,
|
||||
xmlCharEncodingHandlerPtr handler);
|
||||
void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt);
|
||||
|
||||
/**
|
||||
|
25
tree.c
25
tree.c
@ -3771,6 +3771,31 @@ xmlBufferShrink(xmlBufferPtr buf, int len) {
|
||||
return(len);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlBufferGrow:
|
||||
* @buf: the buffer
|
||||
* @len: the minimum free sie to allocate
|
||||
*
|
||||
* Grow the available space of an XML buffer.
|
||||
*
|
||||
* Returns the new available space or -1 in case of error
|
||||
*/
|
||||
int
|
||||
xmlBufferGrow(xmlBufferPtr buf, int len) {
|
||||
int size;
|
||||
xmlChar *newbuf;
|
||||
|
||||
if (len <= buf->use) return(0);
|
||||
|
||||
size = buf->size + buf->use + len + 100;
|
||||
|
||||
newbuf = xmlRealloc(buf->content, size);
|
||||
if (newbuf == NULL) return(-1);
|
||||
buf->content = newbuf;
|
||||
buf->size = size;
|
||||
return(buf->size - buf->use);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlBufferDump:
|
||||
* @file: the file output
|
||||
|
2
tree.h
2
tree.h
@ -380,6 +380,8 @@ void xmlBufferCCat (xmlBufferPtr buf,
|
||||
const char *str);
|
||||
int xmlBufferShrink (xmlBufferPtr buf,
|
||||
int len);
|
||||
int xmlBufferGrow (xmlBufferPtr buf,
|
||||
int len);
|
||||
void xmlBufferEmpty (xmlBufferPtr buf);
|
||||
const xmlChar* xmlBufferContent (const xmlBufferPtr buf);
|
||||
int xmlBufferUse (const xmlBufferPtr buf);
|
||||
|
28
uri.c
28
uri.c
@ -1283,6 +1283,34 @@ xmlParseURIReference(xmlURIPtr uri, const char *str) {
|
||||
return(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlParseURI:
|
||||
* @str: the URI string to analyze
|
||||
*
|
||||
* Parse an URI
|
||||
*
|
||||
* URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ]
|
||||
*
|
||||
* Returns a newly build xmlURIPtr or NULL in case of error
|
||||
*/
|
||||
xmlURIPtr
|
||||
xmlParseURI(const char *str) {
|
||||
xmlURIPtr uri;
|
||||
int ret;
|
||||
|
||||
if (str == NULL)
|
||||
return(NULL);
|
||||
uri = xmlCreateURI();
|
||||
if (uri != NULL) {
|
||||
ret = xmlParseURIReference(uri, str);
|
||||
if (ret) {
|
||||
xmlFreeURI(uri);
|
||||
return(NULL);
|
||||
}
|
||||
}
|
||||
return(uri);
|
||||
}
|
||||
|
||||
/**
|
||||
* xmlNormalizeURIPath:
|
||||
* @path: pointer to the path string
|
||||
|
@ -130,7 +130,9 @@ typedef enum {
|
||||
XML_ERR_ENTITY_CHAR_ERROR, /* 88 */
|
||||
XML_ERR_ENTITY_PE_INTERNAL, /* 88 */
|
||||
XML_ERR_ENTITY_LOOP, /* 89 */
|
||||
XML_ERR_ENTITY_BOUNDARY /* 90 */
|
||||
XML_ERR_ENTITY_BOUNDARY, /* 90 */
|
||||
XML_ERR_INVALID_URI, /* 91 */
|
||||
XML_ERR_URI_FRAGMENT /* 92 */
|
||||
}xmlParserErrors;
|
||||
|
||||
void xmlParserError (void *ctx,
|
||||
|
91
xmlIO.c
91
xmlIO.c
@ -498,6 +498,10 @@ xmlAllocParserInputBuffer(xmlCharEncoding enc) {
|
||||
}
|
||||
ret->buffer->alloc = XML_BUFFER_ALLOC_DOUBLEIT;
|
||||
ret->encoder = xmlGetCharEncodingHandler(enc);
|
||||
if (ret->encoder != NULL)
|
||||
ret->raw = xmlBufferCreate();
|
||||
else
|
||||
ret->raw = NULL;
|
||||
ret->readcallback = NULL;
|
||||
ret->closecallback = NULL;
|
||||
ret->context = NULL;
|
||||
@ -513,13 +517,20 @@ xmlAllocParserInputBuffer(xmlCharEncoding enc) {
|
||||
*/
|
||||
void
|
||||
xmlFreeParserInputBuffer(xmlParserInputBufferPtr in) {
|
||||
if (in->buffer != NULL) {
|
||||
xmlBufferFree(in->buffer);
|
||||
in->buffer = NULL;
|
||||
if (in->raw) {
|
||||
xmlBufferFree(in->raw);
|
||||
in->raw = NULL;
|
||||
}
|
||||
if (in->encoder != NULL) {
|
||||
xmlCharEncCloseFunc(in->encoder);
|
||||
}
|
||||
if (in->closecallback != NULL) {
|
||||
in->closecallback(in->context);
|
||||
}
|
||||
if (in->buffer != NULL) {
|
||||
xmlBufferFree(in->buffer);
|
||||
in->buffer = NULL;
|
||||
}
|
||||
|
||||
memset(in, 0xbe, (size_t) sizeof(xmlParserInputBuffer));
|
||||
xmlFree(in);
|
||||
@ -683,34 +694,22 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, const char *buf) {
|
||||
|
||||
if (len < 0) return(0);
|
||||
if (in->encoder != NULL) {
|
||||
xmlChar *buffer;
|
||||
int processed = len;
|
||||
|
||||
buffer = (xmlChar *) xmlMalloc((len + 1) * 2 * sizeof(xmlChar));
|
||||
if (buffer == NULL) {
|
||||
fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n");
|
||||
return(-1);
|
||||
}
|
||||
nbchars = in->encoder->input(buffer, (len + 1) * 2 * sizeof(xmlChar),
|
||||
(xmlChar *) buf, &processed);
|
||||
/*
|
||||
* TODO : we really need to have something atomic or the
|
||||
* encoder must report the number of bytes read
|
||||
* Store the data in the incoming raw buffer
|
||||
*/
|
||||
if (in->raw == NULL) {
|
||||
in->raw = xmlBufferCreate();
|
||||
}
|
||||
xmlBufferAdd(in->raw, (const xmlChar *) buf, len);
|
||||
|
||||
/*
|
||||
* convert as much as possible to the parser reading buffer.
|
||||
*/
|
||||
nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
|
||||
if (nbchars < 0) {
|
||||
fprintf(stderr, "xmlParserInputBufferPush: encoder error\n");
|
||||
xmlFree(buffer);
|
||||
return(-1);
|
||||
}
|
||||
if (processed != len) {
|
||||
fprintf(stderr,
|
||||
"TODO xmlParserInputBufferPush: processed != len\n");
|
||||
xmlFree(buffer);
|
||||
return(-1);
|
||||
}
|
||||
buffer[nbchars] = 0;
|
||||
xmlBufferAdd(in->buffer, (xmlChar *) buffer, nbchars);
|
||||
xmlFree(buffer);
|
||||
} else {
|
||||
nbchars = len;
|
||||
xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars);
|
||||
@ -730,7 +729,9 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, const char *buf) {
|
||||
* Grow up the content of the input buffer, the old data are preserved
|
||||
* This routine handle the I18N transcoding to internal UTF-8
|
||||
* This routine is used when operating the parser in normal (pull) mode
|
||||
* TODO: one should be able to remove one extra copy
|
||||
*
|
||||
* TODO: one should be able to remove one extra copy by copying directly
|
||||
* onto in->buffer or in->raw
|
||||
*
|
||||
* Returns the number of chars read and stored in the buffer, or -1
|
||||
* in case of error.
|
||||
@ -779,34 +780,22 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) {
|
||||
return(-1);
|
||||
}
|
||||
if (in->encoder != NULL) {
|
||||
xmlChar *buf;
|
||||
int wrote = res;
|
||||
/*
|
||||
* Store the data in the incoming raw buffer
|
||||
*/
|
||||
if (in->raw == NULL) {
|
||||
in->raw = xmlBufferCreate();
|
||||
}
|
||||
xmlBufferAdd(in->raw, (const xmlChar *) buffer, len);
|
||||
|
||||
buf = (xmlChar *) xmlMalloc((res + 1) * 2 * sizeof(xmlChar));
|
||||
if (buf == NULL) {
|
||||
fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n");
|
||||
xmlFree(buffer);
|
||||
/*
|
||||
* convert as much as possible to the parser reading buffer.
|
||||
*/
|
||||
nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
|
||||
if (nbchars < 0) {
|
||||
fprintf(stderr, "xmlParserInputBufferGrow: encoder error\n");
|
||||
return(-1);
|
||||
}
|
||||
nbchars = in->encoder->input(buf, (res + 1) * 2 * sizeof(xmlChar),
|
||||
BAD_CAST buffer, &wrote);
|
||||
buf[nbchars] = 0;
|
||||
xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars);
|
||||
xmlFree(buf);
|
||||
|
||||
/*
|
||||
* Check that the encoder was able to process the full input
|
||||
*/
|
||||
if (wrote != res) {
|
||||
fprintf(stderr,
|
||||
"TODO : xmlParserInputBufferGrow wrote %d != res %d\n",
|
||||
wrote, res);
|
||||
/*
|
||||
* TODO !!!
|
||||
* Need to keep the unprocessed input in a buffer in->unprocessed
|
||||
*/
|
||||
}
|
||||
|
||||
} else {
|
||||
nbchars = res;
|
||||
buffer[nbchars] = 0;
|
||||
|
Reference in New Issue
Block a user