From 496a1cf59284292275cc5643e6078748dc79340e Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Wed, 3 May 2000 14:20:55 +0000 Subject: [PATCH] revamped the encoding support, added iconv support, so now libxml if * encoding.[ch], xmlIO.[ch], parser.c, configure.in : revamped the encoding support, added iconv support, so now libxml if compiled with iconv automatically support japanese encodings among others. Work based on initial patch from Yuan-Chen Cheng I may have broken binary compat in the encoding handler registration scheme, but that was so utterly broken I don't expect anybody to have used this feature until now. * parserInternals.h: fixup on the CHAR range macro * xml-error.h, parser.c: catch URL/URI errors using the uri.c code. * tree.[ch]: added xmlBufferGrow(), was needed for iconv * uri.c: added xmlParseURI() I can't believe I forgot to implement this one in 2.0 !!! * SAX.c: moved doc->encoding update in the endDocument() call. * TODO: updated. Iconv rules :-) Daniel --- ChangeLog | 18 + SAX.c | 9 + TODO | 9 +- configure.in | 16 +- encoding.c | 728 +++++++++++++++++++++++-------- encoding.h | 47 +- include/libxml/encoding.h | 47 +- include/libxml/parserInternals.h | 12 +- include/libxml/tree.h | 2 + include/libxml/xmlIO.h | 1 + parser.c | 600 +++++++++++++++---------- parserInternals.h | 12 +- tree.c | 25 ++ tree.h | 2 + uri.c | 28 ++ xml-error.h | 4 +- xmlIO.c | 89 ++-- xmlIO.h | 1 + 18 files changed, 1163 insertions(+), 487 deletions(-) diff --git a/ChangeLog b/ChangeLog index 5ccb9e5e..5361b29c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +Wed May 3 14:21:25 CEST 2000 Daniel Veillard + + * encoding.[ch], xmlIO.[ch], parser.c, configure.in : revamped + the encoding support, added iconv support, so now libxml if + compiled with iconv automatically support japanese encodings + among others. 
Work based on initial patch from Yuan-Chen Cheng + I may have broken binary compat in the encoding handler + registration scheme, but that was so utterly broken I don't + expect anybody to have used this feature until now. + * parserInternals.h: fixup on the CHAR range macro + * xml-error.h, parser.c: catch URL/URI errors using the uri.c + code. + * tree.[ch]: added xmlBufferGrow(), was needed for iconv + * uri.c: added xmlParseURI() I can't believe I forgot to + implement this one in 2.0 !!! + * SAX.c: moved doc->encoding update in the endDocument() call. + * TODO: updated. + Mon Apr 24 13:30:13 CEST 2000 Daniel Veillard * tree.h: removed extraneous xmlRemoveProp definition diff --git a/SAX.c b/SAX.c index 5293df72..dace3058 100644 --- a/SAX.c +++ b/SAX.c @@ -595,6 +595,15 @@ endDocument(void *ctx) if (ctxt->validate && ctxt->wellFormed && ctxt->myDoc && ctxt->myDoc->intSubset) ctxt->valid &= xmlValidateDocumentFinal(&ctxt->vctxt, ctxt->myDoc); + + /* + * Grab the encoding if it was added on-the-fly + */ + if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) && + (ctxt->myDoc->encoding == NULL)) { + ctxt->myDoc->encoding = ctxt->encoding; + ctxt->encoding = NULL; + } } /** diff --git a/TODO b/TODO index 51ea18b5..2b4ae5ee 100644 --- a/TODO +++ b/TODO @@ -6,6 +6,8 @@ TODO: ===== +- xmlSwitchToEncoding() need a rewrite for correct handling of conversion + error code conditions. - DOM needs xmlAttrPtr xmlNewDocProp(xmlDocPtr doc, const xmlChar *name, const xmlChar *value) int xmlPruneProp(xmlNodePtr node, xmlAtttrPtr attr); @@ -14,7 +16,6 @@ TODO: - add support for the trick from Henry conf/sun/valid/empty.xml - Correct standalone checking/emitting (hard) 2.9 Standalone Document Declaration -- URI checkings (no fragments) rfc2396.txt - Better checking of external parsed entities TAG 1234 - Find way of representing PERefs in the Dtd so that %entity; can be saved back. @@ -22,6 +23,7 @@ TODO: http://www.w3.org/XML/xml-19980210-errata ... 
bummmer - Handle undefined namespaces in entity contents better ... at least issue a warning +- Issue warning when using non-absolute namespaces URI. - General checking of DTD validation in presence of namespaces ... hairy - fix --disable-corba configure switch handling, and use XML_WITHOUT_CORBA not WITHOUT_CORBA flag @@ -30,7 +32,7 @@ TODO: ===== - Get OASIS testsuite to a more friendly result, check all the results - once stable. + once stable. Current state at: http://xmlsoft.org/conf/result.html - Optimization of tag strings allocation ? @@ -55,11 +57,13 @@ EXTENSIONS: - Add Xlink recognition/API => started adding an xlink.[ch] with a unified API for XML and HTML. + it's crap :-( - Implement XSLT => seems that someone volunteered ?!? - Implement XSchemas + => Really need to be done - O2K parsing; => this is a somewhat ugly mix of HTML and XML, adding a specific @@ -88,6 +92,7 @@ EXTENSIONS: Done: ===== +- URI checkings (no fragments) rfc2396.txt - Added a clean mechanism for overload or added input methods: xmlRegisterInputCallbacks() - dynamically adapt the alloc entry point to use g_alloc()/g_free() diff --git a/configure.in b/configure.in index baea9330..3ef84fb7 100644 --- a/configure.in +++ b/configure.in @@ -4,7 +4,7 @@ AC_INIT(entities.h) AM_CONFIG_HEADER(config.h) LIBXML_MAJOR_VERSION=2 -LIBXML_MINOR_VERSION=0 +LIBXML_MINOR_VERSION=1 LIBXML_MICRO_VERSION=0 LIBXML_VERSION=$LIBXML_MAJOR_VERSION.$LIBXML_MINOR_VERSION.$LIBXML_MICRO_VERSION LIBXML_VERSION_INFO=`expr $LIBXML_MAJOR_VERSION + $LIBXML_MINOR_VERSION`:$LIBXML_MICRO_VERSION:$LIBXML_MINOR_VERSION @@ -203,6 +203,20 @@ fi AC_SUBST(WITH_XPATH) AC_SUBST(XPATH_OBJ) +AC_ARG_WITH(iconv, [ --with-iconv Add the ICONV support (on)]) +if test "$with_iconv" = "no" ; then + echo Disabling ICONV support + WITH_ICONV=0 +else + if test "$have_iconv" != "" ; then + echo Iconv support not found + WITH_ICONV=0 + else + WITH_ICONV=1 + fi +fi +AC_SUBST(WITH_ICONV) + AC_ARG_WITH(debug, [ --with-debug Add the debugging 
module (on)]) if test "$with_debug" = "no" ; then echo Disabling DEBUG support diff --git a/encoding.c b/encoding.c index 42009291..1a4c157f 100644 --- a/encoding.c +++ b/encoding.c @@ -34,12 +34,26 @@ #ifdef HAVE_STDLIB_H #include #endif +#include +#ifdef LIBXML_ICONV_ENABLED +#ifdef HAVE_ERRNO_H +#include +#endif +#endif #include #include xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; +#ifdef LIBXML_ICONV_ENABLED +#if 0 +#define DEBUG_ENCODING /* Define this to get encoding traces */ +#endif +#endif + +static int xmlLittleEndian = 1; + /* * From rfc2044: encoding of the Unicode values on UTF-8: * @@ -104,30 +118,38 @@ xmlCheckUTF8(const unsigned char *utf) * * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 * block of chars out. - * Returns the number of byte written, or -1 by lack of space. + * Returns 0 if success, or -1 otherwise + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictiable. + * The value of @outlen after return is the number of ocetes consumed. 
*/ int -isolat1ToUTF8(unsigned char* out, int outlen, +isolat1ToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { - unsigned char* outstart= out; - unsigned char* outend= out+outlen; - const unsigned char* inend= in+*inlen; + unsigned char* outstart = out; + const unsigned char* processed = in; + unsigned char* outend = out + *outlen; + const unsigned char* inend = in + *inlen; unsigned char c; while (in < inend) { c= *in++; if (c < 0x80) { - if (out >= outend) return(-1); + if (out >= outend) + break; *out++ = c; } else { - if (out >= outend) return(-1); + if (out + 1 >= outend) break; *out++ = 0xC0 | (c >> 6); - if (out >= outend) return(-1); *out++ = 0x80 | (0x3F & c); } + processed = in; } - return(out-outstart); + *outlen = out - outstart; + *inlen = processed - in; + + return(0); } /** @@ -141,18 +163,18 @@ isolat1ToUTF8(unsigned char* out, int outlen, * block of chars out. * TODO: UTF8Toisolat1 need a fallback mechanism ... * - * Returns the number of byte written, or -1 by lack of space, or -2 - * if the transcoding fails (for *in is not valid utf8 string or - * the result of transformation can't fit into the encoding we want) + * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise * The value of @inlen after return is the number of octets consumed * as the return value is positive, else unpredictiable. + * The value of @outlen after return is the number of ocetes consumed. 
*/ int -UTF8Toisolat1(unsigned char* out, int outlen, +UTF8Toisolat1(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { - unsigned char* outstart= out; - unsigned char* outend= out+outlen; - const unsigned char* inend= in+*inlen; + unsigned char* outstart = out; + const unsigned char* processed = in; + unsigned char* outend = out + *outlen; + const unsigned char* inend = in + *inlen; unsigned char c; while (in < inend) { @@ -162,18 +184,22 @@ UTF8Toisolat1(unsigned char* out, int outlen, *out++= c; } else if (in == inend) { - *inlen -= 1; break; } else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) { /* a two byte utf-8 and can be encoding as isolate1 */ *out++= ((c & 0x03) << 6) | (*in++ & 0x3F); } - else + else { + *outlen = out - outstart; + *inlen = processed - in; return(-2); - /* TODO : some should be represent as "&#x____;" */ + } + processed = in; } - return(out-outstart); + *outlen = out - outstart; + *inlen = processed - in; + return(0); } /** @@ -194,11 +220,12 @@ UTF8Toisolat1(unsigned char* out, int outlen, * as the return value is positive, else unpredictiable. 
*/ int -UTF16LEToUTF8(unsigned char* out, int outlen, +UTF16LEToUTF8(unsigned char* out, int *outlen, const unsigned char* inb, int *inlenb) { - unsigned char* outstart= out; - unsigned char* outend= out+outlen; + unsigned char* outstart = out; + const unsigned char* processed = inb; + unsigned char* outend = out + *outlen; unsigned short* in = (unsigned short*) inb; unsigned short* inend; unsigned int c, d, inlen; @@ -208,42 +235,44 @@ UTF16LEToUTF8(unsigned char* out, int outlen, if ((*inlenb % 2) == 1) (*inlenb)--; inlen = *inlenb / 2; - inend= in + inlen; + inend = in + inlen; while (in < inend) { -#ifdef BIG_ENDIAN - tmp = (unsigned char *) in; - c = *tmp++; - c = c | (((unsigned int)*tmp) << 8); - in++; -#else /* BIG_ENDIAN */ - c= *in++; -#endif /* BIG_ENDIAN */ + if (xmlLittleEndian) { + c= *in++; + } else { + tmp = (unsigned char *) in; + c = *tmp++; + c = c | (((unsigned int)*tmp) << 8); + in++; + } if ((c & 0xFC00) == 0xD800) { /* surrogates */ if (in >= inend) { /* (in > inend) shouldn't happens */ - (*inlenb) -= 2; break; } -#ifdef BIG_ENDIAN - tmp = (unsigned char *) in; - d = *tmp++; - d = d | (((unsigned int)*tmp) << 8); - in++; -#else /* BIG_ENDIAN */ - d = *in++; -#endif /* BIG_ENDIAN */ + if (xmlLittleEndian) { + d = *in++; + } else { + tmp = (unsigned char *) in; + d = *tmp++; + d = d | (((unsigned int)*tmp) << 8); + in++; + } if ((d & 0xFC00) == 0xDC00) { c &= 0x03FF; c <<= 10; c |= d & 0x03FF; c += 0x10000; } - else + else { + *outlen = out - outstart; + *inlenb = processed - inb; return(-2); + } } /* assertion: c is a single UTF-4 value */ if (out >= outend) - return(-1); + break; if (c < 0x80) { *out++= c; bits= -6; } else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; } else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; } @@ -251,11 +280,14 @@ UTF16LEToUTF8(unsigned char* out, int outlen, for ( ; bits >= 0; bits-= 6) { if (out >= outend) - return(-1); + break; *out++= ((c >> bits) & 0x3F) | 0x80; } + processed 
= (const unsigned char*) in; } - return(out-outstart); + *outlen = out - outstart; + *inlenb = processed - inb; + return(0); } /** @@ -273,40 +305,44 @@ UTF16LEToUTF8(unsigned char* out, int outlen, * if the transcoding failed. */ int -UTF8ToUTF16LE(unsigned char* outb, int outlen, +UTF8ToUTF16LE(unsigned char* outb, int *outlen, const unsigned char* in, int *inlen) { unsigned short* out = (unsigned short*) outb; + const unsigned char* processed = in; unsigned short* outstart= out; unsigned short* outend; const unsigned char* inend= in+*inlen; unsigned int c, d, trailing; -#ifdef BIG_ENDIAN unsigned char *tmp; unsigned short tmp1, tmp2; -#endif /* BIG_ENDIAN */ - outlen /= 2; /* convert in short length */ - outend = out + outlen; + outend = out + (*outlen / 2); while (in < inend) { d= *in++; if (d < 0x80) { c= d; trailing= 0; } - else if (d < 0xC0) - return(-2); /* trailing byte in leading position */ - else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = processed - in; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } else if (d < 0xF8) { c= d & 0x07; trailing= 3; } - else - return(-2); /* no chance for this in UTF-16 */ + else { + /* no chance for this in UTF-16 */ + *outlen = out - outstart; + *inlen = processed - in; + return(-2); + } if (inend - in < trailing) { - *inlen -= (inend - in); break; } for ( ; trailing; trailing--) { if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) - return(-1); + break; c <<= 6; c |= d & 0x3F; } @@ -314,41 +350,44 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen, /* assertion: c is a single UTF-4 value */ if (c < 0x10000) { if (out >= outend) - return(-1); -#ifdef BIG_ENDIAN - tmp = (unsigned char *) out; - *tmp = c ; - *(tmp + 1) = c >> 8 ; - out++; -#else /* BIG_ENDIAN */ - *out++ = c; -#endif /* BIG_ENDIAN */ + break; + if (xmlLittleEndian) { + *out++ = c; + } 
else { + tmp = (unsigned char *) out; + *tmp = c ; + *(tmp + 1) = c >> 8 ; + out++; + } } else if (c < 0x110000) { if (out+1 >= outend) - return(-1); + break; c -= 0x10000; -#ifdef BIG_ENDIAN - tmp1 = 0xD800 | (c >> 10); - tmp = (unsigned char *) out; - *tmp = tmp1; - *(tmp + 1) = tmp1 >> 8; - out++; + if (xmlLittleEndian) { + *out++ = 0xD800 | (c >> 10); + *out++ = 0xDC00 | (c & 0x03FF); + } else { + tmp1 = 0xD800 | (c >> 10); + tmp = (unsigned char *) out; + *tmp = tmp1; + *(tmp + 1) = tmp1 >> 8; + out++; - tmp2 = 0xDC00 | (c & 0x03FF); - tmp = (unsigned char *) out; - *tmp = tmp2; - *(tmp + 1) = tmp2 >> 8; - out++; -#else /* BIG_ENDIAN */ - *out++ = 0xD800 | (c >> 10); - *out++ = 0xDC00 | (c & 0x03FF); -#endif /* BIG_ENDIAN */ + tmp2 = 0xDC00 | (c & 0x03FF); + tmp = (unsigned char *) out; + *tmp = tmp2; + *(tmp + 1) = tmp2 >> 8; + out++; + } } else - return(-1); + break; + processed = in; } - return(out-outstart); + *outlen = out - outstart; + *inlen = processed - in; + return(0); } /** @@ -369,18 +408,16 @@ UTF8ToUTF16LE(unsigned char* outb, int outlen, * as the return value is positive, else unpredictiable. 
*/ int -UTF16BEToUTF8(unsigned char* out, int outlen, +UTF16BEToUTF8(unsigned char* out, int *outlen, const unsigned char* inb, int *inlenb) { - unsigned char* outstart= out; - unsigned char* outend= out+outlen; + unsigned char* outstart = out; + const unsigned char* processed = inb; + unsigned char* outend = out + *outlen; unsigned short* in = (unsigned short*) inb; unsigned short* inend; unsigned int c, d, inlen; -#ifdef BIG_ENDIAN -#else /* BIG_ENDIAN */ unsigned char *tmp; -#endif /* BIG_ENDIAN */ int bits; if ((*inlenb % 2) == 1) @@ -388,43 +425,46 @@ UTF16BEToUTF8(unsigned char* out, int outlen, inlen = *inlenb / 2; inend= in + inlen; while (in < inend) { -#ifdef BIG_ENDIAN - c= *in++; -#else - tmp = (unsigned char *) in; - c = *tmp++; - c = c << 8; - c = c | (unsigned int) *tmp; - in++; -#endif + if (xmlLittleEndian) { + tmp = (unsigned char *) in; + c = *tmp++; + c = c << 8; + c = c | (unsigned int) *tmp; + in++; + } else { + c= *in++; + } if ((c & 0xFC00) == 0xD800) { /* surrogates */ if (in >= inend) { /* (in > inend) shouldn't happens */ - (*inlenb) -= 2; - break; + *outlen = out - outstart; + *inlenb = processed - inb; + return(-2); + } + if (xmlLittleEndian) { + tmp = (unsigned char *) in; + d = *tmp++; + d = d << 8; + d = d | (unsigned int) *tmp; + in++; + } else { + d= *in++; } - -#ifdef BIG_ENDIAN - d= *in++; -#else - tmp = (unsigned char *) in; - d = *tmp++; - d = d << 8; - d = d | (unsigned int) *tmp; - in++; -#endif if ((d & 0xFC00) == 0xDC00) { c &= 0x03FF; c <<= 10; c |= d & 0x03FF; c += 0x10000; } - else + else { + *outlen = out - outstart; + *inlenb = processed - inb; return(-2); + } } /* assertion: c is a single UTF-4 value */ if (out >= outend) - return(-1); + break; if (c < 0x80) { *out++= c; bits= -6; } else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; } else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; } @@ -432,11 +472,14 @@ UTF16BEToUTF8(unsigned char* out, int outlen, for ( ; bits >= 0; bits-= 6) { if 
(out >= outend) - return(-1); + break; *out++= ((c >> bits) & 0x3F) | 0x80; } + processed = (const unsigned char*) in; } - return(out-outstart); + *outlen = out - outstart; + *inlenb = processed - inb; + return(0); } /** @@ -454,79 +497,86 @@ UTF16BEToUTF8(unsigned char* out, int outlen, * if the transcoding failed. */ int -UTF8ToUTF16BE(unsigned char* outb, int outlen, +UTF8ToUTF16BE(unsigned char* outb, int *outlen, const unsigned char* in, int *inlen) { unsigned short* out = (unsigned short*) outb; + const unsigned char* processed = in; unsigned short* outstart= out; unsigned short* outend; const unsigned char* inend= in+*inlen; unsigned int c, d, trailing; -#ifdef BIG_ENDIAN -#else unsigned char *tmp; unsigned short tmp1, tmp2; -#endif /* BIG_ENDIAN */ - outlen /= 2; /* convert in short length */ - outend = out + outlen; + outend = out + (*outlen / 2); while (in < inend) { d= *in++; if (d < 0x80) { c= d; trailing= 0; } - else if (d < 0xC0) - return(-2); /* trailing byte in leading position */ - else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } + else if (d < 0xC0) { + /* trailing byte in leading position */ + *outlen = out - outstart; + *inlen = processed - in; + return(-2); + } else if (d < 0xE0) { c= d & 0x1F; trailing= 1; } else if (d < 0xF0) { c= d & 0x0F; trailing= 2; } else if (d < 0xF8) { c= d & 0x07; trailing= 3; } - else - return(-2); /* no chance for this in UTF-16 */ + else { + /* no chance for this in UTF-16 */ + *outlen = out - outstart; + *inlen = processed - in; + return(-2); + } if (inend - in < trailing) { - *inlen -= (inend - in); break; } for ( ; trailing; trailing--) { - if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) return(-1); + if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) break; c <<= 6; c |= d & 0x3F; } /* assertion: c is a single UTF-4 value */ if (c < 0x10000) { - if (out >= outend) return(-1); -#ifdef BIG_ENDIAN - *out++ = c; -#else - tmp = (unsigned char *) out; - *tmp = c >> 8; - *(tmp + 1) = c; - out++; -#endif /* 
BIG_ENDIAN */ + if (out >= outend) break; + if (xmlLittleEndian) { + tmp = (unsigned char *) out; + *tmp = c >> 8; + *(tmp + 1) = c; + out++; + } else { + *out++ = c; + } } else if (c < 0x110000) { - if (out+1 >= outend) return(-1); + if (out+1 >= outend) break; c -= 0x10000; -#ifdef BIG_ENDIAN - *out++ = 0xD800 | (c >> 10); - *out++ = 0xDC00 | (c & 0x03FF); -#else - tmp1 = 0xD800 | (c >> 10); - tmp = (unsigned char *) out; - *tmp = tmp1 >> 8; - *(tmp + 1) = tmp1; - out++; + if (xmlLittleEndian) { + tmp1 = 0xD800 | (c >> 10); + tmp = (unsigned char *) out; + *tmp = tmp1 >> 8; + *(tmp + 1) = tmp1; + out++; - tmp2 = 0xDC00 | (c & 0x03FF); - tmp = (unsigned char *) out; - *tmp = tmp2 >> 8; - *(tmp + 1) = tmp2; - out++; -#endif + tmp2 = 0xDC00 | (c & 0x03FF); + tmp = (unsigned char *) out; + *tmp = tmp2 >> 8; + *(tmp + 1) = tmp2; + out++; + } else { + *out++ = 0xD800 | (c >> 10); + *out++ = 0xDC00 | (c & 0x03FF); + } } - else return(-1); + else + break; + processed = in; } - return(out-outstart); + *outlen = out - outstart; + *inlen = processed - in; + return(0); } /** @@ -636,8 +686,12 @@ xmlParseCharEncoding(const char* name) if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9); if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP); - if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS); + if (!strcmp(upper, "SHIFT_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS); if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP); + +#ifdef DEBUG_ENCODING + fprintf(stderr, "Unknown encoding %s\n", name); +#endif return(XML_CHAR_ENCODING_ERROR); } @@ -712,6 +766,9 @@ xmlNewCharEncodingHandler(const char *name, * registers and returns the handler. 
*/ xmlRegisterCharEncodingHandler(handler); +#ifdef DEBUG_ENCODING + fprintf(stderr, "Registered encoding handler for %s\n", name); +#endif return(handler); } @@ -725,11 +782,18 @@ xmlNewCharEncodingHandler(const char *name, */ void xmlInitCharEncodingHandlers(void) { + unsigned short int tst = 0x1234; + unsigned char *ptr = (unsigned char *) &tst; + if (handlers != NULL) return; handlers = (xmlCharEncodingHandlerPtr *) xmlMalloc(MAX_ENCODING_HANDLERS * sizeof(xmlCharEncodingHandlerPtr)); + if (*ptr == 0x12) xmlLittleEndian = 0; + else if (*ptr == 0x34) xmlLittleEndian = 1; + else fprintf(stderr, "Odd problem at endianness detection\n"); + if (handlers == NULL) { fprintf(stderr, "xmlInitCharEncodingHandlers : out of memory !\n"); return; @@ -755,7 +819,8 @@ xmlCleanupCharEncodingHandlers(void) { for (;nbCharEncodingHandler > 0;) { nbCharEncodingHandler--; if (handlers[nbCharEncodingHandler] != NULL) { - xmlFree(handlers[nbCharEncodingHandler]->name); + if (handlers[nbCharEncodingHandler]->name != NULL) + xmlFree(handlers[nbCharEncodingHandler]->name); xmlFree(handlers[nbCharEncodingHandler]); } } @@ -798,6 +863,8 @@ xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) { */ xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc) { + xmlCharEncodingHandlerPtr handler; + if (handlers == NULL) xmlInitCharEncodingHandlers(); switch (enc) { case XML_CHAR_ENCODING_ERROR: @@ -811,40 +878,68 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) { case XML_CHAR_ENCODING_UTF16BE: return(xmlUTF16BEHandler); case XML_CHAR_ENCODING_EBCDIC: - return(NULL); + handler = xmlFindCharEncodingHandler("EBCDIC"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("ebcdic"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_UCS4LE: - return(NULL); + handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS-4"); + if (handler != 
NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS4"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_UCS4BE: - return(NULL); + handler = xmlFindCharEncodingHandler("UCS4BE"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_UCS4_2143: - return(NULL); + break; case XML_CHAR_ENCODING_UCS4_3412: - return(NULL); + break; case XML_CHAR_ENCODING_UCS2: - return(NULL); + handler = xmlFindCharEncodingHandler("ISO-10646-UCS-2"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS-2"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("UCS2"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_8859_1: - return(NULL); case XML_CHAR_ENCODING_8859_2: - return(NULL); case XML_CHAR_ENCODING_8859_3: - return(NULL); case XML_CHAR_ENCODING_8859_4: - return(NULL); case XML_CHAR_ENCODING_8859_5: - return(NULL); case XML_CHAR_ENCODING_8859_6: - return(NULL); case XML_CHAR_ENCODING_8859_7: - return(NULL); case XML_CHAR_ENCODING_8859_8: - return(NULL); case XML_CHAR_ENCODING_8859_9: return(NULL); case XML_CHAR_ENCODING_2022_JP: + handler = xmlFindCharEncodingHandler("ISO-2022-JP"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_SHIFT_JIS: + handler = xmlFindCharEncodingHandler("SHIFT-JIS"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("SHIFT_JIS"); + if (handler != NULL) return(handler); + handler = xmlFindCharEncodingHandler("Shift_JIS"); + if (handler != NULL) return(handler); + break; case XML_CHAR_ENCODING_EUC_JP: - return(NULL); + handler = xmlFindCharEncodingHandler("EUC-JP"); + if (handler != NULL) return(handler); + break; + default: + break; } + +#ifdef DEBUG_ENCODING + fprintf(stderr, "No handler found for encoding %d\n", enc); +#endif return(NULL); } @@ -858,23 +953,306 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) { */ xmlCharEncodingHandlerPtr 
xmlFindCharEncodingHandler(const char *name) { - char upper[500]; +#ifdef LIBXML_ICONV_ENABLED + iconv_t icv_in, icv_out; + xmlCharEncodingHandlerPtr enc; +#endif /* LIBXML_ICONV_ENABLED */ + char upper[100]; int i; if (handlers == NULL) xmlInitCharEncodingHandlers(); if (name == NULL) return(xmlDefaultCharEncodingHandler); if (name[0] == 0) return(xmlDefaultCharEncodingHandler); - for (i = 0;i < 499;i++) { + for (i = 0;i < 99;i++) { upper[i] = toupper(name[i]); if (upper[i] == 0) break; } upper[i] = 0; for (i = 0;i < nbCharEncodingHandler; i++) - if (!strcmp(name, handlers[i]->name)) + if (!strcmp(upper, handlers[i]->name)) { +#ifdef DEBUG_ENCODING + fprintf(stderr, "Found registered handler for encoding %s\n", name); +#endif return(handlers[i]); + } +#ifdef LIBXML_ICONV_ENABLED + /* check whether iconv can handle this */ + icv_in = iconv_open("UTF-8", name); + icv_out = iconv_open(name, "UTF-8"); + if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) { + enc = xmlMalloc(sizeof(xmlCharEncodingHandler)); + if (enc == NULL) { + iconv_close(icv_in); + iconv_close(icv_out); + return(NULL); + } + enc->name = NULL; + enc->input = NULL; + enc->output = NULL; + enc->iconv_in = icv_in; + enc->iconv_out = icv_out; +#ifdef DEBUG_ENCODING + fprintf(stderr, "Found iconv handler for encoding %s\n", name); +#endif + return enc; + } else if ((icv_in != (iconv_t) -1) || icv_out != (iconv_t) -1) { + fprintf(stderr, "iconv : problems with filters for '%s'\n", name); + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef DEBUG_ENCODING + fprintf(stderr, "No handler found for encoding %s\n", name); +#endif return(NULL); } +#ifdef LIBXML_ICONV_ENABLED +/** + * xmlIconvWrapper: + * @cd: iconv converter data structure + * @out: a pointer to an array of bytes to store the result + * @outlen: the length of @out + * @in: a pointer to an array of ISO Latin 1 chars + * @inlen: the length of @in + * + * Returns 0 if success, or + * -1 by lack of space, or + * -2 if the transcoding fails (for *in 
is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + * -3 if there the last byte can't form a single output char. + * + * The value of @inlen after return is the number of octets consumed + * as the return value is positive, else unpredictiable. + * The value of @outlen after return is the number of ocetes consumed. + */ +static int +xmlIconvWrapper(iconv_t cd, + unsigned char *out, int *outlen, + const unsigned char *in, int *inlen) { + + size_t icv_inlen = *inlen, icv_outlen = *outlen; + const char *icv_in = (const char *) in; + char *icv_out = (char *) out; + int ret; + + ret = iconv(cd, + &icv_in, &icv_inlen, + &icv_out, &icv_outlen); + *inlen -= icv_inlen; + *outlen -= icv_outlen; + if (icv_inlen != 0 || ret == (size_t) -1) { +#ifdef EILSEQ + if (errno == EILSEQ) { + return -2; + } else +#endif +#ifdef E2BIG + if (errno == E2BIG) { + return -1; + } else +#endif +#ifdef EINVAL + if (errno == EINVAL) { + return -3; + } +#endif + else { + return -3; + } + } + return 0; +} +#endif /* LIBXML_ICONV_ENABLED */ + +/** + * xmlCharEncInFunc: + * @handler: char enconding transformation data structure + * @out: an xmlBuffer for the output. 
+ * @in: an xmlBuffer for the input + * + * Generic front-end for the encoding handler input function + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncInFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, + xmlBufferPtr in) { + int ret = -2; + int written; + int toconv; + + if (handler == NULL) return(-1); + if (out == NULL) return(-1); + if (in == NULL) return(-1); + + written = out->size - out->use; + toconv = in->use; + if (toconv * 2 >= written) { + xmlBufferGrow(out, toconv * 2); + written = out->size - out->use - 1; + } + if (handler->input != NULL) { + ret = handler->input(&out->content[out->use], &written, + in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + } +#ifdef LIBXML_ICONV_ENABLED + else if (handler->iconv_in != NULL) { + ret = xmlIconvWrapper(handler->iconv_in, &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + if (ret == -1) ret = -3; + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef DEBUG_ENCODING + switch (ret) { + case 0: + fprintf(stderr, "converted %d bytes to %d bytes of input\n", + toconv, written); + break; + case -1: + fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n", + toconv, written, in->use); + break; + case -2: + fprintf(stderr, "input conversion failed due to input error\n"); + break; + case -3: + fprintf(stderr,"converted %d bytes to %d bytes of input, %d left\n", + toconv, written, in->use); + break; + default: + fprintf(stderr,"Unknown input conversion failed %d\n", ret); + } +#endif + /* + * Ignore when input buffer is not on a boundary + */ + if (ret == -3) ret = 0; + return(ret); +} + +/** + * xmlCharEncOutFunc: + * @handler: char enconding transformation data 
structure + * @out: an xmlBuffer for the output. + * @in: an xmlBuffer for the input + * + * Generic front-end for the encoding handler output function + * + * Returns the number of byte written if success, or + * -1 general error + * -2 if the transcoding fails (for *in is not valid utf8 string or + * the result of transformation can't fit into the encoding we want), or + */ +int +xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, + xmlBufferPtr in) { + int ret = -2; + int written; + int toconv; + + if (handler == NULL) return(-1); + if (out == NULL) return(-1); + if (in == NULL) return(-1); + + written = out->size - out->use; + toconv = in->use; + if (toconv * 2 >= written) { + xmlBufferGrow(out, toconv * 2); + written = out->size - out->use - 1; + } + if (handler->output != NULL) { + ret = handler->output(&out->content[out->use], &written, + in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + } +#ifdef LIBXML_ICONV_ENABLED + else if (handler->iconv_out != NULL) { + ret = xmlIconvWrapper(handler->iconv_out, &out->content[out->use], + &written, in->content, &toconv); + xmlBufferShrink(in, toconv); + out->use += written; + out->content[out->use] = 0; + if (ret == -1) ret = -3; + } +#endif /* LIBXML_ICONV_ENABLED */ +#ifdef DEBUG_ENCODING + switch (ret) { + case 0: + fprintf(stderr, "converted %d bytes to %d bytes of output\n", + toconv, written); + break; + case -1: + fprintf(stderr, "output conversion failed by lack of space\n"); + break; + case -2: + fprintf(stderr, "output conversion failed due to output error\n"); + break; + case -3: + fprintf(stderr,"converted %d bytes to %d bytes of output %d left\n", + toconv, written, in->use); + break; + default: + fprintf(stderr,"Unknown output conversion failed %d\n", ret); + } +#endif + return(ret); +} + +/** + * xmlCharEncCloseFunc: + * @handler: char enconding transformation data structure + * + * Generic front-end for hencoding handler close 
function + * + * Returns 0 if success, or -1 in case of error + */ +int +xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { + int ret = 0; + if (handler == NULL) return(-1); + if (handler->name == NULL) return(-1); +#ifdef LIBXML_ICONV_ENABLED + /* + * Iconv handlers can be used only once: free the whole block + * and the associated iconv resources. A failing iconv_close() + * is reported as -1 but the cleanup still runs to completion. + */ + if ((handler->iconv_out != NULL) || (handler->iconv_in != NULL)) { + if (handler->name != NULL) + xmlFree(handler->name); + handler->name = NULL; + if (handler->iconv_out != NULL) { + if (iconv_close(handler->iconv_out)) + ret = -1; + handler->iconv_out = NULL; + } + if (handler->iconv_in != NULL) { + if (iconv_close(handler->iconv_in)) + ret = -1; + handler->iconv_in = NULL; + } + xmlFree(handler); /* the handler must not be used after this point */ + } +#endif /* LIBXML_ICONV_ENABLED */ + /* Handlers without iconv descriptors are left untouched. */ +#ifdef DEBUG_ENCODING + if (ret) + fprintf(stderr, "failed to close the encoding handler\n"); + else + fprintf(stderr, "closed the encoding handler\n"); + +#endif + return(ret); +} + diff --git a/encoding.h b/encoding.h index 1b1c92e3..f6edbf29 100644 --- a/encoding.h +++ b/encoding.h @@ -22,12 +22,30 @@ #define __XML_CHAR_ENCODING_H__ #include +#ifdef LIBXML_ICONV_ENABLED +#include +#endif +#include + #ifdef __cplusplus extern "C" { #endif /** * Predefined values for some standard encodings + * Libxml don't do beforehand translation on UTF8, ISOLatinX + * It also support UTF16 (LE and BE) by default. + * + * Anything else would have to be translated to UTF8 before being + * given to the parser itself. The BOM for UTF16 and the encoding + * declaration are looked at and a converter is looked for at that + * point. If not found the parser stops here as asked by the XML REC + * Converter can be registered by the user using xmlRegisterCharEncodingHandler + * but the currentl form doesn't allow stateful transcoding (a serious + * problem agreed !). 
If iconv has been found it will be used + * automatically and allow stateful transcoding, the simplest is then + * to be sure to enable iconv and to provide iconv libs for the encoding + * support needed. */ typedef enum { XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */ @@ -65,9 +83,13 @@ typedef enum { * Take a block of chars in the original encoding and try to convert * it to an UTF-8 block of chars out. * - * Returns the number of byte written, or -1 by lack of space. + * Returns the number of byte written, or -1 by lack of space, or -2 + * if the transcoding failed. + * The value of @inlen after return is the number of octets consumed + * as long as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of octets produced. */ -typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen, +typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen, const unsigned char* in, int *inlen); @@ -83,12 +105,17 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen, * * Returns the number of byte written, or -1 by lack of space, or -2 * if the transcoding failed. + * The value of @inlen after return is the number of octets consumed + * as long as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of octets produced. */ -typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int outlen, +typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int *outlen, const unsigned char* in, int *inlen); + /* * Block defining the handlers for non UTF-8 encodings. 
+ * If iconv is supported, there is two extra fields */ typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; @@ -96,7 +123,11 @@ typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; struct _xmlCharEncodingHandler { char *name; xmlCharEncodingInputFunc input; - xmlCharEncodingOutputFunc output; + xmlCharEncodingOutputFunc output; +#ifdef LIBXML_ICONV_ENABLED + iconv_t iconv_in; + iconv_t iconv_out; +#endif /* LIBXML_ICONV_ENABLED */ }; void xmlInitCharEncodingHandlers (void); @@ -109,6 +140,14 @@ xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc); xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name); int xmlCheckUTF8 (const unsigned char *utf); +int xmlCharEncOutFunc (xmlCharEncodingHandler *handler, + xmlBufferPtr out, + xmlBufferPtr in); + +int xmlCharEncInFunc (xmlCharEncodingHandler *handler, + xmlBufferPtr out, + xmlBufferPtr in); +int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler); #ifdef __cplusplus } diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index 1b1c92e3..f6edbf29 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -22,12 +22,30 @@ #define __XML_CHAR_ENCODING_H__ #include +#ifdef LIBXML_ICONV_ENABLED +#include +#endif +#include + #ifdef __cplusplus extern "C" { #endif /** * Predefined values for some standard encodings + * Libxml don't do beforehand translation on UTF8, ISOLatinX + * It also support UTF16 (LE and BE) by default. + * + * Anything else would have to be translated to UTF8 before being + * given to the parser itself. The BOM for UTF16 and the encoding + * declaration are looked at and a converter is looked for at that + * point. If not found the parser stops here as asked by the XML REC + * Converter can be registered by the user using xmlRegisterCharEncodingHandler + * but the currentl form doesn't allow stateful transcoding (a serious + * problem agreed !). 
If iconv has been found it will be used + * automatically and allow stateful transcoding, the simplest is then + * to be sure to enable iconv and to provide iconv libs for the encoding + * support needed. */ typedef enum { XML_CHAR_ENCODING_ERROR= -1, /* No char encoding detected */ @@ -65,9 +83,13 @@ typedef enum { * Take a block of chars in the original encoding and try to convert * it to an UTF-8 block of chars out. * - * Returns the number of byte written, or -1 by lack of space. + * Returns the number of byte written, or -1 by lack of space, or -2 + * if the transcoding failed. + * The value of @inlen after return is the number of octets consumed + * as long as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of octets produced. */ -typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen, +typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int *outlen, const unsigned char* in, int *inlen); @@ -83,12 +105,17 @@ typedef int (* xmlCharEncodingInputFunc)(unsigned char* out, int outlen, * * Returns the number of byte written, or -1 by lack of space, or -2 * if the transcoding failed. + * The value of @inlen after return is the number of octets consumed + * as long as the return value is positive, else unpredictable. + * The value of @outlen after return is the number of octets produced. */ -typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int outlen, +typedef int (* xmlCharEncodingOutputFunc)(unsigned char* out, int *outlen, const unsigned char* in, int *inlen); + /* * Block defining the handlers for non UTF-8 encodings. 
+ * If iconv is supported, there is two extra fields */ typedef struct _xmlCharEncodingHandler xmlCharEncodingHandler; @@ -96,7 +123,11 @@ typedef xmlCharEncodingHandler *xmlCharEncodingHandlerPtr; struct _xmlCharEncodingHandler { char *name; xmlCharEncodingInputFunc input; - xmlCharEncodingOutputFunc output; + xmlCharEncodingOutputFunc output; +#ifdef LIBXML_ICONV_ENABLED + iconv_t iconv_in; + iconv_t iconv_out; +#endif /* LIBXML_ICONV_ENABLED */ }; void xmlInitCharEncodingHandlers (void); @@ -109,6 +140,14 @@ xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc); xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name); int xmlCheckUTF8 (const unsigned char *utf); +int xmlCharEncOutFunc (xmlCharEncodingHandler *handler, + xmlBufferPtr out, + xmlBufferPtr in); + +int xmlCharEncInFunc (xmlCharEncodingHandler *handler, + xmlBufferPtr out, + xmlBufferPtr in); +int xmlCharEncCloseFunc (xmlCharEncodingHandler *handler); #ifdef __cplusplus } diff --git a/include/libxml/parserInternals.h b/include/libxml/parserInternals.h index c3597348..e7e6fa0a 100644 --- a/include/libxml/parserInternals.h +++ b/include/libxml/parserInternals.h @@ -28,10 +28,10 @@ extern "C" { * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. 
*/ #define IS_CHAR(c) \ - ((((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \ - (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) && \ - (((c) <= 0xD7FF) || ((c) >= 0xE000)) && ((c) >= 0) && \ - ((c) <= 0x10FFFF)) + (((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \ + (((c) >= 0x20) && ((c) <= 0xD7FF)) || \ + (((c) >= 0xE000) && ((c) <= 0xFFFD)) || \ + (((c) >= 0x10000) && ((c) <= 0x10FFFF))) /* * [3] S ::= (#x20 | #x9 | #xD | #xA)+ @@ -442,8 +442,10 @@ xmlParserCtxtPtr xmlNewParserCtxt (void); xmlParserCtxtPtr xmlCreateEntityParserCtxt(const xmlChar *URL, const xmlChar *ID, const xmlChar *base); -void xmlSwitchEncoding (xmlParserCtxtPtr ctxt, +int xmlSwitchEncoding (xmlParserCtxtPtr ctxt, xmlCharEncoding enc); +int xmlSwitchToEncoding (xmlParserCtxtPtr ctxt, + xmlCharEncodingHandlerPtr handler); void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt); /** diff --git a/include/libxml/tree.h b/include/libxml/tree.h index 1cb12e24..35ea5256 100644 --- a/include/libxml/tree.h +++ b/include/libxml/tree.h @@ -380,6 +380,8 @@ void xmlBufferCCat (xmlBufferPtr buf, const char *str); int xmlBufferShrink (xmlBufferPtr buf, int len); +int xmlBufferGrow (xmlBufferPtr buf, + int len); void xmlBufferEmpty (xmlBufferPtr buf); const xmlChar* xmlBufferContent (const xmlBufferPtr buf); int xmlBufferUse (const xmlBufferPtr buf); diff --git a/include/libxml/xmlIO.h b/include/libxml/xmlIO.h index 8f9b7e02..2d14ebeb 100644 --- a/include/libxml/xmlIO.h +++ b/include/libxml/xmlIO.h @@ -33,6 +33,7 @@ struct _xmlParserInputBuffer { xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */ xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 */ + xmlBufferPtr raw; /* if encoder != NULL buffer for raw input */ }; diff --git a/parser.c b/parser.c index a8e6ff4d..6714d3cf 100644 --- a/parser.c +++ b/parser.c @@ -41,6 +41,7 @@ #include #include #include +#include #include "xml-error.h" #define XML_PARSER_BIG_BUFFER_SIZE 1000 @@ -483,7 +484,7 @@ xmlNextChar(xmlParserCtxtPtr ctxt) { if 
((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, - "Char out of allowed range\n"); + "Char 0x%X out of allowed range\n", val); ctxt->errNo = XML_ERR_INVALID_ENCODING; ctxt->wellFormed = 0; ctxt->disableSAX = 1; @@ -612,7 +613,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, - "Char out of allowed range\n"); + "Char 0x%X out of allowed range\n", val); ctxt->errNo = XML_ERR_INVALID_ENCODING; ctxt->wellFormed = 0; ctxt->disableSAX = 1; @@ -727,7 +728,7 @@ xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar *cur, int *len) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, - "Char out of allowed range\n"); + "Char 0x%X out of allowed range\n", val); ctxt->errNo = XML_ERR_INVALID_ENCODING; ctxt->wellFormed = 0; ctxt->disableSAX = 1; @@ -2278,96 +2279,209 @@ xmlCheckLanguageID(const xmlChar *lang) { * * change the input functions when discovering the character encoding * of a given entity. + * + * Returns 0 in case of success, -1 otherwise */ -void +int xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) { xmlCharEncodingHandlerPtr handler; + switch (enc) { + case XML_CHAR_ENCODING_ERROR: + ctxt->errNo = XML_ERR_UNKNOWN_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "encoding unknown\n"); + ctxt->wellFormed = 0; + ctxt->disableSAX = 1; + break; + case XML_CHAR_ENCODING_NONE: + /* let's assume it's UTF-8 without the XML decl */ + return(0); + case XML_CHAR_ENCODING_UTF8: + /* default encoding, no conversion should be needed */ + return(0); + default: + break; + } handler = xmlGetCharEncodingHandler(enc); + if (handler == NULL) { + /* + * Default handlers. 
+ */ + switch (enc) { + case XML_CHAR_ENCODING_ERROR: + ctxt->errNo = XML_ERR_UNKNOWN_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, "encoding unknown\n"); + ctxt->wellFormed = 0; + ctxt->disableSAX = 1; + break; + case XML_CHAR_ENCODING_NONE: + /* let's assume it's UTF-8 without the XML decl */ + return(0); + case XML_CHAR_ENCODING_UTF8: + /* default encoding, no conversion should be needed */ + return(0); + case XML_CHAR_ENCODING_UTF16LE: + break; + case XML_CHAR_ENCODING_UTF16BE: + break; + case XML_CHAR_ENCODING_UCS4LE: + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding USC4 little endian not supported\n"); + break; + case XML_CHAR_ENCODING_UCS4BE: + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding USC4 big endian not supported\n"); + break; + case XML_CHAR_ENCODING_EBCDIC: + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding EBCDIC not supported\n"); + break; + case XML_CHAR_ENCODING_UCS4_2143: + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding UCS4 2143 not supported\n"); + break; + case XML_CHAR_ENCODING_UCS4_3412: + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding UCS4 3412 not supported\n"); + break; + case XML_CHAR_ENCODING_UCS2: + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding UCS2 not supported\n"); + break; + case XML_CHAR_ENCODING_8859_1: + case XML_CHAR_ENCODING_8859_2: + case 
XML_CHAR_ENCODING_8859_3: + case XML_CHAR_ENCODING_8859_4: + case XML_CHAR_ENCODING_8859_5: + case XML_CHAR_ENCODING_8859_6: + case XML_CHAR_ENCODING_8859_7: + case XML_CHAR_ENCODING_8859_8: + case XML_CHAR_ENCODING_8859_9: + /* + * Keep the internal content in the document encoding + */ + if ((ctxt->inputNr == 1) && + (ctxt->encoding == NULL) && + (ctxt->input->encoding != NULL)) { + ctxt->encoding = xmlStrdup(ctxt->input->encoding); + } + return(0); + case XML_CHAR_ENCODING_2022_JP: + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding ISO-2022-JPnot supported\n"); + break; + case XML_CHAR_ENCODING_SHIFT_JIS: + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding Shift_JIS not supported\n"); + break; + case XML_CHAR_ENCODING_EUC_JP: + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "char encoding EUC-JPnot supported\n"); + break; + } + } + if (handler == NULL) + return(-1); + return(xmlSwitchToEncoding(ctxt, handler)); +} + +/** + * xmlSwitchToEncoding: + * @ctxt: the parser context + * @handler: the encoding handler + * + * change the input functions when discovering the character encoding + * of a given entity. 
+ * + * Returns 0 in case of success, -1 otherwise + */ +int +xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) +{ + int nbchars; + if (handler != NULL) { if (ctxt->input != NULL) { if (ctxt->input->buf != NULL) { if (ctxt->input->buf->encoder != NULL) { + if (ctxt->input->buf->encoder == handler) + return(0); if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "xmlSwitchEncoding : encoder already regitered\n"); - return; + return(-1); } ctxt->input->buf->encoder = handler; /* - * Is there already some content down the pipe to convert + * Is there already some content down the pipe to convert ? */ if ((ctxt->input->buf->buffer != NULL) && (ctxt->input->buf->buffer->use > 0)) { - xmlChar *buf; - int res, len, size; int processed; /* * Specific handling of the Byte Order Mark for * UTF-16 */ - if ((enc == XML_CHAR_ENCODING_UTF16LE) && + if ((handler->name != NULL) && + (!strcmp(handler->name, "UTF-16LE")) && (ctxt->input->cur[0] == 0xFF) && (ctxt->input->cur[1] == 0xFE)) { - SKIP(2); + ctxt->input->cur += 2; } - if ((enc == XML_CHAR_ENCODING_UTF16BE) && + if ((handler->name != NULL) && + (!strcmp(handler->name, "UTF-16BE")) && (ctxt->input->cur[0] == 0xFE) && (ctxt->input->cur[1] == 0xFF)) { - SKIP(2); + ctxt->input->cur += 2; } /* - * convert the non processed part + * Shring the current input buffer. + * Move it as the raw buffer and create a new input buffer */ processed = ctxt->input->cur - ctxt->input->base; - len = ctxt->input->buf->buffer->use - processed; - - if (len <= 0) { - return; - } - size = ctxt->input->buf->buffer->use * 4; - if (size < 4000) - size = 4000; -retry_larger: - buf = (xmlChar *) xmlMalloc(size + 1); - if (buf == NULL) { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "xmlSwitchEncoding : out of memory\n"); - return; - } - /* TODO !!! 
Handling of buf too small */ - res = handler->input(buf, size, ctxt->input->cur, &len); - if (res == -1) { - size *= 2; - xmlFree(buf); - goto retry_larger; - } - if ((res < 0) || - (len != ctxt->input->buf->buffer->use - processed)) { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "xmlSwitchEncoding : conversion failed\n"); - xmlFree(buf); - return; - } + xmlBufferShrink(ctxt->input->buf->buffer, processed); + ctxt->input->buf->raw = ctxt->input->buf->buffer; + ctxt->input->buf->buffer = xmlBufferCreate(); /* - * Conversion succeeded, get rid of the old buffer + * convert as much as possible of the raw input + * to the parser reading buffer. */ - xmlFree(ctxt->input->buf->buffer->content); - ctxt->input->buf->buffer->content = buf; - ctxt->input->base = buf; - ctxt->input->cur = buf; - ctxt->input->buf->buffer->size = size; - ctxt->input->buf->buffer->use = res; - buf[res] = 0; + nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, + ctxt->input->buf->buffer, + ctxt->input->buf->raw); + if (nbchars < 0) { + fprintf(stderr, "xmlSwitchToEncoding: encoder error\n"); + return(-1); + } + ctxt->input->base = + ctxt->input->cur = ctxt->input->buf->buffer->content; } - return; + return(0); } else { if (ctxt->input->length == 0) { /* @@ -2377,191 +2491,59 @@ retry_larger: if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "xmlSwitchEncoding : no input\n"); - return; + return(-1); } else { - xmlChar *buf; - int res, len; - int processed = ctxt->input->cur - ctxt->input->base; + int processed; /* - * convert the non processed part + * Shring the current input buffer. 
+ * Move it as the raw buffer and create a new input buffer */ - len = ctxt->input->length - processed; - if (len <= 0) { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "xmlSwitchEncoding : input fully consumed?\n"); - return; - } - buf = (xmlChar *) xmlMalloc(ctxt->input->length * 4); - if (buf == NULL) { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "xmlSwitchEncoding : out of memory\n"); - return; - } - res = handler->input(buf, ctxt->input->length * 4, - ctxt->input->cur, &len); - if ((res < 0) || - (len != ctxt->input->length - processed)) { - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "xmlSwitchEncoding : conversion failed\n"); - xmlFree(buf); - return; + processed = ctxt->input->cur - ctxt->input->base; + ctxt->input->buf->raw = xmlBufferCreate(); + xmlBufferAdd(ctxt->input->buf->raw, ctxt->input->cur, + ctxt->input->length - processed); + ctxt->input->buf->buffer = xmlBufferCreate(); + + /* + * convert as much as possible of the raw input + * to the parser reading buffer. 
+ */ + nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, + ctxt->input->buf->buffer, + ctxt->input->buf->raw); + if (nbchars < 0) { + fprintf(stderr, "xmlSwitchToEncoding: encoder error\n"); + return(-1); } + /* * Conversion succeeded, get rid of the old buffer */ if ((ctxt->input->free != NULL) && (ctxt->input->base != NULL)) ctxt->input->free((xmlChar *) ctxt->input->base); - ctxt->input->base = ctxt->input->cur = buf; - ctxt->input->length = res; + ctxt->input->base = + ctxt->input->cur = ctxt->input->buf->buffer->content; } } } else { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "xmlSwitchEncoding : no input\n"); + return(-1); } - } + /* + * The parsing is now done in UTF8 natively + */ + if (ctxt->encoding != NULL) { + xmlFree((xmlChar *) ctxt->encoding); + ctxt->encoding = NULL; + } + } else + return(-1); + return(0); - switch (enc) { - case XML_CHAR_ENCODING_ERROR: - ctxt->errNo = XML_ERR_UNKNOWN_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, "encoding unknown\n"); - ctxt->wellFormed = 0; - ctxt->disableSAX = 1; - break; - case XML_CHAR_ENCODING_NONE: - /* let's assume it's UTF-8 without the XML decl */ - return; - case XML_CHAR_ENCODING_UTF8: - /* default encoding, no conversion should be needed */ - return; - case XML_CHAR_ENCODING_UTF16LE: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding UTF16 little endian not supported\n"); - break; - case XML_CHAR_ENCODING_UTF16BE: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding UTF16 big endian not supported\n"); - break; - case XML_CHAR_ENCODING_UCS4LE: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding 
USC4 little endian not supported\n"); - break; - case XML_CHAR_ENCODING_UCS4BE: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding USC4 big endian not supported\n"); - break; - case XML_CHAR_ENCODING_EBCDIC: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding EBCDIC not supported\n"); - break; - case XML_CHAR_ENCODING_UCS4_2143: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding UCS4 2143 not supported\n"); - break; - case XML_CHAR_ENCODING_UCS4_3412: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding UCS4 3412 not supported\n"); - break; - case XML_CHAR_ENCODING_UCS2: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding UCS2 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_1: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_1 ISO Latin 1 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_2: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_2 ISO Latin 2 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_3: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_3 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_4: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && 
(ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_4 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_5: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_5 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_6: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_6 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_7: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_7 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_8: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_8 not supported\n"); - break; - case XML_CHAR_ENCODING_8859_9: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO_8859_9 not supported\n"); - break; - case XML_CHAR_ENCODING_2022_JP: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding ISO-2022-JPnot supported\n"); - break; - case XML_CHAR_ENCODING_SHIFT_JIS: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding Shift_JISnot supported\n"); - break; - case XML_CHAR_ENCODING_EUC_JP: - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) - ctxt->sax->error(ctxt->userData, - "char encoding EUC-JPnot supported\n"); - break; - } } 
/************************************************************************ @@ -4253,7 +4235,7 @@ xmlParseExternalID(xmlParserCtxtPtr ctxt, xmlChar **publicID, int strict) { void xmlParseComment(xmlParserCtxtPtr ctxt) { xmlChar *buf = NULL; - int len = 0; + int len; int size = XML_PARSER_BUFFER_SIZE; int q, ql; int r, rl; @@ -4282,10 +4264,11 @@ xmlParseComment(xmlParserCtxtPtr ctxt) { r = CUR_CHAR(rl); NEXTL(rl); cur = CUR_CHAR(l); + len = 0; while (IS_CHAR(cur) && ((cur != '>') || (r != '-') || (q != '-'))) { - if ((r == '-') && (q == '-')) { + if ((r == '-') && (q == '-') && (len > 1)) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, "Comment must not contain '--' (double-hyphen)`\n"); @@ -4732,11 +4715,36 @@ xmlParseEntityDecl(xmlParserCtxtPtr ctxt) { ctxt->disableSAX = 1; } if (URI) { - if ((ctxt->sax != NULL) && - (!ctxt->disableSAX) && (ctxt->sax->entityDecl != NULL)) - ctxt->sax->entityDecl(ctxt->userData, name, - XML_EXTERNAL_PARAMETER_ENTITY, - literal, URI, NULL); + xmlURIPtr uri; + + uri = xmlParseURI((const char *) URI); + if (uri == NULL) { + if ((ctxt->sax != NULL) && + (!ctxt->disableSAX) && + (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Invalid URI: %s\n", URI); + ctxt->wellFormed = 0; + ctxt->errNo = XML_ERR_INVALID_URI; + } else { + if (uri->fragment != NULL) { + if ((ctxt->sax != NULL) && + (!ctxt->disableSAX) && + (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Fragment not allowed: %s\n", URI); + ctxt->wellFormed = 0; + ctxt->errNo = XML_ERR_URI_FRAGMENT; + } else { + if ((ctxt->sax != NULL) && + (!ctxt->disableSAX) && + (ctxt->sax->entityDecl != NULL)) + ctxt->sax->entityDecl(ctxt->userData, name, + XML_EXTERNAL_PARAMETER_ENTITY, + literal, URI, NULL); + } + xmlFreeURI(uri); + } } } } else { @@ -4757,6 +4765,31 @@ xmlParseEntityDecl(xmlParserCtxtPtr ctxt) { ctxt->wellFormed = 0; ctxt->disableSAX = 1; } + if (URI) { + xmlURIPtr uri; + + uri = xmlParseURI((const char 
*)URI); + if (uri == NULL) { + if ((ctxt->sax != NULL) && + (!ctxt->disableSAX) && + (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Invalid URI: %s\n", URI); + ctxt->wellFormed = 0; + ctxt->errNo = XML_ERR_INVALID_URI; + } else { + if (uri->fragment != NULL) { + if ((ctxt->sax != NULL) && + (!ctxt->disableSAX) && + (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Fragment not allowed: %s\n", URI); + ctxt->wellFormed = 0; + ctxt->errNo = XML_ERR_URI_FRAGMENT; + } + xmlFreeURI(uri); + } + } if ((RAW != '>') && (!IS_BLANK(CUR))) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, @@ -5973,7 +6006,20 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) { /* * We know that 'sax != NULL) && (ctxt->sax->error != NULL)) + ctxt->sax->error(ctxt->userData, + "Text declaration 'errNo = XML_ERR_XMLDECL_NOT_STARTED; + ctxt->wellFormed = 0; + ctxt->disableSAX = 1; + + return; + } if (!IS_BLANK(CUR)) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) @@ -6003,7 +6049,13 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) { ctxt->wellFormed = 0; ctxt->disableSAX = 1; } - ctxt->input->encoding = xmlParseEncodingDecl(ctxt); + xmlParseEncodingDecl(ctxt); + if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { + /* + * The XML REC instructs us to stop parsing right here + */ + return; + } SKIP_BLANKS; if ((RAW == '?') && (NXT(1) == '>')) { @@ -6192,6 +6244,13 @@ xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID, (NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(4) == 'l')) { xmlParseTextDecl(ctxt); + if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { + /* + * The XML REC instructs us to stop parsing right here + */ + ctxt->instate = XML_PARSER_EOF; + return; + } } if (ctxt->myDoc == NULL) { ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0"); @@ -6441,6 +6500,13 @@ xmlParseReference(xmlParserCtxtPtr ctxt) { (NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) { xmlParseTextDecl(ctxt); + if 
(ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { + /* + * The XML REC instructs us to stop parsing right here + */ + ctxt->instate = XML_PARSER_EOF; + return; + } if (input->standalone) { if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL)) ctxt->sax->error(ctxt->userData, @@ -6947,6 +7013,15 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt) { (NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) { xmlParseTextDecl(ctxt); + if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { + /* + * The XML REC instructs us to stop parsing + * right here + */ + ctxt->instate = XML_PARSER_EOF; + xmlFree(name); + return; + } } if (ctxt->token == 0) ctxt->token = ' '; @@ -8197,6 +8272,38 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) { ctxt->disableSAX = 1; ctxt->errNo = XML_ERR_STRING_NOT_STARTED; } + if (encoding != NULL) { + xmlCharEncoding enc; + xmlCharEncodingHandlerPtr handler; + + if (ctxt->input->encoding != NULL) + xmlFree((xmlChar *) ctxt->input->encoding); + ctxt->input->encoding = encoding; + + enc = xmlParseCharEncoding((const char *) encoding); + /* + * registered set of known encodings + */ + if (enc != XML_CHAR_ENCODING_ERROR) { + xmlSwitchEncoding(ctxt, enc); + if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { + xmlFree(encoding); + return(NULL); + } + } else { + /* + * fallback for unknown encodings + */ + handler = xmlFindCharEncodingHandler((const char *) encoding); + if (handler != NULL) { + xmlSwitchToEncoding(ctxt, handler); + } else { + ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; + xmlFree(encoding); + return(NULL); + } + } + } } return(encoding); } @@ -8362,7 +8469,13 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) { ctxt->wellFormed = 0; ctxt->disableSAX = 1; } - ctxt->input->encoding = xmlParseEncodingDecl(ctxt); + xmlParseEncodingDecl(ctxt); + if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { + /* + * The XML REC instructs us to stop parsing right here + */ + return; + } /* * We may have the standalone status. 
@@ -8489,12 +8602,19 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { if ((RAW == '<') && (NXT(1) == '?') && (NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) { + + /* + * Note that we will switch encoding on the fly. + */ xmlParseXMLDecl(ctxt); + if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { + /* + * The XML REC instructs us to stop parsing right here + */ + return(-1); + } ctxt->standalone = ctxt->input->standalone; SKIP_BLANKS; - if ((ctxt->encoding == NULL) && (ctxt->input->encoding != NULL)) - ctxt->encoding = xmlStrdup(ctxt->input->encoding); - } else { ctxt->version = xmlCharStrdup(XML_DEFAULT_VERSION); } @@ -8581,14 +8701,6 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { (!ctxt->disableSAX)) ctxt->sax->endDocument(ctxt->userData); - /* - * Grab the encoding if it was added on-the-fly - */ - if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) && - (ctxt->myDoc->encoding == NULL)) { - ctxt->myDoc->encoding = ctxt->encoding; - ctxt->encoding = NULL; - } if (! ctxt->wellFormed) return(-1); return(0); } @@ -8805,6 +8917,14 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { fprintf(stderr, "PP: Parsing XML Decl\n"); #endif xmlParseXMLDecl(ctxt); + if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { + /* + * The XML REC instructs us to stop parsing right + * here + */ + ctxt->instate = XML_PARSER_EOF; + return(0); + } ctxt->standalone = ctxt->input->standalone; if ((ctxt->encoding == NULL) && (ctxt->input->encoding != NULL)) diff --git a/parserInternals.h b/parserInternals.h index c3597348..e7e6fa0a 100644 --- a/parserInternals.h +++ b/parserInternals.h @@ -28,10 +28,10 @@ extern "C" { * any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. 
*/ #define IS_CHAR(c) \ - ((((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \ - (((c) >= 0x20) && ((c) != 0xFFFE) && ((c) != 0xFFFF))) && \ - (((c) <= 0xD7FF) || ((c) >= 0xE000)) && ((c) >= 0) && \ - ((c) <= 0x10FFFF)) + (((c) == 0x09) || ((c) == 0x0A) || ((c) == 0x0D) || \ + (((c) >= 0x20) && ((c) <= 0xD7FF)) || \ + (((c) >= 0xE000) && ((c) <= 0xFFFD)) || \ + (((c) >= 0x10000) && ((c) <= 0x10FFFF))) /* * [3] S ::= (#x20 | #x9 | #xD | #xA)+ @@ -442,8 +442,10 @@ xmlParserCtxtPtr xmlNewParserCtxt (void); xmlParserCtxtPtr xmlCreateEntityParserCtxt(const xmlChar *URL, const xmlChar *ID, const xmlChar *base); -void xmlSwitchEncoding (xmlParserCtxtPtr ctxt, +int xmlSwitchEncoding (xmlParserCtxtPtr ctxt, xmlCharEncoding enc); +int xmlSwitchToEncoding (xmlParserCtxtPtr ctxt, + xmlCharEncodingHandlerPtr handler); void xmlFreeParserCtxt (xmlParserCtxtPtr ctxt); /** diff --git a/tree.c b/tree.c index 2cc4b51d..74b5321f 100644 --- a/tree.c +++ b/tree.c @@ -3771,6 +3771,31 @@ xmlBufferShrink(xmlBufferPtr buf, int len) { return(len); } +/** + * xmlBufferGrow: + * @buf: the buffer + * @len: the minimum free size to allocate + * + * Grow the available space of an XML buffer. 
+ * + * Returns the new available space, 0 if no growth was needed, or -1 on error + */ +int +xmlBufferGrow(xmlBufferPtr buf, int len) { + int size; + xmlChar *newbuf; + + if (len < buf->size - buf->use) return(0); + + size = buf->use + len + 100; + + newbuf = xmlRealloc(buf->content, size); + if (newbuf == NULL) return(-1); + buf->content = newbuf; + buf->size = size; + return(buf->size - buf->use); +} + /** * xmlBufferDump: * @file: the file output diff --git a/tree.h b/tree.h index 1cb12e24..35ea5256 100644 --- a/tree.h +++ b/tree.h @@ -380,6 +380,8 @@ void xmlBufferCCat (xmlBufferPtr buf, const char *str); int xmlBufferShrink (xmlBufferPtr buf, int len); +int xmlBufferGrow (xmlBufferPtr buf, + int len); void xmlBufferEmpty (xmlBufferPtr buf); const xmlChar* xmlBufferContent (const xmlBufferPtr buf); int xmlBufferUse (const xmlBufferPtr buf); diff --git a/uri.c b/uri.c index 1a481130..6000d39e 100644 --- a/uri.c +++ b/uri.c @@ -1283,6 +1283,34 @@ xmlParseURIReference(xmlURIPtr uri, const char *str) { return(0); } +/** + * xmlParseURI: + * @str: the URI string to analyze + * + * Parse a URI + * + * URI-reference = [ absoluteURI | relativeURI ] [ "#" fragment ] + * + * Returns a newly built xmlURIPtr or NULL in case of error + */ +xmlURIPtr +xmlParseURI(const char *str) { + xmlURIPtr uri; + int ret; + + if (str == NULL) + return(NULL); + uri = xmlCreateURI(); + if (uri != NULL) { + ret = xmlParseURIReference(uri, str); + if (ret) { + xmlFreeURI(uri); + return(NULL); + } + } + return(uri); +} + /** * xmlNormalizeURIPath: * @path: pointer to the path string diff --git a/xml-error.h b/xml-error.h index 34f4e668..25d9db09 100644 --- a/xml-error.h +++ b/xml-error.h @@ -130,7 +130,9 @@ typedef enum { XML_ERR_ENTITY_CHAR_ERROR, /* 88 */ XML_ERR_ENTITY_PE_INTERNAL, /* 88 */ XML_ERR_ENTITY_LOOP, /* 89 */ - XML_ERR_ENTITY_BOUNDARY /* 90 */ + XML_ERR_ENTITY_BOUNDARY, /* 90 */ + XML_ERR_INVALID_URI, /* 91 */ + XML_ERR_URI_FRAGMENT /* 92 */ }xmlParserErrors; void xmlParserError (void *ctx, diff 
--git a/xmlIO.c b/xmlIO.c index 65f5632b..841a6b6d 100644 --- a/xmlIO.c +++ b/xmlIO.c @@ -498,6 +498,10 @@ xmlAllocParserInputBuffer(xmlCharEncoding enc) { } ret->buffer->alloc = XML_BUFFER_ALLOC_DOUBLEIT; ret->encoder = xmlGetCharEncodingHandler(enc); + if (ret->encoder != NULL) + ret->raw = xmlBufferCreate(); + else + ret->raw = NULL; ret->readcallback = NULL; ret->closecallback = NULL; ret->context = NULL; @@ -513,13 +517,20 @@ xmlAllocParserInputBuffer(xmlCharEncoding enc) { */ void xmlFreeParserInputBuffer(xmlParserInputBufferPtr in) { - if (in->buffer != NULL) { - xmlBufferFree(in->buffer); - in->buffer = NULL; + if (in->raw) { + xmlBufferFree(in->raw); + in->raw = NULL; + } + if (in->encoder != NULL) { + xmlCharEncCloseFunc(in->encoder); } if (in->closecallback != NULL) { in->closecallback(in->context); } + if (in->buffer != NULL) { + xmlBufferFree(in->buffer); + in->buffer = NULL; + } memset(in, 0xbe, (size_t) sizeof(xmlParserInputBuffer)); xmlFree(in); @@ -683,34 +694,22 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, const char *buf) { if (len < 0) return(0); if (in->encoder != NULL) { - xmlChar *buffer; - int processed = len; - - buffer = (xmlChar *) xmlMalloc((len + 1) * 2 * sizeof(xmlChar)); - if (buffer == NULL) { - fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n"); - return(-1); - } - nbchars = in->encoder->input(buffer, (len + 1) * 2 * sizeof(xmlChar), - (xmlChar *) buf, &processed); - /* - * TODO : we really need to have something atomic or the - * encoder must report the number of bytes read + /* + * Store the data in the incoming raw buffer */ + if (in->raw == NULL) { + in->raw = xmlBufferCreate(); + } + xmlBufferAdd(in->raw, (const xmlChar *) buf, len); + + /* + * convert as much as possible to the parser reading buffer. 
+ */ + nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); if (nbchars < 0) { fprintf(stderr, "xmlParserInputBufferPush: encoder error\n"); - xmlFree(buffer); return(-1); } - if (processed != len) { - fprintf(stderr, - "TODO xmlParserInputBufferPush: processed != len\n"); - xmlFree(buffer); - return(-1); - } - buffer[nbchars] = 0; - xmlBufferAdd(in->buffer, (xmlChar *) buffer, nbchars); - xmlFree(buffer); } else { nbchars = len; xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars); @@ -730,7 +729,9 @@ xmlParserInputBufferPush(xmlParserInputBufferPtr in, int len, const char *buf) { * Grow up the content of the input buffer, the old data are preserved * This routine handle the I18N transcoding to internal UTF-8 * This routine is used when operating the parser in normal (pull) mode - * TODO: one should be able to remove one extra copy + * + * TODO: one should be able to remove one extra copy by copying directly + * onto in->buffer or in->raw * * Returns the number of chars read and stored in the buffer, or -1 * in case of error. @@ -779,34 +780,23 @@ xmlParserInputBufferGrow(xmlParserInputBufferPtr in, int len) { return(-1); } if (in->encoder != NULL) { - xmlChar *buf; - int wrote = res; - - buf = (xmlChar *) xmlMalloc((res + 1) * 2 * sizeof(xmlChar)); - if (buf == NULL) { - fprintf(stderr, "xmlParserInputBufferGrow : out of memory !\n"); - xmlFree(buffer); - return(-1); + /* + * Store the data in the incoming raw buffer + */ + if (in->raw == NULL) { + in->raw = xmlBufferCreate(); } - nbchars = in->encoder->input(buf, (res + 1) * 2 * sizeof(xmlChar), - BAD_CAST buffer, &wrote); - buf[nbchars] = 0; - xmlBufferAdd(in->buffer, (xmlChar *) buf, nbchars); - xmlFree(buf); + xmlBufferAdd(in->raw, (const xmlChar *) buffer, res); /* - * Check that the encoder was able to process the full input + * convert as much as possible to the parser reading buffer. 
*/ - if (wrote != res) { - fprintf(stderr, - "TODO : xmlParserInputBufferGrow wrote %d != res %d\n", - wrote, res); - /* - * TODO !!! - * Need to keep the unprocessed input in a buffer in->unprocessed - */ + nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); + if (nbchars < 0) { + fprintf(stderr, "xmlParserInputBufferGrow: encoder error\n"); + xmlFree(buffer); + return(-1); } - } else { nbchars = res; buffer[nbchars] = 0; diff --git a/xmlIO.h b/xmlIO.h index 8f9b7e02..2d14ebeb 100644 --- a/xmlIO.h +++ b/xmlIO.h @@ -33,6 +33,7 @@ struct _xmlParserInputBuffer { xmlCharEncodingHandlerPtr encoder; /* I18N conversions to UTF-8 */ xmlBufferPtr buffer; /* Local buffer encoded in UTF-8 */ + xmlBufferPtr raw; /* if encoder != NULL buffer for raw input */ };