/* * encoding.c : implements the encoding conversion functions needed for XML * * Related specs: * rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies * rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau * [ISO-10646] UTF-8 and UTF-16 in Annexes * [ISO-8859-1] ISO Latin-1 characters codes. * [UNICODE] The Unicode Consortium, "The Unicode Standard -- * Worldwide Character Encoding -- Version 1.0", Addison- * Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is * described in Unicode Technical Report #4. * [US-ASCII] Coded Character Set--7-bit American Standard Code for * Information Interchange, ANSI X3.4-1986. * * See Copyright for the status of this software. * * daniel@veillard.com * * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" */ #define IN_LIBXML #include "libxml.h" #include #include #include #include #ifdef LIBXML_ICONV_ENABLED #include #include #endif #include #include #include #ifdef LIBXML_HTML_ENABLED #include #endif #include #include "private/buf.h" #include "private/enc.h" #include "private/error.h" #ifdef LIBXML_ICU_ENABLED #include #endif #define XML_HANDLER_STATIC 1 typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias; typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr; struct _xmlCharEncodingAlias { const char *name; const char *alias; }; static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL; static int xmlCharEncodingAliasesNb = 0; static int xmlCharEncodingAliasesMax = 0; static int xmlLittleEndian = 1; typedef struct { const char *name; xmlCharEncoding enc; } xmlEncTableEntry; static const xmlEncTableEntry xmlEncTable[] = { { "ASCII", XML_CHAR_ENCODING_ASCII }, { "EUC-JP", XML_CHAR_ENCODING_EUC_JP }, { "HTML", XML_CHAR_ENCODING_HTML }, { "ISO LATIN 1", XML_CHAR_ENCODING_8859_1 }, { "ISO LATIN 2", XML_CHAR_ENCODING_8859_2 }, { "ISO-10646-UCS-2", XML_CHAR_ENCODING_UCS2 }, { "ISO-10646-UCS-4", XML_CHAR_ENCODING_UCS4LE }, { "ISO-2022-JP", XML_CHAR_ENCODING_2022_JP }, { "ISO-8859-1", 
XML_CHAR_ENCODING_8859_1 }, { "ISO-8859-10", XML_CHAR_ENCODING_8859_10 }, { "ISO-8859-11", XML_CHAR_ENCODING_8859_11 }, { "ISO-8859-13", XML_CHAR_ENCODING_8859_13 }, { "ISO-8859-14", XML_CHAR_ENCODING_8859_14 }, { "ISO-8859-15", XML_CHAR_ENCODING_8859_15 }, { "ISO-8859-16", XML_CHAR_ENCODING_8859_16 }, { "ISO-8859-2", XML_CHAR_ENCODING_8859_2 }, { "ISO-8859-3", XML_CHAR_ENCODING_8859_3 }, { "ISO-8859-4", XML_CHAR_ENCODING_8859_4 }, { "ISO-8859-5", XML_CHAR_ENCODING_8859_5 }, { "ISO-8859-6", XML_CHAR_ENCODING_8859_6 }, { "ISO-8859-7", XML_CHAR_ENCODING_8859_7 }, { "ISO-8859-8", XML_CHAR_ENCODING_8859_8 }, { "ISO-8859-9", XML_CHAR_ENCODING_8859_9 }, { "ISO-LATIN-1", XML_CHAR_ENCODING_8859_1 }, { "ISO-LATIN-2", XML_CHAR_ENCODING_8859_2 }, { "SHIFT_JIS", XML_CHAR_ENCODING_SHIFT_JIS }, { "UCS-2", XML_CHAR_ENCODING_UCS2 }, { "UCS-4", XML_CHAR_ENCODING_UCS4LE }, { "UCS2", XML_CHAR_ENCODING_UCS2 }, { "UCS4", XML_CHAR_ENCODING_UCS4LE }, { "US-ASCII", XML_CHAR_ENCODING_ASCII }, { "UTF-16", XML_CHAR_ENCODING_UTF16 }, { "UTF-16BE", XML_CHAR_ENCODING_UTF16BE }, { "UTF-16LE", XML_CHAR_ENCODING_UTF16LE }, { "UTF-8", XML_CHAR_ENCODING_UTF8 }, { "UTF16", XML_CHAR_ENCODING_UTF16LE }, { "UTF8", XML_CHAR_ENCODING_UTF8 } }; static int asciiToAscii(unsigned char* out, int *outlen, const unsigned char* in, int *inlen, void *vctxt); static int UTF8ToUTF8(unsigned char* out, int *outlen, const unsigned char* inb, int *inlenb, void *vctxt); static int latin1ToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen, void *vctxt); static int UTF16LEToUTF8(unsigned char* out, int *outlen, const unsigned char* inb, int *inlenb, void *vctxt); static int UTF16BEToUTF8(unsigned char* out, int *outlen, const unsigned char* inb, int *inlenb, void *vctxt); #ifdef LIBXML_OUTPUT_ENABLED static int UTF8ToLatin1(unsigned char* outb, int *outlen, const unsigned char* in, int *inlen, void *vctxt); static int UTF8ToUTF16(unsigned char* outb, int *outlen, const unsigned char* in, int 
*inlen, void *vctxt); static int UTF8ToUTF16LE(unsigned char* outb, int *outlen, const unsigned char* in, int *inlen, void *vctxt); static int UTF8ToUTF16BE(unsigned char* outb, int *outlen, const unsigned char* in, int *inlen, void *vctxt); #else /* LIBXML_OUTPUT_ENABLED */ #define UTF8ToLatin1 NULL #define UTF8ToUTF16 NULL #define UTF8ToUTF16LE NULL #define UTF8ToUTF16BE NULL #endif /* LIBXML_OUTPUT_ENABLED */ #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED) static int UTF8ToHtmlWrapper(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, void *vctxt); #else #define UTF8ToHtmlWrapper NULL #endif #ifdef LIBXML_ICONV_ENABLED #define EMPTY_ICONV , (iconv_t) 0, (iconv_t) 0 #else #define EMPTY_ICONV #endif #if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \ defined(LIBXML_ISO8859X_ENABLED) #include "iso8859x.inc" static int ISO8859xToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen, void *vctxt); static int UTF8ToISO8859x(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, void *vctxt); #define MAKE_ISO_HANDLER(name, n) \ { (char *) name, \ (xmlCharEncodingInputFunc) (void (*)(void)) ISO8859xToUTF8, \ (xmlCharEncodingInputFunc) (void (*)(void)) UTF8ToISO8859x \ EMPTY_ICONV, \ (void *) xmlunicodetable_ISO8859_##n, \ (void *) xmltranscodetable_ISO8859_##n, \ NULL, XML_HANDLER_STATIC } #else /* LIBXML_ISO8859X_ENABLED */ #define MAKE_ISO_HANDLER(name, n) \ { (char *) name, NULL, NULL EMPTY_ICONV, NULL, NULL, NULL, \ XML_HANDLER_STATIC } #endif /* LIBXML_ISO8859X_ENABLED */ #define MAKE_HANDLER(name, in, out) \ { (char *) name, \ (xmlCharEncodingInputFunc) (void (*)(void)) in, \ (xmlCharEncodingOutputFunc) (void (*)(void)) out \ EMPTY_ICONV, NULL, NULL, NULL, XML_HANDLER_STATIC } /* * The layout must match enum xmlCharEncoding. 
*
 * Names should match the IANA registry if possible:
 * https://www.iana.org/assignments/character-sets/character-sets.xhtml
 *
 * Entries whose conversion functions are NULL have no built-in
 * converter; the lookup functions in this file fall back to
 * xmlFindExtraHandler for them.
 */
static const xmlCharEncodingHandler defaultHandlers[31] = {
    MAKE_HANDLER(NULL, NULL, NULL), /* NONE */
    MAKE_HANDLER("UTF-8", UTF8ToUTF8, UTF8ToUTF8),
    MAKE_HANDLER("UTF-16LE", UTF16LEToUTF8, UTF8ToUTF16LE),
    MAKE_HANDLER("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE),
    MAKE_HANDLER("UCS-4LE", NULL, NULL),
    MAKE_HANDLER("UCS-4BE", NULL, NULL),
    MAKE_HANDLER("IBM037", NULL, NULL),
    MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_2143 */
    MAKE_HANDLER("ISO-10646-UCS-4", NULL, NULL), /* UCS4_3412 */
    MAKE_HANDLER("ISO-10646-UCS-2", NULL, NULL),
    MAKE_HANDLER("ISO-8859-1", latin1ToUTF8, UTF8ToLatin1),
    MAKE_ISO_HANDLER("ISO-8859-2", 2),
    MAKE_ISO_HANDLER("ISO-8859-3", 3),
    MAKE_ISO_HANDLER("ISO-8859-4", 4),
    MAKE_ISO_HANDLER("ISO-8859-5", 5),
    MAKE_ISO_HANDLER("ISO-8859-6", 6),
    MAKE_ISO_HANDLER("ISO-8859-7", 7),
    MAKE_ISO_HANDLER("ISO-8859-8", 8),
    MAKE_ISO_HANDLER("ISO-8859-9", 9),
    MAKE_HANDLER("ISO-2022-JP", NULL, NULL),
    MAKE_HANDLER("Shift_JIS", NULL, NULL),
    MAKE_HANDLER("EUC-JP", NULL, NULL),
    MAKE_HANDLER("US-ASCII", asciiToAscii, asciiToAscii),
    MAKE_HANDLER("UTF-16", UTF16LEToUTF8, UTF8ToUTF16),
    MAKE_HANDLER("HTML", NULL, UTF8ToHtmlWrapper),
    MAKE_ISO_HANDLER("ISO-8859-10", 10),
    MAKE_ISO_HANDLER("ISO-8859-11", 11),
    MAKE_ISO_HANDLER("ISO-8859-13", 13),
    MAKE_ISO_HANDLER("ISO-8859-14", 14),
    MAKE_ISO_HANDLER("ISO-8859-15", 15),
    MAKE_ISO_HANDLER("ISO-8859-16", 16),
};

/* Number of entries in defaultHandlers, used for range checks. */
#define NUM_DEFAULT_HANDLERS \
    (sizeof(defaultHandlers) / sizeof(defaultHandlers[0]))

/* the size should be growable, but it's not a big deal ...
*/ #define MAX_ENCODING_HANDLERS 50 static xmlCharEncodingHandlerPtr *globalHandlers = NULL; static int nbCharEncodingHandler = 0; #ifdef LIBXML_ICONV_ENABLED static int xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv); #endif #ifdef LIBXML_ICU_ENABLED static int xmlCharEncUconv(void *vctxt, const char *name, xmlCharEncConverter *conv); #endif /************************************************************************ * * * Generic encoding handling routines * * * ************************************************************************/ /** * xmlDetectCharEncoding: * @in: a pointer to the first bytes of the XML entity, must be at least * 2 bytes long (at least 4 if encoding is UTF4 variant). * @len: pointer to the length of the buffer * * Guess the encoding of the entity using the first bytes of the entity content * according to the non-normative appendix F of the XML-1.0 recommendation. * * Returns one of the XML_CHAR_ENCODING_... values. */ xmlCharEncoding xmlDetectCharEncoding(const unsigned char* in, int len) { if (in == NULL) return(XML_CHAR_ENCODING_NONE); if (len >= 4) { if ((in[0] == 0x00) && (in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) return(XML_CHAR_ENCODING_UCS4BE); if ((in[0] == 0x3C) && (in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x00)) return(XML_CHAR_ENCODING_UCS4LE); if ((in[0] == 0x00) && (in[1] == 0x00) && (in[2] == 0x3C) && (in[3] == 0x00)) return(XML_CHAR_ENCODING_UCS4_2143); if ((in[0] == 0x00) && (in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x00)) return(XML_CHAR_ENCODING_UCS4_3412); if ((in[0] == 0x4C) && (in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) return(XML_CHAR_ENCODING_EBCDIC); if ((in[0] == 0x3C) && (in[1] == 0x3F) && (in[2] == 0x78) && (in[3] == 0x6D)) return(XML_CHAR_ENCODING_UTF8); /* * Although not part of the recommendation, we also * attempt an "auto-recognition" of UTF-16LE and * UTF-16BE encodings. 
*/ if ((in[0] == 0x3C) && (in[1] == 0x00) && (in[2] == 0x3F) && (in[3] == 0x00)) return(XML_CHAR_ENCODING_UTF16LE); if ((in[0] == 0x00) && (in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) return(XML_CHAR_ENCODING_UTF16BE); } if (len >= 3) { /* * Errata on XML-1.0 June 20 2001 * We now allow an UTF8 encoded BOM */ if ((in[0] == 0xEF) && (in[1] == 0xBB) && (in[2] == 0xBF)) return(XML_CHAR_ENCODING_UTF8); } /* For UTF-16 we can recognize by the BOM */ if (len >= 2) { if ((in[0] == 0xFE) && (in[1] == 0xFF)) return(XML_CHAR_ENCODING_UTF16BE); if ((in[0] == 0xFF) && (in[1] == 0xFE)) return(XML_CHAR_ENCODING_UTF16LE); } return(XML_CHAR_ENCODING_NONE); } /** * xmlCleanupEncodingAliases: * * DEPRECATED: This function modifies global state and is not * thread-safe. * * Unregisters all aliases */ void xmlCleanupEncodingAliases(void) { int i; if (xmlCharEncodingAliases == NULL) return; for (i = 0;i < xmlCharEncodingAliasesNb;i++) { if (xmlCharEncodingAliases[i].name != NULL) xmlFree((char *) xmlCharEncodingAliases[i].name); if (xmlCharEncodingAliases[i].alias != NULL) xmlFree((char *) xmlCharEncodingAliases[i].alias); } xmlCharEncodingAliasesNb = 0; xmlCharEncodingAliasesMax = 0; xmlFree(xmlCharEncodingAliases); xmlCharEncodingAliases = NULL; } /** * xmlGetEncodingAlias: * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) * * DEPRECATED: This function is not thread-safe. * * Lookup an encoding name for the given alias. 
* * Returns NULL if not found, otherwise the original name */ const char * xmlGetEncodingAlias(const char *alias) { int i; char upper[100]; if (alias == NULL) return(NULL); if (xmlCharEncodingAliases == NULL) return(NULL); for (i = 0;i < 99;i++) { upper[i] = (char) toupper((unsigned char) alias[i]); if (upper[i] == 0) break; } upper[i] = 0; /* * Walk down the list looking for a definition of the alias */ for (i = 0;i < xmlCharEncodingAliasesNb;i++) { if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) { return(xmlCharEncodingAliases[i].name); } } return(NULL); } /** * xmlAddEncodingAlias: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually) * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) * * DEPRECATED: This function modifies global state and is not * thread-safe. * * Registers an alias @alias for an encoding named @name. Existing alias * will be overwritten. * * Returns 0 in case of success, -1 in case of error */ int xmlAddEncodingAlias(const char *name, const char *alias) { int i; char upper[100]; char *nameCopy, *aliasCopy; if ((name == NULL) || (alias == NULL)) return(-1); for (i = 0;i < 99;i++) { upper[i] = (char) toupper((unsigned char) alias[i]); if (upper[i] == 0) break; } upper[i] = 0; if (xmlCharEncodingAliasesNb >= xmlCharEncodingAliasesMax) { xmlCharEncodingAliasPtr tmp; size_t newSize = xmlCharEncodingAliasesMax ? xmlCharEncodingAliasesMax * 2 : 20; tmp = (xmlCharEncodingAliasPtr) xmlRealloc(xmlCharEncodingAliases, newSize * sizeof(xmlCharEncodingAlias)); if (tmp == NULL) return(-1); xmlCharEncodingAliases = tmp; xmlCharEncodingAliasesMax = newSize; } /* * Walk down the list looking for a definition of the alias */ for (i = 0;i < xmlCharEncodingAliasesNb;i++) { if (!strcmp(xmlCharEncodingAliases[i].alias, upper)) { /* * Replace the definition. 
*/ nameCopy = xmlMemStrdup(name); if (nameCopy == NULL) return(-1); xmlFree((char *) xmlCharEncodingAliases[i].name); xmlCharEncodingAliases[i].name = nameCopy; return(0); } } /* * Add the definition */ nameCopy = xmlMemStrdup(name); if (nameCopy == NULL) return(-1); aliasCopy = xmlMemStrdup(upper); if (aliasCopy == NULL) { xmlFree(nameCopy); return(-1); } xmlCharEncodingAliases[xmlCharEncodingAliasesNb].name = nameCopy; xmlCharEncodingAliases[xmlCharEncodingAliasesNb].alias = aliasCopy; xmlCharEncodingAliasesNb++; return(0); } /** * xmlDelEncodingAlias: * @alias: the alias name as parsed, in UTF-8 format (ASCII actually) * * DEPRECATED: This function modifies global state and is not * thread-safe. * * Unregisters an encoding alias @alias * * Returns 0 in case of success, -1 in case of error */ int xmlDelEncodingAlias(const char *alias) { int i; if (alias == NULL) return(-1); if (xmlCharEncodingAliases == NULL) return(-1); /* * Walk down the list looking for a definition of the alias */ for (i = 0;i < xmlCharEncodingAliasesNb;i++) { if (!strcmp(xmlCharEncodingAliases[i].alias, alias)) { xmlFree((char *) xmlCharEncodingAliases[i].name); xmlFree((char *) xmlCharEncodingAliases[i].alias); xmlCharEncodingAliasesNb--; memmove(&xmlCharEncodingAliases[i], &xmlCharEncodingAliases[i + 1], sizeof(xmlCharEncodingAlias) * (xmlCharEncodingAliasesNb - i)); return(0); } } return(-1); } static int xmlCompareEncTableEntries(const void *vkey, const void *ventry) { const char *key = vkey; const xmlEncTableEntry *entry = ventry; return(xmlStrcasecmp(BAD_CAST key, BAD_CAST entry->name)); } /** * xmlParseCharEncoding: * @name: the encoding name as parsed, in UTF-8 format (ASCII actually) * * Compare the string to the encoding schemes already known. Note * that the comparison is case insensitive accordingly to the section * [XML] 4.3.3 Character Encoding in Entities. * * Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE * if not recognized. 
*/
xmlCharEncoding
xmlParseCharEncoding(const char *name)
{
    const xmlEncTableEntry *match;
    const size_t count = sizeof(xmlEncTable) / sizeof(xmlEncTable[0]);

    if (name == NULL)
        return(XML_CHAR_ENCODING_NONE);

    /* Binary search of xmlEncTable with a case-insensitive comparator. */
    match = bsearch(name, xmlEncTable, count, sizeof(xmlEncTable[0]),
                    xmlCompareEncTableEntries);

    return((match != NULL) ? match->enc : XML_CHAR_ENCODING_NONE);
}

/**
 * xmlGetCharEncodingName:
 * @enc:  the encoding
 *
 * The "canonical" name for XML encoding.
 * C.f. http://www.w3.org/TR/REC-xml#charencoding
 * Section 4.3.3  Character Encoding in Entities
 *
 * Both UTF-16 byte orders are reported as "UTF-16" and both UCS-4
 * byte orders as "ISO-10646-UCS-4"; every other value is looked up in
 * the default handler table.
 *
 * Returns the canonical name for the given encoding, or NULL if @enc
 * is XML_CHAR_ENCODING_NONE, negative or out of range.
 */
const char*
xmlGetCharEncodingName(xmlCharEncoding enc) {
    if ((enc == XML_CHAR_ENCODING_UTF16LE) ||
        (enc == XML_CHAR_ENCODING_UTF16BE))
        return("UTF-16");

    if ((enc == XML_CHAR_ENCODING_UCS4LE) ||
        (enc == XML_CHAR_ENCODING_UCS4BE))
        return("ISO-10646-UCS-4");

    if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS))
        return(defaultHandlers[enc].name);

    return(NULL);
}

/************************************************************************
 *									*
 *			Char encoding handlers				*
 *									*
 ************************************************************************/

/**
 * xmlNewCharEncodingHandler:
 * @name:  the encoding name, in UTF-8 format (ASCII actually)
 * @input:  the xmlCharEncodingInputFunc to read that encoding
 * @output:  the xmlCharEncodingOutputFunc to write that encoding
 *
 * DEPRECATED: This function modifies global state and is not
 * thread-safe.
 *
 * Create and registers an xmlCharEncodingHandler.
 *
 * Returns the xmlCharEncodingHandlerPtr created (or NULL in case of error).
*/ xmlCharEncodingHandlerPtr xmlNewCharEncodingHandler(const char *name, xmlCharEncodingInputFunc input, xmlCharEncodingOutputFunc output) { xmlCharEncodingHandlerPtr handler; const char *alias; char upper[500]; int i; char *up = NULL; /* * Do the alias resolution */ alias = xmlGetEncodingAlias(name); if (alias != NULL) name = alias; /* * Keep only the uppercase version of the encoding. */ if (name == NULL) return(NULL); for (i = 0;i < 499;i++) { upper[i] = (char) toupper((unsigned char) name[i]); if (upper[i] == 0) break; } upper[i] = 0; up = xmlMemStrdup(upper); if (up == NULL) return(NULL); /* * allocate and fill-up an handler block. */ handler = (xmlCharEncodingHandlerPtr) xmlMalloc(sizeof(xmlCharEncodingHandler)); if (handler == NULL) { xmlFree(up); return(NULL); } memset(handler, 0, sizeof(xmlCharEncodingHandler)); handler->input = input; handler->output = output; handler->name = up; handler->flags = XML_HANDLER_STATIC; #ifdef LIBXML_ICONV_ENABLED handler->iconv_in = NULL; handler->iconv_out = NULL; #endif /* * registers and returns the handler. */ xmlRegisterCharEncodingHandler(handler); return(handler); } /** * xmlInitCharEncodingHandlers: * * DEPRECATED: Alias for xmlInitParser. */ void xmlInitCharEncodingHandlers(void) { xmlInitParser(); } /** * xmlInitEncodingInternal: * * Initialize the char encoding support. */ void xmlInitEncodingInternal(void) { unsigned short int tst = 0x1234; unsigned char *ptr = (unsigned char *) &tst; if (*ptr == 0x12) xmlLittleEndian = 0; else xmlLittleEndian = 1; } /** * xmlCleanupCharEncodingHandlers: * * DEPRECATED: This function will be made private. Call xmlCleanupParser * to free global state but see the warnings there. xmlCleanupParser * should be only called once at program exit. In most cases, you don't * have call cleanup functions at all. * * Cleanup the memory allocated for the char encoding support, it * unregisters all the encoding handlers and the aliases. 
*/ void xmlCleanupCharEncodingHandlers(void) { xmlCleanupEncodingAliases(); if (globalHandlers == NULL) return; for (;nbCharEncodingHandler > 0;) { xmlCharEncodingHandler *handler; nbCharEncodingHandler--; handler = globalHandlers[nbCharEncodingHandler]; if (handler != NULL) { if (handler->name != NULL) xmlFree(handler->name); xmlFree(handler); } } xmlFree(globalHandlers); globalHandlers = NULL; nbCharEncodingHandler = 0; } /** * xmlRegisterCharEncodingHandler: * @handler: the xmlCharEncodingHandlerPtr handler block * * DEPRECATED: This function modifies global state and is not * thread-safe. * * Register the char encoding handler. */ void xmlRegisterCharEncodingHandler(xmlCharEncodingHandlerPtr handler) { if (handler == NULL) return; if (globalHandlers == NULL) { globalHandlers = xmlMalloc( MAX_ENCODING_HANDLERS * sizeof(globalHandlers[0])); if (globalHandlers == NULL) goto free_handler; } if (nbCharEncodingHandler >= MAX_ENCODING_HANDLERS) goto free_handler; globalHandlers[nbCharEncodingHandler++] = handler; return; free_handler: if (handler != NULL) { if (handler->name != NULL) { xmlFree(handler->name); } xmlFree(handler); } } static int xmlInvokeConvImpl(xmlCharEncConvImpl impl, void *implCtxt, const char *name, xmlCharEncodingHandler *handler) { xmlCharEncConverter conv = { NULL, NULL, NULL, NULL, NULL }; int ret; ret = impl(implCtxt, name, &conv); if (ret == XML_ERR_OK) { handler->input = (xmlCharEncodingInputFunc) (void (*)(void)) conv.input; handler->output = (xmlCharEncodingOutputFunc) (void (*)(void)) conv.output; handler->ctxtDtor = conv.ctxtDtor; handler->inputCtxt = conv.inputCtxt; handler->outputCtxt = conv.outputCtxt; } return(ret); } /** * xmlFindExtraHandler: * @norig: name of the char encoding * @name: potentially aliased name of the encoding * @output: boolean, use handler for output * @impl: a conversion implementation (optional) * @implCtxt: user data for conversion implementation (optional) * @out: pointer to resulting handler * * Search the 
non-default handlers for an exact match. * * Returns an xmlParserErrors error code. */ static int xmlFindExtraHandler(const char *norig, const char *name, int output, xmlCharEncConvImpl impl, void *implCtxt, xmlCharEncodingHandler **out) { xmlCharEncodingHandler *handler; int ret; int i; handler = xmlMalloc(sizeof(*handler)); if (handler == NULL) return(XML_ERR_NO_MEMORY); memset(handler, 0, sizeof(*handler)); handler->name = xmlMemStrdup(name); if (handler->name == NULL) { ret = XML_ERR_NO_MEMORY; goto done; } /* * Try custom implementation before deprecated global handlers. * * Note that we pass the original name without deprecated * alias resolution. */ if (impl != NULL) { ret = xmlInvokeConvImpl(impl, implCtxt, norig, handler); if (ret != XML_ERR_OK) goto done; *out = handler; return(XML_ERR_OK); } /* * Deprecated */ if (globalHandlers != NULL) { for (i = 0; i < nbCharEncodingHandler; i++) { xmlCharEncodingHandler *h = globalHandlers[i]; if (!xmlStrcasecmp((const xmlChar *) name, (const xmlChar *) h->name)) { if ((output ? h->output : h->input) != NULL) { *out = h; ret = XML_ERR_OK; goto done; } } } } #ifdef LIBXML_ICONV_ENABLED ret = xmlInvokeConvImpl(xmlCharEncIconv, handler, name, handler); if (ret == XML_ERR_OK) { *out = handler; return(XML_ERR_OK); } if (ret != XML_ERR_UNSUPPORTED_ENCODING) goto done; #endif /* LIBXML_ICONV_ENABLED */ #ifdef LIBXML_ICU_ENABLED ret = xmlInvokeConvImpl(xmlCharEncUconv, handler, name, handler); if (ret == XML_ERR_OK) { *out = handler; return(XML_ERR_OK); } if (ret != XML_ERR_UNSUPPORTED_ENCODING) goto done; #endif /* LIBXML_ICU_ENABLED */ ret = XML_ERR_UNSUPPORTED_ENCODING; done: if (handler != NULL) { xmlFree(handler->name); xmlFree(handler); } return(ret); } /** * xmlLookupCharEncodingHandler: * @enc: an xmlCharEncoding value. * @out: pointer to result * * Find or create a handler matching the encoding. 
The following * converters are looked up in order: * * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) * - User-registered global handler (deprecated) * - iconv if enabled * - ICU if enabled * * The handler must be closed with xmlCharEncCloseFunc. * * If the encoding is UTF-8, a NULL handler and no error code will * be returned. * * Available since 2.13.0. * * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another * xmlParserErrors error code. */ int xmlLookupCharEncodingHandler(xmlCharEncoding enc, xmlCharEncodingHandler **out) { const xmlCharEncodingHandler *handler; if (out == NULL) return(XML_ERR_ARGUMENT); *out = NULL; if ((enc <= 0) || ((size_t) enc >= NUM_DEFAULT_HANDLERS)) return(XML_ERR_UNSUPPORTED_ENCODING); /* Return NULL handler for UTF-8 */ if ((enc == XML_CHAR_ENCODING_UTF8) || (enc == XML_CHAR_ENCODING_NONE)) return(XML_ERR_OK); handler = &defaultHandlers[enc]; if ((handler->input != NULL) || (handler->output != NULL)) { *out = (xmlCharEncodingHandler *) handler; return(XML_ERR_OK); } if (handler->name != NULL) return(xmlFindExtraHandler(handler->name, handler->name, 0, NULL, NULL, out)); return(XML_ERR_UNSUPPORTED_ENCODING); } /** * xmlGetCharEncodingHandler: * @enc: an xmlCharEncoding value. * * DEPRECATED: Use xmlLookupCharEncodingHandler which has better error * reporting. * * Returns the handler or NULL if no handler was found or an error * occurred. */ xmlCharEncodingHandlerPtr xmlGetCharEncodingHandler(xmlCharEncoding enc) { xmlCharEncodingHandler *ret; xmlLookupCharEncodingHandler(enc, &ret); return(ret); } /** * xmlCreateCharEncodingHandler: * @name: a string describing the char encoding. * @output: boolean, use handler for output * @impl: a conversion implementation (optional) * @implCtxt: user data for conversion implementation (optional) * @out: pointer to result * * Find or create a handler matching the encoding. 
The following * converters are looked up in order: * * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) * - Custom implementation if provided * - User-registered global handler (deprecated) * - iconv if enabled * - ICU if enabled * * The handler must be closed with xmlCharEncCloseFunc. * * If the encoding is UTF-8, a NULL handler and no error code will * be returned. * * Available since 2.14.0. * * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another * xmlParserErrors error code. */ int xmlCreateCharEncodingHandler(const char *name, int output, xmlCharEncConvImpl impl, void *implCtxt, xmlCharEncodingHandler **out) { const xmlCharEncodingHandler *handler; const char *norig, *nalias; xmlCharEncoding enc; if (out == NULL) return(XML_ERR_ARGUMENT); *out = NULL; if (name == NULL) return(XML_ERR_ARGUMENT); norig = name; nalias = xmlGetEncodingAlias(name); if (nalias != NULL) name = nalias; enc = xmlParseCharEncoding(name); /* Return NULL handler for UTF-8 */ if (enc == XML_CHAR_ENCODING_UTF8) return(XML_ERR_OK); if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) { handler = &defaultHandlers[enc]; if ((output ? handler->output : handler->input) != NULL) { *out = (xmlCharEncodingHandler *) handler; return(XML_ERR_OK); } } return(xmlFindExtraHandler(norig, name, output, impl, implCtxt, out)); } /** * xmlOpenCharEncodingHandler: * @name: a string describing the char encoding. * @output: boolean, use handler for output * @out: pointer to result * * Find or create a handler matching the encoding. The following * converters are looked up in order: * * - Built-in handler (UTF-8, UTF-16, ISO-8859-1, ASCII) * - User-registered global handler (deprecated) * - iconv if enabled * - ICU if enabled * * The handler must be closed with xmlCharEncCloseFunc. * * If the encoding is UTF-8, a NULL handler and no error code will * be returned. * * Available since 2.13.0. * * Returns XML_ERR_OK, XML_ERR_UNSUPPORTED_ENCODING or another * xmlParserErrors error code. 
*/ int xmlOpenCharEncodingHandler(const char *name, int output, xmlCharEncodingHandler **out) { return(xmlCreateCharEncodingHandler(name, output, NULL, NULL, out)); } /** * xmlFindCharEncodingHandler: * @name: a string describing the char encoding. * * DEPRECATED: Use xmlOpenCharEncodingHandler which has better error * reporting. * * If the encoding is UTF-8, this will return a no-op handler that * shouldn't be used. * * Returns the handler or NULL if no handler was found or an error * occurred. */ xmlCharEncodingHandlerPtr xmlFindCharEncodingHandler(const char *name) { xmlCharEncodingHandler *ret; /* * This handler shouldn't be used, but we must return a non-NULL * handler. */ if ((xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF-8") == 0) || (xmlStrcasecmp(BAD_CAST name, BAD_CAST "UTF8") == 0)) return((xmlCharEncodingHandlerPtr) &defaultHandlers[XML_CHAR_ENCODING_UTF8]); xmlOpenCharEncodingHandler(name, 0, &ret); return(ret); } /************************************************************************ * * * ICONV based generic conversion functions * * * ************************************************************************/ #ifdef LIBXML_ICONV_ENABLED typedef struct { iconv_t cd; } xmlIconvCtxt; /** * xmlIconvConvert: * @vctxt: conversion context * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of input bytes * @inlen: the length of @in * * Returns an XML_ENC_ERR code. * * The value of @inlen after return is the number of octets consumed * as the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets produced. 
*/ static int xmlIconvConvert(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, void *vctxt) { xmlIconvCtxt *ctxt = vctxt; size_t icv_inlen, icv_outlen; const char *icv_in = (const char *) in; char *icv_out = (char *) out; size_t ret; if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { if (outlen != NULL) *outlen = 0; return(XML_ENC_ERR_INTERNAL); } icv_inlen = *inlen; icv_outlen = *outlen; /* * Some versions take const, other versions take non-const input. */ ret = iconv(ctxt->cd, (void *) &icv_in, &icv_inlen, &icv_out, &icv_outlen); *inlen -= icv_inlen; *outlen -= icv_outlen; if (ret == (size_t) -1) { if (errno == EILSEQ) return(XML_ENC_ERR_INPUT); if (errno == E2BIG) return(XML_ENC_ERR_SPACE); /* * EINVAL means a truncated multi-byte sequence at the end * of the input buffer. We treat this as success. */ if (errno == EINVAL) return(XML_ENC_ERR_SUCCESS); return(XML_ENC_ERR_INTERNAL); } return(XML_ENC_ERR_SUCCESS); } static void xmlIconvFree(void *vctxt) { xmlIconvCtxt *ctxt = vctxt; if (ctxt->cd != (iconv_t) -1) iconv_close(ctxt->cd); xmlFree(ctxt); } static int xmlCharEncIconv(void *vctxt, const char *name, xmlCharEncConverter *conv) { xmlCharEncodingHandler *handler = vctxt; xmlIconvCtxt *inputCtxt = NULL, *outputCtxt = NULL; iconv_t icv_in; iconv_t icv_out; int ret; inputCtxt = xmlMalloc(sizeof(xmlIconvCtxt)); if (inputCtxt == NULL) { ret = XML_ERR_NO_MEMORY; goto error; } inputCtxt->cd = (iconv_t) -1; icv_in = iconv_open("UTF-8", name); if (icv_in == (iconv_t) -1) { if (errno == EINVAL) ret = XML_ERR_UNSUPPORTED_ENCODING; else if (errno == ENOMEM) ret = XML_ERR_NO_MEMORY; else ret = XML_ERR_SYSTEM; goto error; } inputCtxt->cd = icv_in; outputCtxt = xmlMalloc(sizeof(xmlIconvCtxt)); if (outputCtxt == NULL) { ret = XML_ERR_NO_MEMORY; goto error; } outputCtxt->cd = (iconv_t) -1; icv_out = iconv_open(name, "UTF-8"); if (icv_out == (iconv_t) -1) { if (errno == EINVAL) ret = XML_ERR_UNSUPPORTED_ENCODING; else if (errno == 
ENOMEM) ret = XML_ERR_NO_MEMORY; else ret = XML_ERR_SYSTEM; goto error; } outputCtxt->cd = icv_out; conv->input = xmlIconvConvert; conv->output = xmlIconvConvert; conv->ctxtDtor = xmlIconvFree; conv->inputCtxt = inputCtxt; conv->outputCtxt = outputCtxt; /* Backward compatibility */ if (handler != NULL) { handler->iconv_in = icv_in; handler->iconv_out = icv_out; } return(XML_ERR_OK); error: if (inputCtxt != NULL) xmlIconvFree(inputCtxt); if (outputCtxt != NULL) xmlIconvFree(outputCtxt); return(ret); } #endif /* LIBXML_ICONV_ENABLED */ /************************************************************************ * * * ICU based generic conversion functions * * * ************************************************************************/ #ifdef LIBXML_ICU_ENABLED /* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */ #define ICU_PIVOT_BUF_SIZE 1024 typedef struct _uconv_t xmlUconvCtxt; struct _uconv_t { UConverter *uconv; /* for conversion between an encoding and UTF-16 */ UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */ UChar *pivot_source; UChar *pivot_target; int isInput; UChar pivot_buf[ICU_PIVOT_BUF_SIZE]; }; /** * xmlUconvConvert: * @vctxt: converison context * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of input bytes * @inlen: the length of @in * * Returns an XML_ENC_ERR code. * * The value of @inlen after return is the number of octets consumed * as the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets produced. 
*/ static int xmlUconvConvert(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, void *vctxt) { xmlUconvCtxt *cd = vctxt; const char *ucv_in = (const char *) in; char *ucv_out = (char *) out; UErrorCode err = U_ZERO_ERROR; if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) { if (outlen != NULL) *outlen = 0; return(XML_ENC_ERR_INTERNAL); } /* * Note that the ICU API is stateful. It can always consume a certain * amount of input even if the output buffer would overflow. The * remaining input must be processed by calling ucnv_convertEx with a * possibly empty input buffer. * * ucnv_convertEx is always called with reset and flush set to 0, * so we don't mess up the state. This should never generate * U_TRUNCATED_CHAR_FOUND errors. */ if (cd->isInput) { /* encoding => UTF-16 => UTF-8 */ ucnv_convertEx(cd->utf8, cd->uconv, &ucv_out, ucv_out + *outlen, &ucv_in, ucv_in + *inlen, cd->pivot_buf, &cd->pivot_source, &cd->pivot_target, cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err); } else { /* UTF-8 => UTF-16 => encoding */ ucnv_convertEx(cd->uconv, cd->utf8, &ucv_out, ucv_out + *outlen, &ucv_in, ucv_in + *inlen, cd->pivot_buf, &cd->pivot_source, &cd->pivot_target, cd->pivot_buf + ICU_PIVOT_BUF_SIZE, 0, 0, &err); } *inlen = ucv_in - (const char*) in; *outlen = ucv_out - (char *) out; if (U_SUCCESS(err)) { return(XML_ENC_ERR_SUCCESS); } if (err == U_BUFFER_OVERFLOW_ERROR) return(XML_ENC_ERR_SPACE); if (err == U_INVALID_CHAR_FOUND || err == U_ILLEGAL_CHAR_FOUND) return(XML_ENC_ERR_INPUT); return(XML_ENC_ERR_INTERNAL); } static int openIcuConverter(const char* name, int isInput, xmlUconvCtxt **out) { UErrorCode status; xmlUconvCtxt *conv; *out = NULL; conv = (xmlUconvCtxt *) xmlMalloc(sizeof(xmlUconvCtxt)); if (conv == NULL) return(XML_ERR_NO_MEMORY); conv->isInput = isInput; conv->pivot_source = conv->pivot_buf; conv->pivot_target = conv->pivot_buf; status = U_ZERO_ERROR; conv->uconv = ucnv_open(name, &status); if (U_FAILURE(status)) 
goto error; status = U_ZERO_ERROR; if (isInput) { ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &status); } else { ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &status); } if (U_FAILURE(status)) goto error; status = U_ZERO_ERROR; conv->utf8 = ucnv_open("UTF-8", &status); if (U_FAILURE(status)) goto error; *out = conv; return(0); error: if (conv->uconv) ucnv_close(conv->uconv); xmlFree(conv); if (status == U_FILE_ACCESS_ERROR) return(XML_ERR_UNSUPPORTED_ENCODING); if (status == U_MEMORY_ALLOCATION_ERROR) return(XML_ERR_NO_MEMORY); return(XML_ERR_SYSTEM); } static void closeIcuConverter(xmlUconvCtxt *conv) { if (conv == NULL) return; ucnv_close(conv->uconv); ucnv_close(conv->utf8); xmlFree(conv); } static void xmlUconvFree(void *vctxt) { closeIcuConverter(vctxt); } static int xmlCharEncUconv(void *vctxt ATTRIBUTE_UNUSED, const char *name, xmlCharEncConverter *conv) { xmlUconvCtxt *ucv_in = NULL; xmlUconvCtxt *ucv_out = NULL; int ret; ret = openIcuConverter(name, 1, &ucv_in); if (ret != 0) goto error; ret = openIcuConverter(name, 0, &ucv_out); if (ret != 0) goto error; conv->input = xmlUconvConvert; conv->output = xmlUconvConvert; conv->ctxtDtor = xmlUconvFree; conv->inputCtxt = ucv_in; conv->outputCtxt = ucv_out; return(XML_ERR_OK); error: if (ucv_in != NULL) closeIcuConverter(ucv_in); if (ucv_out != NULL) closeIcuConverter(ucv_out); return(ret); } #endif /* LIBXML_ICU_ENABLED */ /************************************************************************ * * * The real API used by libxml for on-the-fly conversion * * * ************************************************************************/ /** * xmlEncConvertError: * @code: XML_ENC_ERR code * * Convert XML_ENC_ERR to libxml2 error codes. 
*/ static int xmlEncConvertError(int code) { int ret; switch (code) { case XML_ENC_ERR_SUCCESS: ret = XML_ERR_OK; break; case XML_ENC_ERR_INPUT: ret = XML_ERR_INVALID_ENCODING; break; case XML_ENC_ERR_MEMORY: ret = XML_ERR_NO_MEMORY; break; default: ret = XML_ERR_INTERNAL_ERROR; break; } return(ret); } /** * xmlEncInputChunk: * @handler: encoding handler * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of input bytes * @inlen: the length of @in * * The value of @inlen after return is the number of octets consumed * as the return value is 0, else unpredictable. * The value of @outlen after return is the number of octets produced. * * Returns an XML_ENC_ERR code. */ int xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, int *outlen, const unsigned char *in, int *inlen) { int ret; if (handler->input != NULL) { xmlCharEncConvFunc conv = (xmlCharEncConvFunc) (void (*)(void)) handler->input; ret = conv(out, outlen, in, inlen, handler->inputCtxt); if (ret > 0) ret = XML_ENC_ERR_SUCCESS; } else { *outlen = 0; *inlen = 0; ret = XML_ENC_ERR_INTERNAL; } return(ret); } /** * xmlEncOutputChunk: * @handler: encoding handler * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of input bytes * @inlen: the length of @in * * Returns an XML_ENC_ERR code. * * The value of @inlen after return is the number of octets consumed * as the return value is 0, else unpredictable. * The value of @outlen after return is the number of octets produced. 
*/ static int xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out, int *outlen, const unsigned char *in, int *inlen) { int ret; if (handler->output != NULL) { xmlCharEncConvFunc conv = (xmlCharEncConvFunc) (void (*)(void)) handler->output; ret = conv(out, outlen, in, inlen, handler->outputCtxt); if (ret > 0) ret = XML_ENC_ERR_SUCCESS; } else { *outlen = 0; *inlen = 0; ret = XML_ENC_ERR_INTERNAL; } return(ret); } /** * xmlCharEncFirstLine: * @handler: char encoding transformation data structure * @out: an xmlBuffer for the output. * @in: an xmlBuffer for the input * * DEPERECATED: Don't use. * * Returns the number of bytes written or an XML_ENC_ERR code. */ int xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, xmlBufferPtr in) { return(xmlCharEncInFunc(handler, out, in)); } /** * xmlCharEncInput: * @input: a parser input buffer * * Generic front-end for the encoding handler on parser input * * Returns the number of bytes written or an XML_ENC_ERR code. */ int xmlCharEncInput(xmlParserInputBufferPtr input) { int ret; size_t avail; size_t toconv; int c_in; int c_out; xmlBufPtr in; xmlBufPtr out; const xmlChar *inData; size_t inTotal = 0; if ((input == NULL) || (input->encoder == NULL) || (input->buffer == NULL) || (input->raw == NULL)) return(XML_ENC_ERR_INTERNAL); out = input->buffer; in = input->raw; toconv = xmlBufUse(in); if (toconv == 0) return (0); inData = xmlBufContent(in); inTotal = 0; do { c_in = toconv > INT_MAX / 2 ? 
INT_MAX / 2 : toconv; avail = xmlBufAvail(out); if (avail > INT_MAX) avail = INT_MAX; if (avail < 4096) { if (xmlBufGrow(out, 4096) < 0) { input->error = XML_ERR_NO_MEMORY; return(XML_ENC_ERR_MEMORY); } avail = xmlBufAvail(out); } c_in = toconv; c_out = avail; ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, inData, &c_in); inTotal += c_in; inData += c_in; toconv -= c_in; xmlBufAddLen(out, c_out); } while (ret == XML_ENC_ERR_SPACE); xmlBufShrink(in, inTotal); if (input->rawconsumed > ULONG_MAX - (unsigned long)c_in) input->rawconsumed = ULONG_MAX; else input->rawconsumed += c_in; if (((ret != 0) && (c_out == 0)) || (ret == XML_ENC_ERR_MEMORY)) { if (input->error == 0) input->error = xmlEncConvertError(ret); return(ret); } return (c_out); } /** * xmlCharEncInFunc: * @handler: char encoding transformation data structure * @out: an xmlBuffer for the output. * @in: an xmlBuffer for the input * * Generic front-end for the encoding handler input function * * Returns the number of bytes written or an XML_ENC_ERR code. */ int xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, xmlBufferPtr in) { int ret; int written; int toconv; if (handler == NULL) return(XML_ENC_ERR_INTERNAL); if (out == NULL) return(XML_ENC_ERR_INTERNAL); if (in == NULL) return(XML_ENC_ERR_INTERNAL); toconv = in->use; if (toconv == 0) return (0); written = out->size - out->use -1; /* count '\0' */ if (toconv * 2 >= written) { xmlBufferGrow(out, out->size + toconv * 2); written = out->size - out->use - 1; } ret = xmlEncInputChunk(handler, &out->content[out->use], &written, in->content, &toconv); xmlBufferShrink(in, toconv); out->use += written; out->content[out->use] = 0; return (written? 
written : ret); } #ifdef LIBXML_OUTPUT_ENABLED /** * xmlCharEncOutput: * @output: a parser output buffer * @init: is this an initialization call without data * * Generic front-end for the encoding handler on parser output * a first call with @init == 1 has to be made first to initiate the * output in case of non-stateless encoding needing to initiate their * state or the output (like the BOM in UTF16). * In case of UTF8 sequence conversion errors for the given encoder, * the content will be automatically remapped to a CharRef sequence. * * Returns the number of bytes written or an XML_ENC_ERR code. */ int xmlCharEncOutput(xmlOutputBufferPtr output, int init) { int ret; size_t written; int writtentot = 0; size_t toconv; int c_in; int c_out; xmlBufPtr in; xmlBufPtr out; if ((output == NULL) || (output->encoder == NULL) || (output->buffer == NULL) || (output->conv == NULL)) return(XML_ENC_ERR_INTERNAL); out = output->conv; in = output->buffer; retry: written = xmlBufAvail(out); /* * First specific handling of the initialization call */ if (init) { c_in = 0; c_out = written; /* TODO: Check return value. */ xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out, NULL, &c_in); xmlBufAddLen(out, c_out); return(c_out); } /* * Conversion itself. 
*/ toconv = xmlBufUse(in); if (toconv > 64 * 1024) toconv = 64 * 1024; if (toconv * 4 >= written) { if (xmlBufGrow(out, toconv * 4) < 0) { ret = XML_ENC_ERR_MEMORY; goto error; } written = xmlBufAvail(out); } if (written > 256 * 1024) written = 256 * 1024; c_in = toconv; c_out = written; ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out, xmlBufContent(in), &c_in); xmlBufShrink(in, c_in); xmlBufAddLen(out, c_out); writtentot += c_out; if (ret == XML_ENC_ERR_SPACE) goto retry; /* * Attempt to handle error cases */ if (ret == XML_ENC_ERR_INPUT) { xmlChar charref[20]; int len = xmlBufUse(in); xmlChar *content = xmlBufContent(in); int cur, charrefLen; cur = xmlGetUTF8Char(content, &len); if (cur <= 0) goto error; /* * Removes the UTF8 sequence, and replace it by a charref * and continue the transcoding phase, hoping the error * did not mangle the encoder state. */ charrefLen = snprintf((char *) &charref[0], sizeof(charref), "&#%d;", cur); xmlBufGrow(out, charrefLen * 4); c_out = xmlBufAvail(out); c_in = charrefLen; ret = xmlEncOutputChunk(output->encoder, xmlBufEnd(out), &c_out, charref, &c_in); if ((ret < 0) || (c_in != charrefLen)) { ret = XML_ENC_ERR_INTERNAL; goto error; } xmlBufShrink(in, len); xmlBufAddLen(out, c_out); writtentot += c_out; goto retry; } error: if (((writtentot <= 0) && (ret != 0)) || (ret == XML_ENC_ERR_MEMORY)) { if (output->error == 0) output->error = xmlEncConvertError(ret); return(ret); } return(writtentot); } #endif /** * xmlCharEncOutFunc: * @handler: char encoding transformation data structure * @out: an xmlBuffer for the output. * @in: an xmlBuffer for the input * * Generic front-end for the encoding handler output function * a first call with @in == NULL has to be made firs to initiate the * output in case of non-stateless encoding needing to initiate their * state or the output (like the BOM in UTF16). 
* In case of UTF8 sequence conversion errors for the given encoder, * the content will be automatically remapped to a CharRef sequence. * * Returns the number of bytes written or an XML_ENC_ERR code. */ int xmlCharEncOutFunc(xmlCharEncodingHandler *handler, xmlBufferPtr out, xmlBufferPtr in) { int ret; int written; int writtentot = 0; int toconv; if (handler == NULL) return(XML_ENC_ERR_INTERNAL); if (out == NULL) return(XML_ENC_ERR_INTERNAL); retry: written = out->size - out->use; if (written > 0) written--; /* Gennady: count '/0' */ /* * First specific handling of in = NULL, i.e. the initialization call */ if (in == NULL) { toconv = 0; /* TODO: Check return value. */ xmlEncOutputChunk(handler, &out->content[out->use], &written, NULL, &toconv); out->use += written; out->content[out->use] = 0; return(0); } /* * Conversion itself. */ toconv = in->use; if (toconv * 4 >= written) { xmlBufferGrow(out, toconv * 4); written = out->size - out->use - 1; } ret = xmlEncOutputChunk(handler, &out->content[out->use], &written, in->content, &toconv); xmlBufferShrink(in, toconv); out->use += written; writtentot += written; out->content[out->use] = 0; if (ret == XML_ENC_ERR_SPACE) goto retry; /* * Attempt to handle error cases */ if (ret == XML_ENC_ERR_INPUT) { xmlChar charref[20]; int len = in->use; const xmlChar *utf = (const xmlChar *) in->content; int cur, charrefLen; cur = xmlGetUTF8Char(utf, &len); if (cur <= 0) return(ret); /* * Removes the UTF8 sequence, and replace it by a charref * and continue the transcoding phase, hoping the error * did not mangle the encoder state. 
*/ charrefLen = snprintf((char *) &charref[0], sizeof(charref), "&#%d;", cur); xmlBufferShrink(in, len); xmlBufferGrow(out, charrefLen * 4); written = out->size - out->use - 1; toconv = charrefLen; ret = xmlEncOutputChunk(handler, &out->content[out->use], &written, charref, &toconv); if ((ret < 0) || (toconv != charrefLen)) return(XML_ENC_ERR_INTERNAL); out->use += written; writtentot += written; out->content[out->use] = 0; goto retry; } return(writtentot ? writtentot : ret); } /** * xmlCharEncCloseFunc: * @handler: char encoding transformation data structure * * Releases an xmlCharEncodingHandler. Must be called after * a handler is no longer in use. * * Returns 0. */ int xmlCharEncCloseFunc(xmlCharEncodingHandler *handler) { if (handler == NULL) return(0); if (handler->flags & XML_HANDLER_STATIC) return(0); xmlFree(handler->name); if (handler->ctxtDtor != NULL) { handler->ctxtDtor(handler->inputCtxt); handler->ctxtDtor(handler->outputCtxt); } xmlFree(handler); return(0); } /** * xmlByteConsumed: * @ctxt: an XML parser context * * This function provides the current index of the parser relative * to the start of the current entity. This function is computed in * bytes from the beginning starting at zero and finishing at the * size in byte of the file if parsing a file. The function is * of constant cost if the input is UTF-8 but can be costly if run * on non-UTF-8 input. * * Returns the index in bytes from the beginning of the entity or -1 * in case the index could not be computed. 
*/ long xmlByteConsumed(xmlParserCtxtPtr ctxt) { xmlParserInputPtr in; if (ctxt == NULL) return(-1); in = ctxt->input; if (in == NULL) return(-1); if ((in->buf != NULL) && (in->buf->encoder != NULL)) { unsigned int unused = 0; xmlCharEncodingHandler * handler = in->buf->encoder; /* * Encoding conversion, compute the number of unused original * bytes from the input not consumed and subtract that from * the raw consumed value, this is not a cheap operation */ if (in->end - in->cur > 0) { unsigned char convbuf[32000]; const unsigned char *cur = (const unsigned char *)in->cur; int toconv = in->end - in->cur, written = 32000; int ret; do { toconv = in->end - cur; written = 32000; ret = xmlEncOutputChunk(handler, &convbuf[0], &written, cur, &toconv); if ((ret != XML_ENC_ERR_SUCCESS) && (ret != XML_ENC_ERR_SPACE)) return(-1); unused += written; cur += toconv; } while (ret == XML_ENC_ERR_SPACE); } if (in->buf->rawconsumed < unused) return(-1); return(in->buf->rawconsumed - unused); } return(in->consumed + (in->cur - in->base)); } /************************************************************************ * * * Conversions To/From UTF8 encoding * * * ************************************************************************/ static int asciiToAscii(unsigned char* out, int *poutlen, const unsigned char* in, int *pinlen, void *vctxt ATTRIBUTE_UNUSED) { const unsigned char *inend; const unsigned char *instart = in; int inlen, outlen, ret; if (in == NULL) { *pinlen = 0; *poutlen = 0; return(XML_ENC_ERR_SUCCESS); } inlen = *pinlen; outlen = *poutlen; if (outlen < inlen) { inlen = outlen; ret = XML_ENC_ERR_SPACE; } else { ret = inlen; } inend = in + inlen; *poutlen = inlen; *pinlen = inlen; while (in < inend) { unsigned c = *in; if (c >= 0x80) { *poutlen = in - instart; *pinlen = in - instart; return(XML_ENC_ERR_INPUT); } in++; *out++ = c; } return(ret); } static int latin1ToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen, void *vctxt ATTRIBUTE_UNUSED) { 
unsigned char* outstart = out; const unsigned char* instart = in; unsigned char* outend; const unsigned char* inend; int ret = XML_ENC_ERR_SPACE; if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL)) return(XML_ENC_ERR_INTERNAL); outend = out + *outlen; inend = in + *inlen; while (in < inend) { unsigned c = *in; if (c < 0x80) { if (out >= outend) goto done; *out++ = c; } else { if (outend - out < 2) goto done; *out++ = (c >> 6) | 0xC0; *out++ = (c & 0x3F) | 0x80; } in++; } ret = out - outstart; done: *outlen = out - outstart; *inlen = in - instart; return(ret); } /** * isolat1ToUTF8: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of ISO Latin 1 chars * @inlen: the length of @in * * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8 * block of chars out. * * Returns the number of bytes written or an XML_ENC_ERR code. * * The value of @inlen after return is the number of octets consumed * if the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets produced. 
*/ int isolat1ToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { return(latin1ToUTF8(out, outlen, in, inlen, NULL)); } static int UTF8ToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen, void *vctxt ATTRIBUTE_UNUSED) { int len; int ret; if (in == NULL) { *inlen = 0; *outlen = 0; return(XML_ENC_ERR_SUCCESS); } if (*outlen < *inlen) { len = *outlen; ret = XML_ENC_ERR_SPACE; } else { len = *inlen; ret = len; } memcpy(out, in, len); *outlen = len; *inlen = len; return(ret); } #ifdef LIBXML_OUTPUT_ENABLED static int UTF8ToLatin1(unsigned char* out, int *outlen, const unsigned char* in, int *inlen, void *vctxt ATTRIBUTE_UNUSED) { const unsigned char* outend; const unsigned char* outstart = out; const unsigned char* instart = in; const unsigned char* inend; unsigned c; int ret = XML_ENC_ERR_SPACE; if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(XML_ENC_ERR_INTERNAL); if (in == NULL) { *inlen = 0; *outlen = 0; return(XML_ENC_ERR_SUCCESS); } inend = in + *inlen; outend = out + *outlen; while (in < inend) { if (out >= outend) goto done; c = *in; if (c < 0x80) { *out++ = c; } else if (c < 0xC4) { if (inend - in < 2) break; in++; *out++ = (unsigned char) ((c << 6) | (*in & 0x3F)); } else { ret = XML_ENC_ERR_INPUT; goto done; } in++; } ret = out - outstart; done: *outlen = out - outstart; *inlen = in - instart; return(ret); } /** * UTF8Toisolat1: * @out: a pointer to an array of bytes to store the result * @outlen: the length of @out * @in: a pointer to an array of UTF-8 chars * @inlen: the length of @in * * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1 * block of chars out. * * Returns the number of bytes written or an XML_ENC_ERR code. * * The value of @inlen after return is the number of octets consumed * if the return value is positive, else unpredictable. * The value of @outlen after return is the number of octets produced. 
*/ int UTF8Toisolat1(unsigned char* out, int *outlen, const unsigned char* in, int *inlen) { const unsigned char* outend; const unsigned char* outstart = out; const unsigned char* instart = in; const unsigned char* inend; unsigned c; int ret = XML_ENC_ERR_SPACE; if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(XML_ENC_ERR_INTERNAL); if (in == NULL) { *inlen = 0; *outlen = 0; return(XML_ENC_ERR_SUCCESS); } inend = in + *inlen; outend = out + *outlen; while (in < inend) { if (out >= outend) goto done; c = *in; if (c < 0x80) { *out++ = c; } else if (c < 0xC4) { if (inend - in < 2) break; in++; *out++ = (unsigned char) ((c << 6) | (*in & 0x3F)); } else { ret = XML_ENC_ERR_INPUT; goto done; } in++; } ret = out - outstart; done: *outlen = out - outstart; *inlen = in - instart; return(ret); } #endif /* LIBXML_OUTPUT_ENABLED */ static int UTF16LEToUTF8(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, void *vctxt ATTRIBUTE_UNUSED) { const unsigned char *instart = in; const unsigned char *inend = in + (*inlen & ~1); unsigned char *outstart = out; unsigned char *outend = out + *outlen; unsigned c, d; int ret = XML_ENC_ERR_SPACE; while (in < inend) { c = in[0] | (in[1] << 8); if (c < 0x80) { if (out >= outend) goto done; out[0] = c; in += 2; out += 1; } else if (c < 0x800) { if (outend - out < 2) goto done; out[0] = (c >> 6) | 0xC0; out[1] = (c & 0x3F) | 0x80; in += 2; out += 2; } else if ((c & 0xF800) != 0xD800) { if (outend - out < 3) goto done; out[0] = (c >> 12) | 0xE0; out[1] = ((c >> 6) & 0x3F) | 0x80; out[2] = (c & 0x3F) | 0x80; in += 2; out += 3; } else { /* Surrogate pair */ if ((c & 0xFC00) != 0xD800) { ret = XML_ENC_ERR_INPUT; goto done; } if (inend - in < 4) break; d = in[2] | (in[3] << 8); if ((d & 0xFC00) != 0xDC00) { ret = XML_ENC_ERR_INPUT; goto done; } if (outend - out < 4) goto done; c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000); out[0] = (c >> 18) | 0xF0; out[1] = ((c >> 12) & 0x3F) | 0x80; out[2] = ((c >> 6) & 
0x3F) | 0x80; out[3] = (c & 0x3F) | 0x80; in += 4; out += 4; } } ret = out - outstart; done: *outlen = out - outstart; *inlen = in - instart; return(ret); } #ifdef LIBXML_OUTPUT_ENABLED static int UTF8ToUTF16LE(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, void *vctxt ATTRIBUTE_UNUSED) { const unsigned char *instart = in; const unsigned char *inend; unsigned char *outstart = out; unsigned char *outend; unsigned c, d; int ret = XML_ENC_ERR_SPACE; /* UTF16LE encoding has no BOM */ if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(XML_ENC_ERR_INTERNAL); if (in == NULL) { *outlen = 0; *inlen = 0; return(0); } inend = in + *inlen; outend = out + (*outlen & ~1); while (in < inend) { if (out >= outend) goto done; c = in[0]; if (c < 0x80) { out[0] = c; out[1] = 0; in += 1; out += 2; } else if (c < 0xE0) { if (inend - in < 2) break; c = ((c & 0x1F) << 6) | (in[1] & 0x3F); out[0] = c & 0xFF; out[1] = c >> 8; in += 2; out += 2; } else if (c < 0xF0) { if (inend - in < 3) break; c = ((c & 0x0F) << 12) | ((in[1] & 0x3F) << 6) | (in[2] & 0x3F); out[0] = c & 0xFF; out[1] = c >> 8; in += 3; out += 2; } else { /* c >= 0xF0 */ if (inend - in < 4) break; if (outend - out < 4) goto done; c = ((c & 0x0F) << 18) | ((in[1] & 0x3F) << 12) | ((in[2] & 0x3F) << 6) | (in[3] & 0x3F); c -= 0x10000; d = (c & 0x03FF) | 0xDC00; c = (c >> 10) | 0xD800; out[0] = c & 0xFF; out[1] = c >> 8; out[2] = d & 0xFF; out[3] = d >> 8; in += 4; out += 4; } } ret = out - outstart; done: *outlen = out - outstart; *inlen = in - instart; return(ret); } static int UTF8ToUTF16(unsigned char* outb, int *outlen, const unsigned char* in, int *inlen, void *vctxt ATTRIBUTE_UNUSED) { if (in == NULL) { /* * initialization, add the Byte Order Mark for UTF-16LE */ if (*outlen >= 2) { outb[0] = 0xFF; outb[1] = 0xFE; *outlen = 2; *inlen = 0; return(2); } *outlen = 0; *inlen = 0; return(0); } return (UTF8ToUTF16LE(outb, outlen, in, inlen, NULL)); } #endif /* LIBXML_OUTPUT_ENABLED */ static 
int UTF16BEToUTF8(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, void *vctxt ATTRIBUTE_UNUSED) { const unsigned char *instart = in; const unsigned char *inend = in + (*inlen & ~1); unsigned char *outstart = out; unsigned char *outend = out + *outlen; unsigned c, d; int ret = XML_ENC_ERR_SPACE; while (in < inend) { c = (in[0] << 8) | in[1]; if (c < 0x80) { if (out >= outend) goto done; out[0] = c; in += 2; out += 1; } else if (c < 0x800) { if (outend - out < 2) goto done; out[0] = (c >> 6) | 0xC0; out[1] = (c & 0x3F) | 0x80; in += 2; out += 2; } else if ((c & 0xF800) != 0xD800) { if (outend - out < 3) goto done; out[0] = (c >> 12) | 0xE0; out[1] = ((c >> 6) & 0x3F) | 0x80; out[2] = (c & 0x3F) | 0x80; in += 2; out += 3; } else { /* Surrogate pair */ if ((c & 0xFC00) != 0xD800) { ret = XML_ENC_ERR_INPUT; goto done; } if (inend - in < 4) break; d = (in[2] << 8) | in[3]; if ((d & 0xFC00) != 0xDC00) { ret = XML_ENC_ERR_INPUT; goto done; } if (outend - out < 4) goto done; c = (c << 10) + d - ((0xD800 << 10) + 0xDC00 - 0x10000); out[0] = (c >> 18) | 0xF0; out[1] = ((c >> 12) & 0x3F) | 0x80; out[2] = ((c >> 6) & 0x3F) | 0x80; out[3] = (c & 0x3F) | 0x80; in += 4; out += 4; } } ret = out - outstart; done: *outlen = out - outstart; *inlen = in - instart; return(ret); } #ifdef LIBXML_OUTPUT_ENABLED static int UTF8ToUTF16BE(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, void *vctxt ATTRIBUTE_UNUSED) { const unsigned char *instart = in; const unsigned char *inend; unsigned char *outstart = out; unsigned char *outend; unsigned c, d; int ret = XML_ENC_ERR_SPACE; /* UTF-16BE has no BOM */ if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1); if (in == NULL) { *outlen = 0; *inlen = 0; return(0); } inend = in + *inlen; outend = out + (*outlen & ~1); while (in < inend) { if (out >= outend) goto done; c = in[0]; if (c < 0x80) { out[0] = 0; out[1] = c; in += 1; out += 2; } else if (c < 0xE0) { if (inend - in < 2) break; c = 
((c & 0x1F) << 6) | (in[1] & 0x3F); out[0] = c >> 8; out[1] = c & 0xFF; in += 2; out += 2; } else if (c < 0xF0) { if (inend - in < 3) break; c = ((c & 0x0F) << 12) | ((in[1] & 0x3F) << 6) | (in[2] & 0x3F); out[0] = c >> 8; out[1] = c & 0xFF; in += 3; out += 2; } else { /* c >= 0xF0 */ if (inend - in < 4) break; if (outend - out < 4) goto done; c = ((c & 0x0F) << 18) | ((in[1] & 0x3F) << 12) | ((in[2] & 0x3F) << 6) | (in[3] & 0x3F); c -= 0x10000; d = (c & 0x03FF) | 0xDC00; c = (c >> 10) | 0xD800; out[0] = c >> 8; out[1] = c & 0xFF; out[2] = d >> 8; out[3] = d & 0xFF; in += 4; out += 4; } } ret = out - outstart; done: *outlen = out - outstart; *inlen = in - instart; return(ret); } #endif /* LIBXML_OUTPUT_ENABLED */ #if defined(LIBXML_OUTPUT_ENABLED) && defined(LIBXML_HTML_ENABLED) static int UTF8ToHtmlWrapper(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, void *vctxt ATTRIBUTE_UNUSED) { return(UTF8ToHtml(out, outlen, in, inlen)); } #endif #if !defined(LIBXML_ICONV_ENABLED) && !defined(LIBXML_ICU_ENABLED) && \ defined(LIBXML_ISO8859X_ENABLED) static int UTF8ToISO8859x(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, void *vctxt) { const unsigned char *xlattable = vctxt; const unsigned char *instart = in; const unsigned char *inend; unsigned char *outstart = out; unsigned char *outend; int ret = XML_ENC_ERR_SPACE; if (in == NULL) { /* * initialization nothing to do */ *outlen = 0; *inlen = 0; return(XML_ENC_ERR_SUCCESS); } inend = in + *inlen; outend = out + *outlen; while (in < inend) { unsigned d = *in; if (d < 0x80) { if (out >= outend) goto done; in += 1; } else if (d < 0xE0) { unsigned c; if (inend - in < 2) break; c = in[1] & 0x3F; d = d & 0x1F; d = xlattable [48 + c + xlattable [d] * 64]; if (d == 0) { /* not in character set */ ret = XML_ENC_ERR_INPUT; goto done; } if (out >= outend) goto done; in += 2; } else if (d < 0xF0) { unsigned c1; unsigned c2; if (inend - in < 3) break; c1 = in[1] & 0x3F; c2 = in[2] & 0x3F; d 
= d & 0x0F; d = xlattable [48 + c2 + xlattable [48 + c1 + xlattable [32 + d] * 64] * 64]; if (d == 0) { /* not in character set */ ret = XML_ENC_ERR_INPUT; goto done; } if (out >= outend) goto done; in += 3; } else { /* cannot transcode >= U+010000 */ ret = XML_ENC_ERR_INPUT; goto done; } *out++ = d; } ret = out - outstart; done: *outlen = out - outstart; *inlen = in - instart; return(ret); } static int ISO8859xToUTF8(unsigned char* out, int *outlen, const unsigned char* in, int *inlen, void *vctxt) { unsigned short const *unicodetable = vctxt; const unsigned char* instart = in; const unsigned char* inend; unsigned char* outstart = out; unsigned char* outend; int ret = XML_ENC_ERR_SPACE; outend = out + *outlen; inend = in + *inlen; while (in < inend) { unsigned c = *in; if (c < 0x80) { if (out >= outend) goto done; *out++ = c; } else { c = unicodetable[c - 0x80]; if (c == 0) { /* undefined code point */ ret = XML_ENC_ERR_INPUT; goto done; } if (c < 0x800) { if (outend - out < 2) goto done; *out++ = ((c >> 6) & 0x1F) | 0xC0; *out++ = (c & 0x3F) | 0x80; } else { if (outend - out < 3) goto done; *out++ = ((c >> 12) & 0x0F) | 0xE0; *out++ = ((c >> 6) & 0x3F) | 0x80; *out++ = (c & 0x3F) | 0x80; } } in += 1; } ret = out - outstart; done: *outlen = out - outstart; *inlen = in - instart; return(ret); } #endif