diff --git a/encoding.c b/encoding.c index 101eba9b..93bd3f6c 100644 --- a/encoding.c +++ b/encoding.c @@ -2037,7 +2037,7 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, * as the return value is 0, else unpredictable. * The value of @outlen after return is the number of octets produced. */ -static int +int xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, int *outlen, const unsigned char *in, int *inlen, int flush) { int ret; @@ -2123,189 +2123,12 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out, * @out: an xmlBuffer for the output. * @in: an xmlBuffer for the input * - * Front-end for the encoding handler input function, but handle only - * the very first line, i.e. limit itself to 45 chars. - * - * Returns the number of byte written if success, or - * -1 general error - * -2 if the transcoding fails (for *in is not valid utf8 string or - * the result of transformation can't fit into the encoding we want), or + * DEPRECATED: Don't use. */ int xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, xmlBufferPtr in) { - int ret; - int written; - int toconv; - - if (handler == NULL) return(-1); - if (out == NULL) return(-1); - if (in == NULL) return(-1); - - /* calculate space available */ - written = out->size - out->use - 1; /* count '\0' */ - toconv = in->use; - /* - * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 - * 45 chars should be sufficient to reach the end of the encoding - * declaration without going too far inside the document content. 
- * on UTF-16 this means 90bytes, on UCS4 this means 180 - * The actual value depending on guessed encoding is passed as @len - * if provided - */ - if (toconv > 180) - toconv = 180; - if (toconv * 2 >= written) { - xmlBufferGrow(out, toconv * 2); - written = out->size - out->use - 1; - } - - ret = xmlEncInputChunk(handler, &out->content[out->use], &written, - in->content, &toconv, 0); - xmlBufferShrink(in, toconv); - out->use += written; - out->content[out->use] = 0; - if (ret == -1) ret = -3; - -#ifdef DEBUG_ENCODING - switch (ret) { - case 0: - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input\n", - toconv, written); - break; - case -1: - xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n", - toconv, written, in->use); - break; - case -2: - xmlGenericError(xmlGenericErrorContext, - "input conversion failed due to input error\n"); - break; - case -3: - xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n", - toconv, written, in->use); - break; - default: - xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret); - } -#endif /* DEBUG_ENCODING */ - /* - * Ignore when input buffer is not on a boundary - */ - if (ret == -3) ret = 0; - if (ret == -1) ret = 0; - return(written ? written : ret); -} - -/** - * xmlCharEncFirstLineInput: - * @input: a parser input buffer - * @len: number of bytes to convert for the first line, or -1 - * - * Front-end for the encoding handler input function, but handle only - * the very first line. Point is that this is based on autodetection - * of the encoding and once that first line is converted we may find - * out that a different decoder is needed to process the input. 
- * - * Returns the number of byte written if success, or - * -1 general error - * -2 if the transcoding fails (for *in is not valid utf8 string or - * the result of transformation can't fit into the encoding we want), or - */ -int -xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len) -{ - int ret; - size_t written; - size_t toconv; - int c_in; - int c_out; - xmlBufPtr in; - xmlBufPtr out; - - if ((input == NULL) || (input->encoder == NULL) || - (input->buffer == NULL) || (input->raw == NULL)) - return (-1); - out = input->buffer; - in = input->raw; - - toconv = xmlBufUse(in); - if (toconv == 0) - return (0); - written = xmlBufAvail(out); - /* - * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 - * 45 chars should be sufficient to reach the end of the encoding - * declaration without going too far inside the document content. - * on UTF-16 this means 90bytes, on UCS4 this means 180 - * The actual value depending on guessed encoding is passed as @len - * if provided - */ - if (len >= 0) { - if (toconv > (unsigned int) len) - toconv = len; - } else { - if (toconv > 180) - toconv = 180; - } - if (toconv * 2 >= written) { - xmlBufGrow(out, toconv * 2); - written = xmlBufAvail(out); - } - if (written > 360) - written = 360; - - c_in = toconv; - c_out = written; - ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, - xmlBufContent(in), &c_in, 0); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - if (ret == -1) - ret = -3; - - switch (ret) { - case 0: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input\n", - c_in, c_out); -#endif - break; - case -1: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input, %d left\n", - c_in, c_out, (int)xmlBufUse(in)); -#endif - break; - case -3: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input, %d left\n", - c_in, c_out, (int)xmlBufUse(in)); -#endif - break; - case -2: { 
- char buf[50]; - const xmlChar *content = xmlBufContent(in); - - snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", - content[0], content[1], - content[2], content[3]); - buf[49] = 0; - xmlEncodingErr(XML_I18N_CONV_FAILED, - "input conversion failed due to input error, bytes %s\n", - buf); - } - } - /* - * Ignore when input buffer is not on a boundary - */ - if (ret == -3) ret = 0; - if (ret == -1) ret = 0; - return(c_out ? c_out : ret); + return(xmlCharEncInFunc(handler, out, in)); } /** diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index 231b0be1..67add3b0 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -203,6 +203,7 @@ XMLPUBFUN int xmlCharEncInFunc (xmlCharEncodingHandler *handler, xmlBufferPtr out, xmlBufferPtr in); +XML_DEPRECATED XMLPUBFUN int xmlCharEncFirstLine (xmlCharEncodingHandler *handler, xmlBufferPtr out, diff --git a/include/private/enc.h b/include/private/enc.h index ddfc8aea..cbdc2b33 100644 --- a/include/private/enc.h +++ b/include/private/enc.h @@ -8,7 +8,8 @@ XML_HIDDEN void xmlInitEncodingInternal(void); XML_HIDDEN int -xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len); +xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, + int *outlen, const unsigned char *in, int *inlen, int flush); XML_HIDDEN int xmlCharEncInput(xmlParserInputBufferPtr input, int flush); XML_HIDDEN int diff --git a/parser.c b/parser.c index 27b27548..3a6069c8 100644 --- a/parser.c +++ b/parser.c @@ -10367,6 +10367,7 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) { else if ((encoding != NULL) && ((!xmlStrcasecmp(encoding, BAD_CAST "UTF-8")) || (!xmlStrcasecmp(encoding, BAD_CAST "UTF8")))) { + /* TODO: Check for encoding mismatch. */ if (ctxt->encoding != NULL) xmlFree((xmlChar *) ctxt->encoding); ctxt->encoding = encoding; @@ -10692,15 +10693,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { return(-1); } - /* - * Check for the XMLDecl in the Prolog. 
- * do not GROW here to avoid the detected encoder to decode more - * than just the first line, unless the amount of data is really - * too small to hold "<?xml version="1.0" encoding="foo" - */ - if ((ctxt->input->end - ctxt->input->cur) < 35) { - GROW; - } + GROW; if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) && (IS_BLANK_CH(NXT(5)))) { /* @@ -11347,12 +11340,9 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { /* * If we are operating on converted input, try to flush * remaining chars to avoid them stalling in the non-converted - * buffer. But do not do this in document start where - * encoding="..." may not have been read and we work on a - * guessed encoding. + * buffer. */ - if ((ctxt->instate != XML_PARSER_START) && - (ctxt->input->buf->raw != NULL) && + if ((ctxt->input->buf->raw != NULL) && (xmlBufIsEmpty(ctxt->input->buf->raw) == 0)) { size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); @@ -11395,6 +11385,13 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { start[2] = NXT(2); start[3] = NXT(3); enc = xmlDetectCharEncoding(start, 4); + /* + * We need more bytes to detect EBCDIC code pages. + * See xmlDetectEBCDIC. 
+ */ + if ((enc == XML_CHAR_ENCODING_EBCDIC) && + (!terminate) && (avail < 200)) + goto done; xmlSwitchEncoding(ctxt, enc); break; } @@ -12186,15 +12183,8 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data, xmlParserCtxtPtr ctxt; xmlParserInputPtr inputStream; xmlParserInputBufferPtr buf; - xmlCharEncoding enc = XML_CHAR_ENCODING_NONE; - /* - * plug some encoding conversion routines - */ - if ((chunk != NULL) && (size >= 4)) - enc = xmlDetectCharEncoding((const xmlChar *) chunk, size); - - buf = xmlAllocParserInputBuffer(enc); + buf = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE); if (buf == NULL) return(NULL); ctxt = xmlNewSAXParserCtxt(sax, user_data); @@ -12253,10 +12243,6 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data, #endif } - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - return(ctxt); } #endif /* LIBXML_PUSH_ENABLED */ diff --git a/parserInternals.c b/parserInternals.c index ce4f75e0..a06bb76a 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -990,9 +990,62 @@ xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) { * * ************************************************************************/ -static int -xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, - xmlCharEncodingHandlerPtr handler, int len); +static xmlCharEncodingHandlerPtr +xmlDetectEBCDIC(xmlParserInputPtr input) { + xmlChar out[200]; + xmlCharEncodingHandlerPtr handler; + int inlen, outlen, res, i; + + /* + * To detect the EBCDIC code page, we decode the first 200 bytes + * as EBCDIC-US and try to find the encoding declaration. 
+ */ + handler = xmlGetCharEncodingHandler(XML_CHAR_ENCODING_EBCDIC); + if (handler == NULL) + return(NULL); + outlen = sizeof(out); + inlen = input->end - input->cur; + res = xmlEncInputChunk(handler, out, &outlen, input->cur, &inlen, 0); + if (res < 0) + return(handler); + + for (i = 0; i < outlen; i++) { + if (out[i] == '>') + break; + if ((out[i] == 'e') && + (xmlStrncmp(out + i, BAD_CAST "encoding", 8) == 0)) { + int start, cur, quote; + + i += 8; + while (IS_BLANK_CH(out[i])) + i += 1; + if (out[i++] != '=') + break; + while (IS_BLANK_CH(out[i])) + i += 1; + quote = out[i++]; + if ((quote != '\'') && (quote != '"')) + break; + start = i; + cur = out[i]; + while (((cur >= 'a') && (cur <= 'z')) || + ((cur >= 'A') && (cur <= 'Z')) || + ((cur >= '0') && (cur <= '9')) || + (cur == '.') || (cur == '_') || + (cur == '-')) + cur = out[++i]; + if (cur != quote) + break; + out[i] = 0; + xmlCharEncCloseFunc(handler); + handler = xmlFindCharEncodingHandler((char *) out + start); + break; + } + } + + return(handler); +} + /** * xmlSwitchEncoding: * @ctxt: the parser context @@ -1007,7 +1060,6 @@ int xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) { xmlCharEncodingHandlerPtr handler; - int len = -1; int ret; if (ctxt == NULL) return(-1); @@ -1036,51 +1088,13 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) ctxt->input->cur += 3; } return(0); - case XML_CHAR_ENCODING_UTF16LE: - case XML_CHAR_ENCODING_UTF16BE: - /*The raw input characters are encoded - *in UTF-16. As we expect this function - *to be called after xmlCharEncInFunc, we expect - *ctxt->input->cur to contain UTF-8 encoded characters. - *So the raw UTF16 Byte Order Mark - *has also been converted into - *an UTF-8 BOM. Let's skip that BOM. 
- */ - if ((ctxt->input != NULL) && (ctxt->input->cur != NULL) && - (ctxt->input->cur[0] == 0xEF) && - (ctxt->input->cur[1] == 0xBB) && - (ctxt->input->cur[2] == 0xBF)) { - ctxt->input->cur += 3; - } - len = 90; - break; - case XML_CHAR_ENCODING_UCS2: - len = 90; - break; - case XML_CHAR_ENCODING_UCS4BE: - case XML_CHAR_ENCODING_UCS4LE: - case XML_CHAR_ENCODING_UCS4_2143: - case XML_CHAR_ENCODING_UCS4_3412: - len = 180; - break; - case XML_CHAR_ENCODING_EBCDIC: - case XML_CHAR_ENCODING_8859_1: - case XML_CHAR_ENCODING_8859_2: - case XML_CHAR_ENCODING_8859_3: - case XML_CHAR_ENCODING_8859_4: - case XML_CHAR_ENCODING_8859_5: - case XML_CHAR_ENCODING_8859_6: - case XML_CHAR_ENCODING_8859_7: - case XML_CHAR_ENCODING_8859_8: - case XML_CHAR_ENCODING_8859_9: - case XML_CHAR_ENCODING_ASCII: - case XML_CHAR_ENCODING_2022_JP: - case XML_CHAR_ENCODING_SHIFT_JIS: - case XML_CHAR_ENCODING_EUC_JP: - len = 45; - break; + case XML_CHAR_ENCODING_EBCDIC: + handler = xmlDetectEBCDIC(ctxt->input); + break; + default: + handler = xmlGetCharEncodingHandler(enc); + break; } - handler = xmlGetCharEncodingHandler(enc); if (handler == NULL) { /* * Default handlers. @@ -1112,7 +1126,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) return(-1); } } - ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len); + ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler); if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) { /* * on encoding conversion errors, stop the parser @@ -1124,20 +1138,19 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) } /** - * xmlSwitchInputEncodingInt: + * xmlSwitchInputEncoding: * @ctxt: the parser context * @input: the input stream * @handler: the encoding handler - * @len: the number of bytes to convert for the first line or -1 * * change the input functions when discovering the character encoding * of a given entity. 
* * Returns 0 in case of success, -1 otherwise */ -static int -xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, - xmlCharEncodingHandlerPtr handler, int len) +int +xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, + xmlCharEncodingHandlerPtr handler) { int nbchars; xmlParserInputBufferPtr in; @@ -1159,30 +1172,17 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, return (-1); } - ctxt->charset = XML_CHAR_ENCODING_UTF8; - if (in->encoder != NULL) { /* - * Check in case the auto encoding detection triggered - * in already. + * TODO: Detect encoding mismatch. We should start by comparing + * in->encoder->name and handler->name, but there are a few + * compatible encodings like UTF-16 and UCS-2 or UTF-32 and UCS-4. */ - if (in->encoder == handler) - return (0); - - /* - * Note: this is a bit dangerous, but that's what it - * takes to use nearly compatible signature for different - * encodings. - * - * FIXME: Encoders might buffer partial byte sequences, so - * this probably can't work. We should return an error and - * make sure that callers never try to switch the encoding - * twice. - */ - xmlCharEncCloseFunc(in->encoder); - in->encoder = handler; + xmlCharEncCloseFunc(handler); return (0); } + + ctxt->charset = XML_CHAR_ENCODING_UTF8; in->encoder = handler; /* @@ -1230,20 +1230,7 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, in->rawconsumed = processed; use = xmlBufUse(in->raw); - if (ctxt->html) { - /* - * convert as much as possible of the buffer - */ - nbchars = xmlCharEncInput(in, 1); - } else { - /* - * convert just enough to get - * '<?xml version="1.0" encoding="xxx"?>' - * parsed with the autodetected encoding - * into the parser reading buffer. 
- */ - nbchars = xmlCharEncFirstLineInput(in, len); - } + nbchars = xmlCharEncInput(in, 0); xmlBufResetInput(in->buffer, input); if (nbchars < 0) { xmlErrInternal(ctxt, @@ -1261,25 +1248,6 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, return (0); } -/** - * xmlSwitchInputEncoding: - * @ctxt: the parser context - * @input: the input stream - * @handler: the encoding handler - * - * DEPRECATED: Use xmlSwitchToEncoding - * - * change the input functions when discovering the character encoding - * of a given entity. - * - * Returns 0 in case of success, -1 otherwise - */ -int -xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, - xmlCharEncodingHandlerPtr handler) { - return(xmlSwitchInputEncodingInt(ctxt, input, handler, -1)); -} - /** * xmlSwitchToEncoding: * @ctxt: the parser context @@ -1295,7 +1263,7 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) { if (ctxt == NULL) return(-1); - return(xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, -1)); + return(xmlSwitchInputEncoding(ctxt, ctxt->input, handler)); } /************************************************************************