From ec7be50662ec17104355e7357f5067d43c47b207 Mon Sep 17 00:00:00 2001
From: Nick Wellnhofer
Date: Tue, 8 Aug 2023 15:19:46 +0200
Subject: [PATCH] parser: Rework encoding detection

Introduce the XML_INPUT_HAS_ENCODING flag for xmlParserInput, which is
set when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.

Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.

Introduce private helper functions for switching encodings, used by
both the XML and HTML parsers:

- xmlDetectEncoding, which skips over the BOM, making it possible to
  remove the BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
  about encoding mismatches.

If users override the encoding, store the declared encoding instead of
the actual encoding in xmlDoc. In this case, the actual encoding is
known and the raw value from the doc is more useful.

Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)

The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused. The 'standalone' member of
xmlParserInput is renamed to 'flags'.

A new parser state XML_PARSER_XML_DECL is added for the push parser.
--- HTMLparser.c | 151 +--------------- SAX2.c | 23 --- include/libxml/parser.h | 10 +- include/libxml/tree.h | 5 +- include/libxml/xmlerror.h | 1 + include/private/parser.h | 16 ++ parser.c | 337 +++++++--------------------------- parserInternals.c | 371 ++++++++++++++++++++++++-------------- testchar.c | 8 +- xmlIO.c | 2 - 10 files changed, 341 insertions(+), 583 deletions(-) diff --git a/HTMLparser.c b/HTMLparser.c index 67ee6654..d0fa178b 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -350,8 +350,7 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) { const xmlChar *start, *cur, *end; if ((ctxt == NULL) || (ctxt->input == NULL) || - (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || - (ctxt->input->buf->encoder != NULL)) + (ctxt->input->flags & XML_INPUT_HAS_ENCODING)) return(NULL); if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) return(NULL); @@ -417,7 +416,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { return(0); } - if (ctxt->charset != XML_CHAR_ENCODING_UTF8) { + if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) { xmlChar * guess; xmlCharEncodingHandlerPtr handler; @@ -444,10 +443,8 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { if (guess == NULL) { xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); } else { - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = guess; handler = xmlFindCharEncodingHandler((const char *) guess); + xmlFree(guess); if (handler != NULL) { /* * Don't use UTF-8 encoder which isn't required and @@ -460,7 +457,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { "Unsupported encoding %s", guess, NULL); } } - ctxt->charset = XML_CHAR_ENCODING_UTF8; + ctxt->input->flags |= XML_INPUT_HAS_ENCODING; } /* @@ -537,13 +534,6 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { } encoding_error: - /* - * If we detect an UTF8 error that probably mean that the - * input encoding didn't get properly advertised in the - * declaration header. 
Report the error and switch the encoding - * to ISO-Latin-1 (if you don't like this policy, just declare the - * encoding !) - */ { char buffer[150]; @@ -559,15 +549,7 @@ encoding_error: BAD_CAST buffer, NULL); } - /* - * Don't switch encodings twice. Note that if there's an encoder, we - * shouldn't receive invalid UTF-8 anyway. - * - * Note that if ctxt->input->buf == NULL, switching encodings is - * impossible, see Gitlab issue #34. - */ - if ((ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder == NULL)) + if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); *len = 1; return(*ctxt->input->cur); @@ -3781,94 +3763,6 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { return(name); } -/** - * htmlCheckEncodingDirect: - * @ctxt: an HTML parser context - * @attvalue: the attribute value - * - * Checks an attribute value to detect - * the encoding - * If a new encoding is detected the parser is switched to decode - * it and pass UTF8 - */ -static void -htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) { - - if ((ctxt == NULL) || (encoding == NULL) || - (ctxt->options & HTML_PARSE_IGNORE_ENC)) - return; - - /* do not change encoding */ - if (ctxt->input->encoding != NULL) - return; - - if (encoding != NULL) { - xmlCharEncoding enc; - xmlCharEncodingHandlerPtr handler; - - while ((*encoding == ' ') || (*encoding == '\t')) encoding++; - - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = xmlStrdup(encoding); - - enc = xmlParseCharEncoding((const char *) encoding); - /* - * registered set of known encodings - */ - if (enc != XML_CHAR_ENCODING_ERROR) { - if (((enc == XML_CHAR_ENCODING_UTF16LE) || - (enc == XML_CHAR_ENCODING_UTF16BE) || - (enc == XML_CHAR_ENCODING_UCS4LE) || - (enc == XML_CHAR_ENCODING_UCS4BE)) && - (ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder == NULL)) { - htmlParseErr(ctxt, 
XML_ERR_INVALID_ENCODING, - "htmlCheckEncoding: wrong encoding meta\n", - NULL, NULL); - } else { - xmlSwitchEncoding(ctxt, enc); - } - ctxt->charset = XML_CHAR_ENCODING_UTF8; - } else { - /* - * fallback for unknown encodings - */ - handler = xmlFindCharEncodingHandler((const char *) encoding); - if (handler != NULL) { - xmlSwitchToEncoding(ctxt, handler); - ctxt->charset = XML_CHAR_ENCODING_UTF8; - } else { - htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, - "htmlCheckEncoding: unknown encoding %s\n", - encoding, NULL); - } - } - - if ((ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder != NULL) && - (ctxt->input->buf->raw != NULL) && - (ctxt->input->buf->buffer != NULL)) { - int nbchars; - size_t processed; - - /* - * convert as much as possible to the parser reading buffer. - */ - processed = ctxt->input->cur - ctxt->input->base; - xmlBufShrink(ctxt->input->buf->buffer, processed); - nbchars = xmlCharEncInput(ctxt->input->buf, 1); - xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input); - if (nbchars < 0) { - htmlParseErr(ctxt, ctxt->input->buf->error, - "htmlCheckEncoding: encoder error\n", - NULL, NULL); - xmlHaltParser(ctxt); - } - } - } -} - /** * htmlCheckEncoding: * @ctxt: an HTML parser context @@ -3897,7 +3791,7 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { encoding = xmlStrcasestr(attvalue, BAD_CAST"="); if (encoding && *encoding == '=') { encoding ++; - htmlCheckEncodingDirect(ctxt, encoding); + xmlSetDeclaredEncoding(ctxt, xmlStrdup(encoding)); } } @@ -3926,7 +3820,7 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) http = 1; else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset"))) - htmlCheckEncodingDirect(ctxt, value); + xmlSetDeclaredEncoding(ctxt, xmlStrdup(value)); else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) content = value; att = atts[i++]; @@ -4953,8 +4847,6 @@ __htmlParseContent(void *ctxt) { int 
htmlParseDocument(htmlParserCtxtPtr ctxt) { - xmlChar start[4]; - xmlCharEncoding enc; xmlDtdPtr dtd; xmlInitParser(); @@ -4964,29 +4856,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { "htmlParseDocument: context error\n", NULL, NULL); return(XML_ERR_INTERNAL_ERROR); } - GROW; + /* * SAX: beginning of the document processing. */ if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); - if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && - ((ctxt->input->end - ctxt->input->cur) >= 4)) { - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. - */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(&start[0], 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } + xmlDetectEncoding(ctxt); /* * Wipe out everything which is before the first '<' @@ -5317,10 +5194,6 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { xmlCharEncoding enc; xmlCharEncodingHandlerPtr handler; - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); - enc = xmlParseCharEncoding(encoding); /* * registered set of known encodings @@ -6265,8 +6138,6 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, xmlFreeParserInputBuffer(buf); return(NULL); } - if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) - ctxt->charset=XML_CHAR_ENCODING_UTF8; if (filename == NULL) { ctxt->directory = NULL; } else { @@ -6722,7 +6593,6 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; ctxt->depth = 0; - ctxt->charset = XML_CHAR_ENCODING_NONE; ctxt->catalogs = NULL; xmlInitNodeInfoSeq(&ctxt->node_seq); @@ -6839,9 +6709,6 @@ htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, hdlr = 
xmlFindCharEncodingHandler(encoding); if (hdlr != NULL) { xmlSwitchToEncoding(ctxt, hdlr); - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = xmlStrdup((xmlChar *)encoding); } } if ((URL != NULL) && (ctxt->input != NULL) && diff --git a/SAX2.c b/SAX2.c index 968da080..07c5c017 100644 --- a/SAX2.c +++ b/SAX2.c @@ -384,8 +384,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, int oldinputMax; xmlParserInputPtr *oldinputTab; xmlParserInputPtr input = NULL; - xmlCharEncoding enc; - int oldcharset; const xmlChar *oldencoding; int oldprogressive; unsigned long consumed; @@ -410,7 +408,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, oldinputNr = ctxt->inputNr; oldinputMax = ctxt->inputMax; oldinputTab = ctxt->inputTab; - oldcharset = ctxt->charset; oldencoding = ctxt->encoding; oldprogressive = ctxt->progressive; ctxt->encoding = NULL; @@ -425,7 +422,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, ctxt->inputNr = oldinputNr; ctxt->inputMax = oldinputMax; ctxt->inputTab = oldinputTab; - ctxt->charset = oldcharset; ctxt->encoding = oldencoding; ctxt->progressive = oldprogressive; return; @@ -435,14 +431,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, ctxt->input = NULL; xmlPushInput(ctxt, input); - /* - * On the fly encoding conversion if needed - */ - if (ctxt->input->length >= 4) { - enc = xmlDetectCharEncoding(ctxt->input->cur, 4); - xmlSwitchEncoding(ctxt, enc); - } - if (input->filename == NULL) input->filename = (char *) xmlCanonicPath(SystemID); input->line = 1; @@ -484,7 +472,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, ctxt->inputNr = oldinputNr; ctxt->inputMax = oldinputMax; ctxt->inputTab = oldinputTab; - ctxt->charset = oldcharset; if ((ctxt->encoding != NULL) && ((ctxt->dict == NULL) || (!xmlDictOwns(ctxt->dict, ctxt->encoding)))) @@ -1041,16 +1028,6 @@ xmlSAX2EndDocument(void *ctx) ctxt->myDoc->encoding = ctxt->encoding; ctxt->encoding = NULL; } - 
if ((ctxt->inputTab != NULL) && - (ctxt->inputNr > 0) && (ctxt->inputTab[0] != NULL) && - (ctxt->inputTab[0]->encoding != NULL) && (ctxt->myDoc != NULL) && - (ctxt->myDoc->encoding == NULL)) { - ctxt->myDoc->encoding = xmlStrdup(ctxt->inputTab[0]->encoding); - } - if ((ctxt->charset != XML_CHAR_ENCODING_NONE) && (ctxt->myDoc != NULL) && - (ctxt->myDoc->charset == XML_CHAR_ENCODING_NONE)) { - ctxt->myDoc->charset = ctxt->charset; - } } #if defined(LIBXML_SAX1_ENABLED) || defined(LIBXML_HTML_ENABLED) || defined(LIBXML_WRITER_ENABLED) || defined(LIBXML_LEGACY_ENABLED) diff --git a/include/libxml/parser.h b/include/libxml/parser.h index 950ebe32..e1955a08 100644 --- a/include/libxml/parser.h +++ b/include/libxml/parser.h @@ -63,9 +63,9 @@ struct _xmlParserInput { int col; /* Current column */ unsigned long consumed; /* How many xmlChars already consumed */ xmlParserInputDeallocate free; /* function to deallocate the base */ - const xmlChar *encoding; /* the encoding string for entity */ + const xmlChar *encoding; /* unused */ const xmlChar *version; /* the version string for entity */ - int standalone; /* Was that entity marked standalone */ + int flags; /* Flags */ int id; /* an unique identifier for the entity */ unsigned long parentConsumed; /* consumed bytes from parents */ xmlEntityPtr entity; /* entity, if any */ @@ -122,7 +122,8 @@ typedef enum { XML_PARSER_SYSTEM_LITERAL, /* within a SYSTEM value */ XML_PARSER_EPILOG, /* the Misc* after the last end tag */ XML_PARSER_IGNORE, /* within an IGNORED section */ - XML_PARSER_PUBLIC_LITERAL /* within a PUBLIC value */ + XML_PARSER_PUBLIC_LITERAL, /* within a PUBLIC value */ + XML_PARSER_XML_DECL /* before XML decl (but after BOM) */ } xmlParserInputState; /** @@ -245,8 +246,7 @@ struct _xmlParserCtxt { int depth; /* to prevent entity substitution loops */ xmlParserInputPtr entity; /* used to check entities boundaries */ - int charset; /* encoding of the in-memory content - actually an xmlCharEncoding */ + int charset; 
/* unused */ int nodelen; /* Those two fields are there to */ int nodemem; /* Speed up large node parsing */ int pedantic; /* signal pedantic warnings */ diff --git a/include/libxml/tree.h b/include/libxml/tree.h index a1cabf69..4e5bf434 100644 --- a/include/libxml/tree.h +++ b/include/libxml/tree.h @@ -573,12 +573,11 @@ struct _xmlDoc { struct _xmlDtd *extSubset; /* the document external subset */ struct _xmlNs *oldNs; /* Global namespace, the old way */ const xmlChar *version; /* the XML version string */ - const xmlChar *encoding; /* external initial encoding, if any */ + const xmlChar *encoding; /* encoding from XML declaration, if any */ void *ids; /* Hash table for ID attributes if any */ void *refs; /* Hash table for IDREFs attributes if any */ const xmlChar *URL; /* The URI for that document */ - int charset; /* Internal flag for charset handling, - actually an xmlCharEncoding */ + int charset; /* unused */ struct _xmlDict *dict; /* dict used to allocate names or NULL */ void *psvi; /* for type/PSVI information */ int parseFlags; /* set of xmlParserOption used to parse the diff --git a/include/libxml/xmlerror.h b/include/libxml/xmlerror.h index 830b4a68..63ddaa95 100644 --- a/include/libxml/xmlerror.h +++ b/include/libxml/xmlerror.h @@ -210,6 +210,7 @@ typedef enum { XML_ERR_NAME_TOO_LONG, /* 110 */ XML_ERR_USER_STOP, /* 111 */ XML_ERR_COMMENT_ABRUPTLY_ENDED, /* 112 */ + XML_WAR_ENCODING_MISMATCH, /* 113 */ XML_NS_ERR_XML_NAMESPACE = 200, XML_NS_ERR_UNDEFINED_NAMESPACE, /* 201 */ XML_NS_ERR_QNAME, /* 202 */ diff --git a/include/private/parser.h b/include/private/parser.h index bf933f7d..bc4bc0d1 100644 --- a/include/private/parser.h +++ b/include/private/parser.h @@ -17,10 +17,21 @@ */ #define XML_VCTXT_USE_PCTXT (1u << 1) +#define XML_INPUT_HAS_ENCODING (1u << 0) +#define XML_INPUT_AUTO_ENCODING (7u << 1) +#define XML_INPUT_AUTO_UTF8 (1u << 1) +#define XML_INPUT_AUTO_UTF16LE (2u << 1) +#define XML_INPUT_AUTO_UTF16BE (3u << 1) +#define XML_INPUT_AUTO_OTHER 
(4u << 1) +#define XML_INPUT_8_BIT (1u << 4) + XML_HIDDEN void xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra); XML_HIDDEN void xmlFatalErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *info); +XML_HIDDEN void LIBXML_ATTR_FORMAT(3,0) +xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error, + const char *msg, const xmlChar *str1, const xmlChar *str2); XML_HIDDEN void __xmlErrEncoding(xmlParserCtxtPtr ctxt, xmlParserErrors xmlerr, const char *msg, const xmlChar *str1, @@ -32,4 +43,9 @@ xmlParserGrow(xmlParserCtxtPtr ctxt); XML_HIDDEN void xmlParserShrink(xmlParserCtxtPtr ctxt); +XML_HIDDEN void +xmlDetectEncoding(xmlParserCtxtPtr ctxt); +XML_HIDDEN void +xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding); + #endif /* XML_PARSER_H_PRIVATE__ */ diff --git a/parser.c b/parser.c index 942029a6..bb4f0e2c 100644 --- a/parser.c +++ b/parser.c @@ -281,7 +281,7 @@ xmlFatalErrMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error, * * Handle a warning. */ -static void LIBXML_ATTR_FORMAT(3,0) +void LIBXML_ATTR_FORMAT(3,0) xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *msg, const xmlChar *str1, const xmlChar *str2) { @@ -2313,6 +2313,7 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) { return; case XML_PARSER_PROLOG: case XML_PARSER_START: + case XML_PARSER_XML_DECL: case XML_PARSER_MISC: xmlFatalErr(ctxt, XML_ERR_PEREF_IN_PROLOG, NULL); return; @@ -6682,7 +6683,6 @@ xmlParseMarkupDecl(xmlParserCtxtPtr ctxt) { void xmlParseTextDecl(xmlParserCtxtPtr ctxt) { xmlChar *version; - const xmlChar *encoding; int oldstate; /* @@ -6721,7 +6721,7 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) { /* * We must have the encoding declaration */ - encoding = xmlParseEncodingDecl(ctxt); + xmlParseEncodingDecl(ctxt); if (ctxt->instate == XML_PARSER_EOF) return; if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { @@ -6731,10 +6731,6 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) { ctxt->instate = oldstate; return; } - if ((encoding == NULL) && 
(ctxt->errNo == XML_ERR_OK)) { - xmlFatalErrMsg(ctxt, XML_ERR_MISSING_ENCODING, - "Missing encoding in text declaration\n"); - } SKIP_BLANKS; if ((RAW == '?') && (NXT(1) == '>')) { @@ -6773,21 +6769,8 @@ void xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID, const xmlChar *SystemID) { xmlDetectSAX2(ctxt); - GROW; - if ((ctxt->encoding == NULL) && - (ctxt->input->end - ctxt->input->cur >= 4)) { - xmlChar start[4]; - xmlCharEncoding enc; - - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) - xmlSwitchEncoding(ctxt, enc); - } + xmlDetectEncoding(ctxt); if (CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) { xmlParseTextDecl(ctxt); @@ -7727,8 +7710,6 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt) "Internal: %%%s; is not a parameter entity\n", name, NULL); } else { - xmlChar start[4]; - xmlCharEncoding enc; unsigned long parentConsumed; xmlEntityPtr oldEnt; @@ -7769,28 +7750,7 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt) input->parentConsumed = parentConsumed; if (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) { - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. - * Note that, since we may have some non-UTF8 - * encoding (like UTF16, bug 135229), the 'length' - * is not known, but we can calculate based upon - * the amount of data in the buffer. 
- */ - GROW - if (ctxt->instate == XML_PARSER_EOF) - return; - if ((ctxt->input->end - ctxt->input->cur)>=4) { - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } + xmlDetectEncoding(ctxt); if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) && (IS_BLANK_CH(NXT(5)))) { @@ -10094,101 +10054,45 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) { xmlChar *encoding = NULL; SKIP_BLANKS; - if (CMP8(CUR_PTR, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g')) { - SKIP(8); - SKIP_BLANKS; - if (RAW != '=') { - xmlFatalErr(ctxt, XML_ERR_EQUAL_REQUIRED, NULL); - return(NULL); - } - NEXT; - SKIP_BLANKS; - if (RAW == '"') { - NEXT; - encoding = xmlParseEncName(ctxt); - if (RAW != '"') { - xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL); - xmlFree((xmlChar *) encoding); - return(NULL); - } else - NEXT; - } else if (RAW == '\''){ - NEXT; - encoding = xmlParseEncName(ctxt); - if (RAW != '\'') { - xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL); - xmlFree((xmlChar *) encoding); - return(NULL); - } else - NEXT; - } else { - xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL); - } + if (CMP8(CUR_PTR, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g') == 0) + return(NULL); - /* - * Non standard parsing, allowing the user to ignore encoding - */ - if (ctxt->options & XML_PARSE_IGNORE_ENC) { - xmlFree((xmlChar *) encoding); - return(NULL); - } - - /* - * UTF-16 encoding switch has already taken place at this stage, - * more over the little-endian/big-endian selection is already done - */ - if ((encoding != NULL) && - ((!xmlStrcasecmp(encoding, BAD_CAST "UTF-16")) || - (!xmlStrcasecmp(encoding, BAD_CAST "UTF16")))) { - /* - * If no encoding was passed to the parser, that we are - * using UTF-16 and no decoder is present i.e. 
the - * document is apparently UTF-8 compatible, then raise an - * encoding mismatch fatal error - */ - if ((ctxt->encoding == NULL) && - (ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder == NULL)) { - xmlFatalErrMsg(ctxt, XML_ERR_INVALID_ENCODING, - "Document labelled UTF-16 but has UTF-8 content\n"); - } - if (ctxt->encoding != NULL) - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = encoding; - } - /* - * UTF-8 encoding is handled natively - */ - else if ((encoding != NULL) && - ((!xmlStrcasecmp(encoding, BAD_CAST "UTF-8")) || - (!xmlStrcasecmp(encoding, BAD_CAST "UTF8")))) { - /* TODO: Check for encoding mismatch. */ - if (ctxt->encoding != NULL) - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = encoding; - } - else if (encoding != NULL) { - xmlCharEncodingHandlerPtr handler; - - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = encoding; - - handler = xmlFindCharEncodingHandler((const char *) encoding); - if (handler != NULL) { - if (xmlSwitchToEncoding(ctxt, handler) < 0) { - /* failed to convert */ - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - return(NULL); - } - } else { - xmlFatalErrMsgStr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, - "Unsupported encoding %s\n", encoding); - return(NULL); - } - } + SKIP(8); + SKIP_BLANKS; + if (RAW != '=') { + xmlFatalErr(ctxt, XML_ERR_EQUAL_REQUIRED, NULL); + return(NULL); } - return(encoding); + NEXT; + SKIP_BLANKS; + if (RAW == '"') { + NEXT; + encoding = xmlParseEncName(ctxt); + if (RAW != '"') { + xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL); + xmlFree((xmlChar *) encoding); + return(NULL); + } else + NEXT; + } else if (RAW == '\''){ + NEXT; + encoding = xmlParseEncName(ctxt); + if (RAW != '\'') { + xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL); + xmlFree((xmlChar *) encoding); + return(NULL); + } else + NEXT; + } else { + xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL); + } + + if (encoding == NULL) + return(NULL); + + 
xmlSetDeclaredEncoding(ctxt, encoding); + + return(ctxt->encoding); } /** @@ -10365,7 +10269,7 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) { /* * We may have the standalone status. */ - if ((ctxt->input->encoding != NULL) && (!IS_BLANK_CH(RAW))) { + if ((ctxt->encoding != NULL) && (!IS_BLANK_CH(RAW))) { if ((RAW == '?') && (NXT(1) == '>')) { SKIP(2); return; @@ -10443,9 +10347,6 @@ xmlParseMisc(xmlParserCtxtPtr ctxt) { int xmlParseDocument(xmlParserCtxtPtr ctxt) { - xmlChar start[4]; - xmlCharEncoding enc; - xmlInitParser(); if ((ctxt == NULL) || (ctxt->input == NULL)) @@ -10466,23 +10367,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { if (ctxt->instate == XML_PARSER_EOF) return(-1); - if ((ctxt->encoding == NULL) && - ((ctxt->input->end - ctxt->input->cur) >= 4)) { - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. - */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(&start[0], 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } - + xmlDetectEncoding(ctxt); if (CUR == 0) { xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL); @@ -10626,38 +10511,18 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { int xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) { - xmlChar start[4]; - xmlCharEncoding enc; - if ((ctxt == NULL) || (ctxt->input == NULL)) return(-1); xmlDetectSAX2(ctxt); - GROW; - /* * SAX: beginning of the document processing. */ if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. 
- */ - if ((ctxt->input->end - ctxt->input->cur) >= 4) { - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } - + xmlDetectEncoding(ctxt); if (CUR == 0) { xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL); @@ -11076,6 +10941,9 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { case XML_PARSER_START: xmlGenericError(xmlGenericErrorContext, "PP: try START\n"); break; + case XML_PARSER_XML_DECL: + xmlGenericError(xmlGenericErrorContext, + "PP: try XML_DECL\n"); break; case XML_PARSER_MISC: xmlGenericError(xmlGenericErrorContext, "PP: try MISC\n");break; @@ -11164,39 +11032,25 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { */ goto done; case XML_PARSER_START: - if (ctxt->charset == XML_CHAR_ENCODING_NONE) { - xmlChar start[4]; - xmlCharEncoding enc; + /* + * Very first chars read from the document flow. + */ + if (avail < 4) + goto done; - /* - * Very first chars read from the document flow. - */ - if (avail < 4) - goto done; + /* + * We need more bytes to detect EBCDIC code pages. + * See xmlDetectEBCDIC. + */ + if ((CMP4(CUR_PTR, 0x4C, 0x6F, 0xA7, 0x94)) && + (!terminate) && (avail < 200)) + goto done; - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines, - * else xmlSwitchEncoding will set to (default) - * UTF8. - */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - /* - * We need more bytes to detect EBCDIC code pages. - * See xmlDetectEBCDIC. 
- */ - if ((enc == XML_CHAR_ENCODING_EBCDIC) && - (!terminate) && (avail < 200)) - goto done; - xmlSwitchEncoding(ctxt, enc); - break; - } + xmlDetectEncoding(ctxt); + ctxt->instate = XML_PARSER_XML_DECL; + break; + case XML_PARSER_XML_DECL: if (avail < 2) goto done; cur = ctxt->input->cur[0]; @@ -11242,9 +11096,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { xmlHaltParser(ctxt); return(0); } - if ((ctxt->encoding == NULL) && - (ctxt->input->encoding != NULL)) - ctxt->encoding = xmlStrdup(ctxt->input->encoding); if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) ctxt->sax->startDocument(ctxt->userData); @@ -11978,13 +11829,6 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data, xmlBufResetInput(inputStream->buf->buffer, inputStream); inputPush(ctxt, inputStream); - /* - * If the caller didn't provide an initial 'chunk' for determining - * the encoding, we set the context to XML_CHAR_ENCODING_NONE so - * that it can be automatically determined later - */ - ctxt->charset = XML_CHAR_ENCODING_NONE; - if ((size != 0) && (chunk != NULL) && (ctxt->input != NULL) && (ctxt->input->buf != NULL)) { size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); @@ -12092,7 +11936,6 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input, xmlDtdPtr ret = NULL; xmlParserCtxtPtr ctxt; xmlParserInputPtr pinput = NULL; - xmlChar start[4]; if (input == NULL) return(NULL); @@ -12150,22 +11993,7 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input, ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none", BAD_CAST "none", BAD_CAST "none"); - if ((enc == XML_CHAR_ENCODING_NONE) && - ((ctxt->input->end - ctxt->input->cur) >= 4)) { - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. 
- */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } + xmlDetectEncoding(ctxt); xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none"); @@ -12213,7 +12041,6 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID, xmlDtdPtr ret = NULL; xmlParserCtxtPtr ctxt; xmlParserInputPtr input = NULL; - xmlCharEncoding enc; xmlChar* systemIdCanonic; if ((ExternalID == NULL) && (SystemID == NULL)) return(NULL); @@ -12258,10 +12085,8 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID, xmlFree(systemIdCanonic); return(NULL); } - if ((ctxt->input->end - ctxt->input->cur) >= 4) { - enc = xmlDetectCharEncoding(ctxt->input->cur, 4); - xmlSwitchEncoding(ctxt, enc); - } + + xmlDetectEncoding(ctxt); if (input->filename == NULL) input->filename = (char *) systemIdCanonic; @@ -12399,8 +12224,6 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt, xmlDocPtr newDoc; xmlNodePtr newRoot; xmlParserErrors ret = XML_ERR_OK; - xmlChar start[4]; - xmlCharEncoding enc; if (((depth > 40) && ((oldctxt == NULL) || (oldctxt->options & XML_PARSE_HUGE) == 0)) || @@ -12461,22 +12284,7 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt, newRoot->doc = doc; } - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. 
- */ - GROW; - if ((ctxt->input->end - ctxt->input->cur) >= 4) { - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } + xmlDetectEncoding(ctxt); /* * Parse a possible text declaration first @@ -12963,10 +12771,6 @@ xmlParseInNodeContext(xmlNodePtr node, const char *data, int datalen, if (doc->encoding != NULL) { xmlCharEncodingHandlerPtr hdlr; - if (ctxt->encoding != NULL) - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = xmlStrdup((const xmlChar *) doc->encoding); - hdlr = xmlFindCharEncodingHandler((const char *) doc->encoding); if (hdlr != NULL) { xmlSwitchToEncoding(ctxt, hdlr); @@ -14273,7 +14077,6 @@ xmlCtxtReset(xmlParserCtxtPtr ctxt) ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; ctxt->depth = 0; - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->catalogs = NULL; ctxt->sizeentities = 0; ctxt->sizeentcopy = 0; @@ -14374,10 +14177,6 @@ xmlCtxtResetPush(xmlParserCtxtPtr ctxt, const char *chunk, if (encoding != NULL) { xmlCharEncodingHandlerPtr hdlr; - if (ctxt->encoding != NULL) - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = xmlStrdup((const xmlChar *) encoding); - hdlr = xmlFindCharEncodingHandler(encoding); if (hdlr != NULL) { xmlSwitchToEncoding(ctxt, hdlr); diff --git a/parserInternals.c b/parserInternals.c index ed2d3dee..63f8372e 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -765,7 +765,7 @@ xmlNextChar(xmlParserCtxtPtr ctxt) return; } - if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { + if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) { const unsigned char *cur; unsigned char c; @@ -876,7 +876,10 @@ encoding_error: "Input is not proper UTF-8, indicate encoding !\n%s", BAD_CAST buffer, NULL); } - ctxt->charset = XML_CHAR_ENCODING_8859_1; + if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) { + ctxt->input->flags |= XML_INPUT_HAS_ENCODING; + ctxt->input->flags |= XML_INPUT_8_BIT; 
+ } ctxt->input->cur++; return; } @@ -917,7 +920,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { *len = 1; return(*ctxt->input->cur); } - if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { + if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: @@ -1040,7 +1043,10 @@ encoding_error: "Input is not proper UTF-8, indicate encoding !\n%s", BAD_CAST buffer, NULL); } - ctxt->charset = XML_CHAR_ENCODING_8859_1; + if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) { + ctxt->input->flags |= XML_INPUT_HAS_ENCODING; + ctxt->input->flags |= XML_INPUT_8_BIT; + } *len = 1; return(*ctxt->input->cur); @@ -1073,7 +1079,8 @@ int xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len) { if ((len == NULL) || (cur == NULL)) return(0); - if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) { + if ((ctxt == NULL) || (ctxt->input == NULL) || + ((ctxt->input->flags & XML_INPUT_8_BIT) == 0)) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: @@ -1300,58 +1307,29 @@ xmlDetectEBCDIC(xmlParserInputPtr input) { * @ctxt: the parser context * @enc: the encoding value (number) * - * change the input functions when discovering the character encoding - * of a given entity. + * Use encoding specified by enum to decode input data. + * + * This function can be used to enforce the encoding of chunks passed + * to xmlParseChunk. * * Returns 0 in case of success, -1 otherwise */ int xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) { - xmlCharEncodingHandlerPtr handler; + xmlCharEncodingHandlerPtr handler = NULL; + int check = 1; int ret; - if (ctxt == NULL) return(-1); - - /* - * FIXME: The BOM shouldn't be skipped here, but in the parsing code. - * - * Note that we look for a decoded UTF-8 BOM when switching to UTF-16. 
- * This is mostly useless but Webkit/Chromium relies on this behavior. - * See https://bugs.chromium.org/p/chromium/issues/detail?id=1451026 - */ - if ((ctxt->input != NULL) && - (ctxt->input->consumed == 0) && - (ctxt->input->cur != NULL) && - (ctxt->input->cur == ctxt->input->base) && - ((enc == XML_CHAR_ENCODING_UTF8) || - (enc == XML_CHAR_ENCODING_UTF16LE) || - (enc == XML_CHAR_ENCODING_UTF16BE))) { - /* - * Errata on XML-1.0 June 20 2001 - * Specific handling of the Byte Order Mark for - * UTF-8 - */ - if ((ctxt->input->cur[0] == 0xEF) && - (ctxt->input->cur[1] == 0xBB) && - (ctxt->input->cur[2] == 0xBF)) { - ctxt->input->cur += 3; - } - } + if ((ctxt == NULL) || (ctxt->input == NULL)) + return(-1); switch (enc) { - case XML_CHAR_ENCODING_ERROR: - __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING, - "encoding unknown\n", NULL, NULL); - return(-1); case XML_CHAR_ENCODING_NONE: - /* let's assume it's UTF-8 without the XML decl */ - ctxt->charset = XML_CHAR_ENCODING_UTF8; - return(0); case XML_CHAR_ENCODING_UTF8: - /* default encoding, no conversion should be needed */ - ctxt->charset = XML_CHAR_ENCODING_UTF8; - return(0); + case XML_CHAR_ENCODING_ASCII: + check = 0; + break; case XML_CHAR_ENCODING_EBCDIC: handler = xmlDetectEBCDIC(ctxt->input); break; @@ -1359,45 +1337,28 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) handler = xmlGetCharEncodingHandler(enc); break; } - if (handler == NULL) { - /* - * Default handlers. 
- */ - switch (enc) { - case XML_CHAR_ENCODING_ASCII: - /* default encoding, no conversion should be needed */ - ctxt->charset = XML_CHAR_ENCODING_UTF8; - return(0); - case XML_CHAR_ENCODING_8859_1: - if ((ctxt->inputNr == 1) && - (ctxt->encoding == NULL) && - (ctxt->input != NULL) && - (ctxt->input->encoding != NULL)) { - ctxt->encoding = xmlStrdup(ctxt->input->encoding); - } - ctxt->charset = enc; - return(0); - default: - __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, - "encoding not supported: %s\n", - BAD_CAST xmlGetCharEncodingName(enc), NULL); - /* - * TODO: We could recover from errors in external entities - * if we didn't stop the parser. But most callers of this - * function don't check the return value. - */ - xmlStopParser(ctxt); - return(-1); - } - } - ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler); - if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) { + + if ((check) && (handler == NULL)) { + const char *name = xmlGetCharEncodingName(enc); + + __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, + "encoding not supported: %s\n", + BAD_CAST (name ? name : ""), NULL); /* - * on encoding conversion errors, stop the parser - */ + * TODO: We could recover from errors in external entities + * if we didn't stop the parser. But most callers of this + * function don't check the return value. + */ xmlStopParser(ctxt); - ctxt->errNo = XML_I18N_CONV_FAILED; + return(-1); } + + ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler); + + if ((ret >= 0) && (enc == XML_CHAR_ENCODING_NONE)) { + ctxt->input->flags &= ~XML_INPUT_HAS_ENCODING; + } + return(ret); } @@ -1407,8 +1368,9 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) * @input: the input stream * @handler: the encoding handler * - * change the input functions when discovering the character encoding - * of a given entity. + * DEPRECATED: Internal function, don't use. + * + * Use encoding handler to decode input data. 
* * Returns 0 in case of success, -1 otherwise */ @@ -1419,27 +1381,19 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, int nbchars; xmlParserInputBufferPtr in; - if (handler == NULL) - return (-1); - if (input == NULL) - return (-1); - in = input->buf; - if (in == NULL) { - xmlErrInternal(ctxt, - "static memory buffer doesn't support encoding\n", NULL); - /* - * Callers assume that the input buffer takes ownership of the - * encoding handler. xmlCharEncCloseFunc frees unregistered - * handlers and avoids a memory leak. - */ + if ((input == NULL) || (input->buf == NULL)) { xmlCharEncCloseFunc(handler); return (-1); } + in = input->buf; + + input->flags |= XML_INPUT_HAS_ENCODING; + input->flags &= ~XML_INPUT_8_BIT; + + if (in->encoder == handler) + return (0); if (in->encoder != NULL) { - if (in->encoder == handler) - return (0); - /* * Switching encodings during parsing is a really bad idea, * but Chromium can switch between ISO-8859-1 and UTF-16 before @@ -1454,7 +1408,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, return (0); } - ctxt->charset = XML_CHAR_ENCODING_UTF8; in->encoder = handler; /* @@ -1463,37 +1416,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, if (xmlBufIsEmpty(in->buffer) == 0) { size_t processed, use, consumed; - /* - * FIXME: The BOM shouldn't be skipped here, but in the parsing code. 
- */ - - /* - * Specific handling of the Byte Order Mark for - * UTF-16 - */ - if ((handler->name != NULL) && - (!strcmp(handler->name, "UTF-16LE") || - !strcmp(handler->name, "UTF-16")) && - (input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) { - input->cur += 2; - } - if ((handler->name != NULL) && - (!strcmp(handler->name, "UTF-16BE")) && - (input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) { - input->cur += 2; - } - /* - * Errata on XML-1.0 June 20 2001 - * Specific handling of the Byte Order Mark for - * UTF-8 - */ - if ((handler->name != NULL) && - (!strcmp(handler->name, "UTF-8")) && - (input->cur[0] == 0xEF) && - (input->cur[1] == 0xBB) && (input->cur[2] == 0xBF)) { - input->cur += 3; - } - /* * Shrink the current input buffer. * Move it as the raw buffer and create a new input buffer @@ -1541,8 +1463,10 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, * @ctxt: the parser context * @handler: the encoding handler * - * change the input functions when discovering the character encoding - * of a given entity. + * Use encoding handler to decode input data. + * + * This function can be used to enforce the encoding of chunks passed + * to xmlParseChunk. * * Returns 0 in case of success, -1 otherwise */ @@ -1554,6 +1478,185 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) return(xmlSwitchInputEncoding(ctxt, ctxt->input, handler)); } +/** + * xmlDetectEncoding: + * @ctxt: the parser context + * + * Handle optional BOM, detect and switch to encoding. + * + * Assumes that there are at least four bytes in the input buffer. + */ +void +xmlDetectEncoding(xmlParserCtxtPtr ctxt) { + const xmlChar *in = ctxt->input->cur; + xmlCharEncoding enc; + int bomSize; + int autoFlag = 0; + + if (xmlParserGrow(ctxt) < 0) + return; + if (ctxt->input->end - in < 4) + return; + + if (ctxt->input->flags & XML_INPUT_HAS_ENCODING) { + /* + * If the encoding was already set, only skip the BOM which was + * possibly decoded to UTF-8. 
+ */ + if ((in[0] == 0xEF) && (in[1] == 0xBB) && (in[2] == 0xBF)) { + ctxt->input->cur += 3; + } + + return; + } + + enc = XML_CHAR_ENCODING_NONE; + bomSize = 0; + + switch (in[0]) { + case 0x00: + if ((in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) { + enc = XML_CHAR_ENCODING_UCS4BE; + autoFlag = XML_INPUT_AUTO_OTHER; + } else if ((in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) { + enc = XML_CHAR_ENCODING_UTF16BE; + autoFlag = XML_INPUT_AUTO_UTF16BE; + } + break; + + case 0x3C: + if (in[1] == 0x00) { + if ((in[2] == 0x00) && (in[3] == 0x00)) { + enc = XML_CHAR_ENCODING_UCS4LE; + autoFlag = XML_INPUT_AUTO_OTHER; + } else if ((in[2] == 0x3F) && (in[3] == 0x00)) { + enc = XML_CHAR_ENCODING_UTF16LE; + autoFlag = XML_INPUT_AUTO_UTF16LE; + } + } + break; + + case 0x4C: + if ((in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) { + enc = XML_CHAR_ENCODING_EBCDIC; + autoFlag = XML_INPUT_AUTO_OTHER; + } + break; + + case 0xEF: + if ((in[1] == 0xBB) && (in[2] == 0xBF)) { + enc = XML_CHAR_ENCODING_UTF8; + autoFlag = XML_INPUT_AUTO_UTF8; + bomSize = 3; + } + break; + + case 0xFE: + if (in[1] == 0xFF) { + enc = XML_CHAR_ENCODING_UTF16BE; + autoFlag = XML_INPUT_AUTO_UTF16BE; + bomSize = 2; + } + break; + + case 0xFF: + if (in[1] == 0xFE) { + enc = XML_CHAR_ENCODING_UTF16LE; + autoFlag = XML_INPUT_AUTO_UTF16LE; + bomSize = 2; + } + break; + } + + if (bomSize > 0) { + ctxt->input->cur += bomSize; + } + + if (enc != XML_CHAR_ENCODING_NONE) { + ctxt->input->flags |= autoFlag; + xmlSwitchEncoding(ctxt, enc); + } +} + +/** + * xmlSetDeclaredEncoding: + * @ctxt: the parser context + * @encoding: declared encoding + * + * Set the encoding from a declaration in the document. + * + * If no encoding was set yet, switch the encoding. Otherwise, only warn + * about encoding mismatches. + * + * Takes ownership of 'encoding'. 
+ */ +void +xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) { + if (ctxt->encoding != NULL) + xmlFree((xmlChar *) ctxt->encoding); + ctxt->encoding = encoding; + + if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) && + ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) { + xmlCharEncodingHandlerPtr handler; + + handler = xmlFindCharEncodingHandler((const char *) encoding); + if (handler != NULL) { + xmlSwitchToEncoding(ctxt, handler); + } else { + __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, + "Unsupported encoding: %s\n", + encoding, NULL); + } + } else if (ctxt->input->flags & XML_INPUT_AUTO_ENCODING) { + static const char *allowedUTF8[] = { + "UTF-8", "UTF8", NULL + }; + static const char *allowedUTF16LE[] = { + "UTF-16", "UTF-16LE", "UTF16", NULL + }; + static const char *allowedUTF16BE[] = { + "UTF-16", "UTF-16BE", "UTF16", NULL + }; + const char **allowed = NULL; + const char *autoEnc = NULL; + + switch (ctxt->input->flags & XML_INPUT_AUTO_ENCODING) { + case XML_INPUT_AUTO_UTF8: + allowed = allowedUTF8; + autoEnc = "UTF-8"; + break; + case XML_INPUT_AUTO_UTF16LE: + allowed = allowedUTF16LE; + autoEnc = "UTF-16LE"; + break; + case XML_INPUT_AUTO_UTF16BE: + allowed = allowedUTF16BE; + autoEnc = "UTF-16BE"; + break; + } + + if (allowed != NULL) { + const char **p; + int match = 0; + + for (p = allowed; *p != NULL; p++) { + if (xmlStrcasecmp(encoding, BAD_CAST *p) == 0) { + match = 1; + break; + } + } + + if (match == 0) { + xmlWarningMsg(ctxt, XML_WAR_ENCODING_MISMATCH, + "Encoding '%s' doesn't match " + "auto-detected '%s'\n", + encoding, BAD_CAST autoEnc); + } + } + } +} + /************************************************************************ * * * Commodity functions to handle entities processing * @@ -1572,7 +1675,6 @@ xmlFreeInputStream(xmlParserInputPtr input) { if (input->filename != NULL) xmlFree((char *) input->filename); if (input->directory != NULL) xmlFree((char *) input->directory); - if (input->encoding != NULL) 
xmlFree((char *) input->encoding); if (input->version != NULL) xmlFree((char *) input->version); if ((input->free != NULL) && (input->base != NULL)) input->free((xmlChar *) input->base); @@ -2015,7 +2117,6 @@ xmlInitSAXParserCtxt(xmlParserCtxtPtr ctxt, const xmlSAXHandler *sax, ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; ctxt->depth = 0; - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->catalogs = NULL; ctxt->sizeentities = 0; ctxt->sizeentcopy = 0; diff --git a/testchar.c b/testchar.c index a819e196..20d4296d 100644 --- a/testchar.c +++ b/testchar.c @@ -271,11 +271,11 @@ static int testCharRangeByte1(xmlParserCtxtPtr ctxt) { data[3] = 0; for (i = 0;i <= 0xFF;i++) { data[0] = (char) i; - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->nbErrors = 0; lastError = 0; c = xmlCurrentChar(ctxt, &len); + ctxt->input->flags = 0; if ((i == 0) || (i >= 0x80)) { /* we must see an error there */ if (lastError != XML_ERR_INVALID_CHAR) { @@ -307,11 +307,11 @@ static int testCharRangeByte2(xmlParserCtxtPtr ctxt) { for (j = 0;j <= 0xFF;j++) { data[0] = (char) i; data[1] = (char) j; - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->nbErrors = 0; lastError = 0; c = xmlCurrentChar(ctxt, &len); + ctxt->input->flags = 0; /* if first bit of first char is set, then second bit must too */ if ((i & 0x80) && ((i & 0x40) == 0)) { @@ -401,11 +401,11 @@ static int testCharRangeByte3(xmlParserCtxtPtr ctxt) { K = lows[k]; data[2] = (char) K; value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12); - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->nbErrors = 0; lastError = 0; c = xmlCurrentChar(ctxt, &len); + ctxt->input->flags = 0; /* * if fourth bit of first char is set, then the sequence would need @@ -504,11 +504,11 @@ static int testCharRangeByte4(xmlParserCtxtPtr ctxt) { data[3] = (char) L; value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) + ((i & 0x7) << 18); - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->nbErrors = 0; lastError = 0; c = xmlCurrentChar(ctxt, &len); + 
ctxt->input->flags = 0; /* * if fifth bit of first char is set, then the sequence would need diff --git a/xmlIO.c b/xmlIO.c index 9fd9c780..490a82e7 100644 --- a/xmlIO.c +++ b/xmlIO.c @@ -3790,8 +3790,6 @@ xmlCheckHTTPInput(xmlParserCtxtPtr ctxt, xmlParserInputPtr ret) { "Unknown encoding %s", BAD_CAST encoding, NULL); } - if (ret->encoding == NULL) - ret->encoding = xmlStrdup(BAD_CAST encoding); } #if 0 } else if (xmlStrstr(BAD_CAST mime, BAD_CAST "html")) {