diff --git a/HTMLparser.c b/HTMLparser.c index 67ee6654..d0fa178b 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -350,8 +350,7 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) { const xmlChar *start, *cur, *end; if ((ctxt == NULL) || (ctxt->input == NULL) || - (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || - (ctxt->input->buf->encoder != NULL)) + (ctxt->input->flags & XML_INPUT_HAS_ENCODING)) return(NULL); if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) return(NULL); @@ -417,7 +416,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { return(0); } - if (ctxt->charset != XML_CHAR_ENCODING_UTF8) { + if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) { xmlChar * guess; xmlCharEncodingHandlerPtr handler; @@ -444,10 +443,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { if (guess == NULL) { xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); } else { - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = guess; handler = xmlFindCharEncodingHandler((const char *) guess); if (handler != NULL) { /* * Don't use UTF-8 encoder which isn't required and @@ -460,7 +456,8 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { "Unsupported encoding %s", guess, NULL); } + xmlFree(guess); } - ctxt->charset = XML_CHAR_ENCODING_UTF8; + ctxt->input->flags |= XML_INPUT_HAS_ENCODING; } /* @@ -537,13 +534,6 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { } encoding_error: - /* - * If we detect an UTF8 error that probably mean that the - * input encoding didn't get properly advertised in the - * declaration header. Report the error and switch the encoding - * to ISO-Latin-1 (if you don't like this policy, just declare the - * encoding !) - */ { char buffer[150]; @@ -559,15 +549,7 @@ encoding_error: BAD_CAST buffer, NULL); } - /* - * Don't switch encodings twice. Note that if there's an encoder, we - * shouldn't receive invalid UTF-8 anyway. 
- * - * Note that if ctxt->input->buf == NULL, switching encodings is - * impossible, see Gitlab issue #34. - */ - if ((ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder == NULL)) + if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); *len = 1; return(*ctxt->input->cur); @@ -3781,94 +3763,6 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) { return(name); } -/** - * htmlCheckEncodingDirect: - * @ctxt: an HTML parser context - * @attvalue: the attribute value - * - * Checks an attribute value to detect - * the encoding - * If a new encoding is detected the parser is switched to decode - * it and pass UTF8 - */ -static void -htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) { - - if ((ctxt == NULL) || (encoding == NULL) || - (ctxt->options & HTML_PARSE_IGNORE_ENC)) - return; - - /* do not change encoding */ - if (ctxt->input->encoding != NULL) - return; - - if (encoding != NULL) { - xmlCharEncoding enc; - xmlCharEncodingHandlerPtr handler; - - while ((*encoding == ' ') || (*encoding == '\t')) encoding++; - - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = xmlStrdup(encoding); - - enc = xmlParseCharEncoding((const char *) encoding); - /* - * registered set of known encodings - */ - if (enc != XML_CHAR_ENCODING_ERROR) { - if (((enc == XML_CHAR_ENCODING_UTF16LE) || - (enc == XML_CHAR_ENCODING_UTF16BE) || - (enc == XML_CHAR_ENCODING_UCS4LE) || - (enc == XML_CHAR_ENCODING_UCS4BE)) && - (ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder == NULL)) { - htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, - "htmlCheckEncoding: wrong encoding meta\n", - NULL, NULL); - } else { - xmlSwitchEncoding(ctxt, enc); - } - ctxt->charset = XML_CHAR_ENCODING_UTF8; - } else { - /* - * fallback for unknown encodings - */ - handler = xmlFindCharEncodingHandler((const char *) encoding); - if (handler != NULL) { - 
xmlSwitchToEncoding(ctxt, handler); - ctxt->charset = XML_CHAR_ENCODING_UTF8; - } else { - htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, - "htmlCheckEncoding: unknown encoding %s\n", - encoding, NULL); - } - } - - if ((ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder != NULL) && - (ctxt->input->buf->raw != NULL) && - (ctxt->input->buf->buffer != NULL)) { - int nbchars; - size_t processed; - - /* - * convert as much as possible to the parser reading buffer. - */ - processed = ctxt->input->cur - ctxt->input->base; - xmlBufShrink(ctxt->input->buf->buffer, processed); - nbchars = xmlCharEncInput(ctxt->input->buf, 1); - xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input); - if (nbchars < 0) { - htmlParseErr(ctxt, ctxt->input->buf->error, - "htmlCheckEncoding: encoder error\n", - NULL, NULL); - xmlHaltParser(ctxt); - } - } - } -} - /** * htmlCheckEncoding: * @ctxt: an HTML parser context @@ -3897,7 +3791,7 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) { encoding = xmlStrcasestr(attvalue, BAD_CAST"="); if (encoding && *encoding == '=') { encoding ++; - htmlCheckEncodingDirect(ctxt, encoding); + xmlSetDeclaredEncoding(ctxt, xmlStrdup(encoding)); } } @@ -3926,7 +3820,7 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) { && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) http = 1; else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset"))) - htmlCheckEncodingDirect(ctxt, value); + xmlSetDeclaredEncoding(ctxt, xmlStrdup(value)); else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) content = value; att = atts[i++]; @@ -4953,8 +4847,6 @@ __htmlParseContent(void *ctxt) { int htmlParseDocument(htmlParserCtxtPtr ctxt) { - xmlChar start[4]; - xmlCharEncoding enc; xmlDtdPtr dtd; xmlInitParser(); @@ -4964,29 +4856,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) { "htmlParseDocument: context error\n", NULL, NULL); return(XML_ERR_INTERNAL_ERROR); } - GROW; + /* * SAX: beginning of the document processing. 
*/ if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); - if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && - ((ctxt->input->end - ctxt->input->cur) >= 4)) { - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. - */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(&start[0], 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } + xmlDetectEncoding(ctxt); /* * Wipe out everything which is before the first '<' @@ -5317,10 +5194,6 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) { xmlCharEncoding enc; xmlCharEncodingHandlerPtr handler; - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding); - enc = xmlParseCharEncoding(encoding); /* * registered set of known encodings @@ -6265,8 +6138,6 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, xmlFreeParserInputBuffer(buf); return(NULL); } - if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder) - ctxt->charset=XML_CHAR_ENCODING_UTF8; if (filename == NULL) { ctxt->directory = NULL; } else { @@ -6722,7 +6593,6 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; ctxt->depth = 0; - ctxt->charset = XML_CHAR_ENCODING_NONE; ctxt->catalogs = NULL; xmlInitNodeInfoSeq(&ctxt->node_seq); @@ -6839,9 +6709,6 @@ htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding, hdlr = xmlFindCharEncodingHandler(encoding); if (hdlr != NULL) { xmlSwitchToEncoding(ctxt, hdlr); - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = xmlStrdup((xmlChar *)encoding); } } if ((URL != NULL) && (ctxt->input != NULL) && diff --git a/SAX2.c b/SAX2.c index 968da080..07c5c017 100644 
--- a/SAX2.c +++ b/SAX2.c @@ -384,8 +384,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, int oldinputMax; xmlParserInputPtr *oldinputTab; xmlParserInputPtr input = NULL; - xmlCharEncoding enc; - int oldcharset; const xmlChar *oldencoding; int oldprogressive; unsigned long consumed; @@ -410,7 +408,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, oldinputNr = ctxt->inputNr; oldinputMax = ctxt->inputMax; oldinputTab = ctxt->inputTab; - oldcharset = ctxt->charset; oldencoding = ctxt->encoding; oldprogressive = ctxt->progressive; ctxt->encoding = NULL; @@ -425,7 +422,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, ctxt->inputNr = oldinputNr; ctxt->inputMax = oldinputMax; ctxt->inputTab = oldinputTab; - ctxt->charset = oldcharset; ctxt->encoding = oldencoding; ctxt->progressive = oldprogressive; return; @@ -435,14 +431,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, ctxt->input = NULL; xmlPushInput(ctxt, input); - /* - * On the fly encoding conversion if needed - */ - if (ctxt->input->length >= 4) { - enc = xmlDetectCharEncoding(ctxt->input->cur, 4); - xmlSwitchEncoding(ctxt, enc); - } - if (input->filename == NULL) input->filename = (char *) xmlCanonicPath(SystemID); input->line = 1; @@ -484,7 +472,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name, ctxt->inputNr = oldinputNr; ctxt->inputMax = oldinputMax; ctxt->inputTab = oldinputTab; - ctxt->charset = oldcharset; if ((ctxt->encoding != NULL) && ((ctxt->dict == NULL) || (!xmlDictOwns(ctxt->dict, ctxt->encoding)))) @@ -1041,16 +1028,6 @@ xmlSAX2EndDocument(void *ctx) ctxt->myDoc->encoding = ctxt->encoding; ctxt->encoding = NULL; } - if ((ctxt->inputTab != NULL) && - (ctxt->inputNr > 0) && (ctxt->inputTab[0] != NULL) && - (ctxt->inputTab[0]->encoding != NULL) && (ctxt->myDoc != NULL) && - (ctxt->myDoc->encoding == NULL)) { - ctxt->myDoc->encoding = xmlStrdup(ctxt->inputTab[0]->encoding); - } - if ((ctxt->charset != XML_CHAR_ENCODING_NONE) && (ctxt->myDoc != NULL) && - 
(ctxt->myDoc->charset == XML_CHAR_ENCODING_NONE)) { - ctxt->myDoc->charset = ctxt->charset; - } } #if defined(LIBXML_SAX1_ENABLED) || defined(LIBXML_HTML_ENABLED) || defined(LIBXML_WRITER_ENABLED) || defined(LIBXML_LEGACY_ENABLED) diff --git a/include/libxml/parser.h b/include/libxml/parser.h index 950ebe32..e1955a08 100644 --- a/include/libxml/parser.h +++ b/include/libxml/parser.h @@ -63,9 +63,9 @@ struct _xmlParserInput { int col; /* Current column */ unsigned long consumed; /* How many xmlChars already consumed */ xmlParserInputDeallocate free; /* function to deallocate the base */ - const xmlChar *encoding; /* the encoding string for entity */ + const xmlChar *encoding; /* unused */ const xmlChar *version; /* the version string for entity */ - int standalone; /* Was that entity marked standalone */ + int flags; /* Flags */ int id; /* an unique identifier for the entity */ unsigned long parentConsumed; /* consumed bytes from parents */ xmlEntityPtr entity; /* entity, if any */ @@ -122,7 +122,8 @@ typedef enum { XML_PARSER_SYSTEM_LITERAL, /* within a SYSTEM value */ XML_PARSER_EPILOG, /* the Misc* after the last end tag */ XML_PARSER_IGNORE, /* within an IGNORED section */ - XML_PARSER_PUBLIC_LITERAL /* within a PUBLIC value */ + XML_PARSER_PUBLIC_LITERAL, /* within a PUBLIC value */ + XML_PARSER_XML_DECL /* before XML decl (but after BOM) */ } xmlParserInputState; /** @@ -245,8 +246,7 @@ struct _xmlParserCtxt { int depth; /* to prevent entity substitution loops */ xmlParserInputPtr entity; /* used to check entities boundaries */ - int charset; /* encoding of the in-memory content - actually an xmlCharEncoding */ + int charset; /* unused */ int nodelen; /* Those two fields are there to */ int nodemem; /* Speed up large node parsing */ int pedantic; /* signal pedantic warnings */ diff --git a/include/libxml/tree.h b/include/libxml/tree.h index a1cabf69..4e5bf434 100644 --- a/include/libxml/tree.h +++ b/include/libxml/tree.h @@ -573,12 +573,11 @@ struct _xmlDoc { 
struct _xmlDtd *extSubset; /* the document external subset */ struct _xmlNs *oldNs; /* Global namespace, the old way */ const xmlChar *version; /* the XML version string */ - const xmlChar *encoding; /* external initial encoding, if any */ + const xmlChar *encoding; /* encoding from XML declaration, if any */ void *ids; /* Hash table for ID attributes if any */ void *refs; /* Hash table for IDREFs attributes if any */ const xmlChar *URL; /* The URI for that document */ - int charset; /* Internal flag for charset handling, - actually an xmlCharEncoding */ + int charset; /* unused */ struct _xmlDict *dict; /* dict used to allocate names or NULL */ void *psvi; /* for type/PSVI information */ int parseFlags; /* set of xmlParserOption used to parse the diff --git a/include/libxml/xmlerror.h b/include/libxml/xmlerror.h index 830b4a68..63ddaa95 100644 --- a/include/libxml/xmlerror.h +++ b/include/libxml/xmlerror.h @@ -210,6 +210,7 @@ typedef enum { XML_ERR_NAME_TOO_LONG, /* 110 */ XML_ERR_USER_STOP, /* 111 */ XML_ERR_COMMENT_ABRUPTLY_ENDED, /* 112 */ + XML_WAR_ENCODING_MISMATCH, /* 113 */ XML_NS_ERR_XML_NAMESPACE = 200, XML_NS_ERR_UNDEFINED_NAMESPACE, /* 201 */ XML_NS_ERR_QNAME, /* 202 */ diff --git a/include/private/parser.h b/include/private/parser.h index bf933f7d..bc4bc0d1 100644 --- a/include/private/parser.h +++ b/include/private/parser.h @@ -17,10 +17,21 @@ */ #define XML_VCTXT_USE_PCTXT (1u << 1) +#define XML_INPUT_HAS_ENCODING (1u << 0) +#define XML_INPUT_AUTO_ENCODING (7u << 1) +#define XML_INPUT_AUTO_UTF8 (1u << 1) +#define XML_INPUT_AUTO_UTF16LE (2u << 1) +#define XML_INPUT_AUTO_UTF16BE (3u << 1) +#define XML_INPUT_AUTO_OTHER (4u << 1) +#define XML_INPUT_8_BIT (1u << 4) + XML_HIDDEN void xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra); XML_HIDDEN void xmlFatalErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *info); +XML_HIDDEN void LIBXML_ATTR_FORMAT(3,0) +xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error, + const char *msg, const 
xmlChar *str1, const xmlChar *str2); XML_HIDDEN void __xmlErrEncoding(xmlParserCtxtPtr ctxt, xmlParserErrors xmlerr, const char *msg, const xmlChar *str1, @@ -32,4 +43,9 @@ xmlParserGrow(xmlParserCtxtPtr ctxt); XML_HIDDEN void xmlParserShrink(xmlParserCtxtPtr ctxt); +XML_HIDDEN void +xmlDetectEncoding(xmlParserCtxtPtr ctxt); +XML_HIDDEN void +xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding); + #endif /* XML_PARSER_H_PRIVATE__ */ diff --git a/parser.c b/parser.c index 942029a6..bb4f0e2c 100644 --- a/parser.c +++ b/parser.c @@ -281,7 +281,7 @@ xmlFatalErrMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error, * * Handle a warning. */ -static void LIBXML_ATTR_FORMAT(3,0) +void LIBXML_ATTR_FORMAT(3,0) xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *msg, const xmlChar *str1, const xmlChar *str2) { @@ -2313,6 +2313,7 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) { return; case XML_PARSER_PROLOG: case XML_PARSER_START: + case XML_PARSER_XML_DECL: case XML_PARSER_MISC: xmlFatalErr(ctxt, XML_ERR_PEREF_IN_PROLOG, NULL); return; @@ -6682,7 +6683,6 @@ xmlParseMarkupDecl(xmlParserCtxtPtr ctxt) { void xmlParseTextDecl(xmlParserCtxtPtr ctxt) { xmlChar *version; - const xmlChar *encoding; int oldstate; /* @@ -6721,7 +6721,7 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) { /* * We must have the encoding declaration */ - encoding = xmlParseEncodingDecl(ctxt); + xmlParseEncodingDecl(ctxt); if (ctxt->instate == XML_PARSER_EOF) return; if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { @@ -6731,10 +6731,6 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) { ctxt->instate = oldstate; return; } - if ((encoding == NULL) && (ctxt->errNo == XML_ERR_OK)) { - xmlFatalErrMsg(ctxt, XML_ERR_MISSING_ENCODING, - "Missing encoding in text declaration\n"); - } SKIP_BLANKS; if ((RAW == '?') && (NXT(1) == '>')) { @@ -6773,21 +6769,8 @@ void xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID, const xmlChar *SystemID) { xmlDetectSAX2(ctxt); - GROW; - if 
((ctxt->encoding == NULL) && - (ctxt->input->end - ctxt->input->cur >= 4)) { - xmlChar start[4]; - xmlCharEncoding enc; - - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) - xmlSwitchEncoding(ctxt, enc); - } + xmlDetectEncoding(ctxt); if (CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) { xmlParseTextDecl(ctxt); @@ -7727,8 +7710,6 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt) "Internal: %%%s; is not a parameter entity\n", name, NULL); } else { - xmlChar start[4]; - xmlCharEncoding enc; unsigned long parentConsumed; xmlEntityPtr oldEnt; @@ -7769,28 +7750,7 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt) input->parentConsumed = parentConsumed; if (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) { - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. - * Note that, since we may have some non-UTF8 - * encoding (like UTF16, bug 135229), the 'length' - * is not known, but we can calculate based upon - * the amount of data in the buffer. 
- */ - GROW - if (ctxt->instate == XML_PARSER_EOF) - return; - if ((ctxt->input->end - ctxt->input->cur)>=4) { - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } + xmlDetectEncoding(ctxt); if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) && (IS_BLANK_CH(NXT(5)))) { @@ -10094,101 +10054,45 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) { xmlChar *encoding = NULL; SKIP_BLANKS; - if (CMP8(CUR_PTR, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g')) { - SKIP(8); - SKIP_BLANKS; - if (RAW != '=') { - xmlFatalErr(ctxt, XML_ERR_EQUAL_REQUIRED, NULL); - return(NULL); - } - NEXT; - SKIP_BLANKS; - if (RAW == '"') { - NEXT; - encoding = xmlParseEncName(ctxt); - if (RAW != '"') { - xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL); - xmlFree((xmlChar *) encoding); - return(NULL); - } else - NEXT; - } else if (RAW == '\''){ - NEXT; - encoding = xmlParseEncName(ctxt); - if (RAW != '\'') { - xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL); - xmlFree((xmlChar *) encoding); - return(NULL); - } else - NEXT; - } else { - xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL); - } + if (CMP8(CUR_PTR, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g') == 0) + return(NULL); - /* - * Non standard parsing, allowing the user to ignore encoding - */ - if (ctxt->options & XML_PARSE_IGNORE_ENC) { - xmlFree((xmlChar *) encoding); - return(NULL); - } - - /* - * UTF-16 encoding switch has already taken place at this stage, - * more over the little-endian/big-endian selection is already done - */ - if ((encoding != NULL) && - ((!xmlStrcasecmp(encoding, BAD_CAST "UTF-16")) || - (!xmlStrcasecmp(encoding, BAD_CAST "UTF16")))) { - /* - * If no encoding was passed to the parser, that we are - * using UTF-16 and no decoder is present i.e. 
the - * document is apparently UTF-8 compatible, then raise an - * encoding mismatch fatal error - */ - if ((ctxt->encoding == NULL) && - (ctxt->input->buf != NULL) && - (ctxt->input->buf->encoder == NULL)) { - xmlFatalErrMsg(ctxt, XML_ERR_INVALID_ENCODING, - "Document labelled UTF-16 but has UTF-8 content\n"); - } - if (ctxt->encoding != NULL) - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = encoding; - } - /* - * UTF-8 encoding is handled natively - */ - else if ((encoding != NULL) && - ((!xmlStrcasecmp(encoding, BAD_CAST "UTF-8")) || - (!xmlStrcasecmp(encoding, BAD_CAST "UTF8")))) { - /* TODO: Check for encoding mismatch. */ - if (ctxt->encoding != NULL) - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = encoding; - } - else if (encoding != NULL) { - xmlCharEncodingHandlerPtr handler; - - if (ctxt->input->encoding != NULL) - xmlFree((xmlChar *) ctxt->input->encoding); - ctxt->input->encoding = encoding; - - handler = xmlFindCharEncodingHandler((const char *) encoding); - if (handler != NULL) { - if (xmlSwitchToEncoding(ctxt, handler) < 0) { - /* failed to convert */ - ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING; - return(NULL); - } - } else { - xmlFatalErrMsgStr(ctxt, XML_ERR_UNSUPPORTED_ENCODING, - "Unsupported encoding %s\n", encoding); - return(NULL); - } - } + SKIP(8); + SKIP_BLANKS; + if (RAW != '=') { + xmlFatalErr(ctxt, XML_ERR_EQUAL_REQUIRED, NULL); + return(NULL); } - return(encoding); + NEXT; + SKIP_BLANKS; + if (RAW == '"') { + NEXT; + encoding = xmlParseEncName(ctxt); + if (RAW != '"') { + xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL); + xmlFree((xmlChar *) encoding); + return(NULL); + } else + NEXT; + } else if (RAW == '\''){ + NEXT; + encoding = xmlParseEncName(ctxt); + if (RAW != '\'') { + xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL); + xmlFree((xmlChar *) encoding); + return(NULL); + } else + NEXT; + } else { + xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL); + } + + if (encoding == NULL) + return(NULL); + + 
xmlSetDeclaredEncoding(ctxt, encoding); + + return(ctxt->encoding); } /** @@ -10365,7 +10269,7 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) { /* * We may have the standalone status. */ - if ((ctxt->input->encoding != NULL) && (!IS_BLANK_CH(RAW))) { + if ((ctxt->encoding != NULL) && (!IS_BLANK_CH(RAW))) { if ((RAW == '?') && (NXT(1) == '>')) { SKIP(2); return; @@ -10443,9 +10347,6 @@ xmlParseMisc(xmlParserCtxtPtr ctxt) { int xmlParseDocument(xmlParserCtxtPtr ctxt) { - xmlChar start[4]; - xmlCharEncoding enc; - xmlInitParser(); if ((ctxt == NULL) || (ctxt->input == NULL)) @@ -10466,23 +10367,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { if (ctxt->instate == XML_PARSER_EOF) return(-1); - if ((ctxt->encoding == NULL) && - ((ctxt->input->end - ctxt->input->cur) >= 4)) { - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. - */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(&start[0], 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } - + xmlDetectEncoding(ctxt); if (CUR == 0) { xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL); @@ -10626,38 +10511,18 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) { int xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) { - xmlChar start[4]; - xmlCharEncoding enc; - if ((ctxt == NULL) || (ctxt->input == NULL)) return(-1); xmlDetectSAX2(ctxt); - GROW; - /* * SAX: beginning of the document processing. */ if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. 
- */ - if ((ctxt->input->end - ctxt->input->cur) >= 4) { - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } - + xmlDetectEncoding(ctxt); if (CUR == 0) { xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL); @@ -11076,6 +10941,9 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { case XML_PARSER_START: xmlGenericError(xmlGenericErrorContext, "PP: try START\n"); break; + case XML_PARSER_XML_DECL: + xmlGenericError(xmlGenericErrorContext, + "PP: try XML_DECL\n"); break; case XML_PARSER_MISC: xmlGenericError(xmlGenericErrorContext, "PP: try MISC\n");break; @@ -11164,39 +11032,25 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { */ goto done; case XML_PARSER_START: - if (ctxt->charset == XML_CHAR_ENCODING_NONE) { - xmlChar start[4]; - xmlCharEncoding enc; + /* + * Very first chars read from the document flow. + */ + if (avail < 4) + goto done; - /* - * Very first chars read from the document flow. - */ - if (avail < 4) - goto done; + /* + * We need more bytes to detect EBCDIC code pages. + * See xmlDetectEBCDIC. + */ + if ((CMP4(CUR_PTR, 0x4C, 0x6F, 0xA7, 0x94)) && + (!terminate) && (avail < 200)) + goto done; - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines, - * else xmlSwitchEncoding will set to (default) - * UTF8. - */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - /* - * We need more bytes to detect EBCDIC code pages. - * See xmlDetectEBCDIC. 
- */ - if ((enc == XML_CHAR_ENCODING_EBCDIC) && - (!terminate) && (avail < 200)) - goto done; - xmlSwitchEncoding(ctxt, enc); - break; - } + xmlDetectEncoding(ctxt); + ctxt->instate = XML_PARSER_XML_DECL; + break; + case XML_PARSER_XML_DECL: if (avail < 2) goto done; cur = ctxt->input->cur[0]; @@ -11242,9 +11096,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) { xmlHaltParser(ctxt); return(0); } - if ((ctxt->encoding == NULL) && - (ctxt->input->encoding != NULL)) - ctxt->encoding = xmlStrdup(ctxt->input->encoding); if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX)) ctxt->sax->startDocument(ctxt->userData); @@ -11978,13 +11829,6 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data, xmlBufResetInput(inputStream->buf->buffer, inputStream); inputPush(ctxt, inputStream); - /* - * If the caller didn't provide an initial 'chunk' for determining - * the encoding, we set the context to XML_CHAR_ENCODING_NONE so - * that it can be automatically determined later - */ - ctxt->charset = XML_CHAR_ENCODING_NONE; - if ((size != 0) && (chunk != NULL) && (ctxt->input != NULL) && (ctxt->input->buf != NULL)) { size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); @@ -12092,7 +11936,6 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input, xmlDtdPtr ret = NULL; xmlParserCtxtPtr ctxt; xmlParserInputPtr pinput = NULL; - xmlChar start[4]; if (input == NULL) return(NULL); @@ -12150,22 +11993,7 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input, ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none", BAD_CAST "none", BAD_CAST "none"); - if ((enc == XML_CHAR_ENCODING_NONE) && - ((ctxt->input->end - ctxt->input->cur) >= 4)) { - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. 
- */ - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } + xmlDetectEncoding(ctxt); xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none"); @@ -12213,7 +12041,6 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID, xmlDtdPtr ret = NULL; xmlParserCtxtPtr ctxt; xmlParserInputPtr input = NULL; - xmlCharEncoding enc; xmlChar* systemIdCanonic; if ((ExternalID == NULL) && (SystemID == NULL)) return(NULL); @@ -12258,10 +12085,8 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID, xmlFree(systemIdCanonic); return(NULL); } - if ((ctxt->input->end - ctxt->input->cur) >= 4) { - enc = xmlDetectCharEncoding(ctxt->input->cur, 4); - xmlSwitchEncoding(ctxt, enc); - } + + xmlDetectEncoding(ctxt); if (input->filename == NULL) input->filename = (char *) systemIdCanonic; @@ -12399,8 +12224,6 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt, xmlDocPtr newDoc; xmlNodePtr newRoot; xmlParserErrors ret = XML_ERR_OK; - xmlChar start[4]; - xmlCharEncoding enc; if (((depth > 40) && ((oldctxt == NULL) || (oldctxt->options & XML_PARSE_HUGE) == 0)) || @@ -12461,22 +12284,7 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt, newRoot->doc = doc; } - /* - * Get the 4 first bytes and decode the charset - * if enc != XML_CHAR_ENCODING_NONE - * plug some encoding conversion routines. 
- */ - GROW; - if ((ctxt->input->end - ctxt->input->cur) >= 4) { - start[0] = RAW; - start[1] = NXT(1); - start[2] = NXT(2); - start[3] = NXT(3); - enc = xmlDetectCharEncoding(start, 4); - if (enc != XML_CHAR_ENCODING_NONE) { - xmlSwitchEncoding(ctxt, enc); - } - } + xmlDetectEncoding(ctxt); /* * Parse a possible text declaration first @@ -12963,10 +12771,6 @@ xmlParseInNodeContext(xmlNodePtr node, const char *data, int datalen, if (doc->encoding != NULL) { xmlCharEncodingHandlerPtr hdlr; - if (ctxt->encoding != NULL) - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = xmlStrdup((const xmlChar *) doc->encoding); - hdlr = xmlFindCharEncodingHandler((const char *) doc->encoding); if (hdlr != NULL) { xmlSwitchToEncoding(ctxt, hdlr); @@ -14273,7 +14077,6 @@ xmlCtxtReset(xmlParserCtxtPtr ctxt) ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; ctxt->depth = 0; - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->catalogs = NULL; ctxt->sizeentities = 0; ctxt->sizeentcopy = 0; @@ -14374,10 +14177,6 @@ xmlCtxtResetPush(xmlParserCtxtPtr ctxt, const char *chunk, if (encoding != NULL) { xmlCharEncodingHandlerPtr hdlr; - if (ctxt->encoding != NULL) - xmlFree((xmlChar *) ctxt->encoding); - ctxt->encoding = xmlStrdup((const xmlChar *) encoding); - hdlr = xmlFindCharEncodingHandler(encoding); if (hdlr != NULL) { xmlSwitchToEncoding(ctxt, hdlr); diff --git a/parserInternals.c b/parserInternals.c index ed2d3dee..63f8372e 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -765,7 +765,7 @@ xmlNextChar(xmlParserCtxtPtr ctxt) return; } - if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { + if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) { const unsigned char *cur; unsigned char c; @@ -876,7 +876,10 @@ encoding_error: "Input is not proper UTF-8, indicate encoding !\n%s", BAD_CAST buffer, NULL); } - ctxt->charset = XML_CHAR_ENCODING_8859_1; + if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) { + ctxt->input->flags |= XML_INPUT_HAS_ENCODING; + ctxt->input->flags |= XML_INPUT_8_BIT; 
+ } ctxt->input->cur++; return; } @@ -917,7 +920,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { *len = 1; return(*ctxt->input->cur); } - if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { + if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: @@ -1040,7 +1043,10 @@ encoding_error: "Input is not proper UTF-8, indicate encoding !\n%s", BAD_CAST buffer, NULL); } - ctxt->charset = XML_CHAR_ENCODING_8859_1; + if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) { + ctxt->input->flags |= XML_INPUT_HAS_ENCODING; + ctxt->input->flags |= XML_INPUT_8_BIT; + } *len = 1; return(*ctxt->input->cur); @@ -1073,7 +1079,8 @@ int xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len) { if ((len == NULL) || (cur == NULL)) return(0); - if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) { + if ((ctxt == NULL) || (ctxt->input == NULL) || + ((ctxt->input->flags & XML_INPUT_8_BIT) == 0)) { /* * We are supposed to handle UTF8, check it's valid * From rfc2044: encoding of the Unicode values on UTF-8: @@ -1300,58 +1307,29 @@ xmlDetectEBCDIC(xmlParserInputPtr input) { * @ctxt: the parser context * @enc: the encoding value (number) * - * change the input functions when discovering the character encoding - * of a given entity. + * Use encoding specified by enum to decode input data. + * + * This function can be used to enforce the encoding of chunks passed + * to xmlParseChunk. * * Returns 0 in case of success, -1 otherwise */ int xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) { - xmlCharEncodingHandlerPtr handler; + xmlCharEncodingHandlerPtr handler = NULL; + int check = 1; int ret; - if (ctxt == NULL) return(-1); - - /* - * FIXME: The BOM shouldn't be skipped here, but in the parsing code. - * - * Note that we look for a decoded UTF-8 BOM when switching to UTF-16. 
- * This is mostly useless but Webkit/Chromium relies on this behavior. - * See https://bugs.chromium.org/p/chromium/issues/detail?id=1451026 - */ - if ((ctxt->input != NULL) && - (ctxt->input->consumed == 0) && - (ctxt->input->cur != NULL) && - (ctxt->input->cur == ctxt->input->base) && - ((enc == XML_CHAR_ENCODING_UTF8) || - (enc == XML_CHAR_ENCODING_UTF16LE) || - (enc == XML_CHAR_ENCODING_UTF16BE))) { - /* - * Errata on XML-1.0 June 20 2001 - * Specific handling of the Byte Order Mark for - * UTF-8 - */ - if ((ctxt->input->cur[0] == 0xEF) && - (ctxt->input->cur[1] == 0xBB) && - (ctxt->input->cur[2] == 0xBF)) { - ctxt->input->cur += 3; - } - } + if ((ctxt == NULL) || (ctxt->input == NULL)) + return(-1); switch (enc) { - case XML_CHAR_ENCODING_ERROR: - __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING, - "encoding unknown\n", NULL, NULL); - return(-1); case XML_CHAR_ENCODING_NONE: - /* let's assume it's UTF-8 without the XML decl */ - ctxt->charset = XML_CHAR_ENCODING_UTF8; - return(0); case XML_CHAR_ENCODING_UTF8: - /* default encoding, no conversion should be needed */ - ctxt->charset = XML_CHAR_ENCODING_UTF8; - return(0); + case XML_CHAR_ENCODING_ASCII: + check = 0; + break; case XML_CHAR_ENCODING_EBCDIC: handler = xmlDetectEBCDIC(ctxt->input); break; @@ -1359,45 +1337,28 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) handler = xmlGetCharEncodingHandler(enc); break; } - if (handler == NULL) { - /* - * Default handlers. 
- */ - switch (enc) { - case XML_CHAR_ENCODING_ASCII: - /* default encoding, no conversion should be needed */ - ctxt->charset = XML_CHAR_ENCODING_UTF8; - return(0); - case XML_CHAR_ENCODING_8859_1: - if ((ctxt->inputNr == 1) && - (ctxt->encoding == NULL) && - (ctxt->input != NULL) && - (ctxt->input->encoding != NULL)) { - ctxt->encoding = xmlStrdup(ctxt->input->encoding); - } - ctxt->charset = enc; - return(0); - default: - __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, - "encoding not supported: %s\n", - BAD_CAST xmlGetCharEncodingName(enc), NULL); - /* - * TODO: We could recover from errors in external entities - * if we didn't stop the parser. But most callers of this - * function don't check the return value. - */ - xmlStopParser(ctxt); - return(-1); - } - } - ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler); - if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) { + + if ((check) && (handler == NULL)) { + const char *name = xmlGetCharEncodingName(enc); + + __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, + "encoding not supported: %s\n", + BAD_CAST (name ? name : ""), NULL); /* - * on encoding conversion errors, stop the parser - */ + * TODO: We could recover from errors in external entities + * if we didn't stop the parser. But most callers of this + * function don't check the return value. + */ xmlStopParser(ctxt); - ctxt->errNo = XML_I18N_CONV_FAILED; + return(-1); } + + ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler); + + if ((ret >= 0) && (enc == XML_CHAR_ENCODING_NONE)) { + ctxt->input->flags &= ~XML_INPUT_HAS_ENCODING; + } + return(ret); } @@ -1407,8 +1368,9 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) * @input: the input stream * @handler: the encoding handler * - * change the input functions when discovering the character encoding - * of a given entity. + * DEPRECATED: Internal function, don't use. + * + * Use encoding handler to decode input data. 
* * Returns 0 in case of success, -1 otherwise */ @@ -1419,27 +1381,19 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, int nbchars; xmlParserInputBufferPtr in; - if (handler == NULL) - return (-1); - if (input == NULL) - return (-1); - in = input->buf; - if (in == NULL) { - xmlErrInternal(ctxt, - "static memory buffer doesn't support encoding\n", NULL); - /* - * Callers assume that the input buffer takes ownership of the - * encoding handler. xmlCharEncCloseFunc frees unregistered - * handlers and avoids a memory leak. - */ + if ((input == NULL) || (input->buf == NULL)) { xmlCharEncCloseFunc(handler); return (-1); } + in = input->buf; + + input->flags |= XML_INPUT_HAS_ENCODING; + input->flags &= ~XML_INPUT_8_BIT; + + if (in->encoder == handler) + return (0); if (in->encoder != NULL) { - if (in->encoder == handler) - return (0); - /* * Switching encodings during parsing is a really bad idea, * but Chromium can switch between ISO-8859-1 and UTF-16 before @@ -1454,7 +1408,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, return (0); } - ctxt->charset = XML_CHAR_ENCODING_UTF8; in->encoder = handler; /* @@ -1463,37 +1416,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, if (xmlBufIsEmpty(in->buffer) == 0) { size_t processed, use, consumed; - /* - * FIXME: The BOM shouldn't be skipped here, but in the parsing code. 
- */ - - /* - * Specific handling of the Byte Order Mark for - * UTF-16 - */ - if ((handler->name != NULL) && - (!strcmp(handler->name, "UTF-16LE") || - !strcmp(handler->name, "UTF-16")) && - (input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) { - input->cur += 2; - } - if ((handler->name != NULL) && - (!strcmp(handler->name, "UTF-16BE")) && - (input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) { - input->cur += 2; - } - /* - * Errata on XML-1.0 June 20 2001 - * Specific handling of the Byte Order Mark for - * UTF-8 - */ - if ((handler->name != NULL) && - (!strcmp(handler->name, "UTF-8")) && - (input->cur[0] == 0xEF) && - (input->cur[1] == 0xBB) && (input->cur[2] == 0xBF)) { - input->cur += 3; - } - /* * Shrink the current input buffer. * Move it as the raw buffer and create a new input buffer @@ -1541,8 +1463,10 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, * @ctxt: the parser context * @handler: the encoding handler * - * change the input functions when discovering the character encoding - * of a given entity. + * Use encoding handler to decode input data. + * + * This function can be used to enforce the encoding of chunks passed + * to xmlParseChunk. * * Returns 0 in case of success, -1 otherwise */ @@ -1554,6 +1478,185 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) return(xmlSwitchInputEncoding(ctxt, ctxt->input, handler)); } +/** + * xmlDetectEncoding: + * @ctxt: the parser context + * + * Handle optional BOM, detect and switch to encoding. + * + * Assumes that there are at least four bytes in the input buffer. + */ +void +xmlDetectEncoding(xmlParserCtxtPtr ctxt) { + const xmlChar *in = ctxt->input->cur; + xmlCharEncoding enc; + int bomSize; + int autoFlag = 0; + + if (xmlParserGrow(ctxt) < 0) + return; + if (ctxt->input->end - in < 4) + return; + + if (ctxt->input->flags & XML_INPUT_HAS_ENCODING) { + /* + * If the encoding was already set, only skip the BOM which was + * possibly decoded to UTF-8. 
+ */ + if ((in[0] == 0xEF) && (in[1] == 0xBB) && (in[2] == 0xBF)) { + ctxt->input->cur += 3; + } + + return; + } + + enc = XML_CHAR_ENCODING_NONE; + bomSize = 0; + + switch (in[0]) { + case 0x00: + if ((in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) { + enc = XML_CHAR_ENCODING_UCS4BE; + autoFlag = XML_INPUT_AUTO_OTHER; + } else if ((in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) { + enc = XML_CHAR_ENCODING_UTF16BE; + autoFlag = XML_INPUT_AUTO_UTF16BE; + } + break; + + case 0x3C: + if (in[1] == 0x00) { + if ((in[2] == 0x00) && (in[3] == 0x00)) { + enc = XML_CHAR_ENCODING_UCS4LE; + autoFlag = XML_INPUT_AUTO_OTHER; + } else if ((in[2] == 0x3F) && (in[3] == 0x00)) { + enc = XML_CHAR_ENCODING_UTF16LE; + autoFlag = XML_INPUT_AUTO_UTF16LE; + } + } + break; + + case 0x4C: + if ((in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) { + enc = XML_CHAR_ENCODING_EBCDIC; + autoFlag = XML_INPUT_AUTO_OTHER; + } + break; + + case 0xEF: + if ((in[1] == 0xBB) && (in[2] == 0xBF)) { + enc = XML_CHAR_ENCODING_UTF8; + autoFlag = XML_INPUT_AUTO_UTF8; + bomSize = 3; + } + break; + + case 0xFE: + if (in[1] == 0xFF) { + enc = XML_CHAR_ENCODING_UTF16BE; + autoFlag = XML_INPUT_AUTO_UTF16BE; + bomSize = 2; + } + break; + + case 0xFF: + if (in[1] == 0xFE) { + enc = XML_CHAR_ENCODING_UTF16LE; + autoFlag = XML_INPUT_AUTO_UTF16LE; + bomSize = 2; + } + break; + } + + if (bomSize > 0) { + ctxt->input->cur += bomSize; + } + + if (enc != XML_CHAR_ENCODING_NONE) { + ctxt->input->flags |= autoFlag; + xmlSwitchEncoding(ctxt, enc); + } +} + +/** + * xmlSetDeclaredEncoding: + * @ctxt: the parser context + * @encoding: declared encoding + * + * Set the encoding from a declaration in the document. + * + * If no encoding was set yet, switch the encoding. Otherwise, only warn + * about encoding mismatches. + * + * Takes ownership of 'encoding'. 
+ */ +void +xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) { + if (ctxt->encoding != NULL) + xmlFree((xmlChar *) ctxt->encoding); + ctxt->encoding = encoding; + + if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) && + ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) { + xmlCharEncodingHandlerPtr handler; + + handler = xmlFindCharEncodingHandler((const char *) encoding); + if (handler != NULL) { + xmlSwitchToEncoding(ctxt, handler); + } else { + __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, + "Unsupported encoding: %s\n", + encoding, NULL); + } + } else if (ctxt->input->flags & XML_INPUT_AUTO_ENCODING) { + static const char *allowedUTF8[] = { + "UTF-8", "UTF8", NULL + }; + static const char *allowedUTF16LE[] = { + "UTF-16", "UTF-16LE", "UTF16", NULL + }; + static const char *allowedUTF16BE[] = { + "UTF-16", "UTF-16BE", "UTF16", NULL + }; + const char **allowed = NULL; + const char *autoEnc = NULL; + + switch (ctxt->input->flags & XML_INPUT_AUTO_ENCODING) { + case XML_INPUT_AUTO_UTF8: + allowed = allowedUTF8; + autoEnc = "UTF-8"; + break; + case XML_INPUT_AUTO_UTF16LE: + allowed = allowedUTF16LE; + autoEnc = "UTF-16LE"; + break; + case XML_INPUT_AUTO_UTF16BE: + allowed = allowedUTF16BE; + autoEnc = "UTF-16BE"; + break; + } + + if (allowed != NULL) { + const char **p; + int match = 0; + + for (p = allowed; *p != NULL; p++) { + if (xmlStrcasecmp(encoding, BAD_CAST *p) == 0) { + match = 1; + break; + } + } + + if (match == 0) { + xmlWarningMsg(ctxt, XML_WAR_ENCODING_MISMATCH, + "Encoding '%s' doesn't match " + "auto-detected '%s'\n", + encoding, BAD_CAST autoEnc); + } + } + } +} + /************************************************************************ * * * Commodity functions to handle entities processing * @@ -1572,7 +1675,6 @@ xmlFreeInputStream(xmlParserInputPtr input) { if (input->filename != NULL) xmlFree((char *) input->filename); if (input->directory != NULL) xmlFree((char *) input->directory); - if (input->encoding != NULL) 
xmlFree((char *) input->encoding); if (input->version != NULL) xmlFree((char *) input->version); if ((input->free != NULL) && (input->base != NULL)) input->free((xmlChar *) input->base); @@ -2015,7 +2117,6 @@ xmlInitSAXParserCtxt(xmlParserCtxtPtr ctxt, const xmlSAXHandler *sax, ctxt->inSubset = 0; ctxt->errNo = XML_ERR_OK; ctxt->depth = 0; - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->catalogs = NULL; ctxt->sizeentities = 0; ctxt->sizeentcopy = 0; diff --git a/testchar.c b/testchar.c index a819e196..20d4296d 100644 --- a/testchar.c +++ b/testchar.c @@ -271,11 +271,11 @@ static int testCharRangeByte1(xmlParserCtxtPtr ctxt) { data[3] = 0; for (i = 0;i <= 0xFF;i++) { data[0] = (char) i; - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->nbErrors = 0; lastError = 0; c = xmlCurrentChar(ctxt, &len); + ctxt->input->flags = 0; if ((i == 0) || (i >= 0x80)) { /* we must see an error there */ if (lastError != XML_ERR_INVALID_CHAR) { @@ -307,11 +307,11 @@ static int testCharRangeByte2(xmlParserCtxtPtr ctxt) { for (j = 0;j <= 0xFF;j++) { data[0] = (char) i; data[1] = (char) j; - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->nbErrors = 0; lastError = 0; c = xmlCurrentChar(ctxt, &len); + ctxt->input->flags = 0; /* if first bit of first char is set, then second bit must too */ if ((i & 0x80) && ((i & 0x40) == 0)) { @@ -401,11 +401,11 @@ static int testCharRangeByte3(xmlParserCtxtPtr ctxt) { K = lows[k]; data[2] = (char) K; value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12); - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->nbErrors = 0; lastError = 0; c = xmlCurrentChar(ctxt, &len); + ctxt->input->flags = 0; /* * if fourth bit of first char is set, then the sequence would need @@ -504,11 +504,11 @@ static int testCharRangeByte4(xmlParserCtxtPtr ctxt) { data[3] = (char) L; value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) + ((i & 0x7) << 18); - ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->nbErrors = 0; lastError = 0; c = xmlCurrentChar(ctxt, &len); + 
ctxt->input->flags = 0; /* * if fifth bit of first char is set, then the sequence would need diff --git a/xmlIO.c b/xmlIO.c index 9fd9c780..490a82e7 100644 --- a/xmlIO.c +++ b/xmlIO.c @@ -3790,8 +3790,6 @@ xmlCheckHTTPInput(xmlParserCtxtPtr ctxt, xmlParserInputPtr ret) { "Unknown encoding %s", BAD_CAST encoding, NULL); } - if (ret->encoding == NULL) - ret->encoding = xmlStrdup(BAD_CAST encoding); } #if 0 } else if (xmlStrstr(BAD_CAST mime, BAD_CAST "html")) {