diff --git a/SAX2.c b/SAX2.c index a5ef2436..753e6df3 100644 --- a/SAX2.c +++ b/SAX2.c @@ -977,10 +977,6 @@ xmlSAX2StartDocument(void *ctx) if (ctxt->options & XML_PARSE_OLD10) doc->properties |= XML_DOC_OLD10; doc->parseFlags = ctxt->options; - if (ctxt->encoding != NULL) - doc->encoding = xmlStrdup(ctxt->encoding); - else - doc->encoding = NULL; doc->standalone = ctxt->standalone; } else { xmlSAX2ErrMemory(ctxt, "xmlSAX2StartDocument"); @@ -1009,6 +1005,8 @@ void xmlSAX2EndDocument(void *ctx) { xmlParserCtxtPtr ctxt = (xmlParserCtxtPtr) ctx; + xmlDocPtr doc; + #ifdef DEBUG_SAX xmlGenericError(xmlGenericErrorContext, "SAX.xmlSAX2EndDocument()\n"); @@ -1020,13 +1018,25 @@ xmlSAX2EndDocument(void *ctx) ctxt->valid &= xmlValidateDocumentFinal(&ctxt->vctxt, ctxt->myDoc); #endif /* LIBXML_VALID_ENABLED */ - /* - * Grab the encoding if it was added on-the-fly - */ - if ((ctxt->encoding != NULL) && (ctxt->myDoc != NULL) && - (ctxt->myDoc->encoding == NULL)) { - ctxt->myDoc->encoding = ctxt->encoding; - ctxt->encoding = NULL; + doc = ctxt->myDoc; + if ((doc != NULL) && (doc->encoding == NULL)) { + const xmlChar *encoding = NULL; + + if ((ctxt->input->flags & XML_INPUT_USES_ENC_DECL) || + (ctxt->input->flags & XML_INPUT_AUTO_ENCODING)) { + /* Preserve encoding exactly */ + encoding = ctxt->encoding; + } else if ((ctxt->input->buf) && (ctxt->input->buf->encoder)) { + encoding = BAD_CAST ctxt->input->buf->encoder->name; + } else if (ctxt->input->flags & XML_INPUT_HAS_ENCODING) { + encoding = BAD_CAST "UTF-8"; + } + + if (encoding != NULL) { + doc->encoding = xmlStrdup(encoding); + if (doc->encoding == NULL) + xmlSAX2ErrMemory(ctxt, "xmlSAX2EndDocument"); + } } } diff --git a/include/libxml/tree.h b/include/libxml/tree.h index 4e5bf434..efada857 100644 --- a/include/libxml/tree.h +++ b/include/libxml/tree.h @@ -573,7 +573,7 @@ struct _xmlDoc { struct _xmlDtd *extSubset; /* the document external subset */ struct _xmlNs *oldNs; /* Global namespace, the old way */ const xmlChar *version; /* the XML version string */ - const xmlChar *encoding; /* encoding from XML declaration, if any */ + const xmlChar *encoding; /* actual encoding, if any */ void *ids; /* Hash table for ID attributes if any */ void *refs; /* Hash table for IDREFs attributes if any */ const xmlChar *URL; /* The URI for that document */ diff --git a/include/private/parser.h b/include/private/parser.h index bc4bc0d1..50cf2187 100644 --- a/include/private/parser.h +++ b/include/private/parser.h @@ -23,7 +23,8 @@ #define XML_INPUT_AUTO_UTF16LE (2u << 1) #define XML_INPUT_AUTO_UTF16BE (3u << 1) #define XML_INPUT_AUTO_OTHER (4u << 1) -#define XML_INPUT_8_BIT (1u << 4) +#define XML_INPUT_USES_ENC_DECL (1u << 4) +#define XML_INPUT_8_BIT (1u << 5) XML_HIDDEN void xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra); diff --git a/parserInternals.c b/parserInternals.c index df720b24..c5cfd4b8 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -1590,13 +1590,15 @@ xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) { xmlCharEncodingHandlerPtr handler; handler = xmlFindCharEncodingHandler((const char *) encoding); - if (handler != NULL) { - xmlSwitchToEncoding(ctxt, handler); - } else { + if (handler == NULL) { __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, "Unsupported encoding: %s\n", encoding, NULL); + return; } + + xmlSwitchToEncoding(ctxt, handler); + ctxt->input->flags |= XML_INPUT_USES_ENC_DECL; } else if (ctxt->input->flags & XML_INPUT_AUTO_ENCODING) { static const char *allowedUTF8[] = { "UTF-8", "UTF8", NULL