diff --git a/HTMLparser.c b/HTMLparser.c
index 67ee6654..d0fa178b 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -350,8 +350,7 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) {
const xmlChar *start, *cur, *end;
if ((ctxt == NULL) || (ctxt->input == NULL) ||
- (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
- (ctxt->input->buf->encoder != NULL))
+ (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
return(NULL);
if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
return(NULL);
@@ -417,7 +416,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
return(0);
}
- if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
+ if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
xmlChar * guess;
xmlCharEncodingHandlerPtr handler;
@@ -444,10 +443,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
} else {
- if (ctxt->input->encoding != NULL)
- xmlFree((xmlChar *) ctxt->input->encoding);
- ctxt->input->encoding = guess;
handler = xmlFindCharEncodingHandler((const char *) guess);
if (handler != NULL) {
/*
* Don't use UTF-8 encoder which isn't required and
@@ -460,7 +456,8 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
"Unsupported encoding %s", guess, NULL);
}
+ xmlFree(guess); /* freed last: the error path above still reads guess */
}
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
+ ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
}
/*
@@ -537,13 +534,6 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
}
encoding_error:
- /*
- * If we detect an UTF8 error that probably mean that the
- * input encoding didn't get properly advertised in the
- * declaration header. Report the error and switch the encoding
- * to ISO-Latin-1 (if you don't like this policy, just declare the
- * encoding !)
- */
{
char buffer[150];
@@ -559,15 +549,7 @@ encoding_error:
BAD_CAST buffer, NULL);
}
- /*
- * Don't switch encodings twice. Note that if there's an encoder, we
- * shouldn't receive invalid UTF-8 anyway.
- *
- * Note that if ctxt->input->buf == NULL, switching encodings is
- * impossible, see Gitlab issue #34.
- */
- if ((ctxt->input->buf != NULL) &&
- (ctxt->input->buf->encoder == NULL))
+ if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
*len = 1;
return(*ctxt->input->cur);
@@ -3781,94 +3763,6 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
return(name);
}
-/**
- * htmlCheckEncodingDirect:
- * @ctxt: an HTML parser context
- * @attvalue: the attribute value
- *
- * Checks an attribute value to detect
- * the encoding
- * If a new encoding is detected the parser is switched to decode
- * it and pass UTF8
- */
-static void
-htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
-
- if ((ctxt == NULL) || (encoding == NULL) ||
- (ctxt->options & HTML_PARSE_IGNORE_ENC))
- return;
-
- /* do not change encoding */
- if (ctxt->input->encoding != NULL)
- return;
-
- if (encoding != NULL) {
- xmlCharEncoding enc;
- xmlCharEncodingHandlerPtr handler;
-
- while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
-
- if (ctxt->input->encoding != NULL)
- xmlFree((xmlChar *) ctxt->input->encoding);
- ctxt->input->encoding = xmlStrdup(encoding);
-
- enc = xmlParseCharEncoding((const char *) encoding);
- /*
- * registered set of known encodings
- */
- if (enc != XML_CHAR_ENCODING_ERROR) {
- if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
- (enc == XML_CHAR_ENCODING_UTF16BE) ||
- (enc == XML_CHAR_ENCODING_UCS4LE) ||
- (enc == XML_CHAR_ENCODING_UCS4BE)) &&
- (ctxt->input->buf != NULL) &&
- (ctxt->input->buf->encoder == NULL)) {
- htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
- "htmlCheckEncoding: wrong encoding meta\n",
- NULL, NULL);
- } else {
- xmlSwitchEncoding(ctxt, enc);
- }
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
- } else {
- /*
- * fallback for unknown encodings
- */
- handler = xmlFindCharEncodingHandler((const char *) encoding);
- if (handler != NULL) {
- xmlSwitchToEncoding(ctxt, handler);
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
- } else {
- htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
- "htmlCheckEncoding: unknown encoding %s\n",
- encoding, NULL);
- }
- }
-
- if ((ctxt->input->buf != NULL) &&
- (ctxt->input->buf->encoder != NULL) &&
- (ctxt->input->buf->raw != NULL) &&
- (ctxt->input->buf->buffer != NULL)) {
- int nbchars;
- size_t processed;
-
- /*
- * convert as much as possible to the parser reading buffer.
- */
- processed = ctxt->input->cur - ctxt->input->base;
- xmlBufShrink(ctxt->input->buf->buffer, processed);
- nbchars = xmlCharEncInput(ctxt->input->buf, 1);
- xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
- if (nbchars < 0) {
- htmlParseErr(ctxt, ctxt->input->buf->error,
- "htmlCheckEncoding: encoder error\n",
- NULL, NULL);
- xmlHaltParser(ctxt);
- }
- }
- }
-}
-
/**
* htmlCheckEncoding:
* @ctxt: an HTML parser context
@@ -3897,7 +3791,7 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
if (encoding && *encoding == '=') {
encoding ++;
- htmlCheckEncodingDirect(ctxt, encoding);
+ xmlSetDeclaredEncoding(ctxt, xmlStrdup(encoding));
}
}
@@ -3926,7 +3820,7 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
http = 1;
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
- htmlCheckEncodingDirect(ctxt, value);
+ xmlSetDeclaredEncoding(ctxt, xmlStrdup(value));
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
content = value;
att = atts[i++];
@@ -4953,8 +4847,6 @@ __htmlParseContent(void *ctxt) {
int
htmlParseDocument(htmlParserCtxtPtr ctxt) {
- xmlChar start[4];
- xmlCharEncoding enc;
xmlDtdPtr dtd;
xmlInitParser();
@@ -4964,29 +4856,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
"htmlParseDocument: context error\n", NULL, NULL);
return(XML_ERR_INTERNAL_ERROR);
}
- GROW;
+
/*
* SAX: beginning of the document processing.
*/
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
- if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
- ((ctxt->input->end - ctxt->input->cur) >= 4)) {
- /*
- * Get the 4 first bytes and decode the charset
- * if enc != XML_CHAR_ENCODING_NONE
- * plug some encoding conversion routines.
- */
- start[0] = RAW;
- start[1] = NXT(1);
- start[2] = NXT(2);
- start[3] = NXT(3);
- enc = xmlDetectCharEncoding(&start[0], 4);
- if (enc != XML_CHAR_ENCODING_NONE) {
- xmlSwitchEncoding(ctxt, enc);
- }
- }
+ xmlDetectEncoding(ctxt);
/*
* Wipe out everything which is before the first '<'
@@ -5317,10 +5194,6 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
xmlCharEncoding enc;
xmlCharEncodingHandlerPtr handler;
- if (ctxt->input->encoding != NULL)
- xmlFree((xmlChar *) ctxt->input->encoding);
- ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
-
enc = xmlParseCharEncoding(encoding);
/*
* registered set of known encodings
@@ -6265,8 +6138,6 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
xmlFreeParserInputBuffer(buf);
return(NULL);
}
- if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
- ctxt->charset=XML_CHAR_ENCODING_UTF8;
if (filename == NULL) {
ctxt->directory = NULL;
} else {
@@ -6722,7 +6593,6 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)
ctxt->inSubset = 0;
ctxt->errNo = XML_ERR_OK;
ctxt->depth = 0;
- ctxt->charset = XML_CHAR_ENCODING_NONE;
ctxt->catalogs = NULL;
xmlInitNodeInfoSeq(&ctxt->node_seq);
@@ -6839,9 +6709,6 @@ htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
hdlr = xmlFindCharEncodingHandler(encoding);
if (hdlr != NULL) {
xmlSwitchToEncoding(ctxt, hdlr);
- if (ctxt->input->encoding != NULL)
- xmlFree((xmlChar *) ctxt->input->encoding);
- ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
}
}
if ((URL != NULL) && (ctxt->input != NULL) &&
diff --git a/SAX2.c b/SAX2.c
index 968da080..07c5c017 100644
--- a/SAX2.c
+++ b/SAX2.c
@@ -384,8 +384,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
int oldinputMax;
xmlParserInputPtr *oldinputTab;
xmlParserInputPtr input = NULL;
- xmlCharEncoding enc;
- int oldcharset;
const xmlChar *oldencoding;
int oldprogressive;
unsigned long consumed;
@@ -410,7 +408,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
oldinputNr = ctxt->inputNr;
oldinputMax = ctxt->inputMax;
oldinputTab = ctxt->inputTab;
- oldcharset = ctxt->charset;
oldencoding = ctxt->encoding;
oldprogressive = ctxt->progressive;
ctxt->encoding = NULL;
@@ -425,7 +422,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
ctxt->inputNr = oldinputNr;
ctxt->inputMax = oldinputMax;
ctxt->inputTab = oldinputTab;
- ctxt->charset = oldcharset;
ctxt->encoding = oldencoding;
ctxt->progressive = oldprogressive;
return;
@@ -435,14 +431,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
ctxt->input = NULL;
xmlPushInput(ctxt, input);
- /*
- * On the fly encoding conversion if needed
- */
- if (ctxt->input->length >= 4) {
- enc = xmlDetectCharEncoding(ctxt->input->cur, 4);
- xmlSwitchEncoding(ctxt, enc);
- }
-
if (input->filename == NULL)
input->filename = (char *) xmlCanonicPath(SystemID);
input->line = 1;
@@ -484,7 +472,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
ctxt->inputNr = oldinputNr;
ctxt->inputMax = oldinputMax;
ctxt->inputTab = oldinputTab;
- ctxt->charset = oldcharset;
if ((ctxt->encoding != NULL) &&
((ctxt->dict == NULL) ||
(!xmlDictOwns(ctxt->dict, ctxt->encoding))))
@@ -1041,16 +1028,6 @@ xmlSAX2EndDocument(void *ctx)
ctxt->myDoc->encoding = ctxt->encoding;
ctxt->encoding = NULL;
}
- if ((ctxt->inputTab != NULL) &&
- (ctxt->inputNr > 0) && (ctxt->inputTab[0] != NULL) &&
- (ctxt->inputTab[0]->encoding != NULL) && (ctxt->myDoc != NULL) &&
- (ctxt->myDoc->encoding == NULL)) {
- ctxt->myDoc->encoding = xmlStrdup(ctxt->inputTab[0]->encoding);
- }
- if ((ctxt->charset != XML_CHAR_ENCODING_NONE) && (ctxt->myDoc != NULL) &&
- (ctxt->myDoc->charset == XML_CHAR_ENCODING_NONE)) {
- ctxt->myDoc->charset = ctxt->charset;
- }
}
#if defined(LIBXML_SAX1_ENABLED) || defined(LIBXML_HTML_ENABLED) || defined(LIBXML_WRITER_ENABLED) || defined(LIBXML_LEGACY_ENABLED)
diff --git a/include/libxml/parser.h b/include/libxml/parser.h
index 950ebe32..e1955a08 100644
--- a/include/libxml/parser.h
+++ b/include/libxml/parser.h
@@ -63,9 +63,9 @@ struct _xmlParserInput {
int col; /* Current column */
unsigned long consumed; /* How many xmlChars already consumed */
xmlParserInputDeallocate free; /* function to deallocate the base */
- const xmlChar *encoding; /* the encoding string for entity */
+ const xmlChar *encoding; /* unused */
const xmlChar *version; /* the version string for entity */
- int standalone; /* Was that entity marked standalone */
+ int flags; /* Flags */
int id; /* an unique identifier for the entity */
unsigned long parentConsumed; /* consumed bytes from parents */
xmlEntityPtr entity; /* entity, if any */
@@ -122,7 +122,8 @@ typedef enum {
XML_PARSER_SYSTEM_LITERAL, /* within a SYSTEM value */
XML_PARSER_EPILOG, /* the Misc* after the last end tag */
XML_PARSER_IGNORE, /* within an IGNORED section */
- XML_PARSER_PUBLIC_LITERAL /* within a PUBLIC value */
+ XML_PARSER_PUBLIC_LITERAL, /* within a PUBLIC value */
+ XML_PARSER_XML_DECL /* before XML decl (but after BOM) */
} xmlParserInputState;
/**
@@ -245,8 +246,7 @@ struct _xmlParserCtxt {
int depth; /* to prevent entity substitution loops */
xmlParserInputPtr entity; /* used to check entities boundaries */
- int charset; /* encoding of the in-memory content
- actually an xmlCharEncoding */
+ int charset; /* unused */
int nodelen; /* Those two fields are there to */
int nodemem; /* Speed up large node parsing */
int pedantic; /* signal pedantic warnings */
diff --git a/include/libxml/tree.h b/include/libxml/tree.h
index a1cabf69..4e5bf434 100644
--- a/include/libxml/tree.h
+++ b/include/libxml/tree.h
@@ -573,12 +573,11 @@ struct _xmlDoc {
struct _xmlDtd *extSubset; /* the document external subset */
struct _xmlNs *oldNs; /* Global namespace, the old way */
const xmlChar *version; /* the XML version string */
- const xmlChar *encoding; /* external initial encoding, if any */
+ const xmlChar *encoding; /* encoding from XML declaration, if any */
void *ids; /* Hash table for ID attributes if any */
void *refs; /* Hash table for IDREFs attributes if any */
const xmlChar *URL; /* The URI for that document */
- int charset; /* Internal flag for charset handling,
- actually an xmlCharEncoding */
+ int charset; /* unused */
struct _xmlDict *dict; /* dict used to allocate names or NULL */
void *psvi; /* for type/PSVI information */
int parseFlags; /* set of xmlParserOption used to parse the
diff --git a/include/libxml/xmlerror.h b/include/libxml/xmlerror.h
index 830b4a68..63ddaa95 100644
--- a/include/libxml/xmlerror.h
+++ b/include/libxml/xmlerror.h
@@ -210,6 +210,7 @@ typedef enum {
XML_ERR_NAME_TOO_LONG, /* 110 */
XML_ERR_USER_STOP, /* 111 */
XML_ERR_COMMENT_ABRUPTLY_ENDED, /* 112 */
+ XML_WAR_ENCODING_MISMATCH, /* 113 */
XML_NS_ERR_XML_NAMESPACE = 200,
XML_NS_ERR_UNDEFINED_NAMESPACE, /* 201 */
XML_NS_ERR_QNAME, /* 202 */
diff --git a/include/private/parser.h b/include/private/parser.h
index bf933f7d..bc4bc0d1 100644
--- a/include/private/parser.h
+++ b/include/private/parser.h
@@ -17,10 +17,21 @@
*/
#define XML_VCTXT_USE_PCTXT (1u << 1)
+#define XML_INPUT_HAS_ENCODING (1u << 0) /* encoding already settled for this input; don't guess or switch again */
+#define XML_INPUT_AUTO_ENCODING (7u << 1) /* mask covering the XML_INPUT_AUTO_* values below */
+#define XML_INPUT_AUTO_UTF8 (1u << 1) /* presumably: UTF-8 was auto-detected -- TODO confirm */
+#define XML_INPUT_AUTO_UTF16LE (2u << 1) /* presumably: UTF-16LE was auto-detected -- TODO confirm */
+#define XML_INPUT_AUTO_UTF16BE (3u << 1) /* presumably: UTF-16BE was auto-detected -- TODO confirm */
+#define XML_INPUT_AUTO_OTHER (4u << 1) /* presumably: some other encoding was auto-detected -- TODO confirm */
+#define XML_INPUT_8_BIT (1u << 4) /* char readers skip UTF-8 decoding for this input */
+
XML_HIDDEN void
xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra);
XML_HIDDEN void
xmlFatalErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *info);
+XML_HIDDEN void LIBXML_ATTR_FORMAT(3,0)
+xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error,
+ const char *msg, const xmlChar *str1, const xmlChar *str2);
XML_HIDDEN void
__xmlErrEncoding(xmlParserCtxtPtr ctxt, xmlParserErrors xmlerr,
const char *msg, const xmlChar *str1,
@@ -32,4 +43,9 @@ xmlParserGrow(xmlParserCtxtPtr ctxt);
XML_HIDDEN void
xmlParserShrink(xmlParserCtxtPtr ctxt);
+XML_HIDDEN void
+xmlDetectEncoding(xmlParserCtxtPtr ctxt);
+XML_HIDDEN void
+xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding);
+
#endif /* XML_PARSER_H_PRIVATE__ */
diff --git a/parser.c b/parser.c
index 942029a6..bb4f0e2c 100644
--- a/parser.c
+++ b/parser.c
@@ -281,7 +281,7 @@ xmlFatalErrMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error,
*
* Handle a warning.
*/
-static void LIBXML_ATTR_FORMAT(3,0)
+void LIBXML_ATTR_FORMAT(3,0)
xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error,
const char *msg, const xmlChar *str1, const xmlChar *str2)
{
@@ -2313,6 +2313,7 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) {
return;
case XML_PARSER_PROLOG:
case XML_PARSER_START:
+ case XML_PARSER_XML_DECL:
case XML_PARSER_MISC:
xmlFatalErr(ctxt, XML_ERR_PEREF_IN_PROLOG, NULL);
return;
@@ -6682,7 +6683,6 @@ xmlParseMarkupDecl(xmlParserCtxtPtr ctxt) {
void
xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
xmlChar *version;
- const xmlChar *encoding;
int oldstate;
/*
@@ -6721,7 +6721,7 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
/*
* We must have the encoding declaration
*/
- encoding = xmlParseEncodingDecl(ctxt);
+ xmlParseEncodingDecl(ctxt);
if (ctxt->instate == XML_PARSER_EOF)
return;
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
@@ -6731,10 +6731,6 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
ctxt->instate = oldstate;
return;
}
- if ((encoding == NULL) && (ctxt->errNo == XML_ERR_OK)) {
- xmlFatalErrMsg(ctxt, XML_ERR_MISSING_ENCODING,
- "Missing encoding in text declaration\n");
- }
SKIP_BLANKS;
if ((RAW == '?') && (NXT(1) == '>')) {
@@ -6773,21 +6769,8 @@ void
xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID,
const xmlChar *SystemID) {
xmlDetectSAX2(ctxt);
- GROW;
- if ((ctxt->encoding == NULL) &&
- (ctxt->input->end - ctxt->input->cur >= 4)) {
- xmlChar start[4];
- xmlCharEncoding enc;
-
- start[0] = RAW;
- start[1] = NXT(1);
- start[2] = NXT(2);
- start[3] = NXT(3);
- enc = xmlDetectCharEncoding(start, 4);
- if (enc != XML_CHAR_ENCODING_NONE)
- xmlSwitchEncoding(ctxt, enc);
- }
+ xmlDetectEncoding(ctxt);
if (CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) {
xmlParseTextDecl(ctxt);
@@ -7727,8 +7710,6 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt)
"Internal: %%%s; is not a parameter entity\n",
name, NULL);
} else {
- xmlChar start[4];
- xmlCharEncoding enc;
unsigned long parentConsumed;
xmlEntityPtr oldEnt;
@@ -7769,28 +7750,7 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt)
input->parentConsumed = parentConsumed;
if (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) {
- /*
- * Get the 4 first bytes and decode the charset
- * if enc != XML_CHAR_ENCODING_NONE
- * plug some encoding conversion routines.
- * Note that, since we may have some non-UTF8
- * encoding (like UTF16, bug 135229), the 'length'
- * is not known, but we can calculate based upon
- * the amount of data in the buffer.
- */
- GROW
- if (ctxt->instate == XML_PARSER_EOF)
- return;
- if ((ctxt->input->end - ctxt->input->cur)>=4) {
- start[0] = RAW;
- start[1] = NXT(1);
- start[2] = NXT(2);
- start[3] = NXT(3);
- enc = xmlDetectCharEncoding(start, 4);
- if (enc != XML_CHAR_ENCODING_NONE) {
- xmlSwitchEncoding(ctxt, enc);
- }
- }
+ xmlDetectEncoding(ctxt);
if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) &&
(IS_BLANK_CH(NXT(5)))) {
@@ -10094,101 +10054,45 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
xmlChar *encoding = NULL;
SKIP_BLANKS;
- if (CMP8(CUR_PTR, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g')) {
- SKIP(8);
- SKIP_BLANKS;
- if (RAW != '=') {
- xmlFatalErr(ctxt, XML_ERR_EQUAL_REQUIRED, NULL);
- return(NULL);
- }
- NEXT;
- SKIP_BLANKS;
- if (RAW == '"') {
- NEXT;
- encoding = xmlParseEncName(ctxt);
- if (RAW != '"') {
- xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL);
- xmlFree((xmlChar *) encoding);
- return(NULL);
- } else
- NEXT;
- } else if (RAW == '\''){
- NEXT;
- encoding = xmlParseEncName(ctxt);
- if (RAW != '\'') {
- xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL);
- xmlFree((xmlChar *) encoding);
- return(NULL);
- } else
- NEXT;
- } else {
- xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL);
- }
+ if (CMP8(CUR_PTR, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g') == 0)
+ return(NULL);
- /*
- * Non standard parsing, allowing the user to ignore encoding
- */
- if (ctxt->options & XML_PARSE_IGNORE_ENC) {
- xmlFree((xmlChar *) encoding);
- return(NULL);
- }
-
- /*
- * UTF-16 encoding switch has already taken place at this stage,
- * more over the little-endian/big-endian selection is already done
- */
- if ((encoding != NULL) &&
- ((!xmlStrcasecmp(encoding, BAD_CAST "UTF-16")) ||
- (!xmlStrcasecmp(encoding, BAD_CAST "UTF16")))) {
- /*
- * If no encoding was passed to the parser, that we are
- * using UTF-16 and no decoder is present i.e. the
- * document is apparently UTF-8 compatible, then raise an
- * encoding mismatch fatal error
- */
- if ((ctxt->encoding == NULL) &&
- (ctxt->input->buf != NULL) &&
- (ctxt->input->buf->encoder == NULL)) {
- xmlFatalErrMsg(ctxt, XML_ERR_INVALID_ENCODING,
- "Document labelled UTF-16 but has UTF-8 content\n");
- }
- if (ctxt->encoding != NULL)
- xmlFree((xmlChar *) ctxt->encoding);
- ctxt->encoding = encoding;
- }
- /*
- * UTF-8 encoding is handled natively
- */
- else if ((encoding != NULL) &&
- ((!xmlStrcasecmp(encoding, BAD_CAST "UTF-8")) ||
- (!xmlStrcasecmp(encoding, BAD_CAST "UTF8")))) {
- /* TODO: Check for encoding mismatch. */
- if (ctxt->encoding != NULL)
- xmlFree((xmlChar *) ctxt->encoding);
- ctxt->encoding = encoding;
- }
- else if (encoding != NULL) {
- xmlCharEncodingHandlerPtr handler;
-
- if (ctxt->input->encoding != NULL)
- xmlFree((xmlChar *) ctxt->input->encoding);
- ctxt->input->encoding = encoding;
-
- handler = xmlFindCharEncodingHandler((const char *) encoding);
- if (handler != NULL) {
- if (xmlSwitchToEncoding(ctxt, handler) < 0) {
- /* failed to convert */
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
- return(NULL);
- }
- } else {
- xmlFatalErrMsgStr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
- "Unsupported encoding %s\n", encoding);
- return(NULL);
- }
- }
+ SKIP(8);
+ SKIP_BLANKS;
+ if (RAW != '=') {
+ xmlFatalErr(ctxt, XML_ERR_EQUAL_REQUIRED, NULL);
+ return(NULL);
}
- return(encoding);
+ NEXT;
+ SKIP_BLANKS;
+ if (RAW == '"') {
+ NEXT;
+ encoding = xmlParseEncName(ctxt);
+ if (RAW != '"') {
+ xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL);
+ xmlFree((xmlChar *) encoding);
+ return(NULL);
+ } else
+ NEXT;
+ } else if (RAW == '\''){
+ NEXT;
+ encoding = xmlParseEncName(ctxt);
+ if (RAW != '\'') {
+ xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL);
+ xmlFree((xmlChar *) encoding);
+ return(NULL);
+ } else
+ NEXT;
+ } else {
+ xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL);
+ }
+
+ if (encoding == NULL)
+ return(NULL);
+
+ xmlSetDeclaredEncoding(ctxt, encoding);
+
+ return(ctxt->encoding);
}
/**
@@ -10365,7 +10269,7 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
/*
* We may have the standalone status.
*/
- if ((ctxt->input->encoding != NULL) && (!IS_BLANK_CH(RAW))) {
+ if ((ctxt->encoding != NULL) && (!IS_BLANK_CH(RAW))) {
if ((RAW == '?') && (NXT(1) == '>')) {
SKIP(2);
return;
@@ -10443,9 +10347,6 @@ xmlParseMisc(xmlParserCtxtPtr ctxt) {
int
xmlParseDocument(xmlParserCtxtPtr ctxt) {
- xmlChar start[4];
- xmlCharEncoding enc;
-
xmlInitParser();
if ((ctxt == NULL) || (ctxt->input == NULL))
@@ -10466,23 +10367,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
if (ctxt->instate == XML_PARSER_EOF)
return(-1);
- if ((ctxt->encoding == NULL) &&
- ((ctxt->input->end - ctxt->input->cur) >= 4)) {
- /*
- * Get the 4 first bytes and decode the charset
- * if enc != XML_CHAR_ENCODING_NONE
- * plug some encoding conversion routines.
- */
- start[0] = RAW;
- start[1] = NXT(1);
- start[2] = NXT(2);
- start[3] = NXT(3);
- enc = xmlDetectCharEncoding(&start[0], 4);
- if (enc != XML_CHAR_ENCODING_NONE) {
- xmlSwitchEncoding(ctxt, enc);
- }
- }
-
+ xmlDetectEncoding(ctxt);
if (CUR == 0) {
xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL);
@@ -10626,38 +10511,18 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
int
xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) {
- xmlChar start[4];
- xmlCharEncoding enc;
-
if ((ctxt == NULL) || (ctxt->input == NULL))
return(-1);
xmlDetectSAX2(ctxt);
- GROW;
-
/*
* SAX: beginning of the document processing.
*/
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
- /*
- * Get the 4 first bytes and decode the charset
- * if enc != XML_CHAR_ENCODING_NONE
- * plug some encoding conversion routines.
- */
- if ((ctxt->input->end - ctxt->input->cur) >= 4) {
- start[0] = RAW;
- start[1] = NXT(1);
- start[2] = NXT(2);
- start[3] = NXT(3);
- enc = xmlDetectCharEncoding(start, 4);
- if (enc != XML_CHAR_ENCODING_NONE) {
- xmlSwitchEncoding(ctxt, enc);
- }
- }
-
+ xmlDetectEncoding(ctxt);
if (CUR == 0) {
xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL);
@@ -11076,6 +10941,9 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
case XML_PARSER_START:
xmlGenericError(xmlGenericErrorContext,
"PP: try START\n"); break;
+ case XML_PARSER_XML_DECL:
+ xmlGenericError(xmlGenericErrorContext,
+ "PP: try XML_DECL\n"); break;
case XML_PARSER_MISC:
xmlGenericError(xmlGenericErrorContext,
"PP: try MISC\n");break;
@@ -11164,39 +11032,25 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
*/
goto done;
case XML_PARSER_START:
- if (ctxt->charset == XML_CHAR_ENCODING_NONE) {
- xmlChar start[4];
- xmlCharEncoding enc;
+ /*
+ * Very first chars read from the document flow.
+ */
+ if (avail < 4)
+ goto done;
- /*
- * Very first chars read from the document flow.
- */
- if (avail < 4)
- goto done;
+ /*
+ * We need more bytes to detect EBCDIC code pages.
+ * See xmlDetectEBCDIC.
+ */
+ if ((CMP4(CUR_PTR, 0x4C, 0x6F, 0xA7, 0x94)) &&
+ (!terminate) && (avail < 200))
+ goto done;
- /*
- * Get the 4 first bytes and decode the charset
- * if enc != XML_CHAR_ENCODING_NONE
- * plug some encoding conversion routines,
- * else xmlSwitchEncoding will set to (default)
- * UTF8.
- */
- start[0] = RAW;
- start[1] = NXT(1);
- start[2] = NXT(2);
- start[3] = NXT(3);
- enc = xmlDetectCharEncoding(start, 4);
- /*
- * We need more bytes to detect EBCDIC code pages.
- * See xmlDetectEBCDIC.
- */
- if ((enc == XML_CHAR_ENCODING_EBCDIC) &&
- (!terminate) && (avail < 200))
- goto done;
- xmlSwitchEncoding(ctxt, enc);
- break;
- }
+ xmlDetectEncoding(ctxt);
+ ctxt->instate = XML_PARSER_XML_DECL;
+ break;
+ case XML_PARSER_XML_DECL:
if (avail < 2)
goto done;
cur = ctxt->input->cur[0];
@@ -11242,9 +11096,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
xmlHaltParser(ctxt);
return(0);
}
- if ((ctxt->encoding == NULL) &&
- (ctxt->input->encoding != NULL))
- ctxt->encoding = xmlStrdup(ctxt->input->encoding);
if ((ctxt->sax) && (ctxt->sax->startDocument) &&
(!ctxt->disableSAX))
ctxt->sax->startDocument(ctxt->userData);
@@ -11978,13 +11829,6 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data,
xmlBufResetInput(inputStream->buf->buffer, inputStream);
inputPush(ctxt, inputStream);
- /*
- * If the caller didn't provide an initial 'chunk' for determining
- * the encoding, we set the context to XML_CHAR_ENCODING_NONE so
- * that it can be automatically determined later
- */
- ctxt->charset = XML_CHAR_ENCODING_NONE;
-
if ((size != 0) && (chunk != NULL) &&
(ctxt->input != NULL) && (ctxt->input->buf != NULL)) {
size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
@@ -12092,7 +11936,6 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
xmlDtdPtr ret = NULL;
xmlParserCtxtPtr ctxt;
xmlParserInputPtr pinput = NULL;
- xmlChar start[4];
if (input == NULL)
return(NULL);
@@ -12150,22 +11993,7 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none",
BAD_CAST "none", BAD_CAST "none");
- if ((enc == XML_CHAR_ENCODING_NONE) &&
- ((ctxt->input->end - ctxt->input->cur) >= 4)) {
- /*
- * Get the 4 first bytes and decode the charset
- * if enc != XML_CHAR_ENCODING_NONE
- * plug some encoding conversion routines.
- */
- start[0] = RAW;
- start[1] = NXT(1);
- start[2] = NXT(2);
- start[3] = NXT(3);
- enc = xmlDetectCharEncoding(start, 4);
- if (enc != XML_CHAR_ENCODING_NONE) {
- xmlSwitchEncoding(ctxt, enc);
- }
- }
+ xmlDetectEncoding(ctxt);
xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none");
@@ -12213,7 +12041,6 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID,
xmlDtdPtr ret = NULL;
xmlParserCtxtPtr ctxt;
xmlParserInputPtr input = NULL;
- xmlCharEncoding enc;
xmlChar* systemIdCanonic;
if ((ExternalID == NULL) && (SystemID == NULL)) return(NULL);
@@ -12258,10 +12085,8 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID,
xmlFree(systemIdCanonic);
return(NULL);
}
- if ((ctxt->input->end - ctxt->input->cur) >= 4) {
- enc = xmlDetectCharEncoding(ctxt->input->cur, 4);
- xmlSwitchEncoding(ctxt, enc);
- }
+
+ xmlDetectEncoding(ctxt);
if (input->filename == NULL)
input->filename = (char *) systemIdCanonic;
@@ -12399,8 +12224,6 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
xmlDocPtr newDoc;
xmlNodePtr newRoot;
xmlParserErrors ret = XML_ERR_OK;
- xmlChar start[4];
- xmlCharEncoding enc;
if (((depth > 40) &&
((oldctxt == NULL) || (oldctxt->options & XML_PARSE_HUGE) == 0)) ||
@@ -12461,22 +12284,7 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
newRoot->doc = doc;
}
- /*
- * Get the 4 first bytes and decode the charset
- * if enc != XML_CHAR_ENCODING_NONE
- * plug some encoding conversion routines.
- */
- GROW;
- if ((ctxt->input->end - ctxt->input->cur) >= 4) {
- start[0] = RAW;
- start[1] = NXT(1);
- start[2] = NXT(2);
- start[3] = NXT(3);
- enc = xmlDetectCharEncoding(start, 4);
- if (enc != XML_CHAR_ENCODING_NONE) {
- xmlSwitchEncoding(ctxt, enc);
- }
- }
+ xmlDetectEncoding(ctxt);
/*
* Parse a possible text declaration first
@@ -12963,10 +12771,6 @@ xmlParseInNodeContext(xmlNodePtr node, const char *data, int datalen,
if (doc->encoding != NULL) {
xmlCharEncodingHandlerPtr hdlr;
- if (ctxt->encoding != NULL)
- xmlFree((xmlChar *) ctxt->encoding);
- ctxt->encoding = xmlStrdup((const xmlChar *) doc->encoding);
-
hdlr = xmlFindCharEncodingHandler((const char *) doc->encoding);
if (hdlr != NULL) {
xmlSwitchToEncoding(ctxt, hdlr);
@@ -14273,7 +14077,6 @@ xmlCtxtReset(xmlParserCtxtPtr ctxt)
ctxt->inSubset = 0;
ctxt->errNo = XML_ERR_OK;
ctxt->depth = 0;
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->catalogs = NULL;
ctxt->sizeentities = 0;
ctxt->sizeentcopy = 0;
@@ -14374,10 +14177,6 @@ xmlCtxtResetPush(xmlParserCtxtPtr ctxt, const char *chunk,
if (encoding != NULL) {
xmlCharEncodingHandlerPtr hdlr;
- if (ctxt->encoding != NULL)
- xmlFree((xmlChar *) ctxt->encoding);
- ctxt->encoding = xmlStrdup((const xmlChar *) encoding);
-
hdlr = xmlFindCharEncodingHandler(encoding);
if (hdlr != NULL) {
xmlSwitchToEncoding(ctxt, hdlr);
diff --git a/parserInternals.c b/parserInternals.c
index ed2d3dee..63f8372e 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -765,7 +765,7 @@ xmlNextChar(xmlParserCtxtPtr ctxt)
return;
}
- if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
+ if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) {
const unsigned char *cur;
unsigned char c;
@@ -876,7 +876,10 @@ encoding_error:
"Input is not proper UTF-8, indicate encoding !\n%s",
BAD_CAST buffer, NULL);
}
- ctxt->charset = XML_CHAR_ENCODING_8859_1;
+ if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
+ ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
+ ctxt->input->flags |= XML_INPUT_8_BIT;
+ }
ctxt->input->cur++;
return;
}
@@ -917,7 +920,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
*len = 1;
return(*ctxt->input->cur);
}
- if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
+ if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) {
/*
* We are supposed to handle UTF8, check it's valid
* From rfc2044: encoding of the Unicode values on UTF-8:
@@ -1040,7 +1043,10 @@ encoding_error:
"Input is not proper UTF-8, indicate encoding !\n%s",
BAD_CAST buffer, NULL);
}
- ctxt->charset = XML_CHAR_ENCODING_8859_1;
+ if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
+ ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
+ ctxt->input->flags |= XML_INPUT_8_BIT;
+ }
*len = 1;
return(*ctxt->input->cur);
@@ -1073,7 +1079,8 @@ int
xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len)
{
if ((len == NULL) || (cur == NULL)) return(0);
- if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) {
+ if ((ctxt == NULL) || (ctxt->input == NULL) ||
+ ((ctxt->input->flags & XML_INPUT_8_BIT) == 0)) {
/*
* We are supposed to handle UTF8, check it's valid
* From rfc2044: encoding of the Unicode values on UTF-8:
@@ -1300,58 +1307,29 @@ xmlDetectEBCDIC(xmlParserInputPtr input) {
* @ctxt: the parser context
* @enc: the encoding value (number)
*
- * change the input functions when discovering the character encoding
- * of a given entity.
+ * Use encoding specified by enum to decode input data.
+ *
+ * This function can be used to enforce the encoding of chunks passed
+ * to xmlParseChunk.
*
* Returns 0 in case of success, -1 otherwise
*/
int
xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
{
- xmlCharEncodingHandlerPtr handler;
+ xmlCharEncodingHandlerPtr handler = NULL;
+ int check = 1;
int ret;
- if (ctxt == NULL) return(-1);
-
- /*
- * FIXME: The BOM shouldn't be skipped here, but in the parsing code.
- *
- * Note that we look for a decoded UTF-8 BOM when switching to UTF-16.
- * This is mostly useless but Webkit/Chromium relies on this behavior.
- * See https://bugs.chromium.org/p/chromium/issues/detail?id=1451026
- */
- if ((ctxt->input != NULL) &&
- (ctxt->input->consumed == 0) &&
- (ctxt->input->cur != NULL) &&
- (ctxt->input->cur == ctxt->input->base) &&
- ((enc == XML_CHAR_ENCODING_UTF8) ||
- (enc == XML_CHAR_ENCODING_UTF16LE) ||
- (enc == XML_CHAR_ENCODING_UTF16BE))) {
- /*
- * Errata on XML-1.0 June 20 2001
- * Specific handling of the Byte Order Mark for
- * UTF-8
- */
- if ((ctxt->input->cur[0] == 0xEF) &&
- (ctxt->input->cur[1] == 0xBB) &&
- (ctxt->input->cur[2] == 0xBF)) {
- ctxt->input->cur += 3;
- }
- }
+ if ((ctxt == NULL) || (ctxt->input == NULL))
+ return(-1);
switch (enc) {
- case XML_CHAR_ENCODING_ERROR:
- __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,
- "encoding unknown\n", NULL, NULL);
- return(-1);
case XML_CHAR_ENCODING_NONE:
- /* let's assume it's UTF-8 without the XML decl */
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
- return(0);
case XML_CHAR_ENCODING_UTF8:
- /* default encoding, no conversion should be needed */
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
- return(0);
+ case XML_CHAR_ENCODING_ASCII:
+ check = 0;
+ break;
case XML_CHAR_ENCODING_EBCDIC:
handler = xmlDetectEBCDIC(ctxt->input);
break;
@@ -1359,45 +1337,28 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
handler = xmlGetCharEncodingHandler(enc);
break;
}
- if (handler == NULL) {
- /*
- * Default handlers.
- */
- switch (enc) {
- case XML_CHAR_ENCODING_ASCII:
- /* default encoding, no conversion should be needed */
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
- return(0);
- case XML_CHAR_ENCODING_8859_1:
- if ((ctxt->inputNr == 1) &&
- (ctxt->encoding == NULL) &&
- (ctxt->input != NULL) &&
- (ctxt->input->encoding != NULL)) {
- ctxt->encoding = xmlStrdup(ctxt->input->encoding);
- }
- ctxt->charset = enc;
- return(0);
- default:
- __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
- "encoding not supported: %s\n",
- BAD_CAST xmlGetCharEncodingName(enc), NULL);
- /*
- * TODO: We could recover from errors in external entities
- * if we didn't stop the parser. But most callers of this
- * function don't check the return value.
- */
- xmlStopParser(ctxt);
- return(-1);
- }
- }
- ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler);
- if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) {
+
+ if ((check) && (handler == NULL)) {
+ const char *name = xmlGetCharEncodingName(enc);
+
+ __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
+ "encoding not supported: %s\n",
+ BAD_CAST (name ? name : ""), NULL);
/*
- * on encoding conversion errors, stop the parser
- */
+ * TODO: We could recover from errors in external entities
+ * if we didn't stop the parser. But most callers of this
+ * function don't check the return value.
+ */
xmlStopParser(ctxt);
- ctxt->errNo = XML_I18N_CONV_FAILED;
+ return(-1);
}
+
+ ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler);
+
+ if ((ret >= 0) && (enc == XML_CHAR_ENCODING_NONE)) {
+ ctxt->input->flags &= ~XML_INPUT_HAS_ENCODING;
+ }
+
return(ret);
}
@@ -1407,8 +1368,9 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
* @input: the input stream
* @handler: the encoding handler
*
- * change the input functions when discovering the character encoding
- * of a given entity.
+ * DEPRECATED: Internal function, don't use.
+ *
+ * Use encoding handler to decode input data.
*
* Returns 0 in case of success, -1 otherwise
*/
@@ -1419,27 +1381,19 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
int nbchars;
xmlParserInputBufferPtr in;
- if (handler == NULL)
- return (-1);
- if (input == NULL)
- return (-1);
- in = input->buf;
- if (in == NULL) {
- xmlErrInternal(ctxt,
- "static memory buffer doesn't support encoding\n", NULL);
- /*
- * Callers assume that the input buffer takes ownership of the
- * encoding handler. xmlCharEncCloseFunc frees unregistered
- * handlers and avoids a memory leak.
- */
+ if ((input == NULL) || (input->buf == NULL)) {
xmlCharEncCloseFunc(handler);
return (-1);
}
+ in = input->buf;
+
+ input->flags |= XML_INPUT_HAS_ENCODING;
+ input->flags &= ~XML_INPUT_8_BIT;
+
+ if (in->encoder == handler)
+ return (0);
if (in->encoder != NULL) {
- if (in->encoder == handler)
- return (0);
-
/*
* Switching encodings during parsing is a really bad idea,
* but Chromium can switch between ISO-8859-1 and UTF-16 before
@@ -1454,7 +1408,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
return (0);
}
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
in->encoder = handler;
/*
@@ -1463,37 +1416,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
if (xmlBufIsEmpty(in->buffer) == 0) {
size_t processed, use, consumed;
- /*
- * FIXME: The BOM shouldn't be skipped here, but in the parsing code.
- */
-
- /*
- * Specific handling of the Byte Order Mark for
- * UTF-16
- */
- if ((handler->name != NULL) &&
- (!strcmp(handler->name, "UTF-16LE") ||
- !strcmp(handler->name, "UTF-16")) &&
- (input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) {
- input->cur += 2;
- }
- if ((handler->name != NULL) &&
- (!strcmp(handler->name, "UTF-16BE")) &&
- (input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) {
- input->cur += 2;
- }
- /*
- * Errata on XML-1.0 June 20 2001
- * Specific handling of the Byte Order Mark for
- * UTF-8
- */
- if ((handler->name != NULL) &&
- (!strcmp(handler->name, "UTF-8")) &&
- (input->cur[0] == 0xEF) &&
- (input->cur[1] == 0xBB) && (input->cur[2] == 0xBF)) {
- input->cur += 3;
- }
-
/*
* Shrink the current input buffer.
* Move it as the raw buffer and create a new input buffer
@@ -1541,8 +1463,10 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
* @ctxt: the parser context
* @handler: the encoding handler
*
- * change the input functions when discovering the character encoding
- * of a given entity.
+ * Use encoding handler to decode input data.
+ *
+ * This function can be used to enforce the encoding of chunks passed
+ * to xmlParseChunk.
*
* Returns 0 in case of success, -1 otherwise
*/
@@ -1554,6 +1478,185 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
return(xmlSwitchInputEncoding(ctxt, ctxt->input, handler));
}
+/**
+ * xmlDetectEncoding:
+ * @ctxt:  the parser context
+ *
+ * Handle optional BOM, detect and switch to encoding.
+ *
+ * Inspects the first four bytes at the current input position. If
+ * growing the input fails, or fewer than four bytes are available
+ * afterwards, nothing is done.
+ *
+ * If the input already carries XML_INPUT_HAS_ENCODING, only a UTF-8
+ * encoded BOM is skipped. Otherwise a BOM or the byte pattern of the
+ * document start selects the encoding, the matching XML_INPUT_AUTO_*
+ * flag is recorded on the input and xmlSwitchEncoding is called.
+ */
+void
+xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
+    const xmlChar *in;
+    xmlCharEncoding enc;
+    int bomSize;
+    int autoFlag = 0;
+
+    if (xmlParserGrow(ctxt) < 0)
+        return;
+    /*
+     * Fetch the cursor only after growing: xmlParserGrow may pull in
+     * more data and relocate the input buffer, which would leave a
+     * pointer taken earlier dangling.
+     */
+    in = ctxt->input->cur;
+    if (ctxt->input->end - in < 4)
+        return;
+
+    if (ctxt->input->flags & XML_INPUT_HAS_ENCODING) {
+        /*
+         * If the encoding was already set, only skip the BOM which was
+         * possibly decoded to UTF-8.
+         */
+        if ((in[0] == 0xEF) && (in[1] == 0xBB) && (in[2] == 0xBF)) {
+            ctxt->input->cur += 3;
+        }
+
+        return;
+    }
+
+    enc = XML_CHAR_ENCODING_NONE;
+    bomSize = 0;
+
+    switch (in[0]) {
+        case 0x00:
+            /* 00 00 00 3C or 00 3C 00 3F: big-endian without BOM */
+            if ((in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) {
+                enc = XML_CHAR_ENCODING_UCS4BE;
+                autoFlag = XML_INPUT_AUTO_OTHER;
+            } else if ((in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) {
+                enc = XML_CHAR_ENCODING_UTF16BE;
+                autoFlag = XML_INPUT_AUTO_UTF16BE;
+            }
+            break;
+
+        case 0x3C:
+            /* 3C 00 00 00 or 3C 00 3F 00: little-endian without BOM */
+            if (in[1] == 0x00) {
+                if ((in[2] == 0x00) && (in[3] == 0x00)) {
+                    enc = XML_CHAR_ENCODING_UCS4LE;
+                    autoFlag = XML_INPUT_AUTO_OTHER;
+                } else if ((in[2] == 0x3F) && (in[3] == 0x00)) {
+                    enc = XML_CHAR_ENCODING_UTF16LE;
+                    autoFlag = XML_INPUT_AUTO_UTF16LE;
+                }
+            }
+            break;
+
+        case 0x4C:
+            /* 4C 6F A7 94: treated as EBCDIC */
+            if ((in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) {
+                enc = XML_CHAR_ENCODING_EBCDIC;
+                autoFlag = XML_INPUT_AUTO_OTHER;
+            }
+            break;
+
+        case 0xEF:
+            /* EF BB BF: UTF-8 BOM */
+            if ((in[1] == 0xBB) && (in[2] == 0xBF)) {
+                enc = XML_CHAR_ENCODING_UTF8;
+                autoFlag = XML_INPUT_AUTO_UTF8;
+                bomSize = 3;
+            }
+            break;
+
+        case 0xFE:
+            /* FE FF: UTF-16 big-endian BOM */
+            if (in[1] == 0xFF) {
+                enc = XML_CHAR_ENCODING_UTF16BE;
+                autoFlag = XML_INPUT_AUTO_UTF16BE;
+                bomSize = 2;
+            }
+            break;
+
+        case 0xFF:
+            /* FF FE: UTF-16 little-endian BOM */
+            if (in[1] == 0xFE) {
+                enc = XML_CHAR_ENCODING_UTF16LE;
+                autoFlag = XML_INPUT_AUTO_UTF16LE;
+                bomSize = 2;
+            }
+            break;
+
+        default:
+            /* No recognizable signature: leave the encoding undecided. */
+            break;
+    }
+
+    /* The BOM is skipped before the encoder is installed. */
+    if (bomSize > 0) {
+        ctxt->input->cur += bomSize;
+    }
+
+    if (enc != XML_CHAR_ENCODING_NONE) {
+        ctxt->input->flags |= autoFlag;
+        /*
+         * The result is not checked here: for an unsupported encoding
+         * xmlSwitchEncoding raises the error and stops the parser.
+         */
+        xmlSwitchEncoding(ctxt, enc);
+    }
+}
+
+/**
+ * xmlSetDeclaredEncoding:
+ * @ctxt:  the parser context
+ * @encoding:  declared encoding
+ *
+ * Record the encoding named by a declaration in the document.
+ *
+ * When the input has no encoding yet and XML_PARSE_IGNORE_ENC is not
+ * set, the input is switched to the declared encoding, or an
+ * unsupported-encoding error is raised if no handler is found.
+ * Otherwise, if the encoding was auto-detected as UTF-8 or UTF-16, a
+ * warning is issued when the declared name doesn't match the detected
+ * one.
+ *
+ * Takes ownership of 'encoding'.
+ */
+void
+xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) {
+    /* Declared names accepted for each auto-detected encoding. */
+    static const char *namesUTF8[] = {
+        "UTF-8", "UTF8", NULL
+    };
+    static const char *namesUTF16LE[] = {
+        "UTF-16", "UTF-16LE", "UTF16", NULL
+    };
+    static const char *namesUTF16BE[] = {
+        "UTF-16", "UTF-16BE", "UTF16", NULL
+    };
+    const char **candidates;
+    const char *detected;
+    int idx;
+
+    /* Replace any previously stored declaration with the new one. */
+    if (ctxt->encoding != NULL)
+        xmlFree((xmlChar *) ctxt->encoding);
+    ctxt->encoding = encoding;
+
+    if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
+        ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) {
+        xmlCharEncodingHandlerPtr conv;
+
+        /* Nothing decided so far: the declaration selects the decoder. */
+        conv = xmlFindCharEncodingHandler((const char *) encoding);
+        if (conv == NULL) {
+            __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
+                             "Unsupported encoding: %s\n",
+                             encoding, NULL);
+        } else {
+            xmlSwitchToEncoding(ctxt, conv);
+        }
+        return;
+    }
+
+    /* Only auto-detected UTF-8 / UTF-16 inputs are cross-checked. */
+    switch (ctxt->input->flags & XML_INPUT_AUTO_ENCODING) {
+        case XML_INPUT_AUTO_UTF8:
+            candidates = namesUTF8;
+            detected = "UTF-8";
+            break;
+        case XML_INPUT_AUTO_UTF16LE:
+            candidates = namesUTF16LE;
+            detected = "UTF-16LE";
+            break;
+        case XML_INPUT_AUTO_UTF16BE:
+            candidates = namesUTF16BE;
+            detected = "UTF-16BE";
+            break;
+        default:
+            return;
+    }
+
+    /* A case-insensitive hit on any accepted name means no mismatch. */
+    for (idx = 0; candidates[idx] != NULL; idx++) {
+        if (xmlStrcasecmp(encoding, BAD_CAST candidates[idx]) == 0)
+            return;
+    }
+
+    xmlWarningMsg(ctxt, XML_WAR_ENCODING_MISMATCH,
+                  "Encoding '%s' doesn't match "
+                  "auto-detected '%s'\n",
+                  encoding, BAD_CAST detected);
+}
+
/************************************************************************
* *
* Commodity functions to handle entities processing *
@@ -1572,7 +1675,6 @@ xmlFreeInputStream(xmlParserInputPtr input) {
if (input->filename != NULL) xmlFree((char *) input->filename);
if (input->directory != NULL) xmlFree((char *) input->directory);
- if (input->encoding != NULL) xmlFree((char *) input->encoding);
if (input->version != NULL) xmlFree((char *) input->version);
if ((input->free != NULL) && (input->base != NULL))
input->free((xmlChar *) input->base);
@@ -2015,7 +2117,6 @@ xmlInitSAXParserCtxt(xmlParserCtxtPtr ctxt, const xmlSAXHandler *sax,
ctxt->inSubset = 0;
ctxt->errNo = XML_ERR_OK;
ctxt->depth = 0;
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->catalogs = NULL;
ctxt->sizeentities = 0;
ctxt->sizeentcopy = 0;
diff --git a/testchar.c b/testchar.c
index a819e196..20d4296d 100644
--- a/testchar.c
+++ b/testchar.c
@@ -271,11 +271,11 @@ static int testCharRangeByte1(xmlParserCtxtPtr ctxt) {
data[3] = 0;
for (i = 0;i <= 0xFF;i++) {
data[0] = (char) i;
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->nbErrors = 0;
lastError = 0;
c = xmlCurrentChar(ctxt, &len);
+ ctxt->input->flags = 0;
if ((i == 0) || (i >= 0x80)) {
/* we must see an error there */
if (lastError != XML_ERR_INVALID_CHAR) {
@@ -307,11 +307,11 @@ static int testCharRangeByte2(xmlParserCtxtPtr ctxt) {
for (j = 0;j <= 0xFF;j++) {
data[0] = (char) i;
data[1] = (char) j;
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->nbErrors = 0;
lastError = 0;
c = xmlCurrentChar(ctxt, &len);
+ ctxt->input->flags = 0;
/* if first bit of first char is set, then second bit must too */
if ((i & 0x80) && ((i & 0x40) == 0)) {
@@ -401,11 +401,11 @@ static int testCharRangeByte3(xmlParserCtxtPtr ctxt) {
K = lows[k];
data[2] = (char) K;
value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->nbErrors = 0;
lastError = 0;
c = xmlCurrentChar(ctxt, &len);
+ ctxt->input->flags = 0;
/*
* if fourth bit of first char is set, then the sequence would need
@@ -504,11 +504,11 @@ static int testCharRangeByte4(xmlParserCtxtPtr ctxt) {
data[3] = (char) L;
value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
((i & 0x7) << 18);
- ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->nbErrors = 0;
lastError = 0;
c = xmlCurrentChar(ctxt, &len);
+ ctxt->input->flags = 0;
/*
* if fifth bit of first char is set, then the sequence would need
diff --git a/xmlIO.c b/xmlIO.c
index 9fd9c780..490a82e7 100644
--- a/xmlIO.c
+++ b/xmlIO.c
@@ -3790,8 +3790,6 @@ xmlCheckHTTPInput(xmlParserCtxtPtr ctxt, xmlParserInputPtr ret) {
"Unknown encoding %s",
BAD_CAST encoding, NULL);
}
- if (ret->encoding == NULL)
- ret->encoding = xmlStrdup(BAD_CAST encoding);
}
#if 0
} else if (xmlStrstr(BAD_CAST mime, BAD_CAST "html")) {