1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-23 01:52:48 +03:00

parser: Rework encoding detection

Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.

Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.

Introduce private helper functions for switching encodings, shared by
the XML and HTML parsers:

- xmlDetectEncoding, which skips over the BOM, allowing the BOM checks
  to be removed from other encoding functions.
- xmlSetDeclaredEncoding, which replaces htmlCheckEncodingDirect and
  warns about encoding mismatches.

If users override the encoding, store the declared encoding instead of
the actual one in xmlDoc. In this case, the actual encoding is known and
the raw value from the document's declaration is more useful.

Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)

The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.

The 'standalone' member of xmlParserInput is renamed to 'flags'.

A new parser state XML_PARSER_XML_DECL is added for the push parser.
This commit is contained in:
Nick Wellnhofer
2023-08-08 15:19:46 +02:00
parent d38e73f91e
commit ec7be50662
10 changed files with 341 additions and 583 deletions

View File

@@ -350,8 +350,7 @@ htmlFindEncoding(xmlParserCtxtPtr ctxt) {
const xmlChar *start, *cur, *end; const xmlChar *start, *cur, *end;
if ((ctxt == NULL) || (ctxt->input == NULL) || if ((ctxt == NULL) || (ctxt->input == NULL) ||
(ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) || (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
(ctxt->input->buf->encoder != NULL))
return(NULL); return(NULL);
if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL)) if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
return(NULL); return(NULL);
@@ -417,7 +416,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
return(0); return(0);
} }
if (ctxt->charset != XML_CHAR_ENCODING_UTF8) { if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
xmlChar * guess; xmlChar * guess;
xmlCharEncodingHandlerPtr handler; xmlCharEncodingHandlerPtr handler;
@@ -444,10 +443,8 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
if (guess == NULL) { if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
} else { } else {
if (ctxt->input->encoding != NULL)
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = guess;
handler = xmlFindCharEncodingHandler((const char *) guess); handler = xmlFindCharEncodingHandler((const char *) guess);
xmlFree(guess);
if (handler != NULL) { if (handler != NULL) {
/* /*
* Don't use UTF-8 encoder which isn't required and * Don't use UTF-8 encoder which isn't required and
@@ -460,7 +457,7 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
"Unsupported encoding %s", guess, NULL); "Unsupported encoding %s", guess, NULL);
} }
} }
ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
} }
/* /*
@@ -537,13 +534,6 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
} }
encoding_error: encoding_error:
/*
* If we detect an UTF8 error that probably mean that the
* input encoding didn't get properly advertised in the
* declaration header. Report the error and switch the encoding
* to ISO-Latin-1 (if you don't like this policy, just declare the
* encoding !)
*/
{ {
char buffer[150]; char buffer[150];
@@ -559,15 +549,7 @@ encoding_error:
BAD_CAST buffer, NULL); BAD_CAST buffer, NULL);
} }
/* if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
* Don't switch encodings twice. Note that if there's an encoder, we
* shouldn't receive invalid UTF-8 anyway.
*
* Note that if ctxt->input->buf == NULL, switching encodings is
* impossible, see Gitlab issue #34.
*/
if ((ctxt->input->buf != NULL) &&
(ctxt->input->buf->encoder == NULL))
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
*len = 1; *len = 1;
return(*ctxt->input->cur); return(*ctxt->input->cur);
@@ -3781,94 +3763,6 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
return(name); return(name);
} }
/**
* htmlCheckEncodingDirect:
* @ctxt: an HTML parser context
* @encoding: the encoding name taken from an attribute value
*
* Switches the parser input to the encoding named by @encoding, unless
* the context is NULL, HTML_PARSE_IGNORE_ENC is set, or the current
* input already has an encoding string recorded.
*
* Leading spaces and tabs in @encoding are skipped and a copy of the
* name is stored in ctxt->input->encoding. If the input buffer ends up
* with an encoder, the unread raw data is converted into the parser's
* reading buffer; a conversion failure halts the parser.
*/
static void
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
if ((ctxt == NULL) || (encoding == NULL) ||
(ctxt->options & HTML_PARSE_IGNORE_ENC))
return;
/*
* do not change encoding
* NOTE(review): ctxt->input is dereferenced here without a NULL check.
*/
if (ctxt->input->encoding != NULL)
return;
/*
* NOTE(review): encoding was already rejected when NULL above, so this
* test is always true.
*/
if (encoding != NULL) {
xmlCharEncoding enc;
xmlCharEncodingHandlerPtr handler;
/* skip leading spaces and tabs in the encoding name */
while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
/*
* NOTE(review): ctxt->input->encoding is known to be NULL at this
* point (see the early return above), so the free never runs.
*/
if (ctxt->input->encoding != NULL)
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = xmlStrdup(encoding);
enc = xmlParseCharEncoding((const char *) encoding);
/*
* registered set of known encodings
*/
if (enc != XML_CHAR_ENCODING_ERROR) {
/*
* A UTF-16 or UCS-4 declaration while the input buffer has no
* encoder is reported as an error instead of switching.
*/
if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
(enc == XML_CHAR_ENCODING_UTF16BE) ||
(enc == XML_CHAR_ENCODING_UCS4LE) ||
(enc == XML_CHAR_ENCODING_UCS4BE)) &&
(ctxt->input->buf != NULL) &&
(ctxt->input->buf->encoder == NULL)) {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"htmlCheckEncoding: wrong encoding meta\n",
NULL, NULL);
} else {
xmlSwitchEncoding(ctxt, enc);
}
/* charset is set to UTF-8 on both branches, including the error one */
ctxt->charset = XML_CHAR_ENCODING_UTF8;
} else {
/*
* fallback for unknown encodings: look for a handler by name
*/
handler = xmlFindCharEncodingHandler((const char *) encoding);
if (handler != NULL) {
xmlSwitchToEncoding(ctxt, handler);
ctxt->charset = XML_CHAR_ENCODING_UTF8;
} else {
htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"htmlCheckEncoding: unknown encoding %s\n",
encoding, NULL);
}
}
/* only convert when an encoder and both raw and parsed buffers exist */
if ((ctxt->input->buf != NULL) &&
(ctxt->input->buf->encoder != NULL) &&
(ctxt->input->buf->raw != NULL) &&
(ctxt->input->buf->buffer != NULL)) {
int nbchars;
size_t processed;
/*
* convert as much as possible to the parser reading buffer.
*/
/* drop the bytes between input->base and input->cur before converting */
processed = ctxt->input->cur - ctxt->input->base;
xmlBufShrink(ctxt->input->buf->buffer, processed);
nbchars = xmlCharEncInput(ctxt->input->buf, 1);
/* re-point the input at the buffer after it was modified */
xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
/* a negative result is reported with the buffer's error code and stops parsing */
if (nbchars < 0) {
htmlParseErr(ctxt, ctxt->input->buf->error,
"htmlCheckEncoding: encoder error\n",
NULL, NULL);
xmlHaltParser(ctxt);
}
}
}
}
/** /**
* htmlCheckEncoding: * htmlCheckEncoding:
* @ctxt: an HTML parser context * @ctxt: an HTML parser context
@@ -3897,7 +3791,7 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
encoding = xmlStrcasestr(attvalue, BAD_CAST"="); encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
if (encoding && *encoding == '=') { if (encoding && *encoding == '=') {
encoding ++; encoding ++;
htmlCheckEncodingDirect(ctxt, encoding); xmlSetDeclaredEncoding(ctxt, xmlStrdup(encoding));
} }
} }
@@ -3926,7 +3820,7 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
http = 1; http = 1;
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset"))) else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
htmlCheckEncodingDirect(ctxt, value); xmlSetDeclaredEncoding(ctxt, xmlStrdup(value));
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content"))) else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
content = value; content = value;
att = atts[i++]; att = atts[i++];
@@ -4953,8 +4847,6 @@ __htmlParseContent(void *ctxt) {
int int
htmlParseDocument(htmlParserCtxtPtr ctxt) { htmlParseDocument(htmlParserCtxtPtr ctxt) {
xmlChar start[4];
xmlCharEncoding enc;
xmlDtdPtr dtd; xmlDtdPtr dtd;
xmlInitParser(); xmlInitParser();
@@ -4964,29 +4856,14 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
"htmlParseDocument: context error\n", NULL, NULL); "htmlParseDocument: context error\n", NULL, NULL);
return(XML_ERR_INTERNAL_ERROR); return(XML_ERR_INTERNAL_ERROR);
} }
GROW;
/* /*
* SAX: beginning of the document processing. * SAX: beginning of the document processing.
*/ */
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) && xmlDetectEncoding(ctxt);
((ctxt->input->end - ctxt->input->cur) >= 4)) {
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(&start[0], 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
/* /*
* Wipe out everything which is before the first '<' * Wipe out everything which is before the first '<'
@@ -5317,10 +5194,6 @@ htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
xmlCharEncoding enc; xmlCharEncoding enc;
xmlCharEncodingHandlerPtr handler; xmlCharEncodingHandlerPtr handler;
if (ctxt->input->encoding != NULL)
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
enc = xmlParseCharEncoding(encoding); enc = xmlParseCharEncoding(encoding);
/* /*
* registered set of known encodings * registered set of known encodings
@@ -6265,8 +6138,6 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
xmlFreeParserInputBuffer(buf); xmlFreeParserInputBuffer(buf);
return(NULL); return(NULL);
} }
if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
ctxt->charset=XML_CHAR_ENCODING_UTF8;
if (filename == NULL) { if (filename == NULL) {
ctxt->directory = NULL; ctxt->directory = NULL;
} else { } else {
@@ -6722,7 +6593,6 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)
ctxt->inSubset = 0; ctxt->inSubset = 0;
ctxt->errNo = XML_ERR_OK; ctxt->errNo = XML_ERR_OK;
ctxt->depth = 0; ctxt->depth = 0;
ctxt->charset = XML_CHAR_ENCODING_NONE;
ctxt->catalogs = NULL; ctxt->catalogs = NULL;
xmlInitNodeInfoSeq(&ctxt->node_seq); xmlInitNodeInfoSeq(&ctxt->node_seq);
@@ -6839,9 +6709,6 @@ htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
hdlr = xmlFindCharEncodingHandler(encoding); hdlr = xmlFindCharEncodingHandler(encoding);
if (hdlr != NULL) { if (hdlr != NULL) {
xmlSwitchToEncoding(ctxt, hdlr); xmlSwitchToEncoding(ctxt, hdlr);
if (ctxt->input->encoding != NULL)
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
} }
} }
if ((URL != NULL) && (ctxt->input != NULL) && if ((URL != NULL) && (ctxt->input != NULL) &&

23
SAX2.c
View File

@@ -384,8 +384,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
int oldinputMax; int oldinputMax;
xmlParserInputPtr *oldinputTab; xmlParserInputPtr *oldinputTab;
xmlParserInputPtr input = NULL; xmlParserInputPtr input = NULL;
xmlCharEncoding enc;
int oldcharset;
const xmlChar *oldencoding; const xmlChar *oldencoding;
int oldprogressive; int oldprogressive;
unsigned long consumed; unsigned long consumed;
@@ -410,7 +408,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
oldinputNr = ctxt->inputNr; oldinputNr = ctxt->inputNr;
oldinputMax = ctxt->inputMax; oldinputMax = ctxt->inputMax;
oldinputTab = ctxt->inputTab; oldinputTab = ctxt->inputTab;
oldcharset = ctxt->charset;
oldencoding = ctxt->encoding; oldencoding = ctxt->encoding;
oldprogressive = ctxt->progressive; oldprogressive = ctxt->progressive;
ctxt->encoding = NULL; ctxt->encoding = NULL;
@@ -425,7 +422,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
ctxt->inputNr = oldinputNr; ctxt->inputNr = oldinputNr;
ctxt->inputMax = oldinputMax; ctxt->inputMax = oldinputMax;
ctxt->inputTab = oldinputTab; ctxt->inputTab = oldinputTab;
ctxt->charset = oldcharset;
ctxt->encoding = oldencoding; ctxt->encoding = oldencoding;
ctxt->progressive = oldprogressive; ctxt->progressive = oldprogressive;
return; return;
@@ -435,14 +431,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
ctxt->input = NULL; ctxt->input = NULL;
xmlPushInput(ctxt, input); xmlPushInput(ctxt, input);
/*
* On the fly encoding conversion if needed
*/
if (ctxt->input->length >= 4) {
enc = xmlDetectCharEncoding(ctxt->input->cur, 4);
xmlSwitchEncoding(ctxt, enc);
}
if (input->filename == NULL) if (input->filename == NULL)
input->filename = (char *) xmlCanonicPath(SystemID); input->filename = (char *) xmlCanonicPath(SystemID);
input->line = 1; input->line = 1;
@@ -484,7 +472,6 @@ xmlSAX2ExternalSubset(void *ctx, const xmlChar *name,
ctxt->inputNr = oldinputNr; ctxt->inputNr = oldinputNr;
ctxt->inputMax = oldinputMax; ctxt->inputMax = oldinputMax;
ctxt->inputTab = oldinputTab; ctxt->inputTab = oldinputTab;
ctxt->charset = oldcharset;
if ((ctxt->encoding != NULL) && if ((ctxt->encoding != NULL) &&
((ctxt->dict == NULL) || ((ctxt->dict == NULL) ||
(!xmlDictOwns(ctxt->dict, ctxt->encoding)))) (!xmlDictOwns(ctxt->dict, ctxt->encoding))))
@@ -1041,16 +1028,6 @@ xmlSAX2EndDocument(void *ctx)
ctxt->myDoc->encoding = ctxt->encoding; ctxt->myDoc->encoding = ctxt->encoding;
ctxt->encoding = NULL; ctxt->encoding = NULL;
} }
if ((ctxt->inputTab != NULL) &&
(ctxt->inputNr > 0) && (ctxt->inputTab[0] != NULL) &&
(ctxt->inputTab[0]->encoding != NULL) && (ctxt->myDoc != NULL) &&
(ctxt->myDoc->encoding == NULL)) {
ctxt->myDoc->encoding = xmlStrdup(ctxt->inputTab[0]->encoding);
}
if ((ctxt->charset != XML_CHAR_ENCODING_NONE) && (ctxt->myDoc != NULL) &&
(ctxt->myDoc->charset == XML_CHAR_ENCODING_NONE)) {
ctxt->myDoc->charset = ctxt->charset;
}
} }
#if defined(LIBXML_SAX1_ENABLED) || defined(LIBXML_HTML_ENABLED) || defined(LIBXML_WRITER_ENABLED) || defined(LIBXML_LEGACY_ENABLED) #if defined(LIBXML_SAX1_ENABLED) || defined(LIBXML_HTML_ENABLED) || defined(LIBXML_WRITER_ENABLED) || defined(LIBXML_LEGACY_ENABLED)

View File

@@ -63,9 +63,9 @@ struct _xmlParserInput {
int col; /* Current column */ int col; /* Current column */
unsigned long consumed; /* How many xmlChars already consumed */ unsigned long consumed; /* How many xmlChars already consumed */
xmlParserInputDeallocate free; /* function to deallocate the base */ xmlParserInputDeallocate free; /* function to deallocate the base */
const xmlChar *encoding; /* the encoding string for entity */ const xmlChar *encoding; /* unused */
const xmlChar *version; /* the version string for entity */ const xmlChar *version; /* the version string for entity */
int standalone; /* Was that entity marked standalone */ int flags; /* Flags */
int id; /* an unique identifier for the entity */ int id; /* an unique identifier for the entity */
unsigned long parentConsumed; /* consumed bytes from parents */ unsigned long parentConsumed; /* consumed bytes from parents */
xmlEntityPtr entity; /* entity, if any */ xmlEntityPtr entity; /* entity, if any */
@@ -122,7 +122,8 @@ typedef enum {
XML_PARSER_SYSTEM_LITERAL, /* within a SYSTEM value */ XML_PARSER_SYSTEM_LITERAL, /* within a SYSTEM value */
XML_PARSER_EPILOG, /* the Misc* after the last end tag */ XML_PARSER_EPILOG, /* the Misc* after the last end tag */
XML_PARSER_IGNORE, /* within an IGNORED section */ XML_PARSER_IGNORE, /* within an IGNORED section */
XML_PARSER_PUBLIC_LITERAL /* within a PUBLIC value */ XML_PARSER_PUBLIC_LITERAL, /* within a PUBLIC value */
XML_PARSER_XML_DECL /* before XML decl (but after BOM) */
} xmlParserInputState; } xmlParserInputState;
/** /**
@@ -245,8 +246,7 @@ struct _xmlParserCtxt {
int depth; /* to prevent entity substitution loops */ int depth; /* to prevent entity substitution loops */
xmlParserInputPtr entity; /* used to check entities boundaries */ xmlParserInputPtr entity; /* used to check entities boundaries */
int charset; /* encoding of the in-memory content int charset; /* unused */
actually an xmlCharEncoding */
int nodelen; /* Those two fields are there to */ int nodelen; /* Those two fields are there to */
int nodemem; /* Speed up large node parsing */ int nodemem; /* Speed up large node parsing */
int pedantic; /* signal pedantic warnings */ int pedantic; /* signal pedantic warnings */

View File

@@ -573,12 +573,11 @@ struct _xmlDoc {
struct _xmlDtd *extSubset; /* the document external subset */ struct _xmlDtd *extSubset; /* the document external subset */
struct _xmlNs *oldNs; /* Global namespace, the old way */ struct _xmlNs *oldNs; /* Global namespace, the old way */
const xmlChar *version; /* the XML version string */ const xmlChar *version; /* the XML version string */
const xmlChar *encoding; /* external initial encoding, if any */ const xmlChar *encoding; /* encoding from XML declaration, if any */
void *ids; /* Hash table for ID attributes if any */ void *ids; /* Hash table for ID attributes if any */
void *refs; /* Hash table for IDREFs attributes if any */ void *refs; /* Hash table for IDREFs attributes if any */
const xmlChar *URL; /* The URI for that document */ const xmlChar *URL; /* The URI for that document */
int charset; /* Internal flag for charset handling, int charset; /* unused */
actually an xmlCharEncoding */
struct _xmlDict *dict; /* dict used to allocate names or NULL */ struct _xmlDict *dict; /* dict used to allocate names or NULL */
void *psvi; /* for type/PSVI information */ void *psvi; /* for type/PSVI information */
int parseFlags; /* set of xmlParserOption used to parse the int parseFlags; /* set of xmlParserOption used to parse the

View File

@@ -210,6 +210,7 @@ typedef enum {
XML_ERR_NAME_TOO_LONG, /* 110 */ XML_ERR_NAME_TOO_LONG, /* 110 */
XML_ERR_USER_STOP, /* 111 */ XML_ERR_USER_STOP, /* 111 */
XML_ERR_COMMENT_ABRUPTLY_ENDED, /* 112 */ XML_ERR_COMMENT_ABRUPTLY_ENDED, /* 112 */
XML_WAR_ENCODING_MISMATCH, /* 113 */
XML_NS_ERR_XML_NAMESPACE = 200, XML_NS_ERR_XML_NAMESPACE = 200,
XML_NS_ERR_UNDEFINED_NAMESPACE, /* 201 */ XML_NS_ERR_UNDEFINED_NAMESPACE, /* 201 */
XML_NS_ERR_QNAME, /* 202 */ XML_NS_ERR_QNAME, /* 202 */

View File

@@ -17,10 +17,21 @@
*/ */
#define XML_VCTXT_USE_PCTXT (1u << 1) #define XML_VCTXT_USE_PCTXT (1u << 1)
/*
* Bit flags for parser inputs.
* NOTE(review): presumably stored in the 'flags' member of
* xmlParserInput - confirm against the users of these macros.
*/
/* bit 0: an encoding has already been set for the input */
#define XML_INPUT_HAS_ENCODING (1u << 0)
/*
* bits 1-3: mask of a three-bit field; the XML_INPUT_AUTO_* values below
* are the distinct values it can hold.
* NOTE(review): presumably records which encoding was auto-detected -
* confirm against xmlDetectEncoding.
*/
#define XML_INPUT_AUTO_ENCODING (7u << 1)
#define XML_INPUT_AUTO_UTF8 (1u << 1)
#define XML_INPUT_AUTO_UTF16LE (2u << 1)
#define XML_INPUT_AUTO_UTF16BE (3u << 1)
#define XML_INPUT_AUTO_OTHER (4u << 1)
/*
* bit 4, outside the auto-encoding field.
* NOTE(review): presumably marks input treated as an 8-bit encoding
* (ISO-8859-1 fallback) - TODO confirm.
*/
#define XML_INPUT_8_BIT (1u << 4)
XML_HIDDEN void XML_HIDDEN void
xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra); xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra);
XML_HIDDEN void XML_HIDDEN void
xmlFatalErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *info); xmlFatalErr(xmlParserCtxtPtr ctxt, xmlParserErrors error, const char *info);
XML_HIDDEN void LIBXML_ATTR_FORMAT(3,0)
xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error,
const char *msg, const xmlChar *str1, const xmlChar *str2);
XML_HIDDEN void XML_HIDDEN void
__xmlErrEncoding(xmlParserCtxtPtr ctxt, xmlParserErrors xmlerr, __xmlErrEncoding(xmlParserCtxtPtr ctxt, xmlParserErrors xmlerr,
const char *msg, const xmlChar *str1, const char *msg, const xmlChar *str1,
@@ -32,4 +43,9 @@ xmlParserGrow(xmlParserCtxtPtr ctxt);
XML_HIDDEN void XML_HIDDEN void
xmlParserShrink(xmlParserCtxtPtr ctxt); xmlParserShrink(xmlParserCtxtPtr ctxt);
XML_HIDDEN void
xmlDetectEncoding(xmlParserCtxtPtr ctxt);
XML_HIDDEN void
xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding);
#endif /* XML_PARSER_H_PRIVATE__ */ #endif /* XML_PARSER_H_PRIVATE__ */

255
parser.c
View File

@@ -281,7 +281,7 @@ xmlFatalErrMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error,
* *
* Handle a warning. * Handle a warning.
*/ */
static void LIBXML_ATTR_FORMAT(3,0) void LIBXML_ATTR_FORMAT(3,0)
xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error, xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error,
const char *msg, const xmlChar *str1, const xmlChar *str2) const char *msg, const xmlChar *str1, const xmlChar *str2)
{ {
@@ -2313,6 +2313,7 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) {
return; return;
case XML_PARSER_PROLOG: case XML_PARSER_PROLOG:
case XML_PARSER_START: case XML_PARSER_START:
case XML_PARSER_XML_DECL:
case XML_PARSER_MISC: case XML_PARSER_MISC:
xmlFatalErr(ctxt, XML_ERR_PEREF_IN_PROLOG, NULL); xmlFatalErr(ctxt, XML_ERR_PEREF_IN_PROLOG, NULL);
return; return;
@@ -6682,7 +6683,6 @@ xmlParseMarkupDecl(xmlParserCtxtPtr ctxt) {
void void
xmlParseTextDecl(xmlParserCtxtPtr ctxt) { xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
xmlChar *version; xmlChar *version;
const xmlChar *encoding;
int oldstate; int oldstate;
/* /*
@@ -6721,7 +6721,7 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
/* /*
* We must have the encoding declaration * We must have the encoding declaration
*/ */
encoding = xmlParseEncodingDecl(ctxt); xmlParseEncodingDecl(ctxt);
if (ctxt->instate == XML_PARSER_EOF) if (ctxt->instate == XML_PARSER_EOF)
return; return;
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) { if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
@@ -6731,10 +6731,6 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
ctxt->instate = oldstate; ctxt->instate = oldstate;
return; return;
} }
if ((encoding == NULL) && (ctxt->errNo == XML_ERR_OK)) {
xmlFatalErrMsg(ctxt, XML_ERR_MISSING_ENCODING,
"Missing encoding in text declaration\n");
}
SKIP_BLANKS; SKIP_BLANKS;
if ((RAW == '?') && (NXT(1) == '>')) { if ((RAW == '?') && (NXT(1) == '>')) {
@@ -6773,21 +6769,8 @@ void
xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID, xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID,
const xmlChar *SystemID) { const xmlChar *SystemID) {
xmlDetectSAX2(ctxt); xmlDetectSAX2(ctxt);
GROW;
if ((ctxt->encoding == NULL) && xmlDetectEncoding(ctxt);
(ctxt->input->end - ctxt->input->cur >= 4)) {
xmlChar start[4];
xmlCharEncoding enc;
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE)
xmlSwitchEncoding(ctxt, enc);
}
if (CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) { if (CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) {
xmlParseTextDecl(ctxt); xmlParseTextDecl(ctxt);
@@ -7727,8 +7710,6 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt)
"Internal: %%%s; is not a parameter entity\n", "Internal: %%%s; is not a parameter entity\n",
name, NULL); name, NULL);
} else { } else {
xmlChar start[4];
xmlCharEncoding enc;
unsigned long parentConsumed; unsigned long parentConsumed;
xmlEntityPtr oldEnt; xmlEntityPtr oldEnt;
@@ -7769,28 +7750,7 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt)
input->parentConsumed = parentConsumed; input->parentConsumed = parentConsumed;
if (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) { if (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) {
/* xmlDetectEncoding(ctxt);
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
* Note that, since we may have some non-UTF8
* encoding (like UTF16, bug 135229), the 'length'
* is not known, but we can calculate based upon
* the amount of data in the buffer.
*/
GROW
if (ctxt->instate == XML_PARSER_EOF)
return;
if ((ctxt->input->end - ctxt->input->cur)>=4) {
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) && if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) &&
(IS_BLANK_CH(NXT(5)))) { (IS_BLANK_CH(NXT(5)))) {
@@ -10094,7 +10054,9 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
xmlChar *encoding = NULL; xmlChar *encoding = NULL;
SKIP_BLANKS; SKIP_BLANKS;
if (CMP8(CUR_PTR, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g')) { if (CMP8(CUR_PTR, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g') == 0)
return(NULL);
SKIP(8); SKIP(8);
SKIP_BLANKS; SKIP_BLANKS;
if (RAW != '=') { if (RAW != '=') {
@@ -10125,70 +10087,12 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL); xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL);
} }
/* if (encoding == NULL)
* Non standard parsing, allowing the user to ignore encoding
*/
if (ctxt->options & XML_PARSE_IGNORE_ENC) {
xmlFree((xmlChar *) encoding);
return(NULL); return(NULL);
}
/* xmlSetDeclaredEncoding(ctxt, encoding);
* UTF-16 encoding switch has already taken place at this stage,
* more over the little-endian/big-endian selection is already done
*/
if ((encoding != NULL) &&
((!xmlStrcasecmp(encoding, BAD_CAST "UTF-16")) ||
(!xmlStrcasecmp(encoding, BAD_CAST "UTF16")))) {
/*
* If no encoding was passed to the parser, that we are
* using UTF-16 and no decoder is present i.e. the
* document is apparently UTF-8 compatible, then raise an
* encoding mismatch fatal error
*/
if ((ctxt->encoding == NULL) &&
(ctxt->input->buf != NULL) &&
(ctxt->input->buf->encoder == NULL)) {
xmlFatalErrMsg(ctxt, XML_ERR_INVALID_ENCODING,
"Document labelled UTF-16 but has UTF-8 content\n");
}
if (ctxt->encoding != NULL)
xmlFree((xmlChar *) ctxt->encoding);
ctxt->encoding = encoding;
}
/*
* UTF-8 encoding is handled natively
*/
else if ((encoding != NULL) &&
((!xmlStrcasecmp(encoding, BAD_CAST "UTF-8")) ||
(!xmlStrcasecmp(encoding, BAD_CAST "UTF8")))) {
/* TODO: Check for encoding mismatch. */
if (ctxt->encoding != NULL)
xmlFree((xmlChar *) ctxt->encoding);
ctxt->encoding = encoding;
}
else if (encoding != NULL) {
xmlCharEncodingHandlerPtr handler;
if (ctxt->input->encoding != NULL) return(ctxt->encoding);
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = encoding;
handler = xmlFindCharEncodingHandler((const char *) encoding);
if (handler != NULL) {
if (xmlSwitchToEncoding(ctxt, handler) < 0) {
/* failed to convert */
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
return(NULL);
}
} else {
xmlFatalErrMsgStr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"Unsupported encoding %s\n", encoding);
return(NULL);
}
}
}
return(encoding);
} }
/** /**
@@ -10365,7 +10269,7 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
/* /*
* We may have the standalone status. * We may have the standalone status.
*/ */
if ((ctxt->input->encoding != NULL) && (!IS_BLANK_CH(RAW))) { if ((ctxt->encoding != NULL) && (!IS_BLANK_CH(RAW))) {
if ((RAW == '?') && (NXT(1) == '>')) { if ((RAW == '?') && (NXT(1) == '>')) {
SKIP(2); SKIP(2);
return; return;
@@ -10443,9 +10347,6 @@ xmlParseMisc(xmlParserCtxtPtr ctxt) {
int int
xmlParseDocument(xmlParserCtxtPtr ctxt) { xmlParseDocument(xmlParserCtxtPtr ctxt) {
xmlChar start[4];
xmlCharEncoding enc;
xmlInitParser(); xmlInitParser();
if ((ctxt == NULL) || (ctxt->input == NULL)) if ((ctxt == NULL) || (ctxt->input == NULL))
@@ -10466,23 +10367,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
if (ctxt->instate == XML_PARSER_EOF) if (ctxt->instate == XML_PARSER_EOF)
return(-1); return(-1);
if ((ctxt->encoding == NULL) && xmlDetectEncoding(ctxt);
((ctxt->input->end - ctxt->input->cur) >= 4)) {
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(&start[0], 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
if (CUR == 0) { if (CUR == 0) {
xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL); xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL);
@@ -10626,38 +10511,18 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
int int
xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) { xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) {
xmlChar start[4];
xmlCharEncoding enc;
if ((ctxt == NULL) || (ctxt->input == NULL)) if ((ctxt == NULL) || (ctxt->input == NULL))
return(-1); return(-1);
xmlDetectSAX2(ctxt); xmlDetectSAX2(ctxt);
GROW;
/* /*
* SAX: beginning of the document processing. * SAX: beginning of the document processing.
*/ */
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator); ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
/* xmlDetectEncoding(ctxt);
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
if ((ctxt->input->end - ctxt->input->cur) >= 4) {
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
if (CUR == 0) { if (CUR == 0) {
xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL); xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL);
@@ -11076,6 +10941,9 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
case XML_PARSER_START: case XML_PARSER_START:
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
"PP: try START\n"); break; "PP: try START\n"); break;
case XML_PARSER_XML_DECL:
xmlGenericError(xmlGenericErrorContext,
"PP: try XML_DECL\n"); break;
case XML_PARSER_MISC: case XML_PARSER_MISC:
xmlGenericError(xmlGenericErrorContext, xmlGenericError(xmlGenericErrorContext,
"PP: try MISC\n");break; "PP: try MISC\n");break;
@@ -11164,39 +11032,25 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
*/ */
goto done; goto done;
case XML_PARSER_START: case XML_PARSER_START:
if (ctxt->charset == XML_CHAR_ENCODING_NONE) {
xmlChar start[4];
xmlCharEncoding enc;
/* /*
* Very first chars read from the document flow. * Very first chars read from the document flow.
*/ */
if (avail < 4) if (avail < 4)
goto done; goto done;
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines,
* else xmlSwitchEncoding will set to (default)
* UTF8.
*/
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
/* /*
* We need more bytes to detect EBCDIC code pages. * We need more bytes to detect EBCDIC code pages.
* See xmlDetectEBCDIC. * See xmlDetectEBCDIC.
*/ */
if ((enc == XML_CHAR_ENCODING_EBCDIC) && if ((CMP4(CUR_PTR, 0x4C, 0x6F, 0xA7, 0x94)) &&
(!terminate) && (avail < 200)) (!terminate) && (avail < 200))
goto done; goto done;
xmlSwitchEncoding(ctxt, enc);
break;
}
xmlDetectEncoding(ctxt);
ctxt->instate = XML_PARSER_XML_DECL;
break;
case XML_PARSER_XML_DECL:
if (avail < 2) if (avail < 2)
goto done; goto done;
cur = ctxt->input->cur[0]; cur = ctxt->input->cur[0];
@@ -11242,9 +11096,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
xmlHaltParser(ctxt); xmlHaltParser(ctxt);
return(0); return(0);
} }
if ((ctxt->encoding == NULL) &&
(ctxt->input->encoding != NULL))
ctxt->encoding = xmlStrdup(ctxt->input->encoding);
if ((ctxt->sax) && (ctxt->sax->startDocument) && if ((ctxt->sax) && (ctxt->sax->startDocument) &&
(!ctxt->disableSAX)) (!ctxt->disableSAX))
ctxt->sax->startDocument(ctxt->userData); ctxt->sax->startDocument(ctxt->userData);
@@ -11978,13 +11829,6 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data,
xmlBufResetInput(inputStream->buf->buffer, inputStream); xmlBufResetInput(inputStream->buf->buffer, inputStream);
inputPush(ctxt, inputStream); inputPush(ctxt, inputStream);
/*
* If the caller didn't provide an initial 'chunk' for determining
* the encoding, we set the context to XML_CHAR_ENCODING_NONE so
* that it can be automatically determined later
*/
ctxt->charset = XML_CHAR_ENCODING_NONE;
if ((size != 0) && (chunk != NULL) && if ((size != 0) && (chunk != NULL) &&
(ctxt->input != NULL) && (ctxt->input->buf != NULL)) { (ctxt->input != NULL) && (ctxt->input->buf != NULL)) {
size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
@@ -12092,7 +11936,6 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
xmlDtdPtr ret = NULL; xmlDtdPtr ret = NULL;
xmlParserCtxtPtr ctxt; xmlParserCtxtPtr ctxt;
xmlParserInputPtr pinput = NULL; xmlParserInputPtr pinput = NULL;
xmlChar start[4];
if (input == NULL) if (input == NULL)
return(NULL); return(NULL);
@@ -12150,22 +11993,7 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none", ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none",
BAD_CAST "none", BAD_CAST "none"); BAD_CAST "none", BAD_CAST "none");
if ((enc == XML_CHAR_ENCODING_NONE) && xmlDetectEncoding(ctxt);
((ctxt->input->end - ctxt->input->cur) >= 4)) {
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none"); xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none");
@@ -12213,7 +12041,6 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID,
xmlDtdPtr ret = NULL; xmlDtdPtr ret = NULL;
xmlParserCtxtPtr ctxt; xmlParserCtxtPtr ctxt;
xmlParserInputPtr input = NULL; xmlParserInputPtr input = NULL;
xmlCharEncoding enc;
xmlChar* systemIdCanonic; xmlChar* systemIdCanonic;
if ((ExternalID == NULL) && (SystemID == NULL)) return(NULL); if ((ExternalID == NULL) && (SystemID == NULL)) return(NULL);
@@ -12258,10 +12085,8 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID,
xmlFree(systemIdCanonic); xmlFree(systemIdCanonic);
return(NULL); return(NULL);
} }
if ((ctxt->input->end - ctxt->input->cur) >= 4) {
enc = xmlDetectCharEncoding(ctxt->input->cur, 4); xmlDetectEncoding(ctxt);
xmlSwitchEncoding(ctxt, enc);
}
if (input->filename == NULL) if (input->filename == NULL)
input->filename = (char *) systemIdCanonic; input->filename = (char *) systemIdCanonic;
@@ -12399,8 +12224,6 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
xmlDocPtr newDoc; xmlDocPtr newDoc;
xmlNodePtr newRoot; xmlNodePtr newRoot;
xmlParserErrors ret = XML_ERR_OK; xmlParserErrors ret = XML_ERR_OK;
xmlChar start[4];
xmlCharEncoding enc;
if (((depth > 40) && if (((depth > 40) &&
((oldctxt == NULL) || (oldctxt->options & XML_PARSE_HUGE) == 0)) || ((oldctxt == NULL) || (oldctxt->options & XML_PARSE_HUGE) == 0)) ||
@@ -12461,22 +12284,7 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
newRoot->doc = doc; newRoot->doc = doc;
} }
/* xmlDetectEncoding(ctxt);
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
GROW;
if ((ctxt->input->end - ctxt->input->cur) >= 4) {
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
/* /*
* Parse a possible text declaration first * Parse a possible text declaration first
@@ -12963,10 +12771,6 @@ xmlParseInNodeContext(xmlNodePtr node, const char *data, int datalen,
if (doc->encoding != NULL) { if (doc->encoding != NULL) {
xmlCharEncodingHandlerPtr hdlr; xmlCharEncodingHandlerPtr hdlr;
if (ctxt->encoding != NULL)
xmlFree((xmlChar *) ctxt->encoding);
ctxt->encoding = xmlStrdup((const xmlChar *) doc->encoding);
hdlr = xmlFindCharEncodingHandler((const char *) doc->encoding); hdlr = xmlFindCharEncodingHandler((const char *) doc->encoding);
if (hdlr != NULL) { if (hdlr != NULL) {
xmlSwitchToEncoding(ctxt, hdlr); xmlSwitchToEncoding(ctxt, hdlr);
@@ -14273,7 +14077,6 @@ xmlCtxtReset(xmlParserCtxtPtr ctxt)
ctxt->inSubset = 0; ctxt->inSubset = 0;
ctxt->errNo = XML_ERR_OK; ctxt->errNo = XML_ERR_OK;
ctxt->depth = 0; ctxt->depth = 0;
ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->catalogs = NULL; ctxt->catalogs = NULL;
ctxt->sizeentities = 0; ctxt->sizeentities = 0;
ctxt->sizeentcopy = 0; ctxt->sizeentcopy = 0;
@@ -14374,10 +14177,6 @@ xmlCtxtResetPush(xmlParserCtxtPtr ctxt, const char *chunk,
if (encoding != NULL) { if (encoding != NULL) {
xmlCharEncodingHandlerPtr hdlr; xmlCharEncodingHandlerPtr hdlr;
if (ctxt->encoding != NULL)
xmlFree((xmlChar *) ctxt->encoding);
ctxt->encoding = xmlStrdup((const xmlChar *) encoding);
hdlr = xmlFindCharEncodingHandler(encoding); hdlr = xmlFindCharEncodingHandler(encoding);
if (hdlr != NULL) { if (hdlr != NULL) {
xmlSwitchToEncoding(ctxt, hdlr); xmlSwitchToEncoding(ctxt, hdlr);

View File

@@ -765,7 +765,7 @@ xmlNextChar(xmlParserCtxtPtr ctxt)
return; return;
} }
if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) {
const unsigned char *cur; const unsigned char *cur;
unsigned char c; unsigned char c;
@@ -876,7 +876,10 @@ encoding_error:
"Input is not proper UTF-8, indicate encoding !\n%s", "Input is not proper UTF-8, indicate encoding !\n%s",
BAD_CAST buffer, NULL); BAD_CAST buffer, NULL);
} }
ctxt->charset = XML_CHAR_ENCODING_8859_1; if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
ctxt->input->flags |= XML_INPUT_8_BIT;
}
ctxt->input->cur++; ctxt->input->cur++;
return; return;
} }
@@ -917,7 +920,7 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
*len = 1; *len = 1;
return(*ctxt->input->cur); return(*ctxt->input->cur);
} }
if (ctxt->charset == XML_CHAR_ENCODING_UTF8) { if ((ctxt->input->flags & XML_INPUT_8_BIT) == 0) {
/* /*
* We are supposed to handle UTF8, check it's valid * We are supposed to handle UTF8, check it's valid
* From rfc2044: encoding of the Unicode values on UTF-8: * From rfc2044: encoding of the Unicode values on UTF-8:
@@ -1040,7 +1043,10 @@ encoding_error:
"Input is not proper UTF-8, indicate encoding !\n%s", "Input is not proper UTF-8, indicate encoding !\n%s",
BAD_CAST buffer, NULL); BAD_CAST buffer, NULL);
} }
ctxt->charset = XML_CHAR_ENCODING_8859_1; if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
ctxt->input->flags |= XML_INPUT_8_BIT;
}
*len = 1; *len = 1;
return(*ctxt->input->cur); return(*ctxt->input->cur);
@@ -1073,7 +1079,8 @@ int
xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len) xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len)
{ {
if ((len == NULL) || (cur == NULL)) return(0); if ((len == NULL) || (cur == NULL)) return(0);
if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) { if ((ctxt == NULL) || (ctxt->input == NULL) ||
((ctxt->input->flags & XML_INPUT_8_BIT) == 0)) {
/* /*
* We are supposed to handle UTF8, check it's valid * We are supposed to handle UTF8, check it's valid
* From rfc2044: encoding of the Unicode values on UTF-8: * From rfc2044: encoding of the Unicode values on UTF-8:
@@ -1300,58 +1307,29 @@ xmlDetectEBCDIC(xmlParserInputPtr input) {
* @ctxt: the parser context * @ctxt: the parser context
* @enc: the encoding value (number) * @enc: the encoding value (number)
* *
* change the input functions when discovering the character encoding * Use encoding specified by enum to decode input data.
* of a given entity. *
* This function can be used to enforce the encoding of chunks passed
* to xmlParseChunk.
* *
* Returns 0 in case of success, -1 otherwise * Returns 0 in case of success, -1 otherwise
*/ */
int int
xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
{ {
xmlCharEncodingHandlerPtr handler; xmlCharEncodingHandlerPtr handler = NULL;
int check = 1;
int ret; int ret;
if (ctxt == NULL) return(-1); if ((ctxt == NULL) || (ctxt->input == NULL))
return(-1);
/*
* FIXME: The BOM shouldn't be skipped here, but in the parsing code.
*
* Note that we look for a decoded UTF-8 BOM when switching to UTF-16.
* This is mostly useless but Webkit/Chromium relies on this behavior.
* See https://bugs.chromium.org/p/chromium/issues/detail?id=1451026
*/
if ((ctxt->input != NULL) &&
(ctxt->input->consumed == 0) &&
(ctxt->input->cur != NULL) &&
(ctxt->input->cur == ctxt->input->base) &&
((enc == XML_CHAR_ENCODING_UTF8) ||
(enc == XML_CHAR_ENCODING_UTF16LE) ||
(enc == XML_CHAR_ENCODING_UTF16BE))) {
/*
* Errata on XML-1.0 June 20 2001
* Specific handling of the Byte Order Mark for
* UTF-8
*/
if ((ctxt->input->cur[0] == 0xEF) &&
(ctxt->input->cur[1] == 0xBB) &&
(ctxt->input->cur[2] == 0xBF)) {
ctxt->input->cur += 3;
}
}
switch (enc) { switch (enc) {
case XML_CHAR_ENCODING_ERROR:
__xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,
"encoding unknown\n", NULL, NULL);
return(-1);
case XML_CHAR_ENCODING_NONE: case XML_CHAR_ENCODING_NONE:
/* let's assume it's UTF-8 without the XML decl */
ctxt->charset = XML_CHAR_ENCODING_UTF8;
return(0);
case XML_CHAR_ENCODING_UTF8: case XML_CHAR_ENCODING_UTF8:
/* default encoding, no conversion should be needed */ case XML_CHAR_ENCODING_ASCII:
ctxt->charset = XML_CHAR_ENCODING_UTF8; check = 0;
return(0); break;
case XML_CHAR_ENCODING_EBCDIC: case XML_CHAR_ENCODING_EBCDIC:
handler = xmlDetectEBCDIC(ctxt->input); handler = xmlDetectEBCDIC(ctxt->input);
break; break;
@@ -1359,28 +1337,13 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
handler = xmlGetCharEncodingHandler(enc); handler = xmlGetCharEncodingHandler(enc);
break; break;
} }
if (handler == NULL) {
/* if ((check) && (handler == NULL)) {
* Default handlers. const char *name = xmlGetCharEncodingName(enc);
*/
switch (enc) {
case XML_CHAR_ENCODING_ASCII:
/* default encoding, no conversion should be needed */
ctxt->charset = XML_CHAR_ENCODING_UTF8;
return(0);
case XML_CHAR_ENCODING_8859_1:
if ((ctxt->inputNr == 1) &&
(ctxt->encoding == NULL) &&
(ctxt->input != NULL) &&
(ctxt->input->encoding != NULL)) {
ctxt->encoding = xmlStrdup(ctxt->input->encoding);
}
ctxt->charset = enc;
return(0);
default:
__xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING, __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"encoding not supported: %s\n", "encoding not supported: %s\n",
BAD_CAST xmlGetCharEncodingName(enc), NULL); BAD_CAST (name ? name : "<null>"), NULL);
/* /*
* TODO: We could recover from errors in external entities * TODO: We could recover from errors in external entities
* if we didn't stop the parser. But most callers of this * if we didn't stop the parser. But most callers of this
@@ -1389,15 +1352,13 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
xmlStopParser(ctxt); xmlStopParser(ctxt);
return(-1); return(-1);
} }
}
ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler); ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler);
if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) {
/* if ((ret >= 0) && (enc == XML_CHAR_ENCODING_NONE)) {
* on encoding conversion errors, stop the parser ctxt->input->flags &= ~XML_INPUT_HAS_ENCODING;
*/
xmlStopParser(ctxt);
ctxt->errNo = XML_I18N_CONV_FAILED;
} }
return(ret); return(ret);
} }
@@ -1407,8 +1368,9 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
* @input: the input stream * @input: the input stream
* @handler: the encoding handler * @handler: the encoding handler
* *
* change the input functions when discovering the character encoding * DEPRECATED: Internal function, don't use.
* of a given entity. *
* Use encoding handler to decode input data.
* *
* Returns 0 in case of success, -1 otherwise * Returns 0 in case of success, -1 otherwise
*/ */
@@ -1419,27 +1381,19 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
int nbchars; int nbchars;
xmlParserInputBufferPtr in; xmlParserInputBufferPtr in;
if (handler == NULL) if ((input == NULL) || (input->buf == NULL)) {
return (-1);
if (input == NULL)
return (-1);
in = input->buf;
if (in == NULL) {
xmlErrInternal(ctxt,
"static memory buffer doesn't support encoding\n", NULL);
/*
* Callers assume that the input buffer takes ownership of the
* encoding handler. xmlCharEncCloseFunc frees unregistered
* handlers and avoids a memory leak.
*/
xmlCharEncCloseFunc(handler); xmlCharEncCloseFunc(handler);
return (-1); return (-1);
} }
in = input->buf;
input->flags |= XML_INPUT_HAS_ENCODING;
input->flags &= ~XML_INPUT_8_BIT;
if (in->encoder != NULL) {
if (in->encoder == handler) if (in->encoder == handler)
return (0); return (0);
if (in->encoder != NULL) {
/* /*
* Switching encodings during parsing is a really bad idea, * Switching encodings during parsing is a really bad idea,
* but Chromium can switch between ISO-8859-1 and UTF-16 before * but Chromium can switch between ISO-8859-1 and UTF-16 before
@@ -1454,7 +1408,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
return (0); return (0);
} }
ctxt->charset = XML_CHAR_ENCODING_UTF8;
in->encoder = handler; in->encoder = handler;
/* /*
@@ -1463,37 +1416,6 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
if (xmlBufIsEmpty(in->buffer) == 0) { if (xmlBufIsEmpty(in->buffer) == 0) {
size_t processed, use, consumed; size_t processed, use, consumed;
/*
* FIXME: The BOM shouldn't be skipped here, but in the parsing code.
*/
/*
* Specific handling of the Byte Order Mark for
* UTF-16
*/
if ((handler->name != NULL) &&
(!strcmp(handler->name, "UTF-16LE") ||
!strcmp(handler->name, "UTF-16")) &&
(input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) {
input->cur += 2;
}
if ((handler->name != NULL) &&
(!strcmp(handler->name, "UTF-16BE")) &&
(input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) {
input->cur += 2;
}
/*
* Errata on XML-1.0 June 20 2001
* Specific handling of the Byte Order Mark for
* UTF-8
*/
if ((handler->name != NULL) &&
(!strcmp(handler->name, "UTF-8")) &&
(input->cur[0] == 0xEF) &&
(input->cur[1] == 0xBB) && (input->cur[2] == 0xBF)) {
input->cur += 3;
}
/* /*
* Shrink the current input buffer. * Shrink the current input buffer.
* Move it as the raw buffer and create a new input buffer * Move it as the raw buffer and create a new input buffer
@@ -1541,8 +1463,10 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
* @ctxt: the parser context * @ctxt: the parser context
* @handler: the encoding handler * @handler: the encoding handler
* *
* change the input functions when discovering the character encoding * Use encoding handler to decode input data.
* of a given entity. *
* This function can be used to enforce the encoding of chunks passed
* to xmlParseChunk.
* *
* Returns 0 in case of success, -1 otherwise * Returns 0 in case of success, -1 otherwise
*/ */
@@ -1554,6 +1478,185 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
return(xmlSwitchInputEncoding(ctxt, ctxt->input, handler)); return(xmlSwitchInputEncoding(ctxt, ctxt->input, handler));
} }
/**
 * xmlDetectEncoding:
 * @ctxt:  the parser context
 *
 * Handle an optional byte order mark, then detect the encoding from the
 * first bytes of the current input and switch to it.
 *
 * If the input already has an encoding (XML_INPUT_HAS_ENCODING is set),
 * only a leading UTF-8 encoded BOM is skipped and nothing is switched.
 *
 * The function does nothing if @ctxt or its current input is NULL, if
 * growing the input fails, or if fewer than four bytes are available
 * after growing.
 */
void
xmlDetectEncoding(xmlParserCtxtPtr ctxt) {
    const xmlChar *in;
    xmlCharEncoding enc;
    int bomSize;
    int autoFlag = 0;

    if ((ctxt == NULL) || (ctxt->input == NULL))
        return;

    if (xmlParserGrow(ctxt) < 0)
        return;

    /*
     * Fetch the cursor only after growing the input. Growing may move
     * the underlying buffer (NOTE(review): relies on xmlParserGrow
     * updating ctxt->input->cur/end), so a pointer taken earlier could
     * be stale when the bytes are inspected below.
     */
    in = ctxt->input->cur;
    if (ctxt->input->end - in < 4)
        return;

    if (ctxt->input->flags & XML_INPUT_HAS_ENCODING) {
        /*
         * If the encoding was already set, only skip the BOM which was
         * possibly decoded to UTF-8.
         */
        if ((in[0] == 0xEF) && (in[1] == 0xBB) && (in[2] == 0xBF)) {
            ctxt->input->cur += 3;
        }

        return;
    }

    enc = XML_CHAR_ENCODING_NONE;
    bomSize = 0;

    /*
     * Dispatch on the first byte. Patterns without a BOM leave bomSize
     * at 0 so that the matched bytes remain in the input.
     */
    switch (in[0]) {
        case 0x00:
            if ((in[1] == 0x00) && (in[2] == 0x00) && (in[3] == 0x3C)) {
                /* 00 00 00 3C */
                enc = XML_CHAR_ENCODING_UCS4BE;
                autoFlag = XML_INPUT_AUTO_OTHER;
            } else if ((in[1] == 0x3C) && (in[2] == 0x00) && (in[3] == 0x3F)) {
                /* 00 3C 00 3F */
                enc = XML_CHAR_ENCODING_UTF16BE;
                autoFlag = XML_INPUT_AUTO_UTF16BE;
            }
            break;

        case 0x3C:
            if (in[1] == 0x00) {
                if ((in[2] == 0x00) && (in[3] == 0x00)) {
                    /* 3C 00 00 00 */
                    enc = XML_CHAR_ENCODING_UCS4LE;
                    autoFlag = XML_INPUT_AUTO_OTHER;
                } else if ((in[2] == 0x3F) && (in[3] == 0x00)) {
                    /* 3C 00 3F 00 */
                    enc = XML_CHAR_ENCODING_UTF16LE;
                    autoFlag = XML_INPUT_AUTO_UTF16LE;
                }
            }
            break;

        case 0x4C:
            if ((in[1] == 0x6F) && (in[2] == 0xA7) && (in[3] == 0x94)) {
                /* 4C 6F A7 94 */
                enc = XML_CHAR_ENCODING_EBCDIC;
                autoFlag = XML_INPUT_AUTO_OTHER;
            }
            break;

        case 0xEF:
            if ((in[1] == 0xBB) && (in[2] == 0xBF)) {
                /* UTF-8 BOM: EF BB BF */
                enc = XML_CHAR_ENCODING_UTF8;
                autoFlag = XML_INPUT_AUTO_UTF8;
                bomSize = 3;
            }
            break;

        case 0xFE:
            if (in[1] == 0xFF) {
                /* UTF-16BE BOM: FE FF */
                enc = XML_CHAR_ENCODING_UTF16BE;
                autoFlag = XML_INPUT_AUTO_UTF16BE;
                bomSize = 2;
            }
            break;

        case 0xFF:
            if (in[1] == 0xFE) {
                /* UTF-16LE BOM: FF FE */
                enc = XML_CHAR_ENCODING_UTF16LE;
                autoFlag = XML_INPUT_AUTO_UTF16LE;
                bomSize = 2;
            }
            break;

        default:
            /* No recognizable signature, leave the encoding unset. */
            break;
    }

    /* Skip the BOM before switching so it isn't fed to the decoder. */
    if (bomSize > 0) {
        ctxt->input->cur += bomSize;
    }

    if (enc != XML_CHAR_ENCODING_NONE) {
        /*
         * Record how the encoding was detected in the input flags
         * before switching to it.
         */
        ctxt->input->flags |= autoFlag;
        xmlSwitchEncoding(ctxt, enc);
    }
}
/**
 * xmlSetDeclaredEncoding:
 * @ctxt:  the parser context
 * @encoding:  declared encoding
 *
 * Record the encoding named by a declaration in the document.
 *
 * When the current input has no encoding yet and XML_PARSE_IGNORE_ENC
 * is not set, a handler for @encoding is looked up and the input is
 * switched to it; an unknown name raises XML_ERR_UNSUPPORTED_ENCODING.
 * Otherwise, if the encoding was auto-detected as UTF-8 or UTF-16, a
 * warning is emitted when the declared name doesn't match.
 *
 * Takes ownership of 'encoding': it replaces ctxt->encoding, and the
 * previous value is freed.
 */
void
xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) {
    static const char *const namesUTF8[] = {
        "UTF-8", "UTF8", NULL
    };
    static const char *const namesUTF16LE[] = {
        "UTF-16", "UTF-16LE", "UTF16", NULL
    };
    static const char *const namesUTF16BE[] = {
        "UTF-16", "UTF-16BE", "UTF16", NULL
    };
    const char *const *names;
    const char *detected;
    int autoBits;
    int i;

    /* Replace any previously stored declaration. */
    if (ctxt->encoding != NULL)
        xmlFree((xmlChar *) ctxt->encoding);
    ctxt->encoding = encoding;

    /* No encoding chosen so far: honour the declaration. */
    if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
        ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) {
        xmlCharEncodingHandlerPtr hdlr;

        hdlr = xmlFindCharEncodingHandler((const char *) encoding);
        if (hdlr == NULL) {
            __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
                             "Unsupported encoding: %s\n",
                             encoding, NULL);
            return;
        }

        xmlSwitchToEncoding(ctxt, hdlr);
        return;
    }

    /* Only auto-detected encodings are checked for a mismatch. */
    autoBits = ctxt->input->flags & XML_INPUT_AUTO_ENCODING;
    if (autoBits == 0)
        return;

    if (autoBits == XML_INPUT_AUTO_UTF8) {
        names = namesUTF8;
        detected = "UTF-8";
    } else if (autoBits == XML_INPUT_AUTO_UTF16LE) {
        names = namesUTF16LE;
        detected = "UTF-16LE";
    } else if (autoBits == XML_INPUT_AUTO_UTF16BE) {
        names = namesUTF16BE;
        detected = "UTF-16BE";
    } else {
        /* Other auto-detected encodings have no list of accepted names. */
        return;
    }

    /* Any accepted spelling (compared case-insensitively) is fine. */
    for (i = 0; names[i] != NULL; i++) {
        if (xmlStrcasecmp(encoding, BAD_CAST names[i]) == 0)
            return;
    }

    xmlWarningMsg(ctxt, XML_WAR_ENCODING_MISMATCH,
                  "Encoding '%s' doesn't match "
                  "auto-detected '%s'\n",
                  encoding, BAD_CAST detected);
}
/************************************************************************ /************************************************************************
* * * *
* Commodity functions to handle entities processing * * Commodity functions to handle entities processing *
@@ -1572,7 +1675,6 @@ xmlFreeInputStream(xmlParserInputPtr input) {
if (input->filename != NULL) xmlFree((char *) input->filename); if (input->filename != NULL) xmlFree((char *) input->filename);
if (input->directory != NULL) xmlFree((char *) input->directory); if (input->directory != NULL) xmlFree((char *) input->directory);
if (input->encoding != NULL) xmlFree((char *) input->encoding);
if (input->version != NULL) xmlFree((char *) input->version); if (input->version != NULL) xmlFree((char *) input->version);
if ((input->free != NULL) && (input->base != NULL)) if ((input->free != NULL) && (input->base != NULL))
input->free((xmlChar *) input->base); input->free((xmlChar *) input->base);
@@ -2015,7 +2117,6 @@ xmlInitSAXParserCtxt(xmlParserCtxtPtr ctxt, const xmlSAXHandler *sax,
ctxt->inSubset = 0; ctxt->inSubset = 0;
ctxt->errNo = XML_ERR_OK; ctxt->errNo = XML_ERR_OK;
ctxt->depth = 0; ctxt->depth = 0;
ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->catalogs = NULL; ctxt->catalogs = NULL;
ctxt->sizeentities = 0; ctxt->sizeentities = 0;
ctxt->sizeentcopy = 0; ctxt->sizeentcopy = 0;

View File

@@ -271,11 +271,11 @@ static int testCharRangeByte1(xmlParserCtxtPtr ctxt) {
data[3] = 0; data[3] = 0;
for (i = 0;i <= 0xFF;i++) { for (i = 0;i <= 0xFF;i++) {
data[0] = (char) i; data[0] = (char) i;
ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->nbErrors = 0; ctxt->nbErrors = 0;
lastError = 0; lastError = 0;
c = xmlCurrentChar(ctxt, &len); c = xmlCurrentChar(ctxt, &len);
ctxt->input->flags = 0;
if ((i == 0) || (i >= 0x80)) { if ((i == 0) || (i >= 0x80)) {
/* we must see an error there */ /* we must see an error there */
if (lastError != XML_ERR_INVALID_CHAR) { if (lastError != XML_ERR_INVALID_CHAR) {
@@ -307,11 +307,11 @@ static int testCharRangeByte2(xmlParserCtxtPtr ctxt) {
for (j = 0;j <= 0xFF;j++) { for (j = 0;j <= 0xFF;j++) {
data[0] = (char) i; data[0] = (char) i;
data[1] = (char) j; data[1] = (char) j;
ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->nbErrors = 0; ctxt->nbErrors = 0;
lastError = 0; lastError = 0;
c = xmlCurrentChar(ctxt, &len); c = xmlCurrentChar(ctxt, &len);
ctxt->input->flags = 0;
/* if first bit of first char is set, then second bit must too */ /* if first bit of first char is set, then second bit must too */
if ((i & 0x80) && ((i & 0x40) == 0)) { if ((i & 0x80) && ((i & 0x40) == 0)) {
@@ -401,11 +401,11 @@ static int testCharRangeByte3(xmlParserCtxtPtr ctxt) {
K = lows[k]; K = lows[k];
data[2] = (char) K; data[2] = (char) K;
value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12); value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->nbErrors = 0; ctxt->nbErrors = 0;
lastError = 0; lastError = 0;
c = xmlCurrentChar(ctxt, &len); c = xmlCurrentChar(ctxt, &len);
ctxt->input->flags = 0;
/* /*
* if fourth bit of first char is set, then the sequence would need * if fourth bit of first char is set, then the sequence would need
@@ -504,11 +504,11 @@ static int testCharRangeByte4(xmlParserCtxtPtr ctxt) {
data[3] = (char) L; data[3] = (char) L;
value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) + value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
((i & 0x7) << 18); ((i & 0x7) << 18);
ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->nbErrors = 0; ctxt->nbErrors = 0;
lastError = 0; lastError = 0;
c = xmlCurrentChar(ctxt, &len); c = xmlCurrentChar(ctxt, &len);
ctxt->input->flags = 0;
/* /*
* if fifth bit of first char is set, then the sequence would need * if fifth bit of first char is set, then the sequence would need

View File

@@ -3790,8 +3790,6 @@ xmlCheckHTTPInput(xmlParserCtxtPtr ctxt, xmlParserInputPtr ret) {
"Unknown encoding %s", "Unknown encoding %s",
BAD_CAST encoding, NULL); BAD_CAST encoding, NULL);
} }
if (ret->encoding == NULL)
ret->encoding = xmlStrdup(BAD_CAST encoding);
} }
#if 0 #if 0
} else if (xmlStrstr(BAD_CAST mime, BAD_CAST "html")) { } else if (xmlStrstr(BAD_CAST mime, BAD_CAST "html")) {