1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-24 13:33:01 +03:00

parser: Rework encoding detection

Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.

Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.

Introduce private helper functions to switch encodings used by both the
XML and HTML parser:

- xmlDetectEncoding which skips over the BOM, allowing to remove the
  BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
  about encoding mismatches.

If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.

Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)

The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.

The 'standalone' member of xmlParserInput is renamed to 'flags'.

A new parser state XML_PARSER_XML_DECL is added for the push parser.
This commit is contained in:
Nick Wellnhofer
2023-08-08 15:19:46 +02:00
parent d38e73f91e
commit ec7be50662
10 changed files with 341 additions and 583 deletions

337
parser.c
View File

@@ -281,7 +281,7 @@ xmlFatalErrMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error,
*
* Handle a warning.
*/
static void LIBXML_ATTR_FORMAT(3,0)
void LIBXML_ATTR_FORMAT(3,0)
xmlWarningMsg(xmlParserCtxtPtr ctxt, xmlParserErrors error,
const char *msg, const xmlChar *str1, const xmlChar *str2)
{
@@ -2313,6 +2313,7 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) {
return;
case XML_PARSER_PROLOG:
case XML_PARSER_START:
case XML_PARSER_XML_DECL:
case XML_PARSER_MISC:
xmlFatalErr(ctxt, XML_ERR_PEREF_IN_PROLOG, NULL);
return;
@@ -6682,7 +6683,6 @@ xmlParseMarkupDecl(xmlParserCtxtPtr ctxt) {
void
xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
xmlChar *version;
const xmlChar *encoding;
int oldstate;
/*
@@ -6721,7 +6721,7 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
/*
* We must have the encoding declaration
*/
encoding = xmlParseEncodingDecl(ctxt);
xmlParseEncodingDecl(ctxt);
if (ctxt->instate == XML_PARSER_EOF)
return;
if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
@@ -6731,10 +6731,6 @@ xmlParseTextDecl(xmlParserCtxtPtr ctxt) {
ctxt->instate = oldstate;
return;
}
if ((encoding == NULL) && (ctxt->errNo == XML_ERR_OK)) {
xmlFatalErrMsg(ctxt, XML_ERR_MISSING_ENCODING,
"Missing encoding in text declaration\n");
}
SKIP_BLANKS;
if ((RAW == '?') && (NXT(1) == '>')) {
@@ -6773,21 +6769,8 @@ void
xmlParseExternalSubset(xmlParserCtxtPtr ctxt, const xmlChar *ExternalID,
const xmlChar *SystemID) {
xmlDetectSAX2(ctxt);
GROW;
if ((ctxt->encoding == NULL) &&
(ctxt->input->end - ctxt->input->cur >= 4)) {
xmlChar start[4];
xmlCharEncoding enc;
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE)
xmlSwitchEncoding(ctxt, enc);
}
xmlDetectEncoding(ctxt);
if (CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) {
xmlParseTextDecl(ctxt);
@@ -7727,8 +7710,6 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt)
"Internal: %%%s; is not a parameter entity\n",
name, NULL);
} else {
xmlChar start[4];
xmlCharEncoding enc;
unsigned long parentConsumed;
xmlEntityPtr oldEnt;
@@ -7769,28 +7750,7 @@ xmlParsePEReference(xmlParserCtxtPtr ctxt)
input->parentConsumed = parentConsumed;
if (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) {
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
* Note that, since we may have some non-UTF8
* encoding (like UTF16, bug 135229), the 'length'
* is not known, but we can calculate based upon
* the amount of data in the buffer.
*/
GROW
if (ctxt->instate == XML_PARSER_EOF)
return;
if ((ctxt->input->end - ctxt->input->cur)>=4) {
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
xmlDetectEncoding(ctxt);
if ((CMP5(CUR_PTR, '<', '?', 'x', 'm', 'l')) &&
(IS_BLANK_CH(NXT(5)))) {
@@ -10094,101 +10054,45 @@ xmlParseEncodingDecl(xmlParserCtxtPtr ctxt) {
xmlChar *encoding = NULL;
SKIP_BLANKS;
if (CMP8(CUR_PTR, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g')) {
SKIP(8);
SKIP_BLANKS;
if (RAW != '=') {
xmlFatalErr(ctxt, XML_ERR_EQUAL_REQUIRED, NULL);
return(NULL);
}
NEXT;
SKIP_BLANKS;
if (RAW == '"') {
NEXT;
encoding = xmlParseEncName(ctxt);
if (RAW != '"') {
xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL);
xmlFree((xmlChar *) encoding);
return(NULL);
} else
NEXT;
} else if (RAW == '\''){
NEXT;
encoding = xmlParseEncName(ctxt);
if (RAW != '\'') {
xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL);
xmlFree((xmlChar *) encoding);
return(NULL);
} else
NEXT;
} else {
xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL);
}
if (CMP8(CUR_PTR, 'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g') == 0)
return(NULL);
/*
* Non standard parsing, allowing the user to ignore encoding
*/
if (ctxt->options & XML_PARSE_IGNORE_ENC) {
xmlFree((xmlChar *) encoding);
return(NULL);
}
/*
* UTF-16 encoding switch has already taken place at this stage,
* more over the little-endian/big-endian selection is already done
*/
if ((encoding != NULL) &&
((!xmlStrcasecmp(encoding, BAD_CAST "UTF-16")) ||
(!xmlStrcasecmp(encoding, BAD_CAST "UTF16")))) {
/*
* If no encoding was passed to the parser, that we are
* using UTF-16 and no decoder is present i.e. the
* document is apparently UTF-8 compatible, then raise an
* encoding mismatch fatal error
*/
if ((ctxt->encoding == NULL) &&
(ctxt->input->buf != NULL) &&
(ctxt->input->buf->encoder == NULL)) {
xmlFatalErrMsg(ctxt, XML_ERR_INVALID_ENCODING,
"Document labelled UTF-16 but has UTF-8 content\n");
}
if (ctxt->encoding != NULL)
xmlFree((xmlChar *) ctxt->encoding);
ctxt->encoding = encoding;
}
/*
* UTF-8 encoding is handled natively
*/
else if ((encoding != NULL) &&
((!xmlStrcasecmp(encoding, BAD_CAST "UTF-8")) ||
(!xmlStrcasecmp(encoding, BAD_CAST "UTF8")))) {
/* TODO: Check for encoding mismatch. */
if (ctxt->encoding != NULL)
xmlFree((xmlChar *) ctxt->encoding);
ctxt->encoding = encoding;
}
else if (encoding != NULL) {
xmlCharEncodingHandlerPtr handler;
if (ctxt->input->encoding != NULL)
xmlFree((xmlChar *) ctxt->input->encoding);
ctxt->input->encoding = encoding;
handler = xmlFindCharEncodingHandler((const char *) encoding);
if (handler != NULL) {
if (xmlSwitchToEncoding(ctxt, handler) < 0) {
/* failed to convert */
ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
return(NULL);
}
} else {
xmlFatalErrMsgStr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
"Unsupported encoding %s\n", encoding);
return(NULL);
}
}
SKIP(8);
SKIP_BLANKS;
if (RAW != '=') {
xmlFatalErr(ctxt, XML_ERR_EQUAL_REQUIRED, NULL);
return(NULL);
}
return(encoding);
NEXT;
SKIP_BLANKS;
if (RAW == '"') {
NEXT;
encoding = xmlParseEncName(ctxt);
if (RAW != '"') {
xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL);
xmlFree((xmlChar *) encoding);
return(NULL);
} else
NEXT;
} else if (RAW == '\''){
NEXT;
encoding = xmlParseEncName(ctxt);
if (RAW != '\'') {
xmlFatalErr(ctxt, XML_ERR_STRING_NOT_CLOSED, NULL);
xmlFree((xmlChar *) encoding);
return(NULL);
} else
NEXT;
} else {
xmlFatalErr(ctxt, XML_ERR_STRING_NOT_STARTED, NULL);
}
if (encoding == NULL)
return(NULL);
xmlSetDeclaredEncoding(ctxt, encoding);
return(ctxt->encoding);
}
/**
@@ -10365,7 +10269,7 @@ xmlParseXMLDecl(xmlParserCtxtPtr ctxt) {
/*
* We may have the standalone status.
*/
if ((ctxt->input->encoding != NULL) && (!IS_BLANK_CH(RAW))) {
if ((ctxt->encoding != NULL) && (!IS_BLANK_CH(RAW))) {
if ((RAW == '?') && (NXT(1) == '>')) {
SKIP(2);
return;
@@ -10443,9 +10347,6 @@ xmlParseMisc(xmlParserCtxtPtr ctxt) {
int
xmlParseDocument(xmlParserCtxtPtr ctxt) {
xmlChar start[4];
xmlCharEncoding enc;
xmlInitParser();
if ((ctxt == NULL) || (ctxt->input == NULL))
@@ -10466,23 +10367,7 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
if (ctxt->instate == XML_PARSER_EOF)
return(-1);
if ((ctxt->encoding == NULL) &&
((ctxt->input->end - ctxt->input->cur) >= 4)) {
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(&start[0], 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
xmlDetectEncoding(ctxt);
if (CUR == 0) {
xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL);
@@ -10626,38 +10511,18 @@ xmlParseDocument(xmlParserCtxtPtr ctxt) {
int
xmlParseExtParsedEnt(xmlParserCtxtPtr ctxt) {
xmlChar start[4];
xmlCharEncoding enc;
if ((ctxt == NULL) || (ctxt->input == NULL))
return(-1);
xmlDetectSAX2(ctxt);
GROW;
/*
* SAX: beginning of the document processing.
*/
if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
if ((ctxt->input->end - ctxt->input->cur) >= 4) {
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
xmlDetectEncoding(ctxt);
if (CUR == 0) {
xmlFatalErr(ctxt, XML_ERR_DOCUMENT_EMPTY, NULL);
@@ -11076,6 +10941,9 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
case XML_PARSER_START:
xmlGenericError(xmlGenericErrorContext,
"PP: try START\n"); break;
case XML_PARSER_XML_DECL:
xmlGenericError(xmlGenericErrorContext,
"PP: try XML_DECL\n"); break;
case XML_PARSER_MISC:
xmlGenericError(xmlGenericErrorContext,
"PP: try MISC\n");break;
@@ -11164,39 +11032,25 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
*/
goto done;
case XML_PARSER_START:
if (ctxt->charset == XML_CHAR_ENCODING_NONE) {
xmlChar start[4];
xmlCharEncoding enc;
/*
* Very first chars read from the document flow.
*/
if (avail < 4)
goto done;
/*
* Very first chars read from the document flow.
*/
if (avail < 4)
goto done;
/*
* We need more bytes to detect EBCDIC code pages.
* See xmlDetectEBCDIC.
*/
if ((CMP4(CUR_PTR, 0x4C, 0x6F, 0xA7, 0x94)) &&
(!terminate) && (avail < 200))
goto done;
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines,
* else xmlSwitchEncoding will set to (default)
* UTF8.
*/
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
/*
* We need more bytes to detect EBCDIC code pages.
* See xmlDetectEBCDIC.
*/
if ((enc == XML_CHAR_ENCODING_EBCDIC) &&
(!terminate) && (avail < 200))
goto done;
xmlSwitchEncoding(ctxt, enc);
break;
}
xmlDetectEncoding(ctxt);
ctxt->instate = XML_PARSER_XML_DECL;
break;
case XML_PARSER_XML_DECL:
if (avail < 2)
goto done;
cur = ctxt->input->cur[0];
@@ -11242,9 +11096,6 @@ xmlParseTryOrFinish(xmlParserCtxtPtr ctxt, int terminate) {
xmlHaltParser(ctxt);
return(0);
}
if ((ctxt->encoding == NULL) &&
(ctxt->input->encoding != NULL))
ctxt->encoding = xmlStrdup(ctxt->input->encoding);
if ((ctxt->sax) && (ctxt->sax->startDocument) &&
(!ctxt->disableSAX))
ctxt->sax->startDocument(ctxt->userData);
@@ -11978,13 +11829,6 @@ xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax, void *user_data,
xmlBufResetInput(inputStream->buf->buffer, inputStream);
inputPush(ctxt, inputStream);
/*
* If the caller didn't provide an initial 'chunk' for determining
* the encoding, we set the context to XML_CHAR_ENCODING_NONE so
* that it can be automatically determined later
*/
ctxt->charset = XML_CHAR_ENCODING_NONE;
if ((size != 0) && (chunk != NULL) &&
(ctxt->input != NULL) && (ctxt->input->buf != NULL)) {
size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
@@ -12092,7 +11936,6 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
xmlDtdPtr ret = NULL;
xmlParserCtxtPtr ctxt;
xmlParserInputPtr pinput = NULL;
xmlChar start[4];
if (input == NULL)
return(NULL);
@@ -12150,22 +11993,7 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none",
BAD_CAST "none", BAD_CAST "none");
if ((enc == XML_CHAR_ENCODING_NONE) &&
((ctxt->input->end - ctxt->input->cur) >= 4)) {
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
xmlDetectEncoding(ctxt);
xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none");
@@ -12213,7 +12041,6 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID,
xmlDtdPtr ret = NULL;
xmlParserCtxtPtr ctxt;
xmlParserInputPtr input = NULL;
xmlCharEncoding enc;
xmlChar* systemIdCanonic;
if ((ExternalID == NULL) && (SystemID == NULL)) return(NULL);
@@ -12258,10 +12085,8 @@ xmlSAXParseDTD(xmlSAXHandlerPtr sax, const xmlChar *ExternalID,
xmlFree(systemIdCanonic);
return(NULL);
}
if ((ctxt->input->end - ctxt->input->cur) >= 4) {
enc = xmlDetectCharEncoding(ctxt->input->cur, 4);
xmlSwitchEncoding(ctxt, enc);
}
xmlDetectEncoding(ctxt);
if (input->filename == NULL)
input->filename = (char *) systemIdCanonic;
@@ -12399,8 +12224,6 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
xmlDocPtr newDoc;
xmlNodePtr newRoot;
xmlParserErrors ret = XML_ERR_OK;
xmlChar start[4];
xmlCharEncoding enc;
if (((depth > 40) &&
((oldctxt == NULL) || (oldctxt->options & XML_PARSE_HUGE) == 0)) ||
@@ -12461,22 +12284,7 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
newRoot->doc = doc;
}
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
GROW;
if ((ctxt->input->end - ctxt->input->cur) >= 4) {
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
xmlDetectEncoding(ctxt);
/*
* Parse a possible text declaration first
@@ -12963,10 +12771,6 @@ xmlParseInNodeContext(xmlNodePtr node, const char *data, int datalen,
if (doc->encoding != NULL) {
xmlCharEncodingHandlerPtr hdlr;
if (ctxt->encoding != NULL)
xmlFree((xmlChar *) ctxt->encoding);
ctxt->encoding = xmlStrdup((const xmlChar *) doc->encoding);
hdlr = xmlFindCharEncodingHandler((const char *) doc->encoding);
if (hdlr != NULL) {
xmlSwitchToEncoding(ctxt, hdlr);
@@ -14273,7 +14077,6 @@ xmlCtxtReset(xmlParserCtxtPtr ctxt)
ctxt->inSubset = 0;
ctxt->errNo = XML_ERR_OK;
ctxt->depth = 0;
ctxt->charset = XML_CHAR_ENCODING_UTF8;
ctxt->catalogs = NULL;
ctxt->sizeentities = 0;
ctxt->sizeentcopy = 0;
@@ -14374,10 +14177,6 @@ xmlCtxtResetPush(xmlParserCtxtPtr ctxt, const char *chunk,
if (encoding != NULL) {
xmlCharEncodingHandlerPtr hdlr;
if (ctxt->encoding != NULL)
xmlFree((xmlChar *) ctxt->encoding);
ctxt->encoding = xmlStrdup((const xmlChar *) encoding);
hdlr = xmlFindCharEncodingHandler(encoding);
if (hdlr != NULL) {
xmlSwitchToEncoding(ctxt, hdlr);