mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-23 01:52:48 +03:00
parser: Rework encoding detection
Introduce the XML_INPUT_HAS_ENCODING flag for xmlParserInput, which is set when xmlSwitchEncoding is called. The parser can use the flag to reliably detect whether an encoding was already set via user override, BOM or other auto-detection. In this case, the encoding declaration won't be used to switch the encoding.

Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding and ctxt->input->buf->encoder was used.

Introduce private helper functions to switch encodings, used by both the XML and HTML parsers:

- xmlDetectEncoding, which skips over the BOM, making it possible to remove the BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns about encoding mismatches.

If users override the encoding, store the declared encoding instead of the actual encoding in xmlDoc. In this case, the actual encoding is known and the raw value from the doc is more useful.

Also use the input flags to store the ISO-8859-1 fallback state. Restrict the fallback to cases where no encoding was specified. (The fallback is only useful in recovery mode and these days broken UTF-8 is probably more likely than ISO-8859-1, so it might eventually be removed completely.)

The 'charset' member of xmlParserCtxt is now unused. The 'encoding' member of xmlParserInput is now unused. The 'standalone' member of xmlParserInput is renamed to 'flags'.

A new parser state XML_PARSER_XML_DECL is added for the push parser.
This commit is contained in:
@@ -271,11 +271,11 @@ static int testCharRangeByte1(xmlParserCtxtPtr ctxt) {
|
||||
data[3] = 0;
|
||||
for (i = 0;i <= 0xFF;i++) {
|
||||
data[0] = (char) i;
|
||||
ctxt->charset = XML_CHAR_ENCODING_UTF8;
|
||||
ctxt->nbErrors = 0;
|
||||
|
||||
lastError = 0;
|
||||
c = xmlCurrentChar(ctxt, &len);
|
||||
ctxt->input->flags = 0;
|
||||
if ((i == 0) || (i >= 0x80)) {
|
||||
/* we must see an error there */
|
||||
if (lastError != XML_ERR_INVALID_CHAR) {
|
||||
@@ -307,11 +307,11 @@ static int testCharRangeByte2(xmlParserCtxtPtr ctxt) {
|
||||
for (j = 0;j <= 0xFF;j++) {
|
||||
data[0] = (char) i;
|
||||
data[1] = (char) j;
|
||||
ctxt->charset = XML_CHAR_ENCODING_UTF8;
|
||||
ctxt->nbErrors = 0;
|
||||
|
||||
lastError = 0;
|
||||
c = xmlCurrentChar(ctxt, &len);
|
||||
ctxt->input->flags = 0;
|
||||
|
||||
/* if first bit of first char is set, then second bit must too */
|
||||
if ((i & 0x80) && ((i & 0x40) == 0)) {
|
||||
@@ -401,11 +401,11 @@ static int testCharRangeByte3(xmlParserCtxtPtr ctxt) {
|
||||
K = lows[k];
|
||||
data[2] = (char) K;
|
||||
value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
|
||||
ctxt->charset = XML_CHAR_ENCODING_UTF8;
|
||||
ctxt->nbErrors = 0;
|
||||
|
||||
lastError = 0;
|
||||
c = xmlCurrentChar(ctxt, &len);
|
||||
ctxt->input->flags = 0;
|
||||
|
||||
/*
|
||||
* if fourth bit of first char is set, then the sequence would need
|
||||
@@ -504,11 +504,11 @@ static int testCharRangeByte4(xmlParserCtxtPtr ctxt) {
|
||||
data[3] = (char) L;
|
||||
value = (L & 0x3F) + ((K & 0x3F) << 6) + ((j & 0x3F) << 12) +
|
||||
((i & 0x7) << 18);
|
||||
ctxt->charset = XML_CHAR_ENCODING_UTF8;
|
||||
ctxt->nbErrors = 0;
|
||||
|
||||
lastError = 0;
|
||||
c = xmlCurrentChar(ctxt, &len);
|
||||
ctxt->input->flags = 0;
|
||||
|
||||
/*
|
||||
* if fifth bit of first char is set, then the sequence would need
|
||||
|
Reference in New Issue
Block a user