1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-26 00:37:43 +03:00

parser: Rework encoding detection

Introduce XML_INPUT_HAS_ENCODING flag for xmlParserInput which is set
when xmlSwitchEncoding is called. The parser can use the flag to
reliably detect whether an encoding was already set via user override,
BOM or other auto-detection. In this case, the encoding declaration
won't be used to switch the encoding.

Before, an inscrutable mix of ctxt->charset, ctxt->input->encoding
and ctxt->input->buf->encoder was used.

Introduce private helper functions to switch encodings used by both the
XML and HTML parser:

- xmlDetectEncoding which skips over the BOM, allowing to remove the
  BOM checks from other encoding functions.
- xmlSetDeclaredEncoding, replacing htmlCheckEncodingDirect, which warns
  about encoding mismatches.

If users override the encoding, store the declared instead of the actual
encoding in xmlDoc. In this case, the actual encoding is known and the
raw value from the doc is more useful.

Also use the input flags to store the ISO-8859-1 fallback state.
Restrict the fallback to cases where no encoding was specified. (The
fallback is only useful in recovery mode and these days broken UTF-8 is
probably more likely than ISO-8859-1, so it might eventually be removed
completely.)

The 'charset' member of xmlParserCtxt is now unused. The 'encoding'
member of xmlParserInput is now unused.

The 'standalone' member of xmlParserInput is renamed to 'flags'.

A new parser state XML_PARSER_XML_DECL is added for the push parser.
This commit is contained in:
Nick Wellnhofer
2023-08-08 15:19:46 +02:00
parent d38e73f91e
commit ec7be50662
10 changed files with 341 additions and 583 deletions

View File

@@ -63,9 +63,9 @@ struct _xmlParserInput {
int col; /* Current column */
unsigned long consumed; /* How many xmlChars already consumed */
xmlParserInputDeallocate free; /* function to deallocate the base */
const xmlChar *encoding; /* the encoding string for entity */
const xmlChar *encoding; /* unused */
const xmlChar *version; /* the version string for entity */
int standalone; /* Was that entity marked standalone */
int flags; /* Flags */
int id; /* an unique identifier for the entity */
unsigned long parentConsumed; /* consumed bytes from parents */
xmlEntityPtr entity; /* entity, if any */
@@ -122,7 +122,8 @@ typedef enum {
XML_PARSER_SYSTEM_LITERAL, /* within a SYSTEM value */
XML_PARSER_EPILOG, /* the Misc* after the last end tag */
XML_PARSER_IGNORE, /* within an IGNORED section */
XML_PARSER_PUBLIC_LITERAL /* within a PUBLIC value */
XML_PARSER_PUBLIC_LITERAL, /* within a PUBLIC value */
XML_PARSER_XML_DECL /* before XML decl (but after BOM) */
} xmlParserInputState;
/**
@@ -245,8 +246,7 @@ struct _xmlParserCtxt {
int depth; /* to prevent entity substitution loops */
xmlParserInputPtr entity; /* used to check entities boundaries */
int charset; /* encoding of the in-memory content
actually an xmlCharEncoding */
int charset; /* unused */
int nodelen; /* Those two fields are there to */
int nodemem; /* Speed up large node parsing */
int pedantic; /* signal pedantic warnings */