1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-24 13:33:01 +03:00

html: Map some encodings according to HTML5

Windows-1252 is a superset of ISO-8859-1, so HTML content labeled as ISO-8859-1 should be decoded as Windows-1252 instead.
The same applies to content labeled as ASCII.

Also map UCS-2 and UTF-16 to UTF-16LE.
This commit is contained in:
Nick Wellnhofer
2025-05-12 13:00:20 +02:00
parent 93f671064e
commit f0983199e8
6 changed files with 37 additions and 7 deletions

View File

@@ -2745,7 +2745,8 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
guess = htmlFindEncoding(ctxt);
#endif
if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
xmlSwitchEncoding(ctxt,
XML_CHAR_ENCODING_WINDOWS_1252);
} else {
xmlSwitchEncodingName(ctxt, (const char *) guess);
xmlFree(guess);
@@ -3288,7 +3289,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
guess = htmlFindEncoding(ctxt);
#endif
if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
xmlSwitchEncoding(ctxt,
XML_CHAR_ENCODING_WINDOWS_1252);
} else {
xmlSwitchEncodingName(ctxt, (const char *) guess);
xmlFree(guess);

View File

@@ -1087,6 +1087,26 @@ xmlCreateCharEncodingHandler(const char *name, xmlCharEncFlags flags,
return(XML_ERR_OK);
if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
if (flags & XML_ENC_HTML) {
/*
* TODO: HTML5 only allows a fixed set of charset
* labels. We should add an option to enable or
* disable this restriction.
*
* TODO: Map ISO-8859-9 to windows-1254.
*/
switch (enc) {
case XML_CHAR_ENCODING_ASCII:
case XML_CHAR_ENCODING_8859_1:
enc = XML_CHAR_ENCODING_WINDOWS_1252;
break;
case XML_CHAR_ENCODING_UCS2:
case XML_CHAR_ENCODING_UTF16:
enc = XML_CHAR_ENCODING_UTF16LE;
break;
}
}
handler = &defaultHandlers[enc];
if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) &&
(((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) {

View File

@@ -119,7 +119,9 @@ typedef enum {
/** Create converter for input (conversion to UTF-8) */
XML_ENC_INPUT = (1 << 0),
/** Create converter for output (conversion from UTF-8) */
XML_ENC_OUTPUT = (1 << 1)
XML_ENC_OUTPUT = (1 << 1),
/** Use HTML5 mappings */
XML_ENC_HTML = (1 << 2)
} xmlCharEncFlags;
/**

View File

@@ -1229,11 +1229,14 @@ xmlSwitchInputEncodingName(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
const char *encoding) {
xmlCharEncodingHandlerPtr handler;
xmlParserErrors res;
xmlCharEncFlags flags = XML_ENC_INPUT;
if (encoding == NULL)
return(-1);
res = xmlCreateCharEncodingHandler(encoding, XML_ENC_INPUT,
if (ctxt->html)
flags |= XML_ENC_HTML;
res = xmlCreateCharEncodingHandler(encoding, flags,
ctxt->convImpl, ctxt->convCtxt, &handler);
if (res == XML_ERR_UNSUPPORTED_ENCODING) {
xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
@@ -1569,14 +1572,17 @@ xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) {
((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) {
xmlCharEncodingHandlerPtr handler;
xmlParserErrors res;
xmlCharEncFlags flags = XML_ENC_INPUT;
/*
* xmlSwitchEncodingName treats unsupported encodings as
* warnings, but we want it to be an error in an encoding
* declaration.
*/
if (ctxt->html)
flags |= XML_ENC_HTML;
res = xmlCreateCharEncodingHandler((const char *) encoding,
XML_ENC_INPUT, ctxt->convImpl, ctxt->convCtxt, &handler);
flags, ctxt->convImpl, ctxt->convCtxt, &handler);
if (res != XML_ERR_OK) {
xmlFatalErr(ctxt, res, (const char *) encoding);
xmlFree(encoding);

View File

@@ -1,4 +1,4 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.comment(?a“)
SAX.comment(?a)
SAX.endDocument()

View File

@@ -1,5 +1,5 @@
SAX.setDocumentLocator()
SAX.startDocument()
SAX.comment( ‘<!dOctYPE
SAX.comment( <!dOctYPE
)
SAX.endDocument()