1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-24 13:33:01 +03:00

html: Map some encodings according to HTML5

Windows-1252 is a superset of ISO-8859-1 and should be used instead.
The same applies to ASCII, which is now also mapped to Windows-1252.

Also map UCS-2 and UTF-16 to UTF-16LE.
This commit is contained in:
Nick Wellnhofer
2025-05-12 13:00:20 +02:00
parent 93f671064e
commit f0983199e8
6 changed files with 37 additions and 7 deletions

View File

@@ -2745,7 +2745,8 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
guess = htmlFindEncoding(ctxt); guess = htmlFindEncoding(ctxt);
#endif #endif
if (guess == NULL) { if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); xmlSwitchEncoding(ctxt,
XML_CHAR_ENCODING_WINDOWS_1252);
} else { } else {
xmlSwitchEncodingName(ctxt, (const char *) guess); xmlSwitchEncodingName(ctxt, (const char *) guess);
xmlFree(guess); xmlFree(guess);
@@ -3288,7 +3289,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
guess = htmlFindEncoding(ctxt); guess = htmlFindEncoding(ctxt);
#endif #endif
if (guess == NULL) { if (guess == NULL) {
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); xmlSwitchEncoding(ctxt,
XML_CHAR_ENCODING_WINDOWS_1252);
} else { } else {
xmlSwitchEncodingName(ctxt, (const char *) guess); xmlSwitchEncodingName(ctxt, (const char *) guess);
xmlFree(guess); xmlFree(guess);

View File

@@ -1087,6 +1087,26 @@ xmlCreateCharEncodingHandler(const char *name, xmlCharEncFlags flags,
return(XML_ERR_OK); return(XML_ERR_OK);
if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) { if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
if (flags & XML_ENC_HTML) {
/*
* TODO: HTML5 only allows a fixed set of charset
* labels. We should add an option to enable or
* disable this restriction.
*
* TODO: Map ISO-8859-9 to windows-1254.
*/
switch (enc) {
case XML_CHAR_ENCODING_ASCII:
case XML_CHAR_ENCODING_8859_1:
enc = XML_CHAR_ENCODING_WINDOWS_1252;
break;
case XML_CHAR_ENCODING_UCS2:
case XML_CHAR_ENCODING_UTF16:
enc = XML_CHAR_ENCODING_UTF16LE;
break;
}
}
handler = &defaultHandlers[enc]; handler = &defaultHandlers[enc];
if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) && if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) &&
(((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) { (((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) {

View File

@@ -119,7 +119,9 @@ typedef enum {
/** Create converter for input (conversion to UTF-8) */ /** Create converter for input (conversion to UTF-8) */
XML_ENC_INPUT = (1 << 0), XML_ENC_INPUT = (1 << 0),
/** Create converter for output (conversion from UTF-8) */ /** Create converter for output (conversion from UTF-8) */
XML_ENC_OUTPUT = (1 << 1) XML_ENC_OUTPUT = (1 << 1),
/** Use HTML5 mappings */
XML_ENC_HTML = (1 << 2)
} xmlCharEncFlags; } xmlCharEncFlags;
/** /**

View File

@@ -1229,11 +1229,14 @@ xmlSwitchInputEncodingName(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
const char *encoding) { const char *encoding) {
xmlCharEncodingHandlerPtr handler; xmlCharEncodingHandlerPtr handler;
xmlParserErrors res; xmlParserErrors res;
xmlCharEncFlags flags = XML_ENC_INPUT;
if (encoding == NULL) if (encoding == NULL)
return(-1); return(-1);
res = xmlCreateCharEncodingHandler(encoding, XML_ENC_INPUT, if (ctxt->html)
flags |= XML_ENC_HTML;
res = xmlCreateCharEncodingHandler(encoding, flags,
ctxt->convImpl, ctxt->convCtxt, &handler); ctxt->convImpl, ctxt->convCtxt, &handler);
if (res == XML_ERR_UNSUPPORTED_ENCODING) { if (res == XML_ERR_UNSUPPORTED_ENCODING) {
xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING, xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
@@ -1569,14 +1572,17 @@ xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) {
((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) { ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) {
xmlCharEncodingHandlerPtr handler; xmlCharEncodingHandlerPtr handler;
xmlParserErrors res; xmlParserErrors res;
xmlCharEncFlags flags = XML_ENC_INPUT;
/* /*
* xmlSwitchEncodingName treats unsupported encodings as * xmlSwitchEncodingName treats unsupported encodings as
* warnings, but we want it to be an error in an encoding * warnings, but we want it to be an error in an encoding
* declaration. * declaration.
*/ */
if (ctxt->html)
flags |= XML_ENC_HTML;
res = xmlCreateCharEncodingHandler((const char *) encoding, res = xmlCreateCharEncodingHandler((const char *) encoding,
XML_ENC_INPUT, ctxt->convImpl, ctxt->convCtxt, &handler); flags, ctxt->convImpl, ctxt->convCtxt, &handler);
if (res != XML_ERR_OK) { if (res != XML_ERR_OK) {
xmlFatalErr(ctxt, res, (const char *) encoding); xmlFatalErr(ctxt, res, (const char *) encoding);
xmlFree(encoding); xmlFree(encoding);

View File

@@ -1,4 +1,4 @@
SAX.setDocumentLocator() SAX.setDocumentLocator()
SAX.startDocument() SAX.startDocument()
SAX.comment(?a“) SAX.comment(?a)
SAX.endDocument() SAX.endDocument()

View File

@@ -1,5 +1,5 @@
SAX.setDocumentLocator() SAX.setDocumentLocator()
SAX.startDocument() SAX.startDocument()
SAX.comment( ‘<!dOctYPE SAX.comment( <!dOctYPE
) )
SAX.endDocument() SAX.endDocument()