mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-24 13:33:01 +03:00
html: Map some encodings according to HTML5
Windows-1252 is a superset of ISO-8859-1 and should be used in its place when parsing HTML. The same applies to ASCII. Also map UCS-2 and UTF-16 to UTF-16LE.
This commit is contained in:
@@ -2745,7 +2745,8 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
|
||||
guess = htmlFindEncoding(ctxt);
|
||||
#endif
|
||||
if (guess == NULL) {
|
||||
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
|
||||
xmlSwitchEncoding(ctxt,
|
||||
XML_CHAR_ENCODING_WINDOWS_1252);
|
||||
} else {
|
||||
xmlSwitchEncodingName(ctxt, (const char *) guess);
|
||||
xmlFree(guess);
|
||||
@@ -3288,7 +3289,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
|
||||
guess = htmlFindEncoding(ctxt);
|
||||
#endif
|
||||
if (guess == NULL) {
|
||||
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
|
||||
xmlSwitchEncoding(ctxt,
|
||||
XML_CHAR_ENCODING_WINDOWS_1252);
|
||||
} else {
|
||||
xmlSwitchEncodingName(ctxt, (const char *) guess);
|
||||
xmlFree(guess);
|
||||
|
||||
20
encoding.c
20
encoding.c
@@ -1087,6 +1087,26 @@ xmlCreateCharEncodingHandler(const char *name, xmlCharEncFlags flags,
|
||||
return(XML_ERR_OK);
|
||||
|
||||
if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
|
||||
if (flags & XML_ENC_HTML) {
|
||||
/*
|
||||
* TODO: HTML5 only allows a fixed set of charset
|
||||
* labels. We should add an option to enable or
|
||||
* disable this restriction.
|
||||
*
|
||||
* TODO: Map ISO-8859-9 to windows-1254.
|
||||
*/
|
||||
switch (enc) {
|
||||
case XML_CHAR_ENCODING_ASCII:
|
||||
case XML_CHAR_ENCODING_8859_1:
|
||||
enc = XML_CHAR_ENCODING_WINDOWS_1252;
|
||||
break;
|
||||
case XML_CHAR_ENCODING_UCS2:
|
||||
case XML_CHAR_ENCODING_UTF16:
|
||||
enc = XML_CHAR_ENCODING_UTF16LE;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
handler = &defaultHandlers[enc];
|
||||
if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) &&
|
||||
(((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) {
|
||||
|
||||
@@ -119,7 +119,9 @@ typedef enum {
|
||||
/** Create converter for input (conversion to UTF-8) */
|
||||
XML_ENC_INPUT = (1 << 0),
|
||||
/** Create converter for output (conversion from UTF-8) */
|
||||
XML_ENC_OUTPUT = (1 << 1)
|
||||
XML_ENC_OUTPUT = (1 << 1),
|
||||
/** Use HTML5 mappings */
|
||||
XML_ENC_HTML = (1 << 2)
|
||||
} xmlCharEncFlags;
|
||||
|
||||
/**
|
||||
|
||||
@@ -1229,11 +1229,14 @@ xmlSwitchInputEncodingName(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
|
||||
const char *encoding) {
|
||||
xmlCharEncodingHandlerPtr handler;
|
||||
xmlParserErrors res;
|
||||
xmlCharEncFlags flags = XML_ENC_INPUT;
|
||||
|
||||
if (encoding == NULL)
|
||||
return(-1);
|
||||
|
||||
res = xmlCreateCharEncodingHandler(encoding, XML_ENC_INPUT,
|
||||
if (ctxt->html)
|
||||
flags |= XML_ENC_HTML;
|
||||
res = xmlCreateCharEncodingHandler(encoding, flags,
|
||||
ctxt->convImpl, ctxt->convCtxt, &handler);
|
||||
if (res == XML_ERR_UNSUPPORTED_ENCODING) {
|
||||
xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
|
||||
@@ -1569,14 +1572,17 @@ xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) {
|
||||
((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) {
|
||||
xmlCharEncodingHandlerPtr handler;
|
||||
xmlParserErrors res;
|
||||
xmlCharEncFlags flags = XML_ENC_INPUT;
|
||||
|
||||
/*
|
||||
* xmlSwitchEncodingName treats unsupported encodings as
|
||||
* warnings, but we want it to be an error in an encoding
|
||||
* declaration.
|
||||
*/
|
||||
if (ctxt->html)
|
||||
flags |= XML_ENC_HTML;
|
||||
res = xmlCreateCharEncodingHandler((const char *) encoding,
|
||||
XML_ENC_INPUT, ctxt->convImpl, ctxt->convCtxt, &handler);
|
||||
flags, ctxt->convImpl, ctxt->convCtxt, &handler);
|
||||
if (res != XML_ERR_OK) {
|
||||
xmlFatalErr(ctxt, res, (const char *) encoding);
|
||||
xmlFree(encoding);
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
SAX.setDocumentLocator()
|
||||
SAX.startDocument()
|
||||
SAX.comment(?a)
|
||||
SAX.comment(?a“)
|
||||
SAX.endDocument()
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
SAX.setDocumentLocator()
|
||||
SAX.startDocument()
|
||||
SAX.comment(<!dOctYPE
|
||||
SAX.comment(‘<!dOctYPE
|
||||
)
|
||||
SAX.endDocument()
|
||||
|
||||
Reference in New Issue
Block a user