mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-24 13:33:01 +03:00
html: Map some encodings according to HTML5
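Following HTML5, content labelled ISO-8859-1 or ASCII is now decoded as Windows-1252, which matches ISO-8859-1 everywhere except the range 0x80-0x9F, where it assigns printable characters in place of C1 controls. The HTML parser's fallback when no encoding can be detected changes from ISO-8859-1 to Windows-1252 as well. Also map UCS-2 and UTF-16 to UTF-16LE.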
This commit is contained in:
@@ -2745,7 +2745,8 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
|
|||||||
guess = htmlFindEncoding(ctxt);
|
guess = htmlFindEncoding(ctxt);
|
||||||
#endif
|
#endif
|
||||||
if (guess == NULL) {
|
if (guess == NULL) {
|
||||||
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
|
xmlSwitchEncoding(ctxt,
|
||||||
|
XML_CHAR_ENCODING_WINDOWS_1252);
|
||||||
} else {
|
} else {
|
||||||
xmlSwitchEncodingName(ctxt, (const char *) guess);
|
xmlSwitchEncodingName(ctxt, (const char *) guess);
|
||||||
xmlFree(guess);
|
xmlFree(guess);
|
||||||
@@ -3288,7 +3289,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
|
|||||||
guess = htmlFindEncoding(ctxt);
|
guess = htmlFindEncoding(ctxt);
|
||||||
#endif
|
#endif
|
||||||
if (guess == NULL) {
|
if (guess == NULL) {
|
||||||
xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
|
xmlSwitchEncoding(ctxt,
|
||||||
|
XML_CHAR_ENCODING_WINDOWS_1252);
|
||||||
} else {
|
} else {
|
||||||
xmlSwitchEncodingName(ctxt, (const char *) guess);
|
xmlSwitchEncodingName(ctxt, (const char *) guess);
|
||||||
xmlFree(guess);
|
xmlFree(guess);
|
||||||
|
|||||||
20
encoding.c
20
encoding.c
@@ -1087,6 +1087,26 @@ xmlCreateCharEncodingHandler(const char *name, xmlCharEncFlags flags,
|
|||||||
return(XML_ERR_OK);
|
return(XML_ERR_OK);
|
||||||
|
|
||||||
if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
|
if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
|
||||||
|
if (flags & XML_ENC_HTML) {
|
||||||
|
/*
|
||||||
|
* TODO: HTML5 only allows a fixed set of charset
|
||||||
|
* labels. We should add an option to enable or
|
||||||
|
* disable this restriction.
|
||||||
|
*
|
||||||
|
* TODO: Map ISO-8859-9 to windows-1254.
|
||||||
|
*/
|
||||||
|
switch (enc) {
|
||||||
|
case XML_CHAR_ENCODING_ASCII:
|
||||||
|
case XML_CHAR_ENCODING_8859_1:
|
||||||
|
enc = XML_CHAR_ENCODING_WINDOWS_1252;
|
||||||
|
break;
|
||||||
|
case XML_CHAR_ENCODING_UCS2:
|
||||||
|
case XML_CHAR_ENCODING_UTF16:
|
||||||
|
enc = XML_CHAR_ENCODING_UTF16LE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
handler = &defaultHandlers[enc];
|
handler = &defaultHandlers[enc];
|
||||||
if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) &&
|
if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) &&
|
||||||
(((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) {
|
(((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) {
|
||||||
|
|||||||
@@ -119,7 +119,9 @@ typedef enum {
|
|||||||
/** Create converter for input (conversion to UTF-8) */
|
/** Create converter for input (conversion to UTF-8) */
|
||||||
XML_ENC_INPUT = (1 << 0),
|
XML_ENC_INPUT = (1 << 0),
|
||||||
/** Create converter for output (conversion from UTF-8) */
|
/** Create converter for output (conversion from UTF-8) */
|
||||||
XML_ENC_OUTPUT = (1 << 1)
|
XML_ENC_OUTPUT = (1 << 1),
|
||||||
|
/** Use HTML5 mappings */
|
||||||
|
XML_ENC_HTML = (1 << 2)
|
||||||
} xmlCharEncFlags;
|
} xmlCharEncFlags;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -1229,11 +1229,14 @@ xmlSwitchInputEncodingName(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
|
|||||||
const char *encoding) {
|
const char *encoding) {
|
||||||
xmlCharEncodingHandlerPtr handler;
|
xmlCharEncodingHandlerPtr handler;
|
||||||
xmlParserErrors res;
|
xmlParserErrors res;
|
||||||
|
xmlCharEncFlags flags = XML_ENC_INPUT;
|
||||||
|
|
||||||
if (encoding == NULL)
|
if (encoding == NULL)
|
||||||
return(-1);
|
return(-1);
|
||||||
|
|
||||||
res = xmlCreateCharEncodingHandler(encoding, XML_ENC_INPUT,
|
if (ctxt->html)
|
||||||
|
flags |= XML_ENC_HTML;
|
||||||
|
res = xmlCreateCharEncodingHandler(encoding, flags,
|
||||||
ctxt->convImpl, ctxt->convCtxt, &handler);
|
ctxt->convImpl, ctxt->convCtxt, &handler);
|
||||||
if (res == XML_ERR_UNSUPPORTED_ENCODING) {
|
if (res == XML_ERR_UNSUPPORTED_ENCODING) {
|
||||||
xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
|
xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
|
||||||
@@ -1569,14 +1572,17 @@ xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) {
|
|||||||
((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) {
|
((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) {
|
||||||
xmlCharEncodingHandlerPtr handler;
|
xmlCharEncodingHandlerPtr handler;
|
||||||
xmlParserErrors res;
|
xmlParserErrors res;
|
||||||
|
xmlCharEncFlags flags = XML_ENC_INPUT;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* xmlSwitchEncodingName treats unsupported encodings as
|
* xmlSwitchEncodingName treats unsupported encodings as
|
||||||
* warnings, but we want it to be an error in an encoding
|
* warnings, but we want it to be an error in an encoding
|
||||||
* declaration.
|
* declaration.
|
||||||
*/
|
*/
|
||||||
|
if (ctxt->html)
|
||||||
|
flags |= XML_ENC_HTML;
|
||||||
res = xmlCreateCharEncodingHandler((const char *) encoding,
|
res = xmlCreateCharEncodingHandler((const char *) encoding,
|
||||||
XML_ENC_INPUT, ctxt->convImpl, ctxt->convCtxt, &handler);
|
flags, ctxt->convImpl, ctxt->convCtxt, &handler);
|
||||||
if (res != XML_ERR_OK) {
|
if (res != XML_ERR_OK) {
|
||||||
xmlFatalErr(ctxt, res, (const char *) encoding);
|
xmlFatalErr(ctxt, res, (const char *) encoding);
|
||||||
xmlFree(encoding);
|
xmlFree(encoding);
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
SAX.setDocumentLocator()
|
SAX.setDocumentLocator()
|
||||||
SAX.startDocument()
|
SAX.startDocument()
|
||||||
SAX.comment(?a)
|
SAX.comment(?a“)
|
||||||
SAX.endDocument()
|
SAX.endDocument()
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
SAX.setDocumentLocator()
|
SAX.setDocumentLocator()
|
||||||
SAX.startDocument()
|
SAX.startDocument()
|
||||||
SAX.comment(<!dOctYPE
|
SAX.comment(‘<!dOctYPE
|
||||||
)
|
)
|
||||||
SAX.endDocument()
|
SAX.endDocument()
|
||||||
|
|||||||
Reference in New Issue
Block a user