diff --git a/HTMLparser.c b/HTMLparser.c index 4afab358..06efbc93 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -2745,7 +2745,8 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask, guess = htmlFindEncoding(ctxt); #endif if (guess == NULL) { - xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); + xmlSwitchEncoding(ctxt, + XML_CHAR_ENCODING_WINDOWS_1252); } else { xmlSwitchEncodingName(ctxt, (const char *) guess); xmlFree(guess); @@ -3288,7 +3289,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) { guess = htmlFindEncoding(ctxt); #endif if (guess == NULL) { - xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1); + xmlSwitchEncoding(ctxt, + XML_CHAR_ENCODING_WINDOWS_1252); } else { xmlSwitchEncodingName(ctxt, (const char *) guess); xmlFree(guess); diff --git a/encoding.c b/encoding.c index 0101c051..e49d5351 100644 --- a/encoding.c +++ b/encoding.c @@ -1087,6 +1087,26 @@ xmlCreateCharEncodingHandler(const char *name, xmlCharEncFlags flags, return(XML_ERR_OK); if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) { + if (flags & XML_ENC_HTML) { + /* + * TODO: HTML5 only allows a fixed set of charset + * labels. We should add an option to enable or + * disable this restriction. + * + * TODO: Map ISO-8859-9 to windows-1254. + */ + switch (enc) { + case XML_CHAR_ENCODING_ASCII: + case XML_CHAR_ENCODING_8859_1: + enc = XML_CHAR_ENCODING_WINDOWS_1252; + break; + case XML_CHAR_ENCODING_UCS2: + case XML_CHAR_ENCODING_UTF16: + enc = XML_CHAR_ENCODING_UTF16LE; + break; + } + } + handler = &defaultHandlers[enc]; if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) && (((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) { diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index afa8db58..3299ec0f 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -119,7 +119,9 @@ typedef enum { /** Create converter for input (conversion to UTF-8) */ XML_ENC_INPUT = (1 << 0), /** Create converter for output (conversion from UTF-8) */ - XML_ENC_OUTPUT = (1 << 1) + XML_ENC_OUTPUT = (1 << 1), + /** Use HTML5 mappings */ + XML_ENC_HTML = (1 << 2) } xmlCharEncFlags; /** diff --git a/parserInternals.c b/parserInternals.c index 8ef972ec..46737add 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -1229,11 +1229,14 @@ xmlSwitchInputEncodingName(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, const char *encoding) { xmlCharEncodingHandlerPtr handler; xmlParserErrors res; + xmlCharEncFlags flags = XML_ENC_INPUT; if (encoding == NULL) return(-1); - res = xmlCreateCharEncodingHandler(encoding, XML_ENC_INPUT, + if (ctxt->html) + flags |= XML_ENC_HTML; + res = xmlCreateCharEncodingHandler(encoding, flags, ctxt->convImpl, ctxt->convCtxt, &handler); if (res == XML_ERR_UNSUPPORTED_ENCODING) { xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING, @@ -1569,14 +1572,17 @@ xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) { ((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) { xmlCharEncodingHandlerPtr handler; xmlParserErrors res; + xmlCharEncFlags flags = XML_ENC_INPUT; /* * xmlSwitchEncodingName treats unsupported encodings as * warnings, but we want it to be an error in an encoding * declaration. */ + if (ctxt->html) + flags |= XML_ENC_HTML; res = xmlCreateCharEncodingHandler((const char *) encoding, - XML_ENC_INPUT, ctxt->convImpl, ctxt->convCtxt, &handler); + flags, ctxt->convImpl, ctxt->convCtxt, &handler); if (res != XML_ERR_OK) { xmlFatalErr(ctxt, res, (const char *) encoding); xmlFree(encoding); diff --git a/result/HTML/758518-tag.html.sax b/result/HTML/758518-tag.html.sax index d94eb193..26b77c94 100644 --- a/result/HTML/758518-tag.html.sax +++ b/result/HTML/758518-tag.html.sax @@ -1,4 +1,4 @@ SAX.setDocumentLocator() SAX.startDocument() -SAX.comment(?a“) +SAX.comment(?a“) SAX.endDocument() diff --git a/result/HTML/758606_2.html.sax b/result/HTML/758606_2.html.sax index 5883bc2f..7cc1b5a8 100644 --- a/result/HTML/758606_2.html.sax +++ b/result/HTML/758606_2.html.sax @@ -1,5 +1,5 @@ SAX.setDocumentLocator() SAX.startDocument() -SAX.comment( ‘