diff --git a/HTMLparser.c b/HTMLparser.c
index 4afab358..06efbc93 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2745,7 +2745,8 @@ htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
guess = htmlFindEncoding(ctxt);
#endif
if (guess == NULL) {
- xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
+ xmlSwitchEncoding(ctxt,
+ XML_CHAR_ENCODING_WINDOWS_1252);
} else {
xmlSwitchEncodingName(ctxt, (const char *) guess);
xmlFree(guess);
@@ -3288,7 +3289,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
guess = htmlFindEncoding(ctxt);
#endif
if (guess == NULL) {
- xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
+ xmlSwitchEncoding(ctxt,
+ XML_CHAR_ENCODING_WINDOWS_1252);
} else {
xmlSwitchEncodingName(ctxt, (const char *) guess);
xmlFree(guess);
diff --git a/encoding.c b/encoding.c
index 0101c051..e49d5351 100644
--- a/encoding.c
+++ b/encoding.c
@@ -1087,6 +1087,26 @@ xmlCreateCharEncodingHandler(const char *name, xmlCharEncFlags flags,
return(XML_ERR_OK);
if ((enc > 0) && ((size_t) enc < NUM_DEFAULT_HANDLERS)) {
+ if (flags & XML_ENC_HTML) {
+ /*
+ * TODO: HTML5 only allows a fixed set of charset
+ * labels. We should add an option to enable or
+ * disable this restriction.
+ *
+ * TODO: Map ISO-8859-9 to windows-1254.
+ */
+ switch (enc) {
+ case XML_CHAR_ENCODING_ASCII:
+ case XML_CHAR_ENCODING_8859_1:
+ enc = XML_CHAR_ENCODING_WINDOWS_1252;
+ break;
+ case XML_CHAR_ENCODING_UCS2:
+ case XML_CHAR_ENCODING_UTF16:
+ enc = XML_CHAR_ENCODING_UTF16LE;
+ break;
+ }
+ }
+
handler = &defaultHandlers[enc];
if ((((flags & XML_ENC_INPUT) == 0) || (handler->input.func)) &&
(((flags & XML_ENC_OUTPUT) == 0) || (handler->output.func))) {
diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h
index afa8db58..3299ec0f 100644
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -119,7 +119,9 @@ typedef enum {
/** Create converter for input (conversion to UTF-8) */
XML_ENC_INPUT = (1 << 0),
/** Create converter for output (conversion from UTF-8) */
- XML_ENC_OUTPUT = (1 << 1)
+ XML_ENC_OUTPUT = (1 << 1),
+ /** Use HTML5 encoding mappings (e.g. ISO-8859-1 becomes windows-1252) */
+ XML_ENC_HTML = (1 << 2)
} xmlCharEncFlags;
/**
diff --git a/parserInternals.c b/parserInternals.c
index 8ef972ec..46737add 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1229,11 +1229,14 @@ xmlSwitchInputEncodingName(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
const char *encoding) {
xmlCharEncodingHandlerPtr handler;
xmlParserErrors res;
+ xmlCharEncFlags flags = XML_ENC_INPUT;
if (encoding == NULL)
return(-1);
- res = xmlCreateCharEncodingHandler(encoding, XML_ENC_INPUT,
+ if (ctxt->html)
+ flags |= XML_ENC_HTML;
+ res = xmlCreateCharEncodingHandler(encoding, flags,
ctxt->convImpl, ctxt->convCtxt, &handler);
if (res == XML_ERR_UNSUPPORTED_ENCODING) {
xmlWarningMsg(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
@@ -1569,14 +1572,17 @@ xmlSetDeclaredEncoding(xmlParserCtxtPtr ctxt, xmlChar *encoding) {
((ctxt->options & XML_PARSE_IGNORE_ENC) == 0)) {
xmlCharEncodingHandlerPtr handler;
xmlParserErrors res;
+ xmlCharEncFlags flags = XML_ENC_INPUT;
/*
* xmlSwitchEncodingName treats unsupported encodings as
* warnings, but we want it to be an error in an encoding
* declaration.
*/
+ if (ctxt->html)
+ flags |= XML_ENC_HTML;
res = xmlCreateCharEncodingHandler((const char *) encoding,
- XML_ENC_INPUT, ctxt->convImpl, ctxt->convCtxt, &handler);
+ flags, ctxt->convImpl, ctxt->convCtxt, &handler);
if (res != XML_ERR_OK) {
xmlFatalErr(ctxt, res, (const char *) encoding);
xmlFree(encoding);
diff --git a/result/HTML/758518-tag.html.sax b/result/HTML/758518-tag.html.sax
index d94eb193..26b77c94 100644
--- a/result/HTML/758518-tag.html.sax
+++ b/result/HTML/758518-tag.html.sax
@@ -1,4 +1,4 @@
SAX.setDocumentLocator()
SAX.startDocument()
-SAX.comment(?a)
+SAX.comment(?a“)
SAX.endDocument()
diff --git a/result/HTML/758606_2.html.sax b/result/HTML/758606_2.html.sax
index 5883bc2f..7cc1b5a8 100644
--- a/result/HTML/758606_2.html.sax
+++ b/result/HTML/758606_2.html.sax
@@ -1,5 +1,5 @@
SAX.setDocumentLocator()
SAX.startDocument()
-SAX.comment(