diff --git a/HTMLparser.c b/HTMLparser.c
index 14cc56fa..c9a64c78 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -457,7 +457,12 @@ htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
ctxt->input->encoding = guess;
handler = xmlFindCharEncodingHandler((const char *) guess);
if (handler != NULL) {
- xmlSwitchToEncoding(ctxt, handler);
+ /*
+ * Don't use UTF-8 encoder which isn't required and
+ * can produce invalid UTF-8.
+ */
+ if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
+ xmlSwitchToEncoding(ctxt, handler);
} else {
htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
"Unsupported encoding %s", guess, NULL);
@@ -570,7 +575,16 @@ encoding_error:
BAD_CAST buffer, NULL);
}
- ctxt->charset = XML_CHAR_ENCODING_8859_1;
+ /*
+ * Don't switch encodings twice. Note that if there's an encoder, we
+ * shouldn't receive invalid UTF-8 anyway.
+ *
+ * Note that if ctxt->input->buf == NULL, switching encodings is
+ * impossible, see Gitlab issue #34.
+ */
+ if ((ctxt->input->buf != NULL) &&
+ (ctxt->input->buf->encoder == NULL))
+ xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
*len = 1;
return((int) *ctxt->input->cur);
}
diff --git a/encoding.c b/encoding.c
index d67c16d9..cdff6ae7 100644
--- a/encoding.c
+++ b/encoding.c
@@ -373,6 +373,11 @@ UTF8ToUTF8(unsigned char* out, int *outlen,
if (len < 0)
return(-1);
+ /*
+ * FIXME: Conversion functions must assure valid UTF-8, so we have
+ * to check for UTF-8 validity. Preferably, this converter shouldn't
+ * be used at all.
+ */
memcpy(out, inb, len);
*outlen = len;
diff --git a/parserInternals.c b/parserInternals.c
index b0629ef3..cbcfde0e 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1153,6 +1153,11 @@ xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
* Note: this is a bit dangerous, but that's what it
* takes to use nearly compatible signature for different
* encodings.
+ *
+ * FIXME: Encoders might buffer partial byte sequences, so
+ * this probably can't work. We should return an error and
+ * make sure that callers never try to switch the encoding
+ * twice.
*/
xmlCharEncCloseFunc(input->buf->encoder);
input->buf->encoder = handler;