diff --git a/HTMLparser.c b/HTMLparser.c
index 4b142349..628197f3 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -3614,6 +3614,67 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
     return(hname);
 }
 
+/**
+ * Check whether a declared encoding is ASCII-compatible.
+ *
+ * Creates an input handler for the encoding and converts an 8-byte
+ * probe of ASCII markup characters. The encoding is accepted only if
+ * the conversion succeeds and reproduces the probe byte for byte.
+ *
+ * @param ctxt  parser context
+ * @param encoding  encoding name
+ * @returns 0 if no handler was returned or the probe round-trips,
+ * -1 after reporting an error otherwise.
+ */
+static int
+htmlCharEncCheckAsciiCompatible(htmlParserCtxt *ctxt,
+                                const xmlChar *encoding) {
+    xmlCharEncodingHandler *handler;
+    /*
+     * The probe must hold real markup characters. An all-zero buffer
+     * would also pass for charsets that map NUL to NUL without being
+     * ASCII-compatible, such as EBCDIC.
+     */
+    xmlChar in[9] = "<a A=\"/>";
+    xmlChar out[9];
+    int inlen, outlen;
+    int res;
+
+    res = xmlCreateCharEncodingHandler(
+            (const char *) encoding,
+            XML_ENC_INPUT | XML_ENC_HTML,
+            ctxt->convImpl, ctxt->convCtxt,
+            &handler);
+    /*
+     * TODO: Unlike the XML parser, we shouldn't raise a fatal
+     * if the encoding is unsupported.
+     */
+    if (res != XML_ERR_OK) {
+        xmlFatalErr(ctxt, res, (const char *) encoding);
+        return(-1);
+    }
+
+    /* UTF-8 */
+    if (handler == NULL)
+        return(0);
+
+    inlen = 8;
+    outlen = 8;
+    res = xmlEncInputChunk(handler, out, &outlen, in, &inlen, /* flush */ 1);
+
+    xmlCharEncCloseFunc(handler);
+
+    if ((res != XML_ENC_ERR_SUCCESS) ||
+        (inlen != 8) || (outlen != 8) ||
+        (memcmp(in, out, 8) != 0)) {
+        htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
+                     "Encoding %s isn't ASCII-compatible", encoding, NULL);
+        return(-1);
+    }
+
+    return(0);
+}
+
 /**
  * Handle charset encoding in meta tag.
  *
@@ -3626,6 +3687,7 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
     const xmlChar *att, *value;
     int isContentType = 0;
     const xmlChar *content = NULL;
+    xmlChar *encoding = NULL;
 
     if ((ctxt == NULL) || (atts == NULL))
         return;
@@ -3639,12 +3701,10 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
             (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
             isContentType = 1;
         } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
-            xmlChar *encoding;
-
             encoding = xmlStrdup(value);
             if (encoding == NULL)
                 htmlErrMemory(ctxt);
-            xmlSetDeclaredEncoding(ctxt, encoding);
+            break;
         } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
             content = value;
         }
@@ -3652,18 +3712,28 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
         att = atts[i++];
     }
 
-    if ((isContentType) && (content != NULL)) {
+    if ((encoding == NULL) && (isContentType) && (content != NULL)) {
         htmlMetaEncodingOffsets off;
 
         if (htmlParseContentType(content, &off)) {
-            xmlChar *encoding;
-
             encoding = xmlStrndup(content + off.start, off.end - off.start);
             if (encoding == NULL)
                 htmlErrMemory(ctxt);
-            xmlSetDeclaredEncoding(ctxt, encoding);
         }
     }
+
+    if (encoding != NULL) {
+        if (htmlCharEncCheckAsciiCompatible(ctxt, encoding) < 0) {
+            xmlFree(encoding);
+            return;
+        }
+
+        /*
+         * TODO: Unlike the XML parser, we shouldn't raise a fatal
+         * if the encoding is unsupported.
+         */
+        xmlSetDeclaredEncoding(ctxt, encoding);
+    }
 }
 
 /**