1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-24 13:33:01 +03:00

Add HTML parser support for HTML5 meta charset encoding declaration

For https://bugzilla.gnome.org/show_bug.cgi?id=655218

http://www.w3.org/TR/2011/WD-html5-20110525/semantics.html#the-meta-element

"""
The charset attribute specifies the character encoding used by the document.
This is a character encoding declaration. If the attribute is present in an XML
document, its value must be an ASCII case-insensitive match for the string
"UTF-8" (and the document is therefore forced to use UTF-8 as its
encoding).
"""

However, while <meta http-equiv="Content-Type" content="text/html;
charset=utf8"> works, <meta charset="utf8"> does not.

While libxml2 HTML parser is not tuned for HTML5, this is a simple
addition

Also added a testcase
This commit is contained in:
Denis Pauk
2012-05-10 15:34:57 +08:00
committed by Daniel Veillard
parent 1eabc31401
commit 868d92da89
6 changed files with 84 additions and 17 deletions

View File

@@ -727,7 +727,7 @@ static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
@@ -3435,20 +3435,19 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
}
/**
* htmlCheckEncoding:
* htmlCheckEncodingDirect:
* @ctxt: an HTML parser context
* @attvalue: the attribute value
*
* Checks an http-equiv attribute from a Meta tag to detect
* Checks an attribute value to detect
* the encoding
* If a new encoding is detected the parser is switched to decode
* it and pass UTF8
*/
static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
const xmlChar *encoding;
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
if ((ctxt == NULL) || (attvalue == NULL) ||
if ((ctxt == NULL) || (encoding == NULL) ||
(ctxt->options & HTML_PARSE_IGNORE_ENC))
return;
@@ -3456,14 +3455,6 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
if (ctxt->input->encoding != NULL)
return;
encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
if (encoding != NULL) {
encoding += 8;
} else {
encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
if (encoding != NULL)
encoding += 9;
}
if (encoding != NULL) {
xmlCharEncoding enc;
xmlCharEncodingHandlerPtr handler;
@@ -3535,6 +3526,38 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
}
}
/**
* htmlCheckEncoding:
* @ctxt: an HTML parser context
* @attvalue: the attribute value
*
* Checks an http-equiv attribute from a Meta tag to detect
* the encoding
* If a new encoding is detected the parser is switched to decode
* it and pass UTF8
*/
static void
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
const xmlChar *encoding;
if (!attvalue)
return;
encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
if (encoding != NULL) {
encoding += 7;
}
/*
* skip blank
*/
if (encoding && IS_BLANK_CH(*encoding))
encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
if (encoding && *encoding == '=') {
encoding ++;
htmlCheckEncodingDirect(ctxt, encoding);
}
}
/**
* htmlCheckMeta:
* @ctxt: an HTML parser context
@@ -3559,6 +3582,8 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
http = 1;
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
htmlCheckEncodingDirect(ctxt, value);
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
content = value;
att = atts[i++];