1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-24 13:33:01 +03:00

html: Rework meta charset handling

Don't use encoding from meta tags when serializing. Only use the value
in `doc->encoding`, matching the XML serializer. This is the actual
encoding used when parsing.

Stop modifying the input document by setting meta tags before
serializing. Meta tags are now injected during serialization.

Add full support for <meta charset=""> which is also used when adding
meta tags.

Align with HTML5 and implement the "algorithm for extracting a character
encoding from a meta element". Only modify the encoding substring in
Content-Type meta tags.

Only switch encoding once when parsing.

Fix htmlSaveFileFormat so that a NULL encoding no longer declares a
misleading UTF-8 charset.

Fixes #909.
This commit is contained in:
Nick Wellnhofer
2025-05-09 00:21:47 +02:00
parent 9aaa52fe48
commit 46f05ea4d5
15 changed files with 637 additions and 456 deletions

View File

@@ -14,6 +14,7 @@
#include <libxml/xmlsave.h>
#include <libxml/xmlwriter.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <string.h>
@@ -611,6 +612,129 @@ testHtmlIds(void) {
return 0;
}
/*
 * Attribute text shared by the Content-Type meta tags in the test
 * documents; spliced into the HTML through string literal concatenation.
 */
#define MHE "meta http-equiv=\"Content-Type\""
static int
testHtmlInsertMetaEncoding(void) {
/* We currently require a head element to be present. */
const char *html =
"<html>"
"<head></head>"
"<body>text</body>"
"</html>\n";
const char *expect =
"<html>"
"<head><meta charset=\"utf-8\"></head>"
"<body>text</body>"
"</html>\n";
htmlDocPtr doc;
xmlBufferPtr buf;
xmlSaveCtxtPtr save;
xmlChar *out;
int size, err = 0;
doc = htmlReadDoc(BAD_CAST html, NULL, NULL, HTML_PARSE_NODEFDTD);
/* xmlSave updates meta tags */
buf = xmlBufferCreate();
save = xmlSaveToBuffer(buf, "utf-8", 0);
xmlSaveDoc(save, doc);
xmlSaveClose(save);
if (!xmlStrEqual(xmlBufferContent(buf), BAD_CAST expect)) {
fprintf(stderr, "meta tag insertion failed when serializing\n");
err = 1;
}
xmlBufferFree(buf);
htmlSetMetaEncoding(doc, BAD_CAST "utf-8");
/* htmlDocDumpMemoryFormat doesn't update meta tags */
htmlDocDumpMemoryFormat(doc, &out, &size, 0);
if (!xmlStrEqual(out, BAD_CAST expect)) {
fprintf(stderr, "htmlSetMetaEncoding insertion failed\n");
err = 1;
}
xmlFree(out);
xmlFreeDoc(doc);
return err;
}
/*
 * Check that existing meta charset declarations are rewritten to
 * "utf-8", once when serializing through the xmlSave API and once after
 * calling htmlSetMetaEncoding and dumping with htmlDocDumpMemoryFormat.
 *
 * The input covers <meta charset> and Content-Type meta tags with
 * surrounding whitespace, mixed-case "ChArSeT", quoted values, and
 * malformed cases (empty value after "charset =", unterminated quotes,
 * missing charset, missing content) that are expected to stay untouched.
 *
 * Returns 0 if both outputs match the expected markup, 1 otherwise
 * (including when the input document cannot be parsed).
 */
static int
testHtmlUpdateMetaEncoding(void) {
    /* We rely on the implementation adjusting all meta tags */
    const char *html =
        "<html>\n"
        " <head>\n"
        " <meta charset=\"utf-8\">\n"
        " <meta charset=\" foo \">\n"
        " <meta charset=\"\">\n"
        " <" MHE " content=\"text/html; ChArSeT=foo\">\n"
        " <" MHE " content=\"text/html; charset = \">\n"
        " <" MHE " content=\"text/html; charset = ' foo '\">\n"
        " <" MHE " content=\"text/html; charset = ' foo \">\n"
        " <" MHE " content='text/html; charset = \" foo \"'>\n"
        " <" MHE " content='text/html; charset = \" foo '>\n"
        " <" MHE " content=\"charset ; charset = bar; baz\">\n"
        " <" MHE " content=\"text/html\">\n"
        " <" MHE " content=\"\">\n"
        " <" MHE ">\n"
        " </head>\n"
        " <body></body>\n"
        "</html>\n";
    const char *expect =
        "<html>\n"
        " <head>\n"
        " <meta charset=\"utf-8\">\n"
        " <meta charset=\" utf-8 \">\n"
        " <meta charset=\"utf-8\">\n"
        " <" MHE " content=\"text/html; ChArSeT=utf-8\">\n"
        " <" MHE " content=\"text/html; charset = \">\n"
        " <" MHE " content=\"text/html; charset = ' utf-8 '\">\n"
        " <" MHE " content=\"text/html; charset = ' foo \">\n"
        " <" MHE " content='text/html; charset = \" utf-8 \"'>\n"
        " <" MHE " content='text/html; charset = \" foo '>\n"
        " <" MHE " content=\"charset ; charset = utf-8; baz\">\n"
        " <" MHE " content=\"text/html\">\n"
        " <" MHE " content=\"\">\n"
        " <" MHE ">\n"
        " </head>\n"
        " <body></body>\n"
        "</html>\n";
    htmlDocPtr doc;
    xmlBufferPtr buf;
    xmlSaveCtxtPtr save;
    xmlChar *out;
    int size, err = 0;

    doc = htmlReadDoc(BAD_CAST html, NULL, NULL, HTML_PARSE_NODEFDTD);
    /*
     * doc->encoding is accessed directly further down, so a failed
     * parse must be reported here instead of crashing the test.
     * Nothing else has been allocated yet, so there is nothing to free.
     */
    if (doc == NULL) {
        fprintf(stderr, "htmlReadDoc failed\n");
        return 1;
    }

    /*
     * xmlSave updates meta tags. No encoding is passed explicitly;
     * presumably the serializer falls back to doc->encoding as set by
     * the parser -- NOTE(review): confirm against the serializer.
     */
    buf = xmlBufferCreate();
    save = xmlSaveToBuffer(buf, NULL, 0);
    xmlSaveDoc(save, doc);
    xmlSaveClose(save);
    if (!xmlStrEqual(xmlBufferContent(buf), BAD_CAST expect)) {
        fprintf(stderr, "meta tag update failed when serializing\n");
        err = 1;
    }
    xmlBufferFree(buf);

    /* Drop the document encoding before the second pass. */
    xmlFree((xmlChar *) doc->encoding);
    doc->encoding = NULL;

    htmlSetMetaEncoding(doc, BAD_CAST "utf-8");
    /* htmlDocDumpMemoryFormat doesn't update meta tags */
    htmlDocDumpMemoryFormat(doc, &out, &size, 0);
    if (!xmlStrEqual(out, BAD_CAST expect)) {
        fprintf(stderr, "htmlSetMetaEncoding update failed\n");
        err = 1;
    }
    xmlFree(out);

    xmlFreeDoc(doc);
    return err;
}
#ifdef LIBXML_PUSH_ENABLED
static int
testHtmlPushWithEncoding(void) {
@@ -1293,6 +1417,8 @@ main(void) {
#endif
#ifdef LIBXML_HTML_ENABLED
err |= testHtmlIds();
err |= testHtmlInsertMetaEncoding();
err |= testHtmlUpdateMetaEncoding();
#ifdef LIBXML_PUSH_ENABLED
err |= testHtmlPushWithEncoding();
#endif