mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-24 13:33:01 +03:00
html: Rework meta charset handling
Don't use encoding from meta tags when serializing. Only use the value in `doc->encoding`, matching the XML serializer. This is the actual encoding used when parsing. Stop modifying the input document by setting meta tags before serializing. Meta tags are now injected during serialization. Add full support for <meta charset=""> which is also used when adding meta tags. Align with HTML5 and implement the "algorithm for extracting a character encoding from a meta element". Only modify the encoding substring in Content-Type meta tags. Only switch encoding once when parsing. Fix htmlSaveFileFormat with a NULL encoding not to declare a misleading UTF-8 charset. Fixes #909.
This commit is contained in:
126
testparser.c
126
testparser.c
@@ -14,6 +14,7 @@
|
||||
#include <libxml/xmlsave.h>
|
||||
#include <libxml/xmlwriter.h>
|
||||
#include <libxml/HTMLparser.h>
|
||||
#include <libxml/HTMLtree.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
@@ -611,6 +612,129 @@ testHtmlIds(void) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Shorthand for the leading part of a Content-Type meta tag. It is spliced
 * into the test documents below through string literal concatenation.
 */
#define MHE "meta http-equiv=\"Content-Type\""
|
||||
|
||||
static int
|
||||
testHtmlInsertMetaEncoding(void) {
|
||||
/* We currently require a head element to be present. */
|
||||
const char *html =
|
||||
"<html>"
|
||||
"<head></head>"
|
||||
"<body>text</body>"
|
||||
"</html>\n";
|
||||
const char *expect =
|
||||
"<html>"
|
||||
"<head><meta charset=\"utf-8\"></head>"
|
||||
"<body>text</body>"
|
||||
"</html>\n";
|
||||
htmlDocPtr doc;
|
||||
xmlBufferPtr buf;
|
||||
xmlSaveCtxtPtr save;
|
||||
xmlChar *out;
|
||||
int size, err = 0;
|
||||
|
||||
|
||||
doc = htmlReadDoc(BAD_CAST html, NULL, NULL, HTML_PARSE_NODEFDTD);
|
||||
|
||||
/* xmlSave updates meta tags */
|
||||
buf = xmlBufferCreate();
|
||||
save = xmlSaveToBuffer(buf, "utf-8", 0);
|
||||
xmlSaveDoc(save, doc);
|
||||
xmlSaveClose(save);
|
||||
if (!xmlStrEqual(xmlBufferContent(buf), BAD_CAST expect)) {
|
||||
fprintf(stderr, "meta tag insertion failed when serializing\n");
|
||||
err = 1;
|
||||
}
|
||||
xmlBufferFree(buf);
|
||||
|
||||
htmlSetMetaEncoding(doc, BAD_CAST "utf-8");
|
||||
/* htmlDocDumpMemoryFormat doesn't update meta tags */
|
||||
htmlDocDumpMemoryFormat(doc, &out, &size, 0);
|
||||
if (!xmlStrEqual(out, BAD_CAST expect)) {
|
||||
fprintf(stderr, "htmlSetMetaEncoding insertion failed\n");
|
||||
err = 1;
|
||||
}
|
||||
xmlFree(out);
|
||||
|
||||
xmlFreeDoc(doc);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int
|
||||
testHtmlUpdateMetaEncoding(void) {
|
||||
/* We rely on the implementation adjusting all meta tags */
|
||||
const char *html =
|
||||
"<html>\n"
|
||||
" <head>\n"
|
||||
" <meta charset=\"utf-8\">\n"
|
||||
" <meta charset=\" foo \">\n"
|
||||
" <meta charset=\"\">\n"
|
||||
" <" MHE " content=\"text/html; ChArSeT=foo\">\n"
|
||||
" <" MHE " content=\"text/html; charset = \">\n"
|
||||
" <" MHE " content=\"text/html; charset = ' foo '\">\n"
|
||||
" <" MHE " content=\"text/html; charset = ' foo \">\n"
|
||||
" <" MHE " content='text/html; charset = \" foo \"'>\n"
|
||||
" <" MHE " content='text/html; charset = \" foo '>\n"
|
||||
" <" MHE " content=\"charset ; charset = bar; baz\">\n"
|
||||
" <" MHE " content=\"text/html\">\n"
|
||||
" <" MHE " content=\"\">\n"
|
||||
" <" MHE ">\n"
|
||||
" </head>\n"
|
||||
" <body></body>\n"
|
||||
"</html>\n";
|
||||
const char *expect =
|
||||
"<html>\n"
|
||||
" <head>\n"
|
||||
" <meta charset=\"utf-8\">\n"
|
||||
" <meta charset=\" utf-8 \">\n"
|
||||
" <meta charset=\"utf-8\">\n"
|
||||
" <" MHE " content=\"text/html; ChArSeT=utf-8\">\n"
|
||||
" <" MHE " content=\"text/html; charset = \">\n"
|
||||
" <" MHE " content=\"text/html; charset = ' utf-8 '\">\n"
|
||||
" <" MHE " content=\"text/html; charset = ' foo \">\n"
|
||||
" <" MHE " content='text/html; charset = \" utf-8 \"'>\n"
|
||||
" <" MHE " content='text/html; charset = \" foo '>\n"
|
||||
" <" MHE " content=\"charset ; charset = utf-8; baz\">\n"
|
||||
" <" MHE " content=\"text/html\">\n"
|
||||
" <" MHE " content=\"\">\n"
|
||||
" <" MHE ">\n"
|
||||
" </head>\n"
|
||||
" <body></body>\n"
|
||||
"</html>\n";
|
||||
htmlDocPtr doc;
|
||||
xmlBufferPtr buf;
|
||||
xmlSaveCtxtPtr save;
|
||||
xmlChar *out;
|
||||
int size, err = 0;
|
||||
|
||||
doc = htmlReadDoc(BAD_CAST html, NULL, NULL, HTML_PARSE_NODEFDTD);
|
||||
|
||||
/* xmlSave updates meta tags */
|
||||
buf = xmlBufferCreate();
|
||||
save = xmlSaveToBuffer(buf, NULL, 0);
|
||||
xmlSaveDoc(save, doc);
|
||||
xmlSaveClose(save);
|
||||
if (!xmlStrEqual(xmlBufferContent(buf), BAD_CAST expect)) {
|
||||
fprintf(stderr, "meta tag update failed when serializing\n");
|
||||
err = 1;
|
||||
}
|
||||
xmlBufferFree(buf);
|
||||
|
||||
xmlFree((xmlChar *) doc->encoding);
|
||||
doc->encoding = NULL;
|
||||
htmlSetMetaEncoding(doc, BAD_CAST "utf-8");
|
||||
/* htmlDocDumpMemoryFormat doesn't update meta tags */
|
||||
htmlDocDumpMemoryFormat(doc, &out, &size, 0);
|
||||
if (!xmlStrEqual(out, BAD_CAST expect)) {
|
||||
fprintf(stderr, "htmlSetMetaEncoding update failed\n");
|
||||
err = 1;
|
||||
}
|
||||
xmlFree(out);
|
||||
|
||||
xmlFreeDoc(doc);
|
||||
return err;
|
||||
}
|
||||
|
||||
#ifdef LIBXML_PUSH_ENABLED
|
||||
static int
|
||||
testHtmlPushWithEncoding(void) {
|
||||
@@ -1293,6 +1417,8 @@ main(void) {
|
||||
#endif
|
||||
#ifdef LIBXML_HTML_ENABLED
|
||||
err |= testHtmlIds();
|
||||
err |= testHtmlInsertMetaEncoding();
|
||||
err |= testHtmlUpdateMetaEncoding();
|
||||
#ifdef LIBXML_PUSH_ENABLED
|
||||
err |= testHtmlPushWithEncoding();
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user