mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-21 14:53:44 +03:00
html: Rework meta charset handling
Don't use encoding from meta tags when serializing. Only use the value in `doc->encoding`, matching the XML serializer. This is the actual encoding used when parsing. Stop modifying the input document by setting meta tags before serializing. Meta tags are now injected during serialization. Add full support for <meta charset=""> which is also used when adding meta tags. Align with HTML5 and implement the "algorithm for extracting a character encoding from a meta element". Only modify the encoding substring in Content-Type meta tags. Only switch encoding once when parsing. Fix htmlSaveFileFormat with a NULL encoding not to declare a misleading UTF-8 charset. Fixes #909.
This commit is contained in:
@@ -40,7 +40,7 @@ gcc:c89:
|
||||
extends: .test
|
||||
variables:
|
||||
CONFIG: "--without-python"
|
||||
CFLAGS: "-O2 -std=c89 -D_XOPEN_SOURCE=600 -Wno-error=unused-function"
|
||||
CFLAGS: "-O2 -std=c89 -D_XOPEN_SOURCE=600 -Wno-error=unused-function -Wno-error=overlength-strings"
|
||||
|
||||
gcc:minimum:
|
||||
extends: .test
|
||||
|
90
HTMLparser.c
90
HTMLparser.c
@@ -50,10 +50,6 @@
|
||||
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
|
||||
#define HTML_PARSER_BUFFER_SIZE 100
|
||||
|
||||
#define IS_WS_HTML(c) \
|
||||
(((c) == 0x20) || \
|
||||
(((c) >= 0x09) && ((c) <= 0x0D) && ((c) != 0x0B)))
|
||||
|
||||
#define IS_HEX_DIGIT(c) \
|
||||
((IS_ASCII_DIGIT(c)) || \
|
||||
((((c) | 0x20) >= 'a') && (((c) | 0x20) <= 'f')))
|
||||
@@ -314,17 +310,15 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
|
||||
#define CUR (*ctxt->input->cur)
|
||||
|
||||
/**
|
||||
* `the` HTML parser context
|
||||
* Prescan to find encoding.
|
||||
*
|
||||
* Ty to find and encoding in the current data available in the input
|
||||
* buffer this is needed to try to switch to the proper encoding when
|
||||
* one face a character error.
|
||||
* That's an heuristic, since it's operating outside of parsing it could
|
||||
* try to use a meta which had been commented out, that's the reason it
|
||||
* should only be used in case of error, not as a default.
|
||||
* Try to find an encoding in the current data available in the input
|
||||
* buffer.
|
||||
*
|
||||
* @returns an encoding string or NULL if not found, the string need to
|
||||
* be freed
|
||||
* TODO: Implement HTML5 prescan algorithm.
|
||||
*
|
||||
* @param ctxt the HTML parser context
|
||||
* @returns an encoding string or NULL if not found
|
||||
*/
|
||||
static xmlChar *
|
||||
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
|
||||
@@ -3606,42 +3600,7 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks an http-equiv attribute from a Meta tag to detect
|
||||
* the encoding
|
||||
* If a new encoding is detected the parser is switched to decode
|
||||
* it and pass UTF8
|
||||
*
|
||||
* @param ctxt an HTML parser context
|
||||
* @param attvalue the attribute value
|
||||
*/
|
||||
static void
|
||||
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
|
||||
const xmlChar *encoding;
|
||||
xmlChar *copy;
|
||||
|
||||
if (!attvalue)
|
||||
return;
|
||||
|
||||
encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
|
||||
if (encoding != NULL) {
|
||||
encoding += 7;
|
||||
}
|
||||
/*
|
||||
* skip blank
|
||||
*/
|
||||
if (encoding && IS_WS_HTML(*encoding))
|
||||
encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
|
||||
if (encoding && *encoding == '=') {
|
||||
encoding ++;
|
||||
copy = xmlStrdup(encoding);
|
||||
if (copy == NULL)
|
||||
htmlErrMemory(ctxt);
|
||||
xmlSetDeclaredEncoding(ctxt, copy);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks an attributes from a Meta tag
|
||||
* Handle charset encoding in meta tag.
|
||||
*
|
||||
* @param ctxt an HTML parser context
|
||||
* @param atts the attributes values
|
||||
@@ -3650,7 +3609,7 @@ static void
|
||||
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
|
||||
int i;
|
||||
const xmlChar *att, *value;
|
||||
int http = 0;
|
||||
int isContentType = 0;
|
||||
const xmlChar *content = NULL;
|
||||
|
||||
if ((ctxt == NULL) || (atts == NULL))
|
||||
@@ -3663,23 +3622,33 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
|
||||
if (value != NULL) {
|
||||
if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
|
||||
(!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
|
||||
http = 1;
|
||||
isContentType = 1;
|
||||
} else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
|
||||
xmlChar *copy;
|
||||
xmlChar *encoding;
|
||||
|
||||
copy = xmlStrdup(value);
|
||||
if (copy == NULL)
|
||||
encoding = xmlStrdup(value);
|
||||
if (encoding == NULL)
|
||||
htmlErrMemory(ctxt);
|
||||
xmlSetDeclaredEncoding(ctxt, copy);
|
||||
xmlSetDeclaredEncoding(ctxt, encoding);
|
||||
} else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
|
||||
content = value;
|
||||
}
|
||||
}
|
||||
att = atts[i++];
|
||||
}
|
||||
if ((http) && (content != NULL))
|
||||
htmlCheckEncoding(ctxt, content);
|
||||
|
||||
if ((isContentType) && (content != NULL)) {
|
||||
htmlMetaEncodingOffsets off;
|
||||
|
||||
if (htmlParseContentType(content, &off)) {
|
||||
xmlChar *encoding;
|
||||
|
||||
encoding = xmlStrndup(content + off.start, off.end - off.start);
|
||||
if (encoding == NULL)
|
||||
htmlErrMemory(ctxt);
|
||||
xmlSetDeclaredEncoding(ctxt, encoding);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -3748,7 +3717,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
||||
const xmlChar **atts;
|
||||
int nbatts = 0;
|
||||
int maxatts;
|
||||
int meta = 0;
|
||||
int i;
|
||||
int discardtag = 0;
|
||||
|
||||
@@ -3763,8 +3731,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
||||
name = htmlParseHTMLName(ctxt, 0).name;
|
||||
if (name == NULL)
|
||||
return;
|
||||
if (xmlStrEqual(name, BAD_CAST"meta"))
|
||||
meta = 1;
|
||||
|
||||
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
|
||||
/*
|
||||
@@ -3960,8 +3926,10 @@ failed:
|
||||
/*
|
||||
* Handle specific association to the META tag
|
||||
*/
|
||||
if (meta)
|
||||
if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
|
||||
(strcmp((char *) name, "meta") == 0)) {
|
||||
htmlCheckMeta(ctxt, atts);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
741
HTMLtree.c
741
HTMLtree.c
@@ -25,6 +25,7 @@
|
||||
|
||||
#include "private/buf.h"
|
||||
#include "private/error.h"
|
||||
#include "private/html.h"
|
||||
#include "private/io.h"
|
||||
#include "private/save.h"
|
||||
|
||||
@@ -34,265 +35,315 @@
|
||||
* *
|
||||
************************************************************************/
|
||||
|
||||
/*
 * Describes where a character encoding name sits inside an attribute
 * of a meta element, so the name can be read or replaced in place.
 */
typedef struct {
|
||||
/* The attribute holding the encoding: `charset`, or `content` of an
 * http-equiv="Content-Type" meta tag. */
xmlAttrPtr attr; /* charset or content */
|
||||
/* Text of that attribute: the content of its single text child, or
 * an empty string if the attribute has no such child. */
const xmlChar *attrValue;
|
||||
/* Byte offsets into attrValue: `start` and `end` delimit the encoding
 * name, `size` is the length of the whole attribute value. */
htmlMetaEncodingOffsets off;
|
||||
} htmlMetaEncoding;
|
||||
|
||||
static htmlNodePtr
|
||||
htmlFindFirstChild(htmlNodePtr parent, const char *name) {
|
||||
htmlNodePtr child;
|
||||
|
||||
for (child = parent->children; child != NULL; child = child->next) {
|
||||
if ((child->type == XML_ELEMENT_NODE) &&
|
||||
(child->ns == NULL) &&
|
||||
(xmlStrcasecmp(child->name, BAD_CAST name) == 0))
|
||||
return(child);
|
||||
}
|
||||
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
static htmlNodePtr
|
||||
htmlFindHead(htmlDocPtr doc) {
|
||||
htmlNodePtr html;
|
||||
|
||||
if (doc == NULL)
|
||||
return(NULL);
|
||||
|
||||
html = htmlFindFirstChild((htmlNodePtr) doc, "html");
|
||||
if (html == NULL)
|
||||
return(NULL);
|
||||
|
||||
return(htmlFindFirstChild(html, "head"));
|
||||
}
|
||||
|
||||
int
|
||||
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
|
||||
const xmlChar *p = val;
|
||||
|
||||
while (1) {
|
||||
size_t start, end;
|
||||
|
||||
while ((*p != 'c') && (*p != 'C')) {
|
||||
if (*p == 0)
|
||||
return(0);
|
||||
p += 1;
|
||||
}
|
||||
p += 1;
|
||||
|
||||
if (xmlStrncasecmp(p, BAD_CAST "harset", 6) != 0)
|
||||
continue;
|
||||
|
||||
p += 6;
|
||||
while (IS_WS_HTML(*p)) p += 1;
|
||||
|
||||
if (*p != '=')
|
||||
continue;
|
||||
|
||||
p += 1;
|
||||
while (IS_WS_HTML(*p)) p += 1;
|
||||
|
||||
if (*p == 0)
|
||||
return(0);
|
||||
|
||||
if ((*p == '"') || (*p == '\'')) {
|
||||
int quote = *p;
|
||||
|
||||
p += 1;
|
||||
while (IS_WS_HTML(*p)) p += 1;
|
||||
|
||||
start = p - val;
|
||||
end = start;
|
||||
|
||||
while (*p != quote) {
|
||||
if (*p == 0)
|
||||
return(0);
|
||||
if (!IS_WS_HTML(*p))
|
||||
end = p + 1 - val;
|
||||
p += 1;
|
||||
}
|
||||
} else {
|
||||
start = p - val;
|
||||
|
||||
while ((*p != 0) && (*p != ';') && (!IS_WS_HTML(*p)))
|
||||
p += 1;
|
||||
|
||||
end = p - val;
|
||||
}
|
||||
|
||||
off->start = start;
|
||||
off->end = end;
|
||||
off->size = p - val + strlen((char *) p);
|
||||
|
||||
return(1);
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
static xmlAttrPtr
|
||||
htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
|
||||
xmlAttrPtr attr, contentAttr = NULL;
|
||||
int isContentType = 0;
|
||||
|
||||
if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)
|
||||
return(NULL);
|
||||
|
||||
for (attr = elem->properties; attr != NULL; attr = attr->next) {
|
||||
if (attr->ns != NULL)
|
||||
continue;
|
||||
if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) {
|
||||
*outIsContentType = 0;
|
||||
return(attr);
|
||||
}
|
||||
if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0)
|
||||
contentAttr = attr;
|
||||
if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) &&
|
||||
(attr->children != NULL) &&
|
||||
(attr->children->type == XML_TEXT_NODE) &&
|
||||
(attr->children->next == NULL) &&
|
||||
(xmlStrcasecmp(attr->children->content,
|
||||
BAD_CAST "Content-Type") == 0))
|
||||
isContentType = 1;
|
||||
}
|
||||
|
||||
if ((isContentType) && (contentAttr != NULL)) {
|
||||
*outIsContentType = 1;
|
||||
return(contentAttr);
|
||||
}
|
||||
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
static int
|
||||
htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
|
||||
xmlAttrPtr attr;
|
||||
const xmlChar *val = NULL;
|
||||
int isContentType;
|
||||
|
||||
if ((elem->type != XML_ELEMENT_NODE) ||
|
||||
(elem->ns != NULL) ||
|
||||
(xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
|
||||
return(0);
|
||||
|
||||
attr = htmlFindMetaEncodingAttr(elem, &isContentType);
|
||||
if (attr == NULL)
|
||||
return(0);
|
||||
|
||||
if ((attr->children != NULL) &&
|
||||
(attr->children->type == XML_TEXT_NODE) &&
|
||||
(attr->children->next == NULL) &&
|
||||
(attr->children->content != NULL))
|
||||
val = attr->children->content;
|
||||
else
|
||||
val = BAD_CAST "";
|
||||
|
||||
|
||||
if (!isContentType) {
|
||||
size_t size = strlen((char *) val);
|
||||
size_t start = 0;
|
||||
size_t end = size;
|
||||
|
||||
while ((start < size) && (IS_WS_HTML(val[start])))
|
||||
start += 1;
|
||||
|
||||
while ((end > 0) && (IS_WS_HTML(val[end-1])))
|
||||
end -= 1;
|
||||
|
||||
menc->attr = attr;
|
||||
menc->attrValue = val;
|
||||
menc->off.start = start;
|
||||
menc->off.end = end;
|
||||
menc->off.size = size;
|
||||
|
||||
return(1);
|
||||
} else {
|
||||
if (htmlParseContentType(val, &menc->off)) {
|
||||
menc->attr = attr;
|
||||
menc->attrValue = val;
|
||||
|
||||
return(1);
|
||||
}
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
||||
|
||||
static xmlChar *
|
||||
htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
|
||||
xmlChar *newVal, *p;
|
||||
size_t size, oldEncSize, newEncSize;
|
||||
|
||||
/*
|
||||
* The pseudo "HTML" encoding only produces ASCII.
|
||||
*/
|
||||
if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
|
||||
encoding = "ASCII";
|
||||
|
||||
oldEncSize = menc->off.end - menc->off.start;
|
||||
newEncSize = strlen((char *) encoding);
|
||||
size = menc->off.size - oldEncSize + newEncSize;
|
||||
newVal = xmlMalloc(size + 1);
|
||||
if (newVal == NULL)
|
||||
return(NULL);
|
||||
|
||||
p = newVal;
|
||||
memcpy(p, menc->attrValue, menc->off.start);
|
||||
p += menc->off.start;
|
||||
memcpy(p, encoding, newEncSize);
|
||||
p += newEncSize;
|
||||
memcpy(p, menc->attrValue + menc->off.end, menc->off.size - menc->off.end);
|
||||
newVal[size] = 0;
|
||||
|
||||
return(newVal);
|
||||
}
|
||||
|
||||
/**
|
||||
* Look up an encoding declaration in the meta tags.
|
||||
*
|
||||
* Does not support `<meta charset="">` yet. Only supports deprecated
|
||||
* `<meta http-equiv="Content-Type" content="">`.
|
||||
*
|
||||
* The returned string points into attribute content. It should be
|
||||
* copied before modifying or freeing nodes.
|
||||
* The returned string points into attribute content and can contain
|
||||
* trailing garbage. It should be copied before modifying or freeing
|
||||
* nodes.
|
||||
*
|
||||
* @param doc the document
|
||||
* @returns the encoding or NULL if not found.
|
||||
*/
|
||||
const xmlChar *
|
||||
htmlGetMetaEncoding(htmlDocPtr doc) {
|
||||
htmlNodePtr cur;
|
||||
const xmlChar *content;
|
||||
const xmlChar *encoding;
|
||||
htmlNodePtr head, node;
|
||||
|
||||
if (doc == NULL)
|
||||
return(NULL);
|
||||
cur = doc->children;
|
||||
head = htmlFindHead(doc);
|
||||
if (head == NULL)
|
||||
return(NULL);
|
||||
|
||||
/*
|
||||
* Search the html
|
||||
*/
|
||||
while (cur != NULL) {
|
||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
||||
if (xmlStrEqual(cur->name, BAD_CAST"html"))
|
||||
break;
|
||||
if (xmlStrEqual(cur->name, BAD_CAST"head"))
|
||||
goto found_head;
|
||||
if (xmlStrEqual(cur->name, BAD_CAST"meta"))
|
||||
goto found_meta;
|
||||
}
|
||||
cur = cur->next;
|
||||
for (node = head->children; node != NULL; node = node->next) {
|
||||
htmlMetaEncoding menc;
|
||||
|
||||
if (htmlParseMetaEncoding(node, &menc)) {
|
||||
/*
|
||||
* Returning a `const xmlChar *` only allows to return
|
||||
* a suffix. In http-equiv meta tags, there could be
|
||||
* more data after the charset, although it's probably
|
||||
* rare in practice.
|
||||
*/
|
||||
return(menc.attrValue + menc.off.start);
|
||||
}
|
||||
}
|
||||
if (cur == NULL)
|
||||
return(NULL);
|
||||
cur = cur->children;
|
||||
|
||||
/*
|
||||
* Search the head
|
||||
*/
|
||||
while (cur != NULL) {
|
||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
||||
if (xmlStrEqual(cur->name, BAD_CAST"head"))
|
||||
break;
|
||||
if (xmlStrEqual(cur->name, BAD_CAST"meta"))
|
||||
goto found_meta;
|
||||
}
|
||||
cur = cur->next;
|
||||
}
|
||||
if (cur == NULL)
|
||||
return(NULL);
|
||||
found_head:
|
||||
cur = cur->children;
|
||||
|
||||
/*
|
||||
* Search the meta elements
|
||||
*/
|
||||
found_meta:
|
||||
while (cur != NULL) {
|
||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
||||
if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
|
||||
xmlAttrPtr attr = cur->properties;
|
||||
int http;
|
||||
const xmlChar *value;
|
||||
|
||||
content = NULL;
|
||||
http = 0;
|
||||
while (attr != NULL) {
|
||||
if ((attr->children != NULL) &&
|
||||
(attr->children->type == XML_TEXT_NODE) &&
|
||||
(attr->children->next == NULL)) {
|
||||
value = attr->children->content;
|
||||
if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
|
||||
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
|
||||
http = 1;
|
||||
else if ((value != NULL)
|
||||
&& (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
|
||||
content = value;
|
||||
if ((http != 0) && (content != NULL))
|
||||
goto found_content;
|
||||
}
|
||||
attr = attr->next;
|
||||
}
|
||||
}
|
||||
}
|
||||
cur = cur->next;
|
||||
}
|
||||
return(NULL);
|
||||
|
||||
found_content:
|
||||
encoding = xmlStrstr(content, BAD_CAST"charset=");
|
||||
if (encoding == NULL)
|
||||
encoding = xmlStrstr(content, BAD_CAST"Charset=");
|
||||
if (encoding == NULL)
|
||||
encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
|
||||
if (encoding != NULL) {
|
||||
encoding += 8;
|
||||
} else {
|
||||
encoding = xmlStrstr(content, BAD_CAST"charset =");
|
||||
if (encoding == NULL)
|
||||
encoding = xmlStrstr(content, BAD_CAST"Charset =");
|
||||
if (encoding == NULL)
|
||||
encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
|
||||
if (encoding != NULL)
|
||||
encoding += 9;
|
||||
}
|
||||
if (encoding != NULL) {
|
||||
while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
|
||||
}
|
||||
return(encoding);
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates or updates a meta tag with an encoding declaration.
|
||||
*
|
||||
* Does not support `<meta charset="">` yet. Only supports deprecated
|
||||
* `<meta http-equiv="Content-Type" content="">`.
|
||||
*
|
||||
* NOTE: This will not change the document content encoding.
|
||||
*
|
||||
* @param doc the document
|
||||
* @param encoding the encoding string
|
||||
* @returns 0 in case of success and -1 in case of error
|
||||
* @returns 0 in case of success, 1 if no head element was found or
|
||||
* arguments are invalid and -1 if memory allocation failed.
|
||||
*/
|
||||
int
|
||||
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
|
||||
htmlNodePtr cur, meta = NULL, head = NULL;
|
||||
const xmlChar *content = NULL;
|
||||
char newcontent[100];
|
||||
htmlNodePtr head, meta;
|
||||
int found = 0;
|
||||
|
||||
newcontent[0] = 0;
|
||||
if (encoding == NULL)
|
||||
return(1);
|
||||
|
||||
if (doc == NULL)
|
||||
return(-1);
|
||||
head = htmlFindHead(doc);
|
||||
if (head == NULL)
|
||||
return(1);
|
||||
|
||||
/* html isn't a real encoding it's just libxml2 way to get entities */
|
||||
if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
|
||||
for (meta = head->children; meta != NULL; meta = meta->next) {
|
||||
htmlMetaEncoding menc;
|
||||
|
||||
if (htmlParseMetaEncoding(meta, &menc)) {
|
||||
xmlChar *newVal;
|
||||
int ret;
|
||||
|
||||
found = 1;
|
||||
|
||||
newVal = htmlUpdateMetaEncoding(&menc, (char *) encoding);
|
||||
if (newVal == NULL)
|
||||
return(-1);
|
||||
xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
|
||||
ret = xmlNodeAddContent((xmlNodePtr) menc.attr, newVal);
|
||||
xmlFree(newVal);
|
||||
|
||||
if (ret < 0)
|
||||
return(-1);
|
||||
}
|
||||
}
|
||||
|
||||
if (found)
|
||||
return(0);
|
||||
|
||||
meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
|
||||
if (meta == NULL)
|
||||
return(-1);
|
||||
|
||||
if (encoding != NULL) {
|
||||
snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
|
||||
(char *)encoding);
|
||||
newcontent[sizeof(newcontent) - 1] = 0;
|
||||
}
|
||||
|
||||
cur = doc->children;
|
||||
|
||||
/*
|
||||
* Search the html
|
||||
*/
|
||||
while (cur != NULL) {
|
||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
||||
if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
|
||||
break;
|
||||
if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
|
||||
goto found_head;
|
||||
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
|
||||
goto found_meta;
|
||||
}
|
||||
cur = cur->next;
|
||||
}
|
||||
if (cur == NULL)
|
||||
return(-1);
|
||||
cur = cur->children;
|
||||
|
||||
/*
|
||||
* Search the head
|
||||
*/
|
||||
while (cur != NULL) {
|
||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
||||
if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
|
||||
break;
|
||||
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
|
||||
head = cur->parent;
|
||||
goto found_meta;
|
||||
}
|
||||
}
|
||||
cur = cur->next;
|
||||
}
|
||||
if (cur == NULL)
|
||||
return(-1);
|
||||
found_head:
|
||||
head = cur;
|
||||
if (cur->children == NULL)
|
||||
goto create;
|
||||
cur = cur->children;
|
||||
|
||||
found_meta:
|
||||
/*
|
||||
* Search and update all the remaining the meta elements carrying
|
||||
* encoding information
|
||||
*/
|
||||
while (cur != NULL) {
|
||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
||||
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
|
||||
xmlAttrPtr attr = cur->properties;
|
||||
int http;
|
||||
const xmlChar *value;
|
||||
|
||||
content = NULL;
|
||||
http = 0;
|
||||
while (attr != NULL) {
|
||||
if ((attr->children != NULL) &&
|
||||
(attr->children->type == XML_TEXT_NODE) &&
|
||||
(attr->children->next == NULL)) {
|
||||
value = attr->children->content;
|
||||
if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
|
||||
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
|
||||
http = 1;
|
||||
else
|
||||
{
|
||||
if ((value != NULL) &&
|
||||
(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
|
||||
content = value;
|
||||
}
|
||||
if ((http != 0) && (content != NULL))
|
||||
break;
|
||||
}
|
||||
attr = attr->next;
|
||||
}
|
||||
if ((http != 0) && (content != NULL)) {
|
||||
meta = cur;
|
||||
break;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
cur = cur->next;
|
||||
}
|
||||
create:
|
||||
if (meta == NULL) {
|
||||
if ((encoding != NULL) && (head != NULL)) {
|
||||
/*
|
||||
* Create a new Meta element with the right attributes
|
||||
*/
|
||||
|
||||
meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
|
||||
if (head->children == NULL)
|
||||
xmlAddChild(head, meta);
|
||||
else
|
||||
xmlAddPrevSibling(head->children, meta);
|
||||
xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
|
||||
xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
|
||||
}
|
||||
} else {
|
||||
/* remove the meta tag if NULL is passed */
|
||||
if (encoding == NULL) {
|
||||
xmlUnlinkNode(meta);
|
||||
xmlFreeNode(meta);
|
||||
}
|
||||
/* change the document only if there is a real encoding change */
|
||||
else if (xmlStrcasestr(content, encoding) == NULL) {
|
||||
xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
|
||||
}
|
||||
if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) {
|
||||
xmlFreeNode(meta);
|
||||
return(-1);
|
||||
}
|
||||
|
||||
if (head->children == NULL)
|
||||
xmlAddChild(head, meta);
|
||||
else
|
||||
xmlAddPrevSibling(head->children, meta);
|
||||
|
||||
return(0);
|
||||
}
|
||||
@@ -383,7 +434,7 @@ htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
|
||||
outbuf->written = 0;
|
||||
|
||||
use = xmlBufUse(buf);
|
||||
htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
|
||||
htmlNodeDumpInternal(outbuf, doc, cur, NULL, format);
|
||||
if (outbuf->error)
|
||||
ret = (size_t) -1;
|
||||
else
|
||||
@@ -455,7 +506,7 @@ htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
|
||||
if (buf == NULL)
|
||||
return(-1);
|
||||
|
||||
htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
|
||||
htmlNodeDumpInternal(buf, doc, cur, NULL, format);
|
||||
|
||||
ret = xmlOutputBufferClose(buf);
|
||||
return(ret);
|
||||
@@ -479,14 +530,11 @@ htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
|
||||
* Serialize an HTML node to a memory, also returning the size of
|
||||
* the result. It's up to the caller to free the memory.
|
||||
*
|
||||
* WARNING: Uses the encoding from a deprecated meta tag, see
|
||||
* htmlGetMetaEncoding(). This is typically undesired. If no such
|
||||
* tag was found, ASCII with HTML 4.0 named character entities will
|
||||
* Uses the encoding of the document. If the document has no
|
||||
* encoding, ASCII with HTML 4.0 named character entities will
|
||||
* be used. This is inefficient compared to UTF-8 and might be
|
||||
* changed in a future version.
|
||||
*
|
||||
* Use of this function is therefore DISCOURAGED in favor of
|
||||
* htmlDocContentDumpFormatOutput().
|
||||
* @param cur the document
|
||||
* @param mem OUT: the memory pointer
|
||||
* @param size OUT: the memory length
|
||||
@@ -496,7 +544,6 @@ void
|
||||
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
|
||||
xmlOutputBufferPtr buf;
|
||||
xmlCharEncodingHandlerPtr handler = NULL;
|
||||
const char *encoding;
|
||||
|
||||
xmlInitParser();
|
||||
|
||||
@@ -507,8 +554,7 @@ htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
|
||||
if (cur == NULL)
|
||||
return;
|
||||
|
||||
encoding = (const char *) htmlGetMetaEncoding(cur);
|
||||
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
||||
if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
|
||||
return;
|
||||
buf = xmlAllocOutputBuffer(handler);
|
||||
if (buf == NULL)
|
||||
@@ -657,18 +703,19 @@ htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
|
||||
/**
|
||||
* Serialize an HTML node to an output buffer.
|
||||
*
|
||||
* Ignores `encoding` and uses the encoding of the output buffer.
|
||||
* If `encoding` is specified, it is used to create or update meta
|
||||
* tags containing the character encoding.
|
||||
*
|
||||
* @param buf the HTML buffer output
|
||||
* @param doc the document
|
||||
* @param cur the current node
|
||||
* @param encoding the encoding string (unused)
|
||||
* @param encoding the encoding string (optional)
|
||||
* @param format whether formatting newlines should be added
|
||||
*/
|
||||
void
|
||||
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
||||
xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
|
||||
int format) {
|
||||
xmlNodePtr root, parent;
|
||||
htmlNodeDumpInternal(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
|
||||
const char *encoding, int format) {
|
||||
xmlNodePtr root, parent, metaHead = NULL;
|
||||
xmlAttrPtr attr;
|
||||
const htmlElemDesc * info;
|
||||
|
||||
@@ -699,24 +746,61 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
||||
}
|
||||
break;
|
||||
|
||||
case XML_ELEMENT_NODE:
|
||||
case XML_ELEMENT_NODE: {
|
||||
htmlMetaEncoding menc;
|
||||
int isMeta = 0;
|
||||
int addMeta = 0;
|
||||
|
||||
/*
|
||||
* Some users like lxml are known to pass nodes with a corrupted
|
||||
* tree structure. Fall back to a recursive call to handle this
|
||||
* case.
|
||||
*/
|
||||
if ((cur->parent != parent) && (cur->children != NULL)) {
|
||||
htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
|
||||
htmlNodeDumpInternal(buf, doc, cur, encoding, format);
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* Get specific HTML info for that node.
|
||||
*/
|
||||
if (cur->ns == NULL)
|
||||
if (cur->ns == NULL) {
|
||||
info = htmlTagLookup(cur->name);
|
||||
else
|
||||
|
||||
if (encoding != NULL) {
|
||||
isMeta = htmlParseMetaEncoding(cur, &menc);
|
||||
|
||||
/*
|
||||
* Don't add meta tag for "HTML" encoding.
|
||||
*/
|
||||
if ((xmlStrcasecmp(BAD_CAST encoding,
|
||||
BAD_CAST "HTML") != 0) &&
|
||||
(xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) &&
|
||||
(parent != NULL) &&
|
||||
(parent->ns == NULL) &&
|
||||
(xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) &&
|
||||
(parent->parent != NULL) &&
|
||||
(parent->parent->parent == NULL) &&
|
||||
(metaHead == NULL)) {
|
||||
xmlNodePtr n;
|
||||
|
||||
metaHead = cur;
|
||||
addMeta = 1;
|
||||
|
||||
for (n = cur->children; n != NULL; n = n->next) {
|
||||
int unused;
|
||||
|
||||
if (htmlFindMetaEncodingAttr(n, &unused) != NULL) {
|
||||
metaHead = NULL;
|
||||
addMeta = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info = NULL;
|
||||
}
|
||||
|
||||
xmlOutputBufferWriteString(buf, "<");
|
||||
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
|
||||
@@ -728,7 +812,23 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
||||
xmlNsListDumpOutput(buf, cur->nsDef);
|
||||
attr = cur->properties;
|
||||
while (attr != NULL) {
|
||||
htmlAttrDumpOutput(buf, doc, attr);
|
||||
if ((!isMeta) || (attr != menc.attr)) {
|
||||
htmlAttrDumpOutput(buf, doc, attr);
|
||||
} else {
|
||||
xmlChar *newVal;
|
||||
|
||||
xmlOutputBufferWriteString(buf, " ");
|
||||
xmlOutputBufferWriteString(buf, (char *) attr->name);
|
||||
|
||||
newVal = htmlUpdateMetaEncoding(&menc, encoding);
|
||||
if (newVal == NULL) {
|
||||
buf->error = XML_ERR_NO_MEMORY;
|
||||
return;
|
||||
}
|
||||
xmlOutputBufferWriteString(buf, "=");
|
||||
xmlOutputBufferWriteQuotedString(buf, newVal);
|
||||
xmlFree(newVal);
|
||||
}
|
||||
attr = attr->next;
|
||||
}
|
||||
|
||||
@@ -740,7 +840,14 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
||||
(xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
|
||||
xmlOutputBufferWriteString(buf, ">");
|
||||
} else {
|
||||
xmlOutputBufferWriteString(buf, "></");
|
||||
if (addMeta) {
|
||||
xmlOutputBufferWriteString(buf, "><meta charset=\"");
|
||||
/* TODO: Escape */
|
||||
xmlOutputBufferWriteString(buf, encoding);
|
||||
xmlOutputBufferWriteString(buf, "\"></");
|
||||
} else {
|
||||
xmlOutputBufferWriteString(buf, "></");
|
||||
}
|
||||
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
|
||||
xmlOutputBufferWriteString(buf,
|
||||
(const char *)cur->ns->prefix);
|
||||
@@ -751,13 +858,25 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
||||
}
|
||||
} else {
|
||||
xmlOutputBufferWriteString(buf, ">");
|
||||
if ((format) && (info != NULL) && (!info->isinline) &&
|
||||
(cur->children->type != HTML_TEXT_NODE) &&
|
||||
(cur->children->type != HTML_ENTITY_REF_NODE) &&
|
||||
(cur->children != cur->last) &&
|
||||
(cur->name != NULL) &&
|
||||
(cur->name[0] != 'p')) /* p, pre, param */
|
||||
if ((format) &&
|
||||
((addMeta) ||
|
||||
((info != NULL) && (!info->isinline) &&
|
||||
(cur->children->type != HTML_TEXT_NODE) &&
|
||||
(cur->children->type != HTML_ENTITY_REF_NODE) &&
|
||||
(cur->children != cur->last) &&
|
||||
(cur->name != NULL) &&
|
||||
(cur->name[0] != 'p')))) /* p, pre, param */
|
||||
xmlOutputBufferWriteString(buf, "\n");
|
||||
if (addMeta) {
|
||||
xmlOutputBufferWriteString(buf, "<meta charset=\"");
|
||||
/* TODO: Escape */
|
||||
xmlOutputBufferWriteString(buf, encoding);
|
||||
xmlOutputBufferWriteString(buf, "\">");
|
||||
if ((format) &&
|
||||
(cur->children->type != HTML_TEXT_NODE) &&
|
||||
(cur->children->type != HTML_ENTITY_REF_NODE))
|
||||
xmlOutputBufferWriteString(buf, "\n");
|
||||
}
|
||||
parent = cur;
|
||||
cur = cur->children;
|
||||
continue;
|
||||
@@ -774,6 +893,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
case XML_ATTRIBUTE_NODE:
|
||||
htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
|
||||
@@ -862,7 +982,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
||||
if ((format) && (info != NULL) && (!info->isinline) &&
|
||||
(cur->last->type != HTML_TEXT_NODE) &&
|
||||
(cur->last->type != HTML_ENTITY_REF_NODE) &&
|
||||
(cur->children != cur->last) &&
|
||||
((cur->children != cur->last) || (cur == metaHead)) &&
|
||||
(cur->name != NULL) &&
|
||||
(cur->name[0] != 'p')) /* p, pre, param */
|
||||
xmlOutputBufferWriteString(buf, "\n");
|
||||
@@ -884,32 +1004,48 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
||||
(parent->name[0] != 'p')) /* p, pre, param */
|
||||
xmlOutputBufferWriteString(buf, "\n");
|
||||
}
|
||||
|
||||
if (cur == metaHead)
|
||||
metaHead = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize an HTML node to an output buffer.
|
||||
*
|
||||
* @param buf the HTML buffer output
|
||||
* @param doc the document
|
||||
* @param cur the current node
|
||||
* @param encoding the encoding string (unused)
|
||||
* @param format whether formatting newlines should be added
|
||||
*/
|
||||
void
|
||||
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
|
||||
const char *encoding ATTRIBUTE_UNUSED, int format) {
|
||||
htmlNodeDumpInternal(buf, doc, cur, NULL, format);
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is
|
||||
* typically undesired. Use of this function is DISCOURAGED in favor
|
||||
* of htmlNodeDumpFormatOutput().
|
||||
*
|
||||
* Ignores `encoding` and uses the encoding of the output buffer.
|
||||
* @param buf the HTML buffer output
|
||||
* @param doc the document
|
||||
* @param cur the current node
|
||||
* @param encoding the encoding string (unused)
|
||||
*/
|
||||
void
|
||||
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
||||
xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
|
||||
htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
|
||||
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
|
||||
const char *encoding ATTRIBUTE_UNUSED) {
|
||||
htmlNodeDumpInternal(buf, doc, cur, NULL, 1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize an HTML document to an output buffer.
|
||||
*
|
||||
* Ignores `encoding` and uses the encoding of the output buffer.
|
||||
* @param buf the HTML buffer output
|
||||
* @param cur the document
|
||||
* @param encoding the encoding string (unused)
|
||||
@@ -919,31 +1055,14 @@ void
|
||||
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
||||
const char *encoding ATTRIBUTE_UNUSED,
|
||||
int format) {
|
||||
int type = 0;
|
||||
|
||||
/*
|
||||
* This is needed when serializing XML documents as HTML.
|
||||
* xmlEncodeEntitiesReentrant uses the document type to
|
||||
* determine the serialization mode.
|
||||
*
|
||||
* Once we call more low-level functions directly with
|
||||
* HTML flags, this hack can be removed.
|
||||
*/
|
||||
if (cur) {
|
||||
type = cur->type;
|
||||
cur->type = XML_HTML_DOCUMENT_NODE;
|
||||
}
|
||||
htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
|
||||
if (cur)
|
||||
cur->type = (xmlElementType) type;
|
||||
htmlNodeDumpInternal(buf, cur, (xmlNodePtr) cur, NULL, format);
|
||||
}
|
||||
|
||||
/**
|
||||
* Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is
|
||||
* typically undesired. Use of this function is DISCOURAGED in favor
|
||||
* of htmlDocContentDumpFormatOutput().
|
||||
* Same as htmlDocContentDumpFormatOutput() with `format` set to 1
|
||||
* which is typically undesired. Use of this function is DISCOURAGED
|
||||
* in favor of htmlDocContentDumpFormatOutput().
|
||||
*
|
||||
* Ignores `encoding` and uses the encoding of the output buffer.
|
||||
* @param buf the HTML buffer output
|
||||
* @param cur the document
|
||||
* @param encoding the encoding string (unused)
|
||||
@@ -951,7 +1070,7 @@ htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
||||
void
|
||||
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
||||
const char *encoding ATTRIBUTE_UNUSED) {
|
||||
htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
|
||||
htmlNodeDumpInternal(buf, cur, (xmlNodePtr) cur, NULL, 1);
|
||||
}
|
||||
|
||||
/************************************************************************
|
||||
@@ -963,13 +1082,12 @@ htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
||||
/**
|
||||
* Serialize an HTML document to an open `FILE`.
|
||||
*
|
||||
* WARNING: Uses the encoding from a deprecated meta tag, see
|
||||
* htmlGetMetaEncoding(). This is typically undesired. If no such
|
||||
* tag was found, ASCII with HTML 4.0 named character entities will
|
||||
* Uses the encoding of the document. If the document has no
|
||||
* encoding, ASCII with HTML 4.0 named character entities will
|
||||
* be used. This is inefficient compared to UTF-8 and might be
|
||||
* changed in a future version.
|
||||
*
|
||||
* Also enables "formatting" unconditionally which is typically
|
||||
* Enables "formatting" unconditionally which is typically
|
||||
* undesired.
|
||||
*
|
||||
* Use of this function is DISCOURAGED in favor of
|
||||
@@ -983,7 +1101,6 @@ int
|
||||
htmlDocDump(FILE *f, xmlDocPtr cur) {
|
||||
xmlOutputBufferPtr buf;
|
||||
xmlCharEncodingHandlerPtr handler = NULL;
|
||||
const char *encoding;
|
||||
int ret;
|
||||
|
||||
xmlInitParser();
|
||||
@@ -992,8 +1109,7 @@ htmlDocDump(FILE *f, xmlDocPtr cur) {
|
||||
return(-1);
|
||||
}
|
||||
|
||||
encoding = (const char *) htmlGetMetaEncoding(cur);
|
||||
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
||||
if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
|
||||
return(-1);
|
||||
buf = xmlOutputBufferCreateFile(f, handler);
|
||||
if (buf == NULL)
|
||||
@@ -1005,18 +1121,10 @@ htmlDocDump(FILE *f, xmlDocPtr cur) {
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize an HTML document to a file. If `filename` is `"-"`,
|
||||
* stdout is used. This is potentially insecure and might be
|
||||
* changed in a future version.
|
||||
* Serialize an HTML document to a file.
|
||||
*
|
||||
* WARNING: Uses the encoding from a deprecated meta tag, see
|
||||
* htmlGetMetaEncoding(). This is typically undesired. If no such
|
||||
* tag was found, ASCII with HTML 4.0 named character entities will
|
||||
* be used. This is inefficient compared to UTF-8 and might be
|
||||
* changed in a future version.
|
||||
*
|
||||
* Also enables "formatting" unconditionally which is typically
|
||||
* undesired.
|
||||
* Same as htmlSaveFileFormat() with `encoding` set to NULL and
|
||||
* `format` set to 1 which is typically undesired.
|
||||
*
|
||||
* Use of this function is DISCOURAGED in favor of
|
||||
* htmlSaveFileFormat().
|
||||
@@ -1027,31 +1135,12 @@ htmlDocDump(FILE *f, xmlDocPtr cur) {
|
||||
*/
|
||||
int
|
||||
htmlSaveFile(const char *filename, xmlDocPtr cur) {
|
||||
xmlOutputBufferPtr buf;
|
||||
xmlCharEncodingHandlerPtr handler = NULL;
|
||||
const char *encoding;
|
||||
int ret;
|
||||
|
||||
if ((cur == NULL) || (filename == NULL))
|
||||
return(-1);
|
||||
|
||||
xmlInitParser();
|
||||
|
||||
encoding = (const char *) htmlGetMetaEncoding(cur);
|
||||
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
||||
return(-1);
|
||||
buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
|
||||
if (buf == NULL)
|
||||
return(-1);
|
||||
|
||||
htmlDocContentDumpOutput(buf, cur, NULL);
|
||||
|
||||
ret = xmlOutputBufferClose(buf);
|
||||
return(ret);
|
||||
return(htmlSaveFileFormat(filename, cur, NULL, 1));
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize an HTML document to a file using a given encoding.
|
||||
*
|
||||
* If `filename` is `"-"`, stdout is used. This is potentially
|
||||
* insecure and might be changed in a future version.
|
||||
*
|
||||
@@ -1059,6 +1148,8 @@ htmlSaveFile(const char *filename, xmlDocPtr cur) {
|
||||
* will be used. This is inefficient compared to UTF-8 and might be
|
||||
* changed in a future version.
|
||||
*
|
||||
* Sets or updates meta tags containing the character encoding.
|
||||
*
|
||||
* @param filename the filename
|
||||
* @param cur the document
|
||||
* @param format should formatting newlines be added
|
||||
@@ -1079,15 +1170,11 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
|
||||
|
||||
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
||||
return(-1);
|
||||
if (handler != NULL)
|
||||
htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
|
||||
else
|
||||
htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
|
||||
|
||||
/*
|
||||
* save the content to a temp buffer.
|
||||
*/
|
||||
buf = xmlOutputBufferCreateFilename(filename, handler, 0);
|
||||
buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
|
||||
if (buf == NULL)
|
||||
return(0);
|
||||
|
||||
@@ -1098,6 +1185,8 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize an HTML document to a file.
|
||||
*
|
||||
* Same as htmlSaveFileFormat() with `format` set to 1 which is
|
||||
* typically undesired. Also see the warnings there. Use of this
|
||||
* function is DISCOURAGED in favor of htmlSaveFileFormat().
|
||||
|
@@ -5,9 +5,26 @@
|
||||
|
||||
#ifdef LIBXML_HTML_ENABLED
|
||||
|
||||
/*
 * Evaluates to non-zero when `c` is 0x20, or lies in the range
 * 0x09..0x0D with 0x0B excluded.
 * The argument is expanded several times, so pass an expression
 * without side effects.
 */
#define IS_WS_HTML(c) \
|
||||
(((c) == 0x20) || \
|
||||
(((c) >= 0x09) && ((c) <= 0x0D) && ((c) != 0x0B)))
|
||||
|
||||
/*
 * Record handed to htmlParseContentType() through its `off` argument.
 * NOTE(review): the field meanings below are inferred from the names
 * only - confirm against the implementation.
 */
typedef struct {
|
||||
size_t start; /* presumably where the encoding substring begins */
|
||||
size_t end; /* presumably where the encoding substring ends */
|
||||
size_t size; /* presumably a total length - TODO confirm what is measured */
|
||||
} htmlMetaEncodingOffsets;
|
||||
|
||||
XML_HIDDEN xmlNodePtr
|
||||
htmlCtxtParseContentInternal(xmlParserCtxtPtr ctxt, xmlParserInputPtr input);
|
||||
|
||||
XML_HIDDEN int
|
||||
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off);
|
||||
|
||||
XML_HIDDEN void
|
||||
htmlNodeDumpInternal(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
|
||||
const char *encoding, int format);
|
||||
|
||||
#endif /* LIBXML_HTML_ENABLED */
|
||||
|
||||
#endif /* XML_HTML_H_PRIVATE__ */
|
||||
|
@@ -77,17 +77,14 @@ if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http
|
||||
sys.exit(1)
|
||||
str = doc.serialize("ISO-8859-1")
|
||||
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>
|
||||
<html><head><meta charset="ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>
|
||||
""":
|
||||
print("error serializing HTML document 2")
|
||||
sys.exit(1)
|
||||
str = doc.serialize(format=1)
|
||||
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
||||
<title>Hello</title>
|
||||
</head>
|
||||
<head><title>Hello</title></head>
|
||||
<body><p>hello</p></body>
|
||||
</html>
|
||||
""":
|
||||
@@ -97,13 +94,13 @@ str = doc.serialize("iso-8859-1", 1)
|
||||
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
||||
<meta charset="iso-8859-1">
|
||||
<title>Hello</title>
|
||||
</head>
|
||||
<body><p>hello</p></body>
|
||||
</html>
|
||||
""":
|
||||
print("error serializing HTML document 4")
|
||||
print("error serializing HTML document 4", str)
|
||||
sys.exit(1)
|
||||
|
||||
#
|
||||
@@ -116,15 +113,12 @@ if str != """<html><head><title>Hello</title></head><body><p>hello</p></body></h
|
||||
print("error serializing HTML root 1")
|
||||
sys.exit(1)
|
||||
str = root.serialize("ISO-8859-1")
|
||||
if str != """<html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>""":
|
||||
if str != """<html><head><meta charset="ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>""":
|
||||
print("error serializing HTML root 2")
|
||||
sys.exit(1)
|
||||
str = root.serialize(format=1)
|
||||
if str != """<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
||||
<title>Hello</title>
|
||||
</head>
|
||||
<head><title>Hello</title></head>
|
||||
<body><p>hello</p></body>
|
||||
</html>""":
|
||||
print("error serializing HTML root 3")
|
||||
@@ -132,7 +126,7 @@ if str != """<html>
|
||||
str = root.serialize("iso-8859-1", 1)
|
||||
if str != """<html>
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
||||
<meta charset="iso-8859-1">
|
||||
<title>Hello</title>
|
||||
</head>
|
||||
<body><p>hello</p></body>
|
||||
|
@@ -1,2 +1,2 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html><body>&jÙ</body></html>
|
||||
<html><body>&j<EFBFBD></body></html>
|
||||
|
@@ -1,2 +1,2 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<!--?a“-->
|
||||
<!--?a<EFBFBD>-->
|
||||
|
@@ -1,3 +1,3 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<html><body>&:ê
|
||||
<html><body>&:<EFBFBD>
|
||||
</body></html>
|
||||
|
@@ -1,3 +1,3 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<!--‘<!dOctYPE
|
||||
<!--<EFBFBD><!dOctYPE
|
||||
-->
|
||||
|
@@ -11,7 +11,7 @@
|
||||
<font face="Verdana">
|
||||
<h1><a name="top">Microsoft FrontPage 2000 Server Extensions, UNIX</a></h1>
|
||||
|
||||
<font size="2"><i>© Copyright Microsoft Corporation, 1999 </i></font>
|
||||
<font size="2"><i><EFBFBD> Copyright Microsoft Corporation, 1999<EFBFBD></i></font>
|
||||
|
||||
|
||||
<p>The FrontPage Server Extensions are a set of programs on the Web server that support:
|
||||
@@ -24,11 +24,11 @@
|
||||
</ul>
|
||||
|
||||
|
||||
<h2>Contents </h2>
|
||||
<h2>Contents<EFBFBD></h2>
|
||||
|
||||
<a href="#relnotes">Release Notes</a><br>
|
||||
<a href="#moreinfo">Resources for More Information</a>
|
||||
<p> </p>
|
||||
<p><EFBFBD></p>
|
||||
<hr>
|
||||
<h2><a name="relnotes">Release Notes</a></h2>
|
||||
|
||||
@@ -54,7 +54,7 @@ configuration files (access.conf, srm.conf), add the following lines to http.con
|
||||
</font>
|
||||
<blockquote>
|
||||
<font face="Courier New">
|
||||
ResourceConfig /dev/null <br>
|
||||
ResourceConfig /dev/null<EFBFBD><br>
|
||||
AccessConfig /dev/null</font>
|
||||
</blockquote>
|
||||
<font face="Verdana">
|
||||
@@ -160,7 +160,7 @@ answering inquiries, so you can write your question in your own words. To begin,
|
||||
<p align="right"><font size="1"><a href="#moreinfo">Top of Section</a></font></p>
|
||||
|
||||
|
||||
<p> </p>
|
||||
<p><EFBFBD></p>
|
||||
|
||||
</font>
|
||||
</body>
|
||||
|
@@ -4,6 +4,6 @@
|
||||
<meta charset="iso-8859-1">
|
||||
</head>
|
||||
<body>
|
||||
<p>très</p>
|
||||
<p>tr<EFBFBD>s</p>
|
||||
</body>
|
||||
</html>
|
||||
|
File diff suppressed because one or more lines are too long
@@ -1,4 +1,4 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||
<!--?xml encoding="UTF-8"--><html><body>
|
||||
<p>öäüß</p>
|
||||
<p>öäüß</p>
|
||||
</body></html>
|
||||
|
126
testparser.c
126
testparser.c
@@ -14,6 +14,7 @@
|
||||
#include <libxml/xmlsave.h>
|
||||
#include <libxml/xmlwriter.h>
|
||||
#include <libxml/HTMLparser.h>
|
||||
#include <libxml/HTMLtree.h>
|
||||
|
||||
#include <string.h>
|
||||
|
||||
@@ -611,6 +612,129 @@ testHtmlIds(void) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define MHE "meta http-equiv=\"Content-Type\""
|
||||
|
||||
static int
|
||||
testHtmlInsertMetaEncoding(void) {
|
||||
/* We currently require a head element to be present. */
|
||||
const char *html =
|
||||
"<html>"
|
||||
"<head></head>"
|
||||
"<body>text</body>"
|
||||
"</html>\n";
|
||||
const char *expect =
|
||||
"<html>"
|
||||
"<head><meta charset=\"utf-8\"></head>"
|
||||
"<body>text</body>"
|
||||
"</html>\n";
|
||||
htmlDocPtr doc;
|
||||
xmlBufferPtr buf;
|
||||
xmlSaveCtxtPtr save;
|
||||
xmlChar *out;
|
||||
int size, err = 0;
|
||||
|
||||
|
||||
doc = htmlReadDoc(BAD_CAST html, NULL, NULL, HTML_PARSE_NODEFDTD);
|
||||
|
||||
/* xmlSave updates meta tags */
|
||||
buf = xmlBufferCreate();
|
||||
save = xmlSaveToBuffer(buf, "utf-8", 0);
|
||||
xmlSaveDoc(save, doc);
|
||||
xmlSaveClose(save);
|
||||
if (!xmlStrEqual(xmlBufferContent(buf), BAD_CAST expect)) {
|
||||
fprintf(stderr, "meta tag insertion failed when serializing\n");
|
||||
err = 1;
|
||||
}
|
||||
xmlBufferFree(buf);
|
||||
|
||||
htmlSetMetaEncoding(doc, BAD_CAST "utf-8");
|
||||
/* htmlDocDumpMemoryFormat doesn't update meta tags */
|
||||
htmlDocDumpMemoryFormat(doc, &out, &size, 0);
|
||||
if (!xmlStrEqual(out, BAD_CAST expect)) {
|
||||
fprintf(stderr, "htmlSetMetaEncoding insertion failed\n");
|
||||
err = 1;
|
||||
}
|
||||
xmlFree(out);
|
||||
|
||||
xmlFreeDoc(doc);
|
||||
return err;
|
||||
}
|
||||
|
||||
static int
|
||||
testHtmlUpdateMetaEncoding(void) {
|
||||
/* We rely on the implementation adjusting all meta tags */
|
||||
const char *html =
|
||||
"<html>\n"
|
||||
" <head>\n"
|
||||
" <meta charset=\"utf-8\">\n"
|
||||
" <meta charset=\" foo \">\n"
|
||||
" <meta charset=\"\">\n"
|
||||
" <" MHE " content=\"text/html; ChArSeT=foo\">\n"
|
||||
" <" MHE " content=\"text/html; charset = \">\n"
|
||||
" <" MHE " content=\"text/html; charset = ' foo '\">\n"
|
||||
" <" MHE " content=\"text/html; charset = ' foo \">\n"
|
||||
" <" MHE " content='text/html; charset = \" foo \"'>\n"
|
||||
" <" MHE " content='text/html; charset = \" foo '>\n"
|
||||
" <" MHE " content=\"charset ; charset = bar; baz\">\n"
|
||||
" <" MHE " content=\"text/html\">\n"
|
||||
" <" MHE " content=\"\">\n"
|
||||
" <" MHE ">\n"
|
||||
" </head>\n"
|
||||
" <body></body>\n"
|
||||
"</html>\n";
|
||||
const char *expect =
|
||||
"<html>\n"
|
||||
" <head>\n"
|
||||
" <meta charset=\"utf-8\">\n"
|
||||
" <meta charset=\" utf-8 \">\n"
|
||||
" <meta charset=\"utf-8\">\n"
|
||||
" <" MHE " content=\"text/html; ChArSeT=utf-8\">\n"
|
||||
" <" MHE " content=\"text/html; charset = \">\n"
|
||||
" <" MHE " content=\"text/html; charset = ' utf-8 '\">\n"
|
||||
" <" MHE " content=\"text/html; charset = ' foo \">\n"
|
||||
" <" MHE " content='text/html; charset = \" utf-8 \"'>\n"
|
||||
" <" MHE " content='text/html; charset = \" foo '>\n"
|
||||
" <" MHE " content=\"charset ; charset = utf-8; baz\">\n"
|
||||
" <" MHE " content=\"text/html\">\n"
|
||||
" <" MHE " content=\"\">\n"
|
||||
" <" MHE ">\n"
|
||||
" </head>\n"
|
||||
" <body></body>\n"
|
||||
"</html>\n";
|
||||
htmlDocPtr doc;
|
||||
xmlBufferPtr buf;
|
||||
xmlSaveCtxtPtr save;
|
||||
xmlChar *out;
|
||||
int size, err = 0;
|
||||
|
||||
doc = htmlReadDoc(BAD_CAST html, NULL, NULL, HTML_PARSE_NODEFDTD);
|
||||
|
||||
/* xmlSave updates meta tags */
|
||||
buf = xmlBufferCreate();
|
||||
save = xmlSaveToBuffer(buf, NULL, 0);
|
||||
xmlSaveDoc(save, doc);
|
||||
xmlSaveClose(save);
|
||||
if (!xmlStrEqual(xmlBufferContent(buf), BAD_CAST expect)) {
|
||||
fprintf(stderr, "meta tag update failed when serializing\n");
|
||||
err = 1;
|
||||
}
|
||||
xmlBufferFree(buf);
|
||||
|
||||
xmlFree((xmlChar *) doc->encoding);
|
||||
doc->encoding = NULL;
|
||||
htmlSetMetaEncoding(doc, BAD_CAST "utf-8");
|
||||
/* htmlDocDumpMemoryFormat doesn't update meta tags */
|
||||
htmlDocDumpMemoryFormat(doc, &out, &size, 0);
|
||||
if (!xmlStrEqual(out, BAD_CAST expect)) {
|
||||
fprintf(stderr, "htmlSetMetaEncoding update failed\n");
|
||||
err = 1;
|
||||
}
|
||||
xmlFree(out);
|
||||
|
||||
xmlFreeDoc(doc);
|
||||
return err;
|
||||
}
|
||||
|
||||
#ifdef LIBXML_PUSH_ENABLED
|
||||
static int
|
||||
testHtmlPushWithEncoding(void) {
|
||||
@@ -1293,6 +1417,8 @@ main(void) {
|
||||
#endif
|
||||
#ifdef LIBXML_HTML_ENABLED
|
||||
err |= testHtmlIds();
|
||||
err |= testHtmlInsertMetaEncoding();
|
||||
err |= testHtmlUpdateMetaEncoding();
|
||||
#ifdef LIBXML_PUSH_ENABLED
|
||||
err |= testHtmlPushWithEncoding();
|
||||
#endif
|
||||
|
37
xmlsave.c
37
xmlsave.c
@@ -25,6 +25,7 @@
|
||||
#include "private/enc.h"
|
||||
#include "private/entities.h"
|
||||
#include "private/error.h"
|
||||
#include "private/html.h"
|
||||
#include "private/io.h"
|
||||
#include "private/save.h"
|
||||
|
||||
@@ -1022,32 +1023,24 @@ xmlAttrDumpOutput(xmlSaveCtxtPtr ctxt, xmlAttrPtr cur) {
|
||||
*/
|
||||
static int
|
||||
htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
|
||||
const xmlChar *encoding;
|
||||
int switched_encoding = 0;
|
||||
int format = 0;
|
||||
xmlDocPtr doc;
|
||||
|
||||
xmlInitParser();
|
||||
|
||||
encoding = ctxt->encoding;
|
||||
doc = cur->doc;
|
||||
if (doc != NULL) {
|
||||
if (encoding == NULL)
|
||||
encoding = doc->encoding;
|
||||
|
||||
/* We probably shouldn't do this unless we're dumping a document. */
|
||||
if (encoding != NULL)
|
||||
htmlSetMetaEncoding(doc, encoding);
|
||||
}
|
||||
|
||||
if (ctxt->encoding == NULL) {
|
||||
if ((encoding == NULL) && (doc != NULL))
|
||||
encoding = htmlGetMetaEncoding(doc);
|
||||
const char *encoding = NULL;
|
||||
|
||||
if (doc != NULL)
|
||||
encoding = (char *) doc->encoding;
|
||||
|
||||
if (encoding == NULL)
|
||||
encoding = BAD_CAST "HTML";
|
||||
encoding = "HTML";
|
||||
|
||||
if (xmlSaveSwitchEncoding(ctxt, (const char*) encoding) < 0)
|
||||
if (xmlSaveSwitchEncoding(ctxt, encoding) < 0)
|
||||
return(-1);
|
||||
switched_encoding = 1;
|
||||
}
|
||||
@@ -1055,7 +1048,7 @@ htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
|
||||
if (ctxt->options & XML_SAVE_FORMAT)
|
||||
format = 1;
|
||||
|
||||
htmlNodeDumpFormatOutput(ctxt->buf, doc, cur, NULL, format);
|
||||
htmlNodeDumpInternal(ctxt->buf, doc, cur, (char *) ctxt->encoding, format);
|
||||
|
||||
if (switched_encoding) {
|
||||
xmlSaveClearEncoding(ctxt);
|
||||
@@ -1361,16 +1354,9 @@ xmlSaveDocInternal(xmlSaveCtxtPtr ctxt, xmlDocPtr cur,
|
||||
#ifdef LIBXML_HTML_ENABLED
|
||||
int format = 0;
|
||||
|
||||
if (encoding != NULL)
|
||||
htmlSetMetaEncoding(cur, encoding);
|
||||
|
||||
if (ctxt->encoding == NULL) {
|
||||
if (encoding == NULL) {
|
||||
encoding = htmlGetMetaEncoding(cur);
|
||||
|
||||
if (encoding == NULL)
|
||||
encoding = BAD_CAST "HTML";
|
||||
}
|
||||
if (encoding == NULL)
|
||||
encoding = BAD_CAST "HTML";
|
||||
|
||||
if (xmlSaveSwitchEncoding(ctxt, (const char*) encoding) < 0) {
|
||||
return(-1);
|
||||
@@ -1380,7 +1366,8 @@ xmlSaveDocInternal(xmlSaveCtxtPtr ctxt, xmlDocPtr cur,
|
||||
|
||||
if (ctxt->options & XML_SAVE_FORMAT)
|
||||
format = 1;
|
||||
htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
|
||||
htmlNodeDumpInternal(buf, cur, (htmlNodePtr) cur,
|
||||
(char *) ctxt->encoding, format);
|
||||
#else
|
||||
return(-1);
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user