mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-21 14:53:44 +03:00
html: Rework meta charset handling
Don't use encoding from meta tags when serializing. Only use the value in `doc->encoding`, matching the XML serializer. This is the actual encoding used when parsing. Stop modifying the input document by setting meta tags before serializing. Meta tags are now injected during serialization. Add full support for <meta charset=""> which is also used when adding meta tags. Align with HTML5 and implement the "algorithm for extracting a character encoding from a meta element". Only modify the encoding substring in Content-Type meta tags. Only switch encoding once when parsing. Fix htmlSaveFileFormat with a NULL encoding not to declare a misleading UTF-8 charset. Fixes #909.
This commit is contained in:
@@ -40,7 +40,7 @@ gcc:c89:
|
|||||||
extends: .test
|
extends: .test
|
||||||
variables:
|
variables:
|
||||||
CONFIG: "--without-python"
|
CONFIG: "--without-python"
|
||||||
CFLAGS: "-O2 -std=c89 -D_XOPEN_SOURCE=600 -Wno-error=unused-function"
|
CFLAGS: "-O2 -std=c89 -D_XOPEN_SOURCE=600 -Wno-error=unused-function -Wno-error=overlength-strings"
|
||||||
|
|
||||||
gcc:minimum:
|
gcc:minimum:
|
||||||
extends: .test
|
extends: .test
|
||||||
|
90
HTMLparser.c
90
HTMLparser.c
@@ -50,10 +50,6 @@
|
|||||||
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
|
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
|
||||||
#define HTML_PARSER_BUFFER_SIZE 100
|
#define HTML_PARSER_BUFFER_SIZE 100
|
||||||
|
|
||||||
#define IS_WS_HTML(c) \
|
|
||||||
(((c) == 0x20) || \
|
|
||||||
(((c) >= 0x09) && ((c) <= 0x0D) && ((c) != 0x0B)))
|
|
||||||
|
|
||||||
#define IS_HEX_DIGIT(c) \
|
#define IS_HEX_DIGIT(c) \
|
||||||
((IS_ASCII_DIGIT(c)) || \
|
((IS_ASCII_DIGIT(c)) || \
|
||||||
((((c) | 0x20) >= 'a') && (((c) | 0x20) <= 'f')))
|
((((c) | 0x20) >= 'a') && (((c) | 0x20) <= 'f')))
|
||||||
@@ -314,17 +310,15 @@ htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
|
|||||||
#define CUR (*ctxt->input->cur)
|
#define CUR (*ctxt->input->cur)
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* `the` HTML parser context
|
* Prescan to find encoding.
|
||||||
*
|
*
|
||||||
* Ty to find and encoding in the current data available in the input
|
* Try to find an encoding in the current data available in the input
|
||||||
* buffer this is needed to try to switch to the proper encoding when
|
* buffer.
|
||||||
* one face a character error.
|
|
||||||
* That's an heuristic, since it's operating outside of parsing it could
|
|
||||||
* try to use a meta which had been commented out, that's the reason it
|
|
||||||
* should only be used in case of error, not as a default.
|
|
||||||
*
|
*
|
||||||
* @returns an encoding string or NULL if not found, the string need to
|
* TODO: Implement HTML5 prescan algorithm.
|
||||||
* be freed
|
*
|
||||||
|
* @param ctxt the HTML parser context
|
||||||
|
* @returns an encoding string or NULL if not found
|
||||||
*/
|
*/
|
||||||
static xmlChar *
|
static xmlChar *
|
||||||
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
|
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
|
||||||
@@ -3606,42 +3600,7 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks an http-equiv attribute from a Meta tag to detect
|
* Handle charset encoding in meta tag.
|
||||||
* the encoding
|
|
||||||
* If a new encoding is detected the parser is switched to decode
|
|
||||||
* it and pass UTF8
|
|
||||||
*
|
|
||||||
* @param ctxt an HTML parser context
|
|
||||||
* @param attvalue the attribute value
|
|
||||||
*/
|
|
||||||
static void
|
|
||||||
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
|
|
||||||
const xmlChar *encoding;
|
|
||||||
xmlChar *copy;
|
|
||||||
|
|
||||||
if (!attvalue)
|
|
||||||
return;
|
|
||||||
|
|
||||||
encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
|
|
||||||
if (encoding != NULL) {
|
|
||||||
encoding += 7;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* skip blank
|
|
||||||
*/
|
|
||||||
if (encoding && IS_WS_HTML(*encoding))
|
|
||||||
encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
|
|
||||||
if (encoding && *encoding == '=') {
|
|
||||||
encoding ++;
|
|
||||||
copy = xmlStrdup(encoding);
|
|
||||||
if (copy == NULL)
|
|
||||||
htmlErrMemory(ctxt);
|
|
||||||
xmlSetDeclaredEncoding(ctxt, copy);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Checks an attributes from a Meta tag
|
|
||||||
*
|
*
|
||||||
* @param ctxt an HTML parser context
|
* @param ctxt an HTML parser context
|
||||||
* @param atts the attributes values
|
* @param atts the attributes values
|
||||||
@@ -3650,7 +3609,7 @@ static void
|
|||||||
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
|
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
|
||||||
int i;
|
int i;
|
||||||
const xmlChar *att, *value;
|
const xmlChar *att, *value;
|
||||||
int http = 0;
|
int isContentType = 0;
|
||||||
const xmlChar *content = NULL;
|
const xmlChar *content = NULL;
|
||||||
|
|
||||||
if ((ctxt == NULL) || (atts == NULL))
|
if ((ctxt == NULL) || (atts == NULL))
|
||||||
@@ -3663,23 +3622,33 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
|
|||||||
if (value != NULL) {
|
if (value != NULL) {
|
||||||
if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
|
if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
|
||||||
(!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
|
(!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
|
||||||
http = 1;
|
isContentType = 1;
|
||||||
} else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
|
} else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
|
||||||
xmlChar *copy;
|
xmlChar *encoding;
|
||||||
|
|
||||||
copy = xmlStrdup(value);
|
encoding = xmlStrdup(value);
|
||||||
if (copy == NULL)
|
if (encoding == NULL)
|
||||||
htmlErrMemory(ctxt);
|
htmlErrMemory(ctxt);
|
||||||
xmlSetDeclaredEncoding(ctxt, copy);
|
xmlSetDeclaredEncoding(ctxt, encoding);
|
||||||
} else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
|
} else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
|
||||||
content = value;
|
content = value;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
att = atts[i++];
|
att = atts[i++];
|
||||||
}
|
}
|
||||||
if ((http) && (content != NULL))
|
|
||||||
htmlCheckEncoding(ctxt, content);
|
|
||||||
|
|
||||||
|
if ((isContentType) && (content != NULL)) {
|
||||||
|
htmlMetaEncodingOffsets off;
|
||||||
|
|
||||||
|
if (htmlParseContentType(content, &off)) {
|
||||||
|
xmlChar *encoding;
|
||||||
|
|
||||||
|
encoding = xmlStrndup(content + off.start, off.end - off.start);
|
||||||
|
if (encoding == NULL)
|
||||||
|
htmlErrMemory(ctxt);
|
||||||
|
xmlSetDeclaredEncoding(ctxt, encoding);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -3748,7 +3717,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
|||||||
const xmlChar **atts;
|
const xmlChar **atts;
|
||||||
int nbatts = 0;
|
int nbatts = 0;
|
||||||
int maxatts;
|
int maxatts;
|
||||||
int meta = 0;
|
|
||||||
int i;
|
int i;
|
||||||
int discardtag = 0;
|
int discardtag = 0;
|
||||||
|
|
||||||
@@ -3763,8 +3731,6 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
|
|||||||
name = htmlParseHTMLName(ctxt, 0).name;
|
name = htmlParseHTMLName(ctxt, 0).name;
|
||||||
if (name == NULL)
|
if (name == NULL)
|
||||||
return;
|
return;
|
||||||
if (xmlStrEqual(name, BAD_CAST"meta"))
|
|
||||||
meta = 1;
|
|
||||||
|
|
||||||
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
|
if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
|
||||||
/*
|
/*
|
||||||
@@ -3960,8 +3926,10 @@ failed:
|
|||||||
/*
|
/*
|
||||||
* Handle specific association to the META tag
|
* Handle specific association to the META tag
|
||||||
*/
|
*/
|
||||||
if (meta)
|
if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
|
||||||
|
(strcmp((char *) name, "meta") == 0)) {
|
||||||
htmlCheckMeta(ctxt, atts);
|
htmlCheckMeta(ctxt, atts);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
715
HTMLtree.c
715
HTMLtree.c
@@ -25,6 +25,7 @@
|
|||||||
|
|
||||||
#include "private/buf.h"
|
#include "private/buf.h"
|
||||||
#include "private/error.h"
|
#include "private/error.h"
|
||||||
|
#include "private/html.h"
|
||||||
#include "private/io.h"
|
#include "private/io.h"
|
||||||
#include "private/save.h"
|
#include "private/save.h"
|
||||||
|
|
||||||
@@ -34,265 +35,315 @@
|
|||||||
* *
|
* *
|
||||||
************************************************************************/
|
************************************************************************/
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
xmlAttrPtr attr; /* charset or content */
|
||||||
|
const xmlChar *attrValue;
|
||||||
|
htmlMetaEncodingOffsets off;
|
||||||
|
} htmlMetaEncoding;
|
||||||
|
|
||||||
|
static htmlNodePtr
|
||||||
|
htmlFindFirstChild(htmlNodePtr parent, const char *name) {
|
||||||
|
htmlNodePtr child;
|
||||||
|
|
||||||
|
for (child = parent->children; child != NULL; child = child->next) {
|
||||||
|
if ((child->type == XML_ELEMENT_NODE) &&
|
||||||
|
(child->ns == NULL) &&
|
||||||
|
(xmlStrcasecmp(child->name, BAD_CAST name) == 0))
|
||||||
|
return(child);
|
||||||
|
}
|
||||||
|
|
||||||
|
return(NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
static htmlNodePtr
|
||||||
|
htmlFindHead(htmlDocPtr doc) {
|
||||||
|
htmlNodePtr html;
|
||||||
|
|
||||||
|
if (doc == NULL)
|
||||||
|
return(NULL);
|
||||||
|
|
||||||
|
html = htmlFindFirstChild((htmlNodePtr) doc, "html");
|
||||||
|
if (html == NULL)
|
||||||
|
return(NULL);
|
||||||
|
|
||||||
|
return(htmlFindFirstChild(html, "head"));
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
|
||||||
|
const xmlChar *p = val;
|
||||||
|
|
||||||
|
while (1) {
|
||||||
|
size_t start, end;
|
||||||
|
|
||||||
|
while ((*p != 'c') && (*p != 'C')) {
|
||||||
|
if (*p == 0)
|
||||||
|
return(0);
|
||||||
|
p += 1;
|
||||||
|
}
|
||||||
|
p += 1;
|
||||||
|
|
||||||
|
if (xmlStrncasecmp(p, BAD_CAST "harset", 6) != 0)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
p += 6;
|
||||||
|
while (IS_WS_HTML(*p)) p += 1;
|
||||||
|
|
||||||
|
if (*p != '=')
|
||||||
|
continue;
|
||||||
|
|
||||||
|
p += 1;
|
||||||
|
while (IS_WS_HTML(*p)) p += 1;
|
||||||
|
|
||||||
|
if (*p == 0)
|
||||||
|
return(0);
|
||||||
|
|
||||||
|
if ((*p == '"') || (*p == '\'')) {
|
||||||
|
int quote = *p;
|
||||||
|
|
||||||
|
p += 1;
|
||||||
|
while (IS_WS_HTML(*p)) p += 1;
|
||||||
|
|
||||||
|
start = p - val;
|
||||||
|
end = start;
|
||||||
|
|
||||||
|
while (*p != quote) {
|
||||||
|
if (*p == 0)
|
||||||
|
return(0);
|
||||||
|
if (!IS_WS_HTML(*p))
|
||||||
|
end = p + 1 - val;
|
||||||
|
p += 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
start = p - val;
|
||||||
|
|
||||||
|
while ((*p != 0) && (*p != ';') && (!IS_WS_HTML(*p)))
|
||||||
|
p += 1;
|
||||||
|
|
||||||
|
end = p - val;
|
||||||
|
}
|
||||||
|
|
||||||
|
off->start = start;
|
||||||
|
off->end = end;
|
||||||
|
off->size = p - val + strlen((char *) p);
|
||||||
|
|
||||||
|
return(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
return(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static xmlAttrPtr
|
||||||
|
htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
|
||||||
|
xmlAttrPtr attr, contentAttr = NULL;
|
||||||
|
int isContentType = 0;
|
||||||
|
|
||||||
|
if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)
|
||||||
|
return(NULL);
|
||||||
|
|
||||||
|
for (attr = elem->properties; attr != NULL; attr = attr->next) {
|
||||||
|
if (attr->ns != NULL)
|
||||||
|
continue;
|
||||||
|
if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) {
|
||||||
|
*outIsContentType = 0;
|
||||||
|
return(attr);
|
||||||
|
}
|
||||||
|
if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0)
|
||||||
|
contentAttr = attr;
|
||||||
|
if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) &&
|
||||||
|
(attr->children != NULL) &&
|
||||||
|
(attr->children->type == XML_TEXT_NODE) &&
|
||||||
|
(attr->children->next == NULL) &&
|
||||||
|
(xmlStrcasecmp(attr->children->content,
|
||||||
|
BAD_CAST "Content-Type") == 0))
|
||||||
|
isContentType = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ((isContentType) && (contentAttr != NULL)) {
|
||||||
|
*outIsContentType = 1;
|
||||||
|
return(contentAttr);
|
||||||
|
}
|
||||||
|
|
||||||
|
return(NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
|
||||||
|
xmlAttrPtr attr;
|
||||||
|
const xmlChar *val = NULL;
|
||||||
|
int isContentType;
|
||||||
|
|
||||||
|
if ((elem->type != XML_ELEMENT_NODE) ||
|
||||||
|
(elem->ns != NULL) ||
|
||||||
|
(xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
|
||||||
|
return(0);
|
||||||
|
|
||||||
|
attr = htmlFindMetaEncodingAttr(elem, &isContentType);
|
||||||
|
if (attr == NULL)
|
||||||
|
return(0);
|
||||||
|
|
||||||
|
if ((attr->children != NULL) &&
|
||||||
|
(attr->children->type == XML_TEXT_NODE) &&
|
||||||
|
(attr->children->next == NULL) &&
|
||||||
|
(attr->children->content != NULL))
|
||||||
|
val = attr->children->content;
|
||||||
|
else
|
||||||
|
val = BAD_CAST "";
|
||||||
|
|
||||||
|
|
||||||
|
if (!isContentType) {
|
||||||
|
size_t size = strlen((char *) val);
|
||||||
|
size_t start = 0;
|
||||||
|
size_t end = size;
|
||||||
|
|
||||||
|
while ((start < size) && (IS_WS_HTML(val[start])))
|
||||||
|
start += 1;
|
||||||
|
|
||||||
|
while ((end > 0) && (IS_WS_HTML(val[end-1])))
|
||||||
|
end -= 1;
|
||||||
|
|
||||||
|
menc->attr = attr;
|
||||||
|
menc->attrValue = val;
|
||||||
|
menc->off.start = start;
|
||||||
|
menc->off.end = end;
|
||||||
|
menc->off.size = size;
|
||||||
|
|
||||||
|
return(1);
|
||||||
|
} else {
|
||||||
|
if (htmlParseContentType(val, &menc->off)) {
|
||||||
|
menc->attr = attr;
|
||||||
|
menc->attrValue = val;
|
||||||
|
|
||||||
|
return(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
static xmlChar *
|
||||||
|
htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
|
||||||
|
xmlChar *newVal, *p;
|
||||||
|
size_t size, oldEncSize, newEncSize;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The pseudo "HTML" encoding only produces ASCII.
|
||||||
|
*/
|
||||||
|
if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
|
||||||
|
encoding = "ASCII";
|
||||||
|
|
||||||
|
oldEncSize = menc->off.end - menc->off.start;
|
||||||
|
newEncSize = strlen((char *) encoding);
|
||||||
|
size = menc->off.size - oldEncSize + newEncSize;
|
||||||
|
newVal = xmlMalloc(size + 1);
|
||||||
|
if (newVal == NULL)
|
||||||
|
return(NULL);
|
||||||
|
|
||||||
|
p = newVal;
|
||||||
|
memcpy(p, menc->attrValue, menc->off.start);
|
||||||
|
p += menc->off.start;
|
||||||
|
memcpy(p, encoding, newEncSize);
|
||||||
|
p += newEncSize;
|
||||||
|
memcpy(p, menc->attrValue + menc->off.end, menc->off.size - menc->off.end);
|
||||||
|
newVal[size] = 0;
|
||||||
|
|
||||||
|
return(newVal);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Look up an encoding declaration in the meta tags.
|
* Look up an encoding declaration in the meta tags.
|
||||||
*
|
*
|
||||||
* Does not support `<meta charset="">` yet. Only supports deprecated
|
* The returned string points into attribute content and can contain
|
||||||
* `<meta http-equiv="Content-Type" content="">`.
|
* trailing garbage. It should be copied before modifying or freeing
|
||||||
*
|
* nodes.
|
||||||
* The returned string points into attribute content. It should be
|
|
||||||
* copied before modifying or freeing nodes.
|
|
||||||
*
|
*
|
||||||
* @param doc the document
|
* @param doc the document
|
||||||
* @returns the encoding or NULL if not found.
|
* @returns the encoding or NULL if not found.
|
||||||
*/
|
*/
|
||||||
const xmlChar *
|
const xmlChar *
|
||||||
htmlGetMetaEncoding(htmlDocPtr doc) {
|
htmlGetMetaEncoding(htmlDocPtr doc) {
|
||||||
htmlNodePtr cur;
|
htmlNodePtr head, node;
|
||||||
const xmlChar *content;
|
|
||||||
const xmlChar *encoding;
|
|
||||||
|
|
||||||
if (doc == NULL)
|
head = htmlFindHead(doc);
|
||||||
|
if (head == NULL)
|
||||||
return(NULL);
|
return(NULL);
|
||||||
cur = doc->children;
|
|
||||||
|
|
||||||
|
for (node = head->children; node != NULL; node = node->next) {
|
||||||
|
htmlMetaEncoding menc;
|
||||||
|
|
||||||
|
if (htmlParseMetaEncoding(node, &menc)) {
|
||||||
/*
|
/*
|
||||||
* Search the html
|
* Returning a `const xmlChar *` only allows returning
|
||||||
|
* a suffix. In http-equiv meta tags, there could be
|
||||||
|
* more data after the charset, although it's probably
|
||||||
|
* rare in practice.
|
||||||
*/
|
*/
|
||||||
while (cur != NULL) {
|
return(menc.attrValue + menc.off.start);
|
||||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
|
||||||
if (xmlStrEqual(cur->name, BAD_CAST"html"))
|
|
||||||
break;
|
|
||||||
if (xmlStrEqual(cur->name, BAD_CAST"head"))
|
|
||||||
goto found_head;
|
|
||||||
if (xmlStrEqual(cur->name, BAD_CAST"meta"))
|
|
||||||
goto found_meta;
|
|
||||||
}
|
}
|
||||||
cur = cur->next;
|
|
||||||
}
|
}
|
||||||
if (cur == NULL)
|
|
||||||
return(NULL);
|
return(NULL);
|
||||||
cur = cur->children;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Search the head
|
|
||||||
*/
|
|
||||||
while (cur != NULL) {
|
|
||||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
|
||||||
if (xmlStrEqual(cur->name, BAD_CAST"head"))
|
|
||||||
break;
|
|
||||||
if (xmlStrEqual(cur->name, BAD_CAST"meta"))
|
|
||||||
goto found_meta;
|
|
||||||
}
|
|
||||||
cur = cur->next;
|
|
||||||
}
|
|
||||||
if (cur == NULL)
|
|
||||||
return(NULL);
|
|
||||||
found_head:
|
|
||||||
cur = cur->children;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Search the meta elements
|
|
||||||
*/
|
|
||||||
found_meta:
|
|
||||||
while (cur != NULL) {
|
|
||||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
|
||||||
if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
|
|
||||||
xmlAttrPtr attr = cur->properties;
|
|
||||||
int http;
|
|
||||||
const xmlChar *value;
|
|
||||||
|
|
||||||
content = NULL;
|
|
||||||
http = 0;
|
|
||||||
while (attr != NULL) {
|
|
||||||
if ((attr->children != NULL) &&
|
|
||||||
(attr->children->type == XML_TEXT_NODE) &&
|
|
||||||
(attr->children->next == NULL)) {
|
|
||||||
value = attr->children->content;
|
|
||||||
if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
|
|
||||||
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
|
|
||||||
http = 1;
|
|
||||||
else if ((value != NULL)
|
|
||||||
&& (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
|
|
||||||
content = value;
|
|
||||||
if ((http != 0) && (content != NULL))
|
|
||||||
goto found_content;
|
|
||||||
}
|
|
||||||
attr = attr->next;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cur = cur->next;
|
|
||||||
}
|
|
||||||
return(NULL);
|
|
||||||
|
|
||||||
found_content:
|
|
||||||
encoding = xmlStrstr(content, BAD_CAST"charset=");
|
|
||||||
if (encoding == NULL)
|
|
||||||
encoding = xmlStrstr(content, BAD_CAST"Charset=");
|
|
||||||
if (encoding == NULL)
|
|
||||||
encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
|
|
||||||
if (encoding != NULL) {
|
|
||||||
encoding += 8;
|
|
||||||
} else {
|
|
||||||
encoding = xmlStrstr(content, BAD_CAST"charset =");
|
|
||||||
if (encoding == NULL)
|
|
||||||
encoding = xmlStrstr(content, BAD_CAST"Charset =");
|
|
||||||
if (encoding == NULL)
|
|
||||||
encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
|
|
||||||
if (encoding != NULL)
|
|
||||||
encoding += 9;
|
|
||||||
}
|
|
||||||
if (encoding != NULL) {
|
|
||||||
while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
|
|
||||||
}
|
|
||||||
return(encoding);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Creates or updates a meta tag with an encoding declaration.
|
* Creates or updates a meta tag with an encoding declaration.
|
||||||
*
|
*
|
||||||
* Does not support `<meta charset="">` yet. Only supports deprecated
|
|
||||||
* `<meta http-equiv="Content-Type" content="">`.
|
|
||||||
*
|
|
||||||
* NOTE: This will not change the document content encoding.
|
* NOTE: This will not change the document content encoding.
|
||||||
*
|
*
|
||||||
* @param doc the document
|
* @param doc the document
|
||||||
* @param encoding the encoding string
|
* @param encoding the encoding string
|
||||||
* @returns 0 in case of success and -1 in case of error
|
* @returns 0 in case of success, 1 if no head element was found or
|
||||||
|
* arguments are invalid and -1 if memory allocation failed.
|
||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
|
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
|
||||||
htmlNodePtr cur, meta = NULL, head = NULL;
|
htmlNodePtr head, meta;
|
||||||
const xmlChar *content = NULL;
|
int found = 0;
|
||||||
char newcontent[100];
|
|
||||||
|
|
||||||
newcontent[0] = 0;
|
if (encoding == NULL)
|
||||||
|
return(1);
|
||||||
|
|
||||||
if (doc == NULL)
|
head = htmlFindHead(doc);
|
||||||
|
if (head == NULL)
|
||||||
|
return(1);
|
||||||
|
|
||||||
|
for (meta = head->children; meta != NULL; meta = meta->next) {
|
||||||
|
htmlMetaEncoding menc;
|
||||||
|
|
||||||
|
if (htmlParseMetaEncoding(meta, &menc)) {
|
||||||
|
xmlChar *newVal;
|
||||||
|
int ret;
|
||||||
|
|
||||||
|
found = 1;
|
||||||
|
|
||||||
|
newVal = htmlUpdateMetaEncoding(&menc, (char *) encoding);
|
||||||
|
if (newVal == NULL)
|
||||||
|
return(-1);
|
||||||
|
xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
|
||||||
|
ret = xmlNodeAddContent((xmlNodePtr) menc.attr, newVal);
|
||||||
|
xmlFree(newVal);
|
||||||
|
|
||||||
|
if (ret < 0)
|
||||||
|
return(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (found)
|
||||||
|
return(0);
|
||||||
|
|
||||||
|
meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
|
||||||
|
if (meta == NULL)
|
||||||
return(-1);
|
return(-1);
|
||||||
|
|
||||||
/* html isn't a real encoding it's just libxml2 way to get entities */
|
if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) {
|
||||||
if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
|
xmlFreeNode(meta);
|
||||||
return(-1);
|
return(-1);
|
||||||
|
|
||||||
if (encoding != NULL) {
|
|
||||||
snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
|
|
||||||
(char *)encoding);
|
|
||||||
newcontent[sizeof(newcontent) - 1] = 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
cur = doc->children;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Search the html
|
|
||||||
*/
|
|
||||||
while (cur != NULL) {
|
|
||||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
|
||||||
if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
|
|
||||||
break;
|
|
||||||
if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
|
|
||||||
goto found_head;
|
|
||||||
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
|
|
||||||
goto found_meta;
|
|
||||||
}
|
|
||||||
cur = cur->next;
|
|
||||||
}
|
|
||||||
if (cur == NULL)
|
|
||||||
return(-1);
|
|
||||||
cur = cur->children;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Search the head
|
|
||||||
*/
|
|
||||||
while (cur != NULL) {
|
|
||||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
|
||||||
if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
|
|
||||||
break;
|
|
||||||
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
|
|
||||||
head = cur->parent;
|
|
||||||
goto found_meta;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cur = cur->next;
|
|
||||||
}
|
|
||||||
if (cur == NULL)
|
|
||||||
return(-1);
|
|
||||||
found_head:
|
|
||||||
head = cur;
|
|
||||||
if (cur->children == NULL)
|
|
||||||
goto create;
|
|
||||||
cur = cur->children;
|
|
||||||
|
|
||||||
found_meta:
|
|
||||||
/*
|
|
||||||
* Search and update all the remaining the meta elements carrying
|
|
||||||
* encoding information
|
|
||||||
*/
|
|
||||||
while (cur != NULL) {
|
|
||||||
if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
|
|
||||||
if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
|
|
||||||
xmlAttrPtr attr = cur->properties;
|
|
||||||
int http;
|
|
||||||
const xmlChar *value;
|
|
||||||
|
|
||||||
content = NULL;
|
|
||||||
http = 0;
|
|
||||||
while (attr != NULL) {
|
|
||||||
if ((attr->children != NULL) &&
|
|
||||||
(attr->children->type == XML_TEXT_NODE) &&
|
|
||||||
(attr->children->next == NULL)) {
|
|
||||||
value = attr->children->content;
|
|
||||||
if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
|
|
||||||
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
|
|
||||||
http = 1;
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if ((value != NULL) &&
|
|
||||||
(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
|
|
||||||
content = value;
|
|
||||||
}
|
|
||||||
if ((http != 0) && (content != NULL))
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
attr = attr->next;
|
|
||||||
}
|
|
||||||
if ((http != 0) && (content != NULL)) {
|
|
||||||
meta = cur;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
cur = cur->next;
|
|
||||||
}
|
|
||||||
create:
|
|
||||||
if (meta == NULL) {
|
|
||||||
if ((encoding != NULL) && (head != NULL)) {
|
|
||||||
/*
|
|
||||||
* Create a new Meta element with the right attributes
|
|
||||||
*/
|
|
||||||
|
|
||||||
meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
|
|
||||||
if (head->children == NULL)
|
if (head->children == NULL)
|
||||||
xmlAddChild(head, meta);
|
xmlAddChild(head, meta);
|
||||||
else
|
else
|
||||||
xmlAddPrevSibling(head->children, meta);
|
xmlAddPrevSibling(head->children, meta);
|
||||||
xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
|
|
||||||
xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
/* remove the meta tag if NULL is passed */
|
|
||||||
if (encoding == NULL) {
|
|
||||||
xmlUnlinkNode(meta);
|
|
||||||
xmlFreeNode(meta);
|
|
||||||
}
|
|
||||||
/* change the document only if there is a real encoding change */
|
|
||||||
else if (xmlStrcasestr(content, encoding) == NULL) {
|
|
||||||
xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
return(0);
|
return(0);
|
||||||
}
|
}
|
||||||
@@ -383,7 +434,7 @@ htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
|
|||||||
outbuf->written = 0;
|
outbuf->written = 0;
|
||||||
|
|
||||||
use = xmlBufUse(buf);
|
use = xmlBufUse(buf);
|
||||||
htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
|
htmlNodeDumpInternal(outbuf, doc, cur, NULL, format);
|
||||||
if (outbuf->error)
|
if (outbuf->error)
|
||||||
ret = (size_t) -1;
|
ret = (size_t) -1;
|
||||||
else
|
else
|
||||||
@@ -455,7 +506,7 @@ htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
|
|||||||
if (buf == NULL)
|
if (buf == NULL)
|
||||||
return(-1);
|
return(-1);
|
||||||
|
|
||||||
htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
|
htmlNodeDumpInternal(buf, doc, cur, NULL, format);
|
||||||
|
|
||||||
ret = xmlOutputBufferClose(buf);
|
ret = xmlOutputBufferClose(buf);
|
||||||
return(ret);
|
return(ret);
|
||||||
@@ -479,14 +530,11 @@ htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
|
|||||||
* Serialize an HTML node to a memory, also returning the size of
|
* Serialize an HTML node to a memory, also returning the size of
|
||||||
* the result. It's up to the caller to free the memory.
|
* the result. It's up to the caller to free the memory.
|
||||||
*
|
*
|
||||||
* WARNING: Uses the encoding from a deprecated meta tag, see
|
* Uses the encoding of the document. If the document has no
|
||||||
* htmlGetMetaEncoding(). This is typically undesired. If no such
|
* encoding, ASCII with HTML 4.0 named character entities will
|
||||||
* tag was found, ASCII with HTML 4.0 named character entities will
|
|
||||||
* be used. This is inefficient compared to UTF-8 and might be
|
* be used. This is inefficient compared to UTF-8 and might be
|
||||||
* changed in a future version.
|
* changed in a future version.
|
||||||
*
|
*
|
||||||
* Use of this function is therefore DISCOURAGED in favor of
|
|
||||||
* htmlDocContentDumpFormatOutput().
|
|
||||||
* @param cur the document
|
* @param cur the document
|
||||||
* @param mem OUT: the memory pointer
|
* @param mem OUT: the memory pointer
|
||||||
* @param size OUT: the memory length
|
* @param size OUT: the memory length
|
||||||
@@ -496,7 +544,6 @@ void
|
|||||||
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
|
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
|
||||||
xmlOutputBufferPtr buf;
|
xmlOutputBufferPtr buf;
|
||||||
xmlCharEncodingHandlerPtr handler = NULL;
|
xmlCharEncodingHandlerPtr handler = NULL;
|
||||||
const char *encoding;
|
|
||||||
|
|
||||||
xmlInitParser();
|
xmlInitParser();
|
||||||
|
|
||||||
@@ -507,8 +554,7 @@ htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
|
|||||||
if (cur == NULL)
|
if (cur == NULL)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
encoding = (const char *) htmlGetMetaEncoding(cur);
|
if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
|
||||||
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
|
||||||
return;
|
return;
|
||||||
buf = xmlAllocOutputBuffer(handler);
|
buf = xmlAllocOutputBuffer(handler);
|
||||||
if (buf == NULL)
|
if (buf == NULL)
|
||||||
@@ -657,18 +703,19 @@ htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
|
|||||||
/**
|
/**
|
||||||
* Serialize an HTML node to an output buffer.
|
* Serialize an HTML node to an output buffer.
|
||||||
*
|
*
|
||||||
* Ignores `encoding` and uses the encoding of the output buffer.
|
* If `encoding` is specified, it is used to create or update meta
|
||||||
|
* tags containing the character encoding.
|
||||||
|
*
|
||||||
* @param buf the HTML buffer output
|
* @param buf the HTML buffer output
|
||||||
* @param doc the document
|
* @param doc the document
|
||||||
* @param cur the current node
|
* @param cur the current node
|
||||||
* @param encoding the encoding string (unused)
|
* @param encoding the encoding string (optional)
|
||||||
* @param format whether formatting newlines should be added
|
* @param format whether formatting newlines should be added
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
htmlNodeDumpInternal(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
|
||||||
xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
|
const char *encoding, int format) {
|
||||||
int format) {
|
xmlNodePtr root, parent, metaHead = NULL;
|
||||||
xmlNodePtr root, parent;
|
|
||||||
xmlAttrPtr attr;
|
xmlAttrPtr attr;
|
||||||
const htmlElemDesc * info;
|
const htmlElemDesc * info;
|
||||||
|
|
||||||
@@ -699,24 +746,61 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
case XML_ELEMENT_NODE:
|
case XML_ELEMENT_NODE: {
|
||||||
|
htmlMetaEncoding menc;
|
||||||
|
int isMeta = 0;
|
||||||
|
int addMeta = 0;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Some users like lxml are known to pass nodes with a corrupted
|
* Some users like lxml are known to pass nodes with a corrupted
|
||||||
* tree structure. Fall back to a recursive call to handle this
|
* tree structure. Fall back to a recursive call to handle this
|
||||||
* case.
|
* case.
|
||||||
*/
|
*/
|
||||||
if ((cur->parent != parent) && (cur->children != NULL)) {
|
if ((cur->parent != parent) && (cur->children != NULL)) {
|
||||||
htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
|
htmlNodeDumpInternal(buf, doc, cur, encoding, format);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Get specific HTML info for that node.
|
* Get specific HTML info for that node.
|
||||||
*/
|
*/
|
||||||
if (cur->ns == NULL)
|
if (cur->ns == NULL) {
|
||||||
info = htmlTagLookup(cur->name);
|
info = htmlTagLookup(cur->name);
|
||||||
else
|
|
||||||
|
if (encoding != NULL) {
|
||||||
|
isMeta = htmlParseMetaEncoding(cur, &menc);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Don't add meta tag for "HTML" encoding.
|
||||||
|
*/
|
||||||
|
if ((xmlStrcasecmp(BAD_CAST encoding,
|
||||||
|
BAD_CAST "HTML") != 0) &&
|
||||||
|
(xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) &&
|
||||||
|
(parent != NULL) &&
|
||||||
|
(parent->ns == NULL) &&
|
||||||
|
(xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) &&
|
||||||
|
(parent->parent != NULL) &&
|
||||||
|
(parent->parent->parent == NULL) &&
|
||||||
|
(metaHead == NULL)) {
|
||||||
|
xmlNodePtr n;
|
||||||
|
|
||||||
|
metaHead = cur;
|
||||||
|
addMeta = 1;
|
||||||
|
|
||||||
|
for (n = cur->children; n != NULL; n = n->next) {
|
||||||
|
int unused;
|
||||||
|
|
||||||
|
if (htmlFindMetaEncodingAttr(n, &unused) != NULL) {
|
||||||
|
metaHead = NULL;
|
||||||
|
addMeta = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
info = NULL;
|
info = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
xmlOutputBufferWriteString(buf, "<");
|
xmlOutputBufferWriteString(buf, "<");
|
||||||
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
|
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
|
||||||
@@ -728,7 +812,23 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
|||||||
xmlNsListDumpOutput(buf, cur->nsDef);
|
xmlNsListDumpOutput(buf, cur->nsDef);
|
||||||
attr = cur->properties;
|
attr = cur->properties;
|
||||||
while (attr != NULL) {
|
while (attr != NULL) {
|
||||||
|
if ((!isMeta) || (attr != menc.attr)) {
|
||||||
htmlAttrDumpOutput(buf, doc, attr);
|
htmlAttrDumpOutput(buf, doc, attr);
|
||||||
|
} else {
|
||||||
|
xmlChar *newVal;
|
||||||
|
|
||||||
|
xmlOutputBufferWriteString(buf, " ");
|
||||||
|
xmlOutputBufferWriteString(buf, (char *) attr->name);
|
||||||
|
|
||||||
|
newVal = htmlUpdateMetaEncoding(&menc, encoding);
|
||||||
|
if (newVal == NULL) {
|
||||||
|
buf->error = XML_ERR_NO_MEMORY;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
xmlOutputBufferWriteString(buf, "=");
|
||||||
|
xmlOutputBufferWriteQuotedString(buf, newVal);
|
||||||
|
xmlFree(newVal);
|
||||||
|
}
|
||||||
attr = attr->next;
|
attr = attr->next;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -739,8 +839,15 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
|||||||
(xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
|
(xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
|
||||||
(xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
|
(xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
|
||||||
xmlOutputBufferWriteString(buf, ">");
|
xmlOutputBufferWriteString(buf, ">");
|
||||||
|
} else {
|
||||||
|
if (addMeta) {
|
||||||
|
xmlOutputBufferWriteString(buf, "><meta charset=\"");
|
||||||
|
/* TODO: Escape */
|
||||||
|
xmlOutputBufferWriteString(buf, encoding);
|
||||||
|
xmlOutputBufferWriteString(buf, "\"></");
|
||||||
} else {
|
} else {
|
||||||
xmlOutputBufferWriteString(buf, "></");
|
xmlOutputBufferWriteString(buf, "></");
|
||||||
|
}
|
||||||
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
|
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
|
||||||
xmlOutputBufferWriteString(buf,
|
xmlOutputBufferWriteString(buf,
|
||||||
(const char *)cur->ns->prefix);
|
(const char *)cur->ns->prefix);
|
||||||
@@ -751,13 +858,25 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
xmlOutputBufferWriteString(buf, ">");
|
xmlOutputBufferWriteString(buf, ">");
|
||||||
if ((format) && (info != NULL) && (!info->isinline) &&
|
if ((format) &&
|
||||||
|
((addMeta) ||
|
||||||
|
((info != NULL) && (!info->isinline) &&
|
||||||
(cur->children->type != HTML_TEXT_NODE) &&
|
(cur->children->type != HTML_TEXT_NODE) &&
|
||||||
(cur->children->type != HTML_ENTITY_REF_NODE) &&
|
(cur->children->type != HTML_ENTITY_REF_NODE) &&
|
||||||
(cur->children != cur->last) &&
|
(cur->children != cur->last) &&
|
||||||
(cur->name != NULL) &&
|
(cur->name != NULL) &&
|
||||||
(cur->name[0] != 'p')) /* p, pre, param */
|
(cur->name[0] != 'p')))) /* p, pre, param */
|
||||||
xmlOutputBufferWriteString(buf, "\n");
|
xmlOutputBufferWriteString(buf, "\n");
|
||||||
|
if (addMeta) {
|
||||||
|
xmlOutputBufferWriteString(buf, "<meta charset=\"");
|
||||||
|
/* TODO: Escape */
|
||||||
|
xmlOutputBufferWriteString(buf, encoding);
|
||||||
|
xmlOutputBufferWriteString(buf, "\">");
|
||||||
|
if ((format) &&
|
||||||
|
(cur->children->type != HTML_TEXT_NODE) &&
|
||||||
|
(cur->children->type != HTML_ENTITY_REF_NODE))
|
||||||
|
xmlOutputBufferWriteString(buf, "\n");
|
||||||
|
}
|
||||||
parent = cur;
|
parent = cur;
|
||||||
cur = cur->children;
|
cur = cur->children;
|
||||||
continue;
|
continue;
|
||||||
@@ -774,6 +893,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
|||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
case XML_ATTRIBUTE_NODE:
|
case XML_ATTRIBUTE_NODE:
|
||||||
htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
|
htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
|
||||||
@@ -862,7 +982,7 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
|||||||
if ((format) && (info != NULL) && (!info->isinline) &&
|
if ((format) && (info != NULL) && (!info->isinline) &&
|
||||||
(cur->last->type != HTML_TEXT_NODE) &&
|
(cur->last->type != HTML_TEXT_NODE) &&
|
||||||
(cur->last->type != HTML_ENTITY_REF_NODE) &&
|
(cur->last->type != HTML_ENTITY_REF_NODE) &&
|
||||||
(cur->children != cur->last) &&
|
((cur->children != cur->last) || (cur == metaHead)) &&
|
||||||
(cur->name != NULL) &&
|
(cur->name != NULL) &&
|
||||||
(cur->name[0] != 'p')) /* p, pre, param */
|
(cur->name[0] != 'p')) /* p, pre, param */
|
||||||
xmlOutputBufferWriteString(buf, "\n");
|
xmlOutputBufferWriteString(buf, "\n");
|
||||||
@@ -884,32 +1004,48 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
|||||||
(parent->name[0] != 'p')) /* p, pre, param */
|
(parent->name[0] != 'p')) /* p, pre, param */
|
||||||
xmlOutputBufferWriteString(buf, "\n");
|
xmlOutputBufferWriteString(buf, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (cur == metaHead)
|
||||||
|
metaHead = NULL;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Serialize an HTML node to an output buffer.
|
||||||
|
*
|
||||||
|
* @param buf the HTML buffer output
|
||||||
|
* @param doc the document
|
||||||
|
* @param cur the current node
|
||||||
|
* @param encoding the encoding string (unused)
|
||||||
|
* @param format whether formatting newlines should be added
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
|
||||||
|
const char *encoding ATTRIBUTE_UNUSED, int format) {
|
||||||
|
htmlNodeDumpInternal(buf, doc, cur, NULL, format);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is
|
* Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is
|
||||||
* typically undesired. Use of this function is DISCOURAGED in favor
|
* typically undesired. Use of this function is DISCOURAGED in favor
|
||||||
* of htmlNodeDumpFormatOutput().
|
* of htmlNodeDumpFormatOutput().
|
||||||
*
|
*
|
||||||
* Ignores `encoding` and uses the encoding of the output buffer.
|
|
||||||
* @param buf the HTML buffer output
|
* @param buf the HTML buffer output
|
||||||
* @param doc the document
|
* @param doc the document
|
||||||
* @param cur the current node
|
* @param cur the current node
|
||||||
* @param encoding the encoding string (unused)
|
* @param encoding the encoding string (unused)
|
||||||
*/
|
*/
|
||||||
void
|
void
|
||||||
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
|
||||||
xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
|
const char *encoding ATTRIBUTE_UNUSED) {
|
||||||
htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
|
htmlNodeDumpInternal(buf, doc, cur, NULL, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Serialize an HTML document to an output buffer.
|
* Serialize an HTML document to an output buffer.
|
||||||
*
|
*
|
||||||
* Ignores `encoding` and uses the encoding of the output buffer.
|
|
||||||
* @param buf the HTML buffer output
|
* @param buf the HTML buffer output
|
||||||
* @param cur the document
|
* @param cur the document
|
||||||
* @param encoding the encoding string (unused)
|
* @param encoding the encoding string (unused)
|
||||||
@@ -919,31 +1055,14 @@ void
|
|||||||
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
||||||
const char *encoding ATTRIBUTE_UNUSED,
|
const char *encoding ATTRIBUTE_UNUSED,
|
||||||
int format) {
|
int format) {
|
||||||
int type = 0;
|
htmlNodeDumpInternal(buf, cur, (xmlNodePtr) cur, NULL, format);
|
||||||
|
|
||||||
/*
|
|
||||||
* This is needed when serializing XML documents as HTML.
|
|
||||||
* xmlEncodeEntitiesReentrant uses the document type to
|
|
||||||
* determine the serialization mode.
|
|
||||||
*
|
|
||||||
* Once we call more low-level functions directly with
|
|
||||||
* HTML flags, this hack can be removed.
|
|
||||||
*/
|
|
||||||
if (cur) {
|
|
||||||
type = cur->type;
|
|
||||||
cur->type = XML_HTML_DOCUMENT_NODE;
|
|
||||||
}
|
|
||||||
htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
|
|
||||||
if (cur)
|
|
||||||
cur->type = (xmlElementType) type;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is
|
* Same as htmlDocContentDumpFormatOutput() with `format` set to 1
|
||||||
* typically undesired. Use of this function is DISCOURAGED in favor
|
* which is typically undesired. Use of this function is DISCOURAGED
|
||||||
* of htmlDocContentDumpFormatOutput().
|
* in favor of htmlDocContentDumpFormatOutput().
|
||||||
*
|
*
|
||||||
* Ignores `encoding` and uses the encoding of the output buffer.
|
|
||||||
* @param buf the HTML buffer output
|
* @param buf the HTML buffer output
|
||||||
* @param cur the document
|
* @param cur the document
|
||||||
* @param encoding the encoding string (unused)
|
* @param encoding the encoding string (unused)
|
||||||
@@ -951,7 +1070,7 @@ htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
|||||||
void
|
void
|
||||||
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
||||||
const char *encoding ATTRIBUTE_UNUSED) {
|
const char *encoding ATTRIBUTE_UNUSED) {
|
||||||
htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
|
htmlNodeDumpInternal(buf, cur, (xmlNodePtr) cur, NULL, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
/************************************************************************
|
/************************************************************************
|
||||||
@@ -963,13 +1082,12 @@ htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
|||||||
/**
|
/**
|
||||||
* Serialize an HTML document to an open `FILE`.
|
* Serialize an HTML document to an open `FILE`.
|
||||||
*
|
*
|
||||||
* WARNING: Uses the encoding from a deprecated meta tag, see
|
* Uses the encoding of the document. If the document has no
|
||||||
* htmlGetMetaEncoding(). This is typically undesired. If no such
|
* encoding, ASCII with HTML 4.0 named character entities will
|
||||||
* tag was found, ASCII with HTML 4.0 named character entities will
|
|
||||||
* be used. This is inefficient compared to UTF-8 and might be
|
* be used. This is inefficient compared to UTF-8 and might be
|
||||||
* changed in a future version.
|
* changed in a future version.
|
||||||
*
|
*
|
||||||
* Also enables "formatting" unconditionally which is typically
|
* Enables "formatting" unconditionally which is typically
|
||||||
* undesired.
|
* undesired.
|
||||||
*
|
*
|
||||||
* Use of this function is DISCOURAGED in favor of
|
* Use of this function is DISCOURAGED in favor of
|
||||||
@@ -983,7 +1101,6 @@ int
|
|||||||
htmlDocDump(FILE *f, xmlDocPtr cur) {
|
htmlDocDump(FILE *f, xmlDocPtr cur) {
|
||||||
xmlOutputBufferPtr buf;
|
xmlOutputBufferPtr buf;
|
||||||
xmlCharEncodingHandlerPtr handler = NULL;
|
xmlCharEncodingHandlerPtr handler = NULL;
|
||||||
const char *encoding;
|
|
||||||
int ret;
|
int ret;
|
||||||
|
|
||||||
xmlInitParser();
|
xmlInitParser();
|
||||||
@@ -992,8 +1109,7 @@ htmlDocDump(FILE *f, xmlDocPtr cur) {
|
|||||||
return(-1);
|
return(-1);
|
||||||
}
|
}
|
||||||
|
|
||||||
encoding = (const char *) htmlGetMetaEncoding(cur);
|
if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
|
||||||
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
|
||||||
return(-1);
|
return(-1);
|
||||||
buf = xmlOutputBufferCreateFile(f, handler);
|
buf = xmlOutputBufferCreateFile(f, handler);
|
||||||
if (buf == NULL)
|
if (buf == NULL)
|
||||||
@@ -1005,18 +1121,10 @@ htmlDocDump(FILE *f, xmlDocPtr cur) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Serialize an HTML document to a file. If `filename` is `"-"`,
|
* Serialize an HTML document to a file.
|
||||||
* stdout is used. This is potentially insecure and might be
|
|
||||||
* changed in a future version.
|
|
||||||
*
|
*
|
||||||
* WARNING: Uses the encoding from a deprecated meta tag, see
|
* Same as htmlSaveFileFormat() with `encoding` set to NULL and
|
||||||
* htmlGetMetaEncoding(). This is typically undesired. If no such
|
* `format` set to 1 which is typically undesired.
|
||||||
* tag was found, ASCII with HTML 4.0 named character entities will
|
|
||||||
* be used. This is inefficient compared to UTF-8 and might be
|
|
||||||
* changed in a future version.
|
|
||||||
*
|
|
||||||
* Also enables "formatting" unconditionally which is typically
|
|
||||||
* undesired.
|
|
||||||
*
|
*
|
||||||
* Use of this function is DISCOURAGED in favor of
|
* Use of this function is DISCOURAGED in favor of
|
||||||
* htmlSaveFileFormat().
|
* htmlSaveFileFormat().
|
||||||
@@ -1027,31 +1135,12 @@ htmlDocDump(FILE *f, xmlDocPtr cur) {
|
|||||||
*/
|
*/
|
||||||
int
|
int
|
||||||
htmlSaveFile(const char *filename, xmlDocPtr cur) {
|
htmlSaveFile(const char *filename, xmlDocPtr cur) {
|
||||||
xmlOutputBufferPtr buf;
|
return(htmlSaveFileFormat(filename, cur, NULL, 1));
|
||||||
xmlCharEncodingHandlerPtr handler = NULL;
|
|
||||||
const char *encoding;
|
|
||||||
int ret;
|
|
||||||
|
|
||||||
if ((cur == NULL) || (filename == NULL))
|
|
||||||
return(-1);
|
|
||||||
|
|
||||||
xmlInitParser();
|
|
||||||
|
|
||||||
encoding = (const char *) htmlGetMetaEncoding(cur);
|
|
||||||
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
|
||||||
return(-1);
|
|
||||||
buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
|
|
||||||
if (buf == NULL)
|
|
||||||
return(-1);
|
|
||||||
|
|
||||||
htmlDocContentDumpOutput(buf, cur, NULL);
|
|
||||||
|
|
||||||
ret = xmlOutputBufferClose(buf);
|
|
||||||
return(ret);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Serialize an HTML document to a file using a given encoding.
|
* Serialize an HTML document to a file using a given encoding.
|
||||||
|
*
|
||||||
* If `filename` is `"-"`, stdout is used. This is potentially
|
* If `filename` is `"-"`, stdout is used. This is potentially
|
||||||
* insecure and might be changed in a future version.
|
* insecure and might be changed in a future version.
|
||||||
*
|
*
|
||||||
@@ -1059,6 +1148,8 @@ htmlSaveFile(const char *filename, xmlDocPtr cur) {
|
|||||||
* will be used. This is inefficient compared to UTF-8 and might be
|
* will be used. This is inefficient compared to UTF-8 and might be
|
||||||
* changed in a future version.
|
* changed in a future version.
|
||||||
*
|
*
|
||||||
|
* Sets or updates meta tags containing the character encoding.
|
||||||
|
*
|
||||||
* @param filename the filename
|
* @param filename the filename
|
||||||
* @param cur the document
|
* @param cur the document
|
||||||
* @param format whether formatting newlines should be added
|
* @param format whether formatting newlines should be added
|
||||||
@@ -1079,15 +1170,11 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
|
|||||||
|
|
||||||
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
||||||
return(-1);
|
return(-1);
|
||||||
if (handler != NULL)
|
|
||||||
htmlSetMetaEncoding(cur, (const xmlChar *) handler->name);
|
|
||||||
else
|
|
||||||
htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* save the content to a temp buffer.
|
* save the content to a temp buffer.
|
||||||
*/
|
*/
|
||||||
buf = xmlOutputBufferCreateFilename(filename, handler, 0);
|
buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
|
||||||
if (buf == NULL)
|
if (buf == NULL)
|
||||||
return(0);
|
return(0);
|
||||||
|
|
||||||
@@ -1098,6 +1185,8 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Serialize an HTML document to a file.
|
||||||
|
*
|
||||||
* Same as htmlSaveFileFormat() with `format` set to 1 which is
|
* Same as htmlSaveFileFormat() with `format` set to 1 which is
|
||||||
* typically undesired. Also see the warnings there. Use of this
|
* typically undesired. Also see the warnings there. Use of this
|
||||||
* function is DISCOURAGED in favor of htmlSaveFileFormat().
|
* function is DISCOURAGED in favor of htmlSaveFileFormat().
|
||||||
|
@@ -5,9 +5,26 @@
|
|||||||
|
|
||||||
#ifdef LIBXML_HTML_ENABLED
|
#ifdef LIBXML_HTML_ENABLED
|
||||||
|
|
||||||
|
#define IS_WS_HTML(c) \
|
||||||
|
(((c) == 0x20) || \
|
||||||
|
(((c) >= 0x09) && ((c) <= 0x0D) && ((c) != 0x0B)))
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
size_t start;
|
||||||
|
size_t end;
|
||||||
|
size_t size;
|
||||||
|
} htmlMetaEncodingOffsets;
|
||||||
|
|
||||||
XML_HIDDEN xmlNodePtr
|
XML_HIDDEN xmlNodePtr
|
||||||
htmlCtxtParseContentInternal(xmlParserCtxtPtr ctxt, xmlParserInputPtr input);
|
htmlCtxtParseContentInternal(xmlParserCtxtPtr ctxt, xmlParserInputPtr input);
|
||||||
|
|
||||||
|
XML_HIDDEN int
|
||||||
|
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off);
|
||||||
|
|
||||||
|
XML_HIDDEN void
|
||||||
|
htmlNodeDumpInternal(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
|
||||||
|
const char *encoding, int format);
|
||||||
|
|
||||||
#endif /* LIBXML_HTML_ENABLED */
|
#endif /* LIBXML_HTML_ENABLED */
|
||||||
|
|
||||||
#endif /* XML_HTML_H_PRIVATE__ */
|
#endif /* XML_HTML_H_PRIVATE__ */
|
||||||
|
@@ -77,17 +77,14 @@ if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http
|
|||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
str = doc.serialize("ISO-8859-1")
|
str = doc.serialize("ISO-8859-1")
|
||||||
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||||
<html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>
|
<html><head><meta charset="ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>
|
||||||
""":
|
""":
|
||||||
print("error serializing HTML document 2")
|
print("error serializing HTML document 2")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
str = doc.serialize(format=1)
|
str = doc.serialize(format=1)
|
||||||
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||||
<html>
|
<html>
|
||||||
<head>
|
<head><title>Hello</title></head>
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
|
||||||
<title>Hello</title>
|
|
||||||
</head>
|
|
||||||
<body><p>hello</p></body>
|
<body><p>hello</p></body>
|
||||||
</html>
|
</html>
|
||||||
""":
|
""":
|
||||||
@@ -97,13 +94,13 @@ str = doc.serialize("iso-8859-1", 1)
|
|||||||
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
if str != """<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||||
<html>
|
<html>
|
||||||
<head>
|
<head>
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
<meta charset="iso-8859-1">
|
||||||
<title>Hello</title>
|
<title>Hello</title>
|
||||||
</head>
|
</head>
|
||||||
<body><p>hello</p></body>
|
<body><p>hello</p></body>
|
||||||
</html>
|
</html>
|
||||||
""":
|
""":
|
||||||
print("error serializing HTML document 4")
|
print("error serializing HTML document 4", str)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
#
|
#
|
||||||
@@ -116,15 +113,12 @@ if str != """<html><head><title>Hello</title></head><body><p>hello</p></body></h
|
|||||||
print("error serializing HTML root 1")
|
print("error serializing HTML root 1")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
str = root.serialize("ISO-8859-1")
|
str = root.serialize("ISO-8859-1")
|
||||||
if str != """<html><head><meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>""":
|
if str != """<html><head><meta charset="ISO-8859-1"><title>Hello</title></head><body><p>hello</p></body></html>""":
|
||||||
print("error serializing HTML root 2")
|
print("error serializing HTML root 2")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
str = root.serialize(format=1)
|
str = root.serialize(format=1)
|
||||||
if str != """<html>
|
if str != """<html>
|
||||||
<head>
|
<head><title>Hello</title></head>
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
|
||||||
<title>Hello</title>
|
|
||||||
</head>
|
|
||||||
<body><p>hello</p></body>
|
<body><p>hello</p></body>
|
||||||
</html>""":
|
</html>""":
|
||||||
print("error serializing HTML root 3")
|
print("error serializing HTML root 3")
|
||||||
@@ -132,7 +126,7 @@ if str != """<html>
|
|||||||
str = root.serialize("iso-8859-1", 1)
|
str = root.serialize("iso-8859-1", 1)
|
||||||
if str != """<html>
|
if str != """<html>
|
||||||
<head>
|
<head>
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
<meta charset="iso-8859-1">
|
||||||
<title>Hello</title>
|
<title>Hello</title>
|
||||||
</head>
|
</head>
|
||||||
<body><p>hello</p></body>
|
<body><p>hello</p></body>
|
||||||
|
@@ -1,2 +1,2 @@
|
|||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||||
<html><body>&jÙ</body></html>
|
<html><body>&j<EFBFBD></body></html>
|
||||||
|
@@ -1,2 +1,2 @@
|
|||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||||
<!--?a“-->
|
<!--?a<EFBFBD>-->
|
||||||
|
@@ -1,3 +1,3 @@
|
|||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||||
<html><body>&:ê
|
<html><body>&:<EFBFBD>
|
||||||
</body></html>
|
</body></html>
|
||||||
|
@@ -1,3 +1,3 @@
|
|||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||||
<!--‘<!dOctYPE
|
<!--<EFBFBD><!dOctYPE
|
||||||
-->
|
-->
|
||||||
|
@@ -11,7 +11,7 @@
|
|||||||
<font face="Verdana">
|
<font face="Verdana">
|
||||||
<h1><a name="top">Microsoft FrontPage 2000 Server Extensions, UNIX</a></h1>
|
<h1><a name="top">Microsoft FrontPage 2000 Server Extensions, UNIX</a></h1>
|
||||||
|
|
||||||
<font size="2"><i>© Copyright Microsoft Corporation, 1999 </i></font>
|
<font size="2"><i><EFBFBD> Copyright Microsoft Corporation, 1999<EFBFBD></i></font>
|
||||||
|
|
||||||
|
|
||||||
<p>The FrontPage Server Extensions are a set of programs on the Web server that support:
|
<p>The FrontPage Server Extensions are a set of programs on the Web server that support:
|
||||||
@@ -24,11 +24,11 @@
|
|||||||
</ul>
|
</ul>
|
||||||
|
|
||||||
|
|
||||||
<h2>Contents </h2>
|
<h2>Contents<EFBFBD></h2>
|
||||||
|
|
||||||
<a href="#relnotes">Release Notes</a><br>
|
<a href="#relnotes">Release Notes</a><br>
|
||||||
<a href="#moreinfo">Resources for More Information</a>
|
<a href="#moreinfo">Resources for More Information</a>
|
||||||
<p> </p>
|
<p><EFBFBD></p>
|
||||||
<hr>
|
<hr>
|
||||||
<h2><a name="relnotes">Release Notes</a></h2>
|
<h2><a name="relnotes">Release Notes</a></h2>
|
||||||
|
|
||||||
@@ -54,7 +54,7 @@ configuration files (access.conf, srm.conf), add the following lines to http.con
|
|||||||
</font>
|
</font>
|
||||||
<blockquote>
|
<blockquote>
|
||||||
<font face="Courier New">
|
<font face="Courier New">
|
||||||
ResourceConfig /dev/null <br>
|
ResourceConfig /dev/null<EFBFBD><br>
|
||||||
AccessConfig /dev/null</font>
|
AccessConfig /dev/null</font>
|
||||||
</blockquote>
|
</blockquote>
|
||||||
<font face="Verdana">
|
<font face="Verdana">
|
||||||
@@ -160,7 +160,7 @@ answering inquiries, so you can write your question in your own words. To begin,
|
|||||||
<p align="right"><font size="1"><a href="#moreinfo">Top of Section</a></font></p>
|
<p align="right"><font size="1"><a href="#moreinfo">Top of Section</a></font></p>
|
||||||
|
|
||||||
|
|
||||||
<p> </p>
|
<p><EFBFBD></p>
|
||||||
|
|
||||||
</font>
|
</font>
|
||||||
</body>
|
</body>
|
||||||
|
@@ -4,6 +4,6 @@
|
|||||||
<meta charset="iso-8859-1">
|
<meta charset="iso-8859-1">
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<p>très</p>
|
<p>tr<EFBFBD>s</p>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
File diff suppressed because one or more lines are too long
@@ -1,4 +1,4 @@
|
|||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
|
||||||
<!--?xml encoding="UTF-8"--><html><body>
|
<!--?xml encoding="UTF-8"--><html><body>
|
||||||
<p>öäüß</p>
|
<p>öäüß</p>
|
||||||
</body></html>
|
</body></html>
|
||||||
|
126
testparser.c
126
testparser.c
@@ -14,6 +14,7 @@
|
|||||||
#include <libxml/xmlsave.h>
|
#include <libxml/xmlsave.h>
|
||||||
#include <libxml/xmlwriter.h>
|
#include <libxml/xmlwriter.h>
|
||||||
#include <libxml/HTMLparser.h>
|
#include <libxml/HTMLparser.h>
|
||||||
|
#include <libxml/HTMLtree.h>
|
||||||
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
|
||||||
@@ -611,6 +612,129 @@ testHtmlIds(void) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define MHE "meta http-equiv=\"Content-Type\""
|
||||||
|
|
||||||
|
static int
|
||||||
|
testHtmlInsertMetaEncoding(void) {
|
||||||
|
/* We currently require a head element to be present. */
|
||||||
|
const char *html =
|
||||||
|
"<html>"
|
||||||
|
"<head></head>"
|
||||||
|
"<body>text</body>"
|
||||||
|
"</html>\n";
|
||||||
|
const char *expect =
|
||||||
|
"<html>"
|
||||||
|
"<head><meta charset=\"utf-8\"></head>"
|
||||||
|
"<body>text</body>"
|
||||||
|
"</html>\n";
|
||||||
|
htmlDocPtr doc;
|
||||||
|
xmlBufferPtr buf;
|
||||||
|
xmlSaveCtxtPtr save;
|
||||||
|
xmlChar *out;
|
||||||
|
int size, err = 0;
|
||||||
|
|
||||||
|
|
||||||
|
doc = htmlReadDoc(BAD_CAST html, NULL, NULL, HTML_PARSE_NODEFDTD);
|
||||||
|
|
||||||
|
/* xmlSave updates meta tags */
|
||||||
|
buf = xmlBufferCreate();
|
||||||
|
save = xmlSaveToBuffer(buf, "utf-8", 0);
|
||||||
|
xmlSaveDoc(save, doc);
|
||||||
|
xmlSaveClose(save);
|
||||||
|
if (!xmlStrEqual(xmlBufferContent(buf), BAD_CAST expect)) {
|
||||||
|
fprintf(stderr, "meta tag insertion failed when serializing\n");
|
||||||
|
err = 1;
|
||||||
|
}
|
||||||
|
xmlBufferFree(buf);
|
||||||
|
|
||||||
|
htmlSetMetaEncoding(doc, BAD_CAST "utf-8");
|
||||||
|
/* htmlDocDumpMemoryFormat doesn't update meta tags */
|
||||||
|
htmlDocDumpMemoryFormat(doc, &out, &size, 0);
|
||||||
|
if (!xmlStrEqual(out, BAD_CAST expect)) {
|
||||||
|
fprintf(stderr, "htmlSetMetaEncoding insertion failed\n");
|
||||||
|
err = 1;
|
||||||
|
}
|
||||||
|
xmlFree(out);
|
||||||
|
|
||||||
|
xmlFreeDoc(doc);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
testHtmlUpdateMetaEncoding(void) {
|
||||||
|
/* We rely on the implementation adjusting all meta tags */
|
||||||
|
const char *html =
|
||||||
|
"<html>\n"
|
||||||
|
" <head>\n"
|
||||||
|
" <meta charset=\"utf-8\">\n"
|
||||||
|
" <meta charset=\" foo \">\n"
|
||||||
|
" <meta charset=\"\">\n"
|
||||||
|
" <" MHE " content=\"text/html; ChArSeT=foo\">\n"
|
||||||
|
" <" MHE " content=\"text/html; charset = \">\n"
|
||||||
|
" <" MHE " content=\"text/html; charset = ' foo '\">\n"
|
||||||
|
" <" MHE " content=\"text/html; charset = ' foo \">\n"
|
||||||
|
" <" MHE " content='text/html; charset = \" foo \"'>\n"
|
||||||
|
" <" MHE " content='text/html; charset = \" foo '>\n"
|
||||||
|
" <" MHE " content=\"charset ; charset = bar; baz\">\n"
|
||||||
|
" <" MHE " content=\"text/html\">\n"
|
||||||
|
" <" MHE " content=\"\">\n"
|
||||||
|
" <" MHE ">\n"
|
||||||
|
" </head>\n"
|
||||||
|
" <body></body>\n"
|
||||||
|
"</html>\n";
|
||||||
|
const char *expect =
|
||||||
|
"<html>\n"
|
||||||
|
" <head>\n"
|
||||||
|
" <meta charset=\"utf-8\">\n"
|
||||||
|
" <meta charset=\" utf-8 \">\n"
|
||||||
|
" <meta charset=\"utf-8\">\n"
|
||||||
|
" <" MHE " content=\"text/html; ChArSeT=utf-8\">\n"
|
||||||
|
" <" MHE " content=\"text/html; charset = \">\n"
|
||||||
|
" <" MHE " content=\"text/html; charset = ' utf-8 '\">\n"
|
||||||
|
" <" MHE " content=\"text/html; charset = ' foo \">\n"
|
||||||
|
" <" MHE " content='text/html; charset = \" utf-8 \"'>\n"
|
||||||
|
" <" MHE " content='text/html; charset = \" foo '>\n"
|
||||||
|
" <" MHE " content=\"charset ; charset = utf-8; baz\">\n"
|
||||||
|
" <" MHE " content=\"text/html\">\n"
|
||||||
|
" <" MHE " content=\"\">\n"
|
||||||
|
" <" MHE ">\n"
|
||||||
|
" </head>\n"
|
||||||
|
" <body></body>\n"
|
||||||
|
"</html>\n";
|
||||||
|
htmlDocPtr doc;
|
||||||
|
xmlBufferPtr buf;
|
||||||
|
xmlSaveCtxtPtr save;
|
||||||
|
xmlChar *out;
|
||||||
|
int size, err = 0;
|
||||||
|
|
||||||
|
doc = htmlReadDoc(BAD_CAST html, NULL, NULL, HTML_PARSE_NODEFDTD);
|
||||||
|
|
||||||
|
/* xmlSave updates meta tags */
|
||||||
|
buf = xmlBufferCreate();
|
||||||
|
save = xmlSaveToBuffer(buf, NULL, 0);
|
||||||
|
xmlSaveDoc(save, doc);
|
||||||
|
xmlSaveClose(save);
|
||||||
|
if (!xmlStrEqual(xmlBufferContent(buf), BAD_CAST expect)) {
|
||||||
|
fprintf(stderr, "meta tag update failed when serializing\n");
|
||||||
|
err = 1;
|
||||||
|
}
|
||||||
|
xmlBufferFree(buf);
|
||||||
|
|
||||||
|
xmlFree((xmlChar *) doc->encoding);
|
||||||
|
doc->encoding = NULL;
|
||||||
|
htmlSetMetaEncoding(doc, BAD_CAST "utf-8");
|
||||||
|
/* htmlDocDumpMemoryFormat doesn't update meta tags */
|
||||||
|
htmlDocDumpMemoryFormat(doc, &out, &size, 0);
|
||||||
|
if (!xmlStrEqual(out, BAD_CAST expect)) {
|
||||||
|
fprintf(stderr, "htmlSetMetaEncoding update failed\n");
|
||||||
|
err = 1;
|
||||||
|
}
|
||||||
|
xmlFree(out);
|
||||||
|
|
||||||
|
xmlFreeDoc(doc);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef LIBXML_PUSH_ENABLED
|
#ifdef LIBXML_PUSH_ENABLED
|
||||||
static int
|
static int
|
||||||
testHtmlPushWithEncoding(void) {
|
testHtmlPushWithEncoding(void) {
|
||||||
@@ -1293,6 +1417,8 @@ main(void) {
|
|||||||
#endif
|
#endif
|
||||||
#ifdef LIBXML_HTML_ENABLED
|
#ifdef LIBXML_HTML_ENABLED
|
||||||
err |= testHtmlIds();
|
err |= testHtmlIds();
|
||||||
|
err |= testHtmlInsertMetaEncoding();
|
||||||
|
err |= testHtmlUpdateMetaEncoding();
|
||||||
#ifdef LIBXML_PUSH_ENABLED
|
#ifdef LIBXML_PUSH_ENABLED
|
||||||
err |= testHtmlPushWithEncoding();
|
err |= testHtmlPushWithEncoding();
|
||||||
#endif
|
#endif
|
||||||
|
33
xmlsave.c
33
xmlsave.c
@@ -25,6 +25,7 @@
|
|||||||
#include "private/enc.h"
|
#include "private/enc.h"
|
||||||
#include "private/entities.h"
|
#include "private/entities.h"
|
||||||
#include "private/error.h"
|
#include "private/error.h"
|
||||||
|
#include "private/html.h"
|
||||||
#include "private/io.h"
|
#include "private/io.h"
|
||||||
#include "private/save.h"
|
#include "private/save.h"
|
||||||
|
|
||||||
@@ -1022,32 +1023,24 @@ xmlAttrDumpOutput(xmlSaveCtxtPtr ctxt, xmlAttrPtr cur) {
|
|||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
|
htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
|
||||||
const xmlChar *encoding;
|
|
||||||
int switched_encoding = 0;
|
int switched_encoding = 0;
|
||||||
int format = 0;
|
int format = 0;
|
||||||
xmlDocPtr doc;
|
xmlDocPtr doc;
|
||||||
|
|
||||||
xmlInitParser();
|
xmlInitParser();
|
||||||
|
|
||||||
encoding = ctxt->encoding;
|
|
||||||
doc = cur->doc;
|
doc = cur->doc;
|
||||||
if (doc != NULL) {
|
|
||||||
if (encoding == NULL)
|
|
||||||
encoding = doc->encoding;
|
|
||||||
|
|
||||||
/* We probably shouldn't do this unless we're dumping a document. */
|
|
||||||
if (encoding != NULL)
|
|
||||||
htmlSetMetaEncoding(doc, encoding);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (ctxt->encoding == NULL) {
|
if (ctxt->encoding == NULL) {
|
||||||
if ((encoding == NULL) && (doc != NULL))
|
const char *encoding = NULL;
|
||||||
encoding = htmlGetMetaEncoding(doc);
|
|
||||||
|
if (doc != NULL)
|
||||||
|
encoding = (char *) doc->encoding;
|
||||||
|
|
||||||
if (encoding == NULL)
|
if (encoding == NULL)
|
||||||
encoding = BAD_CAST "HTML";
|
encoding = "HTML";
|
||||||
|
|
||||||
if (xmlSaveSwitchEncoding(ctxt, (const char*) encoding) < 0)
|
if (xmlSaveSwitchEncoding(ctxt, encoding) < 0)
|
||||||
return(-1);
|
return(-1);
|
||||||
switched_encoding = 1;
|
switched_encoding = 1;
|
||||||
}
|
}
|
||||||
@@ -1055,7 +1048,7 @@ htmlNodeDumpOutputInternal(xmlSaveCtxtPtr ctxt, xmlNodePtr cur) {
|
|||||||
if (ctxt->options & XML_SAVE_FORMAT)
|
if (ctxt->options & XML_SAVE_FORMAT)
|
||||||
format = 1;
|
format = 1;
|
||||||
|
|
||||||
htmlNodeDumpFormatOutput(ctxt->buf, doc, cur, NULL, format);
|
htmlNodeDumpInternal(ctxt->buf, doc, cur, (char *) ctxt->encoding, format);
|
||||||
|
|
||||||
if (switched_encoding) {
|
if (switched_encoding) {
|
||||||
xmlSaveClearEncoding(ctxt);
|
xmlSaveClearEncoding(ctxt);
|
||||||
@@ -1361,16 +1354,9 @@ xmlSaveDocInternal(xmlSaveCtxtPtr ctxt, xmlDocPtr cur,
|
|||||||
#ifdef LIBXML_HTML_ENABLED
|
#ifdef LIBXML_HTML_ENABLED
|
||||||
int format = 0;
|
int format = 0;
|
||||||
|
|
||||||
if (encoding != NULL)
|
|
||||||
htmlSetMetaEncoding(cur, encoding);
|
|
||||||
|
|
||||||
if (ctxt->encoding == NULL) {
|
if (ctxt->encoding == NULL) {
|
||||||
if (encoding == NULL) {
|
|
||||||
encoding = htmlGetMetaEncoding(cur);
|
|
||||||
|
|
||||||
if (encoding == NULL)
|
if (encoding == NULL)
|
||||||
encoding = BAD_CAST "HTML";
|
encoding = BAD_CAST "HTML";
|
||||||
}
|
|
||||||
|
|
||||||
if (xmlSaveSwitchEncoding(ctxt, (const char*) encoding) < 0) {
|
if (xmlSaveSwitchEncoding(ctxt, (const char*) encoding) < 0) {
|
||||||
return(-1);
|
return(-1);
|
||||||
@@ -1380,7 +1366,8 @@ xmlSaveDocInternal(xmlSaveCtxtPtr ctxt, xmlDocPtr cur,
|
|||||||
|
|
||||||
if (ctxt->options & XML_SAVE_FORMAT)
|
if (ctxt->options & XML_SAVE_FORMAT)
|
||||||
format = 1;
|
format = 1;
|
||||||
htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
|
htmlNodeDumpInternal(buf, cur, (htmlNodePtr) cur,
|
||||||
|
(char *) ctxt->encoding, format);
|
||||||
#else
|
#else
|
||||||
return(-1);
|
return(-1);
|
||||||
#endif
|
#endif
|
||||||
|
Reference in New Issue
Block a user