1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-21 14:53:44 +03:00
Files
libxml2/HTMLtree.c
Nick Wellnhofer 46f05ea4d5 html: Rework meta charset handling
Don't use encoding from meta tags when serializing. Only use the value
in `doc->encoding`, matching the XML serializer. This is the actual
encoding used when parsing.

Stop modifying the input document by setting meta tags before
serializing. Meta tags are now injected during serialization.

Add full support for <meta charset=""> which is also used when adding
meta tags.

Align with HTML5 and implement the "algorithm for extracting a character
encoding from a meta element". Only modify the encoding substring in
Content-Type meta tags.

Only switch encoding once when parsing.

Fix htmlSaveFileFormat with a NULL encoding not to declare a misleading
UTF-8 charset.

Fixes #909.
2025-05-11 20:29:25 +02:00

1207 lines
36 KiB
C

/*
* HTMLtree.c : implementation of access function for an HTML tree.
*
* See Copyright for the status of this software.
*
* Author: Daniel Veillard
*/
#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_HTML_ENABLED
#include <string.h> /* for memset() only ! */
#include <ctype.h>
#include <stdlib.h>
#include <libxml/xmlmemory.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/entities.h>
#include <libxml/xmlerror.h>
#include <libxml/parserInternals.h>
#include <libxml/uri.h>
#include "private/buf.h"
#include "private/error.h"
#include "private/html.h"
#include "private/io.h"
#include "private/save.h"
/************************************************************************
* *
* Getting/Setting encoding meta tags *
* *
************************************************************************/
typedef struct {
xmlAttrPtr attr; /* charset or content */
const xmlChar *attrValue;
htmlMetaEncodingOffsets off;
} htmlMetaEncoding;
static htmlNodePtr
htmlFindFirstChild(htmlNodePtr parent, const char *name) {
htmlNodePtr child;
for (child = parent->children; child != NULL; child = child->next) {
if ((child->type == XML_ELEMENT_NODE) &&
(child->ns == NULL) &&
(xmlStrcasecmp(child->name, BAD_CAST name) == 0))
return(child);
}
return(NULL);
}
static htmlNodePtr
htmlFindHead(htmlDocPtr doc) {
htmlNodePtr html;
if (doc == NULL)
return(NULL);
html = htmlFindFirstChild((htmlNodePtr) doc, "html");
if (html == NULL)
return(NULL);
return(htmlFindFirstChild(html, "head"));
}
int
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
const xmlChar *p = val;
while (1) {
size_t start, end;
while ((*p != 'c') && (*p != 'C')) {
if (*p == 0)
return(0);
p += 1;
}
p += 1;
if (xmlStrncasecmp(p, BAD_CAST "harset", 6) != 0)
continue;
p += 6;
while (IS_WS_HTML(*p)) p += 1;
if (*p != '=')
continue;
p += 1;
while (IS_WS_HTML(*p)) p += 1;
if (*p == 0)
return(0);
if ((*p == '"') || (*p == '\'')) {
int quote = *p;
p += 1;
while (IS_WS_HTML(*p)) p += 1;
start = p - val;
end = start;
while (*p != quote) {
if (*p == 0)
return(0);
if (!IS_WS_HTML(*p))
end = p + 1 - val;
p += 1;
}
} else {
start = p - val;
while ((*p != 0) && (*p != ';') && (!IS_WS_HTML(*p)))
p += 1;
end = p - val;
}
off->start = start;
off->end = end;
off->size = p - val + strlen((char *) p);
return(1);
}
return(0);
}
static xmlAttrPtr
htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
xmlAttrPtr attr, contentAttr = NULL;
int isContentType = 0;
if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)
return(NULL);
for (attr = elem->properties; attr != NULL; attr = attr->next) {
if (attr->ns != NULL)
continue;
if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) {
*outIsContentType = 0;
return(attr);
}
if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0)
contentAttr = attr;
if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) &&
(attr->children != NULL) &&
(attr->children->type == XML_TEXT_NODE) &&
(attr->children->next == NULL) &&
(xmlStrcasecmp(attr->children->content,
BAD_CAST "Content-Type") == 0))
isContentType = 1;
}
if ((isContentType) && (contentAttr != NULL)) {
*outIsContentType = 1;
return(contentAttr);
}
return(NULL);
}
static int
htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
xmlAttrPtr attr;
const xmlChar *val = NULL;
int isContentType;
if ((elem->type != XML_ELEMENT_NODE) ||
(elem->ns != NULL) ||
(xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
return(0);
attr = htmlFindMetaEncodingAttr(elem, &isContentType);
if (attr == NULL)
return(0);
if ((attr->children != NULL) &&
(attr->children->type == XML_TEXT_NODE) &&
(attr->children->next == NULL) &&
(attr->children->content != NULL))
val = attr->children->content;
else
val = BAD_CAST "";
if (!isContentType) {
size_t size = strlen((char *) val);
size_t start = 0;
size_t end = size;
while ((start < size) && (IS_WS_HTML(val[start])))
start += 1;
while ((end > 0) && (IS_WS_HTML(val[end-1])))
end -= 1;
menc->attr = attr;
menc->attrValue = val;
menc->off.start = start;
menc->off.end = end;
menc->off.size = size;
return(1);
} else {
if (htmlParseContentType(val, &menc->off)) {
menc->attr = attr;
menc->attrValue = val;
return(1);
}
}
return(0);
}
static xmlChar *
htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
xmlChar *newVal, *p;
size_t size, oldEncSize, newEncSize;
/*
* The pseudo "HTML" encoding only produces ASCII.
*/
if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
encoding = "ASCII";
oldEncSize = menc->off.end - menc->off.start;
newEncSize = strlen((char *) encoding);
size = menc->off.size - oldEncSize + newEncSize;
newVal = xmlMalloc(size + 1);
if (newVal == NULL)
return(NULL);
p = newVal;
memcpy(p, menc->attrValue, menc->off.start);
p += menc->off.start;
memcpy(p, encoding, newEncSize);
p += newEncSize;
memcpy(p, menc->attrValue + menc->off.end, menc->off.size - menc->off.end);
newVal[size] = 0;
return(newVal);
}
/**
* Look up and encoding declaration in the meta tags.
*
* The returned string points into attribute content and can contain
* trailing garbage. It should be copied before modifying or freeing
* nodes.
*
* @param doc the document
* @returns the encoding ot NULL if not found.
*/
const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc) {
htmlNodePtr head, node;
head = htmlFindHead(doc);
if (head == NULL)
return(NULL);
for (node = head->children; node != NULL; node = node->next) {
htmlMetaEncoding menc;
if (htmlParseMetaEncoding(node, &menc)) {
/*
* Returning a `const xmlChar *` only allows to return
* a suffix. In http-equiv meta tags, there could be
* more data after the charset, although it's probably
* rare in practice.
*/
return(menc.attrValue + menc.off.start);
}
}
return(NULL);
}
/**
* Creates or updates a meta tag with an encoding declaration.
*
* NOTE: This will not change the document content encoding.
*
* @param doc the document
* @param encoding the encoding string
* @returns 0 in case of success, 1 if no head element was found or
* arguments are invalid and -1 if memory allocation failed.
*/
int
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
htmlNodePtr head, meta;
int found = 0;
if (encoding == NULL)
return(1);
head = htmlFindHead(doc);
if (head == NULL)
return(1);
for (meta = head->children; meta != NULL; meta = meta->next) {
htmlMetaEncoding menc;
if (htmlParseMetaEncoding(meta, &menc)) {
xmlChar *newVal;
int ret;
found = 1;
newVal = htmlUpdateMetaEncoding(&menc, (char *) encoding);
if (newVal == NULL)
return(-1);
xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
ret = xmlNodeAddContent((xmlNodePtr) menc.attr, newVal);
xmlFree(newVal);
if (ret < 0)
return(-1);
}
}
if (found)
return(0);
meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
if (meta == NULL)
return(-1);
if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) {
xmlFreeNode(meta);
return(-1);
}
if (head->children == NULL)
xmlAddChild(head, meta);
else
xmlAddPrevSibling(head->children, meta);
return(0);
}
/**
* These are the HTML attributes which will be output
* in minimized form, i.e. `<option selected="selected">` will be
* output as `<option selected>`, as per XSLT 1.0 16.2 "HTML Output Method"
*/
static const char* const htmlBooleanAttrs[] = {
"checked", "compact", "declare", "defer", "disabled", "ismap",
"multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
"selected", NULL
};
/**
* Determine if a given attribute is a boolean attribute. This
* doesn't handle HTML5.
*
* @deprecated Internal function, don't use.
*
* @param name the name of the attribute to check
* @returns false if the attribute is not boolean, true otherwise.
*/
int
htmlIsBooleanAttr(const xmlChar *name)
{
int i = 0;
while (htmlBooleanAttrs[i] != NULL) {
if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
return 1;
i++;
}
return 0;
}
#ifdef LIBXML_OUTPUT_ENABLED
/************************************************************************
* *
* Dumping HTML tree content to a simple buffer *
* *
************************************************************************/
static xmlParserErrors
htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) {
/*
* Fallback to HTML if the encoding is unspecified
*/
if (encoding == NULL)
encoding = "HTML";
return(xmlOpenCharEncodingHandler(encoding, /* output */ 1, out));
}
/**
* Serialize an HTML document to an xmlBuf.
*
* @param buf the xmlBufPtr output
* @param doc the document
* @param cur the current node
* @param format should formatting newlines been added
* @returns the number of bytes written or -1 in case of error
*/
static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
int format) {
size_t use;
size_t ret;
xmlOutputBufferPtr outbuf;
if (cur == NULL) {
return ((size_t) -1);
}
if (buf == NULL) {
return ((size_t) -1);
}
outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
if (outbuf == NULL)
return ((size_t) -1);
memset(outbuf, 0, sizeof(xmlOutputBuffer));
outbuf->buffer = buf;
outbuf->encoder = NULL;
outbuf->writecallback = NULL;
outbuf->closecallback = NULL;
outbuf->context = NULL;
outbuf->written = 0;
use = xmlBufUse(buf);
htmlNodeDumpInternal(outbuf, doc, cur, NULL, format);
if (outbuf->error)
ret = (size_t) -1;
else
ret = xmlBufUse(buf) - use;
xmlFree(outbuf);
return (ret);
}
/**
* Serialize an HTML node to an xmlBuffer. Always uses UTF-8.
*
* @param buf the HTML buffer output
* @param doc the document
* @param cur the current node
* @returns the number of bytes written or -1 in case of error
*/
int
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
xmlBufPtr buffer;
size_t ret1;
int ret2;
if ((buf == NULL) || (cur == NULL))
return(-1);
xmlInitParser();
buffer = xmlBufFromBuffer(buf);
if (buffer == NULL)
return(-1);
ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
ret2 = xmlBufBackToBuffer(buffer, buf);
if ((ret1 == (size_t) -1) || (ret2 < 0))
return(-1);
return(ret1 > INT_MAX ? INT_MAX : ret1);
}
/**
* Serialize an HTML node to an xmlBuffer.
*
* If encoding is NULL, ASCII with HTML 4.0 named character entities
* will be used. This is inefficient compared to UTF-8 and might be
* changed in a future version.
*
* @param out the FILE pointer
* @param doc the document
* @param cur the current node
* @param encoding the document encoding (optional)
* @param format should formatting newlines been added
* @returns the number of bytes written or -1 in case of failure.
*/
int
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
xmlNodePtr cur, const char *encoding, int format) {
xmlOutputBufferPtr buf;
xmlCharEncodingHandlerPtr handler;
int ret;
xmlInitParser();
/*
* save the content to a temp buffer.
*/
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
return(-1);
buf = xmlOutputBufferCreateFile(out, handler);
if (buf == NULL)
return(-1);
htmlNodeDumpInternal(buf, doc, cur, NULL, format);
ret = xmlOutputBufferClose(buf);
return(ret);
}
/**
* Same as htmlNodeDumpFileFormat() with `format` set to 1 which is
* typically undesired. Use of this function is DISCOURAGED in favor
* of htmlNodeDumpFileFormat().
*
* @param out the FILE pointer
* @param doc the document
* @param cur the current node
*/
void
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
}
/**
* Serialize an HTML node to a memory, also returning the size of
* the result. It's up to the caller to free the memory.
*
* Uses the encoding of the document. If the document has no
* encoding, ASCII with HTML 4.0 named character entities will
* be used. This is inefficient compared to UTF-8 and might be
* changed in a future version.
*
* @param cur the document
* @param mem OUT: the memory pointer
* @param size OUT: the memory length
* @param format should formatting newlines been added
*/
void
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
xmlOutputBufferPtr buf;
xmlCharEncodingHandlerPtr handler = NULL;
xmlInitParser();
if ((mem == NULL) || (size == NULL))
return;
*mem = NULL;
*size = 0;
if (cur == NULL)
return;
if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
return;
buf = xmlAllocOutputBuffer(handler);
if (buf == NULL)
return;
htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
xmlOutputBufferFlush(buf);
if (!buf->error) {
if (buf->conv != NULL) {
*size = xmlBufUse(buf->conv);
*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
} else {
*size = xmlBufUse(buf->buffer);
*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
}
}
xmlOutputBufferClose(buf);
}
/**
* Same as htmlDocDumpMemoryFormat() with `format` set to 1 which
* is typically undesired. Also see the warnings there. Use of
* this function is DISCOURAGED in favor of
* htmlDocContentDumpFormatOutput().
*
* @param cur the document
* @param mem OUT: the memory pointer
* @param size OUT: the memory length
*/
void
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
htmlDocDumpMemoryFormat(cur, mem, size, 1);
}
/************************************************************************
* *
* Dumping HTML tree content to an I/O output buffer *
* *
************************************************************************/
/**
* Serialize the HTML document's DTD, if any.
*
* Ignores `encoding` and uses the encoding of the output buffer.
* @param buf the HTML buffer output
* @param doc the document
* @param encoding the encoding string (unused)
*/
static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
const char *encoding ATTRIBUTE_UNUSED) {
xmlDtdPtr cur = doc->intSubset;
if (cur == NULL)
return;
xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
xmlOutputBufferWriteString(buf, (const char *)cur->name);
if (cur->ExternalID != NULL) {
xmlOutputBufferWriteString(buf, " PUBLIC ");
xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
if (cur->SystemID != NULL) {
xmlOutputBufferWriteString(buf, " ");
xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
}
} else if (cur->SystemID != NULL &&
xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
xmlOutputBufferWriteString(buf, " SYSTEM ");
xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
}
xmlOutputBufferWriteString(buf, ">\n");
}
/**
* Serialize an HTML attribute.
*
* @param buf the HTML buffer output
* @param doc the document
* @param cur the attribute pointer
*/
static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
xmlChar *value;
/*
* The html output method should not escape a & character
* occurring in an attribute value immediately followed by
* a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
* This is implemented in xmlEncodeEntitiesReentrant
*/
if (cur == NULL) {
return;
}
xmlOutputBufferWriteString(buf, " ");
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
xmlOutputBufferWriteString(buf, ":");
}
xmlOutputBufferWriteString(buf, (const char *)cur->name);
if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
value = xmlNodeListGetString(doc, cur->children, 0);
if (value) {
xmlOutputBufferWriteString(buf, "=");
if ((cur->ns == NULL) && (cur->parent != NULL) &&
(cur->parent->ns == NULL) &&
((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
(!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
(!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
(!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
xmlChar *escaped;
xmlChar *tmp = value;
while (IS_BLANK_CH(*tmp)) tmp++;
/*
* Angle brackets are technically illegal in URIs, but they're
* used in server side includes, for example. Curly brackets
* are illegal as well and often used in templates.
* Don't escape non-whitespace, printable ASCII chars for
* improved interoperability. Only escape space, control
* and non-ASCII chars.
*/
escaped = xmlURIEscapeStr(tmp,
BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
if (escaped != NULL) {
xmlOutputBufferWriteQuotedString(buf, escaped);
xmlFree(escaped);
} else {
buf->error = XML_ERR_NO_MEMORY;
}
} else {
xmlOutputBufferWriteQuotedString(buf, value);
}
xmlFree(value);
} else {
buf->error = XML_ERR_NO_MEMORY;
}
}
}
/**
* Serialize an HTML node to an output buffer.
*
* If `encoding` is specified, it is used to create or update meta
* tags containing the character encoding.
*
* @param buf the HTML buffer output
* @param doc the document
* @param cur the current node
* @param encoding the encoding string (optional)
* @param format should formatting newlines been added
*/
void
htmlNodeDumpInternal(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
const char *encoding, int format) {
xmlNodePtr root, parent, metaHead = NULL;
xmlAttrPtr attr;
const htmlElemDesc * info;
xmlInitParser();
if ((cur == NULL) || (buf == NULL)) {
return;
}
root = cur;
parent = cur->parent;
while (1) {
switch (cur->type) {
case XML_HTML_DOCUMENT_NODE:
case XML_DOCUMENT_NODE:
if (((xmlDocPtr) cur)->intSubset != NULL) {
htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
}
if (cur->children != NULL) {
/* Always validate cur->parent when descending. */
if (cur->parent == parent) {
parent = cur;
cur = cur->children;
continue;
}
} else {
xmlOutputBufferWriteString(buf, "\n");
}
break;
case XML_ELEMENT_NODE: {
htmlMetaEncoding menc;
int isMeta = 0;
int addMeta = 0;
/*
* Some users like lxml are known to pass nodes with a corrupted
* tree structure. Fall back to a recursive call to handle this
* case.
*/
if ((cur->parent != parent) && (cur->children != NULL)) {
htmlNodeDumpInternal(buf, doc, cur, encoding, format);
break;
}
/*
* Get specific HTML info for that node.
*/
if (cur->ns == NULL) {
info = htmlTagLookup(cur->name);
if (encoding != NULL) {
isMeta = htmlParseMetaEncoding(cur, &menc);
/*
* Don't add meta tag for "HTML" encoding.
*/
if ((xmlStrcasecmp(BAD_CAST encoding,
BAD_CAST "HTML") != 0) &&
(xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) &&
(parent != NULL) &&
(parent->ns == NULL) &&
(xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) &&
(parent->parent != NULL) &&
(parent->parent->parent == NULL) &&
(metaHead == NULL)) {
xmlNodePtr n;
metaHead = cur;
addMeta = 1;
for (n = cur->children; n != NULL; n = n->next) {
int unused;
if (htmlFindMetaEncodingAttr(n, &unused) != NULL) {
metaHead = NULL;
addMeta = 0;
break;
}
}
}
}
} else {
info = NULL;
}
xmlOutputBufferWriteString(buf, "<");
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
xmlOutputBufferWriteString(buf, ":");
}
xmlOutputBufferWriteString(buf, (const char *)cur->name);
if (cur->nsDef)
xmlNsListDumpOutput(buf, cur->nsDef);
attr = cur->properties;
while (attr != NULL) {
if ((!isMeta) || (attr != menc.attr)) {
htmlAttrDumpOutput(buf, doc, attr);
} else {
xmlChar *newVal;
xmlOutputBufferWriteString(buf, " ");
xmlOutputBufferWriteString(buf, (char *) attr->name);
newVal = htmlUpdateMetaEncoding(&menc, encoding);
if (newVal == NULL) {
buf->error = XML_ERR_NO_MEMORY;
return;
}
xmlOutputBufferWriteString(buf, "=");
xmlOutputBufferWriteQuotedString(buf, newVal);
xmlFree(newVal);
}
attr = attr->next;
}
if ((info != NULL) && (info->empty)) {
xmlOutputBufferWriteString(buf, ">");
} else if (cur->children == NULL) {
if ((info != NULL) && (info->saveEndTag != 0) &&
(xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
(xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
xmlOutputBufferWriteString(buf, ">");
} else {
if (addMeta) {
xmlOutputBufferWriteString(buf, "><meta charset=\"");
/* TODO: Escape */
xmlOutputBufferWriteString(buf, encoding);
xmlOutputBufferWriteString(buf, "\"></");
} else {
xmlOutputBufferWriteString(buf, "></");
}
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
xmlOutputBufferWriteString(buf,
(const char *)cur->ns->prefix);
xmlOutputBufferWriteString(buf, ":");
}
xmlOutputBufferWriteString(buf, (const char *)cur->name);
xmlOutputBufferWriteString(buf, ">");
}
} else {
xmlOutputBufferWriteString(buf, ">");
if ((format) &&
((addMeta) ||
((info != NULL) && (!info->isinline) &&
(cur->children->type != HTML_TEXT_NODE) &&
(cur->children->type != HTML_ENTITY_REF_NODE) &&
(cur->children != cur->last) &&
(cur->name != NULL) &&
(cur->name[0] != 'p')))) /* p, pre, param */
xmlOutputBufferWriteString(buf, "\n");
if (addMeta) {
xmlOutputBufferWriteString(buf, "<meta charset=\"");
/* TODO: Escape */
xmlOutputBufferWriteString(buf, encoding);
xmlOutputBufferWriteString(buf, "\">");
if ((format) &&
(cur->children->type != HTML_TEXT_NODE) &&
(cur->children->type != HTML_ENTITY_REF_NODE))
xmlOutputBufferWriteString(buf, "\n");
}
parent = cur;
cur = cur->children;
continue;
}
if ((format) && (cur->next != NULL) &&
(info != NULL) && (!info->isinline)) {
if ((cur->next->type != HTML_TEXT_NODE) &&
(cur->next->type != HTML_ENTITY_REF_NODE) &&
(parent != NULL) &&
(parent->name != NULL) &&
(parent->name[0] != 'p')) /* p, pre, param */
xmlOutputBufferWriteString(buf, "\n");
}
break;
}
case XML_ATTRIBUTE_NODE:
htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
break;
case HTML_TEXT_NODE:
if (cur->content == NULL)
break;
if (((cur->name == (const xmlChar *)xmlStringText) ||
(cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
((parent == NULL) ||
((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
(xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
xmlChar *buffer;
buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
if (buffer == NULL) {
buf->error = XML_ERR_NO_MEMORY;
return;
}
xmlOutputBufferWriteString(buf, (const char *)buffer);
xmlFree(buffer);
} else {
xmlOutputBufferWriteString(buf, (const char *)cur->content);
}
break;
case HTML_COMMENT_NODE:
if (cur->content != NULL) {
xmlOutputBufferWriteString(buf, "<!--");
xmlOutputBufferWriteString(buf, (const char *)cur->content);
xmlOutputBufferWriteString(buf, "-->");
}
break;
case HTML_PI_NODE:
if (cur->name != NULL) {
xmlOutputBufferWriteString(buf, "<?");
xmlOutputBufferWriteString(buf, (const char *)cur->name);
if (cur->content != NULL) {
xmlOutputBufferWriteString(buf, " ");
xmlOutputBufferWriteString(buf,
(const char *)cur->content);
}
xmlOutputBufferWriteString(buf, ">");
}
break;
case HTML_ENTITY_REF_NODE:
xmlOutputBufferWriteString(buf, "&");
xmlOutputBufferWriteString(buf, (const char *)cur->name);
xmlOutputBufferWriteString(buf, ";");
break;
case HTML_PRESERVE_NODE:
if (cur->content != NULL) {
xmlOutputBufferWriteString(buf, (const char *)cur->content);
}
break;
default:
break;
}
while (1) {
if (cur == root)
return;
if (cur->next != NULL) {
cur = cur->next;
break;
}
cur = parent;
/* cur->parent was validated when descending. */
parent = cur->parent;
if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
(cur->type == XML_DOCUMENT_NODE)) {
xmlOutputBufferWriteString(buf, "\n");
} else {
if ((format) && (cur->ns == NULL))
info = htmlTagLookup(cur->name);
else
info = NULL;
if ((format) && (info != NULL) && (!info->isinline) &&
(cur->last->type != HTML_TEXT_NODE) &&
(cur->last->type != HTML_ENTITY_REF_NODE) &&
((cur->children != cur->last) || (cur == metaHead)) &&
(cur->name != NULL) &&
(cur->name[0] != 'p')) /* p, pre, param */
xmlOutputBufferWriteString(buf, "\n");
xmlOutputBufferWriteString(buf, "</");
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
xmlOutputBufferWriteString(buf, ":");
}
xmlOutputBufferWriteString(buf, (const char *)cur->name);
xmlOutputBufferWriteString(buf, ">");
if ((format) && (info != NULL) && (!info->isinline) &&
(cur->next != NULL)) {
if ((cur->next->type != HTML_TEXT_NODE) &&
(cur->next->type != HTML_ENTITY_REF_NODE) &&
(parent != NULL) &&
(parent->name != NULL) &&
(parent->name[0] != 'p')) /* p, pre, param */
xmlOutputBufferWriteString(buf, "\n");
}
if (cur == metaHead)
metaHead = NULL;
}
}
}
}
/**
* Serialize an HTML node to an output buffer.
*
* @param buf the HTML buffer output
* @param doc the document
* @param cur the current node
* @param encoding the encoding string (unused)
* @param format should formatting newlines been added
*/
void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
const char *encoding ATTRIBUTE_UNUSED, int format) {
htmlNodeDumpInternal(buf, doc, cur, NULL, format);
}
/**
* Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is
* typically undesired. Use of this function is DISCOURAGED in favor
* of htmlNodeDumpFormatOutput().
*
* @param buf the HTML buffer output
* @param doc the document
* @param cur the current node
* @param encoding the encoding string (unused)
*/
void
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur,
const char *encoding ATTRIBUTE_UNUSED) {
htmlNodeDumpInternal(buf, doc, cur, NULL, 1);
}
/**
* Serialize an HTML document to an output buffer.
*
* @param buf the HTML buffer output
* @param cur the document
* @param encoding the encoding string (unused)
* @param format should formatting newlines been added
*/
void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
const char *encoding ATTRIBUTE_UNUSED,
int format) {
htmlNodeDumpInternal(buf, cur, (xmlNodePtr) cur, NULL, format);
}
/**
* Same as htmlDocContentDumpFormatDump() with `format` set to 1
* which is typically undesired. Use of this function is DISCOURAGED
* in favor of htmlDocContentDumpFormatOutput().
*
* @param buf the HTML buffer output
* @param cur the document
* @param encoding the encoding string (unused)
*/
void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
const char *encoding ATTRIBUTE_UNUSED) {
htmlNodeDumpInternal(buf, cur, (xmlNodePtr) cur, NULL, 1);
}
/************************************************************************
* *
* Saving functions front-ends *
* *
************************************************************************/
/**
* Serialize an HTML document to an open `FILE`.
*
* Uses the encoding of the document. If the document has no
* encoding, ASCII with HTML 4.0 named character entities will
* be used. This is inefficient compared to UTF-8 and might be
* changed in a future version.
*
* Enables "formatting" unconditionally which is typically
* undesired.
*
* Use of this function is DISCOURAGED in favor of
* htmlNodeDumpFileFormat().
*
* @param f the FILE*
* @param cur the document
* @returns the number of bytes written or -1 in case of failure.
*/
int
htmlDocDump(FILE *f, xmlDocPtr cur) {
xmlOutputBufferPtr buf;
xmlCharEncodingHandlerPtr handler = NULL;
int ret;
xmlInitParser();
if ((cur == NULL) || (f == NULL)) {
return(-1);
}
if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
return(-1);
buf = xmlOutputBufferCreateFile(f, handler);
if (buf == NULL)
return(-1);
htmlDocContentDumpOutput(buf, cur, NULL);
ret = xmlOutputBufferClose(buf);
return(ret);
}
/**
* Serialize an HTML document to a file.
*
* Same as htmlSaveFileFormat() with `encoding` set to NULL and
* `format` set to 1 which is typically undesired.
*
* Use of this function is DISCOURAGED in favor of
* htmlSaveFileFormat().
*
* @param filename the filename (or URL)
* @param cur the document
* @returns the number of bytes written or -1 in case of failure.
*/
int
htmlSaveFile(const char *filename, xmlDocPtr cur) {
return(htmlSaveFileFormat(filename, cur, NULL, 1));
}
/**
* Serialize an HTML document to a file using a given encoding.
*
* If `filename` is `"-"`, stdout is used. This is potentially
* insecure and might be changed in a future version.
*
* If encoding is NULL, ASCII with HTML 4.0 named character entities
* will be used. This is inefficient compared to UTF-8 and might be
* changed in a future version.
*
* Sets or updates meta tags containing the character encoding.
*
* @param filename the filename
* @param cur the document
* @param format should formatting newlines been added
* @param encoding the document encoding (optional)
* @returns the number of bytes written or -1 in case of failure.
*/
int
htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
const char *encoding, int format) {
xmlOutputBufferPtr buf;
xmlCharEncodingHandlerPtr handler = NULL;
int ret;
if ((cur == NULL) || (filename == NULL))
return(-1);
xmlInitParser();
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
return(-1);
/*
* save the content to a temp buffer.
*/
buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
if (buf == NULL)
return(0);
htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
ret = xmlOutputBufferClose(buf);
return(ret);
}
/**
* Serialize an HTML document to a file.
*
* Same as htmlSaveFileFormat() with `format` set to 1 which is
* typically undesired. Also see the warnings there. Use of this
* function is DISCOURAGED in favor of htmlSaveFileFormat().
*
* @param filename the filename
* @param cur the document
* @param encoding the document encoding
* @returns the number of bytes written or -1 in case of failure.
*/
int
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
return(htmlSaveFileFormat(filename, cur, encoding, 1));
}
#endif /* LIBXML_OUTPUT_ENABLED */
#endif /* LIBXML_HTML_ENABLED */