1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-24 13:33:01 +03:00
Files
libxml2/HTMLtree.c
Nick Wellnhofer af4fae5ae3 html: Add some comments regarding HTML5 serialization
It seems that the specification of the HTML output method in XSLT 1.0
had a lot of influence on how the HTML serializer in libxml2 ended up:

https://www.w3.org/TR/xslt-10/#section-HTML-Output-Method

There are two remaining behaviors suggested by XSLT 1.0 that don't match
the HTML5 fragment serialization algorithm:

We escape non-ASCII characters in URI attributes (the list of which is
probably outdated). This was originally recommended in appendix B of the
HTML 4.01 spec, but only for user agents:

https://www.w3.org/TR/html401/appendix/notes.html#h-B.2.1

From my experience, any tool that processes HTML should escape as little
as possible. For example, we used to escape many more characters which
are invalid in URIs, but often used in template languages. (Note that we
still escape whitespace and control chars.) Nevertheless, I guess that
some libxslt users continue to expect this behavior from libxml2.

Then we collapse Boolean attributes using an outdated list. This is
mostly a cosmetic issue, but a somewhat important one for libxslt users.

We probably need a serialization option for the xmlsave module that
enables fully HTML5-conformant output.
2025-05-13 23:00:51 +02:00

1315 lines
38 KiB
C

/*
* HTMLtree.c : implementation of access function for an HTML tree.
*
* See Copyright for the status of this software.
*
* Author: Daniel Veillard
*/
#define IN_LIBXML
#include "libxml.h"
#ifdef LIBXML_HTML_ENABLED
#include <string.h> /* for memset() only ! */
#include <ctype.h>
#include <stdlib.h>
#include <libxml/xmlmemory.h>
#include <libxml/HTMLparser.h>
#include <libxml/HTMLtree.h>
#include <libxml/entities.h>
#include <libxml/xmlerror.h>
#include <libxml/parserInternals.h>
#include <libxml/uri.h>
#include "private/buf.h"
#include "private/html.h"
#include "private/error.h"
#include "private/html.h"
#include "private/io.h"
#include "private/save.h"
#include "private/tree.h"
/************************************************************************
* *
* Getting/Setting encoding meta tags *
* *
************************************************************************/
/*
 * Describes an encoding declaration found on a <meta> element: the
 * attribute that carries it, the attribute's text value and the
 * position of the encoding name inside that value.
 */
typedef struct {
xmlAttrPtr attr; /* charset or content */
/*
 * Text of `attr`. Either points at the content of the attribute's
 * single text child or at an empty string literal.
 */
const xmlChar *attrValue;
/*
 * `start`/`end` delimit the encoding name inside attrValue,
 * `size` is the total length of attrValue.
 */
htmlMetaEncodingOffsets off;
} htmlMetaEncoding;
/*
 * Return the first element child of `parent` whose name equals `name`
 * (compared case-insensitively), or NULL if there is no such child.
 */
static htmlNodePtr
htmlFindFirstChild(htmlNodePtr parent, const char *name) {
    htmlNodePtr node = parent->children;

    while (node != NULL) {
        if ((node->type == XML_ELEMENT_NODE) &&
            (xmlStrcasecmp(node->name, BAD_CAST name) == 0))
            break;
        node = node->next;
    }

    return(node);
}
/*
 * Locate the <head> element of a document: the first "head" element
 * child of the first "html" element child of `doc`. Returns NULL if
 * `doc` is NULL or either element is missing.
 */
static htmlNodePtr
htmlFindHead(htmlDocPtr doc) {
    htmlNodePtr root = NULL;

    if (doc != NULL)
        root = htmlFindFirstChild((htmlNodePtr) doc, "html");

    return((root != NULL) ? htmlFindFirstChild(root, "head") : NULL);
}
/*
 * Find the value of the first usable "charset" parameter in a
 * Content-Type style string.
 *
 * The keyword is matched case-insensitively and may be followed by
 * whitespace, '=' and more whitespace. The value is either quoted
 * with '"' or '\'' (surrounding whitespace inside the quotes is
 * excluded) or runs up to the next whitespace, ';' or end of string.
 *
 * On success, off->start and off->end are the offsets of the value
 * within `val` and off->size is the total length of `val`.
 *
 * Returns 1 if a charset value was found, 0 otherwise (including an
 * unterminated quoted value). `off` is only written on success.
 */
int
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
    const xmlChar *cur = val;

    for (;;) {
        size_t first, last;

        /* Advance to the next 'c' or 'C', giving up at end of string. */
        for (; (*cur != 'c') && (*cur != 'C'); cur++) {
            if (*cur == 0)
                return(0);
        }
        cur++;

        if (xmlStrncasecmp(cur, BAD_CAST "harset", 6) != 0)
            continue;
        cur += 6;

        while (IS_WS_HTML(*cur))
            cur++;
        if (*cur != '=')
            continue;
        cur++;
        while (IS_WS_HTML(*cur))
            cur++;

        if (*cur == 0)
            return(0);

        if ((*cur != '"') && (*cur != '\'')) {
            /* Unquoted value */
            first = cur - val;
            while ((*cur != 0) && (*cur != ';') && (!IS_WS_HTML(*cur)))
                cur++;
            last = cur - val;
        } else {
            /* Quoted value, trimmed of leading and trailing whitespace */
            int delim = *cur;

            cur++;
            while (IS_WS_HTML(*cur))
                cur++;
            first = cur - val;
            last = first;

            for (; *cur != delim; cur++) {
                if (*cur == 0)
                    return(0);
                if (!IS_WS_HTML(*cur))
                    last = cur + 1 - val;
            }
        }

        off->start = first;
        off->end = last;
        off->size = cur - val + strlen((char *) cur);
        return(1);
    }

    return(0);
}
/*
 * Find the attribute of a "meta" node that declares an encoding.
 *
 * A non-namespaced "charset" attribute is returned as soon as it is
 * seen, with *outIsContentType set to 0. Otherwise, if the node has
 * an "http-equiv" attribute whose single text child equals
 * "Content-Type" and a "content" attribute, the "content" attribute
 * is returned with *outIsContentType set to 1. All comparisons are
 * case-insensitive.
 *
 * Returns NULL if the node isn't named "meta" or has no such
 * attribute; *outIsContentType is left untouched in that case.
 */
static xmlAttrPtr
htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
    xmlAttrPtr cur;
    xmlAttrPtr content = NULL;
    int sawContentType = 0;

    if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)
        return(NULL);

    for (cur = elem->properties; cur != NULL; cur = cur->next) {
        if (cur->ns != NULL)
            continue;

        if (xmlStrcasecmp(cur->name, BAD_CAST "charset") == 0) {
            *outIsContentType = 0;
            return(cur);
        }

        if (xmlStrcasecmp(cur->name, BAD_CAST "content") == 0) {
            content = cur;
        } else if (xmlStrcasecmp(cur->name, BAD_CAST "http-equiv") == 0) {
            xmlNodePtr text = cur->children;

            if ((text != NULL) &&
                (text->type == XML_TEXT_NODE) &&
                (text->next == NULL) &&
                (xmlStrcasecmp(text->content,
                               BAD_CAST "Content-Type") == 0))
                sawContentType = 1;
        }
    }

    if ((!sawContentType) || (content == NULL))
        return(NULL);

    *outIsContentType = 1;
    return(content);
}
/*
 * Check whether `elem` is a "meta" element declaring an encoding and,
 * if so, describe the declaration in `menc`.
 *
 * The attribute value is the content of the attribute's single text
 * child, or the empty string if the attribute has any other shape.
 * For a "charset" attribute the encoding is the whole value with
 * leading and trailing whitespace removed. For a Content-Type
 * "content" attribute it is located by htmlParseContentType().
 *
 * Returns 1 and fills `menc` if a declaration was found, 0 otherwise.
 */
static int
htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
    xmlAttrPtr attr;
    xmlNodePtr text;
    const xmlChar *value = BAD_CAST "";
    int fromContentType;
    size_t len, first, last;

    if (elem->type != XML_ELEMENT_NODE)
        return(0);
    if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)
        return(0);

    attr = htmlFindMetaEncodingAttr(elem, &fromContentType);
    if (attr == NULL)
        return(0);

    text = attr->children;
    if ((text != NULL) &&
        (text->type == XML_TEXT_NODE) &&
        (text->next == NULL) &&
        (text->content != NULL))
        value = text->content;

    if (fromContentType) {
        if (!htmlParseContentType(value, &menc->off))
            return(0);
        menc->attr = attr;
        menc->attrValue = value;
        return(1);
    }

    /* charset attribute: trim whitespace on both sides */
    len = strlen((char *) value);
    first = 0;
    while ((first < len) && (IS_WS_HTML(value[first])))
        first++;
    last = len;
    while ((last > 0) && (IS_WS_HTML(value[last - 1])))
        last--;

    menc->attr = attr;
    menc->attrValue = value;
    menc->off.start = first;
    menc->off.end = last;
    menc->off.size = len;
    return(1);
}
/*
 * Build a copy of the attribute value described by `menc` in which
 * the old encoding name is replaced with `encoding`.
 *
 * Returns a newly allocated, zero-terminated string which the caller
 * must release with xmlFree(), or NULL if the allocation failed.
 */
static xmlChar *
htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
    const xmlChar *old = menc->attrValue;
    size_t prefixLen = menc->off.start;
    size_t suffixLen = menc->off.size - menc->off.end;
    size_t encLen, total;
    xmlChar *result;

    /*
     * The pseudo "HTML" encoding only produces ASCII.
     */
    if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
        encoding = "ASCII";
    encLen = strlen((char *) encoding);

    total = prefixLen + encLen + suffixLen;
    result = xmlMalloc(total + 1);
    if (result == NULL)
        return(NULL);

    /* prefix + new encoding + suffix */
    memcpy(result, old, prefixLen);
    memcpy(result + prefixLen, encoding, encLen);
    memcpy(result + prefixLen + encLen, old + menc->off.end, suffixLen);
    result[total] = 0;

    return(result);
}
/**
 * Look up an encoding declaration in the meta tags of the document's
 * head element.
 *
 * The returned string points into attribute content and can contain
 * trailing garbage after the encoding name. It should be copied
 * before modifying or freeing nodes.
 *
 * @param doc  the document
 * @returns the encoding or NULL if not found.
 */
const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc) {
    htmlNodePtr head = htmlFindHead(doc);
    htmlNodePtr child;
    htmlMetaEncoding menc;

    if (head == NULL)
        return(NULL);

    child = head->children;
    while (child != NULL) {
        /*
         * A `const xmlChar *` return value can only express a suffix
         * of the attribute value. In http-equiv meta tags, more data
         * may follow the charset, although that is probably rare in
         * practice.
         */
        if (htmlParseMetaEncoding(child, &menc))
            return(menc.attrValue + menc.off.start);
        child = child->next;
    }

    return(NULL);
}
/**
 * Creates or updates a meta tag with an encoding declaration.
 *
 * Every meta element in the head that already declares an encoding
 * is rewritten. If there is none, a new `<meta charset="...">`
 * element is inserted as the first child of the head.
 *
 * NOTE: This will not change the document content encoding.
 *
 * @param doc  the document
 * @param encoding  the encoding string
 * @returns 0 in case of success, 1 if no head element was found or
 * arguments are invalid and -1 if memory allocation failed.
 */
int
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
    htmlNodePtr head, node;
    int updated = 0;

    if (encoding == NULL)
        return(1);

    head = htmlFindHead(doc);
    if (head == NULL)
        return(1);

    node = head->children;
    while (node != NULL) {
        htmlMetaEncoding menc;

        if (htmlParseMetaEncoding(node, &menc)) {
            xmlChar *value;
            int res;

            updated = 1;

            /*
             * The new value must be built before the old attribute
             * content is released, since menc points into it.
             */
            value = htmlUpdateMetaEncoding(&menc, (char *) encoding);
            if (value == NULL)
                return(-1);
            xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
            res = xmlNodeAddContent((xmlNodePtr) menc.attr, value);
            xmlFree(value);

            if (res < 0)
                return(-1);
        }

        node = node->next;
    }

    if (!updated) {
        node = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
        if (node == NULL)
            return(-1);

        if (xmlNewProp(node, BAD_CAST "charset", encoding) == NULL) {
            xmlFreeNode(node);
            return(-1);
        }

        if (head->children != NULL)
            xmlAddPrevSibling(head->children, node);
        else
            xmlAddChild(head, node);
    }

    return(0);
}
/**
* Determine if a given attribute is a boolean attribute. This
* doesn't handle HTML5.
*
* @deprecated Internal function, don't use.
*
* @param name the name of the attribute to check
* @returns false if the attribute is not boolean, true otherwise.
*/
int
htmlIsBooleanAttr(const xmlChar *name)
{
const char *str = NULL;
if (name == NULL)
return(0);
/*
* These are the HTML attributes which will be output
* in minimized form, i.e. `<option selected="selected">` will be
* output as `<option selected>`, as per XSLT 1.0 16.2 "HTML Output
* Method":
*
* "checked", "compact", "declare", "defer", "disabled", "ismap",
* "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
* "selected"
*
* Additional attributes from HTML5 (not implemented yet):
*
* "allowfullscreen", "alpha", "async", "autofocus", "autoplay",
* "controls", "default", "formnovalidate", "inert", "itemscope",
* "loop", "muted", "nomodule", "novalidate", "open", "playsinline",
* "required", "reversed", "shadowrootdelegatesfocus",
* "shadowrootclonable", "shadowrootserializable",
* "shadowrootcustomelementregistry", "truespeed"
*/
switch (name[0] | 0x20) {
case 'c':
name += 1;
switch (name[0] | 0x20) {
case 'h': str = "ecked"; break;
case 'o': str = "mpact"; break;
}
break;
case 'd':
name += 1;
switch (name[0] | 0x20) {
case 'e':
name += 1;
switch (name[0] | 0x20) {
case 'c': str = "lare"; break;
case 'f': str = "er"; break;
}
break;
case 'i': str = "sabled"; break;
}
break;
case 'i':
str = "smap";
break;
case 'm':
str = "ultiple";
break;
case 'n':
name += 1;
if ((name[0] | 0x20) != 'o')
break;
name += 1;
switch (name[0] | 0x20) {
case 'h': str = "ref"; break;
case 'r': str = "esize"; break;
case 's': str = "hade"; break;
case 'w': str = "rap"; break;
}
break;
case 'r':
str = "eadonly";
break;
case 's':
str = "elected";
break;
}
if (str == NULL)
return(0);
return(xmlStrcasecmp(name + 1, BAD_CAST str) == 0);
}
#ifdef LIBXML_OUTPUT_ENABLED
/************************************************************************
* *
* Dumping HTML tree content to a simple buffer *
* *
************************************************************************/
/*
 * Open an output encoding handler for `encoding`. If no encoding is
 * given, the "HTML" encoding is used as fallback.
 *
 * Returns the result of xmlOpenCharEncodingHandler().
 */
static xmlParserErrors
htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) {
    const char *name = (encoding != NULL) ? encoding : "HTML";

    return(xmlOpenCharEncodingHandler(name, /* output */ 1, out));
}
/**
 * Serialize an HTML node to an xmlBuf.
 *
 * A temporary output buffer without encoder or I/O callbacks is
 * wrapped around `buf`, so the serialized data is appended to `buf`
 * unconverted.
 *
 * @param buf  the xmlBufPtr output
 * @param doc  the document (unused)
 * @param cur  the current node
 * @param format  should formatting newlines been added
 * @returns the number of bytes written or `(size_t) -1` in case of
 * error
 */
static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED,
                      xmlNodePtr cur, int format) {
    xmlOutputBufferPtr out;
    size_t before, written;

    if ((cur == NULL) || (buf == NULL))
        return((size_t) -1);

    out = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
    if (out == NULL)
        return((size_t) -1);

    memset(out, 0, sizeof(xmlOutputBuffer));
    out->buffer = buf;
    out->encoder = NULL;
    out->writecallback = NULL;
    out->closecallback = NULL;
    out->context = NULL;
    out->written = 0;

    /* Only report what this call appended to the buffer. */
    before = xmlBufUse(buf);
    htmlNodeDumpInternal(out, cur, NULL, format);
    written = (out->error) ? (size_t) -1 : xmlBufUse(buf) - before;

    /* The wrapper doesn't own `buf`, so only the struct is freed. */
    xmlFree(out);

    return(written);
}
/**
 * Serialize an HTML node to an xmlBuffer. Always uses UTF-8.
 *
 * Formatting newlines are always enabled.
 *
 * @param buf  the HTML buffer output
 * @param doc  the document
 * @param cur  the current node
 * @returns the number of bytes written (capped at INT_MAX) or -1 in
 * case of error
 */
int
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
    xmlBufPtr wrapped;
    size_t written;
    int status;

    if ((buf == NULL) || (cur == NULL))
        return(-1);

    xmlInitParser();

    wrapped = xmlBufFromBuffer(buf);
    if (wrapped == NULL)
        return(-1);

    written = htmlBufNodeDumpFormat(wrapped, doc, cur, 1);
    /* Always hand the data back, even if serialization failed. */
    status = xmlBufBackToBuffer(wrapped, buf);

    if (written == (size_t) -1)
        return(-1);
    if (status < 0)
        return(-1);
    if (written > INT_MAX)
        return(INT_MAX);

    return((int) written);
}
/**
 * Serialize an HTML node to a `FILE`.
 *
 * If encoding is NULL, ASCII with HTML 4.0 named character entities
 * will be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * @param out  the FILE pointer
 * @param doc  the document (unused)
 * @param cur  the current node
 * @param encoding  the document encoding (optional)
 * @param format  should formatting newlines been added
 * @returns the number of bytes written or -1 in case of failure.
 */
int
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc ATTRIBUTE_UNUSED,
                       xmlNodePtr cur, const char *encoding, int format) {
    xmlCharEncodingHandlerPtr enc;
    xmlOutputBufferPtr output;

    xmlInitParser();

    if (htmlFindOutputEncoder(encoding, &enc) != XML_ERR_OK)
        return(-1);

    output = xmlOutputBufferCreateFile(out, enc);
    if (output == NULL) {
        /* The handler was not taken over by an output buffer. */
        xmlCharEncCloseFunc(enc);
        return(-1);
    }

    htmlNodeDumpInternal(output, cur, NULL, format);

    return(xmlOutputBufferClose(output));
}
/**
 * Same as htmlNodeDumpFileFormat() with `encoding` set to NULL and
 * `format` set to 1 which is typically undesired. Use of this
 * function is DISCOURAGED in favor of htmlNodeDumpFileFormat().
 *
 * The result of htmlNodeDumpFileFormat() is discarded, so failures
 * can't be detected through this function.
 *
 * @param out  the FILE pointer
 * @param doc  the document
 * @param cur  the current node
 */
void
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
}
/**
 * Serialize an HTML document to memory, also returning the size of
 * the result. It's up to the caller to free the memory with
 * xmlFree().
 *
 * Uses the encoding of the document. If the document has no
 * encoding, ASCII with HTML 4.0 named character entities will
 * be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * On any failure (invalid arguments, unknown encoding, serialization
 * error, memory allocation failure or a result larger than INT_MAX),
 * `*mem` is NULL and `*size` is 0. If `mem` or `size` is NULL, nothing
 * is written at all.
 *
 * @param cur  the document
 * @param mem  OUT: the memory pointer
 * @param size  OUT: the memory length
 * @param format  should formatting newlines been added
 */
void
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
    xmlOutputBufferPtr buf;
    xmlCharEncodingHandlerPtr handler = NULL;

    xmlInitParser();

    if ((mem == NULL) || (size == NULL))
        return;
    *mem = NULL;
    *size = 0;
    if (cur == NULL)
        return;

    if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
        return;
    buf = xmlAllocOutputBuffer(handler);
    if (buf == NULL) {
        xmlCharEncCloseFunc(handler);
        return;
    }

    htmlDocContentDumpFormatOutput(buf, cur, NULL, format);

    xmlOutputBufferFlush(buf);

    if (!buf->error) {
        /* With an encoder, the converted data ends up in `conv`. */
        xmlBufPtr result = (buf->conv != NULL) ? buf->conv : buf->buffer;
        size_t len = xmlBufUse(result);

        /*
         * The length is reported as int. Only publish the size once
         * the copy succeeded so that `*mem` and `*size` always agree.
         */
        if (len <= (size_t) INT_MAX) {
            xmlChar *copy = xmlStrndup(xmlBufContent(result), (int) len);

            if (copy != NULL) {
                *mem = copy;
                *size = (int) len;
            }
        }
    }

    xmlOutputBufferClose(buf);
}
/**
 * Same as htmlDocDumpMemoryFormat() with `format` set to 1 which
 * is typically undesired. Also see the warnings there. Use of
 * this function is DISCOURAGED in favor of
 * htmlDocContentDumpFormatOutput().
 *
 * @param cur  the document
 * @param mem  OUT: the memory pointer
 * @param size  OUT: the memory length
 */
void
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
htmlDocDumpMemoryFormat(cur, mem, size, 1);
}
/************************************************************************
* *
* Dumping HTML tree content to an I/O output buffer *
* *
************************************************************************/
/**
 * Serialize the HTML document's internal DTD subset as a DOCTYPE
 * declaration, if there is one.
 *
 * A public identifier is written as `PUBLIC "..."`, optionally
 * followed by the system identifier. Without a public identifier,
 * the system identifier is written as `SYSTEM "..."` unless it is
 * "about:legacy-compat".
 *
 * Ignores `encoding` and uses the encoding of the output buffer.
 *
 * @param buf  the HTML buffer output
 * @param doc  the document
 * @param encoding  the encoding string (unused)
 */
static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
                  const char *encoding ATTRIBUTE_UNUSED) {
    xmlDtdPtr dtd = doc->intSubset;
    const xmlChar *publicId, *systemId;

    if (dtd == NULL)
        return;

    publicId = dtd->ExternalID;
    systemId = dtd->SystemID;

    xmlOutputBufferWrite(buf, 10, "<!DOCTYPE ");
    xmlOutputBufferWriteString(buf, (const char *) dtd->name);

    if (publicId != NULL) {
        xmlOutputBufferWrite(buf, 8, " PUBLIC ");
        xmlOutputBufferWriteQuotedString(buf, publicId);
        if (systemId != NULL) {
            xmlOutputBufferWrite(buf, 1, " ");
            xmlOutputBufferWriteQuotedString(buf, systemId);
        }
    } else if ((systemId != NULL) &&
               (xmlStrcmp(systemId, BAD_CAST "about:legacy-compat") != 0)) {
        xmlOutputBufferWrite(buf, 8, " SYSTEM ");
        xmlOutputBufferWriteQuotedString(buf, systemId);
    }

    xmlOutputBufferWrite(buf, 2, ">\n");
}
/*
 * Write the value of a URI attribute to the output buffer.
 *
 * Leading whitespace is copied verbatim. After that, bytes in the
 * range 0x21..0x7E are copied as they are, except that '"' becomes
 * `&quot;` and '&' becomes `&amp;`. Every other byte (space, control
 * characters and anything from 0x7F up) is written as `%XX` with
 * upper-case hex digits.
 *
 * See appendix "B.2.1 Non-ASCII characters in URI attribute
 * values" in the HTML 4.01 spec. This is also recommended
 * by the HTML output method of the XSLT 1.0 spec.
 */
static void
htmlSerializeUri(xmlOutputBufferPtr buf, const xmlChar *content) {
    static const char hexDigits[16] = {
        '0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
    };
    const xmlChar *start = content; /* first byte not yet written */
    const xmlChar *cur = content;

    /* Skip over initial whitespace */
    while (IS_WS_HTML(*cur))
        cur++;
    if (cur > start) {
        xmlOutputBufferWrite(buf, cur - start, (char *) start);
        start = cur;
    }

    for (;;) {
        char pct[3];
        int ch;

        /* Collect a run of bytes which need no escaping. */
        for (ch = *cur;
             (ch > 0x20) && (ch < 0x7F) && (ch != '"') && (ch != '&');
             ch = *++cur)
            ;

        if (cur > start)
            xmlOutputBufferWrite(buf, cur - start, (char *) start);

        if (ch == 0)
            break;

        if (ch == '"') {
            xmlOutputBufferWrite(buf, 6, "&quot;");
        } else if (ch == '&') {
            xmlOutputBufferWrite(buf, 5, "&amp;");
        } else {
            /* Space, control char or byte >= 0x7F */
            pct[0] = '%';
            pct[1] = hexDigits[(ch >> 4) & 0x0F];
            pct[2] = hexDigits[ch & 0x0F];
            xmlOutputBufferWrite(buf, 3, pct);
        }

        cur++;
        start = cur;
    }
}
/**
 * Serialize an HTML attribute, preceded by a single space.
 *
 * Attributes without children and attributes recognized by
 * htmlIsBooleanAttr() are written as a bare name. Otherwise, the
 * value is written in double quotes: text children are escaped
 * (with URI escaping for the attributes treated as URIs) and entity
 * reference children are written as `&name;`.
 *
 * @param buf  the HTML buffer output
 * @param cur  the attribute pointer
 */
static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlAttrPtr cur) {
    xmlNodePtr node;
    int uriAttr = 0;

    xmlOutputBufferWrite(buf, 1, " ");
    if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
        xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
        xmlOutputBufferWrite(buf, 1, ":");
    }
    xmlOutputBufferWriteString(buf, (const char *)cur->name);

    /*
     * The HTML5 spec requires to always serialize empty attribute
     * values as `=""`. We should probably align with HTML5 at some
     * point.
     */
    if (cur->children == NULL)
        return;
    if (htmlIsBooleanAttr(cur->name))
        return;

    xmlOutputBufferWrite(buf, 2, "=\"");

    /*
     * Special handling of URIs doesn't conform to HTML5 and
     * should probably be removed at some point.
     *
     * Only non-namespaced attributes of non-namespaced elements
     * qualify: href, action, src and the name attribute of <a>.
     */
    if ((cur->ns == NULL) && (cur->parent != NULL) &&
        (cur->parent->ns == NULL)) {
        if ((xmlStrcasecmp(cur->name, BAD_CAST "href") == 0) ||
            (xmlStrcasecmp(cur->name, BAD_CAST "action") == 0) ||
            (xmlStrcasecmp(cur->name, BAD_CAST "src") == 0)) {
            uriAttr = 1;
        } else if ((xmlStrcasecmp(cur->name, BAD_CAST "name") == 0) &&
                   (xmlStrcasecmp(cur->parent->name, BAD_CAST "a") == 0)) {
            uriAttr = 1;
        }
    }

    for (node = cur->children; node != NULL; node = node->next) {
        switch (node->type) {
            case XML_TEXT_NODE:
                if (node->content == NULL)
                    break;
                if (uriAttr)
                    htmlSerializeUri(buf, node->content);
                else
                    xmlSerializeText(buf, node->content, SIZE_MAX,
                                     XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
                break;

            case XML_ENTITY_REF_NODE:
                /* TODO: We should probably expand entity refs */
                xmlOutputBufferWrite(buf, 1, "&");
                xmlOutputBufferWriteString(buf, (char *) node->name);
                xmlOutputBufferWrite(buf, 1, ";");
                break;

            default:
                break;
        }
    }

    xmlOutputBufferWrite(buf, 1, "\"");
}
/**
 * Serialize an HTML node and its subtree to an output buffer.
 *
 * The tree is walked iteratively: the function descends into
 * children, moves on to following siblings and writes end tags while
 * climbing back up until it returns to the start node. It only
 * recurses for elements whose parent pointer doesn't match the node
 * the walk came from.
 *
 * If `encoding` is specified, it is used to create or update meta
 * tags containing the character encoding: meta elements which already
 * declare an encoding are written with `encoding` substituted, and a
 * `<meta charset="...">` element is written as first child of the
 * head element if the head has no such meta child and `encoding`
 * isn't "HTML". The tree itself is not modified.
 *
 * Does nothing if `buf` or `cur` is NULL.
 *
 * @param buf  the HTML buffer output
 * @param cur  the current node
 * @param encoding  the encoding string (optional)
 * @param format  should formatting newlines been added
 */
void
htmlNodeDumpInternal(xmlOutputBufferPtr buf, xmlNodePtr cur,
const char *encoding, int format) {
/*
 * root: node at which the walk ends.
 * parent: node the walk descended from to reach `cur`.
 * metaHead: head element that received an injected meta tag, used
 * when writing its end tag.
 */
xmlNodePtr root, parent, metaHead = NULL;
xmlAttrPtr attr;
const htmlElemDesc * info;
/* Set while inside an element whose text is written unescaped. */
int isRaw = 0;
xmlInitParser();
if ((cur == NULL) || (buf == NULL)) {
return;
}
root = cur;
parent = cur->parent;
while (1) {
/* Write `cur` itself (for elements: start tag and attributes). */
switch (cur->type) {
case XML_HTML_DOCUMENT_NODE:
case XML_DOCUMENT_NODE:
if (((xmlDocPtr) cur)->intSubset != NULL) {
htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
}
if (cur->children != NULL) {
/* Always validate cur->parent when descending. */
if (cur->parent == parent) {
parent = cur;
cur = cur->children;
continue;
}
} else {
xmlOutputBufferWrite(buf, 1, "\n");
}
break;
case XML_ELEMENT_NODE: {
htmlMetaEncoding menc;
/* cur is a meta element with an encoding declaration */
int isMeta = 0;
/* a meta charset element must be written inside cur */
int addMeta = 0;
/*
* Some users like lxml are known to pass nodes with a corrupted
* tree structure. Fall back to a recursive call to handle this
* case.
*/
if ((cur->parent != parent) && (cur->children != NULL)) {
htmlNodeDumpInternal(buf, cur, encoding, format);
break;
}
/*
* Get specific HTML info for that node.
*/
if (cur->ns == NULL)
info = htmlTagLookup(cur->name);
else
info = NULL;
if (encoding != NULL) {
isMeta = htmlParseMetaEncoding(cur, &menc);
/*
* Don't add meta tag for "HTML" encoding.
*
* A meta tag is only added to the first "head" element found
* directly below an "html" element whose own parent has no
* parent.
*/
if ((xmlStrcasecmp(BAD_CAST encoding,
BAD_CAST "HTML") != 0) &&
(xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) &&
(parent != NULL) &&
(xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) &&
(parent->parent != NULL) &&
(parent->parent->parent == NULL) &&
(metaHead == NULL)) {
xmlNodePtr n;
metaHead = cur;
addMeta = 1;
/*
 * If the head already has a meta child with an encoding
 * attribute, that one is updated instead and nothing is added.
 */
for (n = cur->children; n != NULL; n = n->next) {
int unused;
if (htmlFindMetaEncodingAttr(n, &unused) != NULL) {
metaHead = NULL;
addMeta = 0;
break;
}
}
}
}
/* Start tag: optional prefix, name, namespace declarations. */
xmlOutputBufferWrite(buf, 1, "<");
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
xmlOutputBufferWrite(buf, 1, ":");
}
xmlOutputBufferWriteString(buf, (const char *)cur->name);
if (cur->nsDef)
xmlNsListDumpOutput(buf, cur->nsDef);
attr = cur->properties;
while (attr != NULL) {
if ((!isMeta) || (attr != menc.attr)) {
htmlAttrDumpOutput(buf, attr);
} else {
/*
 * The attribute declaring the encoding: write the part of the
 * old value before the encoding name, then the new encoding,
 * then the part after the old encoding name.
 */
xmlOutputBufferWrite(buf, 1, " ");
xmlOutputBufferWriteString(buf, (char *) attr->name);
xmlOutputBufferWrite(buf, 2, "=\"");
xmlSerializeText(buf, menc.attrValue, menc.off.start,
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
xmlSerializeText(buf, menc.attrValue + menc.off.end,
menc.off.size - menc.off.end,
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
xmlOutputBufferWrite(buf, 1, "\"");
}
attr = attr->next;
}
if ((info != NULL) && (info->empty)) {
/* Element flagged as empty in its description: no end tag. */
xmlOutputBufferWrite(buf, 1, ">");
} else if (cur->children == NULL) {
/* No children: write the end tag right away. */
if (addMeta) {
xmlOutputBufferWrite(buf, 16, "><meta charset=\"");
xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
xmlOutputBufferWrite(buf, 4, "\"></");
} else {
xmlOutputBufferWrite(buf, 3, "></");
}
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
xmlOutputBufferWriteString(buf,
(const char *)cur->ns->prefix);
xmlOutputBufferWrite(buf, 1, ":");
}
xmlOutputBufferWriteString(buf, (const char *)cur->name);
xmlOutputBufferWrite(buf, 1, ">");
} else {
/* Element with children: finish the start tag and descend. */
xmlOutputBufferWrite(buf, 1, ">");
if ((format) &&
((addMeta) ||
((info != NULL) && (!info->isinline) &&
(cur->children->type != HTML_TEXT_NODE) &&
(cur->children->type != HTML_ENTITY_REF_NODE) &&
(cur->children != cur->last) &&
(cur->name != NULL) &&
(cur->name[0] != 'p')))) /* p, pre, param */
xmlOutputBufferWrite(buf, 1, "\n");
if (addMeta) {
xmlOutputBufferWrite(buf, 15, "<meta charset=\"");
xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
xmlOutputBufferWrite(buf, 2, "\">");
if ((format) &&
(cur->children->type != HTML_TEXT_NODE) &&
(cur->children->type != HTML_ENTITY_REF_NODE))
xmlOutputBufferWrite(buf, 1, "\n");
}
/* Text children of such elements are written without escaping. */
if ((info != NULL) && (info->dataMode >= DATA_RAWTEXT))
isRaw = 1;
parent = cur;
cur = cur->children;
continue;
}
/* Formatting newline after an element that was written completely. */
if ((format) && (cur->next != NULL) &&
(info != NULL) && (!info->isinline)) {
if ((cur->next->type != HTML_TEXT_NODE) &&
(cur->next->type != HTML_ENTITY_REF_NODE) &&
(parent != NULL) &&
(parent->name != NULL) &&
(parent->name[0] != 'p')) /* p, pre, param */
xmlOutputBufferWrite(buf, 1, "\n");
}
break;
}
case XML_ATTRIBUTE_NODE:
htmlAttrDumpOutput(buf, (xmlAttrPtr) cur);
break;
case HTML_TEXT_NODE:
if (cur->content == NULL)
break;
/* Text marked as "noenc" or inside a raw element is not escaped. */
if ((cur->name == (const xmlChar *)xmlStringTextNoenc) ||
(isRaw)) {
xmlOutputBufferWriteString(buf, (const char *)cur->content);
} else {
xmlSerializeText(buf, cur->content, SIZE_MAX, XML_ESCAPE_HTML);
}
break;
case HTML_COMMENT_NODE:
if (cur->content != NULL) {
xmlOutputBufferWrite(buf, 4, "<!--");
xmlOutputBufferWriteString(buf, (const char *)cur->content);
xmlOutputBufferWrite(buf, 3, "-->");
}
break;
case HTML_PI_NODE:
/* Processing instructions are closed with ">" only. */
if (cur->name != NULL) {
xmlOutputBufferWrite(buf, 2, "<?");
xmlOutputBufferWriteString(buf, (const char *)cur->name);
if (cur->content != NULL) {
xmlOutputBufferWrite(buf, 1, " ");
xmlOutputBufferWriteString(buf,
(const char *)cur->content);
}
xmlOutputBufferWrite(buf, 1, ">");
}
break;
case HTML_ENTITY_REF_NODE:
xmlOutputBufferWrite(buf, 1, "&");
xmlOutputBufferWriteString(buf, (const char *)cur->name);
xmlOutputBufferWrite(buf, 1, ";");
break;
case HTML_PRESERVE_NODE:
if (cur->content != NULL) {
xmlOutputBufferWriteString(buf, (const char *)cur->content);
}
break;
default:
break;
}
/*
 * `cur` is done. Continue with its next sibling, or climb up and
 * close ancestors until a next sibling is found or the start node
 * is reached.
 */
while (1) {
if (cur == root)
return;
if (cur->next != NULL) {
cur = cur->next;
break;
}
/* Leaving the element, so raw text mode ends. */
isRaw = 0;
cur = parent;
/* cur->parent was validated when descending. */
parent = cur->parent;
if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
(cur->type == XML_DOCUMENT_NODE)) {
xmlOutputBufferWrite(buf, 1, "\n");
} else {
/* The element description is only needed for formatting here. */
if ((format) && (cur->ns == NULL))
info = htmlTagLookup(cur->name);
else
info = NULL;
/* Formatting newline before the end tag. */
if ((format) && (info != NULL) && (!info->isinline) &&
(cur->last->type != HTML_TEXT_NODE) &&
(cur->last->type != HTML_ENTITY_REF_NODE) &&
((cur->children != cur->last) || (cur == metaHead)) &&
(cur->name != NULL) &&
(cur->name[0] != 'p')) /* p, pre, param */
xmlOutputBufferWrite(buf, 1, "\n");
xmlOutputBufferWrite(buf, 2, "</");
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
xmlOutputBufferWrite(buf, 1, ":");
}
xmlOutputBufferWriteString(buf, (const char *)cur->name);
xmlOutputBufferWrite(buf, 1, ">");
/* Formatting newline after the end tag. */
if ((format) && (info != NULL) && (!info->isinline) &&
(cur->next != NULL)) {
if ((cur->next->type != HTML_TEXT_NODE) &&
(cur->next->type != HTML_ENTITY_REF_NODE) &&
(parent != NULL) &&
(parent->name != NULL) &&
(parent->name[0] != 'p')) /* p, pre, param */
xmlOutputBufferWrite(buf, 1, "\n");
}
if (cur == metaHead)
metaHead = NULL;
}
}
}
}
/**
 * Serialize an HTML node to an output buffer.
 *
 * Forwards to htmlNodeDumpInternal() without an encoding, so meta
 * tags are written as they are found in the tree.
 *
 * @param buf  the HTML buffer output
 * @param doc  the document (unused)
 * @param cur  the current node
 * @param encoding  the encoding string (unused)
 * @param format  should formatting newlines been added
 */
void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,
xmlDocPtr doc ATTRIBUTE_UNUSED, xmlNodePtr cur,
const char *encoding ATTRIBUTE_UNUSED, int format) {
htmlNodeDumpInternal(buf, cur, NULL, format);
}
/**
 * Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is
 * typically undesired. Use of this function is DISCOURAGED in favor
 * of htmlNodeDumpFormatOutput().
 *
 * @param buf  the HTML buffer output
 * @param doc  the document (unused)
 * @param cur  the current node
 * @param encoding  the encoding string (unused)
 */
void
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED,
xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
htmlNodeDumpInternal(buf, cur, NULL, 1);
}
/**
 * Serialize an HTML document to an output buffer.
 *
 * The `encoding` argument is ignored: the document node is passed to
 * htmlNodeDumpInternal() with a NULL encoding, so no meta tags are
 * created or updated.
 *
 * @param buf  the HTML buffer output
 * @param cur  the document
 * @param encoding  the encoding string (unused)
 * @param format  should formatting newlines been added
 */
void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
const char *encoding ATTRIBUTE_UNUSED,
int format) {
htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, format);
}
/**
 * Same as htmlDocContentDumpFormatOutput() with `format` set to 1
 * which is typically undesired. Use of this function is DISCOURAGED
 * in favor of htmlDocContentDumpFormatOutput().
 *
 * @param buf  the HTML buffer output
 * @param cur  the document
 * @param encoding  the encoding string (unused)
 */
void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
const char *encoding ATTRIBUTE_UNUSED) {
htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, 1);
}
/************************************************************************
* *
* Saving functions front-ends *
* *
************************************************************************/
/**
 * Serialize an HTML document to an open `FILE`.
 *
 * Uses the encoding of the document. If the document has no
 * encoding, ASCII with HTML 4.0 named character entities will
 * be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * Enables "formatting" unconditionally which is typically
 * undesired.
 *
 * Use of this function is DISCOURAGED in favor of
 * htmlNodeDumpFileFormat().
 *
 * @param f  the FILE*
 * @param cur  the document
 * @returns the number of bytes written or -1 in case of failure.
 */
int
htmlDocDump(FILE *f, xmlDocPtr cur) {
    xmlCharEncodingHandlerPtr enc = NULL;
    xmlOutputBufferPtr output;

    xmlInitParser();

    if ((cur == NULL) || (f == NULL))
        return(-1);

    if (htmlFindOutputEncoder((char *) cur->encoding, &enc) != XML_ERR_OK)
        return(-1);

    output = xmlOutputBufferCreateFile(f, enc);
    if (output == NULL) {
        /* The handler was not taken over by an output buffer. */
        xmlCharEncCloseFunc(enc);
        return(-1);
    }

    htmlDocContentDumpOutput(output, cur, NULL);

    return(xmlOutputBufferClose(output));
}
/**
 * Serialize an HTML document to a file.
 *
 * Same as htmlSaveFileFormat() with `encoding` set to NULL and
 * `format` set to 1 which is typically undesired.
 *
 * Use of this function is DISCOURAGED in favor of
 * htmlSaveFileFormat().
 *
 * @param filename  the filename (or URL)
 * @param cur  the document
 * @returns the value returned by htmlSaveFileFormat().
 */
int
htmlSaveFile(const char *filename, xmlDocPtr cur) {
return(htmlSaveFileFormat(filename, cur, NULL, 1));
}
/**
 * Serialize an HTML document to a file using a given encoding.
 *
 * If `filename` is `"-"`, stdout is used. This is potentially
 * insecure and might be changed in a future version.
 *
 * If encoding is NULL, ASCII with HTML 4.0 named character entities
 * will be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * If `encoding` is non-NULL, meta tags containing the character
 * encoding are set or updated in the serialized output. The document
 * tree itself is left unchanged.
 *
 * @param filename  the filename
 * @param cur  the document
 * @param encoding  the document encoding (optional)
 * @param format  should formatting newlines been added
 * @returns the number of bytes written or -1 in case of failure,
 * including failure to open the output.
 */
int
htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
                   const char *encoding, int format) {
    xmlOutputBufferPtr buf;
    xmlCharEncodingHandlerPtr handler = NULL;

    if ((cur == NULL) || (filename == NULL))
        return(-1);

    xmlInitParser();

    if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
        return(-1);

    buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
    if (buf == NULL) {
        /*
         * Report the failure like the other front-ends in this file
         * do. Returning 0 would be indistinguishable from success.
         */
        xmlCharEncCloseFunc(handler);
        return(-1);
    }

    /*
     * Call the serializer directly: htmlDocContentDumpFormatOutput()
     * ignores its encoding argument, so the meta tags would never be
     * set or updated when going through it.
     */
    htmlNodeDumpInternal(buf, (xmlNodePtr) cur, encoding, format);

    return(xmlOutputBufferClose(buf));
}
/**
 * Serialize an HTML document to a file.
 *
 * Same as htmlSaveFileFormat() with `format` set to 1 which is
 * typically undesired. Also see the warnings there. Use of this
 * function is DISCOURAGED in favor of htmlSaveFileFormat().
 *
 * @param filename  the filename
 * @param cur  the document
 * @param encoding  the document encoding
 * @returns the value returned by htmlSaveFileFormat().
 */
int
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
return(htmlSaveFileFormat(filename, cur, encoding, 1));
}
#endif /* LIBXML_OUTPUT_ENABLED */
#endif /* LIBXML_HTML_ENABLED */