mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-24 13:33:01 +03:00
It seems that the specification of the HTML output method in XSLT 1.0 had a lot of influence on how the HTML serializer in libxml2 ended up: https://www.w3.org/TR/xslt-10/#section-HTML-Output-Method There are two remaining behaviors suggested by XSLT 1.0 that don't match the HTML5 fragment serialization algorithm: We escape non-ASCII characters in URI attributes (the list of which is probably outdated). This was originally recommended in appendix B of the HTML 4.01 spec, but only for user agents: https://www.w3.org/TR/html401/appendix/notes.html#h-B.2.1 From my experience, any tool that processes HTML should escape as little as possible. For example, we used to escape many more characters which are invalid in URIs, but often used in template languages. (Note that we still escape whitespace and control chars.) Nevertheless, I guess that some libxslt users continue to expect this behavior from libxml2. Then we collapse Boolean attributes using an outdated list. This is mostly a cosmetic issue, but a somewhat important one for libxslt users. We probably need a serialization option for the xmlsave module that enables fully HTML5-conformant output.
1315 lines
38 KiB
C
1315 lines
38 KiB
C
/*
|
|
* HTMLtree.c : implementation of access function for an HTML tree.
|
|
*
|
|
* See Copyright for the status of this software.
|
|
*
|
|
* Author: Daniel Veillard
|
|
*/
|
|
|
|
|
|
#define IN_LIBXML
|
|
#include "libxml.h"
|
|
#ifdef LIBXML_HTML_ENABLED
|
|
|
|
#include <string.h> /* for memset() only ! */
|
|
#include <ctype.h>
|
|
#include <stdlib.h>
|
|
|
|
#include <libxml/xmlmemory.h>
|
|
#include <libxml/HTMLparser.h>
|
|
#include <libxml/HTMLtree.h>
|
|
#include <libxml/entities.h>
|
|
#include <libxml/xmlerror.h>
|
|
#include <libxml/parserInternals.h>
|
|
#include <libxml/uri.h>
|
|
|
|
#include "private/buf.h"
|
|
#include "private/html.h"
|
|
#include "private/error.h"
|
|
#include "private/html.h"
|
|
#include "private/io.h"
|
|
#include "private/save.h"
|
|
#include "private/tree.h"
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Getting/Setting encoding meta tags *
|
|
* *
|
|
************************************************************************/
|
|
|
|
typedef struct {
|
|
xmlAttrPtr attr; /* charset or content */
|
|
const xmlChar *attrValue;
|
|
htmlMetaEncodingOffsets off;
|
|
} htmlMetaEncoding;
|
|
|
|
static htmlNodePtr
|
|
htmlFindFirstChild(htmlNodePtr parent, const char *name) {
|
|
htmlNodePtr child;
|
|
|
|
for (child = parent->children; child != NULL; child = child->next) {
|
|
if ((child->type == XML_ELEMENT_NODE) &&
|
|
(xmlStrcasecmp(child->name, BAD_CAST name) == 0))
|
|
return(child);
|
|
}
|
|
|
|
return(NULL);
|
|
}
|
|
|
|
static htmlNodePtr
|
|
htmlFindHead(htmlDocPtr doc) {
|
|
htmlNodePtr html;
|
|
|
|
if (doc == NULL)
|
|
return(NULL);
|
|
|
|
html = htmlFindFirstChild((htmlNodePtr) doc, "html");
|
|
if (html == NULL)
|
|
return(NULL);
|
|
|
|
return(htmlFindFirstChild(html, "head"));
|
|
}
|
|
|
|
int
|
|
htmlParseContentType(const xmlChar *val, htmlMetaEncodingOffsets *off) {
|
|
const xmlChar *p = val;
|
|
|
|
while (1) {
|
|
size_t start, end;
|
|
|
|
while ((*p != 'c') && (*p != 'C')) {
|
|
if (*p == 0)
|
|
return(0);
|
|
p += 1;
|
|
}
|
|
p += 1;
|
|
|
|
if (xmlStrncasecmp(p, BAD_CAST "harset", 6) != 0)
|
|
continue;
|
|
|
|
p += 6;
|
|
while (IS_WS_HTML(*p)) p += 1;
|
|
|
|
if (*p != '=')
|
|
continue;
|
|
|
|
p += 1;
|
|
while (IS_WS_HTML(*p)) p += 1;
|
|
|
|
if (*p == 0)
|
|
return(0);
|
|
|
|
if ((*p == '"') || (*p == '\'')) {
|
|
int quote = *p;
|
|
|
|
p += 1;
|
|
while (IS_WS_HTML(*p)) p += 1;
|
|
|
|
start = p - val;
|
|
end = start;
|
|
|
|
while (*p != quote) {
|
|
if (*p == 0)
|
|
return(0);
|
|
if (!IS_WS_HTML(*p))
|
|
end = p + 1 - val;
|
|
p += 1;
|
|
}
|
|
} else {
|
|
start = p - val;
|
|
|
|
while ((*p != 0) && (*p != ';') && (!IS_WS_HTML(*p)))
|
|
p += 1;
|
|
|
|
end = p - val;
|
|
}
|
|
|
|
off->start = start;
|
|
off->end = end;
|
|
off->size = p - val + strlen((char *) p);
|
|
|
|
return(1);
|
|
}
|
|
|
|
return(0);
|
|
}
|
|
|
|
static xmlAttrPtr
|
|
htmlFindMetaEncodingAttr(htmlNodePtr elem, int *outIsContentType) {
|
|
xmlAttrPtr attr, contentAttr = NULL;
|
|
int isContentType = 0;
|
|
|
|
if (xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0)
|
|
return(NULL);
|
|
|
|
for (attr = elem->properties; attr != NULL; attr = attr->next) {
|
|
if (attr->ns != NULL)
|
|
continue;
|
|
if (xmlStrcasecmp(attr->name, BAD_CAST "charset") == 0) {
|
|
*outIsContentType = 0;
|
|
return(attr);
|
|
}
|
|
if (xmlStrcasecmp(attr->name, BAD_CAST "content") == 0)
|
|
contentAttr = attr;
|
|
if ((xmlStrcasecmp(attr->name, BAD_CAST "http-equiv") == 0) &&
|
|
(attr->children != NULL) &&
|
|
(attr->children->type == XML_TEXT_NODE) &&
|
|
(attr->children->next == NULL) &&
|
|
(xmlStrcasecmp(attr->children->content,
|
|
BAD_CAST "Content-Type") == 0))
|
|
isContentType = 1;
|
|
}
|
|
|
|
if ((isContentType) && (contentAttr != NULL)) {
|
|
*outIsContentType = 1;
|
|
return(contentAttr);
|
|
}
|
|
|
|
return(NULL);
|
|
}
|
|
|
|
static int
|
|
htmlParseMetaEncoding(htmlNodePtr elem, htmlMetaEncoding *menc) {
|
|
xmlAttrPtr attr;
|
|
const xmlChar *val = NULL;
|
|
int isContentType;
|
|
|
|
if ((elem->type != XML_ELEMENT_NODE) ||
|
|
(xmlStrcasecmp(elem->name, BAD_CAST "meta") != 0))
|
|
return(0);
|
|
|
|
attr = htmlFindMetaEncodingAttr(elem, &isContentType);
|
|
if (attr == NULL)
|
|
return(0);
|
|
|
|
if ((attr->children != NULL) &&
|
|
(attr->children->type == XML_TEXT_NODE) &&
|
|
(attr->children->next == NULL) &&
|
|
(attr->children->content != NULL))
|
|
val = attr->children->content;
|
|
else
|
|
val = BAD_CAST "";
|
|
|
|
|
|
if (!isContentType) {
|
|
size_t size = strlen((char *) val);
|
|
size_t start = 0;
|
|
size_t end = size;
|
|
|
|
while ((start < size) && (IS_WS_HTML(val[start])))
|
|
start += 1;
|
|
|
|
while ((end > 0) && (IS_WS_HTML(val[end-1])))
|
|
end -= 1;
|
|
|
|
menc->attr = attr;
|
|
menc->attrValue = val;
|
|
menc->off.start = start;
|
|
menc->off.end = end;
|
|
menc->off.size = size;
|
|
|
|
return(1);
|
|
} else {
|
|
if (htmlParseContentType(val, &menc->off)) {
|
|
menc->attr = attr;
|
|
menc->attrValue = val;
|
|
|
|
return(1);
|
|
}
|
|
}
|
|
|
|
return(0);
|
|
}
|
|
|
|
static xmlChar *
|
|
htmlUpdateMetaEncoding(htmlMetaEncoding *menc, const char *encoding) {
|
|
xmlChar *newVal, *p;
|
|
size_t size, oldEncSize, newEncSize;
|
|
|
|
/*
|
|
* The pseudo "HTML" encoding only produces ASCII.
|
|
*/
|
|
if (xmlStrcasecmp(BAD_CAST encoding, BAD_CAST "HTML") == 0)
|
|
encoding = "ASCII";
|
|
|
|
oldEncSize = menc->off.end - menc->off.start;
|
|
newEncSize = strlen((char *) encoding);
|
|
size = menc->off.size - oldEncSize + newEncSize;
|
|
newVal = xmlMalloc(size + 1);
|
|
if (newVal == NULL)
|
|
return(NULL);
|
|
|
|
p = newVal;
|
|
memcpy(p, menc->attrValue, menc->off.start);
|
|
p += menc->off.start;
|
|
memcpy(p, encoding, newEncSize);
|
|
p += newEncSize;
|
|
memcpy(p, menc->attrValue + menc->off.end, menc->off.size - menc->off.end);
|
|
newVal[size] = 0;
|
|
|
|
return(newVal);
|
|
}
|
|
|
|
/**
|
|
* Look up and encoding declaration in the meta tags.
|
|
*
|
|
* The returned string points into attribute content and can contain
|
|
* trailing garbage. It should be copied before modifying or freeing
|
|
* nodes.
|
|
*
|
|
* @param doc the document
|
|
* @returns the encoding ot NULL if not found.
|
|
*/
|
|
const xmlChar *
|
|
htmlGetMetaEncoding(htmlDocPtr doc) {
|
|
htmlNodePtr head, node;
|
|
|
|
head = htmlFindHead(doc);
|
|
if (head == NULL)
|
|
return(NULL);
|
|
|
|
for (node = head->children; node != NULL; node = node->next) {
|
|
htmlMetaEncoding menc;
|
|
|
|
if (htmlParseMetaEncoding(node, &menc)) {
|
|
/*
|
|
* Returning a `const xmlChar *` only allows to return
|
|
* a suffix. In http-equiv meta tags, there could be
|
|
* more data after the charset, although it's probably
|
|
* rare in practice.
|
|
*/
|
|
return(menc.attrValue + menc.off.start);
|
|
}
|
|
}
|
|
|
|
return(NULL);
|
|
}
|
|
|
|
/**
|
|
* Creates or updates a meta tag with an encoding declaration.
|
|
*
|
|
* NOTE: This will not change the document content encoding.
|
|
*
|
|
* @param doc the document
|
|
* @param encoding the encoding string
|
|
* @returns 0 in case of success, 1 if no head element was found or
|
|
* arguments are invalid and -1 if memory allocation failed.
|
|
*/
|
|
int
|
|
htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
|
|
htmlNodePtr head, meta;
|
|
int found = 0;
|
|
|
|
if (encoding == NULL)
|
|
return(1);
|
|
|
|
head = htmlFindHead(doc);
|
|
if (head == NULL)
|
|
return(1);
|
|
|
|
for (meta = head->children; meta != NULL; meta = meta->next) {
|
|
htmlMetaEncoding menc;
|
|
|
|
if (htmlParseMetaEncoding(meta, &menc)) {
|
|
xmlChar *newVal;
|
|
int ret;
|
|
|
|
found = 1;
|
|
|
|
newVal = htmlUpdateMetaEncoding(&menc, (char *) encoding);
|
|
if (newVal == NULL)
|
|
return(-1);
|
|
xmlNodeSetContent((xmlNodePtr) menc.attr, NULL);
|
|
ret = xmlNodeAddContent((xmlNodePtr) menc.attr, newVal);
|
|
xmlFree(newVal);
|
|
|
|
if (ret < 0)
|
|
return(-1);
|
|
}
|
|
}
|
|
|
|
if (found)
|
|
return(0);
|
|
|
|
meta = xmlNewDocNode(head->doc, NULL, BAD_CAST "meta", NULL);
|
|
if (meta == NULL)
|
|
return(-1);
|
|
|
|
if (xmlNewProp(meta, BAD_CAST "charset", encoding) == NULL) {
|
|
xmlFreeNode(meta);
|
|
return(-1);
|
|
}
|
|
|
|
if (head->children == NULL)
|
|
xmlAddChild(head, meta);
|
|
else
|
|
xmlAddPrevSibling(head->children, meta);
|
|
|
|
return(0);
|
|
}
|
|
|
|
/**
|
|
* Determine if a given attribute is a boolean attribute. This
|
|
* doesn't handle HTML5.
|
|
*
|
|
* @deprecated Internal function, don't use.
|
|
*
|
|
* @param name the name of the attribute to check
|
|
* @returns false if the attribute is not boolean, true otherwise.
|
|
*/
|
|
int
|
|
htmlIsBooleanAttr(const xmlChar *name)
|
|
{
|
|
const char *str = NULL;
|
|
|
|
if (name == NULL)
|
|
return(0);
|
|
|
|
/*
|
|
* These are the HTML attributes which will be output
|
|
* in minimized form, i.e. `<option selected="selected">` will be
|
|
* output as `<option selected>`, as per XSLT 1.0 16.2 "HTML Output
|
|
* Method":
|
|
*
|
|
* "checked", "compact", "declare", "defer", "disabled", "ismap",
|
|
* "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
|
|
* "selected"
|
|
*
|
|
* Additional attributes from HTML5 (not implemented yet):
|
|
*
|
|
* "allowfullscreen", "alpha", "async", "autofocus", "autoplay",
|
|
* "controls", "default", "formnovalidate", "inert", "itemscope",
|
|
* "loop", "muted", "nomodule", "novalidate", "open", "playsinline",
|
|
* "required", "reversed", "shadowrootdelegatesfocus",
|
|
* "shadowrootclonable", "shadowrootserializable",
|
|
* "shadowrootcustomelementregistry", "truespeed"
|
|
*/
|
|
|
|
switch (name[0] | 0x20) {
|
|
case 'c':
|
|
name += 1;
|
|
switch (name[0] | 0x20) {
|
|
case 'h': str = "ecked"; break;
|
|
case 'o': str = "mpact"; break;
|
|
}
|
|
break;
|
|
case 'd':
|
|
name += 1;
|
|
switch (name[0] | 0x20) {
|
|
case 'e':
|
|
name += 1;
|
|
switch (name[0] | 0x20) {
|
|
case 'c': str = "lare"; break;
|
|
case 'f': str = "er"; break;
|
|
}
|
|
break;
|
|
case 'i': str = "sabled"; break;
|
|
}
|
|
break;
|
|
case 'i':
|
|
str = "smap";
|
|
break;
|
|
case 'm':
|
|
str = "ultiple";
|
|
break;
|
|
case 'n':
|
|
name += 1;
|
|
if ((name[0] | 0x20) != 'o')
|
|
break;
|
|
name += 1;
|
|
switch (name[0] | 0x20) {
|
|
case 'h': str = "ref"; break;
|
|
case 'r': str = "esize"; break;
|
|
case 's': str = "hade"; break;
|
|
case 'w': str = "rap"; break;
|
|
}
|
|
break;
|
|
case 'r':
|
|
str = "eadonly";
|
|
break;
|
|
case 's':
|
|
str = "elected";
|
|
break;
|
|
}
|
|
|
|
if (str == NULL)
|
|
return(0);
|
|
|
|
return(xmlStrcasecmp(name + 1, BAD_CAST str) == 0);
|
|
}
|
|
|
|
#ifdef LIBXML_OUTPUT_ENABLED
|
|
/************************************************************************
|
|
* *
|
|
* Dumping HTML tree content to a simple buffer *
|
|
* *
|
|
************************************************************************/
|
|
|
|
static xmlParserErrors
|
|
htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) {
|
|
/*
|
|
* Fallback to HTML if the encoding is unspecified
|
|
*/
|
|
if (encoding == NULL)
|
|
encoding = "HTML";
|
|
|
|
return(xmlOpenCharEncodingHandler(encoding, /* output */ 1, out));
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML document to an xmlBuf.
|
|
*
|
|
* @param buf the xmlBufPtr output
|
|
* @param doc the document (unused)
|
|
* @param cur the current node
|
|
* @param format should formatting newlines been added
|
|
* @returns the number of bytes written or -1 in case of error
|
|
*/
|
|
static size_t
|
|
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED,
|
|
xmlNodePtr cur, int format) {
|
|
size_t use;
|
|
size_t ret;
|
|
xmlOutputBufferPtr outbuf;
|
|
|
|
if (cur == NULL) {
|
|
return ((size_t) -1);
|
|
}
|
|
if (buf == NULL) {
|
|
return ((size_t) -1);
|
|
}
|
|
outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
|
|
if (outbuf == NULL)
|
|
return ((size_t) -1);
|
|
memset(outbuf, 0, sizeof(xmlOutputBuffer));
|
|
outbuf->buffer = buf;
|
|
outbuf->encoder = NULL;
|
|
outbuf->writecallback = NULL;
|
|
outbuf->closecallback = NULL;
|
|
outbuf->context = NULL;
|
|
outbuf->written = 0;
|
|
|
|
use = xmlBufUse(buf);
|
|
htmlNodeDumpInternal(outbuf, cur, NULL, format);
|
|
if (outbuf->error)
|
|
ret = (size_t) -1;
|
|
else
|
|
ret = xmlBufUse(buf) - use;
|
|
xmlFree(outbuf);
|
|
return (ret);
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML node to an xmlBuffer. Always uses UTF-8.
|
|
*
|
|
* @param buf the HTML buffer output
|
|
* @param doc the document
|
|
* @param cur the current node
|
|
* @returns the number of bytes written or -1 in case of error
|
|
*/
|
|
int
|
|
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
|
|
xmlBufPtr buffer;
|
|
size_t ret1;
|
|
int ret2;
|
|
|
|
if ((buf == NULL) || (cur == NULL))
|
|
return(-1);
|
|
|
|
xmlInitParser();
|
|
buffer = xmlBufFromBuffer(buf);
|
|
if (buffer == NULL)
|
|
return(-1);
|
|
|
|
ret1 = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
|
|
|
|
ret2 = xmlBufBackToBuffer(buffer, buf);
|
|
|
|
if ((ret1 == (size_t) -1) || (ret2 < 0))
|
|
return(-1);
|
|
return(ret1 > INT_MAX ? INT_MAX : ret1);
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML node to an xmlBuffer.
|
|
*
|
|
* If encoding is NULL, ASCII with HTML 4.0 named character entities
|
|
* will be used. This is inefficient compared to UTF-8 and might be
|
|
* changed in a future version.
|
|
*
|
|
* @param out the FILE pointer
|
|
* @param doc the document (unused)
|
|
* @param cur the current node
|
|
* @param encoding the document encoding (optional)
|
|
* @param format should formatting newlines been added
|
|
* @returns the number of bytes written or -1 in case of failure.
|
|
*/
|
|
int
|
|
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc ATTRIBUTE_UNUSED,
|
|
xmlNodePtr cur, const char *encoding, int format) {
|
|
xmlOutputBufferPtr buf;
|
|
xmlCharEncodingHandlerPtr handler;
|
|
int ret;
|
|
|
|
xmlInitParser();
|
|
|
|
/*
|
|
* save the content to a temp buffer.
|
|
*/
|
|
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
|
return(-1);
|
|
buf = xmlOutputBufferCreateFile(out, handler);
|
|
if (buf == NULL) {
|
|
xmlCharEncCloseFunc(handler);
|
|
return(-1);
|
|
}
|
|
|
|
htmlNodeDumpInternal(buf, cur, NULL, format);
|
|
|
|
ret = xmlOutputBufferClose(buf);
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* Same as htmlNodeDumpFileFormat() with `format` set to 1 which is
|
|
* typically undesired. Use of this function is DISCOURAGED in favor
|
|
* of htmlNodeDumpFileFormat().
|
|
*
|
|
* @param out the FILE pointer
|
|
* @param doc the document
|
|
* @param cur the current node
|
|
*/
|
|
void
|
|
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
|
|
htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML node to a memory, also returning the size of
|
|
* the result. It's up to the caller to free the memory.
|
|
*
|
|
* Uses the encoding of the document. If the document has no
|
|
* encoding, ASCII with HTML 4.0 named character entities will
|
|
* be used. This is inefficient compared to UTF-8 and might be
|
|
* changed in a future version.
|
|
*
|
|
* @param cur the document
|
|
* @param mem OUT: the memory pointer
|
|
* @param size OUT: the memory length
|
|
* @param format should formatting newlines been added
|
|
*/
|
|
void
|
|
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
|
|
xmlOutputBufferPtr buf;
|
|
xmlCharEncodingHandlerPtr handler = NULL;
|
|
|
|
xmlInitParser();
|
|
|
|
if ((mem == NULL) || (size == NULL))
|
|
return;
|
|
*mem = NULL;
|
|
*size = 0;
|
|
if (cur == NULL)
|
|
return;
|
|
|
|
if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
|
|
return;
|
|
buf = xmlAllocOutputBuffer(handler);
|
|
if (buf == NULL) {
|
|
xmlCharEncCloseFunc(handler);
|
|
return;
|
|
}
|
|
|
|
htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
|
|
|
|
xmlOutputBufferFlush(buf);
|
|
|
|
if (!buf->error) {
|
|
if (buf->conv != NULL) {
|
|
*size = xmlBufUse(buf->conv);
|
|
*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
|
|
} else {
|
|
*size = xmlBufUse(buf->buffer);
|
|
*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
|
|
}
|
|
}
|
|
|
|
xmlOutputBufferClose(buf);
|
|
}
|
|
|
|
/**
|
|
* Same as htmlDocDumpMemoryFormat() with `format` set to 1 which
|
|
* is typically undesired. Also see the warnings there. Use of
|
|
* this function is DISCOURAGED in favor of
|
|
* htmlDocContentDumpFormatOutput().
|
|
*
|
|
* @param cur the document
|
|
* @param mem OUT: the memory pointer
|
|
* @param size OUT: the memory length
|
|
*/
|
|
void
|
|
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
|
|
htmlDocDumpMemoryFormat(cur, mem, size, 1);
|
|
}
|
|
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Dumping HTML tree content to an I/O output buffer *
|
|
* *
|
|
************************************************************************/
|
|
|
|
/**
|
|
* Serialize the HTML document's DTD, if any.
|
|
*
|
|
* Ignores `encoding` and uses the encoding of the output buffer.
|
|
*
|
|
* @param buf the HTML buffer output
|
|
* @param doc the document
|
|
* @param encoding the encoding string (unused)
|
|
*/
|
|
static void
|
|
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
|
|
const char *encoding ATTRIBUTE_UNUSED) {
|
|
xmlDtdPtr cur = doc->intSubset;
|
|
|
|
if (cur == NULL)
|
|
return;
|
|
xmlOutputBufferWrite(buf, 10, "<!DOCTYPE ");
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name);
|
|
if (cur->ExternalID != NULL) {
|
|
xmlOutputBufferWrite(buf, 8, " PUBLIC ");
|
|
xmlOutputBufferWriteQuotedString(buf, cur->ExternalID);
|
|
if (cur->SystemID != NULL) {
|
|
xmlOutputBufferWrite(buf, 1, " ");
|
|
xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
|
|
}
|
|
} else if (cur->SystemID != NULL &&
|
|
xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
|
|
xmlOutputBufferWrite(buf, 8, " SYSTEM ");
|
|
xmlOutputBufferWriteQuotedString(buf, cur->SystemID);
|
|
}
|
|
xmlOutputBufferWrite(buf, 2, ">\n");
|
|
}
|
|
|
|
static void
|
|
htmlSerializeUri(xmlOutputBufferPtr buf, const xmlChar *content) {
|
|
const xmlChar *tmp = content;
|
|
|
|
/*
|
|
* See appendix "B.2.1 Non-ASCII characters in URI attribute
|
|
* values" in the HTML 4.01 spec. This is also recommended
|
|
* by the HTML output method of the XSLT 1.0 spec.
|
|
*
|
|
* We also escape space and control chars.
|
|
*/
|
|
|
|
/* Skip over initial whitespace */
|
|
while (IS_WS_HTML(*tmp)) tmp++;
|
|
if (tmp > content) {
|
|
xmlOutputBufferWrite(buf, tmp - content, (char *) content);
|
|
content = tmp;
|
|
}
|
|
|
|
while (1) {
|
|
char escbuf[3];
|
|
const char *repl;
|
|
int replSize;
|
|
int c = *tmp;
|
|
|
|
while ((c > 0x20) && (c < 0x7F) && (c != '"') && (c != '&')) {
|
|
tmp += 1;
|
|
c = *tmp;
|
|
}
|
|
|
|
if (tmp > content)
|
|
xmlOutputBufferWrite(buf, tmp - content, (char *) content);
|
|
|
|
if ((c <= 0x20) || (c >= 0x7F)) {
|
|
static const char hex[16] = {
|
|
'0', '1', '2', '3', '4', '5', '6', '7',
|
|
'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
|
|
};
|
|
|
|
if (c == 0)
|
|
break;
|
|
|
|
escbuf[0] = '%';
|
|
escbuf[1] = hex[(c >> 4) & 0x0F];
|
|
escbuf[2] = hex[c & 0x0F];
|
|
repl = escbuf;
|
|
replSize = 3;
|
|
} else if (c == '"') {
|
|
repl = """;
|
|
replSize = 6;
|
|
} else {
|
|
repl = "&";
|
|
replSize = 5;
|
|
}
|
|
|
|
xmlOutputBufferWrite(buf, replSize, repl);
|
|
tmp += 1;
|
|
content = tmp;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML attribute.
|
|
*
|
|
* @param buf the HTML buffer output
|
|
* @param cur the attribute pointer
|
|
*/
|
|
static void
|
|
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlAttrPtr cur) {
|
|
xmlOutputBufferWrite(buf, 1, " ");
|
|
|
|
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
|
|
xmlOutputBufferWrite(buf, 1, ":");
|
|
}
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name);
|
|
|
|
/*
|
|
* The HTML5 spec requires to always serialize empty attribute
|
|
* values as `=""`. We should probably align with HTML5 at some
|
|
* point.
|
|
*/
|
|
if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
|
|
xmlNodePtr child;
|
|
int isUri;
|
|
|
|
xmlOutputBufferWrite(buf, 2, "=\"");
|
|
|
|
/*
|
|
* Special handling of URIs doesn't conform to HTML5 and
|
|
* should probably be removed at some point.
|
|
*/
|
|
isUri = (cur->ns == NULL) && (cur->parent != NULL) &&
|
|
(cur->parent->ns == NULL) &&
|
|
((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
|
|
(!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
|
|
(!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
|
|
((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
|
|
(!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))));
|
|
|
|
for (child = cur->children; child != NULL; child = child->next) {
|
|
if (child->type == XML_TEXT_NODE) {
|
|
const xmlChar *content = child->content;
|
|
|
|
if (content == NULL)
|
|
continue;
|
|
|
|
if (isUri) {
|
|
htmlSerializeUri(buf, content);
|
|
} else {
|
|
xmlSerializeText(buf, content, SIZE_MAX,
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
|
|
}
|
|
} else if (child->type == XML_ENTITY_REF_NODE) {
|
|
/* TODO: We should probably expand entity refs */
|
|
xmlOutputBufferWrite(buf, 1, "&");
|
|
xmlOutputBufferWriteString(buf, (char *) child->name);
|
|
xmlOutputBufferWrite(buf, 1, ";");
|
|
}
|
|
}
|
|
|
|
xmlOutputBufferWrite(buf, 1, "\"");
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML node to an output buffer.
|
|
*
|
|
* If `encoding` is specified, it is used to create or update meta
|
|
* tags containing the character encoding.
|
|
*
|
|
* @param buf the HTML buffer output
|
|
* @param cur the current node
|
|
* @param encoding the encoding string (optional)
|
|
* @param format should formatting newlines been added
|
|
*/
|
|
void
|
|
htmlNodeDumpInternal(xmlOutputBufferPtr buf, xmlNodePtr cur,
|
|
const char *encoding, int format) {
|
|
xmlNodePtr root, parent, metaHead = NULL;
|
|
xmlAttrPtr attr;
|
|
const htmlElemDesc * info;
|
|
int isRaw = 0;
|
|
|
|
xmlInitParser();
|
|
|
|
if ((cur == NULL) || (buf == NULL)) {
|
|
return;
|
|
}
|
|
|
|
root = cur;
|
|
parent = cur->parent;
|
|
while (1) {
|
|
switch (cur->type) {
|
|
case XML_HTML_DOCUMENT_NODE:
|
|
case XML_DOCUMENT_NODE:
|
|
if (((xmlDocPtr) cur)->intSubset != NULL) {
|
|
htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
|
|
}
|
|
if (cur->children != NULL) {
|
|
/* Always validate cur->parent when descending. */
|
|
if (cur->parent == parent) {
|
|
parent = cur;
|
|
cur = cur->children;
|
|
continue;
|
|
}
|
|
} else {
|
|
xmlOutputBufferWrite(buf, 1, "\n");
|
|
}
|
|
break;
|
|
|
|
case XML_ELEMENT_NODE: {
|
|
htmlMetaEncoding menc;
|
|
int isMeta = 0;
|
|
int addMeta = 0;
|
|
|
|
/*
|
|
* Some users like lxml are known to pass nodes with a corrupted
|
|
* tree structure. Fall back to a recursive call to handle this
|
|
* case.
|
|
*/
|
|
if ((cur->parent != parent) && (cur->children != NULL)) {
|
|
htmlNodeDumpInternal(buf, cur, encoding, format);
|
|
break;
|
|
}
|
|
|
|
/*
|
|
* Get specific HTML info for that node.
|
|
*/
|
|
if (cur->ns == NULL)
|
|
info = htmlTagLookup(cur->name);
|
|
else
|
|
info = NULL;
|
|
|
|
if (encoding != NULL) {
|
|
isMeta = htmlParseMetaEncoding(cur, &menc);
|
|
|
|
/*
|
|
* Don't add meta tag for "HTML" encoding.
|
|
*/
|
|
if ((xmlStrcasecmp(BAD_CAST encoding,
|
|
BAD_CAST "HTML") != 0) &&
|
|
(xmlStrcasecmp(cur->name, BAD_CAST "head") == 0) &&
|
|
(parent != NULL) &&
|
|
(xmlStrcasecmp(parent->name, BAD_CAST "html") == 0) &&
|
|
(parent->parent != NULL) &&
|
|
(parent->parent->parent == NULL) &&
|
|
(metaHead == NULL)) {
|
|
xmlNodePtr n;
|
|
|
|
metaHead = cur;
|
|
addMeta = 1;
|
|
|
|
for (n = cur->children; n != NULL; n = n->next) {
|
|
int unused;
|
|
|
|
if (htmlFindMetaEncodingAttr(n, &unused) != NULL) {
|
|
metaHead = NULL;
|
|
addMeta = 0;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
xmlOutputBufferWrite(buf, 1, "<");
|
|
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
|
|
xmlOutputBufferWrite(buf, 1, ":");
|
|
}
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name);
|
|
if (cur->nsDef)
|
|
xmlNsListDumpOutput(buf, cur->nsDef);
|
|
attr = cur->properties;
|
|
while (attr != NULL) {
|
|
if ((!isMeta) || (attr != menc.attr)) {
|
|
htmlAttrDumpOutput(buf, attr);
|
|
} else {
|
|
xmlOutputBufferWrite(buf, 1, " ");
|
|
xmlOutputBufferWriteString(buf, (char *) attr->name);
|
|
|
|
xmlOutputBufferWrite(buf, 2, "=\"");
|
|
xmlSerializeText(buf, menc.attrValue, menc.off.start,
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
|
|
xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
|
|
xmlSerializeText(buf, menc.attrValue + menc.off.end,
|
|
menc.off.size - menc.off.end,
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
|
|
xmlOutputBufferWrite(buf, 1, "\"");
|
|
}
|
|
attr = attr->next;
|
|
}
|
|
|
|
if ((info != NULL) && (info->empty)) {
|
|
xmlOutputBufferWrite(buf, 1, ">");
|
|
} else if (cur->children == NULL) {
|
|
if (addMeta) {
|
|
xmlOutputBufferWrite(buf, 16, "><meta charset=\"");
|
|
xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
|
|
xmlOutputBufferWrite(buf, 4, "\"></");
|
|
} else {
|
|
xmlOutputBufferWrite(buf, 3, "></");
|
|
}
|
|
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
|
|
xmlOutputBufferWriteString(buf,
|
|
(const char *)cur->ns->prefix);
|
|
xmlOutputBufferWrite(buf, 1, ":");
|
|
}
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name);
|
|
xmlOutputBufferWrite(buf, 1, ">");
|
|
} else {
|
|
xmlOutputBufferWrite(buf, 1, ">");
|
|
if ((format) &&
|
|
((addMeta) ||
|
|
((info != NULL) && (!info->isinline) &&
|
|
(cur->children->type != HTML_TEXT_NODE) &&
|
|
(cur->children->type != HTML_ENTITY_REF_NODE) &&
|
|
(cur->children != cur->last) &&
|
|
(cur->name != NULL) &&
|
|
(cur->name[0] != 'p')))) /* p, pre, param */
|
|
xmlOutputBufferWrite(buf, 1, "\n");
|
|
if (addMeta) {
|
|
xmlOutputBufferWrite(buf, 15, "<meta charset=\"");
|
|
xmlSerializeText(buf, BAD_CAST encoding, SIZE_MAX,
|
|
XML_ESCAPE_HTML | XML_ESCAPE_ATTR);
|
|
xmlOutputBufferWrite(buf, 2, "\">");
|
|
if ((format) &&
|
|
(cur->children->type != HTML_TEXT_NODE) &&
|
|
(cur->children->type != HTML_ENTITY_REF_NODE))
|
|
xmlOutputBufferWrite(buf, 1, "\n");
|
|
}
|
|
|
|
if ((info != NULL) && (info->dataMode >= DATA_RAWTEXT))
|
|
isRaw = 1;
|
|
|
|
parent = cur;
|
|
cur = cur->children;
|
|
continue;
|
|
}
|
|
|
|
if ((format) && (cur->next != NULL) &&
|
|
(info != NULL) && (!info->isinline)) {
|
|
if ((cur->next->type != HTML_TEXT_NODE) &&
|
|
(cur->next->type != HTML_ENTITY_REF_NODE) &&
|
|
(parent != NULL) &&
|
|
(parent->name != NULL) &&
|
|
(parent->name[0] != 'p')) /* p, pre, param */
|
|
xmlOutputBufferWrite(buf, 1, "\n");
|
|
}
|
|
|
|
break;
|
|
}
|
|
|
|
case XML_ATTRIBUTE_NODE:
|
|
htmlAttrDumpOutput(buf, (xmlAttrPtr) cur);
|
|
break;
|
|
|
|
case HTML_TEXT_NODE:
|
|
if (cur->content == NULL)
|
|
break;
|
|
if ((cur->name == (const xmlChar *)xmlStringTextNoenc) ||
|
|
(isRaw)) {
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->content);
|
|
} else {
|
|
xmlSerializeText(buf, cur->content, SIZE_MAX, XML_ESCAPE_HTML);
|
|
}
|
|
break;
|
|
|
|
case HTML_COMMENT_NODE:
|
|
if (cur->content != NULL) {
|
|
xmlOutputBufferWrite(buf, 4, "<!--");
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->content);
|
|
xmlOutputBufferWrite(buf, 3, "-->");
|
|
}
|
|
break;
|
|
|
|
case HTML_PI_NODE:
|
|
if (cur->name != NULL) {
|
|
xmlOutputBufferWrite(buf, 2, "<?");
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name);
|
|
if (cur->content != NULL) {
|
|
xmlOutputBufferWrite(buf, 1, " ");
|
|
xmlOutputBufferWriteString(buf,
|
|
(const char *)cur->content);
|
|
}
|
|
xmlOutputBufferWrite(buf, 1, ">");
|
|
}
|
|
break;
|
|
|
|
case HTML_ENTITY_REF_NODE:
|
|
xmlOutputBufferWrite(buf, 1, "&");
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name);
|
|
xmlOutputBufferWrite(buf, 1, ";");
|
|
break;
|
|
|
|
case HTML_PRESERVE_NODE:
|
|
if (cur->content != NULL) {
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->content);
|
|
}
|
|
break;
|
|
|
|
default:
|
|
break;
|
|
}
|
|
|
|
while (1) {
|
|
if (cur == root)
|
|
return;
|
|
if (cur->next != NULL) {
|
|
cur = cur->next;
|
|
break;
|
|
}
|
|
|
|
isRaw = 0;
|
|
|
|
cur = parent;
|
|
/* cur->parent was validated when descending. */
|
|
parent = cur->parent;
|
|
|
|
if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
|
|
(cur->type == XML_DOCUMENT_NODE)) {
|
|
xmlOutputBufferWrite(buf, 1, "\n");
|
|
} else {
|
|
if ((format) && (cur->ns == NULL))
|
|
info = htmlTagLookup(cur->name);
|
|
else
|
|
info = NULL;
|
|
|
|
if ((format) && (info != NULL) && (!info->isinline) &&
|
|
(cur->last->type != HTML_TEXT_NODE) &&
|
|
(cur->last->type != HTML_ENTITY_REF_NODE) &&
|
|
((cur->children != cur->last) || (cur == metaHead)) &&
|
|
(cur->name != NULL) &&
|
|
(cur->name[0] != 'p')) /* p, pre, param */
|
|
xmlOutputBufferWrite(buf, 1, "\n");
|
|
|
|
xmlOutputBufferWrite(buf, 2, "</");
|
|
if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
|
|
xmlOutputBufferWrite(buf, 1, ":");
|
|
}
|
|
xmlOutputBufferWriteString(buf, (const char *)cur->name);
|
|
xmlOutputBufferWrite(buf, 1, ">");
|
|
|
|
if ((format) && (info != NULL) && (!info->isinline) &&
|
|
(cur->next != NULL)) {
|
|
if ((cur->next->type != HTML_TEXT_NODE) &&
|
|
(cur->next->type != HTML_ENTITY_REF_NODE) &&
|
|
(parent != NULL) &&
|
|
(parent->name != NULL) &&
|
|
(parent->name[0] != 'p')) /* p, pre, param */
|
|
xmlOutputBufferWrite(buf, 1, "\n");
|
|
}
|
|
|
|
if (cur == metaHead)
|
|
metaHead = NULL;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML node to an output buffer.
|
|
*
|
|
* @param buf the HTML buffer output
|
|
* @param doc the document (unused)
|
|
* @param cur the current node
|
|
* @param encoding the encoding string (unused)
|
|
* @param format should formatting newlines been added
|
|
*/
|
|
void
|
|
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,
|
|
xmlDocPtr doc ATTRIBUTE_UNUSED, xmlNodePtr cur,
|
|
const char *encoding ATTRIBUTE_UNUSED, int format) {
|
|
htmlNodeDumpInternal(buf, cur, NULL, format);
|
|
}
|
|
|
|
/**
|
|
* Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is
|
|
* typically undesired. Use of this function is DISCOURAGED in favor
|
|
* of htmlNodeDumpFormatOutput().
|
|
*
|
|
* @param buf the HTML buffer output
|
|
* @param doc the document (unused)
|
|
* @param cur the current node
|
|
* @param encoding the encoding string (unused)
|
|
*/
|
|
void
|
|
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc ATTRIBUTE_UNUSED,
|
|
xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
|
|
htmlNodeDumpInternal(buf, cur, NULL, 1);
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML document to an output buffer.
|
|
*
|
|
* @param buf the HTML buffer output
|
|
* @param cur the document
|
|
* @param encoding the encoding string (unused)
|
|
* @param format should formatting newlines been added
|
|
*/
|
|
void
|
|
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
|
const char *encoding ATTRIBUTE_UNUSED,
|
|
int format) {
|
|
htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, format);
|
|
}
|
|
|
|
/**
|
|
* Same as htmlDocContentDumpFormatDump() with `format` set to 1
|
|
* which is typically undesired. Use of this function is DISCOURAGED
|
|
* in favor of htmlDocContentDumpFormatOutput().
|
|
*
|
|
* @param buf the HTML buffer output
|
|
* @param cur the document
|
|
* @param encoding the encoding string (unused)
|
|
*/
|
|
void
|
|
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
|
|
const char *encoding ATTRIBUTE_UNUSED) {
|
|
htmlNodeDumpInternal(buf, (xmlNodePtr) cur, NULL, 1);
|
|
}
|
|
|
|
/************************************************************************
|
|
* *
|
|
* Saving functions front-ends *
|
|
* *
|
|
************************************************************************/
|
|
|
|
/**
|
|
* Serialize an HTML document to an open `FILE`.
|
|
*
|
|
* Uses the encoding of the document. If the document has no
|
|
* encoding, ASCII with HTML 4.0 named character entities will
|
|
* be used. This is inefficient compared to UTF-8 and might be
|
|
* changed in a future version.
|
|
*
|
|
* Enables "formatting" unconditionally which is typically
|
|
* undesired.
|
|
*
|
|
* Use of this function is DISCOURAGED in favor of
|
|
* htmlNodeDumpFileFormat().
|
|
*
|
|
* @param f the FILE*
|
|
* @param cur the document
|
|
* @returns the number of bytes written or -1 in case of failure.
|
|
*/
|
|
int
|
|
htmlDocDump(FILE *f, xmlDocPtr cur) {
|
|
xmlOutputBufferPtr buf;
|
|
xmlCharEncodingHandlerPtr handler = NULL;
|
|
int ret;
|
|
|
|
xmlInitParser();
|
|
|
|
if ((cur == NULL) || (f == NULL)) {
|
|
return(-1);
|
|
}
|
|
|
|
if (htmlFindOutputEncoder((char *) cur->encoding, &handler) != XML_ERR_OK)
|
|
return(-1);
|
|
buf = xmlOutputBufferCreateFile(f, handler);
|
|
if (buf == NULL) {
|
|
xmlCharEncCloseFunc(handler);
|
|
return(-1);
|
|
}
|
|
htmlDocContentDumpOutput(buf, cur, NULL);
|
|
|
|
ret = xmlOutputBufferClose(buf);
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML document to a file.
|
|
*
|
|
* Same as htmlSaveFileFormat() with `encoding` set to NULL and
|
|
* `format` set to 1 which is typically undesired.
|
|
*
|
|
* Use of this function is DISCOURAGED in favor of
|
|
* htmlSaveFileFormat().
|
|
*
|
|
* @param filename the filename (or URL)
|
|
* @param cur the document
|
|
* @returns the number of bytes written or -1 in case of failure.
|
|
*/
|
|
int
|
|
htmlSaveFile(const char *filename, xmlDocPtr cur) {
|
|
return(htmlSaveFileFormat(filename, cur, NULL, 1));
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML document to a file using a given encoding.
|
|
*
|
|
* If `filename` is `"-"`, stdout is used. This is potentially
|
|
* insecure and might be changed in a future version.
|
|
*
|
|
* If encoding is NULL, ASCII with HTML 4.0 named character entities
|
|
* will be used. This is inefficient compared to UTF-8 and might be
|
|
* changed in a future version.
|
|
*
|
|
* Sets or updates meta tags containing the character encoding.
|
|
*
|
|
* @param filename the filename
|
|
* @param cur the document
|
|
* @param format should formatting newlines been added
|
|
* @param encoding the document encoding (optional)
|
|
* @returns the number of bytes written or -1 in case of failure.
|
|
*/
|
|
int
|
|
htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
|
|
const char *encoding, int format) {
|
|
xmlOutputBufferPtr buf;
|
|
xmlCharEncodingHandlerPtr handler = NULL;
|
|
int ret;
|
|
|
|
if ((cur == NULL) || (filename == NULL))
|
|
return(-1);
|
|
|
|
xmlInitParser();
|
|
|
|
if (htmlFindOutputEncoder(encoding, &handler) != XML_ERR_OK)
|
|
return(-1);
|
|
|
|
/*
|
|
* save the content to a temp buffer.
|
|
*/
|
|
buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
|
|
if (buf == NULL) {
|
|
xmlCharEncCloseFunc(handler);
|
|
return(0);
|
|
}
|
|
|
|
htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
|
|
|
|
ret = xmlOutputBufferClose(buf);
|
|
return(ret);
|
|
}
|
|
|
|
/**
|
|
* Serialize an HTML document to a file.
|
|
*
|
|
* Same as htmlSaveFileFormat() with `format` set to 1 which is
|
|
* typically undesired. Also see the warnings there. Use of this
|
|
* function is DISCOURAGED in favor of htmlSaveFileFormat().
|
|
*
|
|
* @param filename the filename
|
|
* @param cur the document
|
|
* @param encoding the document encoding
|
|
* @returns the number of bytes written or -1 in case of failure.
|
|
*/
|
|
int
|
|
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
|
|
return(htmlSaveFileFormat(filename, cur, encoding, 1));
|
|
}
|
|
|
|
#endif /* LIBXML_OUTPUT_ENABLED */
|
|
|
|
#endif /* LIBXML_HTML_ENABLED */
|