1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-26 00:37:43 +03:00

doc: Misc fixes to HTML tree docs

This commit is contained in:
Nick Wellnhofer
2025-05-05 21:36:36 +02:00
parent 18d20a68bc
commit 298f70b3d7
2 changed files with 136 additions and 89 deletions

View File

@@ -37,9 +37,15 @@
/** /**
* @param doc the document * @param doc the document
* *
* Encoding definition lookup in the Meta tags * Look up an encoding declaration in the meta tags.
* *
* @returns the current encoding as flagged in the HTML source * Does not support `<meta charset="">` yet. Only supports deprecated
* `<meta http-equiv="Content-Type" content="">`.
*
* The returned string points into attribute content. It should be
* copied before modifying or freeing nodes.
*
* @returns the encoding or NULL if not found.
*/ */
const xmlChar * const xmlChar *
htmlGetMetaEncoding(htmlDocPtr doc) { htmlGetMetaEncoding(htmlDocPtr doc) {
@@ -148,9 +154,12 @@ found_content:
* @param doc the document * @param doc the document
* @param encoding the encoding string * @param encoding the encoding string
* *
* Sets the current encoding in the Meta tags * Creates or updates a meta tag with an encoding declaration.
* NOTE: this will not change the document content encoding, just *
* the META flag associated. * Does not support `<meta charset="">` yet. Only supports deprecated
* `<meta http-equiv="Content-Type" content="">`.
*
* NOTE: This will not change the document content encoding.
* *
* @returns 0 in case of success and -1 in case of error * @returns 0 in case of success and -1 in case of error
*/ */
@@ -307,7 +316,8 @@ static const char* const htmlBooleanAttrs[] = {
* *
* @deprecated Internal function, don't use. * @deprecated Internal function, don't use.
* *
* Determine if a given attribute is a boolean attribute. * Determine if a given attribute is a boolean attribute. This
* doesn't handle HTML5.
* *
* @returns false if the attribute is not boolean, true otherwise. * @returns false if the attribute is not boolean, true otherwise.
*/ */
@@ -346,11 +356,11 @@ htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) {
* @param buf the xmlBufPtr output * @param buf the xmlBufPtr output
* @param doc the document * @param doc the document
* @param cur the current node * @param cur the current node
* @param format should formatting spaces been added * @param format should formatting newlines be added
* *
* Dump an HTML node, recursive behaviour,children are printed too. * Serialize an HTML node to an xmlBuf.
* *
* @returns the number of byte written or -1 in case of error * @returns the number of bytes written or -1 in case of error
*/ */
static size_t static size_t
htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur, htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
@@ -391,10 +401,9 @@ htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
* @param doc the document * @param doc the document
* @param cur the current node * @param cur the current node
* *
* Dump an HTML node, recursive behaviour,children are printed too, * Serialize an HTML node to an xmlBuffer. Always uses UTF-8.
* and formatting returns are added.
* *
* @returns the number of byte written or -1 in case of error * @returns the number of bytes written or -1 in case of error
*/ */
int int
htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
@@ -423,14 +432,16 @@ htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
* @param out the FILE pointer * @param out the FILE pointer
* @param doc the document * @param doc the document
* @param cur the current node * @param cur the current node
* @param encoding the document encoding * @param encoding the document encoding (optional)
* @param format should formatting spaces been added * @param format should formatting newlines be added
* *
* Dump an HTML node, recursive behaviour,children are printed too. * Serialize an HTML node to a `FILE`.
* *
* TODO: if encoding == NULL try to save in the doc encoding * If encoding is NULL, ASCII with HTML 4.0 named character entities
* will be used. This is inefficient compared to UTF-8 and might be
* changed in a future version.
* *
* @returns the number of byte written or -1 in case of failure. * @returns the number of bytes written or -1 in case of failure.
*/ */
int int
htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
@@ -461,8 +472,9 @@ htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
* @param doc the document * @param doc the document
* @param cur the current node * @param cur the current node
* *
* Dump an HTML node, recursive behaviour,children are printed too, * Same as htmlNodeDumpFileFormat() with `format` set to 1 which is
* and formatting returns are added. * typically undesired. Use of this function is DISCOURAGED in favor
* of htmlNodeDumpFileFormat().
*/ */
void void
htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
@@ -473,10 +485,19 @@ htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
* @param cur the document * @param cur the document
* @param mem OUT: the memory pointer * @param mem OUT: the memory pointer
* @param size OUT: the memory length * @param size OUT: the memory length
* @param format should formatting spaces been added * @param format should formatting newlines be added
* *
* Dump an HTML document in memory and return the xmlChar * and it's size. * Serialize an HTML document to memory, also returning the size of
* It's up to the caller to free the memory. * the result. It's up to the caller to free the memory.
*
* WARNING: Uses the encoding from a deprecated meta tag, see
* htmlGetMetaEncoding(). This is typically undesired. If no such
* tag was found, ASCII with HTML 4.0 named character entities will
* be used. This is inefficient compared to UTF-8 and might be
* changed in a future version.
*
* Use of this function is therefore DISCOURAGED in favor of
* htmlDocContentDumpFormatOutput().
*/ */
void void
htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
@@ -522,8 +543,10 @@ htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
* @param mem OUT: the memory pointer * @param mem OUT: the memory pointer
* @param size OUT: the memory length * @param size OUT: the memory length
* *
* Dump an HTML document in memory and return the xmlChar * and it's size. * Same as htmlDocDumpMemoryFormat() with `format` set to 1 which
* It's up to the caller to free the memory. * is typically undesired. Also see the warnings there. Use of
* this function is DISCOURAGED in favor of
* htmlDocContentDumpFormatOutput().
*/ */
void void
htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
@@ -540,11 +563,11 @@ htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
/** /**
* @param buf the HTML buffer output * @param buf the HTML buffer output
* @param doc the document * @param doc the document
* @param encoding the encoding string * @param encoding the encoding string (unused)
* *
* TODO: check whether encoding is needed * Serialize the HTML document's DTD, if any.
* *
* Dump the HTML document DTD, if any. * Ignores `encoding` and uses the encoding of the output buffer.
*/ */
static void static void
htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
@@ -575,7 +598,7 @@ htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
* @param doc the document * @param doc the document
* @param cur the attribute pointer * @param cur the attribute pointer
* *
* Dump an HTML attribute * Serialize an HTML attribute.
*/ */
static void static void
htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) { htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
@@ -644,9 +667,11 @@ htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
* @param doc the document * @param doc the document
* @param cur the current node * @param cur the current node
* @param encoding the encoding string (unused) * @param encoding the encoding string (unused)
* @param format should formatting spaces been added * @param format should formatting newlines be added
* *
* Dump an HTML node, recursive behaviour,children are printed too. * Serialize an HTML node to an output buffer.
*
* Ignores `encoding` and uses the encoding of the output buffer.
*/ */
void void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
@@ -879,8 +904,11 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
* @param cur the current node * @param cur the current node
* @param encoding the encoding string (unused) * @param encoding the encoding string (unused)
* *
* Dump an HTML node, recursive behaviour,children are printed too, * Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is
* and formatting returns/spaces are added. * typically undesired. Use of this function is DISCOURAGED in favor
* of htmlNodeDumpFormatOutput().
*
* Ignores `encoding` and uses the encoding of the output buffer.
*/ */
void void
htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
@@ -892,9 +920,11 @@ htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
* @param buf the HTML buffer output * @param buf the HTML buffer output
* @param cur the document * @param cur the document
* @param encoding the encoding string (unused) * @param encoding the encoding string (unused)
* @param format should formatting spaces been added * @param format should formatting newlines be added
* *
* Dump an HTML document. * Serialize an HTML document to an output buffer.
*
* Ignores `encoding` and uses the encoding of the output buffer.
*/ */
void void
htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
@@ -915,7 +945,11 @@ htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
* @param cur the document * @param cur the document
* @param encoding the encoding string (unused) * @param encoding the encoding string (unused)
* *
* Dump an HTML document. Formatting return/spaces are added. * Same as htmlDocContentDumpFormatOutput() with `format` set to 1 which is
* typically undesired. Use of this function is DISCOURAGED in favor
* of htmlDocContentDumpFormatOutput().
*
* Ignores `encoding` and uses the encoding of the output buffer.
*/ */
void void
htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
@@ -933,9 +967,21 @@ htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
* @param f the FILE* * @param f the FILE*
* @param cur the document * @param cur the document
* *
* Dump an HTML document to an open FILE. * Serialize an HTML document to an open `FILE`.
* *
* @returns the number of byte written or -1 in case of failure. * WARNING: Uses the encoding from a deprecated meta tag, see
* htmlGetMetaEncoding(). This is typically undesired. If no such
* tag was found, ASCII with HTML 4.0 named character entities will
* be used. This is inefficient compared to UTF-8 and might be
* changed in a future version.
*
* Also enables "formatting" unconditionally which is typically
* undesired.
*
* Use of this function is DISCOURAGED in favor of
* htmlNodeDumpFileFormat().
*
* @returns the number of bytes written or -1 in case of failure.
*/ */
int int
htmlDocDump(FILE *f, xmlDocPtr cur) { htmlDocDump(FILE *f, xmlDocPtr cur) {
@@ -966,9 +1012,23 @@ htmlDocDump(FILE *f, xmlDocPtr cur) {
* @param filename the filename (or URL) * @param filename the filename (or URL)
* @param cur the document * @param cur the document
* *
* Dump an HTML document to a file. If `filename` is "-" the stdout file is * Serialize an HTML document to a file. If `filename` is `"-"`,
* used. * stdout is used. This is potentially insecure and might be
* @returns the number of byte written or -1 in case of failure. * changed in a future version.
*
* WARNING: Uses the encoding from a deprecated meta tag, see
* htmlGetMetaEncoding(). This is typically undesired. If no such
* tag was found, ASCII with HTML 4.0 named character entities will
* be used. This is inefficient compared to UTF-8 and might be
* changed in a future version.
*
* Also enables "formatting" unconditionally which is typically
* undesired.
*
* Use of this function is DISCOURAGED in favor of
* htmlSaveFileFormat().
*
* @returns the number of bytes written or -1 in case of failure.
*/ */
int int
htmlSaveFile(const char *filename, xmlDocPtr cur) { htmlSaveFile(const char *filename, xmlDocPtr cur) {
@@ -998,12 +1058,18 @@ htmlSaveFile(const char *filename, xmlDocPtr cur) {
/** /**
* @param filename the filename * @param filename the filename
* @param cur the document * @param cur the document
* @param format should formatting spaces been added * @param format should formatting newlines be added
* @param encoding the document encoding * @param encoding the document encoding (optional)
* *
* Dump an HTML document to a file using a given encoding. * Serialize an HTML document to a file using a given encoding.
* If `filename` is `"-"`, stdout is used. This is potentially
* insecure and might be changed in a future version.
* *
* @returns the number of byte written or -1 in case of failure. * If encoding is NULL, ASCII with HTML 4.0 named character entities
* will be used. This is inefficient compared to UTF-8 and might be
* changed in a future version.
*
* @returns the number of bytes written or -1 in case of failure.
*/ */
int int
htmlSaveFileFormat(const char *filename, xmlDocPtr cur, htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
@@ -1042,10 +1108,11 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
* @param cur the document * @param cur the document
* @param encoding the document encoding * @param encoding the document encoding
* *
* Dump an HTML document to a file using a given encoding * Same as htmlSaveFileFormat() with `format` set to 1 which is
* and formatting returns/spaces are added. * typically undesired. Also see the warnings there. Use of this
* function is DISCOURAGED in favor of htmlSaveFileFormat().
* *
* @returns the number of byte written or -1 in case of failure. * @returns the number of bytes written or -1 in case of failure.
*/ */
int int
htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {

View File

@@ -1,10 +1,10 @@
/** /**
* @file * @file
* *
* @brief specific APIs to process HTML tree, especially serialization * @brief HTML documents
* *
* this module implements a few function needed to process * This module implements functions to work with HTML documents,
* tree in an HTML specific way. * most of them related to serialization.
* *
* @copyright See Copyright for the status of this software. * @copyright See Copyright for the status of this software.
* *
@@ -25,31 +25,11 @@
extern "C" { extern "C" {
#endif #endif
/* Deprecated */
/**
* Macro. A text node in a HTML document is really implemented
* the same way as a text node in an XML document.
*/
#define HTML_TEXT_NODE XML_TEXT_NODE #define HTML_TEXT_NODE XML_TEXT_NODE
/**
* Macro. An entity reference in a HTML document is really implemented
* the same way as an entity reference in an XML document.
*/
#define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE #define HTML_ENTITY_REF_NODE XML_ENTITY_REF_NODE
/**
* Macro. A comment in a HTML document is really implemented
* the same way as a comment in an XML document.
*/
#define HTML_COMMENT_NODE XML_COMMENT_NODE #define HTML_COMMENT_NODE XML_COMMENT_NODE
/**
* Macro. A preserved node in a HTML document is really implemented
* the same way as a CDATA section in an XML document.
*/
#define HTML_PRESERVE_NODE XML_CDATA_SECTION_NODE #define HTML_PRESERVE_NODE XML_CDATA_SECTION_NODE
/**
* Macro. A processing instruction in a HTML document is really implemented
* the same way as a processing instruction in an XML document.
*/
#define HTML_PI_NODE XML_PI_NODE #define HTML_PI_NODE XML_PI_NODE
XMLPUBFUN htmlDocPtr XMLPUBFUN htmlDocPtr
@@ -73,16 +53,25 @@ XMLPUBFUN void
xmlChar **mem, xmlChar **mem,
int *size, int *size,
int format); int format);
XMLPUBFUN int
htmlDocDump (FILE *f,
xmlDocPtr cur);
XMLPUBFUN int XMLPUBFUN int
htmlSaveFile (const char *filename, htmlSaveFile (const char *filename,
xmlDocPtr cur); xmlDocPtr cur);
XMLPUBFUN int
htmlSaveFileEnc (const char *filename,
xmlDocPtr cur,
const char *encoding);
XMLPUBFUN int
htmlSaveFileFormat (const char *filename,
xmlDocPtr cur,
const char *encoding,
int format);
XMLPUBFUN int XMLPUBFUN int
htmlNodeDump (xmlBufferPtr buf, htmlNodeDump (xmlBufferPtr buf,
xmlDocPtr doc, xmlDocPtr doc,
xmlNodePtr cur); xmlNodePtr cur);
XMLPUBFUN int
htmlDocDump (FILE *f,
xmlDocPtr cur);
XMLPUBFUN void XMLPUBFUN void
htmlNodeDumpFile (FILE *out, htmlNodeDumpFile (FILE *out,
xmlDocPtr doc, xmlDocPtr doc,
@@ -93,16 +82,12 @@ XMLPUBFUN int
xmlNodePtr cur, xmlNodePtr cur,
const char *encoding, const char *encoding,
int format); int format);
XMLPUBFUN int
htmlSaveFileEnc (const char *filename,
xmlDocPtr cur,
const char *encoding);
XMLPUBFUN int
htmlSaveFileFormat (const char *filename,
xmlDocPtr cur,
const char *encoding,
int format);
XMLPUBFUN void
htmlNodeDumpOutput (xmlOutputBufferPtr buf,
xmlDocPtr doc,
xmlNodePtr cur,
const char *encoding);
XMLPUBFUN void XMLPUBFUN void
htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,
xmlDocPtr doc, xmlDocPtr doc,
@@ -118,11 +103,6 @@ XMLPUBFUN void
xmlDocPtr cur, xmlDocPtr cur,
const char *encoding, const char *encoding,
int format); int format);
XMLPUBFUN void
htmlNodeDumpOutput (xmlOutputBufferPtr buf,
xmlDocPtr doc,
xmlNodePtr cur,
const char *encoding);
#endif /* LIBXML_OUTPUT_ENABLED */ #endif /* LIBXML_OUTPUT_ENABLED */