doc: Misc fixes to HTML tree docs

2025-10-26 00:37:43 +03:00 · 2025-05-05 21:36:36 +02:00
parent 18d20a68bc
commit 298f70b3d7
2 changed files with 136 additions and 89 deletions
--- a/HTMLtree.c
+++ b/HTMLtree.c
@@ -37,9 +37,15 @@
 /**
 * @param doc  the document
 *
- * Encoding definition lookup in the Meta tags
+ * Look up an encoding declaration in the meta tags.
 *
- * @returns the current encoding as flagged in the HTML source
+ * Does not support `<meta charset="">` yet. Only supports deprecated
 * `<meta http-equiv="Content-Type" content="">`.
 *
 * The returned string points into attribute content. It should be
 * copied before modifying or freeing nodes.
 *
 * @returns the encoding or NULL if not found.
 */
 const xmlChar *
 htmlGetMetaEncoding(htmlDocPtr doc) {
@@ -148,9 +154,12 @@ found_content:
 * @param doc  the document
 * @param encoding  the encoding string
 *
- * Sets the current encoding in the Meta tags
+ * Creates or updates a meta tag with an encoding declaration.
- * NOTE: this will not change the document content encoding, just
+ *
- * the META flag associated.
+ * Does not support `<meta charset="">` yet. Only supports deprecated
 * `<meta http-equiv="Content-Type" content="">`.
 *
 * NOTE: This will not change the document content encoding.
 *
 * @returns 0 in case of success and -1 in case of error
 */
@@ -307,7 +316,8 @@ static const char* const htmlBooleanAttrs[] = {
 *
 * @deprecated Internal function, don't use.
 *
- * Determine if a given attribute is a boolean attribute.
+ * Determine if a given attribute is a boolean attribute. This
 * doesn't handle HTML5.
 *
 * @returns false if the attribute is not boolean, true otherwise.
 */
@@ -346,11 +356,11 @@ htmlFindOutputEncoder(const char *encoding, xmlCharEncodingHandler **out) {
 * @param buf  the xmlBufPtr output
 * @param doc  the document
 * @param cur  the current node
- * @param format  should formatting spaces been added
+ * @param format  whether formatting newlines should be added
 *
- * Dump an HTML node, recursive behaviour,children are printed too.
+ * Serialize an HTML node to an xmlBuf.
 *
- * @returns the number of byte written or -1 in case of error
+ * @returns the number of bytes written or -1 in case of error
 */
 static size_t
 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
@@ -391,10 +401,9 @@ htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
 * @param doc  the document
 * @param cur  the current node
 *
- * Dump an HTML node, recursive behaviour,children are printed too,
+ * Serialize an HTML node to an xmlBuffer. Always uses UTF-8,
 * and formatting returns are added.
 *
- * @returns the number of byte written or -1 in case of error
+ * @returns the number of bytes written or -1 in case of error
 */
 int
 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
@@ -423,14 +432,16 @@ htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
 * @param out  the FILE pointer
 * @param doc  the document
 * @param cur  the current node
- * @param encoding  the document encoding
+ * @param encoding  the document encoding (optional)
- * @param format  should formatting spaces been added
+ * @param format  whether formatting newlines should be added
 *
- * Dump an HTML node, recursive behaviour,children are printed too.
+ * Serialize an HTML node to an open `FILE`.
 *
- * TODO: if encoding == NULL try to save in the doc encoding
+ * If encoding is NULL, ASCII with HTML 4.0 named character entities
 * will be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
- * @returns the number of byte written or -1 in case of failure.
+ * @returns the number of bytes written or -1 in case of failure.
 */
 int
 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
@@ -461,8 +472,9 @@ htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
 * @param doc  the document
 * @param cur  the current node
 *
- * Dump an HTML node, recursive behaviour,children are printed too,
+ * Same as htmlNodeDumpFileFormat() with `format` set to 1 which is
- * and formatting returns are added.
+ * typically undesired. Use of this function is DISCOURAGED in favor
 * of htmlNodeDumpFileFormat().
 */
 void
 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
@@ -473,10 +485,19 @@ htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
 * @param cur  the document
 * @param mem  OUT: the memory pointer
 * @param size  OUT: the memory length
- * @param format  should formatting spaces been added
+ * @param format  whether formatting newlines should be added
 *
- * Dump an HTML document in memory and return the xmlChar * and it's size.
+ * Serialize an HTML document to memory, also returning the size of
- * It's up to the caller to free the memory.
+ * the result. It's up to the caller to free the memory.
 *
 * WARNING: Uses the encoding from a deprecated meta tag, see
 * htmlGetMetaEncoding(). This is typically undesired. If no such
 * tag was found, ASCII with HTML 4.0 named character entities will
 * be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * Use of this function is therefore DISCOURAGED in favor of
 * htmlDocContentDumpFormatOutput().
 */
 void
 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
@@ -522,8 +543,10 @@ htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
 * @param mem  OUT: the memory pointer
 * @param size  OUT: the memory length
 *
- * Dump an HTML document in memory and return the xmlChar * and it's size.
+ * Same as htmlDocDumpMemoryFormat() with `format` set to 1 which
- * It's up to the caller to free the memory.
+ * is typically undesired. Also see the warnings there. Use of
 * this function is DISCOURAGED in favor of
 * htmlDocContentDumpFormatOutput().
 */
 void
 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
@@ -540,11 +563,11 @@ htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
 /**
 * @param buf  the HTML buffer output
 * @param doc  the document
- * @param encoding  the encoding string
+ * @param encoding  the encoding string (unused)
 *
- * TODO: check whether encoding is needed
+ * Serialize the HTML document's DTD, if any.
 *
- * Dump the HTML document DTD, if any.
+ * Ignores `encoding` and uses the encoding of the output buffer.
 */
 static void
 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
@@ -575,7 +598,7 @@ htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 * @param doc  the document
 * @param cur  the attribute pointer
 *
- * Dump an HTML attribute
+ * Serialize an HTML attribute.
 */
 static void
 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
@@ -644,9 +667,11 @@ htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
 * @param doc  the document
 * @param cur  the current node
 * @param encoding  the encoding string (unused)
- * @param format  should formatting spaces been added
+ * @param format  whether formatting newlines should be added
 *
- * Dump an HTML node, recursive behaviour,children are printed too.
+ * Serialize an HTML node to an output buffer.
 *
 * Ignores `encoding` and uses the encoding of the output buffer.
 */
 void
 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
@@ -879,8 +904,11 @@ htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 * @param cur  the current node
 * @param encoding  the encoding string (unused)
 *
- * Dump an HTML node, recursive behaviour,children are printed too,
+ * Same as htmlNodeDumpFormatOutput() with `format` set to 1 which is
- * and formatting returns/spaces are added.
+ * typically undesired. Use of this function is DISCOURAGED in favor
 * of htmlNodeDumpFormatOutput().
 *
 * Ignores `encoding` and uses the encoding of the output buffer.
 */
 void
 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
@@ -892,9 +920,11 @@ htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 * @param buf  the HTML buffer output
 * @param cur  the document
 * @param encoding  the encoding string (unused)
- * @param format  should formatting spaces been added
+ * @param format  whether formatting newlines should be added
 *
- * Dump an HTML document.
+ * Serialize an HTML document to an output buffer.
 *
 * Ignores `encoding` and uses the encoding of the output buffer.
 */
 void
 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
@@ -915,7 +945,11 @@ htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
 * @param cur  the document
 * @param encoding  the encoding string (unused)
 *
- * Dump an HTML document. Formatting return/spaces are added.
+ * Same as htmlDocContentDumpFormatOutput() with `format` set to 1 which is
 * typically undesired. Use of this function is DISCOURAGED in favor
 * of htmlDocContentDumpFormatOutput().
 *
 * Ignores `encoding` and uses the encoding of the output buffer.
 */
 void
 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
@@ -933,9 +967,21 @@ htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
 * @param f  the FILE*
 * @param cur  the document
 *
- * Dump an HTML document to an open FILE.
+ * Serialize an HTML document to an open `FILE`.
 *
- * @returns the number of byte written or -1 in case of failure.
+ * WARNING: Uses the encoding from a deprecated meta tag, see
 * htmlGetMetaEncoding(). This is typically undesired. If no such
 * tag was found, ASCII with HTML 4.0 named character entities will
 * be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * Also enables "formatting" unconditionally which is typically
 * undesired.
 *
 * Use of this function is DISCOURAGED in favor of
 * htmlNodeDumpFileFormat().
 *
 * @returns the number of bytes written or -1 in case of failure.
 */
 int
 htmlDocDump(FILE *f, xmlDocPtr cur) {
@@ -966,9 +1012,23 @@ htmlDocDump(FILE *f, xmlDocPtr cur) {
 * @param filename  the filename (or URL)
 * @param cur  the document
 *
- * Dump an HTML document to a file. If `filename` is "-" the stdout file is
+ * Serialize an HTML document to a file. If `filename` is `"-"`,
- * used.
+ * stdout is used. This is potentially insecure and might be
- * @returns the number of byte written or -1 in case of failure.
+ * changed in a future version.
 *
 * WARNING: Uses the encoding from a deprecated meta tag, see
 * htmlGetMetaEncoding(). This is typically undesired. If no such
 * tag was found, ASCII with HTML 4.0 named character entities will
 * be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * Also enables "formatting" unconditionally which is typically
 * undesired.
 *
 * Use of this function is DISCOURAGED in favor of
 * htmlSaveFileFormat().
 *
 * @returns the number of bytes written or -1 in case of failure.
 */
 int
 htmlSaveFile(const char *filename, xmlDocPtr cur) {
@@ -998,12 +1058,18 @@ htmlSaveFile(const char *filename, xmlDocPtr cur) {
 /**
 * @param filename  the filename
 * @param cur  the document
- * @param format  should formatting spaces been added
+ * @param format  whether formatting newlines should be added
- * @param encoding  the document encoding
+ * @param encoding  the document encoding (optional)
 *
- * Dump an HTML document to a file using a given encoding.
+ * Serialize an HTML document to a file using a given encoding.
 * If `filename` is `"-"`, stdout is used. This is potentially
 * insecure and might be changed in a future version.
 *
- * @returns the number of byte written or -1 in case of failure.
+ * If encoding is NULL, ASCII with HTML 4.0 named character entities
 * will be used. This is inefficient compared to UTF-8 and might be
 * changed in a future version.
 *
 * @returns the number of bytes written or -1 in case of failure.
 */
 int
 htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
@@ -1042,10 +1108,11 @@ htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
 * @param cur  the document
 * @param encoding  the document encoding
 *
- * Dump an HTML document to a file using a given encoding
+ * Same as htmlSaveFileFormat() with `format` set to 1 which is
- * and formatting returns/spaces are added.
+ * typically undesired. Also see the warnings there. Use of this
 * function is DISCOURAGED in favor of htmlSaveFileFormat().
 *
- * @returns the number of byte written or -1 in case of failure.
+ * @returns the number of bytes written or -1 in case of failure.
 */
 int
 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
--- a/include/libxml/HTMLtree.h
+++ b/include/libxml/HTMLtree.h
@@ -1,10 +1,10 @@
 /**
 * @file
 * 
- * @brief specific APIs to process HTML tree, especially serialization
+ * @brief HTML documents
 * 
- * this module implements a few function needed to process
+ * This module implements functions to work with HTML documents,
- *              tree in an HTML specific way.
+ * most of them related to serialization.
 *
 * @copyright See Copyright for the status of this software.
 *
@@ -25,31 +25,11 @@
 extern "C" {
 #endif
-
+/* Deprecated */
 /**
 * Macro. A text node in a HTML document is really implemented
 * the same way as a text node in an XML document.
 */
 #define HTML_TEXT_NODE		XML_TEXT_NODE
 /**
 * Macro. An entity reference in a HTML document is really implemented
 * the same way as an entity reference in an XML document.
 */
 #define HTML_ENTITY_REF_NODE	XML_ENTITY_REF_NODE
 /**
 * Macro. A comment in a HTML document is really implemented
 * the same way as a comment in an XML document.
 */
 #define HTML_COMMENT_NODE	XML_COMMENT_NODE
 /**
 * Macro. A preserved node in a HTML document is really implemented
 * the same way as a CDATA section in an XML document.
 */
 #define HTML_PRESERVE_NODE	XML_CDATA_SECTION_NODE
 /**
 * Macro. A processing instruction in a HTML document is really implemented
 * the same way as a processing instruction in an XML document.
 */
 #define HTML_PI_NODE		XML_PI_NODE
 XMLPUBFUN htmlDocPtr
@@ -73,16 +53,25 @@ XMLPUBFUN void
 					 xmlChar **mem,
 					 int *size,
 					 int format);
 XMLPUBFUN int
 		htmlDocDump		(FILE *f,
 					 xmlDocPtr cur);
 XMLPUBFUN int
 		htmlSaveFile		(const char *filename,
 					 xmlDocPtr cur);
 XMLPUBFUN int
 		htmlSaveFileEnc		(const char *filename,
 					 xmlDocPtr cur,
 					 const char *encoding);
 XMLPUBFUN int
 		htmlSaveFileFormat	(const char *filename,
 					 xmlDocPtr cur,
 					 const char *encoding,
 					 int format);
 XMLPUBFUN int
 		htmlNodeDump		(xmlBufferPtr buf,
 					 xmlDocPtr doc,
 					 xmlNodePtr cur);
 XMLPUBFUN int
 		htmlDocDump		(FILE *f,
 					 xmlDocPtr cur);
 XMLPUBFUN void
 		htmlNodeDumpFile	(FILE *out,
 					 xmlDocPtr doc,
@@ -93,16 +82,12 @@ XMLPUBFUN int
 					 xmlNodePtr cur,
 					 const char *encoding,
 					 int format);
 XMLPUBFUN int
 		htmlSaveFileEnc		(const char *filename,
 					 xmlDocPtr cur,
 					 const char *encoding);
 XMLPUBFUN int
 		htmlSaveFileFormat	(const char *filename,
 					 xmlDocPtr cur,
 					 const char *encoding,
 					 int format);
 XMLPUBFUN void
 		htmlNodeDumpOutput	(xmlOutputBufferPtr buf,
 					 xmlDocPtr doc,
 					 xmlNodePtr cur,
 					 const char *encoding);
 XMLPUBFUN void
 		htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf,
 					 xmlDocPtr doc,
@@ -118,11 +103,6 @@ XMLPUBFUN void
 					 xmlDocPtr cur,
 					 const char *encoding,
 					 int format);
 XMLPUBFUN void
 		htmlNodeDumpOutput	(xmlOutputBufferPtr buf,
 					 xmlDocPtr doc,
 					 xmlNodePtr cur,
 					 const char *encoding);
 #endif /* LIBXML_OUTPUT_ENABLED */