|  |  |  | Gnome XML Library Reference Manual |  | 
|---|
HTMLparser —
typedef htmlParserCtxt; typedef htmlParserCtxtPtr; typedef htmlParserNodeInfo; typedef htmlSAXHandler; typedef htmlSAXHandlerPtr; typedef htmlParserInput; typedef htmlParserInputPtr; typedef htmlDocPtr; typedef htmlNodePtr; struct htmlElemDesc; typedef htmlElemDescPtr; struct htmlEntityDesc; typedef htmlEntityDescPtr; const htmlElemDesc* htmlTagLookup (const xmlChar *tag); const htmlEntityDesc* htmlEntityLookup (const xmlChar *name); const htmlEntityDesc* htmlEntityValueLookup (unsigned int value); int htmlIsAutoClosed (htmlDocPtr doc, htmlNodePtr elem); int htmlAutoCloseTag (htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem); const htmlEntityDesc* htmlParseEntityRef (htmlParserCtxtPtr ctxt, const xmlChar **str); int htmlParseCharRef (htmlParserCtxtPtr ctxt); void htmlParseElement (htmlParserCtxtPtr ctxt); htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char *buffer, int size); int htmlParseDocument (htmlParserCtxtPtr ctxt); htmlDocPtr htmlSAXParseDoc (xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData); htmlDocPtr htmlParseDoc (xmlChar *cur, const char *encoding); htmlDocPtr htmlSAXParseFile (const char *filename, const char *encoding, htmlSAXHandlerPtr sax, void *userData); htmlDocPtr htmlParseFile (const char *filename, const char *encoding); int UTF8ToHtml (unsigned char *out, int *outlen, unsigned char *in, int *inlen); int htmlEncodeEntities (unsigned char *out, int *outlen, unsigned char *in, int *inlen, int quoteChar); int htmlIsScriptAttribute (const xmlChar *name); int htmlHandleOmittedElem (int val); htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax, void *user_data, const char *chunk, int size, const char *filename, xmlCharEncoding enc); int htmlParseChunk (htmlParserCtxtPtr ctxt, const char *chunk, int size, int terminate); void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); enum htmlParserOption; void htmlCtxtReset (htmlParserCtxtPtr ctxt); int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, int options); 
htmlDocPtr htmlReadDoc (const xmlChar *cur, const char *URL, const char *encoding, int options); htmlDocPtr htmlReadFile (const char *URL, const char *encoding, int options); htmlDocPtr htmlReadMemory (const char *buffer, int size, const char *URL, const char *encoding, int options); htmlDocPtr htmlReadFd (int fd, const char *URL, const char *encoding, int options); htmlDocPtr htmlReadIO (xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void *ioctx, const char *URL, const char *encoding, int options); htmlDocPtr htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, const xmlChar *cur, const char *URL, const char *encoding, int options); htmlDocPtr htmlCtxtReadFile (xmlParserCtxtPtr ctxt, const char *filename, const char *encoding, int options); htmlDocPtr htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, const char *buffer, int size, const char *URL, const char *encoding, int options); htmlDocPtr htmlCtxtReadFd (xmlParserCtxtPtr ctxt, int fd, const char *URL, const char *encoding, int options); htmlDocPtr htmlCtxtReadIO (xmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void *ioctx, const char *URL, const char *encoding, int options);
/*
 * Description of a single HTML element: its tag name, flags describing how
 * its start/end tags are handled, its DTD status, and (since the Jan. 2003
 * additions below) the sub-elements and attributes it accepts.
 */
struct htmlElemDesc {
    const char *name;	/* The tag name */
    char startTag;      /* Whether the start tag can be implied */
    char endTag;        /* Whether the end tag can be implied */
    char saveEndTag;    /* Whether the end tag should be saved */
    char empty;         /* Whether this is an empty element */
    char depr;          /* Whether this is a deprecated element */
    char dtd;           /* 1: only in the Loose DTD, 2: only in the Frameset DTD */
    char isinline;      /* 0 for a block element, 1 for an inline element */
    const char *desc;   /* Human-readable description of the element */
/* NRK Jan.2003
 * New fields encapsulating HTML structure
 *
 * Bugs:
 *	This is a very limited representation.  It fails to tell us when
 *	an element *requires* subelements (we only have whether they're
 *	allowed or not), and it doesn't tell us where CDATA and PCDATA
 *	are allowed.  Some element relationships are not fully represented:
 *	these are flagged with the word MODIFIER
 */
    const char** subelts;		/* Allowed sub-elements of this element */
    const char* defaultsubelt;	/* Sub-element suggested for auto-repair
					   if necessary, or NULL */
    const char** attrs_opt;		/* Optional attributes */
    const char** attrs_depr;		/* Additional deprecated attributes */
    const char** attrs_req;		/* Required attributes */
};
/*
 * Description of a single HTML entity: associates a Unicode value with the
 * entity's name and a textual description.
 */
struct htmlEntityDesc {
    unsigned int value;	/* The Unicode value of the character */
    const char *name;	/* The entity name */
    const char *desc;   /* Human-readable description of the entity */
};
const htmlElemDesc* htmlTagLookup (const xmlChar *tag);
Lookup the HTML tag in the ElementTable
| tag: | The tag name in lowercase | 
| Returns : | the related htmlElemDescPtr or NULL if not found. | 
const htmlEntityDesc* htmlEntityLookup (const xmlChar *name);
Lookup the given entity in EntitiesTable
TODO: the linear scan is really ugly, a hash table is really needed.
| name: | the entity name | 
| Returns : | the associated htmlEntityDescPtr if found, NULL otherwise. | 
const htmlEntityDesc* htmlEntityValueLookup (unsigned int value);
Lookup the given entity in EntitiesTable
TODO: the linear scan is really ugly, a hash table is really needed.
| value: | the entity's unicode value | 
| Returns : | the associated htmlEntityDescPtr if found, NULL otherwise. | 
int htmlIsAutoClosed (htmlDocPtr doc, htmlNodePtr elem);
The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if a tag is autoclosed by one of its children.
| doc: | the HTML document | 
| elem: | the HTML element | 
| Returns : | 1 if autoclosed, 0 otherwise | 
int htmlAutoCloseTag (htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem);
The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.
| doc: | the HTML document | 
| name: | The tag name | 
| elem: | the HTML element | 
| Returns : | 1 if autoclose, 0 otherwise | 
const htmlEntityDesc* htmlParseEntityRef (htmlParserCtxtPtr ctxt, const xmlChar **str);
Parse an HTML entity reference.
[68] EntityRef ::= '&' Name ';'
| ctxt: | an HTML parser context | 
| str: | location to store the entity name | 
| Returns : | the associated htmlEntityDescPtr if found, or NULL otherwise, if non-NULL *str will have to be freed by the caller. | 
int htmlParseCharRef (htmlParserCtxtPtr ctxt);
parse Reference declarations
[66] CharRef ::= '&#' [0-9]+ ';' |
                 '&#x' [0-9a-fA-F]+ ';'
| ctxt: | an HTML parser context | 
| Returns : | the value parsed (as an int) | 
void htmlParseElement (htmlParserCtxtPtr ctxt);
parse an HTML element, this is highly recursive
[39] element ::= EmptyElemTag | STag content ETag
[41] Attribute ::= Name Eq AttValue
| ctxt: | an HTML parser context | 
htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char *buffer, int size);
Create a parser context for an HTML in-memory document.
| buffer: | a pointer to a char array | 
| size: | the size of the array | 
| Returns : | the new parser context or NULL | 
int htmlParseDocument (htmlParserCtxtPtr ctxt);
parse an HTML document (and build a tree if using the standard SAX interface).
| ctxt: | an HTML parser context | 
| Returns : | 0 on success, -1 in case of error. The parser context is augmented as a result of the parsing. | 
htmlDocPtr htmlSAXParseDoc (xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData);
Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.
| cur: | a pointer to an array of xmlChar | 
| encoding: | a free form C string describing the HTML document encoding, or NULL | 
| sax: | the SAX handler block | 
| userData: | if using SAX, this pointer will be provided on callbacks. | 
| Returns : | the resulting document tree unless SAX is NULL or the document is not well formed. | 
htmlDocPtr htmlParseDoc (xmlChar *cur, const char *encoding);
parse an HTML in-memory document and build a tree.
| cur: | a pointer to an array of xmlChar | 
| encoding: | a free form C string describing the HTML document encoding, or NULL | 
| Returns : | the resulting document tree | 
htmlDocPtr htmlSAXParseFile (const char *filename, const char *encoding, htmlSAXHandlerPtr sax, void *userData);
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callback. If sax is NULL, fallback to the default DOM tree building routines.
| filename: | the filename | 
| encoding: | a free form C string describing the HTML document encoding, or NULL | 
| sax: | the SAX handler block | 
| userData: | if using SAX, this pointer will be provided on callbacks. | 
| Returns : | the resulting document tree unless SAX is NULL or the document is not well formed. | 
htmlDocPtr htmlParseFile (const char *filename, const char *encoding);
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.
| filename: | the filename | 
| encoding: | a free form C string describing the HTML document encoding, or NULL | 
| Returns : | the resulting document tree | 
int         UTF8ToHtml                      (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen);
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
| out: | a pointer to an array of bytes to store the result | 
| outlen: | the length of out | 
| in: | a pointer to an array of UTF-8 chars | 
| inlen: | the length of in | 
| Returns : | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of inlen after return is the number of octets consumed if the return value is non-negative, else unpredictable. The value of outlen after return is the number of octets produced. | 
int         htmlEncodeEntities              (unsigned char *out,
                                             int *outlen,
                                             unsigned char *in,
                                             int *inlen,
                                             int quoteChar);
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
| out: | a pointer to an array of bytes to store the result | 
| outlen: | the length of out | 
| in: | a pointer to an array of UTF-8 chars | 
| inlen: | the length of in | 
| quoteChar: | the quote character to escape (' or ") or zero. | 
| Returns : | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of inlen after return is the number of octets consumed if the return value is non-negative, else unpredictable. The value of outlen after return is the number of octets produced. | 
int htmlIsScriptAttribute (const xmlChar *name);
Check if an attribute is of content type Script
| name: | an attribute name | 
| Returns : | 1 if the attribute is a script, 0 otherwise | 
int htmlHandleOmittedElem (int val);
Set and return the previous value for handling HTML omitted tags.
| val: | int 0 or 1 | 
| Returns : | the previous value: 0 for no handling, 1 for auto insertion. | 
htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax, void *user_data, const char *chunk, int size, const char *filename, xmlCharEncoding enc);
Create a parser context for using the HTML parser in push mode. The value of filename is used for fetching external entities and error/warning reports.
| sax: | a SAX handler | 
| user_data: | The user data returned on SAX callbacks | 
| chunk: | a pointer to an array of chars | 
| size: | number of chars in the array | 
| filename: | an optional file name or URI | 
| enc: | an optional encoding | 
| Returns : | the new parser context or NULL | 
int htmlParseChunk (htmlParserCtxtPtr ctxt, const char *chunk, int size, int terminate);
Parse a Chunk of memory
| ctxt: | an HTML parser context | 
| chunk: | a char array | 
| size: | the size in bytes of the chunk | 
| terminate: | last chunk indicator | 
| Returns : | zero if no error, the xmlParserErrors otherwise. | 
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.
| ctxt: | an HTML parser context | 
/*
 * Option flags for the HTML parser. Each value is a distinct bit, so several
 * options can be combined into the single int "options" argument accepted by
 * htmlCtxtUseOptions() and the htmlRead / htmlCtxtRead functions.
 */
typedef enum {
    HTML_PARSE_NOERROR	= 1<<5,	/* Suppress error reports */
    HTML_PARSE_NOWARNING= 1<<6,	/* Suppress warning reports */
    HTML_PARSE_PEDANTIC	= 1<<7,	/* Pedantic error reporting */
    HTML_PARSE_NOBLANKS	= 1<<8,	/* Remove blank nodes */
    HTML_PARSE_NONET	= 1<<11 /* Forbid network access */
} htmlParserOption;
void htmlCtxtReset (htmlParserCtxtPtr ctxt);
Reset a parser context
| ctxt: | an HTML parser context | 
int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, int options);
Applies the options to the parser context
| ctxt: | an HTML parser context | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | 0 in case of success, the set of unknown or unimplemented options in case of error. | 
htmlDocPtr htmlReadDoc (const xmlChar *cur, const char *URL, const char *encoding, int options);
parse an HTML in-memory document and build a tree.
| cur: | a pointer to a zero terminated string | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | the resulting document tree | 
htmlDocPtr htmlReadFile (const char *URL, const char *encoding, int options);
parse an HTML file from the filesystem or the network.
| URL: | a file or URL | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | the resulting document tree | 
htmlDocPtr htmlReadMemory (const char *buffer, int size, const char *URL, const char *encoding, int options);
parse an HTML in-memory document and build a tree.
| buffer: | a pointer to a char array | 
| size: | the size of the array | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | the resulting document tree | 
htmlDocPtr htmlReadFd (int fd, const char *URL, const char *encoding, int options);
parse an HTML document from a file descriptor and build a tree.
| fd: | an open file descriptor | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | the resulting document tree | 
htmlDocPtr htmlReadIO (xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void *ioctx, const char *URL, const char *encoding, int options);
parse an HTML document from I/O functions and source and build a tree.
| ioread: | an I/O read function | 
| ioclose: | an I/O close function | 
| ioctx: | an I/O handler | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | the resulting document tree | 
htmlDocPtr htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, const xmlChar *cur, const char *URL, const char *encoding, int options);
parse an HTML in-memory document and build a tree. This reuses the existing ctxt parser context.
| ctxt: | an HTML parser context | 
| cur: | a pointer to a zero terminated string | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | the resulting document tree | 
htmlDocPtr htmlCtxtReadFile (xmlParserCtxtPtr ctxt, const char *filename, const char *encoding, int options);
parse an HTML file from the filesystem or the network. This reuses the existing ctxt parser context.
| ctxt: | an HTML parser context | 
| filename: | a file or URL | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | the resulting document tree | 
htmlDocPtr htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, const char *buffer, int size, const char *URL, const char *encoding, int options);
parse an HTML in-memory document and build a tree. This reuses the existing ctxt parser context.
| ctxt: | an HTML parser context | 
| buffer: | a pointer to a char array | 
| size: | the size of the array | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | the resulting document tree | 
htmlDocPtr htmlCtxtReadFd (xmlParserCtxtPtr ctxt, int fd, const char *URL, const char *encoding, int options);
parse an HTML document from a file descriptor and build a tree. This reuses the existing ctxt parser context.
| ctxt: | an HTML parser context | 
| fd: | an open file descriptor | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | the resulting document tree | 
htmlDocPtr htmlCtxtReadIO (xmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void *ioctx, const char *URL, const char *encoding, int options);
parse an HTML document from I/O functions and source and build a tree. This reuses the existing ctxt parser context
| ctxt: | an HTML parser context | 
| ioread: | an I/O read function | 
| ioclose: | an I/O close function | 
| ioctx: | an I/O handler | 
| URL: | the base URL to use for the document | 
| encoding: | the document encoding, or NULL | 
| options: | a combination of htmlParserOption(s) | 
| Returns : | the resulting document tree | 
| << entities | valid >> |