Table of Contents
Typedef xmlParserNodeInfo htmlParserNodeInfo
int htmlIsScriptAttribute (const xmlChar * name)
int htmlHandleOmittedElem (int val)
htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt, const xmlChar * cur, const char * URL, const char * encoding, int options)
Typedef xmlNodePtr htmlNodePtr
Typedef xmlParserCtxtPtr htmlParserCtxtPtr
htmlDocPtr htmlParseFile (const char * filename, const char * encoding)
htmlDocPtr htmlReadIO (xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void * ioctx, const char * URL, const char * encoding, int options)
int htmlAutoCloseTag (htmlDocPtr doc, const xmlChar * name, htmlNodePtr elem)
int htmlParseChunk (htmlParserCtxtPtr ctxt, const char * chunk, int size, int terminate)
const htmlElemDesc * htmlTagLookup (const xmlChar * tag)
htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer, int size)
void htmlCtxtReset (htmlParserCtxtPtr ctxt)
Typedef xmlSAXHandler htmlSAXHandler
int htmlElementAllowedHere (const htmlElemDesc * parent, const xmlChar * elt)
htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void * ioctx, const char * URL, const char * encoding, int options)
Typedef xmlSAXHandlerPtr htmlSAXHandlerPtr
Enum htmlStatus {
HTML_NA = 0 : something we don't check at all
HTML_INVALID = 1
HTML_DEPRECATED = 2
HTML_VALID = 4
HTML_REQUIRED = 12 : VALID bit set so ( & HTML_VALID ) is TRUE
}
htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax, void * user_data, const char * chunk, int size, const char * filename, xmlCharEncoding enc)
htmlDocPtr htmlReadMemory (const char * buffer, int size, const char * URL, const char * encoding, int options)
int htmlIsAutoClosed (htmlDocPtr doc, htmlNodePtr elem)
Typedef xmlDocPtr htmlDocPtr
htmlDocPtr htmlReadDoc (const xmlChar * cur, const char * URL, const char * encoding, int options)
Enum htmlParserOption {
HTML_PARSE_NOERROR = 32 : suppress error reports
HTML_PARSE_NOWARNING = 64 : suppress warning reports
HTML_PARSE_PEDANTIC = 128 : pedantic error reporting
HTML_PARSE_NOBLANKS = 256 : remove blank nodes
HTML_PARSE_NONET = 2048 : Forbid network access
}
Typedef htmlEntityDesc * htmlEntityDescPtr
int htmlEncodeEntities (unsigned char * out, int * outlen, const unsigned char * in, int * inlen, int quoteChar)
Typedef xmlParserCtxt htmlParserCtxt
htmlStatus htmlNodeStatus (const htmlNodePtr node, int legacy)
htmlStatus htmlAttrAllowed (const htmlElemDesc * elt, const xmlChar * attr, int legacy)
#define htmlDefaultSubelement
int htmlParseCharRef (htmlParserCtxtPtr ctxt)
htmlDocPtr htmlSAXParseFile (const char * filename, const char * encoding, htmlSAXHandlerPtr sax, void * userData)
const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt, const xmlChar ** str)
Typedef xmlParserInput htmlParserInput
htmlStatus htmlElementStatusHere (const htmlElemDesc * parent, const htmlElemDesc * elt)
const htmlEntityDesc * htmlEntityValueLookup (unsigned int value)
void htmlParseElement (htmlParserCtxtPtr ctxt)
int UTF8ToHtml (unsigned char * out, int * outlen, const unsigned char * in, int * inlen)
#define htmlRequiredAttrs
const htmlEntityDesc * htmlEntityLookup (const xmlChar * name)
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt)
htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt, const char * buffer, int size, const char * URL, const char * encoding, int options)
htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt, int fd, const char * URL, const char * encoding, int options)
Structure htmlEntityDesc struct _htmlEntityDesc {
unsigned int value : the UNICODE value for the character
const char * name : The entity name
const char * desc : the description
}
#define htmlElementAllowedHereDesc
htmlDocPtr htmlReadFile (const char * filename, const char * encoding, int options)
htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt, const char * filename, const char * encoding, int options)
Typedef htmlElemDesc * htmlElemDescPtr
Structure htmlElemDesc struct _htmlElemDesc {
const char * name : The tag name
char startTag : Whether the start tag can be implied
char endTag : Whether the end tag can be implied
char saveEndTag : Whether the end tag should be saved
char empty : Is this an empty element ?
char depr : Is this a deprecated element ?
char dtd : 1: only in Loose DTD, 2: only Frameset one
char isinline : is this a block 0 or inline 1 element
const char * desc : the description. NRK Jan.2003: new fields encapsulating HTML structure follow
const char ** subelts : allowed sub-elements of this element
const char * defaultsubelt : subelement for suggested auto-repair if necessary
const char ** attrs_opt : Optional Attributes
const char ** attrs_depr : Additional deprecated attributes
const char ** attrs_req : Required attributes
}
htmlDocPtr htmlSAXParseDoc (xmlChar * cur, const char * encoding, htmlSAXHandlerPtr sax, void * userData)
int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, int options)
Typedef xmlParserInputPtr htmlParserInputPtr
htmlDocPtr htmlReadFd (int fd, const char * URL, const char * encoding, int options)
htmlDocPtr htmlParseDoc (xmlChar * cur, const char * encoding)
int htmlParseDocument (htmlParserCtxtPtr ctxt)
Description
Function: htmlIsScriptAttribute
int htmlIsScriptAttribute (const xmlChar * name)
Check if an attribute is of content type Script
Function: htmlHandleOmittedElem
int htmlHandleOmittedElem (int val)
Set and return the previous value for handling HTML omitted tags.
val: | int 0 or 1 | Returns: | the previous value: 0 for no handling, 1 for auto insertion. |
Function: htmlCtxtReadDoc
htmlDocPtr htmlCtxtReadDoc (htmlParserCtxtPtr ctxt, const xmlChar * cur, const char * URL, const char * encoding, int options)
parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context | cur: | a pointer to a zero terminated string | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlParseFile
htmlDocPtr htmlParseFile (const char * filename, const char * encoding)
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time.
filename: | the filename | encoding: | a free form C string describing the HTML document encoding, or NULL | Returns: | the resulting document tree |
Function: htmlReadIO
htmlDocPtr htmlReadIO (xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void * ioctx, const char * URL, const char * encoding, int options)
parse an HTML document from I/O functions and source and build a tree.
ioread: | an I/O read function | ioclose: | an I/O close function | ioctx: | an I/O handler | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlAutoCloseTag
int htmlAutoCloseTag (htmlDocPtr doc, const xmlChar * name, htmlNodePtr elem)
The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.
doc: | the HTML document | name: | The tag name | elem: | the HTML element | Returns: | 1 if autoclose, 0 otherwise |
Function: htmlParseChunk
int htmlParseChunk (htmlParserCtxtPtr ctxt, const char * chunk, int size, int terminate)
Parse a chunk of memory
ctxt: | an HTML parser context | chunk: | a char array | size: | the size in bytes of the chunk | terminate: | last chunk indicator | Returns: | zero if no error, the xmlParserErrors otherwise. |
Function: htmlTagLookup
const htmlElemDesc * htmlTagLookup (const xmlChar * tag)
Lookup the HTML tag in the ElementTable
tag: | The tag name in lowercase | Returns: | the related htmlElemDescPtr or NULL if not found. |
Function: htmlCreateMemoryParserCtxt
htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char * buffer, int size)
Create a parser context for an HTML in-memory document.
buffer: | a pointer to a char array | size: | the size of the array | Returns: | the new parser context or NULL |
Function: htmlCtxtReset
void htmlCtxtReset (htmlParserCtxtPtr ctxt)
Reset a parser context
ctxt: | an HTML parser context |
Function: htmlElementAllowedHere
int htmlElementAllowedHere (const htmlElemDesc * parent, const xmlChar * elt)
Checks whether an HTML element may be a direct child of a parent element. Note - doesn't check for deprecated elements
parent: | HTML parent element | elt: | HTML element | Returns: | 1 if allowed; 0 otherwise. |
Function: htmlCtxtReadIO
htmlDocPtr htmlCtxtReadIO (htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void * ioctx, const char * URL, const char * encoding, int options)
parse an HTML document from I/O functions and source and build a tree. This reuses the existing @ctxt parser context
ctxt: | an HTML parser context | ioread: | an I/O read function | ioclose: | an I/O close function | ioctx: | an I/O handler | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlCreatePushParserCtxt
htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax, void * user_data, const char * chunk, int size, const char * filename, xmlCharEncoding enc)
Create a parser context for using the HTML parser in push mode. The value of @filename is used for fetching external entities and error/warning reports.
sax: | a SAX handler | user_data: | The user data returned on SAX callbacks | chunk: | a pointer to an array of chars | size: | number of chars in the array | filename: | an optional file name or URI | enc: | an optional encoding | Returns: | the new parser context or NULL |
Function: htmlReadMemory
htmlDocPtr htmlReadMemory (const char * buffer, int size, const char * URL, const char * encoding, int options)
parse an HTML in-memory document and build a tree.
buffer: | a pointer to a char array | size: | the size of the array | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlIsAutoClosed
int htmlIsAutoClosed (htmlDocPtr doc, htmlNodePtr elem)
The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if a tag is autoclosed by one of its children.
doc: | the HTML document | elem: | the HTML element | Returns: | 1 if autoclosed, 0 otherwise |
Function: htmlReadDoc
htmlDocPtr htmlReadDoc (const xmlChar * cur, const char * URL, const char * encoding, int options)
parse an HTML in-memory document and build a tree.
cur: | a pointer to a zero terminated string | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlEncodeEntities
int htmlEncodeEntities (unsigned char * out, int * outlen, const unsigned char * in, int * inlen, int quoteChar)
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
out: | a pointer to an array of bytes to store the result | outlen: | the length of @out | in: | a pointer to an array of UTF-8 chars | inlen: | the length of @in | quoteChar: | the quote character to escape (' or ") or zero. | Returns: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the call succeeds, else unpredictable. The value of @outlen after return is the number of octets produced. |
Function: htmlNodeStatus
htmlStatus htmlNodeStatus (const htmlNodePtr node, int legacy)
Checks whether the tree node is valid. Experimental (the author only uses the HTML enhancements in a SAX parser)
Function: htmlAttrAllowed
htmlStatus htmlAttrAllowed (const htmlElemDesc * elt, const xmlChar * attr, int legacy)
Checks whether an attribute is valid for an element. Has full knowledge of Required and Deprecated attributes.
elt: | HTML element | attr: | HTML attribute | legacy: | whether to allow deprecated attributes | Returns: | one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID |
Macro: htmlDefaultSubelement
#define htmlDefaultSubelement
Returns the default subelement for this element
Function: htmlParseCharRef
int htmlParseCharRef (htmlParserCtxtPtr ctxt)
parse Reference declarations [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
ctxt: | an HTML parser context | Returns: | the value parsed (as an int) |
Function: htmlSAXParseFile
htmlDocPtr htmlSAXParseFile (const char * filename, const char * encoding, htmlSAXHandlerPtr sax, void * userData)
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed document is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fall back to the default DOM tree building routines.
filename: | the filename | encoding: | a free form C string describing the HTML document encoding, or NULL | sax: | the SAX handler block | userData: | if using SAX, this pointer will be provided on callbacks. | Returns: | the resulting document tree unless SAX is NULL or the document is not well formed. |
Function: htmlParseEntityRef
const htmlEntityDesc * htmlParseEntityRef (htmlParserCtxtPtr ctxt, const xmlChar ** str)
parse an HTML entity reference [68] EntityRef ::= '&' Name ';'
ctxt: | an HTML parser context | str: | location to store the entity name | Returns: | the associated htmlEntityDescPtr if found, or NULL otherwise, if non-NULL *str will have to be freed by the caller. |
Function: htmlElementStatusHere
htmlStatus htmlElementStatusHere (const htmlElemDesc * parent, const htmlElemDesc * elt)
Checks whether an HTML element may be a direct child of a parent element, and if so whether it is valid or deprecated.
parent: | HTML parent element | elt: | HTML element | Returns: | one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID |
Function: htmlEntityValueLookup
const htmlEntityDesc * htmlEntityValueLookup (unsigned int value)
Lookup the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
value: | the entity's unicode value | Returns: | the associated htmlEntityDescPtr if found, NULL otherwise. |
Function: htmlParseElement
void htmlParseElement (htmlParserCtxtPtr ctxt)
parse an HTML element, this is highly recursive [39] element ::= EmptyElemTag | STag content ETag [41] Attribute ::= Name Eq AttValue
ctxt: | an HTML parser context |
Function: UTF8ToHtml
int UTF8ToHtml (unsigned char * out, int * outlen, const unsigned char * in, int * inlen)
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
out: | a pointer to an array of bytes to store the result | outlen: | the length of @out | in: | a pointer to an array of UTF-8 chars | inlen: | the length of @in | Returns: | 0 if success, -2 if the transcoding fails, or -1 otherwise. The value of @inlen after return is the number of octets consumed if the call succeeds, else unpredictable. The value of @outlen after return is the number of octets produced. |
Macro: htmlRequiredAttrs
#define htmlRequiredAttrs
Returns the attributes required for the specified element.
Function: htmlEntityLookup
const htmlEntityDesc * htmlEntityLookup (const xmlChar * name)
Lookup the given entity in EntitiesTable. TODO: the linear scan is really ugly, a hash table is really needed.
name: | the entity name | Returns: | the associated htmlEntityDescPtr if found, NULL otherwise. |
Function: htmlFreeParserCtxt
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt)
Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.
ctxt: | an HTML parser context |
Function: htmlCtxtReadMemory
htmlDocPtr htmlCtxtReadMemory (htmlParserCtxtPtr ctxt, const char * buffer, int size, const char * URL, const char * encoding, int options)
parse an HTML in-memory document and build a tree. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context | buffer: | a pointer to a char array | size: | the size of the array | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlCtxtReadFd
htmlDocPtr htmlCtxtReadFd (htmlParserCtxtPtr ctxt, int fd, const char * URL, const char * encoding, int options)
parse an HTML document from a file descriptor and build a tree. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context | fd: | an open file descriptor | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Macro: htmlElementAllowedHereDesc
#define htmlElementAllowedHereDesc
Checks whether an HTML element description may be a direct child of the specified element. Returns 1 if allowed; 0 otherwise.
Function: htmlReadFile
htmlDocPtr htmlReadFile (const char * filename, const char * encoding, int options)
parse an HTML file from the filesystem or the network.
filename: | a file or URL | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlCtxtReadFile
htmlDocPtr htmlCtxtReadFile (htmlParserCtxtPtr ctxt, const char * filename, const char * encoding, int options)
parse an HTML file from the filesystem or the network. This reuses the existing @ctxt parser context.
ctxt: | an HTML parser context | filename: | a file or URL | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlSAXParseDoc
htmlDocPtr htmlSAXParseDoc (xmlChar * cur, const char * encoding, htmlSAXHandlerPtr sax, void * userData)
Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fall back to the default DOM behavior and return a tree.
cur: | a pointer to an array of xmlChar | encoding: | a free form C string describing the HTML document encoding, or NULL | sax: | the SAX handler block | userData: | if using SAX, this pointer will be provided on callbacks. | Returns: | the resulting document tree unless SAX is NULL or the document is not well formed. |
Function: htmlCtxtUseOptions
int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, int options)
Applies the options to the parser context
ctxt: | an HTML parser context | options: | a combination of htmlParserOption(s) | Returns: | 0 in case of success, the set of unknown or unimplemented options in case of error. |
Function: htmlReadFd
htmlDocPtr htmlReadFd (int fd, const char * URL, const char * encoding, int options)
parse an HTML document from a file descriptor and build a tree.
fd: | an open file descriptor | URL: | the base URL to use for the document | encoding: | the document encoding, or NULL | options: | a combination of htmlParserOption(s) | Returns: | the resulting document tree |
Function: htmlParseDoc
htmlDocPtr htmlParseDoc (xmlChar * cur, const char * encoding)
parse an HTML in-memory document and build a tree.
cur: | a pointer to an array of xmlChar | encoding: | a free form C string describing the HTML document encoding, or NULL | Returns: | the resulting document tree |
Function: htmlParseDocument
int htmlParseDocument (htmlParserCtxtPtr ctxt)
parse an HTML document (and build a tree if using the standard SAX interface).
ctxt: | an HTML parser context | Returns: | 0 on success, -1 in case of error. The parser context is augmented as a result of the parsing. |
Daniel Veillard |