![]() | ![]() | ![]() | Gnome XML Library Reference Manual | ![]() |
---|
HTMLparser —
typedef htmlParserCtxt; typedef htmlParserCtxtPtr; typedef htmlParserNodeInfo; typedef htmlSAXHandler; typedef htmlSAXHandlerPtr; typedef htmlParserInput; typedef htmlParserInputPtr; typedef htmlDocPtr; typedef htmlNodePtr; struct htmlElemDesc; typedef htmlElemDescPtr; struct htmlEntityDesc; typedef htmlEntityDescPtr; const htmlElemDesc* htmlTagLookup (const xmlChar *tag); const htmlEntityDesc* htmlEntityLookup (const xmlChar *name); const htmlEntityDesc* htmlEntityValueLookup (unsigned int value); int htmlIsAutoClosed (htmlDocPtr doc, htmlNodePtr elem); int htmlAutoCloseTag (htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem); const htmlEntityDesc* htmlParseEntityRef (htmlParserCtxtPtr ctxt, xmlChar **str); int htmlParseCharRef (htmlParserCtxtPtr ctxt); void htmlParseElement (htmlParserCtxtPtr ctxt); htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char *buffer, int size); int htmlParseDocument (htmlParserCtxtPtr ctxt); htmlDocPtr htmlSAXParseDoc (xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData); htmlDocPtr htmlParseDoc (xmlChar *cur, const char *encoding); htmlDocPtr htmlSAXParseFile (const char *filename, const char *encoding, htmlSAXHandlerPtr sax, void *userData); htmlDocPtr htmlParseFile (const char *filename, const char *encoding); int UTF8ToHtml (unsigned char *out, int *outlen, unsigned char *in, int *inlen); int htmlEncodeEntities (unsigned char *out, int *outlen, unsigned char *in, int *inlen, int quoteChar); int htmlIsScriptAttribute (const xmlChar *name); int htmlHandleOmittedElem (int val); void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax, void *user_data, const char *chunk, int size, const char *filename, xmlCharEncoding enc); int htmlParseChunk (htmlParserCtxtPtr ctxt, const char *chunk, int size, int terminate);
struct htmlElemDesc { const char *name; /* The tag name */ char startTag; /* Whether the start tag can be implied */ char endTag; /* Whether the end tag can be implied */ char saveEndTag; /* Whether the end tag should be saved */ char empty; /* Is this an empty element ? */ char depr; /* Is this a deprecated element ? */ char dtd; /* 1: only in Loose DTD, 2: only Frameset one */ char isinline; /* is this a block 0 or inline 1 element */ const char *desc; /* the description */ /* NRK Jan.2003 * New fields encapsulating HTML structure * * Bugs: * This is a very limited representation. It fails to tell us when * an element *requires* subelements (we only have whether they're * allowed or not), and it doesn't tell us where CDATA and PCDATA * are allowed. Some element relationships are not fully represented: * these are flagged with the word MODIFIER */ const char** subelts; /* allowed sub-elements of this element */ const char* defaultsubelt; /* subelement for suggested auto-repair if necessary or NULL */ const char** attrs_opt; /* Optional Attributes */ const char** attrs_depr; /* Additional deprecated attributes */ const char** attrs_req; /* Required attributes */ };
struct htmlEntityDesc { unsigned int value; /* the UNICODE value for the character */ const char *name; /* The entity name */ const char *desc; /* the description */ };
const htmlElemDesc* htmlTagLookup (const xmlChar *tag);
Lookup the HTML tag in the ElementTable
tag : | |
Returns : |
const htmlEntityDesc* htmlEntityLookup (const xmlChar *name);
Lookup the given entity in EntitiesTable
TODO: the linear scan is really ugly; a hash table is really needed.
name : | |
Returns : |
const htmlEntityDesc* htmlEntityValueLookup (unsigned int value);
Lookup the given entity in EntitiesTable
TODO: the linear scan is really ugly; a hash table is really needed.
value : | |
Returns : |
int htmlIsAutoClosed (htmlDocPtr doc, htmlNodePtr elem);
The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if a tag is autoclosed by one of its children.
doc : | |
elem : | |
Returns : |
int htmlAutoCloseTag (htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem);
The HTML DTD allows a tag to implicitly close other tags. The list is kept in htmlStartClose array. This function checks if the element or one of its children would autoclose the given tag.
doc : | |
name : | |
elem : | |
Returns : |
const htmlEntityDesc* htmlParseEntityRef (htmlParserCtxtPtr ctxt, xmlChar **str);
parse an HTML ENTITY reference
[68] EntityRef ::= '&' Name ';'
ctxt : | |
str : | |
Returns : |
int htmlParseCharRef (htmlParserCtxtPtr ctxt);
parse Reference declarations
[66] CharRef ::= '&#' [0-9]+ ';' |
'&#x' [0-9a-fA-F]+ ';'
ctxt : | |
Returns : |
void htmlParseElement (htmlParserCtxtPtr ctxt);
parse an HTML element, this is highly recursive
[39] element ::= EmptyElemTag | STag content ETag
[41] Attribute ::= Name Eq AttValue
ctxt : |
htmlParserCtxtPtr htmlCreateMemoryParserCtxt (const char *buffer, int size);
Create a parser context for an HTML in-memory document.
buffer : | |
size : | |
Returns : |
int htmlParseDocument (htmlParserCtxtPtr ctxt);
parse an HTML document (and build a tree if using the standard SAX interface).
ctxt : | |
Returns : |
htmlDocPtr htmlSAXParseDoc (xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData);
Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks to handle parse events. If sax is NULL, fallback to the default DOM behavior and return a tree.
cur : | |
encoding : | |
sax : | |
userData : | |
Returns : |
htmlDocPtr htmlParseDoc (xmlChar *cur, const char *encoding);
parse an HTML in-memory document and build a tree.
cur : | |
encoding : | |
Returns : |
htmlDocPtr htmlSAXParseFile (const char *filename, const char *encoding, htmlSAXHandlerPtr sax, void *userData);
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time. It uses the given SAX function block to handle the parsing callbacks. If sax is NULL, fallback to the default DOM tree building routines.
filename : | |
encoding : | |
sax : | |
userData : | |
Returns : |
htmlDocPtr htmlParseFile (const char *filename, const char *encoding);
parse an HTML file and build a tree. Automatic support for ZLIB/Compress compressed documents is provided by default if found at compile-time.
filename : | |
encoding : | |
Returns : |
int UTF8ToHtml (unsigned char *out, int *outlen, unsigned char *in, int *inlen);
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
out : | |
outlen : | |
in : | |
inlen : | |
Returns : |
int htmlEncodeEntities (unsigned char *out, int *outlen, unsigned char *in, int *inlen, int quoteChar);
Take a block of UTF-8 chars in and try to convert it to an ASCII plus HTML entities block of chars out.
out : | |
outlen : | |
in : | |
inlen : | |
quoteChar : | |
Returns : |
int htmlIsScriptAttribute (const xmlChar *name);
Check if an attribute is of content type Script
name : | |
Returns : |
int htmlHandleOmittedElem (int val);
Set and return the previous value for handling HTML omitted tags.
val : | |
Returns : |
void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
Free all the memory used by a parser context. However the parsed document in ctxt->myDoc is not freed.
ctxt : |
htmlParserCtxtPtr htmlCreatePushParserCtxt (htmlSAXHandlerPtr sax, void *user_data, const char *chunk, int size, const char *filename, xmlCharEncoding enc);
Create a parser context for using the HTML parser in push mode. The value of filename is used for fetching external entities and error/warning reports.
sax : | |
user_data : | |
chunk : | |
size : | |
filename : | |
enc : | |
Returns : |
int htmlParseChunk (htmlParserCtxtPtr ctxt, const char *chunk, int size, int terminate);
Parse a Chunk of memory
ctxt : | |
chunk : | |
size : | |
terminate : | |
Returns : |
<< xmlerror | HTMLtree >> |