/** * @file * * @brief HTML parser, doesn't support HTML5 * * This module orginally implemented an HTML parser based on the * (underspecified) HTML 4.0 spec. As of 2.14, the tokenizer * conforms to HTML5. Tree construction still follows a custom, * unspecified algorithm with many differences to HTML5. * * The parser defaults to ISO-8859-1, the default encoding of * HTTP/1.0. * * @copyright See Copyright for the status of this software. * * @author Daniel Veillard */ #ifndef __HTML_PARSER_H__ #define __HTML_PARSER_H__ #include #include #ifdef LIBXML_HTML_ENABLED #ifdef __cplusplus extern "C" { #endif /** @cond ignore */ /* * Backward compatibility */ #define UTF8ToHtml htmlUTF8ToHtml #define htmlDefaultSubelement(elt) elt->defaultsubelt #define htmlElementAllowedHereDesc(parent,elt) \ htmlElementAllowedHere((parent), (elt)->name) #define htmlRequiredAttrs(elt) (elt)->attrs_req /* * Most of the back-end structures from XML and HTML are shared. */ typedef xmlParserCtxt htmlParserCtxt; typedef xmlParserCtxtPtr htmlParserCtxtPtr; typedef xmlParserNodeInfo htmlParserNodeInfo; typedef xmlSAXHandler htmlSAXHandler; typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; typedef xmlParserInput htmlParserInput; typedef xmlParserInputPtr htmlParserInputPtr; typedef xmlDocPtr htmlDocPtr; typedef xmlNodePtr htmlNodePtr; /* * Internal description of an HTML element, representing HTML 4.01 * and XHTML 1.0 (which share the same structure). */ typedef struct _htmlElemDesc htmlElemDesc; typedef htmlElemDesc *htmlElemDescPtr; struct _htmlElemDesc { const char *name; /* The tag name */ char startTag; /* unused */ char endTag; /* Whether the end tag can be implied */ char saveEndTag; /* unused */ char empty; /* Is this an empty element ? */ char depr; /* unused */ char dtd; /* unused */ char isinline; /* is this a block 0 or inline 1 element */ const char *desc; /* the description */ const char** subelts XML_DEPRECATED_MEMBER; const char* defaultsubelt XML_DEPRECATED_MEMBER; const char** attrs_opt XML_DEPRECATED_MEMBER; const char** attrs_depr XML_DEPRECATED_MEMBER; const char** attrs_req XML_DEPRECATED_MEMBER; int dataMode; }; /* * Internal description of an HTML entity. */ typedef struct _htmlEntityDesc htmlEntityDesc; typedef htmlEntityDesc *htmlEntityDescPtr; struct _htmlEntityDesc { unsigned int value; /* the UNICODE value for the character */ const char *name; /* The entity name */ const char *desc; /* the description */ }; #ifdef LIBXML_SAX1_ENABLED XML_DEPRECATED XMLPUBVAR const xmlSAXHandlerV1 htmlDefaultSAXHandler; #endif /* LIBXML_SAX1_ENABLED */ /** @endcond */ /* * There is only few public functions. */ XML_DEPRECATED XMLPUBFUN void htmlInitAutoClose (void); XML_DEPRECATED XMLPUBFUN const htmlElemDesc * htmlTagLookup (const xmlChar *tag); XML_DEPRECATED XMLPUBFUN const htmlEntityDesc * htmlEntityLookup(const xmlChar *name); XML_DEPRECATED XMLPUBFUN const htmlEntityDesc * htmlEntityValueLookup(unsigned int value); XML_DEPRECATED XMLPUBFUN int htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem); XML_DEPRECATED XMLPUBFUN int htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem); XML_DEPRECATED XMLPUBFUN const htmlEntityDesc * htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str); XML_DEPRECATED XMLPUBFUN int htmlParseCharRef(htmlParserCtxtPtr ctxt); XML_DEPRECATED XMLPUBFUN void htmlParseElement(htmlParserCtxtPtr ctxt); XMLPUBFUN htmlParserCtxtPtr htmlNewParserCtxt(void); XMLPUBFUN htmlParserCtxtPtr htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData); XMLPUBFUN htmlParserCtxtPtr htmlCreateMemoryParserCtxt(const char *buffer, int size); XMLPUBFUN int htmlParseDocument(htmlParserCtxtPtr ctxt); XML_DEPRECATED XMLPUBFUN htmlDocPtr htmlSAXParseDoc (const xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData); XMLPUBFUN htmlDocPtr htmlParseDoc (const xmlChar *cur, const char *encoding); XMLPUBFUN htmlParserCtxtPtr htmlCreateFileParserCtxt(const char *filename, const char *encoding); XML_DEPRECATED XMLPUBFUN htmlDocPtr htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax, void *userData); XMLPUBFUN htmlDocPtr htmlParseFile (const char *filename, const char *encoding); XML_DEPRECATED XMLPUBFUN int htmlUTF8ToHtml (unsigned char *out, int *outlen, const unsigned char *in, int *inlen); XML_DEPRECATED XMLPUBFUN int htmlEncodeEntities(unsigned char *out, int *outlen, const unsigned char *in, int *inlen, int quoteChar); XML_DEPRECATED XMLPUBFUN int htmlIsScriptAttribute(const xmlChar *name); XML_DEPRECATED XMLPUBFUN int htmlHandleOmittedElem(int val); #ifdef LIBXML_PUSH_ENABLED /* * Interfaces for the Push mode. */ XMLPUBFUN htmlParserCtxtPtr htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, const char *chunk, int size, const char *filename, xmlCharEncoding enc); XMLPUBFUN int htmlParseChunk (htmlParserCtxtPtr ctxt, const char *chunk, int size, int terminate); #endif /* LIBXML_PUSH_ENABLED */ XMLPUBFUN void htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); /* * New set of simpler/more flexible APIs */ /** * This is the set of HTML parser options that can be passed to * htmlReadDoc(), htmlCtxtSetOptions() and other functions. */ typedef enum { /** * No effect as of 2.14.0. */ HTML_PARSE_RECOVER = 1<<0, /** * Do not default to a doctype if none was found. */ HTML_PARSE_NODEFDTD = 1<<2, /** * Disable error and warning reports to the error handlers. * Errors are still accessible with xmlCtxtGetLastError(). */ HTML_PARSE_NOERROR = 1<<5, /** * Disable warning reports. */ HTML_PARSE_NOWARNING = 1<<6, /** * No effect. */ HTML_PARSE_PEDANTIC = 1<<7, /** * Remove some text nodes containing only whitespace from the * result document. Which nodes are removed depends on a conservative * heuristic. The reindenting feature of the serialization code relies * on this option to be set when parsing. Use of this option is * DISCOURAGED. */ HTML_PARSE_NOBLANKS = 1<<8, /** * No effect. */ HTML_PARSE_NONET = 1<<11, /** * Do not add implied html, head or body elements. */ HTML_PARSE_NOIMPLIED = 1<<13, /** * Store small strings directly in the node struct to save * memory. */ HTML_PARSE_COMPACT = 1<<16, /** * Relax some internal limits. See XML_PARSE_HUGE in xmlParserOption. * * @since 2.14.0 * * Use XML_PARSE_HUGE with older versions. */ HTML_PARSE_HUGE = 1<<19, /** * Ignore the encoding in the HTML declaration. This option is * mostly unneeded these days. The only effect is to enforce * ISO-8859-1 decoding of ASCII-like data. */ HTML_PARSE_IGNORE_ENC =1<<21, /** * Enable reporting of line numbers larger than 65535. * * @since 2.14.0 * * Use XML_PARSE_BIG_LINES with older versions. */ HTML_PARSE_BIG_LINES = 1<<22, /** * Make the tokenizer emit a SAX callback for each token. This results * in unbalanced invocations of startElement and endElement. * * For now, this is only usable to tokenize HTML5 with custom SAX * callbacks. A tree builder isn't implemented yet. * * @since 2.14.0 */ HTML_PARSE_HTML5 = 1<<26 } htmlParserOption; XMLPUBFUN void htmlCtxtReset (htmlParserCtxtPtr ctxt); XMLPUBFUN int htmlCtxtSetOptions (htmlParserCtxtPtr ctxt, int options); XMLPUBFUN int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, int options); XMLPUBFUN htmlDocPtr htmlReadDoc (const xmlChar *cur, const char *URL, const char *encoding, int options); XMLPUBFUN htmlDocPtr htmlReadFile (const char *URL, const char *encoding, int options); XMLPUBFUN htmlDocPtr htmlReadMemory (const char *buffer, int size, const char *URL, const char *encoding, int options); XMLPUBFUN htmlDocPtr htmlReadFd (int fd, const char *URL, const char *encoding, int options); XMLPUBFUN htmlDocPtr htmlReadIO (xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void *ioctx, const char *URL, const char *encoding, int options); XMLPUBFUN htmlDocPtr htmlCtxtParseDocument (htmlParserCtxtPtr ctxt, xmlParserInputPtr input); XMLPUBFUN htmlDocPtr htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, const xmlChar *cur, const char *URL, const char *encoding, int options); XMLPUBFUN htmlDocPtr htmlCtxtReadFile (xmlParserCtxtPtr ctxt, const char *filename, const char *encoding, int options); XMLPUBFUN htmlDocPtr htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, const char *buffer, int size, const char *URL, const char *encoding, int options); XMLPUBFUN htmlDocPtr htmlCtxtReadFd (xmlParserCtxtPtr ctxt, int fd, const char *URL, const char *encoding, int options); XMLPUBFUN htmlDocPtr htmlCtxtReadIO (xmlParserCtxtPtr ctxt, xmlInputReadCallback ioread, xmlInputCloseCallback ioclose, void *ioctx, const char *URL, const char *encoding, int options); /** * deprecated content model */ typedef enum { HTML_NA = 0 , /* something we don't check at all */ HTML_INVALID = 0x1 , HTML_DEPRECATED = 0x2 , HTML_VALID = 0x4 , HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ } htmlStatus ; /* Using htmlElemDesc rather than name here, to emphasise the fact that otherwise there's a lookup overhead */ XML_DEPRECATED XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; XML_DEPRECATED XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; XML_DEPRECATED XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; XML_DEPRECATED XMLPUBFUN htmlStatus htmlNodeStatus(htmlNodePtr, int) ; #ifdef __cplusplus } #endif #endif /* LIBXML_HTML_ENABLED */ #endif /* __HTML_PARSER_H__ */