diff --git a/HTMLparser.c b/HTMLparser.c index 132c7d9f..c332f610 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -2623,7 +2623,7 @@ static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt) { int len = 0, l; int c; - int maxLength = (ctxt->options & XML_PARSE_HUGE) ? + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? XML_MAX_TEXT_LENGTH : XML_MAX_NAME_LENGTH; const xmlChar *base = ctxt->input->base; @@ -2796,7 +2796,7 @@ static xmlChar * htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, int stop) { xmlChar *buffer = NULL; int buffer_size = 0; - int maxLength = (ctxt->options & XML_PARSE_HUGE) ? + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? XML_MAX_HUGE_LENGTH : XML_MAX_TEXT_LENGTH; xmlChar *out = NULL; @@ -3162,7 +3162,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) { int len; int size = HTML_PARSER_BUFFER_SIZE; int cur, l; - int maxLength = (ctxt->options & XML_PARSE_HUGE) ? + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? XML_MAX_HUGE_LENGTH : XML_MAX_TEXT_LENGTH; xmlParserInputState state; @@ -3356,7 +3356,7 @@ htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) { int len; int size = HTML_PARSER_BUFFER_SIZE; int quote, cur, l; - int maxLength = (ctxt->options & XML_PARSE_HUGE) ? + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? XML_MAX_TEXT_LENGTH : XML_MAX_NAME_LENGTH; @@ -3424,7 +3424,7 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) { xmlChar *publicId = NULL; xmlChar *URI = NULL; int nameCap, nameSize; - int maxLength = (ctxt->options & XML_PARSE_HUGE) ? + int maxLength = (ctxt->options & HTML_PARSE_HUGE) ? 
XML_MAX_TEXT_LENGTH : XML_MAX_NAME_LENGTH; @@ -5714,12 +5714,178 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) xmlResetError(&ctxt->lastError); } +static int +htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask) +{ + int allMask; + + if (ctxt == NULL) + return(-1); + + allMask = HTML_PARSE_RECOVER | + HTML_PARSE_HTML5 | + HTML_PARSE_NODEFDTD | + HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING | + HTML_PARSE_PEDANTIC | + HTML_PARSE_NOBLANKS | + HTML_PARSE_NONET | + HTML_PARSE_NOIMPLIED | + HTML_PARSE_COMPACT | + HTML_PARSE_HUGE | + HTML_PARSE_IGNORE_ENC | + HTML_PARSE_BIG_LINES; + + ctxt->options = (ctxt->options & keepMask) | (options & allMask); + + /* + * For some options, struct members are historically the source + * of truth. See xmlCtxtSetOptionsInternal. + */ + ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1; + + /* + * Changing SAX callbacks is a bad idea. This should be fixed. + */ + if (options & HTML_PARSE_NOBLANKS) { + ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; + } + if (options & HTML_PARSE_HUGE) { + if (ctxt->dict != NULL) + xmlDictSetLimit(ctxt->dict, 0); + } + + /* + * dictNames is always disabled; it would be useful to allow this feature. + */ + ctxt->dictNames = 0; + + ctxt->linenumbers = 1; + + return(options & ~allMask); +} + +/** + * htmlCtxtSetOptions: + * @ctxt: an HTML parser context + * @options: a bitmask of htmlParserOption values + * + * Applies the options to the parser context. Unset options are + * cleared. + * + * Available since 2.14.0. With older versions, you can use + * htmlCtxtUseOptions. + * + * HTML_PARSE_RECOVER + * + * No effect as of 2.14.0. + * + * HTML_PARSE_HTML5 + * + * Make the tokenizer emit a SAX callback for each token. This results + * in unbalanced invocations of startElement and endElement. + * + * For now, this is only usable with custom SAX callbacks. + * + * HTML_PARSE_NODEFDTD + * + * Do not default to a doctype if none was found. 
+ * + * HTML_PARSE_NOERROR + * + * Disable error and warning reports to the error handlers. + * Errors are still accessible with xmlCtxtGetLastError. + * + * HTML_PARSE_NOWARNING + * + * Disable warning reports. + * + * HTML_PARSE_PEDANTIC + * + * No effect. + * + * HTML_PARSE_NOBLANKS + * + * Remove some text nodes containing only whitespace from the + * result document. Which nodes are removed depends on a conservative + * heuristic. The reindenting feature of the serialization code relies + * on this option to be set when parsing. Use of this option is + * DISCOURAGED. + * + * HTML_PARSE_NONET + * + * No effect. + * + * HTML_PARSE_NOIMPLIED + * + * Do not add implied html, head or body elements. + * + * HTML_PARSE_COMPACT + * + * Store small strings directly in the node struct to save + * memory. + * + * HTML_PARSE_HUGE + * + * Relax some internal limits. + * + * Available since 2.14.0. XML_PARSE_HUGE works with older + * versions. + * + * Maximum size of text nodes, tags, comments, CDATA sections + * + * normal: 10M + * huge: 1B + * + * Maximum size of names, system literals, pubid literals + * + * normal: 50K + * huge: 10M + * + * Maximum nesting depth of elements + * + * normal: 256 + * huge: 2048 + * + * HTML_PARSE_IGNORE_ENC + * + * Ignore the encoding in the HTML declaration. This option is + * mostly unneeded these days. The only effect is to enforce + * UTF-8 decoding of ASCII-like data. + * + * HTML_PARSE_BIG_LINES + * + * Enable reporting of line numbers larger than 65535. + * + * Available since 2.14.0. + * + * Returns 0 in case of success, the set of unknown or unimplemented options + * in case of error, or -1 if @ctxt is NULL. + */ +int +htmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options) +{ + return(htmlCtxtSetOptionsInternal(ctxt, options, 0)); +} + /** * htmlCtxtUseOptions: * @ctxt: an HTML parser context * @options: a combination of htmlParserOption(s) * - * Applies the options to the parser context + * DEPRECATED: Use htmlCtxtSetOptions. 
+ * + * Applies the options to the parser context. The following options + * are never cleared and can only be enabled: + * + * HTML_PARSE_NODEFDTD + * HTML_PARSE_NOERROR + * HTML_PARSE_NOWARNING + * HTML_PARSE_NOIMPLIED + * HTML_PARSE_COMPACT + * HTML_PARSE_HUGE + * HTML_PARSE_IGNORE_ENC + * HTML_PARSE_BIG_LINES * * Returns 0 in case of success, the set of unknown or unimplemented options * in case of error. @@ -5727,67 +5893,21 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt) int htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) { - if (ctxt == NULL) - return(-1); + int keepMask; - if (options & HTML_PARSE_NOWARNING) { - ctxt->sax->warning = NULL; - ctxt->vctxt.warning = NULL; - options -= XML_PARSE_NOWARNING; - ctxt->options |= XML_PARSE_NOWARNING; - } - if (options & HTML_PARSE_NOERROR) { - ctxt->sax->error = NULL; - ctxt->vctxt.error = NULL; - ctxt->sax->fatalError = NULL; - options -= XML_PARSE_NOERROR; - ctxt->options |= XML_PARSE_NOERROR; - } - if (options & HTML_PARSE_PEDANTIC) { - ctxt->pedantic = 1; - options -= XML_PARSE_PEDANTIC; - ctxt->options |= XML_PARSE_PEDANTIC; - } else - ctxt->pedantic = 0; - if (options & XML_PARSE_NOBLANKS) { - ctxt->keepBlanks = 0; - ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; - options -= XML_PARSE_NOBLANKS; - ctxt->options |= XML_PARSE_NOBLANKS; - } else - ctxt->keepBlanks = 1; - if (options & HTML_PARSE_RECOVER) { - ctxt->recovery = 1; - options -= HTML_PARSE_RECOVER; - } else - ctxt->recovery = 0; - if (options & HTML_PARSE_COMPACT) { - ctxt->options |= HTML_PARSE_COMPACT; - options -= HTML_PARSE_COMPACT; - } - if (options & XML_PARSE_HUGE) { - ctxt->options |= XML_PARSE_HUGE; - options -= XML_PARSE_HUGE; - } - if (options & HTML_PARSE_NODEFDTD) { - ctxt->options |= HTML_PARSE_NODEFDTD; - options -= HTML_PARSE_NODEFDTD; - } - if (options & HTML_PARSE_IGNORE_ENC) { - ctxt->options |= HTML_PARSE_IGNORE_ENC; - options -= HTML_PARSE_IGNORE_ENC; - } - if (options & HTML_PARSE_NOIMPLIED) { - ctxt->options |= 
HTML_PARSE_NOIMPLIED; - options -= HTML_PARSE_NOIMPLIED; - } - if (options & HTML_PARSE_HTML5) { - ctxt->options |= HTML_PARSE_HTML5; - options -= HTML_PARSE_HTML5; - } - ctxt->dictNames = 0; - ctxt->linenumbers = 1; - return (options); + /* + * For historic reasons, some options can only be enabled. + */ + keepMask = HTML_PARSE_NODEFDTD | + HTML_PARSE_NOERROR | + HTML_PARSE_NOWARNING | + HTML_PARSE_NOIMPLIED | + HTML_PARSE_COMPACT | + HTML_PARSE_HUGE | + HTML_PARSE_IGNORE_ENC | + HTML_PARSE_BIG_LINES; + + return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask)); } /** diff --git a/include/libxml/HTMLparser.h b/include/libxml/HTMLparser.h index 4e73e5ef..f303f2d4 100644 --- a/include/libxml/HTMLparser.h +++ b/include/libxml/HTMLparser.h @@ -205,21 +205,26 @@ XMLPUBFUN void * to the xmlReadDoc() and similar calls. */ typedef enum { - HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ + HTML_PARSE_RECOVER = 1<<0, /* No effect */ HTML_PARSE_HTML5 = 1<<1, /* HTML5 support */ HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */ HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ - HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ + HTML_PARSE_PEDANTIC = 1<<7, /* No effect */ HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ - HTML_PARSE_NONET = 1<<11,/* Forbid network access */ + HTML_PARSE_NONET = 1<<11,/* No effect */ HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... 
elements */ HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */ - HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */ + HTML_PARSE_HUGE = 1<<19,/* relax any hardcoded limit from the parser */ + HTML_PARSE_IGNORE_ENC=1<<21,/* ignore internal document encoding hint */ + HTML_PARSE_BIG_LINES= 1<<22 /* Store big line numbers in text PSVI field */ } htmlParserOption; XMLPUBFUN void htmlCtxtReset (htmlParserCtxtPtr ctxt); +XMLPUBFUN int + htmlCtxtSetOptions (htmlParserCtxtPtr ctxt, + int options); XMLPUBFUN int htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, int options);