mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-29 11:41:22 +03:00
html: Rework options
Introduce htmlCtxtSetOptions, see similar changes made to XML parser. Add HTML_PARSE_HUGE alias. Support HTML_PARSE_BIG_LINES.
This commit is contained in:
252
HTMLparser.c
252
HTMLparser.c
@ -2623,7 +2623,7 @@ static const xmlChar *
|
|||||||
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
|
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
|
||||||
int len = 0, l;
|
int len = 0, l;
|
||||||
int c;
|
int c;
|
||||||
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
|
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
|
||||||
XML_MAX_TEXT_LENGTH :
|
XML_MAX_TEXT_LENGTH :
|
||||||
XML_MAX_NAME_LENGTH;
|
XML_MAX_NAME_LENGTH;
|
||||||
const xmlChar *base = ctxt->input->base;
|
const xmlChar *base = ctxt->input->base;
|
||||||
@ -2796,7 +2796,7 @@ static xmlChar *
|
|||||||
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, int stop) {
|
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, int stop) {
|
||||||
xmlChar *buffer = NULL;
|
xmlChar *buffer = NULL;
|
||||||
int buffer_size = 0;
|
int buffer_size = 0;
|
||||||
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
|
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
|
||||||
XML_MAX_HUGE_LENGTH :
|
XML_MAX_HUGE_LENGTH :
|
||||||
XML_MAX_TEXT_LENGTH;
|
XML_MAX_TEXT_LENGTH;
|
||||||
xmlChar *out = NULL;
|
xmlChar *out = NULL;
|
||||||
@ -3162,7 +3162,7 @@ htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
|
|||||||
int len;
|
int len;
|
||||||
int size = HTML_PARSER_BUFFER_SIZE;
|
int size = HTML_PARSER_BUFFER_SIZE;
|
||||||
int cur, l;
|
int cur, l;
|
||||||
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
|
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
|
||||||
XML_MAX_HUGE_LENGTH :
|
XML_MAX_HUGE_LENGTH :
|
||||||
XML_MAX_TEXT_LENGTH;
|
XML_MAX_TEXT_LENGTH;
|
||||||
xmlParserInputState state;
|
xmlParserInputState state;
|
||||||
@ -3356,7 +3356,7 @@ htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
|
|||||||
int len;
|
int len;
|
||||||
int size = HTML_PARSER_BUFFER_SIZE;
|
int size = HTML_PARSER_BUFFER_SIZE;
|
||||||
int quote, cur, l;
|
int quote, cur, l;
|
||||||
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
|
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
|
||||||
XML_MAX_TEXT_LENGTH :
|
XML_MAX_TEXT_LENGTH :
|
||||||
XML_MAX_NAME_LENGTH;
|
XML_MAX_NAME_LENGTH;
|
||||||
|
|
||||||
@ -3424,7 +3424,7 @@ htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
|
|||||||
xmlChar *publicId = NULL;
|
xmlChar *publicId = NULL;
|
||||||
xmlChar *URI = NULL;
|
xmlChar *URI = NULL;
|
||||||
int nameCap, nameSize;
|
int nameCap, nameSize;
|
||||||
int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
|
int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
|
||||||
XML_MAX_TEXT_LENGTH :
|
XML_MAX_TEXT_LENGTH :
|
||||||
XML_MAX_NAME_LENGTH;
|
XML_MAX_NAME_LENGTH;
|
||||||
|
|
||||||
@ -5714,12 +5714,178 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)
|
|||||||
xmlResetError(&ctxt->lastError);
|
xmlResetError(&ctxt->lastError);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
|
||||||
|
{
|
||||||
|
int allMask;
|
||||||
|
|
||||||
|
if (ctxt == NULL)
|
||||||
|
return(-1);
|
||||||
|
|
||||||
|
allMask = HTML_PARSE_RECOVER |
|
||||||
|
HTML_PARSE_HTML5 |
|
||||||
|
HTML_PARSE_NODEFDTD |
|
||||||
|
HTML_PARSE_NOERROR |
|
||||||
|
HTML_PARSE_NOWARNING |
|
||||||
|
HTML_PARSE_PEDANTIC |
|
||||||
|
HTML_PARSE_NOBLANKS |
|
||||||
|
HTML_PARSE_NONET |
|
||||||
|
HTML_PARSE_NOIMPLIED |
|
||||||
|
HTML_PARSE_COMPACT |
|
||||||
|
HTML_PARSE_HUGE |
|
||||||
|
HTML_PARSE_IGNORE_ENC |
|
||||||
|
HTML_PARSE_BIG_LINES;
|
||||||
|
|
||||||
|
ctxt->options = (ctxt->options & keepMask) | (options & allMask);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For some options, struct members are historically the source
|
||||||
|
* of truth. See xmlCtxtSetOptionsInternal.
|
||||||
|
*/
|
||||||
|
ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Changing SAX callbacks is a bad idea. This should be fixed.
|
||||||
|
*/
|
||||||
|
if (options & HTML_PARSE_NOBLANKS) {
|
||||||
|
ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
|
||||||
|
}
|
||||||
|
if (options & HTML_PARSE_HUGE) {
|
||||||
|
if (ctxt->dict != NULL)
|
||||||
|
xmlDictSetLimit(ctxt->dict, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* It would be useful to allow this feature.
|
||||||
|
*/
|
||||||
|
ctxt->dictNames = 0;
|
||||||
|
|
||||||
|
ctxt->linenumbers = 1;
|
||||||
|
|
||||||
|
return(options & ~allMask);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* htmlCtxtSetOptions:
|
||||||
|
* @ctxt: an HTML parser context
|
||||||
|
* @options: a bitmask of xmlParserOption values
|
||||||
|
*
|
||||||
|
* Applies the options to the parser context. Unset options are
|
||||||
|
* cleared.
|
||||||
|
*
|
||||||
|
* Available since 2.14.0. With older versions, you can use
|
||||||
|
* htmlCtxtUseOptions.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_RECOVER
|
||||||
|
*
|
||||||
|
* No effect as of 2.14.0.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_HTML5
|
||||||
|
*
|
||||||
|
* Make the tokenizer emit a SAX callback for each token. This results
|
||||||
|
* in unbalanced invocations of startElement and endElement.
|
||||||
|
*
|
||||||
|
* For now, this is only usable with custom SAX callbacks.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_NODEFDTD
|
||||||
|
*
|
||||||
|
* Do not default to a doctype if none was found.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_NOERROR
|
||||||
|
*
|
||||||
|
* Disable error and warning reports to the error handlers.
|
||||||
|
* Errors are still accessible with xmlCtxtGetLastError.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_NOWARNING
|
||||||
|
*
|
||||||
|
* Disable warning reports.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_PEDANTIC
|
||||||
|
*
|
||||||
|
* No effect.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_NOBLANKS
|
||||||
|
*
|
||||||
|
* Remove some text nodes containing only whitespace from the
|
||||||
|
* result document. Which nodes are removed depends on a conservative
|
||||||
|
* heuristic. The reindenting feature of the serialization code relies
|
||||||
|
* on this option to be set when parsing. Use of this option is
|
||||||
|
* DISCOURAGED.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_NONET
|
||||||
|
*
|
||||||
|
* No effect.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_NOIMPLIED
|
||||||
|
*
|
||||||
|
* Do not add implied html, head or body elements.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_COMPACT
|
||||||
|
*
|
||||||
|
* Store small strings directly in the node struct to save
|
||||||
|
* memory.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_HUGE
|
||||||
|
*
|
||||||
|
* Relax some internal limits.
|
||||||
|
*
|
||||||
|
* Available since 2.14.0. Use XML_PARSE_HUGE works with older
|
||||||
|
* versions.
|
||||||
|
*
|
||||||
|
* Maximum size of text nodes, tags, comments, CDATA sections
|
||||||
|
*
|
||||||
|
* normal: 10M
|
||||||
|
* huge: 1B
|
||||||
|
*
|
||||||
|
* Maximum size of names, system literals, pubid literals
|
||||||
|
*
|
||||||
|
* normal: 50K
|
||||||
|
* huge: 10M
|
||||||
|
*
|
||||||
|
* Maximum nesting depth of elements
|
||||||
|
*
|
||||||
|
* normal: 256
|
||||||
|
* huge: 2048
|
||||||
|
*
|
||||||
|
* HTML_PARSE_IGNORE_ENC
|
||||||
|
*
|
||||||
|
* Ignore the encoding in the HTML declaration. This option is
|
||||||
|
* mostly unneeded these days. The only effect is to enforce
|
||||||
|
* UTF-8 decoding of ASCII-like data.
|
||||||
|
*
|
||||||
|
* HTML_PARSE_BIG_LINES
|
||||||
|
*
|
||||||
|
* Enable reporting of line numbers larger than 65535.
|
||||||
|
*
|
||||||
|
* Available since 2.14.0.
|
||||||
|
*
|
||||||
|
* Returns 0 in case of success, the set of unknown or unimplemented options
|
||||||
|
* in case of error.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
htmlCtxtSetOptions(xmlParserCtxtPtr ctxt, int options)
|
||||||
|
{
|
||||||
|
return(htmlCtxtSetOptionsInternal(ctxt, options, 0));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* htmlCtxtUseOptions:
|
* htmlCtxtUseOptions:
|
||||||
* @ctxt: an HTML parser context
|
* @ctxt: an HTML parser context
|
||||||
* @options: a combination of htmlParserOption(s)
|
* @options: a combination of htmlParserOption(s)
|
||||||
*
|
*
|
||||||
* Applies the options to the parser context
|
* DEPRECATED: Use htmlCtxtSetOptions.
|
||||||
|
*
|
||||||
|
* Applies the options to the parser context. The following options
|
||||||
|
* are never cleared and can only be enabled:
|
||||||
|
*
|
||||||
|
* HTML_PARSE_NODEFDTD
|
||||||
|
* HTML_PARSE_NOERROR
|
||||||
|
* HTML_PARSE_NOWARNING
|
||||||
|
* HTML_PARSE_NOIMPLIED
|
||||||
|
* HTML_PARSE_COMPACT
|
||||||
|
* HTML_PARSE_HUGE
|
||||||
|
* HTML_PARSE_IGNORE_ENC
|
||||||
|
* HTML_PARSE_BIG_LINES
|
||||||
*
|
*
|
||||||
* Returns 0 in case of success, the set of unknown or unimplemented options
|
* Returns 0 in case of success, the set of unknown or unimplemented options
|
||||||
* in case of error.
|
* in case of error.
|
||||||
@ -5727,67 +5893,21 @@ htmlCtxtReset(htmlParserCtxtPtr ctxt)
|
|||||||
int
|
int
|
||||||
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
|
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
|
||||||
{
|
{
|
||||||
if (ctxt == NULL)
|
int keepMask;
|
||||||
return(-1);
|
|
||||||
|
|
||||||
if (options & HTML_PARSE_NOWARNING) {
|
/*
|
||||||
ctxt->sax->warning = NULL;
|
* For historic reasons, some options can only be enabled.
|
||||||
ctxt->vctxt.warning = NULL;
|
*/
|
||||||
options -= XML_PARSE_NOWARNING;
|
keepMask = HTML_PARSE_NODEFDTD |
|
||||||
ctxt->options |= XML_PARSE_NOWARNING;
|
HTML_PARSE_NOERROR |
|
||||||
}
|
HTML_PARSE_NOWARNING |
|
||||||
if (options & HTML_PARSE_NOERROR) {
|
HTML_PARSE_NOIMPLIED |
|
||||||
ctxt->sax->error = NULL;
|
HTML_PARSE_COMPACT |
|
||||||
ctxt->vctxt.error = NULL;
|
HTML_PARSE_HUGE |
|
||||||
ctxt->sax->fatalError = NULL;
|
HTML_PARSE_IGNORE_ENC |
|
||||||
options -= XML_PARSE_NOERROR;
|
HTML_PARSE_BIG_LINES;
|
||||||
ctxt->options |= XML_PARSE_NOERROR;
|
|
||||||
}
|
return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask));
|
||||||
if (options & HTML_PARSE_PEDANTIC) {
|
|
||||||
ctxt->pedantic = 1;
|
|
||||||
options -= XML_PARSE_PEDANTIC;
|
|
||||||
ctxt->options |= XML_PARSE_PEDANTIC;
|
|
||||||
} else
|
|
||||||
ctxt->pedantic = 0;
|
|
||||||
if (options & XML_PARSE_NOBLANKS) {
|
|
||||||
ctxt->keepBlanks = 0;
|
|
||||||
ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
|
|
||||||
options -= XML_PARSE_NOBLANKS;
|
|
||||||
ctxt->options |= XML_PARSE_NOBLANKS;
|
|
||||||
} else
|
|
||||||
ctxt->keepBlanks = 1;
|
|
||||||
if (options & HTML_PARSE_RECOVER) {
|
|
||||||
ctxt->recovery = 1;
|
|
||||||
options -= HTML_PARSE_RECOVER;
|
|
||||||
} else
|
|
||||||
ctxt->recovery = 0;
|
|
||||||
if (options & HTML_PARSE_COMPACT) {
|
|
||||||
ctxt->options |= HTML_PARSE_COMPACT;
|
|
||||||
options -= HTML_PARSE_COMPACT;
|
|
||||||
}
|
|
||||||
if (options & XML_PARSE_HUGE) {
|
|
||||||
ctxt->options |= XML_PARSE_HUGE;
|
|
||||||
options -= XML_PARSE_HUGE;
|
|
||||||
}
|
|
||||||
if (options & HTML_PARSE_NODEFDTD) {
|
|
||||||
ctxt->options |= HTML_PARSE_NODEFDTD;
|
|
||||||
options -= HTML_PARSE_NODEFDTD;
|
|
||||||
}
|
|
||||||
if (options & HTML_PARSE_IGNORE_ENC) {
|
|
||||||
ctxt->options |= HTML_PARSE_IGNORE_ENC;
|
|
||||||
options -= HTML_PARSE_IGNORE_ENC;
|
|
||||||
}
|
|
||||||
if (options & HTML_PARSE_NOIMPLIED) {
|
|
||||||
ctxt->options |= HTML_PARSE_NOIMPLIED;
|
|
||||||
options -= HTML_PARSE_NOIMPLIED;
|
|
||||||
}
|
|
||||||
if (options & HTML_PARSE_HTML5) {
|
|
||||||
ctxt->options |= HTML_PARSE_HTML5;
|
|
||||||
options -= HTML_PARSE_HTML5;
|
|
||||||
}
|
|
||||||
ctxt->dictNames = 0;
|
|
||||||
ctxt->linenumbers = 1;
|
|
||||||
return (options);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -205,21 +205,26 @@ XMLPUBFUN void
|
|||||||
* to the xmlReadDoc() and similar calls.
|
* to the xmlReadDoc() and similar calls.
|
||||||
*/
|
*/
|
||||||
typedef enum {
|
typedef enum {
|
||||||
HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */
|
HTML_PARSE_RECOVER = 1<<0, /* No effect */
|
||||||
HTML_PARSE_HTML5 = 1<<1, /* HTML5 support */
|
HTML_PARSE_HTML5 = 1<<1, /* HTML5 support */
|
||||||
HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */
|
HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */
|
||||||
HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
|
HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
|
||||||
HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
|
HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
|
||||||
HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
|
HTML_PARSE_PEDANTIC = 1<<7, /* No effect */
|
||||||
HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
|
HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */
|
||||||
HTML_PARSE_NONET = 1<<11,/* Forbid network access */
|
HTML_PARSE_NONET = 1<<11,/* No effect */
|
||||||
HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */
|
HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */
|
||||||
HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */
|
HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */
|
||||||
HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */
|
HTML_PARSE_HUGE = 1<<19,/* relax any hardcoded limit from the parser */
|
||||||
|
HTML_PARSE_IGNORE_ENC=1<<21,/* ignore internal document encoding hint */
|
||||||
|
HTML_PARSE_BIG_LINES= 1<<22 /* Store big lines numbers in text PSVI field */
|
||||||
} htmlParserOption;
|
} htmlParserOption;
|
||||||
|
|
||||||
XMLPUBFUN void
|
XMLPUBFUN void
|
||||||
htmlCtxtReset (htmlParserCtxtPtr ctxt);
|
htmlCtxtReset (htmlParserCtxtPtr ctxt);
|
||||||
|
XMLPUBFUN int
|
||||||
|
htmlCtxtSetOptions (htmlParserCtxtPtr ctxt,
|
||||||
|
int options);
|
||||||
XMLPUBFUN int
|
XMLPUBFUN int
|
||||||
htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
|
htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
|
||||||
int options);
|
int options);
|
||||||
|
Reference in New Issue
Block a user