mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-10-21 14:53:44 +03:00
added a recovery mode for the HTML parser based on the suggestions of bug #169834
* HTMLparser.c include/libxml/HTMLparser.h: added a recovery mode for the HTML parser based on the suggestions of bug #169834 by Paul Loberg. Daniel
This commit is contained in:
@@ -1,3 +1,9 @@
|
|||||||
|
Tue Aug 23 18:04:08 CEST 2005 Daniel Veillard <daniel@veillard.com>
|
||||||
|
|
||||||
|
* HTMLparser.c include/libxml/HTMLparser.h: added a recovery mode
|
||||||
|
for the HTML parser based on the suggestions of bug #169834 by
|
||||||
|
Paul Loberg
|
||||||
|
|
||||||
Tue Aug 23 15:38:46 CEST 2005 Daniel Veillard <daniel@veillard.com>
|
Tue Aug 23 15:38:46 CEST 2005 Daniel Veillard <daniel@veillard.com>
|
||||||
|
|
||||||
* elfgcchack.h testapi.c doc/*: regenerated
|
* elfgcchack.h testapi.c doc/*: regenerated
|
||||||
|
51
HTMLparser.c
51
HTMLparser.c
@@ -2651,15 +2651,34 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
|
|||||||
cur = CUR_CHAR(l);
|
cur = CUR_CHAR(l);
|
||||||
continue;
|
continue;
|
||||||
} else if ((cur == '<') && (NXT(1) == '/')) {
|
} else if ((cur == '<') && (NXT(1) == '/')) {
|
||||||
/*
|
/*
|
||||||
* One should break here, the specification is clear:
|
* One should break here, the specification is clear:
|
||||||
* Authors should therefore escape "</" within the content.
|
* Authors should therefore escape "</" within the content.
|
||||||
* Escape mechanisms are specific to each scripting or
|
* Escape mechanisms are specific to each scripting or
|
||||||
* style sheet language.
|
* style sheet language.
|
||||||
*/
|
*
|
||||||
if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
|
* In recovery mode, only break if end tag match the
|
||||||
((NXT(2) >= 'a') && (NXT(2) <= 'z')))
|
* current tag, effectively ignoring all tags inside the
|
||||||
break; /* while */
|
* script/style block and treating the entire block as
|
||||||
|
* CDATA.
|
||||||
|
*/
|
||||||
|
if (ctxt->recovery) {
|
||||||
|
if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
|
||||||
|
xmlStrlen(ctxt->name)) == 0)
|
||||||
|
{
|
||||||
|
break; /* while */
|
||||||
|
} else {
|
||||||
|
htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
|
||||||
|
"Element %s embbeds close tag\n",
|
||||||
|
ctxt->name, NULL);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
|
||||||
|
((NXT(2) >= 'a') && (NXT(2) <= 'z')))
|
||||||
|
{
|
||||||
|
break; /* while */
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
COPY_BUF(l,buf,nbchar,cur);
|
COPY_BUF(l,buf,nbchar,cur);
|
||||||
if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
|
if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
|
||||||
@@ -2676,6 +2695,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
|
|||||||
NEXTL(l);
|
NEXTL(l);
|
||||||
cur = CUR_CHAR(l);
|
cur = CUR_CHAR(l);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!(IS_CHAR_CH(cur))) {
|
if (!(IS_CHAR_CH(cur))) {
|
||||||
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
|
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
|
||||||
"Invalid char in CDATA 0x%X\n", cur);
|
"Invalid char in CDATA 0x%X\n", cur);
|
||||||
@@ -3580,6 +3600,15 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
|
|||||||
if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
|
if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
|
||||||
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
|
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
|
||||||
"End tag : expected '>'\n", NULL, NULL);
|
"End tag : expected '>'\n", NULL, NULL);
|
||||||
|
if (ctxt->recovery) {
|
||||||
|
/*
|
||||||
|
* We're not at the ending > !!
|
||||||
|
* Error, unless in recover mode where we search forwards
|
||||||
|
* until we find a >
|
||||||
|
*/
|
||||||
|
while (CUR != '\0' && CUR != '>') NEXT;
|
||||||
|
NEXT;
|
||||||
|
}
|
||||||
} else
|
} else
|
||||||
NEXT;
|
NEXT;
|
||||||
|
|
||||||
@@ -5787,6 +5816,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
|
|||||||
ctxt->options |= XML_PARSE_NOBLANKS;
|
ctxt->options |= XML_PARSE_NOBLANKS;
|
||||||
} else
|
} else
|
||||||
ctxt->keepBlanks = 1;
|
ctxt->keepBlanks = 1;
|
||||||
|
if (options & HTML_PARSE_RECOVER) {
|
||||||
|
ctxt->recovery = 1;
|
||||||
|
} else
|
||||||
|
ctxt->recovery = 0;
|
||||||
ctxt->dictNames = 0;
|
ctxt->dictNames = 0;
|
||||||
return (options);
|
return (options);
|
||||||
}
|
}
|
||||||
|
@@ -173,6 +173,7 @@ XMLPUBFUN void XMLCALL
|
|||||||
* to the xmlReadDoc() and similar calls.
|
* to the xmlReadDoc() and similar calls.
|
||||||
*/
|
*/
|
||||||
typedef enum {
|
typedef enum {
|
||||||
|
HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */
|
||||||
HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
|
HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
|
||||||
HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
|
HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
|
||||||
HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
|
HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */
|
||||||
|
Reference in New Issue
Block a user