1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-10-21 14:53:44 +03:00

Added a recovery mode for the HTML parser based on the suggestions of bug #169834

* HTMLparser.c include/libxml/HTMLparser.h: added a recovery mode
  for the HTML parser based on the suggestions of bug #169834 by
  Paul Loberg
Daniel
This commit is contained in:
Daniel Veillard
2005-08-23 16:06:08 +00:00
parent 17cccb5e01
commit ea4b0baef2
3 changed files with 49 additions and 9 deletions

View File

@@ -1,3 +1,9 @@
Tue Aug 23 18:04:08 CEST 2005 Daniel Veillard <daniel@veillard.com>
* HTMLparser.c include/libxml/HTMLparser.h: added a recovery mode
for the HTML parser based on the suggestions of bug #169834 by
Paul Loberg
Tue Aug 23 15:38:46 CEST 2005 Daniel Veillard <daniel@veillard.com>
* elfgcchack.h testapi.c doc/*: regenerated

View File

@@ -2651,15 +2651,34 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
cur = CUR_CHAR(l); cur = CUR_CHAR(l);
continue; continue;
} else if ((cur == '<') && (NXT(1) == '/')) { } else if ((cur == '<') && (NXT(1) == '/')) {
/* /*
* One should break here, the specification is clear: * One should break here, the specification is clear:
* Authors should therefore escape "</" within the content. * Authors should therefore escape "</" within the content.
* Escape mechanisms are specific to each scripting or * Escape mechanisms are specific to each scripting or
* style sheet language. * style sheet language.
*/ *
if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || * In recovery mode, only break if end tag match the
((NXT(2) >= 'a') && (NXT(2) <= 'z'))) * current tag, effectively ignoring all tags inside the
break; /* while */ * script/style block and treating the entire block as
* CDATA.
*/
if (ctxt->recovery) {
if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
xmlStrlen(ctxt->name)) == 0)
{
break; /* while */
} else {
htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
"Element %s embbeds close tag\n",
ctxt->name, NULL);
}
} else {
if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
((NXT(2) >= 'a') && (NXT(2) <= 'z')))
{
break; /* while */
}
}
} }
COPY_BUF(l,buf,nbchar,cur); COPY_BUF(l,buf,nbchar,cur);
if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
@@ -2676,6 +2695,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
NEXTL(l); NEXTL(l);
cur = CUR_CHAR(l); cur = CUR_CHAR(l);
} }
if (!(IS_CHAR_CH(cur))) { if (!(IS_CHAR_CH(cur))) {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Invalid char in CDATA 0x%X\n", cur); "Invalid char in CDATA 0x%X\n", cur);
@@ -3580,6 +3600,15 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
"End tag : expected '>'\n", NULL, NULL); "End tag : expected '>'\n", NULL, NULL);
if (ctxt->recovery) {
/*
 * We are not at the closing '>' of the end tag; the error has
 * been reported above. In recovery mode, skip forward until a
 * '>' (or the end of input) is reached, then advance once more.
 */
while (CUR != '\0' && CUR != '>') NEXT;
NEXT;
}
} else } else
NEXT; NEXT;
@@ -5787,6 +5816,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
ctxt->options |= XML_PARSE_NOBLANKS; ctxt->options |= XML_PARSE_NOBLANKS;
} else } else
ctxt->keepBlanks = 1; ctxt->keepBlanks = 1;
if (options & HTML_PARSE_RECOVER) {
ctxt->recovery = 1;
} else
ctxt->recovery = 0;
ctxt->dictNames = 0; ctxt->dictNames = 0;
return (options); return (options);
} }

View File

@@ -173,6 +173,7 @@ XMLPUBFUN void XMLCALL
* to the xmlReadDoc() and similar calls. * to the xmlReadDoc() and similar calls.
*/ */
typedef enum { typedef enum {
HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */
HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */
HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */
HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */