From ea4b0baef2922ccf32e6b3914fc024f309dba9b8 Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Tue, 23 Aug 2005 16:06:08 +0000 Subject: [PATCH] added a recovery mode for the HTML parser based on the suggestions of bug * HTMLparser.c include/libxml/HTMLparser.h: added a recovery mode for the HTML parser based on the suggestions of bug #169834 by Paul Loberg Daniel --- ChangeLog | 6 +++++ HTMLparser.c | 51 ++++++++++++++++++++++++++++++------- include/libxml/HTMLparser.h | 1 + 3 files changed, 49 insertions(+), 9 deletions(-) diff --git a/ChangeLog b/ChangeLog index 42746c62..c3a9cb9b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +Tue Aug 23 18:04:08 CEST 2005 Daniel Veillard + + * HTMLparser.c include/libxml/HTMLparser.h: added a recovery mode + for the HTML parser based on the suggestions of bug #169834 by + Paul Loberg + Tue Aug 23 15:38:46 CEST 2005 Daniel Veillard * elfgcchack.h testapi.c doc/*: regenerated diff --git a/HTMLparser.c b/HTMLparser.c index 0ea0f9c6..fe36c89f 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -2651,15 +2651,34 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { cur = CUR_CHAR(l); continue; } else if ((cur == '<') && (NXT(1) == '/')) { - /* - * One should break here, the specification is clear: - * Authors should therefore escape "= 'A') && (NXT(2) <= 'Z')) || - ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) - break; /* while */ + /* + * One should break here, the specification is clear: + * Authors should therefore escape "recovery) { + if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, + xmlStrlen(ctxt->name)) == 0) + { + break; /* while */ + } else { + htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, + "Element %s embbeds close tag\n", + ctxt->name, NULL); + } + } else { + if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || + ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) + { + break; /* while */ + } + } } COPY_BUF(l,buf,nbchar,cur); if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { @@ -2676,6 +2695,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { NEXTL(l); cur = CUR_CHAR(l); } + if (!(IS_CHAR_CH(cur))) { htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, "Invalid char in CDATA 0x%X\n", cur); @@ -3580,6 +3600,15 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, "End tag : expected '>'\n", NULL, NULL); + if (ctxt->recovery) { + /* + * We're not at the ending > !! + * Error, unless in recover mode where we search forwards + * until we find a > + */ + while (CUR != '\0' && CUR != '>') NEXT; + NEXT; + } } else NEXT; @@ -5787,6 +5816,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) ctxt->options |= XML_PARSE_NOBLANKS; } else ctxt->keepBlanks = 1; + if (options & HTML_PARSE_RECOVER) { + ctxt->recovery = 1; + } else + ctxt->recovery = 0; ctxt->dictNames = 0; return (options); } diff --git a/include/libxml/HTMLparser.h b/include/libxml/HTMLparser.h index 2604d860..2c1e8d21 100644 --- a/include/libxml/HTMLparser.h +++ b/include/libxml/HTMLparser.h @@ -173,6 +173,7 @@ XMLPUBFUN void XMLCALL * to the xmlReadDoc() and similar calls. */ typedef enum { + HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */