From ea4b0baef2922ccf32e6b3914fc024f309dba9b8 Mon Sep 17 00:00:00 2001
From: Daniel Veillard <veillard@src.gnome.org>
Date: Tue, 23 Aug 2005 16:06:08 +0000
Subject: [PATCH] added a recovery mode for the HTML parser based on the
 suggestions of bug

* HTMLparser.c include/libxml/HTMLparser.h: added a recovery mode
  for the HTML parser based on the suggestions of bug #169834 by
  Paul Loberg
Daniel
---
 ChangeLog                   |  6 +++++
 HTMLparser.c                | 51 ++++++++++++++++++++++++++++++-------
 include/libxml/HTMLparser.h |  1 +
 3 files changed, 49 insertions(+), 9 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 42746c62..c3a9cb9b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+Tue Aug 23 18:04:08 CEST 2005 Daniel Veillard <daniel@veillard.com>
+
+	* HTMLparser.c include/libxml/HTMLparser.h: added a recovery mode
+	  for the HTML parser based on the suggestions of bug #169834 by
+	  Paul Loberg
+
 Tue Aug 23 15:38:46 CEST 2005 Daniel Veillard <daniel@veillard.com>
 
 	* elfgcchack.h testapi.c doc/*: regenerated
diff --git a/HTMLparser.c b/HTMLparser.c
index 0ea0f9c6..fe36c89f 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -2651,15 +2651,34 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
 	    cur = CUR_CHAR(l);
 	    continue;
 	} else if ((cur == '<') && (NXT(1) == '/')) {
-	    /*
-	     * One should break here, the specification is clear:
-	     * Authors should therefore escape "</" within the content.
-	     * Escape mechanisms are specific to each scripting or
-	     * style sheet language.
-	     */
-	    if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
-	        ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
-		break; /* while */
+            /*
+             * One should break here, the specification is clear:
+             * Authors should therefore escape "</" within the content.
+             * Escape mechanisms are specific to each scripting or
+             * style sheet language.
+             *
+             * In recovery mode, only break if the end tag matches the
+             * current tag, effectively ignoring all tags inside the
+             * script/style block and treating the entire block as
+             * CDATA.
+             */
+            if (ctxt->recovery) {
+                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, 
+				   xmlStrlen(ctxt->name)) == 0) 
+                {
+                    break; /* while */
+                } else {
+		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
+				 "Element %s embbeds close tag\n",
+		                 ctxt->name, NULL);
+		}
+            } else {
+                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
+                    ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) 
+                {
+                    break; /* while */
+                }
+            }
 	}
 	COPY_BUF(l,buf,nbchar,cur);
 	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
@@ -2676,6 +2695,7 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
 	NEXTL(l);
 	cur = CUR_CHAR(l);
     }
+
     if (!(IS_CHAR_CH(cur))) {
 	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
 	                "Invalid char in CDATA 0x%X\n", cur);
@@ -3580,6 +3600,15 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
     if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
         htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
 	             "End tag : expected '>'\n", NULL, NULL);
+	if (ctxt->recovery) {
+	    /*
+	     * We're not at the closing '>'. The error was reported
+	     * above; in recovery mode we also skip forward until we
+	     * find a '>' or reach the end of the input.
+	     */
+	    while (CUR != '\0' && CUR != '>') NEXT;
+	    NEXT;
+	}
     } else
         NEXT;
 
@@ -5787,6 +5816,10 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
 	ctxt->options |= XML_PARSE_NOBLANKS;
     } else
         ctxt->keepBlanks = 1;
+    if (options & HTML_PARSE_RECOVER) {
+        ctxt->recovery = 1;
+    } else
+        ctxt->recovery = 0;
     ctxt->dictNames = 0;
     return (options);
 }
diff --git a/include/libxml/HTMLparser.h b/include/libxml/HTMLparser.h
index 2604d860..2c1e8d21 100644
--- a/include/libxml/HTMLparser.h
+++ b/include/libxml/HTMLparser.h
@@ -173,6 +173,7 @@ XMLPUBFUN void XMLCALL
  * to the xmlReadDoc() and similar calls.
  */
 typedef enum {
+    HTML_PARSE_RECOVER  = 1<<0, /* Relaxed parsing */
     HTML_PARSE_NOERROR	= 1<<5,	/* suppress error reports */
     HTML_PARSE_NOWARNING= 1<<6,	/* suppress warning reports */
     HTML_PARSE_PEDANTIC	= 1<<7,	/* pedantic error reporting */