From 365e13be6c7faa99d2797b973e39b3491ab7c18a Mon Sep 17 00:00:00 2001
From: Daniel Veillard <veillard@src.gnome.org>
Date: Sun, 2 Jul 2000 07:56:37 +0000
Subject: [PATCH] Work on character encoding support for the HTML parser Fixed
 some autoopen/autoclose probs for the HTML parser Fixed a potential memleak
 in the encoding stuff Daniel.

---
 ChangeLog    |   6 +
 HTMLparser.c | 339 +++++++++++++++++++++++++++------------------------
 doc/xml.html |  14 +--
 encoding.c   |   2 +-
 4 files changed, 193 insertions(+), 168 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index 72686396..31d42a6b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+Sun Jul  2 09:52:45 MEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
+
+	* HTMLparser.c: Work on character encoding support for the HTML parser
+	* HTMLparser.c: Fixed some autoopen/autoclose probs for the HTML parser
+	* encoding.c: Fixed a potential memleak in the encoding stuff
+
 Sat Jul  1 13:44:22 MEST 2000 Daniel Veillard <Daniel.Veillard@w3.org>
 
 	* doc/FAQ.html doc/Makefile.am : added a FAQ
diff --git a/HTMLparser.c b/HTMLparser.c
index 375a038e..ff331488 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -479,17 +479,19 @@ htmlCheckAutoClose(const xmlChar *new, const xmlChar *old) {
 /**
  * htmlAutoClose:
  * @ctxt:  an HTML parser context
- * @new:  The new tag name
+ * @new:  The new tag name or NULL
  *
  * The HTmL DtD allows a tag to implicitely close other tags.
  * The list is kept in htmlStartClose array. This function is
  * called when a new tag has been detected and generates the
  * appropriates closes if possible/needed.
+ * If new is NULL this means we are at the end of the resource
+ * and any head, body or html element still open gets closed.
  */
 void
 htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
     xmlChar *oldname;
-    while ((ctxt->name != NULL) && 
+    while ((new != NULL) && (ctxt->name != NULL) && 
            (htmlCheckAutoClose(new, ctxt->name))) {
 #ifdef DEBUG
 	fprintf(stderr,"htmlAutoClose: %s closes %s\n", new, ctxt->name);
@@ -504,6 +506,24 @@ htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar *new) {
 	    xmlFree(oldname);
         }
     }
+    while ((new == NULL) && (ctxt->name != NULL) &&
+	   ((!xmlStrcmp(ctxt->name, BAD_CAST"head")) ||
+	    (!xmlStrcmp(ctxt->name, BAD_CAST"body")) ||
+	    (!xmlStrcmp(ctxt->name, BAD_CAST"html")))) {
+#ifdef DEBUG
+	fprintf(stderr,"htmlAutoClose: EOF closes %s\n", ctxt->name);
+#endif
+	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
+	    ctxt->sax->endElement(ctxt->userData, ctxt->name);
+	oldname = htmlnamePop(ctxt);
+	if (oldname != NULL) {
+#ifdef DEBUG
+	    fprintf(stderr,"htmlAutoClose: popped %s\n", oldname);
+#endif
+	    xmlFree(oldname);
+        }
+   }
+
 }
 
 /**
@@ -1086,140 +1106,6 @@ htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
     return(buffer);
 }
 
-
-/************************************************************************
- *									*
- *		Commodity functions to handle encodings			*
- *									*
- ************************************************************************/
-
-/**
- * htmlSwitchEncoding:
- * @ctxt:  the parser context
- * @len:  the len of @cur
- *
- * change the input functions when discovering the character encoding
- * of a given entity.
- *
- */
-void
-htmlSwitchEncoding(htmlParserCtxtPtr ctxt, xmlCharEncoding enc)
-{
-    switch (enc) {
-        case XML_CHAR_ENCODING_ERROR:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-	        ctxt->sax->error(ctxt->userData, "encoding unknown\n");
-	    ctxt->wellFormed = 0;
-            break;
-        case XML_CHAR_ENCODING_NONE:
-	    /* let's assume it's UTF-8 without the XML decl */
-            return;
-        case XML_CHAR_ENCODING_UTF8:
-	    /* default encoding, no conversion should be needed */
-            return;
-        case XML_CHAR_ENCODING_UTF16LE:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding UTF16 little endian not supported\n");
-            break;
-        case XML_CHAR_ENCODING_UTF16BE:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding UTF16 big endian not supported\n");
-            break;
-        case XML_CHAR_ENCODING_UCS4LE:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding USC4 little endian not supported\n");
-            break;
-        case XML_CHAR_ENCODING_UCS4BE:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding USC4 big endian not supported\n");
-            break;
-        case XML_CHAR_ENCODING_EBCDIC:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding EBCDIC not supported\n");
-            break;
-        case XML_CHAR_ENCODING_UCS4_2143:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding UCS4 2143 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_UCS4_3412:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding UCS4 3412 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_UCS2:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding UCS2 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_8859_1:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_1 ISO Latin 1 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_8859_2:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_2 ISO Latin 2 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_8859_3:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_3 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_8859_4:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_4 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_8859_5:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_5 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_8859_6:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_6 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_8859_7:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_7 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_8859_8:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_8 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_8859_9:
-	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-		  "char encoding ISO_8859_9 not supported\n");
-            break;
-        case XML_CHAR_ENCODING_2022_JP:
-            if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-                  "char encoding ISO-2022-JPnot supported\n");
-            break;
-        case XML_CHAR_ENCODING_SHIFT_JIS:
-            if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-                  "char encoding Shift_JISnot supported\n");
-            break;
-        case XML_CHAR_ENCODING_EUC_JP:
-            if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
-                ctxt->sax->error(ctxt->userData,
-                  "char encoding EUC-JPnot supported\n");
-            break;
-    }
-}
-
 /************************************************************************
  *									*
  *		Commodity functions to handle streams			*
@@ -2227,6 +2113,112 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
     return(name);
 }
 
+/**
+ * htmlCheckEncoding:
+ * @ctxt:  an HTML parser context
+ * @attvalue: the content attribute value of a Meta http-equiv tag
+ *
+ * Looks for a "charset=" (or "charset =") declaration in @attvalue.
+ * If one is found it is recorded as the input encoding and the parser
+ * is switched to decode it; when no decoder can be found
+ * ctxt->errNo is set to XML_ERR_UNSUPPORTED_ENCODING.
+ */
+void
+htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
+    const xmlChar *encoding;
+
+    if ((ctxt == NULL) || (attvalue == NULL))
+	return;
+#ifdef DEBUG
+    fprintf(stderr, "htmlCheckEncoding: \"%s\"\n", attvalue);
+#endif
+    encoding = xmlStrstr(attvalue, BAD_CAST"charset=");
+    if (encoding == NULL)
+	encoding = xmlStrstr(attvalue, BAD_CAST"Charset=");
+    if (encoding == NULL)
+	encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET=");
+    if (encoding != NULL) {
+	encoding += 8;
+    } else {
+	encoding = xmlStrstr(attvalue, BAD_CAST"charset =");
+	if (encoding == NULL)
+	    encoding = xmlStrstr(attvalue, BAD_CAST"Charset =");
+	if (encoding == NULL)
+	    encoding = xmlStrstr(attvalue, BAD_CAST"CHARSET =");
+	if (encoding != NULL)
+	    encoding += 9;
+    }
+    if (encoding != NULL) {
+	xmlCharEncoding enc;
+	xmlCharEncodingHandlerPtr handler;
+
+	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
+
+	if (ctxt->input->encoding != NULL)
+	    xmlFree((xmlChar *) ctxt->input->encoding);
+	ctxt->input->encoding = xmlStrdup(encoding);
+
+	enc = xmlParseCharEncoding((const char *) encoding);
+	/*
+	 * the name maps to one of the registered set of known encodings
+	 */
+	if (enc != XML_CHAR_ENCODING_ERROR) {
+	    xmlSwitchEncoding(ctxt, enc);
+	} else {
+	    /*
+	     * fallback for unknown encodings: look up a handler by name
+	     */
+	    handler = xmlFindCharEncodingHandler((const char *) encoding);
+	    if (handler != NULL) {
+		xmlSwitchToEncoding(ctxt, handler);
+	    } else {
+		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+	    }
+	}
+    }
+}
+
+/**
+ * htmlCheckMeta:
+ * @ctxt:  an HTML parser context
+ * @atts:  the Meta tag attributes, as name/value pairs ended by a NULL name
+ * Scans the attributes of a Meta tag: when http-equiv is Content-Type and
+ * a content attribute is present, its value goes to htmlCheckEncoding().
+ */
+void
+htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
+    int i;
+    const xmlChar *att, *value;
+    int http = 0;
+    const xmlChar *content = NULL;
+
+    if ((ctxt == NULL) || (atts == NULL))
+	return;
+    /* walk the array two entries at a time: attribute name, then value */
+    i = 0;
+    att = atts[i++];
+    while (att != NULL) {
+	value = atts[i++];
+	if ((value != NULL) &&
+	    ((!xmlStrcmp(att, BAD_CAST"http-equiv")) ||
+	     (!xmlStrcmp(att, BAD_CAST"Http-Equiv")) ||
+	     (!xmlStrcmp(att, BAD_CAST"HTTP-EQUIV"))) &&
+	    ((!xmlStrcmp(value, BAD_CAST"Content-Type")) ||
+	     (!xmlStrcmp(value, BAD_CAST"content-type")) ||
+	     (!xmlStrcmp(value, BAD_CAST"CONTENT-TYPE"))))
+	    http = 1;
+	else if ((value != NULL) &&
+		 ((!xmlStrcmp(att, BAD_CAST"content")) ||
+		  (!xmlStrcmp(att, BAD_CAST"Content")) ||
+		  (!xmlStrcmp(att, BAD_CAST"CONTENT"))))
+	    content = value;
+	att = atts[i++];
+    }
+    if ((http) && (content != NULL))
+	htmlCheckEncoding(ctxt, content);
+
+}
+
 /**
  * htmlParseStartTag:
  * @ctxt:  an HTML parser context
@@ -2254,6 +2246,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
     const xmlChar **atts = NULL;
     int nbatts = 0;
     int maxatts = 0;
+    int meta = 0;
     int i;
 
     if (CUR != '<') return;
@@ -2268,6 +2261,8 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 	ctxt->wellFormed = 0;
         return;
     }
+    if (!xmlStrcmp(name, BAD_CAST"meta"))
+	meta = 1;
 
     /*
      * Check for auto-closure of HTML elements.
@@ -2293,6 +2288,7 @@ htmlParseStartTag(htmlParserCtxtPtr ctxt) {
 	GROW;
 	attname = htmlParseAttribute(ctxt, &attvalue);
         if (attname != NULL) {
+
 	    /*
 	     * Well formedness requires at most one declaration of an attribute
 	     */
@@ -2349,6 +2345,12 @@ failed:
 	}
     }
 
+    /*
+     * Handle specific association to the META tag
+     */
+    if (meta)
+	htmlCheckMeta(ctxt, atts);
+
     /*
      * SAX: Start of Element !
      */
@@ -2587,6 +2589,13 @@ htmlParseContent(htmlParserCtxtPtr ctxt) {
 	    htmlParseReference(ctxt);
 	}
 
+	/*
+	 * Fourth : end of the resource
+	 */
+	else if (CUR == 0) {
+	    htmlAutoClose(ctxt, NULL);
+	}
+
 	/*
 	 * Last case, text. Note that References are handled directly.
 	 */
@@ -2851,6 +2860,13 @@ htmlParseDocument(htmlParserCtxtPtr ctxt) {
      */
     htmlParseContent(ctxt);
 
+    /*
+     * autoclose
+     */
+    if (CUR == 0)
+	htmlAutoClose(ctxt, NULL);
+
+
     /*
      * SAX: end of the document processing.
      */
@@ -2942,26 +2958,7 @@ htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
 void
 htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
 {
-    htmlParserInputPtr input;
-    xmlChar *oldname;
-
-    if (ctxt == NULL) return;
-
-    while ((input = inputPop(ctxt)) != NULL) {
-        xmlFreeInputStream(input);
-    }
-
-    if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
-    while ((oldname = htmlnamePop(ctxt)) != NULL) {
-	xmlFree(oldname);
-    }
-    if (ctxt->nameTab != NULL) xmlFree(ctxt->nameTab);
-    if (ctxt->directory != NULL) xmlFree(ctxt->directory);
-    if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
-    if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
-    if ((ctxt->sax != NULL) && (ctxt->sax != &htmlDefaultSAXHandler))
-        xmlFree(ctxt->sax);
-    xmlFree(ctxt);
+    xmlFreeParserCtxt(ctxt);
 }
 
 /**
@@ -3096,7 +3093,7 @@ int
 htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
     int ret = 0;
     htmlParserInputPtr in;
-    int avail;
+    int avail = 0;
     xmlChar cur, next;
 
 #ifdef DEBUG_PUSH
@@ -3142,6 +3139,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 	    avail = in->length - (in->cur - in->base);
 	else
 	    avail = in->buf->buffer->use - (in->cur - in->base);
+	if ((avail == 0) && (terminate)) {
+	    htmlAutoClose(ctxt, NULL);
+	    if (ctxt->nameNr == 0)
+		ctxt->instate = XML_PARSER_EOF;
+	}
         if (avail < 1)
 	    goto done;
         switch (ctxt->instate) {
@@ -3445,14 +3447,26 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 		 * Handle preparsed entities and charRef
 		 */
 		if (ctxt->token != 0) {
-		    xmlChar cur[2] = { 0 , 0 } ;
+		    xmlChar chr[2] = { 0 , 0 } ;
 
-		    cur[0] = (xmlChar) ctxt->token;
+		    chr[0] = (xmlChar) ctxt->token;
 		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
-			ctxt->sax->characters(ctxt->userData, cur, 1);
+			ctxt->sax->characters(ctxt->userData, chr, 1);
 		    ctxt->token = 0;
 		    ctxt->checkIndex = 0;
 		}
+		if ((avail == 1) && (terminate)) {
+		    cur = in->cur[0];
+		    if ((cur != '<') && (cur != '&')) {
+			if ((ctxt->sax != NULL) &&
+			    (ctxt->sax->characters != NULL))
+			ctxt->sax->characters(ctxt->userData, &cur, 1);
+			ctxt->token = 0;
+			ctxt->checkIndex = 0;
+			NEXT;
+		    }
+		    break;
+		}
 		if (avail < 2)
 		    goto done;
 		cur = in->cur[0];
@@ -3588,6 +3602,11 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
 	}
     }
 done:    
+    if ((avail == 0) && (terminate)) {
+	htmlAutoClose(ctxt, NULL);
+	if (ctxt->nameNr == 0)
+	    ctxt->instate = XML_PARSER_EOF;
+    }
 #ifdef DEBUG_PUSH
     fprintf(stderr, "HPP: done %d\n", ret);
 #endif
diff --git a/doc/xml.html b/doc/xml.html
index 84a6aaf1..02d9c98a 100644
--- a/doc/xml.html
+++ b/doc/xml.html
@@ -3,7 +3,7 @@
 <html>
 <head>
   <title>The XML library for Gnome</title>
-  <meta name="GENERATOR" content="amaya V3.1">
+  <meta name="GENERATOR" content="amaya V2.1">
   <meta http-equiv="Content-Type" content="text/html">
 </head>
 
@@ -75,9 +75,9 @@ building tag-based structured documents/data.</p>
 
 <p>There are some on-line resources about using libxml:</p>
 <ol>
-  <li>The code is commented in a way which allows <a
-    href="http://xmlsoft.org/libxml.html">extensive documentation</a> to be
-    automatically extracted.</li>
+  <li>Check the <a href="FAQ.html">FAQ</a></li>
+  <li>Check the <a href="http://xmlsoft.org/libxml.html">extensive
+    documentation</a> automatically extracted from code comments.</li>
   <li>This page provides a global overview and <a href="#real">some
     examples</a> on how to use libxml.</li>
   <li><a href="mailto:james@daa.com.au">James Henstridge</a> wrote <a
@@ -166,8 +166,8 @@ platform, get in touch with me to upload the package. I will keep them in the
     this base for my own development, so it's updated more regularly, but the
     content may not be as stable):</p>
     <pre>CVSROOT=:pserver:anonymous@dev.w3.org:/sources/public
-    password: anonymous
-    module: XML</pre>
+        password: anonymous
+        module: XML</pre>
   </li>
   <li><p>The <a
     href="http://cvs.gnome.org/bonsai/rview.cgi?cvsroot=/cvs/gnome&amp;dir=gnome-xml">Gnome
@@ -1180,6 +1180,6 @@ base under gnome-xml/example</p>
 
 <p><a href="mailto:Daniel.Veillard@w3.org">Daniel Veillard</a></p>
 
-<p>$Id: xml.html,v 1.34 2000/06/23 18:32:15 veillard Exp $</p>
+<p>$Id: xml.html,v 1.35 2000/06/30 17:58:22 veillard Exp $</p>
 </body>
 </html>
diff --git a/encoding.c b/encoding.c
index d1897991..dd367201 100644
--- a/encoding.c
+++ b/encoding.c
@@ -1167,7 +1167,7 @@ xmlFindCharEncodingHandler(const char *name) {
 	        iconv_close(icv_out);
 		return(NULL);
 	    }
-	    enc->name = NULL;
+	    enc->name = xmlMemStrdup(name);
 	    enc->input = NULL;
 	    enc->output = NULL;
 	    enc->iconv_in = icv_in;