From 4a557d97bfff5497500a6e707f7892cc4c092153 Mon Sep 17 00:00:00 2001
From: "William M. Brack" <wbrack@src.gnome.org>
Date: Tue, 29 Jul 2003 04:28:04 +0000
Subject: [PATCH] fixed problem with comments reported by Nick Kew; added
 routines

* HTMLparser.c: fixed problem with comments reported by Nick Kew
* encoding.c: added routines xmlUTF8Size and xmlUTF8Charcmp for
  some future cleanup of UTF8 handling
---
 ChangeLog                 |  6 +++++
 HTMLparser.c              |  5 +++--
 encoding.c                | 47 +++++++++++++++++++++++++++++++++++++++
 include/libxml/encoding.h |  6 ++---
 4 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index fe52aad6..ea4e735b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+Tue Jul 29 12:28:17 HKT 2003 William Brack <wbrack@mmm.com.hk>
+
+	* HTMLparser.c: fixed problem with comments reported by Nick Kew
+	* encoding.c: added routines xmlUTF8Size and xmlUTF8Charcmp for
+	  some future cleanup of UTF8 handling
+
 Mon Jul 28 16:39:14 EDT 2003 Daniel Veillard <daniel@veillard.com>
 
 	* xpath.c: applied a change suggested by Sean Griffin in bug
diff --git a/HTMLparser.c b/HTMLparser.c
index e7dcb344..2ee458c6 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -4358,10 +4358,11 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
 		(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
 		incomment = 1;
 	    }
-	    /* do not increment base, some people use <!--> */
+	    /* do not increment past <!, some people use <!--> */
+	    base += 2;
 	}
 	if (incomment) {
-	    if (base + 3 < len)
+	    if (base + 3 > len)
 		return(-1);
 	    if ((buf[base] == '-') && (buf[base + 1] == '-') &&
 		(buf[base + 2] == '>')) {
diff --git a/encoding.c b/encoding.c
index 8d43f45d..5cefd39c 100644
--- a/encoding.c
+++ b/encoding.c
@@ -84,6 +84,53 @@ static int xmlLittleEndian = 1;
  *									*
  ************************************************************************/
 
+/**
+ * xmlUTF8Size:
+ * @utf: pointer to the first byte of a UTF-8 encoded character
+ *
+ * Derives the byte length of the character from its lead byte alone.
+ *
+ * Returns the number of bytes in the character, -1 on format error
+ */
+int
+xmlUTF8Size(const xmlChar *utf) {
+    xmlChar lead, bit;
+    int size = 2;
+
+    if (utf == NULL)
+        return(-1);
+    lead = *utf;
+    /* plain ASCII occupies a single byte */
+    if (lead < 0x80)
+        return(1);
+    /* a lead byte must have its two top bits set */
+    if ((lead & 0x40) == 0)
+        return(-1);
+    /* each further leading 1 bit adds one byte; the first 0 bit ends it */
+    for (bit = 0x20; (bit != 0) && (lead & bit); bit >>= 1)
+        size++;
+    return((bit != 0) ? size : -1);
+}
+
+/**
+ * xmlUTF8Charcmp:
+ * @utf1: pointer to first UTF8 char
+ * @utf2: pointer to second UTF8 char
+ *
+ * Compares the first xmlUTF8Size(@utf1) bytes of the two characters.
+ * Returns the result of the comparison, as with xmlStrncmp
+ */
+int
+xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
+    /* a NULL @utf1 equals a NULL @utf2 and sorts before anything else */
+    if (utf1 == NULL) {
+        if (utf2 == NULL)
+            return 0;
+        return -1;
+    }
+    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
+}
+
 /**
  * xmlUTF8Strlen:
  * @utf:  a sequence of UTF-8 encoded bytes
diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h
index 3c0fbb91..9841e14c 100644
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -208,7 +208,6 @@ int	xmlGetUTF8Char			(const unsigned char *utf,
  */
 
 int	xmlCheckUTF8			(const unsigned char *utf);
-
 int	xmlUTF8Strsize			(const xmlChar *utf,
 					 int len);
 xmlChar * xmlUTF8Strndup		(const xmlChar *utf,
@@ -220,9 +219,10 @@ int	xmlUTF8Strloc			(const xmlChar *utf,
 xmlChar * xmlUTF8Strsub			(const xmlChar *utf,
 					 int start,
 					 int len);
-
 int	xmlUTF8Strlen			(const xmlChar *utf);
-
+int	xmlUTF8Size			(const xmlChar *utf);
+int	xmlUTF8Charcmp			(const xmlChar *utf1,
+					 const xmlChar *utf2);
 #ifdef __cplusplus
 }
 #endif