From 4a557d97bfff5497500a6e707f7892cc4c092153 Mon Sep 17 00:00:00 2001
From: "William M. Brack"
Date: Tue, 29 Jul 2003 04:28:04 +0000
Subject: [PATCH] fixed problem with comments reported by Nick Kew added routines

* HTMLparser.c: fixed problem with comments reported by Nick Kew
* encoding.c: added routines xmlUTF8Size and xmlUTF8Charcmp for
  some future cleanup of UTF8 handling
---
Note: this copy was re-flowed from a whitespace-collapsed dump, so the
indentation of context lines is approximate (apply with --ignore-whitespace).
xmlUTF8Charcmp now calls xmlUTF8Size, the helper added by this patch,
instead of the undefined xsltUTF8Size.

 ChangeLog                 |  6 +++++
 HTMLparser.c              |  5 +++--
 encoding.c                | 47 +++++++++++++++++++++++++++++++++++++++
 include/libxml/encoding.h |  6 ++---
 4 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index fe52aad6..ea4e735b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+Tue Jul 29 12:28:17 HKT 2003 William Brack
+
+	* HTMLparser.c: fixed problem with comments reported by Nick Kew
+	* encoding.c: added routines xmlUTF8Size and xmlUTF8Charcmp for
+	  some future cleanup of UTF8 handling
+
 Mon Jul 28 16:39:14 EDT 2003 Daniel Veillard
 
 	* xpath.c: applied a change suggested by Sean Griffin in bug
diff --git a/HTMLparser.c b/HTMLparser.c
index e7dcb344..2ee458c6 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -4358,10 +4358,11 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
 		(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
 		incomment = 1;
 	    }
-	    /* do not increment base, some people use <!--> */
+	    /* do not increment past <! - some people use <!--> */
+	    base += 2;
 	}
 	if (incomment) {
-	    if (base + 3 < len)
+	    if (base + 3 > len)
 		return(-1);
 	    if ((buf[base] == '-') && (buf[base + 1] == '-') &&
 		(buf[base + 2] == '>')) {
diff --git a/encoding.c b/encoding.c
index 8d43f45d..5cefd39c 100644
--- a/encoding.c
+++ b/encoding.c
@@ -84,6 +84,53 @@ static int xmlLittleEndian = 1;
  *									*
  ************************************************************************/
 
+/**
+ * xmlUTF8Size:
+ * @utf: pointer to the UTF8 character
+ *
+ * returns the number of bytes in the character, -1 on format error
+ */
+int
+xmlUTF8Size(const xmlChar *utf) {
+    xmlChar mask;
+    int len;
+
+    if (utf == NULL)
+        return -1;
+    if (*utf < 0x80)
+        return 1;
+    /* check valid UTF8 character */
+    if (!(*utf & 0x40))
+        return -1;
+    /* determine number of bytes in char */
+    len = 2;
+    for (mask=0x20; mask != 0; mask>>=1) {
+        if (!(*utf & mask))
+            return len;
+        len++;
+    }
+    return -1;
+}
+
+/**
+ * xmlUTF8Charcmp:
+ * @utf1: pointer to first UTF8 char
+ * @utf2: pointer to second UTF8 char
+ *
+ * returns result of comparing the two UCS4 values
+ * as with xmlStrncmp
+ */
+int
+xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
+
+    if (utf1 == NULL ) {
+        if (utf2 == NULL)
+            return 0;
+        return -1;
+    }
+    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
+}
+
 /**
  * xmlUTF8Strlen:
  * @utf: a sequence of UTF-8 encoded bytes
diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h
index 3c0fbb91..9841e14c 100644
--- a/include/libxml/encoding.h
+++ b/include/libxml/encoding.h
@@ -208,7 +208,6 @@ int	xmlGetUTF8Char		(const unsigned char *utf,
  */
 int	xmlCheckUTF8		(const unsigned char *utf);
 
-
 int	xmlUTF8Strsize		(const xmlChar *utf,
 				 int len);
 xmlChar * xmlUTF8Strndup	(const xmlChar *utf,
@@ -220,9 +219,10 @@ int	xmlUTF8Strloc		(const xmlChar *utf,
 xmlChar * xmlUTF8Strsub	(const xmlChar *utf,
 				 int start,
 				 int len);
-
 int	xmlUTF8Strlen		(const xmlChar *utf);
-
+int	xmlUTF8Size		(const xmlChar *utf);
+int	xmlUTF8Charcmp		(const xmlChar *utf1,
+				 const xmlChar *utf2);
 #ifdef __cplusplus
 }
 #endif