diff --git a/ChangeLog b/ChangeLog index 199ccd8d..7ebab61a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,9 @@ +Wed May 30 21:12:45 CEST 2001 Daniel Veillard + + * xpath.c encoding.[ch]: William M. Brack provided a set of UTF8 + string oriented functions and started cleaning the related areas + in xpath.c which needed fixing in this respect + Wed May 30 20:30:47 CEST 2001 Daniel Veillard * HTMLtree.c: applied patch from Jaroslaw Kolakowski to close bug diff --git a/encoding.c b/encoding.c index 020f4de8..c0b73163 100644 --- a/encoding.c +++ b/encoding.c @@ -13,11 +13,14 @@ * [US-ASCII] Coded Character Set--7-bit American Standard Code for * Information Interchange, ANSI X3.4-1986. * - * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" - * * See Copyright for the status of this software. * * Daniel.Veillard@w3.org + * + * UTF8 string routines from: + * "William M. Brack" + * + * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" */ #include "libxml.h" @@ -64,16 +67,20 @@ static int xmlCharEncodingAliasesMax = 0; static int xmlLittleEndian = 1; -/* - * From rfc2044: encoding of the Unicode values on UTF-8: - * - * UCS-4 range (hex.) UTF-8 octet sequence (binary) - * 0000 0000-0000 007F 0xxxxxxx - * 0000 0080-0000 07FF 110xxxxx 10xxxxxx - * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx - * - * I hope we won't use values > 0xFFFF anytime soon ! - */ +/************************************************************************ + * * + * Generic UTF8 handling routines * + * * + * From rfc2044: encoding of the Unicode values on UTF-8: * + * * + * UCS-4 range (hex.) UTF-8 octet sequence (binary) * + * 0000 0000-0000 007F 0xxxxxxx * + * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * + * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * + * * + * I hope we won't use values > 0xFFFF anytime soon ! 
*
+ *                                                                      *
+ ************************************************************************/
 
 /**
  * xmlUTF8Strlen:
@@ -85,7 +92,7 @@ static int xmlLittleEndian = 1;
  * Returns the number of characters in the string or -1 in case of error
  */
 int
-xmlUTF8Strlen(const unsigned char *utf) {
+xmlUTF8Strlen(const xmlChar *utf) {
     int ret = 0;
 
     if (utf == NULL)
@@ -227,6 +234,191 @@ xmlCheckUTF8(const unsigned char *utf)
     return(1);
 }
 
+/**
+ * xmlUTF8Strsize:
+ * @utf:  a sequence of UTF-8 encoded bytes
+ * @len:  the number of characters to measure
+ *
+ * Computes the storage size, in bytes, of the first @len characters
+ * of a UTF-8 string.  Counting stops at the terminating 0, so the
+ * result never reaches past the end of the string, even if it holds
+ * fewer than @len characters or ends with a truncated sequence.
+ *
+ * Returns the number of bytes used by the first @len characters of
+ *         @utf, or 0 if @utf is NULL or @len is not positive
+ */
+int
+xmlUTF8Strsize(const xmlChar *utf, int len) {
+    const xmlChar *ptr = utf;
+    xmlChar ch;
+
+    if ((utf == NULL) || (len <= 0))
+        return(0);
+
+    while (len-- > 0) {
+        if (!*ptr)
+            break;
+        if ((ch = *ptr++) & 0x80) {
+            /*
+             * One continuation byte follows for each leading 1 bit
+             * after the first.  Stop on the terminating 0 so that a
+             * truncated sequence cannot push ptr past the string.
+             */
+            while (((ch <<= 1) & 0x80) && (*ptr != 0))
+                ptr++;
+        }
+    }
+    return(ptr - utf);
+}
+
+/**
+ * xmlUTF8Strndup:
+ * @utf:  the input UTF-8 string
+ * @len:  the number of characters (not bytes) to copy
+ *
+ * A strndup for UTF-8 strings: copies up to the first @len characters
+ * of @utf into a new 0 terminated buffer obtained from xmlMalloc().
+ *
+ * Returns the new string, or NULL if @utf is NULL, @len is negative
+ *         or the allocation failed
+ */
+xmlChar *
+xmlUTF8Strndup(const xmlChar *utf, int len) {
+    xmlChar *ret;
+    int i;
+
+    if ((utf == NULL) || (len < 0)) return(NULL);
+    /* i is the size in bytes of the characters to copy */
+    i = xmlUTF8Strsize(utf, len);
+    ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
+    if (ret == NULL) {
+        /* report the byte count actually requested, not the char count */
+        xmlGenericError(xmlGenericErrorContext,
+                "malloc of %ld byte failed\n",
+                (i + 1) * (long)sizeof(xmlChar));
+        return(NULL);
+    }
+    memcpy(ret, utf, i * sizeof(xmlChar));
+    ret[i] = 0;
+    return(ret);
+}
+
+/**
+ * xmlUTF8Strpos:
+ * @utf:  the input UTF-8 string
+ * @pos:  the position of the desired character (in characters)
+ *
+ * Provides the equivalent of fetching a character from a string
+ * array, with the index counted in UTF-8 characters.
+ *
+ * Returns a pointer to the first byte of that character, or NULL if
+ *         @utf is NULL, @pos is out of range or @utf is malformed
+ */
+xmlChar *
+xmlUTF8Strpos(const xmlChar *utf, int pos) {
+    xmlChar ch;
+
+    if (utf == NULL) return(NULL);
+    if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
+        return(NULL);
+    while (pos--) {
+        if ((ch=*utf++) == 0) return(NULL);
+        if ( ch & 0x80 ) {
+            /* if not simple ascii, verify proper format */
+            if ( (ch & 0xc0) != 0xc0 )
+                return(NULL);
+            /* then skip over remaining bytes for this char */
+            while ( (ch <<= 1) & 0x80 )
+                if ( (*utf++ & 0xc0) != 0x80 )
+                    return(NULL);
+        }
+    }
+    return((xmlChar *)utf);
+}
+
+/**
+ * xmlUTF8Strloc:
+ * @utf:  the input UTF-8 string
+ * @utfchar:  the UTF-8 character to be found
+ *
+ * Searches @utf for the first occurrence of the character found at
+ * the start of @utfchar.
+ *
+ * Returns the position of that character, counted in characters from
+ *         the start of @utf, or -1 if it is not found, if an argument
+ *         is NULL or if @utf is malformed
+ */
+int
+xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
+    int i, size;
+    xmlChar ch;
+
+    if (utf==NULL || utfchar==NULL) return -1;
+    /* only the first character of @utfchar takes part in the match */
+    size = xmlUTF8Strsize(utfchar, 1);
+    for(i=0; (ch=*utf) != 0; i++) {
+        if (xmlStrncmp(utf, utfchar, size)==0)
+            return(i);
+        utf++;
+        if ( ch & 0x80 ) {
+            /* if not simple ascii, verify proper format */
+            if ( (ch & 0xc0) != 0xc0 )
+                return(-1);
+            /* then skip over remaining bytes for this char */
+            while ( (ch <<= 1) & 0x80 )
+                if ( (*utf++ & 0xc0) != 0x80 )
+                    return(-1);
+        }
+    }
+
+    return(-1);
+}
+
+/**
+ * xmlUTF8Strsub:
+ * @utf:  a sequence of UTF-8 encoded bytes
+ * @start:  relative pos of first char
+ * @len:  total number to copy
+ *
+ * Note: positions are given in units of UTF-8 chars
+ *
+ * Returns a pointer to a newly created string
+ *         or NULL if any problem
+ */
+xmlChar *
+xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
+    int i;
+    xmlChar ch;
+
+    if (utf == NULL) return(NULL);
+    if (start < 0) return(NULL);
+    if (len < 0) return(NULL);
+
+    /*
+     * Skip over any leading chars
+     */
+    for (i = 0;i < start;i++) {
+        if ((ch=*utf++) == 0) return(NULL);
+        if ( ch & 0x80 ) {
+            /* if not simple ascii, verify proper format */
+            if ( (ch & 0xc0) != 0xc0 )
+                return(NULL);
+            /* then skip over remaining bytes for this char */
+            while ( (ch <<= 1) & 0x80 )
+                if ( (*utf++ & 0xc0) != 0x80 )
+                    return(NULL);
+        }
+    }
+
+    return(xmlUTF8Strndup(utf, len));
+}
+
+/************************************************************************
+ *                                                                      *
+ *                Conversions To/From UTF8 encoding                     *
+ *                                                                      *
+ ************************************************************************/
+
 /**
  * asciiToUTF8:
  * @out: a pointer to an array of bytes to store the result
@@ -912,6 +1104,12 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
     return(0);
 }
 
+/************************************************************************
+ *                                                                      *
+ *                Generic encoding handling routines                    *
+ *                                                                      *
+ ************************************************************************/
+
 /**
  * xmlDetectCharEncoding:
  * @in: a pointer to the first bytes of the XML entity, must be at least
@@ -1256,11 +1454,12 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
     return(NULL);
 }
 
-/****************************************************************
- *                                                              *
- *                Char encoding handlers                        *
- *                                                              *
- ****************************************************************/
+/************************************************************************
+ *                                                                      *
+ *                Char encoding handlers                                *
+ *                                                                      *
+ ************************************************************************/
+
 /* the size should be growable, but it's not a big deal ...
*/ #define MAX_ENCODING_HANDLERS 50 @@ -1669,6 +1855,12 @@ xmlFindCharEncodingHandler(const char *name) { return(NULL); } +/************************************************************************ + * * + * ICONV based generic conversion functions * + * * + ************************************************************************/ + #ifdef LIBXML_ICONV_ENABLED /** * xmlIconvWrapper: @@ -1730,6 +1922,12 @@ xmlIconvWrapper(iconv_t cd, } #endif /* LIBXML_ICONV_ENABLED */ +/************************************************************************ + * * + * The real API used by libxml for on-the-fly conversion * + * * + ************************************************************************/ + /** * xmlCharEncFirstLine: * @handler: char enconding transformation data structure diff --git a/encoding.h b/encoding.h index f25d9fde..fc1760e3 100644 --- a/encoding.h +++ b/encoding.h @@ -191,8 +191,25 @@ int isolat1ToUTF8 (unsigned char* out, int *outlen, const unsigned char* in, int *inlen); +/* + * exports additional "UTF-8 aware" string routines which are + */ + int xmlCheckUTF8 (const unsigned char *utf); -int xmlUTF8Strlen (const unsigned char *utf); + +int xmlUTF8Strsize (const xmlChar *utf, + int len); +xmlChar * xmlUTF8Strndup (const xmlChar *utf, + int len); +xmlChar * xmlUTF8Strpos (const xmlChar *utf, + int pos); +int xmlUTF8Strloc (const xmlChar *utf, + const xmlChar *utfchar); +xmlChar * xmlUTF8Strsub (const xmlChar *utf, + int start, + int len); + +int xmlUTF8Strlen (const xmlChar *utf); #ifdef __cplusplus } diff --git a/include/libxml/encoding.h b/include/libxml/encoding.h index f25d9fde..fc1760e3 100644 --- a/include/libxml/encoding.h +++ b/include/libxml/encoding.h @@ -191,8 +191,25 @@ int isolat1ToUTF8 (unsigned char* out, int *outlen, const unsigned char* in, int *inlen); +/* + * exports additional "UTF-8 aware" string routines which are + */ + int xmlCheckUTF8 (const unsigned char *utf); -int xmlUTF8Strlen (const unsigned char *utf); + +int xmlUTF8Strsize 
(const xmlChar *utf, + int len); +xmlChar * xmlUTF8Strndup (const xmlChar *utf, + int len); +xmlChar * xmlUTF8Strpos (const xmlChar *utf, + int pos); +int xmlUTF8Strloc (const xmlChar *utf, + const xmlChar *utfchar); +xmlChar * xmlUTF8Strsub (const xmlChar *utf, + int start, + int len); + +int xmlUTF8Strlen (const xmlChar *utf); #ifdef __cplusplus } diff --git a/xpath.c b/xpath.c index 3dee3e87..8d68888e 100644 --- a/xpath.c +++ b/xpath.c @@ -4840,28 +4840,27 @@ xmlXPathStartsWithFunction(xmlXPathParserContextPtr ctxt, int nargs) { void xmlXPathSubstringFunction(xmlXPathParserContextPtr ctxt, int nargs) { xmlXPathObjectPtr str, start, len; - double le, in; - int i, l; + double le=0, in; + int i, l, m; xmlChar *ret; - /* - * TODO: need to be converted to UTF8 strings - */ if (nargs < 2) { CHECK_ARITY(2); } if (nargs > 3) { CHECK_ARITY(3); } + /* + * take care of possible last (position) argument + */ if (nargs == 3) { CAST_TO_NUMBER; CHECK_TYPE(XPATH_NUMBER); len = valuePop(ctxt); le = len->floatval; xmlXPathFreeObject(len); - } else { - le = 2000000000; } + CAST_TO_NUMBER; CHECK_TYPE(XPATH_NUMBER); start = valuePop(ctxt); @@ -4870,38 +4869,49 @@ xmlXPathSubstringFunction(xmlXPathParserContextPtr ctxt, int nargs) { CAST_TO_STRING; CHECK_TYPE(XPATH_STRING); str = valuePop(ctxt); - le += in; + m = xmlUTF8Strlen((const unsigned char *)str->stringval); - /* integer index of the first char */ + /* + * If last pos not present, calculate last position + */ + if (nargs != 3) + le = m; + + /* + * To meet our requirements, initial index calculations + * must be done before we convert to integer format + * + * First we normalize indices + */ + in -= 1.0; + le += in; + if (in < 0.0) + in = 0.0; + if (le > (double)m) + le = (double)m; + + /* + * Now we go to integer form, rounding up + */ i = (int) in; if (((double)i) != in) i++; - /* integer index of the last char */ l = (int) le; if (((double)l) != le) l++; - /* back to a zero based len */ - i--; - l--; - - /* check against 
the string len */ - if (l > 1024) { - l = xmlStrlen(str->stringval); - } - if (i < 0) { - i = 0; - } + if (l > m) l=m; /* number of chars to copy */ l -= i; - ret = xmlStrsub(str->stringval, i, l); + ret = xmlUTF8Strsub(str->stringval, i, l); if (ret == NULL) valuePush(ctxt, xmlXPathNewCString("")); else { valuePush(ctxt, xmlXPathNewString(ret)); xmlFree(ret); } + xmlXPathFreeObject(str); } @@ -5037,7 +5047,7 @@ xmlXPathNormalizeFunction(xmlXPathParserContextPtr ctxt, int nargs) { blank = 0; while (*source) { if (IS_BLANK(*source)) { - blank = *source; + blank = 0x20; } else { if (blank) { xmlBufferAdd(target, &blank, 1); @@ -5081,13 +5091,11 @@ xmlXPathTranslateFunction(xmlXPathParserContextPtr ctxt, int nargs) { xmlXPathObjectPtr from; xmlXPathObjectPtr to; xmlBufferPtr target; - int i, offset, max; + int offset, max; xmlChar ch; - const xmlChar *point; + xmlChar *point; + xmlChar *cptr; - /* - * TODO: need to be converted to UTF8 strings - */ CHECK_ARITY(3); CAST_TO_STRING; @@ -5099,15 +5107,37 @@ xmlXPathTranslateFunction(xmlXPathParserContextPtr ctxt, int nargs) { target = xmlBufferCreate(); if (target) { - max = xmlStrlen(to->stringval); - for (i = 0; (ch = str->stringval[i]); i++) { - point = xmlStrchr(from->stringval, ch); - if (point) { - offset = (int)(point - from->stringval); - if (offset < max) - xmlBufferAdd(target, &to->stringval[offset], 1); - } else - xmlBufferAdd(target, &ch, 1); + max = xmlUTF8Strlen(to->stringval); + for (cptr = str->stringval; (ch=*cptr); ) { + offset = xmlUTF8Strloc(from->stringval, cptr); + if (offset >= 0) { + if (offset < max) { + point = xmlUTF8Strpos(to->stringval, offset); + if (point) + xmlBufferAdd(target, point, xmlUTF8Strsize(point, 1)); + } + } else + xmlBufferAdd(target, cptr, xmlUTF8Strsize(cptr, 1)); + + /* Step to next character in input */ + cptr++; + if ( ch & 0x80 ) { + /* if not simple ascii, verify proper format */ + if ( (ch & 0xc0) != 0xc0 ) { + xmlGenericError(xmlGenericErrorContext, + 
"xmlXPathTranslateFunction: Invalid UTF8 string\n"); + break; + } + /* then skip over remaining bytes for this char */ + while ( (ch <<= 1) & 0x80 ) + if ( (*cptr++ & 0xc0) != 0x80 ) { + xmlGenericError(xmlGenericErrorContext, + "xmlXPathTranslateFunction: Invalid UTF8 string\n"); + break; + } + if (ch & 0x80) /* must have had error encountered */ + break; + } } } valuePush(ctxt, xmlXPathNewString(xmlBufferContent(target)));