1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-28 00:21:53 +03:00

- xpath.c encoding.[ch]: William M. Brack provided a set of UTF8

string oriented functions and started cleaning the related areas
  in xpath.c which needed fixing in this respect
Daniel
This commit is contained in:
Daniel Veillard
2001-05-30 19:14:17 +00:00
parent 2d70372ce3
commit 97ac13197c
5 changed files with 325 additions and 57 deletions

View File

@ -13,11 +13,14 @@
* [US-ASCII] Coded Character Set--7-bit American Standard Code for
* Information Interchange, ANSI X3.4-1986.
*
* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
*
* See Copyright for the status of this software.
*
* Daniel.Veillard@w3.org
*
* UTF8 string routines from:
* "William M. Brack" <wbrack@mmm.com.hk>
*
* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
*/
#include "libxml.h"
@ -64,16 +67,20 @@ static int xmlCharEncodingAliasesMax = 0;
static int xmlLittleEndian = 1;
/*
* From rfc2044: encoding of the Unicode values on UTF-8:
*
* UCS-4 range (hex.) UTF-8 octet sequence (binary)
* 0000 0000-0000 007F 0xxxxxxx
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
*
* I hope we won't use values > 0xFFFF anytime soon !
*/
/************************************************************************
* *
* Generic UTF8 handling routines *
* *
* From rfc2044: encoding of the Unicode values on UTF-8: *
* *
* UCS-4 range (hex.) UTF-8 octet sequence (binary) *
* 0000 0000-0000 007F 0xxxxxxx *
* 0000 0080-0000 07FF 110xxxxx 10xxxxxx *
* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx *
* *
* I hope we won't use values > 0xFFFF anytime soon ! *
* *
************************************************************************/
/**
* xmlUTF8Strlen:
@ -85,7 +92,7 @@ static int xmlLittleEndian = 1;
* Returns the number of characters in the string or -1 in case of error
*/
int
xmlUTF8Strlen(const unsigned char *utf) {
xmlUTF8Strlen(const xmlChar *utf) {
int ret = 0;
if (utf == NULL)
@ -227,6 +234,178 @@ xmlCheckUTF8(const unsigned char *utf)
return(1);
}
/**
* xmlUTF8Strsize:
* @utf: a sequence of UTF-8 encoded bytes
* @len: the number of characters in the array
*
* storage size of an UTF8 string
*
* Returns the storage size of
* the first 'len' characters of ARRAY
*
*/
int
xmlUTF8Strsize(const xmlChar *utf, int len) {
const xmlChar *ptr=utf;
xmlChar ch;
if (len <= 0)
return(0);
while ( len-- > 0) {
if ( !*ptr )
break;
if ( (ch = *ptr++) & 0x80)
while ( (ch<<=1) & 0x80 )
ptr++;
}
return (ptr - utf);
}
/**
* xmlUTF8Strndup:
* @utf: the input UTF8 *
* @len: the len of @utf (in chars)
*
* a strndup for array of UTF8's
*
* Returns a new UTF8 * or NULL
*/
xmlChar *
xmlUTF8Strndup(const xmlChar *utf, int len) {
xmlChar *ret;
int i;
if ((utf == NULL) || (len < 0)) return(NULL);
i = xmlUTF8Strsize(utf, len);
ret = (xmlChar *) xmlMalloc((i + 1) * sizeof(xmlChar));
if (ret == NULL) {
xmlGenericError(xmlGenericErrorContext,
"malloc of %ld byte failed\n",
(len + 1) * (long)sizeof(xmlChar));
return(NULL);
}
memcpy(ret, utf, i * sizeof(xmlChar));
ret[i] = 0;
return(ret);
}
/**
* xmlUTF8Strpos:
* @utf: the input UTF8 *
* @pos: the position of the desired UTF8 char (in chars)
*
* a function to provide the equivalent of fetching a
* character from a string array
*
* Returns a pointer to the UTF8 character or NULL
*/
xmlChar *
xmlUTF8Strpos(const xmlChar *utf, int pos) {
xmlChar ch;
if (utf == NULL) return(NULL);
if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
return(NULL);
while (pos--) {
if ((ch=*utf++) == 0) return(NULL);
if ( ch & 0x80 ) {
/* if not simple ascii, verify proper format */
if ( (ch & 0xc0) != 0xc0 )
return(NULL);
/* then skip over remaining bytes for this char */
while ( (ch <<= 1) & 0x80 )
if ( (*utf++ & 0xc0) != 0x80 )
return(NULL);
}
}
return((xmlChar *)utf);
}
/**
* xmlUTF8Strloc:
* @utf: the input UTF8 *
* @utfchar: the UTF8 character to be found
*
* a function to provide relative location of a UTF8 char
*
* Returns the relative character position of the desired char
* or -1 if not found
*/
int
xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
int i, size;
xmlChar ch;
if (utf==NULL || utfchar==NULL) return -1;
size = xmlUTF8Strsize(utfchar, 1);
for(i=0; (ch=*utf) != 0; i++) {
if (xmlStrncmp(utf, utfchar, size)==0)
return(i);
utf++;
if ( ch & 0x80 ) {
/* if not simple ascii, verify proper format */
if ( (ch & 0xc0) != 0xc0 )
return(-1);
/* then skip over remaining bytes for this char */
while ( (ch <<= 1) & 0x80 )
if ( (*utf++ & 0xc0) != 0x80 )
return(-1);
}
}
return(-1);
}
/**
* xmlUTF8Strsub:
* @utf: a sequence of UTF-8 encoded bytes
*
* @start: relative pos of first char
* @len: total number to copy
*
* Note: positions are given in units of UTF-8 chars
*
* Returns a pointer to a newly created string
* or NULL if any problem
*/
xmlChar *
xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
int i;
xmlChar ch;
if (utf == NULL) return(NULL);
if (start < 0) return(NULL);
if (len < 0) return(NULL);
/*
* Skip over any leading chars
*/
for (i = 0;i < start;i++) {
if ((ch=*utf++) == 0) return(NULL);
if ( ch & 0x80 ) {
/* if not simple ascii, verify proper format */
if ( (ch & 0xc0) != 0xc0 )
return(NULL);
/* then skip over remaining bytes for this char */
while ( (ch <<= 1) & 0x80 )
if ( (*utf++ & 0xc0) != 0x80 )
return(NULL);
}
}
return(xmlUTF8Strndup(utf, len));
}
/************************************************************************
* *
* Conversions To/From UTF8 encoding *
* *
************************************************************************/
/**
* asciiToUTF8:
* @out: a pointer to an array of bytes to store the result
@ -912,6 +1091,12 @@ UTF8ToUTF16BE(unsigned char* outb, int *outlen,
return(0);
}
/************************************************************************
* *
* Generic encoding handling routines *
* *
************************************************************************/
/**
* xmlDetectCharEncoding:
* @in: a pointer to the first bytes of the XML entity, must be at least
@ -1256,11 +1441,12 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
return(NULL);
}
/****************************************************************
* *
* Char encoding handlers *
* *
****************************************************************/
/************************************************************************
* *
* Char encoding handlers *
* *
************************************************************************/
/* the size should be growable, but it's not a big deal ... */
#define MAX_ENCODING_HANDLERS 50
@ -1669,6 +1855,12 @@ xmlFindCharEncodingHandler(const char *name) {
return(NULL);
}
/************************************************************************
* *
* ICONV based generic conversion functions *
* *
************************************************************************/
#ifdef LIBXML_ICONV_ENABLED
/**
* xmlIconvWrapper:
@ -1730,6 +1922,12 @@ xmlIconvWrapper(iconv_t cd,
}
#endif /* LIBXML_ICONV_ENABLED */
/************************************************************************
* *
* The real API used by libxml for on-the-fly conversion *
* *
************************************************************************/
/**
* xmlCharEncFirstLine:
* @handler: char enconding transformation data structure