1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-29 11:41:22 +03:00

- doc/encoding.html doc/xml.html: added I18N doc

- encoding.[ch] HTMLtree.[ch] parser.c HTMLparser.c: I18N encoding
  improvements, both parser and filters, added ASCII & HTML,
  fixed the ISO-Latin-1 one
- xmllint.c testHTML.c: added/made visible --encode
- debugXML.c : cleanup
- most .c files: applied patches due to warning on Windows and
  when using Sun Pro cc compiler
- xpath.c : cleanup memleaks
- nanoftp.c : added a TESTING preprocessor flag for standalone
  compile so that people can report bugs more easily
- nanohttp.c : ditched socklen_t which was a portability mess
  and replaced it with unsigned int.
- tree.[ch]: added xmlHasProp()
- TODO: updated
- test/ : added more test for entities, NS, encoding, HTML, wap
- configure.in: preparing for 2.2.0 release
Daniel
This commit is contained in:
Daniel Veillard
2000-07-14 14:49:25 +00:00
parent 8d86964a4a
commit 32bc74ef98
41 changed files with 2068 additions and 928 deletions

View File

@ -43,6 +43,9 @@
#endif
#include <libxml/encoding.h>
#include <libxml/xmlmemory.h>
#ifdef LIBXML_HTML_ENABLED
#include <libxml/HTMLparser.h>
#endif
xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
@ -177,6 +180,140 @@ xmlCheckUTF8(const unsigned char *utf)
return(1);
}
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  the length of @in
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. Every ASCII char is its own one-byte UTF-8
 * encoding, so this is a checked copy which stops at the first input
 * byte that is not ASCII (>= 0x80) or when @out is full.
 *
 * Returns 0 if success (including when conversion stopped early because
 *         @out is full, or when @in is NULL), or -1 if a non-ASCII byte
 *         was found in @in.
 * The value of @inlen after return is the number of octets consumed
 *     from @in; on failure it counts the octets before the offending one.
 * The value of @outlen after return is the number of octets written
 *     to @out.
 */
int
asciiToUTF8(unsigned char* out, int *outlen,
            const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    unsigned char* outend;
    const unsigned char* base = in;
    const unsigned char* inend;
    unsigned int c;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    outend = out + (*outlen);
    inend = in + (*inlen);

    /*
     * One input octet produces exactly one output octet, so no extra
     * headroom is needed in @out: stop only when either side is exhausted.
     */
    while ((in < inend) && (out < outend)) {
        c = *in;
        if (c >= 0x80) {
            /* not an ASCII char: report what was converted so far */
            *outlen = (int) (out - outstart);
            *inlen = (int) (in - base);
            return(-1);
        }
        *out++ = (unsigned char) c;
        in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - base);
    return(0);
}
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  the length of @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  the length of @in
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.
 *
 * Returns 0 if success (including when conversion stopped early because
 *         @out is full or @in ends in the middle of a multi-byte
 *         sequence, or when @in is NULL), or -2 if the transcoding fails:
 *         the input is not well-formed UTF-8 or holds a character which
 *         has no ASCII representation.
 * The value of @inlen after return is the number of octets consumed
 *     from @in; only complete, successfully converted characters count.
 * The value of @outlen after return is the number of octets written
 *     to @out.
 */
int
UTF8Toascii(unsigned char* out, int *outlen,
            const unsigned char* in, int *inlen) {
    const unsigned char* processed = in;
    const unsigned char* outend;
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* inend;
    unsigned int c, d;
    int trailing;

    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        d = *in++;
        if (d < 0x80)      { c = d; trailing = 0; }
        else if (d < 0xC0) {
            /* trailing byte in leading position */
            goto transcoding_error;
        }
        else if (d < 0xE0) { c = d & 0x1F; trailing = 1; }
        else if (d < 0xF0) { c = d & 0x0F; trailing = 2; }
        else if (d < 0xF8) { c = d & 0x07; trailing = 3; }
        else {
            /* no chance for this in Ascii */
            goto transcoding_error;
        }

        if (inend - in < trailing) {
            /* incomplete sequence: leave it unconsumed for the next call */
            break;
        }

        if (trailing > 0) {
            for ( ; trailing; trailing--) {
                d = *in++;
                /*
                 * A lead byte must be followed by continuation bytes;
                 * anything else is malformed and must not be emitted as
                 * a half-decoded value.
                 */
                if ((d & 0xC0) != 0x80)
                    goto transcoding_error;
                c <<= 6;
                c |= d & 0x3F;
            }
            /*
             * A multi-byte sequence decoding to an ASCII value is an
             * overlong encoding, which is not well-formed UTF-8.
             */
            if (c < 0x80)
                goto transcoding_error;
        }

        /* assertion: c is a single UCS-4 value */
        if (c >= 0x80) {
            /* no chance for this in Ascii */
            goto transcoding_error;
        }
        if (out >= outend)
            break;
        *out++ = (unsigned char) c;
        processed = in;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(0);

transcoding_error:
    /* report only what was fully converted before the offending sequence */
    *outlen = (int) (out - outstart);
    *inlen = (int) (processed - instart);
    return(-2);
}
/**
* isolat1ToUTF8:
* @out: a pointer to an array of bytes to store the result
@ -195,28 +332,32 @@ int
isolat1ToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
unsigned char* outstart = out;
const unsigned char* base = in;
const unsigned char* processed = in;
unsigned char* outend = out + *outlen;
const unsigned char* inend = in + *inlen;
unsigned char c;
const unsigned char* inend;
unsigned int c;
int bits;
while (in < inend) {
c= *in++;
if (c < 0x80) {
inend = in + (*inlen);
while ((in < inend) && (out - outstart + 5 < *outlen)) {
c= *in++;
/* assertion: c is a single UTF-4 value */
if (out >= outend)
break;
if (c < 0x80) { *out++= c; bits= -6; }
else { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
for ( ; bits >= 0; bits-= 6) {
if (out >= outend)
break;
*out++ = c;
break;
*out++= ((c >> bits) & 0x3F) | 0x80;
}
else {
if (out + 1 >= outend) break;
*out++ = 0xC0 | (c >> 6);
*out++ = 0x80 | (0x3F & c);
}
processed = in;
processed = (const unsigned char*) in;
}
*outlen = out - outstart;
*inlen = processed - in;
*inlen = processed - base;
return(0);
}
@ -229,7 +370,6 @@ isolat1ToUTF8(unsigned char* out, int *outlen,
*
* Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
* block of chars out.
* TODO: UTF8Toisolat1 need a fallback mechanism ...
*
* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
* The value of @inlen after return is the number of octets consumed
@ -239,34 +379,68 @@ isolat1ToUTF8(unsigned char* out, int *outlen,
int
UTF8Toisolat1(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
unsigned char* outstart = out;
const unsigned char* processed = in;
unsigned char* outend = out + *outlen;
const unsigned char* inend = in + *inlen;
unsigned char c;
const unsigned char* outend;
const unsigned char* outstart = out;
const unsigned char* instart = in;
const unsigned char* inend;
unsigned int c, d;
int trailing;
if (in == NULL) {
/*
* initialization nothing to do
*/
*outlen = 0;
*inlen = 0;
return(0);
}
inend = in + (*inlen);
outend = out + (*outlen);
while (in < inend) {
c= *in++;
if (c < 0x80) {
if (out >= outend) return(-1);
*out++= c;
}
else if (in == inend) {
break;
}
else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
/* a two byte utf-8 and can be encoding as isolate1 */
*out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
}
else {
d = *in++;
if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) {
/* trailing byte in leading position */
*outlen = out - outstart;
*inlen = processed - in;
*inlen = processed - instart;
return(-2);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else {
/* no chance for this in IsoLat1 */
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
}
if (inend - in < trailing) {
break;
}
for ( ; trailing; trailing--) {
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
break;
c <<= 6;
c |= d & 0x3F;
}
/* assertion: c is a single UTF-4 value */
if (c <= 0xFF) {
if (out >= outend)
break;
*out++ = c;
} else {
/* no chance for this in IsoLat1 */
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
}
processed = in;
}
*outlen = out - outstart;
*inlen = processed - in;
*inlen = processed - instart;
return(0);
}
@ -367,7 +541,6 @@ UTF16LEToUTF8(unsigned char* out, int *outlen,
*
* Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
* block of chars out.
* TODO: UTF8ToUTF16LE need a fallback mechanism ...
*
* Returns the number of byte written, or -1 by lack of space, or -2
* if the transcoding failed.
@ -410,7 +583,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) {
/* trailing byte in leading position */
*outlen = out - outstart;
*outlen = (out - outstart) * 2;
*inlen = processed - in;
return(-2);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
@ -418,7 +591,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else {
/* no chance for this in UTF-16 */
*outlen = out - outstart;
*outlen = (out - outstart) * 2;
*inlen = processed - in;
return(-2);
}
@ -578,7 +751,6 @@ UTF16BEToUTF8(unsigned char* out, int *outlen,
*
* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
* block of chars out.
* TODO: UTF8ToUTF16BE need a fallback mechanism ...
*
* Returns the number of byte written, or -1 by lack of space, or -2
* if the transcoding failed.
@ -861,6 +1033,8 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
return("Shift-JIS");
case XML_CHAR_ENCODING_EUC_JP:
return("EUC-JP");
case XML_CHAR_ENCODING_ASCII:
return("ASCII");
}
return(NULL);
}
@ -974,6 +1148,10 @@ xmlInitCharEncodingHandlers(void) {
xmlUTF16BEHandler =
xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
#ifdef LIBXML_HTML_ENABLED
xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
#endif
}
/**
@ -1081,16 +1259,51 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
handler = xmlFindCharEncodingHandler("UCS2");
if (handler != NULL) return(handler);
break;
/*
* We used to keep ISO Latin encodings native in the
* generated data. This led to so many problems that
* this has been removed. One can still change this
* back by registering no-ops encoders for those
*/
case XML_CHAR_ENCODING_8859_1:
handler = xmlFindCharEncodingHandler("ISO-8859-1");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_8859_2:
handler = xmlFindCharEncodingHandler("ISO-8859-2");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_8859_3:
handler = xmlFindCharEncodingHandler("ISO-8859-3");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_8859_4:
handler = xmlFindCharEncodingHandler("ISO-8859-4");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_8859_5:
handler = xmlFindCharEncodingHandler("ISO-8859-5");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_8859_6:
handler = xmlFindCharEncodingHandler("ISO-8859-6");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_8859_7:
handler = xmlFindCharEncodingHandler("ISO-8859-7");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_8859_8:
handler = xmlFindCharEncodingHandler("ISO-8859-8");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_8859_9:
return(NULL);
handler = xmlFindCharEncodingHandler("ISO-8859-9");
if (handler != NULL) return(handler);
break;
case XML_CHAR_ENCODING_2022_JP:
handler = xmlFindCharEncodingHandler("ISO-2022-JP");
if (handler != NULL) return(handler);
@ -1161,7 +1374,8 @@ xmlFindCharEncodingHandler(const char *name) {
icv_in = iconv_open("UTF-8", name);
icv_out = iconv_open(name, "UTF-8");
if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
enc = (xmlCharEncodingHandlerPtr)
xmlMalloc(sizeof(xmlCharEncodingHandler));
if (enc == NULL) {
iconv_close(icv_in);
iconv_close(icv_out);
@ -1506,6 +1720,10 @@ retry:
if (ret == -1) ret = -3;
}
#endif /* LIBXML_ICONV_ENABLED */
else {
fprintf(stderr, "xmlCharEncOutFunc: no output function !\n");
return(-1);
}
if (ret >= 0) output += ret;
@ -1528,7 +1746,7 @@ retry:
#endif
case -2: {
int len = in->use;
const char *utf = (const char *) in->content;
const xmlChar *utf = (const xmlChar *) in->content;
int cur;
cur = xmlGetUTF8Char(utf, &len);
@ -1546,7 +1764,7 @@ retry:
* and continue the transcoding phase, hoping the error
* did not mangle the encoder state.
*/
sprintf(charref, "&#x%X;", cur);
sprintf((char *) charref, "&#x%X;", cur);
xmlBufferShrink(in, len);
xmlBufferAddHead(in, charref, -1);