mirror of
https://gitlab.gnome.org/GNOME/libxml2.git
synced 2025-07-29 11:41:22 +03:00
- doc/encoding.html doc/xml.html: added I18N doc
- encoding.[ch] HTMLtree.[ch] parser.c HTMLparser.c: I18N encoding improvements, both parser and filters, added ASCII & HTML, fixed the ISO-Latin-1 one - xmllint.c testHTML.c: added/made visible --encode - debugXML.c : cleanup - most .c files: applied patches due to warning on Windows and when using Sun Pro cc compiler - xpath.c : cleanup memleaks - nanoftp.c : added a TESTING preprocessor flag for standalone compile so that people can report bugs more easily - nanohttp.c : ditched socklen_t which was a portability mess and replaced it with unsigned int. - tree.[ch]: added xmlHasProp() - TODO: updated - test/ : added more tests for entities, NS, encoding, HTML, wap - configure.in: preparing for 2.2.0 release Daniel
This commit is contained in:
304
encoding.c
304
encoding.c
@ -43,6 +43,9 @@
|
||||
#endif
|
||||
#include <libxml/encoding.h>
|
||||
#include <libxml/xmlmemory.h>
|
||||
#ifdef LIBXML_HTML_ENABLED
|
||||
#include <libxml/HTMLparser.h>
|
||||
#endif
|
||||
|
||||
xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
|
||||
xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
|
||||
@ -177,6 +180,140 @@ xmlCheckUTF8(const unsigned char *utf)
|
||||
return(1);
|
||||
}
|
||||
|
||||
/**
 * asciiToUTF8:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  on entry the capacity of @out, on return the number of
 *           octets written to @out
 * @in:  a pointer to an array of ASCII chars
 * @inlen:  on entry the length of @in, on return the number of octets
 *          of @in that were consumed
 *
 * Take a block of ASCII chars in and try to convert it to an UTF-8
 * block of chars out. ASCII is a strict subset of UTF-8, so every
 * input octet below 0x80 is copied unchanged and produces exactly one
 * output octet; the whole of @out is therefore usable and no spare
 * room is required.
 *
 * Conversion stops early, still reporting success, when @out is full;
 * the caller can resume from the position reported through @inlen.
 *
 * Returns 0 if success, or -1 if a non-ASCII octet (>= 0x80) is found
 *         or if a required pointer is NULL. On a non-ASCII octet
 *         @outlen and @inlen describe the data converted before it.
 */
int
asciiToUTF8(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    unsigned char* outstart = out;
    const unsigned char* base = in;
    const unsigned char* inend;
    unsigned char* outend;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }

    inend = in + (*inlen);
    outend = out + (*outlen);
    while ((in < inend) && (out < outend)) {
        if (*in >= 0x80) {
            /* not an ASCII octet: report what was converted so far */
            *outlen = (int) (out - outstart);
            *inlen = (int) (in - base);
            return(-1);
        }
        *out++ = *in++;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - base);
    return(0);
}
|
||||
|
||||
/**
 * UTF8Toascii:
 * @out:  a pointer to an array of bytes to store the result
 * @outlen:  on entry the capacity of @out, on return the number of
 *           octets written to @out
 * @in:  a pointer to an array of UTF-8 chars
 * @inlen:  on entry the length of @in, on return the number of octets
 *          of @in that were consumed
 *
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
 * block of chars out.
 *
 * Only single-octet sequences (values below 0x80) can be represented
 * in ASCII. A complete multi-octet sequence is therefore always a
 * transcoding failure: either it encodes a value >= 0x80, or it is
 * malformed (bad continuation octet, overlong form). A multi-octet
 * sequence that is cut short by the end of @in is not an error: the
 * conversion stops in front of it so the caller can retry once more
 * input is available. Conversion also stops, reporting success, when
 * @out is full.
 *
 * Returns 0 if success, -2 if the transcoding fails, or -1 if a
 *         required pointer is NULL. When -2 is returned @outlen and
 *         @inlen describe the data converted before the offending
 *         sequence, which starts at @in + *@inlen.
 */
int
UTF8Toascii(unsigned char* out, int *outlen,
              const unsigned char* in, int *inlen) {
    const unsigned char* outstart = out;
    const unsigned char* instart = in;
    const unsigned char* outend;
    const unsigned char* inend;
    unsigned int lead;
    int trailing;
    int ret = 0;

    if ((outlen == NULL) || (inlen == NULL))
        return(-1);
    if (in == NULL) {
        /*
         * initialization nothing to do
         */
        *outlen = 0;
        *inlen = 0;
        return(0);
    }
    if (out == NULL) {
        *outlen = 0;
        *inlen = 0;
        return(-1);
    }

    inend = in + (*inlen);
    outend = out + (*outlen);
    while (in < inend) {
        lead = *in;
        if (lead < 0x80) {
            /* plain ASCII: copy it if there is room left */
            if (out >= outend)
                break;
            *out++ = (unsigned char) lead;
            in++;
            continue;
        }
        if ((lead < 0xC0) || (lead >= 0xF8)) {
            /* trailing byte in leading position, or invalid lead byte */
            ret = -2;
            break;
        }

        /* lead byte of a 2, 3 or 4 octet sequence */
        if (lead < 0xE0)
            trailing = 1;
        else if (lead < 0xF0)
            trailing = 2;
        else
            trailing = 3;

        /*
         * An incomplete sequence at the end of the input is left
         * unconsumed; a complete one has no chance in ASCII, whether
         * it is well formed or not.
         */
        if ((inend - in) - 1 >= trailing)
            ret = -2;
        break;
    }
    *outlen = (int) (out - outstart);
    *inlen = (int) (in - instart);
    return(ret);
}
|
||||
|
||||
/**
|
||||
* isolat1ToUTF8:
|
||||
* @out: a pointer to an array of bytes to store the result
|
||||
@ -195,28 +332,32 @@ int
|
||||
isolat1ToUTF8(unsigned char* out, int *outlen,
|
||||
const unsigned char* in, int *inlen) {
|
||||
unsigned char* outstart = out;
|
||||
const unsigned char* base = in;
|
||||
const unsigned char* processed = in;
|
||||
unsigned char* outend = out + *outlen;
|
||||
const unsigned char* inend = in + *inlen;
|
||||
unsigned char c;
|
||||
const unsigned char* inend;
|
||||
unsigned int c;
|
||||
int bits;
|
||||
|
||||
while (in < inend) {
|
||||
c= *in++;
|
||||
if (c < 0x80) {
|
||||
inend = in + (*inlen);
|
||||
while ((in < inend) && (out - outstart + 5 < *outlen)) {
|
||||
c= *in++;
|
||||
|
||||
/* assertion: c is a single UTF-4 value */
|
||||
if (out >= outend)
|
||||
break;
|
||||
if (c < 0x80) { *out++= c; bits= -6; }
|
||||
else { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
|
||||
|
||||
for ( ; bits >= 0; bits-= 6) {
|
||||
if (out >= outend)
|
||||
break;
|
||||
*out++ = c;
|
||||
break;
|
||||
*out++= ((c >> bits) & 0x3F) | 0x80;
|
||||
}
|
||||
else {
|
||||
if (out + 1 >= outend) break;
|
||||
*out++ = 0xC0 | (c >> 6);
|
||||
*out++ = 0x80 | (0x3F & c);
|
||||
}
|
||||
processed = in;
|
||||
processed = (const unsigned char*) in;
|
||||
}
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
|
||||
*inlen = processed - base;
|
||||
return(0);
|
||||
}
|
||||
|
||||
@ -229,7 +370,6 @@ isolat1ToUTF8(unsigned char* out, int *outlen,
|
||||
*
|
||||
* Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
|
||||
* block of chars out.
|
||||
* TODO: UTF8Toisolat1 need a fallback mechanism ...
|
||||
*
|
||||
* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
|
||||
* The value of @inlen after return is the number of octets consumed
|
||||
@ -239,34 +379,68 @@ isolat1ToUTF8(unsigned char* out, int *outlen,
|
||||
int
|
||||
UTF8Toisolat1(unsigned char* out, int *outlen,
|
||||
const unsigned char* in, int *inlen) {
|
||||
unsigned char* outstart = out;
|
||||
const unsigned char* processed = in;
|
||||
unsigned char* outend = out + *outlen;
|
||||
const unsigned char* inend = in + *inlen;
|
||||
unsigned char c;
|
||||
const unsigned char* outend;
|
||||
const unsigned char* outstart = out;
|
||||
const unsigned char* instart = in;
|
||||
const unsigned char* inend;
|
||||
unsigned int c, d;
|
||||
int trailing;
|
||||
|
||||
if (in == NULL) {
|
||||
/*
|
||||
* initialization nothing to do
|
||||
*/
|
||||
*outlen = 0;
|
||||
*inlen = 0;
|
||||
return(0);
|
||||
}
|
||||
inend = in + (*inlen);
|
||||
outend = out + (*outlen);
|
||||
while (in < inend) {
|
||||
c= *in++;
|
||||
if (c < 0x80) {
|
||||
if (out >= outend) return(-1);
|
||||
*out++= c;
|
||||
}
|
||||
else if (in == inend) {
|
||||
break;
|
||||
}
|
||||
else if (((c & 0xFC) == 0xC0) && ((*in & 0xC0) == 0x80)) {
|
||||
/* a two byte utf-8 and can be encoding as isolate1 */
|
||||
*out++= ((c & 0x03) << 6) | (*in++ & 0x3F);
|
||||
}
|
||||
else {
|
||||
d = *in++;
|
||||
if (d < 0x80) { c= d; trailing= 0; }
|
||||
else if (d < 0xC0) {
|
||||
/* trailing byte in leading position */
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
*inlen = processed - instart;
|
||||
return(-2);
|
||||
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
|
||||
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
|
||||
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
|
||||
else {
|
||||
/* no chance for this in IsoLat1 */
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - instart;
|
||||
return(-2);
|
||||
}
|
||||
|
||||
if (inend - in < trailing) {
|
||||
break;
|
||||
}
|
||||
|
||||
for ( ; trailing; trailing--) {
|
||||
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
|
||||
break;
|
||||
c <<= 6;
|
||||
c |= d & 0x3F;
|
||||
}
|
||||
|
||||
/* assertion: c is a single UTF-4 value */
|
||||
if (c <= 0xFF) {
|
||||
if (out >= outend)
|
||||
break;
|
||||
*out++ = c;
|
||||
} else {
|
||||
/* no chance for this in IsoLat1 */
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - instart;
|
||||
return(-2);
|
||||
}
|
||||
processed = in;
|
||||
}
|
||||
*outlen = out - outstart;
|
||||
*inlen = processed - in;
|
||||
*inlen = processed - instart;
|
||||
return(0);
|
||||
}
|
||||
|
||||
@ -367,7 +541,6 @@ UTF16LEToUTF8(unsigned char* out, int *outlen,
|
||||
*
|
||||
* Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
|
||||
* block of chars out.
|
||||
* TODO: UTF8ToUTF16LE need a fallback mechanism ...
|
||||
*
|
||||
* Returns the number of byte written, or -1 by lack of space, or -2
|
||||
* if the transcoding failed.
|
||||
@ -410,7 +583,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
|
||||
if (d < 0x80) { c= d; trailing= 0; }
|
||||
else if (d < 0xC0) {
|
||||
/* trailing byte in leading position */
|
||||
*outlen = out - outstart;
|
||||
*outlen = (out - outstart) * 2;
|
||||
*inlen = processed - in;
|
||||
return(-2);
|
||||
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
|
||||
@ -418,7 +591,7 @@ UTF8ToUTF16LE(unsigned char* outb, int *outlen,
|
||||
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
|
||||
else {
|
||||
/* no chance for this in UTF-16 */
|
||||
*outlen = out - outstart;
|
||||
*outlen = (out - outstart) * 2;
|
||||
*inlen = processed - in;
|
||||
return(-2);
|
||||
}
|
||||
@ -578,7 +751,6 @@ UTF16BEToUTF8(unsigned char* out, int *outlen,
|
||||
*
|
||||
* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
|
||||
* block of chars out.
|
||||
* TODO: UTF8ToUTF16BE need a fallback mechanism ...
|
||||
*
|
||||
* Returns the number of byte written, or -1 by lack of space, or -2
|
||||
* if the transcoding failed.
|
||||
@ -861,6 +1033,8 @@ xmlGetCharEncodingName(xmlCharEncoding enc) {
|
||||
return("Shift-JIS");
|
||||
case XML_CHAR_ENCODING_EUC_JP:
|
||||
return("EUC-JP");
|
||||
case XML_CHAR_ENCODING_ASCII:
|
||||
return("ASCII");
|
||||
}
|
||||
return(NULL);
|
||||
}
|
||||
@ -974,6 +1148,10 @@ xmlInitCharEncodingHandlers(void) {
|
||||
xmlUTF16BEHandler =
|
||||
xmlNewCharEncodingHandler("UTF-16BE", UTF16BEToUTF8, UTF8ToUTF16BE);
|
||||
xmlNewCharEncodingHandler("ISO-8859-1", isolat1ToUTF8, UTF8Toisolat1);
|
||||
xmlNewCharEncodingHandler("ASCII", asciiToUTF8, UTF8Toascii);
|
||||
#ifdef LIBXML_HTML_ENABLED
|
||||
xmlNewCharEncodingHandler("HTML", NULL, UTF8ToHtml);
|
||||
#endif
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1081,16 +1259,51 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) {
|
||||
handler = xmlFindCharEncodingHandler("UCS2");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
|
||||
/*
|
||||
* We used to keep ISO Latin encodings native in the
|
||||
* generated data. This led to so many problems that
|
||||
* this has been removed. One can still change this
|
||||
* back by registering no-ops encoders for those
|
||||
*/
|
||||
case XML_CHAR_ENCODING_8859_1:
|
||||
handler = xmlFindCharEncodingHandler("ISO-8859-1");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_2:
|
||||
handler = xmlFindCharEncodingHandler("ISO-8859-2");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_3:
|
||||
handler = xmlFindCharEncodingHandler("ISO-8859-3");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_4:
|
||||
handler = xmlFindCharEncodingHandler("ISO-8859-4");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_5:
|
||||
handler = xmlFindCharEncodingHandler("ISO-8859-5");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_6:
|
||||
handler = xmlFindCharEncodingHandler("ISO-8859-6");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_7:
|
||||
handler = xmlFindCharEncodingHandler("ISO-8859-7");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_8:
|
||||
handler = xmlFindCharEncodingHandler("ISO-8859-8");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
case XML_CHAR_ENCODING_8859_9:
|
||||
return(NULL);
|
||||
handler = xmlFindCharEncodingHandler("ISO-8859-9");
|
||||
if (handler != NULL) return(handler);
|
||||
break;
|
||||
|
||||
|
||||
case XML_CHAR_ENCODING_2022_JP:
|
||||
handler = xmlFindCharEncodingHandler("ISO-2022-JP");
|
||||
if (handler != NULL) return(handler);
|
||||
@ -1161,7 +1374,8 @@ xmlFindCharEncodingHandler(const char *name) {
|
||||
icv_in = iconv_open("UTF-8", name);
|
||||
icv_out = iconv_open(name, "UTF-8");
|
||||
if ((icv_in != (iconv_t) -1) && (icv_out != (iconv_t) -1)) {
|
||||
enc = xmlMalloc(sizeof(xmlCharEncodingHandler));
|
||||
enc = (xmlCharEncodingHandlerPtr)
|
||||
xmlMalloc(sizeof(xmlCharEncodingHandler));
|
||||
if (enc == NULL) {
|
||||
iconv_close(icv_in);
|
||||
iconv_close(icv_out);
|
||||
@ -1506,6 +1720,10 @@ retry:
|
||||
if (ret == -1) ret = -3;
|
||||
}
|
||||
#endif /* LIBXML_ICONV_ENABLED */
|
||||
else {
|
||||
fprintf(stderr, "xmlCharEncOutFunc: no output function !\n");
|
||||
return(-1);
|
||||
}
|
||||
|
||||
if (ret >= 0) output += ret;
|
||||
|
||||
@ -1528,7 +1746,7 @@ retry:
|
||||
#endif
|
||||
case -2: {
|
||||
int len = in->use;
|
||||
const char *utf = (const char *) in->content;
|
||||
const xmlChar *utf = (const xmlChar *) in->content;
|
||||
int cur;
|
||||
|
||||
cur = xmlGetUTF8Char(utf, &len);
|
||||
@ -1546,7 +1764,7 @@ retry:
|
||||
* and continue the transcoding phase, hoping the error
|
||||
* did not mangle the encoder state.
|
||||
*/
|
||||
sprintf(charref, "&#x%X;", cur);
|
||||
sprintf((char *) charref, "&#x%X;", cur);
|
||||
xmlBufferShrink(in, len);
|
||||
xmlBufferAddHead(in, charref, -1);
|
||||
|
||||
|
Reference in New Issue
Block a user