1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-08-01 10:06:59 +03:00

CORBA defines fixes, char encoding atodetection, Daniel

This commit is contained in:
Daniel Veillard
1999-05-29 11:51:49 +00:00
parent 5e60f5a236
commit 27d88744f9
12 changed files with 664 additions and 311 deletions

View File

@ -19,6 +19,7 @@
* Daniel.Veillard@w3.org
*/
#include <ctype.h>
#include "encoding.h"
/*
@ -204,3 +205,109 @@ UTF8ToUTF16(unsigned short* out, int outlen, unsigned char* in, int inlen)
}
/**
* xmlDetectCharEncoding:
* @in: a pointer to the first bytes of the XML entity, must be at least
* 4 bytes long.
*
* Guess the encoding of the entity using the first bytes of the entity content
* accordingly of the non-normative appendix F of the XML-1.0 recommendation.
*
* Returns one of the XML_CHAR_ENCODING_... values.
*/
xmlCharEncoding
xmlDetectCharEncoding(unsigned char* in)
{
if ((in[0] == 0x00) && (in[1] == 0x00) &&
(in[2] == 0x00) && (in[3] == 0x3C))
return(XML_CHAR_ENCODING_UCS4BE);
if ((in[0] == 0x3C) && (in[1] == 0x00) &&
(in[2] == 0x00) && (in[3] == 0x00))
return(XML_CHAR_ENCODING_UCS4LE);
if ((in[0] == 0x00) && (in[1] == 0x00) &&
(in[2] == 0x3C) && (in[3] == 0x00))
return(XML_CHAR_ENCODING_UCS4_2143);
if ((in[0] == 0x00) && (in[1] == 0x3C) &&
(in[2] == 0x00) && (in[3] == 0x00))
return(XML_CHAR_ENCODING_UCS4_3412);
if ((in[0] == 0xFE) && (in[1] == 0xFF))
return(XML_CHAR_ENCODING_UTF16BE);
if ((in[0] == 0xFF) && (in[1] == 0xFE))
return(XML_CHAR_ENCODING_UTF16LE);
if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
(in[2] == 0xA7) && (in[3] == 0x94))
return(XML_CHAR_ENCODING_EBCDIC);
if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
(in[2] == 0x78) && (in[3] == 0x6D))
return(XML_CHAR_ENCODING_UTF8);
return(XML_CHAR_ENCODING_NONE);
}
/**
* xmlParseCharEncoding:
* @name: the encoding name as parsed, in UTF-8 format (ASCCI actually)
*
* Conpare the string to the known encoding schemes already known. Note
* that the comparison is case insensitive accordingly to the section
* [XML] 4.3.3 Character Encoding in Entities.
*
* Returns one of the XML_CHAR_ENCODING_... values or XML_CHAR_ENCODING_NONE
* if not recognized.
*/
xmlCharEncoding
xmlParseCharEncoding(char* name)
{
char upper[500];
int i;
for (i = 0;i < 499;i++) {
upper[i] = toupper(name[i]);
if (upper[i] == 0) break;
}
upper[i] = 0;
if (!strcmp(upper, "")) return(XML_CHAR_ENCODING_NONE);
if (!strcmp(upper, "UTF-8")) return(XML_CHAR_ENCODING_UTF8);
if (!strcmp(upper, "UTF8")) return(XML_CHAR_ENCODING_UTF8);
/*
* NOTE: if we were able to parse this, the endianness of UTF16 is
* already found and in use
*/
if (!strcmp(upper, "UTF-16")) return(XML_CHAR_ENCODING_UTF16LE);
if (!strcmp(upper, "UTF16")) return(XML_CHAR_ENCODING_UTF16LE);
if (!strcmp(upper, "ISO-10646-UCS-2")) return(XML_CHAR_ENCODING_UCS2);
if (!strcmp(upper, "UCS-2")) return(XML_CHAR_ENCODING_UCS2);
if (!strcmp(upper, "UCS2")) return(XML_CHAR_ENCODING_UCS2);
/*
* NOTE: if we were able to parse this, the endianness of UCS4 is
* already found and in use
*/
if (!strcmp(upper, "ISO-10646-UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
if (!strcmp(upper, "UCS-4")) return(XML_CHAR_ENCODING_UCS4LE);
if (!strcmp(upper, "UCS4")) return(XML_CHAR_ENCODING_UCS4LE);
if (!strcmp(upper, "ISO-8859-1")) return(XML_CHAR_ENCODING_8859_1);
if (!strcmp(upper, "ISO-LATIN-1")) return(XML_CHAR_ENCODING_8859_1);
if (!strcmp(upper, "ISO LATIN 1")) return(XML_CHAR_ENCODING_8859_1);
if (!strcmp(upper, "ISO-8859-2")) return(XML_CHAR_ENCODING_8859_2);
if (!strcmp(upper, "ISO-LATIN-2")) return(XML_CHAR_ENCODING_8859_2);
if (!strcmp(upper, "ISO LATIN 2")) return(XML_CHAR_ENCODING_8859_2);
if (!strcmp(upper, "ISO-8859-3")) return(XML_CHAR_ENCODING_8859_3);
if (!strcmp(upper, "ISO-8859-4")) return(XML_CHAR_ENCODING_8859_4);
if (!strcmp(upper, "ISO-8859-5")) return(XML_CHAR_ENCODING_8859_5);
if (!strcmp(upper, "ISO-8859-6")) return(XML_CHAR_ENCODING_8859_6);
if (!strcmp(upper, "ISO-8859-7")) return(XML_CHAR_ENCODING_8859_7);
if (!strcmp(upper, "ISO-8859-8")) return(XML_CHAR_ENCODING_8859_8);
if (!strcmp(upper, "ISO-8859-9")) return(XML_CHAR_ENCODING_8859_9);
if (!strcmp(upper, "ISO-2022-JP")) return(XML_CHAR_ENCODING_2022_JP);
if (!strcmp(upper, "Shift_JIS")) return(XML_CHAR_ENCODING_SHIFT_JIS);
if (!strcmp(upper, "EUC-JP")) return(XML_CHAR_ENCODING_EUC_JP);
return(XML_CHAR_ENCODING_ERROR);
}