mirror of
				https://gitlab.gnome.org/GNOME/libxml2.git
				synced 2025-10-24 13:33:01 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			242 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			242 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| /*
 | |
|  * icu.c: Example how to use ICU for character encoding conversion
 | |
|  *
 | |
|  * This example shows how to use ICU by installing a custom character
 | |
|  * encoding converter with xmlCtxtSetCharEncConvImpl, available
 | |
|  * since libxml2 2.14.
 | |
|  *
 | |
|  * This approach makes it possible to use ICU even if libxml2 is
 | |
|  * compiled without ICU support. It also makes sure that *only* ICU
 | |
|  * is used. Many Linux distros currently ship libxml2 with support
 | |
|  * for both ICU and iconv which makes the library's behavior hard to
 | |
|  * predict.
 | |
|  *
 | |
|  * The long-term plan is to make libxml2 only support a single
 | |
|  * conversion library internally (iconv on POSIX).
 | |
|  */
 | |
| 
 | |
| #include <stdio.h>
 | |
| #include <libxml/parser.h>
 | |
| #include <unicode/ucnv.h>
 | |
| 
 | |
| #define ICU_PIVOT_BUF_SIZE 1024
 | |
| 
 | |
| typedef struct {
 | |
|     UConverter *uconv; /* for conversion between an encoding and UTF-16 */
 | |
|     UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
 | |
|     UChar      *pivot_source;
 | |
|     UChar      *pivot_target;
 | |
|     int        isInput;
 | |
|     UChar      pivot_buf[ICU_PIVOT_BUF_SIZE];
 | |
| } myConvCtxt;
 | |
| 
 | |
| static xmlCharEncError
 | |
| icuConvert(void *vctxt, unsigned char *out, int *outlen,
 | |
|            const unsigned char *in, int *inlen, int flush) {
 | |
|     myConvCtxt *cd = vctxt;
 | |
|     const char *ucv_in = (const char *) in;
 | |
|     char *ucv_out = (char *) out;
 | |
|     UConverter *target, *source;
 | |
|     UErrorCode err = U_ZERO_ERROR;
 | |
|     int ret;
 | |
| 
 | |
|     if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL)) {
 | |
|         if (outlen != NULL)
 | |
|             *outlen = 0;
 | |
|         return XML_ENC_ERR_INTERNAL;
 | |
|     }
 | |
| 
 | |
|     /*
 | |
|      * The ICU API can consume input, including partial sequences,
 | |
|      * even if the output buffer would overflow. The remaining input
 | |
|      * must be processed by calling ucnv_convertEx with a possibly
 | |
|      * empty input buffer.
 | |
|      */
 | |
|     if (cd->isInput) {
 | |
|         source = cd->uconv;
 | |
|         target = cd->utf8;
 | |
|     } else {
 | |
|         source = cd->utf8;
 | |
|         target = cd->uconv;
 | |
|     }
 | |
| 
 | |
|     ucnv_convertEx(target, source, &ucv_out, ucv_out + *outlen,
 | |
|                    &ucv_in, ucv_in + *inlen, cd->pivot_buf,
 | |
|                    &cd->pivot_source, &cd->pivot_target,
 | |
|                    cd->pivot_buf + ICU_PIVOT_BUF_SIZE,
 | |
|                    /* reset */ 0, flush, &err);
 | |
| 
 | |
|     *inlen = ucv_in - (const char*) in;
 | |
|     *outlen = ucv_out - (char *) out;
 | |
| 
 | |
|     if (U_SUCCESS(err)) {
 | |
|         ret = XML_ENC_ERR_SUCCESS;
 | |
|     } else {
 | |
|         switch (err) {
 | |
|             case U_TRUNCATED_CHAR_FOUND:
 | |
|                 /* Should only happen with flush */
 | |
|                 ret = XML_ENC_ERR_INPUT;
 | |
|                 break;
 | |
| 
 | |
|             case U_BUFFER_OVERFLOW_ERROR:
 | |
|                 ret = XML_ENC_ERR_SPACE;
 | |
|                 break;
 | |
| 
 | |
|             case U_INVALID_CHAR_FOUND:
 | |
|             case U_ILLEGAL_CHAR_FOUND:
 | |
|             case U_ILLEGAL_ESCAPE_SEQUENCE:
 | |
|             case U_UNSUPPORTED_ESCAPE_SEQUENCE:
 | |
|                 ret = XML_ENC_ERR_INPUT;
 | |
|                 break;
 | |
| 
 | |
|             case U_MEMORY_ALLOCATION_ERROR:
 | |
|                 ret = XML_ENC_ERR_MEMORY;
 | |
|                 break;
 | |
| 
 | |
|             default:
 | |
|                 ret = XML_ENC_ERR_INTERNAL;
 | |
|                 break;
 | |
|         }
 | |
|     }
 | |
| 
 | |
|     return ret;
 | |
| }
 | |
| 
 | |
| static xmlParserErrors
 | |
| icuOpen(const char* name, int isInput, myConvCtxt **out)
 | |
| {
 | |
|     UErrorCode status;
 | |
|     myConvCtxt *cd;
 | |
| 
 | |
|     *out = NULL;
 | |
| 
 | |
|     cd = xmlMalloc(sizeof(myConvCtxt));
 | |
|     if (cd == NULL)
 | |
|         return XML_ERR_NO_MEMORY;
 | |
| 
 | |
|     cd->isInput = isInput;
 | |
|     cd->pivot_source = cd->pivot_buf;
 | |
|     cd->pivot_target = cd->pivot_buf;
 | |
| 
 | |
|     status = U_ZERO_ERROR;
 | |
|     cd->uconv = ucnv_open(name, &status);
 | |
|     if (U_FAILURE(status))
 | |
|         goto error;
 | |
| 
 | |
|     status = U_ZERO_ERROR;
 | |
|     if (isInput) {
 | |
|         ucnv_setToUCallBack(cd->uconv, UCNV_TO_U_CALLBACK_STOP,
 | |
|                             NULL, NULL, NULL, &status);
 | |
|     }
 | |
|     else {
 | |
|         ucnv_setFromUCallBack(cd->uconv, UCNV_FROM_U_CALLBACK_STOP,
 | |
|                               NULL, NULL, NULL, &status);
 | |
|     }
 | |
|     if (U_FAILURE(status))
 | |
|         goto error;
 | |
| 
 | |
|     status = U_ZERO_ERROR;
 | |
|     cd->utf8 = ucnv_open("UTF-8", &status);
 | |
|     if (U_FAILURE(status))
 | |
|         goto error;
 | |
| 
 | |
|     *out = cd;
 | |
|     return 0;
 | |
| 
 | |
| error:
 | |
|     if (cd->uconv)
 | |
|         ucnv_close(cd->uconv);
 | |
|     xmlFree(cd);
 | |
| 
 | |
|     if (status == U_FILE_ACCESS_ERROR)
 | |
|         return XML_ERR_UNSUPPORTED_ENCODING;
 | |
|     if (status == U_MEMORY_ALLOCATION_ERROR)
 | |
|         return XML_ERR_NO_MEMORY;
 | |
|     return XML_ERR_SYSTEM;
 | |
| }
 | |
| 
 | |
| static void
 | |
| icuClose(myConvCtxt *cd)
 | |
| {
 | |
|     if (cd == NULL)
 | |
|         return;
 | |
|     ucnv_close(cd->uconv);
 | |
|     ucnv_close(cd->utf8);
 | |
|     xmlFree(cd);
 | |
| }
 | |
| 
 | |
| static void
 | |
| icuConvCtxtDtor(void *vctxt) {
 | |
|     icuClose(vctxt);
 | |
| }
 | |
| 
 | |
| static xmlParserErrors
 | |
| icuConvImpl(void *vctxt, const char *name, xmlCharEncFlags flags,
 | |
|             xmlCharEncodingHandler **result) {
 | |
|     xmlCharEncConvFunc inFunc = NULL, outFunc = NULL;
 | |
|     myConvCtxt *inputCtxt = NULL;
 | |
|     myConvCtxt *outputCtxt = NULL;
 | |
|     xmlParserErrors ret;
 | |
| 
 | |
|     if (flags & XML_ENC_INPUT) {
 | |
|         ret = icuOpen(name, 1, &inputCtxt);
 | |
|         if (ret != 0)
 | |
|             goto error;
 | |
|         inFunc = icuConvert;
 | |
|     }
 | |
| 
 | |
|     if (flags & XML_ENC_OUTPUT) {
 | |
|         ret = icuOpen(name, 0, &outputCtxt);
 | |
|         if (ret != 0)
 | |
|             goto error;
 | |
|         outFunc = icuConvert;
 | |
|     }
 | |
| 
 | |
|     return xmlCharEncNewCustomHandler(name, inFunc, outFunc, icuConvCtxtDtor,
 | |
|                                       inputCtxt, outputCtxt, result);
 | |
| 
 | |
| error:
 | |
|     if (inputCtxt != NULL)
 | |
|         icuClose(inputCtxt);
 | |
|     if (outputCtxt != NULL)
 | |
|         icuClose(outputCtxt);
 | |
|     return ret;
 | |
| }
 | |
| 
 | |
| int
 | |
| main(void) {
 | |
|     xmlParserCtxtPtr ctxt;
 | |
|     xmlDocPtr doc;
 | |
|     const char *xml;
 | |
|     xmlChar *content;
 | |
|     int ret = 0;
 | |
| 
 | |
|     /*
 | |
|      * We use IBM-1051, an alias for HP Roman, as a simple example that
 | |
|      * ICU supports, but iconv (typically) doesn't.
 | |
|      *
 | |
|      * Character code 0xDE is U+00DF Latin Small Letter Sharp S.
 | |
|      */
 | |
|     xml = "<doc>\xDE</doc>";
 | |
| 
 | |
|     ctxt = xmlNewParserCtxt();
 | |
|     xmlCtxtSetCharEncConvImpl(ctxt, icuConvImpl, NULL);
 | |
|     doc = xmlCtxtReadDoc(ctxt, BAD_CAST xml, NULL, "IBM-1051", 0);
 | |
|     xmlFreeParserCtxt(ctxt);
 | |
| 
 | |
|     content = xmlNodeGetContent((xmlNodePtr) doc);
 | |
| 
 | |
|     printf("content: %s\n", content);
 | |
| 
 | |
|     if (!xmlStrEqual(content, BAD_CAST "\xC3\x9F")) {
 | |
|         fprintf(stderr, "conversion failed\n");
 | |
|         ret = 1;
 | |
|     }
 | |
| 
 | |
|     xmlFree(content);
 | |
|     xmlFreeDoc(doc);
 | |
| 
 | |
|     return ret;
 | |
| }
 | |
| 
 |