New set of cleanups, released 2.2.3:

- SAX.c debugXML.c parser.c parserInternals.c tree.c valid.c xpath.c: removed a few warnings in pedantic mode ... - parserInternals.c parser.c: moved encoding switching function to parserInternals.c - configure.in, doc/Makefile.am libxml.spec.in: released 2.2.3 Daniel
2025-07-29 11:41:22 +03:00 · 2000-09-17 16:00:22 +00:00
parent a2c6da94f8
commit 04698d9e1c
12 changed files with 371 additions and 317 deletions
--- a/parserInternals.c
+++ b/parserInternals.c
@ -1494,6 +1494,311 @@ xmlCopyChar(int len, xmlChar *out, int val) {
    return(1);
 }

+/************************************************************************
+ *									*
+ *		Commodity functions to switch encodings			*
+ *									*
+ ************************************************************************/
+
+/**
+ * xmlSwitchEncoding:
+ * @ctxt:  the parser context
+ * @enc:  the encoding value (number)
+ *
+ * change the input functions when discovering the character encoding
+ * of a given entity.
+ *
+ * Returns 0 in case of success, -1 otherwise
+ */
+int
+xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
+{
+    xmlCharEncodingHandlerPtr handler;
+
+    switch (enc) {
+	case XML_CHAR_ENCODING_ERROR:
+	    ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
+	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		ctxt->sax->error(ctxt->userData, "encoding unknown\n");
+	    ctxt->wellFormed = 0;
+	    ctxt->disableSAX = 1;
+	    break;
+	case XML_CHAR_ENCODING_NONE:
+	    /* let's assume it's UTF-8 without the XML decl */
+	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
+	    return(0);
+	case XML_CHAR_ENCODING_UTF8:
+	    /* default encoding, no conversion should be needed */
+	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
+	    return(0);
+	default:
+	    break;
+    }
+    handler = xmlGetCharEncodingHandler(enc);
+    if (handler == NULL) {
+	/*
+	 * Default handlers.
+	 */
+	switch (enc) {
+	    case XML_CHAR_ENCODING_ERROR:
+		ctxt->errNo = XML_ERR_UNKNOWN_ENCODING;
+		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		    ctxt->sax->error(ctxt->userData, "encoding unknown\n");
+		ctxt->wellFormed = 0;
+		ctxt->disableSAX = 1;
+		ctxt->charset = XML_CHAR_ENCODING_UTF8;
+		break;
+	    case XML_CHAR_ENCODING_NONE:
+		/* let's assume it's UTF-8 without the XML decl */
+		ctxt->charset = XML_CHAR_ENCODING_UTF8;
+		return(0);
+	    case XML_CHAR_ENCODING_UTF8:
+	    case XML_CHAR_ENCODING_ASCII:
+		/* default encoding, no conversion should be needed */
+		ctxt->charset = XML_CHAR_ENCODING_UTF8;
+		return(0);
+	    case XML_CHAR_ENCODING_UTF16LE:
+		break;
+	    case XML_CHAR_ENCODING_UTF16BE:
+		break;
+	    case XML_CHAR_ENCODING_UCS4LE:
+		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		    ctxt->sax->error(ctxt->userData,
+		      "char encoding USC4 little endian not supported\n");
+		break;
+	    case XML_CHAR_ENCODING_UCS4BE:
+		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		    ctxt->sax->error(ctxt->userData,
+		      "char encoding USC4 big endian not supported\n");
+		break;
+	    case XML_CHAR_ENCODING_EBCDIC:
+		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		    ctxt->sax->error(ctxt->userData,
+		      "char encoding EBCDIC not supported\n");
+		break;
+	    case XML_CHAR_ENCODING_UCS4_2143:
+		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		    ctxt->sax->error(ctxt->userData,
+		      "char encoding UCS4 2143 not supported\n");
+		break;
+	    case XML_CHAR_ENCODING_UCS4_3412:
+		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		    ctxt->sax->error(ctxt->userData,
+		      "char encoding UCS4 3412 not supported\n");
+		break;
+	    case XML_CHAR_ENCODING_UCS2:
+		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		    ctxt->sax->error(ctxt->userData,
+		      "char encoding UCS2 not supported\n");
+		break;
+	    case XML_CHAR_ENCODING_8859_1:
+	    case XML_CHAR_ENCODING_8859_2:
+	    case XML_CHAR_ENCODING_8859_3:
+	    case XML_CHAR_ENCODING_8859_4:
+	    case XML_CHAR_ENCODING_8859_5:
+	    case XML_CHAR_ENCODING_8859_6:
+	    case XML_CHAR_ENCODING_8859_7:
+	    case XML_CHAR_ENCODING_8859_8:
+	    case XML_CHAR_ENCODING_8859_9:
+		/*
+		 * We used to keep the internal content in the
+		 * document encoding however this turns being unmaintainable
+		 * So xmlGetCharEncodingHandler() will return non-null
+		 * values for this now.
+		 */
+		if ((ctxt->inputNr == 1) &&
+		    (ctxt->encoding == NULL) &&
+		    (ctxt->input->encoding != NULL)) {
+		    ctxt->encoding = xmlStrdup(ctxt->input->encoding);
+		}
+		ctxt->charset = enc;
+		return(0);
+	    case XML_CHAR_ENCODING_2022_JP:
+		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		    ctxt->sax->error(ctxt->userData,
+		      "char encoding ISO-2022-JPnot supported\n");
+		break;
+	    case XML_CHAR_ENCODING_SHIFT_JIS:
+		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		    ctxt->sax->error(ctxt->userData,
+		      "char encoding Shift_JIS not supported\n");
+		break;
+	    case XML_CHAR_ENCODING_EUC_JP:
+		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+		if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+		    ctxt->sax->error(ctxt->userData,
+		      "char encoding EUC-JPnot supported\n");
+		break;
+	}
+    }
+    if (handler == NULL)
+	return(-1);
+    ctxt->charset = XML_CHAR_ENCODING_UTF8;
+    return(xmlSwitchToEncoding(ctxt, handler));
+}
+
+/**
+ * xmlSwitchToEncoding:
+ * @ctxt:  the parser context
+ * @handler:  the encoding handler
+ *
+ * change the input functions when discovering the character encoding
+ * of a given entity.
+ *
+ * Returns 0 in case of success, -1 otherwise
+ */
+int
+xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) 
+{
+    int nbchars;
+
+    if (handler != NULL) {
+        if (ctxt->input != NULL) {
+	    if (ctxt->input->buf != NULL) {
+	        if (ctxt->input->buf->encoder != NULL) {
+		    if (ctxt->input->buf->encoder == handler)
+			return(0);
+		    /*
+		     * Note: this is a bit dangerous, but that's what it
+		     * takes to use nearly compatible signature for different
+		     * encodings.
+		     */
+		    xmlCharEncCloseFunc(ctxt->input->buf->encoder);
+		    ctxt->input->buf->encoder = handler;
+		    return(0);
+		}
+		ctxt->input->buf->encoder = handler;
+
+	        /*
+		 * Is there already some content down the pipe to convert ?
+		 */
+	        if ((ctxt->input->buf->buffer != NULL) &&
+		    (ctxt->input->buf->buffer->use > 0)) {
+		    int processed;
+
+		    /*
+		     * Specific handling of the Byte Order Mark for 
+		     * UTF-16
+		     */
+		    if ((handler->name != NULL) &&
+			(!strcmp(handler->name, "UTF-16LE")) && 
+		        (ctxt->input->cur[0] == 0xFF) &&
+		        (ctxt->input->cur[1] == 0xFE)) {
+			ctxt->input->cur += 2;
+		    }
+		    if ((handler->name != NULL) &&
+			(!strcmp(handler->name, "UTF-16BE")) && 
+		        (ctxt->input->cur[0] == 0xFE) &&
+		        (ctxt->input->cur[1] == 0xFF)) {
+			ctxt->input->cur += 2;
+		    }
+
+		    /*
+		     * Shring the current input buffer.
+		     * Move it as the raw buffer and create a new input buffer
+		     */
+		    processed = ctxt->input->cur - ctxt->input->base;
+		    xmlBufferShrink(ctxt->input->buf->buffer, processed);
+		    ctxt->input->buf->raw = ctxt->input->buf->buffer;
+		    ctxt->input->buf->buffer = xmlBufferCreate();
+
+		    if (ctxt->html) {
+			/*
+			 * converst as much as possbile of the buffer
+			 */
+			nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
+				                   ctxt->input->buf->buffer,
+						   ctxt->input->buf->raw);
+		    } else {
+			/*
+			 * convert just enough to get
+			 * '<?xml version="1.0" encoding="xxx"?>'
+			 * parsed with the autodetected encoding
+			 * into the parser reading buffer.
+			 */
+			nbchars = xmlCharEncFirstLine(ctxt->input->buf->encoder,
+						      ctxt->input->buf->buffer,
+						      ctxt->input->buf->raw);
+		    }
+		    if (nbchars < 0) {
+			fprintf(stderr, "xmlSwitchToEncoding: encoder error\n");
+			return(-1);
+		    }
+		    ctxt->input->base =
+		    ctxt->input->cur = ctxt->input->buf->buffer->content;
+
+		}
+		return(0);
+	    } else {
+	        if ((ctxt->input->length == 0) || (ctxt->input->buf == NULL)) {
+		    /*
+		     * When parsing a static memory array one must know the
+		     * size to be able to convert the buffer.
+		     */
+		    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+			ctxt->sax->error(ctxt->userData,
+					 "xmlSwitchEncoding : no input\n");
+		    return(-1);
+		} else {
+		    int processed;
+
+		    /*
+		     * Shring the current input buffer.
+		     * Move it as the raw buffer and create a new input buffer
+		     */
+		    processed = ctxt->input->cur - ctxt->input->base;
+
+		    ctxt->input->buf->raw = xmlBufferCreate();
+		    xmlBufferAdd(ctxt->input->buf->raw, ctxt->input->cur,
+				 ctxt->input->length - processed);
+		    ctxt->input->buf->buffer = xmlBufferCreate();
+
+		    /*
+		     * convert as much as possible of the raw input
+		     * to the parser reading buffer.
+		     */
+		    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
+		                               ctxt->input->buf->buffer,
+					       ctxt->input->buf->raw);
+		    if (nbchars < 0) {
+			fprintf(stderr, "xmlSwitchToEncoding: encoder error\n");
+			return(-1);
+		    }
+
+		    /*
+		     * Conversion succeeded, get rid of the old buffer
+		     */
+		    if ((ctxt->input->free != NULL) &&
+		        (ctxt->input->base != NULL))
+			ctxt->input->free((xmlChar *) ctxt->input->base);
+		    ctxt->input->base =
+		    ctxt->input->cur = ctxt->input->buf->buffer->content;
+		}
+	    }
+	} else {
+	    if ((ctxt->sax != NULL) && (ctxt->sax->error != NULL))
+	        ctxt->sax->error(ctxt->userData,
+		                 "xmlSwitchEncoding : no input\n");
+	    return(-1);
+	}
+	/*
+	 * The parsing is now done in UTF8 natively
+	 */
+	ctxt->charset = XML_CHAR_ENCODING_UTF8;
+    } else 
+	return(-1);
+    return(0);
+
+}
+
 /************************************************************************
 *									*
 *	Commodity functions to handle entities processing		*
@ -1705,7 +2010,7 @@ xmlNewInputFromFile(xmlParserCtxtPtr ctxt, const char *filename) {
 	return(NULL);

    URI = xmlStrdup((xmlChar *) filename);
-    directory = xmlParserGetDirectory(URI);
+    directory = xmlParserGetDirectory((const char *) URI);

    inputStream = xmlNewInputStream(ctxt);
    if (inputStream == NULL) {
@ -1714,7 +2019,7 @@ xmlNewInputFromFile(xmlParserCtxtPtr ctxt, const char *filename) {
 	return(NULL);
    }

-    inputStream->filename = URI;
+    inputStream->filename = (const char *) URI;
    inputStream->directory = directory;
    inputStream->buf = buf;