- parser.c parserInternals.c encoding.c: Since Notepad on Win2k

outputs a BOM in UTF8, an errata has been issued to avoid the problem, that was the most reasonable solution... Add support for a leading UTF8 BOM in entities. Daniel
2025-07-30 22:43:14 +03:00 · 2001-06-20 17:41:10 +00:00
parent 10ea86cba4
commit 87a764ed85
4 changed files with 110 additions and 2 deletions
--- a/7
+++ b/7
@ -1,3 +1,10 @@
 Wed Jun 20 19:37:25 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
 	* parser.c parserInternals.c encoding.c: Since Notepad on Win2k
 	  outputs a BOM in UTF8, an errata has been issued to avoid the
 	  problem, that was the most reasonable solution... Add support
 	  for a leading UTF8 BOM in entities.
 Wed Jun 20 15:38:59 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
 	* valid.c: fixed a bug found when post validating an entity ref
--- a/encoding.c
+++ b/encoding.c
@ -1131,6 +1131,15 @@ xmlDetectCharEncoding(const unsigned char* in, int len)
 	    (in[2] == 0x78) && (in[3] == 0x6D))
 	    return(XML_CHAR_ENCODING_UTF8);
    }
    if (len >= 3) {
 	/*
 	 * Errata on XML-1.0 June 20 2001
 	 * We now allow an UTF8 encoded BOM
 	 */
 	if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
 	    (in[2] == 0xBF))
 	    return(XML_CHAR_ENCODING_UTF8);
    }
    if (len >= 2) {
 	if ((in[0] == 0xFE) && (in[1] == 0xFF))
 	    return(XML_CHAR_ENCODING_UTF16BE);
--- a/parser.c
+++ b/parser.c
@ -768,6 +768,9 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) {
 	    } else {
 	        if ((entity->etype == XML_INTERNAL_PARAMETER_ENTITY) ||
 		    (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY)) {
 		    xmlChar start[4];
 		    xmlCharEncoding enc;
 		    /*
 		     * handle the extra spaces added before and after
 		     * c.f. http://www.w3.org/TR/REC-xml#as-PE
@ -775,6 +778,22 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) {
 		     */
 		    input = xmlNewEntityInputStream(ctxt, entity);
 		    xmlPushInput(ctxt, input);
 		    /* 
 		     * Get the 4 first bytes and decode the charset
 		     * if enc != XML_CHAR_ENCODING_NONE
 		     * plug some encoding conversion routines.
 		     */
 		    GROW
 		    start[0] = RAW;
 		    start[1] = NXT(1);
 		    start[2] = NXT(2);
 		    start[3] = NXT(3);
 		    enc = xmlDetectCharEncoding(start, 4);
 		    if (enc != XML_CHAR_ENCODING_NONE) {
 			xmlSwitchEncoding(ctxt, enc);
 		    }
 		    if ((entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) &&
 			(RAW == '<') && (NXT(1) == '?') &&
 			(NXT(2) == 'x') && (NXT(3) == 'm') &&
@ -8585,6 +8604,7 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
    xmlDtdPtr ret = NULL;
    xmlParserCtxtPtr ctxt;
    xmlParserInputPtr pinput = NULL;
    xmlChar start[4];
    if (input == NULL)
 	return(NULL);
@ -8634,6 +8654,23 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
    ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0");
    ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none",
 	                               BAD_CAST "none", BAD_CAST "none");
    if (enc == XML_CHAR_ENCODING_NONE) {
 	/* 
 	 * Get the 4 first bytes and decode the charset
 	 * if enc != XML_CHAR_ENCODING_NONE
 	 * plug some encoding conversion routines.
 	 */
 	start[0] = RAW;
 	start[1] = NXT(1);
 	start[2] = NXT(2);
 	start[3] = NXT(3);
 	enc = xmlDetectCharEncoding(start, 4);
 	if (enc != XML_CHAR_ENCODING_NONE) {
 	    xmlSwitchEncoding(ctxt, enc);
 	}
    }
    xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none");
    if (ctxt->myDoc != NULL) {
@ -8785,6 +8822,8 @@ xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx, const xmlChar *URL,
    xmlDocPtr newDoc;
    xmlSAXHandlerPtr oldsax = NULL;
    int ret = 0;
    xmlChar start[4];
    xmlCharEncoding enc;
    if (ctx->depth > 40) {
 	return(XML_ERR_ENTITY_LOOP);
@ -8832,10 +8871,24 @@ xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx, const xmlChar *URL,
 	newDoc->children->doc = ctx->myDoc;
    }
    /* 
     * Get the 4 first bytes and decode the charset
     * if enc != XML_CHAR_ENCODING_NONE
     * plug some encoding conversion routines.
     */
    GROW
    start[0] = RAW;
    start[1] = NXT(1);
    start[2] = NXT(2);
    start[3] = NXT(3);
    enc = xmlDetectCharEncoding(start, 4);
    if (enc != XML_CHAR_ENCODING_NONE) {
        xmlSwitchEncoding(ctxt, enc);
    }
    /*
     * Parse a possible text declaration first
     */
    GROW;
    if ((RAW == '<') && (NXT(1) == '?') &&
 	(NXT(2) == 'x') && (NXT(3) == 'm') &&
 	(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
@ -8946,6 +8999,8 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
    xmlDocPtr newDoc;
    xmlSAXHandlerPtr oldsax = NULL;
    int ret = 0;
    xmlChar start[4];
    xmlCharEncoding enc;
    if (depth > 40) {
 	return(XML_ERR_ENTITY_LOOP);
@ -9015,10 +9070,24 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
 	newDoc->children->doc = doc;
    }
    /* 
     * Get the 4 first bytes and decode the charset
     * if enc != XML_CHAR_ENCODING_NONE
     * plug some encoding conversion routines.
     */
    GROW;
    start[0] = RAW;
    start[1] = NXT(1);
    start[2] = NXT(2);
    start[3] = NXT(3);
    enc = xmlDetectCharEncoding(start, 4);
    if (enc != XML_CHAR_ENCODING_NONE) {
        xmlSwitchEncoding(ctxt, enc);
    }
    /*
     * Parse a possible text declaration first
     */
    GROW;
    if ((RAW == '<') && (NXT(1) == '?') &&
 	(NXT(2) == 'x') && (NXT(3) == 'm') &&
 	(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
--- a/parserInternals.c
+++ b/parserInternals.c
@ -1569,6 +1569,17 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
 	case XML_CHAR_ENCODING_UTF8:
 	    /* default encoding, no conversion should be needed */
 	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
 	    /*
 	     * Errata on XML-1.0 June 20 2001
 	     * Specific handling of the Byte Order Mark for
 	     * UTF-8
 	     */
 	    if ((ctxt->input->cur[0] == 0xEF) &&
 		(ctxt->input->cur[1] == 0xBB) &&
 		(ctxt->input->cur[2] == 0xBF)) {
 		ctxt->input->cur += 3;
 	    }
 	    return(0);
 	default:
 	    break;
@ -1739,6 +1750,18 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
 		        (ctxt->input->cur[1] == 0xFF)) {
 			ctxt->input->cur += 2;
 		    }
 		    /*
 		     * Errata on XML-1.0 June 20 2001
 		     * Specific handling of the Byte Order Mark for
 		     * UTF-8: when the handler is named "UTF-8" and the
 		     * current input position holds the three bytes
 		     * 0xEF 0xBB 0xBF, advance past them.
 		     * Each of the three bytes is tested at its own index
 		     * (cur[0], cur[1], cur[2]).
 		     */
 		    if ((handler->name != NULL) &&
 			(!strcmp(handler->name, "UTF-8")) &&
 			(ctxt->input->cur[0] == 0xEF) &&
 			(ctxt->input->cur[1] == 0xBB) &&
 			(ctxt->input->cur[2] == 0xBF)) {
 			ctxt->input->cur += 3;
 		    }
 		    /*
 		     * Shrink the current input buffer.