diff --git a/ChangeLog b/ChangeLog
index 800d9718..7799b531 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+Wed Jun 20 19:37:25 CEST 2001 Daniel Veillard
+
+        * parser.c parserInternals.c encoding.c: Since Notepad on Win2k
+          outputs a BOM in UTF8, an errata has been issued to avoid the
+          problem, that was the most reasonable solution... Add support
+          for a leading UTF8 BOM in entities.
+
 Wed Jun 20 15:38:59 CEST 2001 Daniel Veillard
 
         * valid.c: fixed a bug found when post validating an entity ref
diff --git a/encoding.c b/encoding.c
index f86adf3e..df760f76 100644
--- a/encoding.c
+++ b/encoding.c
@@ -1131,6 +1131,15 @@ xmlDetectCharEncoding(const unsigned char* in, int len)
             (in[2] == 0x78) && (in[3] == 0x6D))
             return(XML_CHAR_ENCODING_UTF8);
     }
+    if (len >= 3) {
+        /*
+         * Errata on XML-1.0 June 20 2001
+         * We now allow an UTF8 encoded BOM
+         */
+        if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
+            (in[2] == 0xBF))
+            return(XML_CHAR_ENCODING_UTF8);
+    }
     if (len >= 2) {
         if ((in[0] == 0xFE) && (in[1] == 0xFF))
             return(XML_CHAR_ENCODING_UTF16BE);
diff --git a/parser.c b/parser.c
index d7c7c565..f1e90ca3 100644
--- a/parser.c
+++ b/parser.c
@@ -768,6 +768,9 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) {
             } else {
                 if ((entity->etype == XML_INTERNAL_PARAMETER_ENTITY) ||
                     (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY)) {
+                    xmlChar start[4];
+                    xmlCharEncoding enc;
+
                     /*
                      * handle the extra spaces added before and after
                      * c.f. http://www.w3.org/TR/REC-xml#as-PE
@@ -775,6 +778,22 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) {
                      */
                     input = xmlNewEntityInputStream(ctxt, entity);
                     xmlPushInput(ctxt, input);
+
+                    /*
+                     * Get the 4 first bytes and decode the charset
+                     * if enc != XML_CHAR_ENCODING_NONE
+                     * plug some encoding conversion routines.
+                     */
+                    GROW;
+                    start[0] = RAW;
+                    start[1] = NXT(1);
+                    start[2] = NXT(2);
+                    start[3] = NXT(3);
+                    enc = xmlDetectCharEncoding(start, 4);
+                    if (enc != XML_CHAR_ENCODING_NONE) {
+                        xmlSwitchEncoding(ctxt, enc);
+                    }
+
                     if ((entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) &&
                         (RAW == '<') && (NXT(1) == '?') &&
                         (NXT(2) == 'x') && (NXT(3) == 'm') &&
@@ -8585,6 +8604,7 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
     xmlDtdPtr ret = NULL;
     xmlParserCtxtPtr ctxt;
     xmlParserInputPtr pinput = NULL;
+    xmlChar start[4];
 
     if (input == NULL)
         return(NULL);
@@ -8634,6 +8654,23 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
     ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0");
     ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none",
                                        BAD_CAST "none", BAD_CAST "none");
+
+    if (enc == XML_CHAR_ENCODING_NONE) {
+        /*
+         * Get the 4 first bytes and decode the charset
+         * if enc != XML_CHAR_ENCODING_NONE
+         * plug some encoding conversion routines.
+         */
+        start[0] = RAW;
+        start[1] = NXT(1);
+        start[2] = NXT(2);
+        start[3] = NXT(3);
+        enc = xmlDetectCharEncoding(start, 4);
+        if (enc != XML_CHAR_ENCODING_NONE) {
+            xmlSwitchEncoding(ctxt, enc);
+        }
+    }
+
     xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none");
 
     if (ctxt->myDoc != NULL) {
@@ -8785,6 +8822,8 @@ xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx, const xmlChar *URL,
     xmlDocPtr newDoc;
     xmlSAXHandlerPtr oldsax = NULL;
     int ret = 0;
+    xmlChar start[4];
+    xmlCharEncoding enc;
 
     if (ctx->depth > 40) {
         return(XML_ERR_ENTITY_LOOP);
@@ -8832,10 +8871,24 @@ xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx, const xmlChar *URL,
         newDoc->children->doc = ctx->myDoc;
     }
 
+    /*
+     * Get the 4 first bytes and decode the charset
+     * if enc != XML_CHAR_ENCODING_NONE
+     * plug some encoding conversion routines.
+     */
+    GROW;
+    start[0] = RAW;
+    start[1] = NXT(1);
+    start[2] = NXT(2);
+    start[3] = NXT(3);
+    enc = xmlDetectCharEncoding(start, 4);
+    if (enc != XML_CHAR_ENCODING_NONE) {
+        xmlSwitchEncoding(ctxt, enc);
+    }
+
     /*
      * Parse a possible text declaration first
      */
-    GROW;
     if ((RAW == '<') && (NXT(1) == '?') &&
         (NXT(2) == 'x') && (NXT(3) == 'm') &&
         (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
@@ -8946,6 +8999,8 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
     xmlDocPtr newDoc;
     xmlSAXHandlerPtr oldsax = NULL;
     int ret = 0;
+    xmlChar start[4];
+    xmlCharEncoding enc;
 
     if (depth > 40) {
         return(XML_ERR_ENTITY_LOOP);
@@ -9015,10 +9070,24 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
         newDoc->children->doc = doc;
     }
 
+    /*
+     * Get the 4 first bytes and decode the charset
+     * if enc != XML_CHAR_ENCODING_NONE
+     * plug some encoding conversion routines.
+     */
+    GROW;
+    start[0] = RAW;
+    start[1] = NXT(1);
+    start[2] = NXT(2);
+    start[3] = NXT(3);
+    enc = xmlDetectCharEncoding(start, 4);
+    if (enc != XML_CHAR_ENCODING_NONE) {
+        xmlSwitchEncoding(ctxt, enc);
+    }
+
     /*
      * Parse a possible text declaration first
      */
-    GROW;
     if ((RAW == '<') && (NXT(1) == '?') &&
         (NXT(2) == 'x') && (NXT(3) == 'm') &&
         (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
diff --git a/parserInternals.c b/parserInternals.c
index 4039c99a..90b48122 100644
--- a/parserInternals.c
+++ b/parserInternals.c
@@ -1569,6 +1569,17 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
         case XML_CHAR_ENCODING_UTF8:
             /* default encoding, no conversion should be needed */
             ctxt->charset = XML_CHAR_ENCODING_UTF8;
+
+            /*
+             * Errata on XML-1.0 June 20 2001
+             * Specific handling of the Byte Order Mark for
+             * UTF-8
+             */
+            if ((ctxt->input->cur[0] == 0xEF) &&
+                (ctxt->input->cur[1] == 0xBB) &&
+                (ctxt->input->cur[2] == 0xBF)) {
+                ctxt->input->cur += 3;
+            }
             return(0);
         default:
             break;
@@ -1739,6 +1750,18 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
                     (ctxt->input->cur[1] == 0xFF)) {
                     ctxt->input->cur += 2;
                 }
+                /*
+                 * Errata on XML-1.0 June 20 2001
+                 * Specific handling of the Byte Order Mark for
+                 * UTF-8
+                 */
+                if ((handler->name != NULL) &&
+                    (!strcmp(handler->name, "UTF-8")) &&
+                    (ctxt->input->cur[0] == 0xEF) &&
+                    (ctxt->input->cur[1] == 0xBB) &&
+                    (ctxt->input->cur[2] == 0xBF)) {
+                    ctxt->input->cur += 3;
+                }
 
                 /*
                  * Shring the current input buffer.