1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2025-07-30 22:43:14 +03:00

- parser.c parserInternals.c encoding.c: Since Notepad on Win2k

outputs a BOM in UTF-8, an erratum has been issued to avoid the
  problem; that was the most reasonable solution... Add support
  for a leading UTF-8 BOM in entities.
Daniel
This commit is contained in:
Daniel Veillard
2001-06-20 17:41:10 +00:00
parent 10ea86cba4
commit 87a764ed85
4 changed files with 110 additions and 2 deletions

View File

@ -1,3 +1,10 @@
Wed Jun 20 19:37:25 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
* parser.c parserInternals.c encoding.c: Since Notepad on Win2k
outputs a BOM in UTF-8, an erratum has been issued to avoid the
problem; that was the most reasonable solution... Add support
for a leading UTF-8 BOM in entities.
Wed Jun 20 15:38:59 CEST 2001 Daniel Veillard <Daniel.Veillard@imag.fr>
* valid.c: fixed a bug found when post validating an entity ref

View File

@ -1131,6 +1131,15 @@ xmlDetectCharEncoding(const unsigned char* in, int len)
(in[2] == 0x78) && (in[3] == 0x6D)) (in[2] == 0x78) && (in[3] == 0x6D))
return(XML_CHAR_ENCODING_UTF8); return(XML_CHAR_ENCODING_UTF8);
} }
if (len >= 3) {
/*
* Errata on XML-1.0 June 20 2001
* We now allow an UTF8 encoded BOM
*/
if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
(in[2] == 0xBF))
return(XML_CHAR_ENCODING_UTF8);
}
if (len >= 2) { if (len >= 2) {
if ((in[0] == 0xFE) && (in[1] == 0xFF)) if ((in[0] == 0xFE) && (in[1] == 0xFF))
return(XML_CHAR_ENCODING_UTF16BE); return(XML_CHAR_ENCODING_UTF16BE);

View File

@ -768,6 +768,9 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) {
} else { } else {
if ((entity->etype == XML_INTERNAL_PARAMETER_ENTITY) || if ((entity->etype == XML_INTERNAL_PARAMETER_ENTITY) ||
(entity->etype == XML_EXTERNAL_PARAMETER_ENTITY)) { (entity->etype == XML_EXTERNAL_PARAMETER_ENTITY)) {
xmlChar start[4];
xmlCharEncoding enc;
/* /*
* handle the extra spaces added before and after * handle the extra spaces added before and after
* c.f. http://www.w3.org/TR/REC-xml#as-PE * c.f. http://www.w3.org/TR/REC-xml#as-PE
@ -775,6 +778,22 @@ xmlParserHandlePEReference(xmlParserCtxtPtr ctxt) {
*/ */
input = xmlNewEntityInputStream(ctxt, entity); input = xmlNewEntityInputStream(ctxt, entity);
xmlPushInput(ctxt, input); xmlPushInput(ctxt, input);
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
GROW
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
if ((entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) && if ((entity->etype == XML_EXTERNAL_PARAMETER_ENTITY) &&
(RAW == '<') && (NXT(1) == '?') && (RAW == '<') && (NXT(1) == '?') &&
(NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(2) == 'x') && (NXT(3) == 'm') &&
@ -8585,6 +8604,7 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
xmlDtdPtr ret = NULL; xmlDtdPtr ret = NULL;
xmlParserCtxtPtr ctxt; xmlParserCtxtPtr ctxt;
xmlParserInputPtr pinput = NULL; xmlParserInputPtr pinput = NULL;
xmlChar start[4];
if (input == NULL) if (input == NULL)
return(NULL); return(NULL);
@ -8634,6 +8654,23 @@ xmlIOParseDTD(xmlSAXHandlerPtr sax, xmlParserInputBufferPtr input,
ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0"); ctxt->myDoc = xmlNewDoc(BAD_CAST "1.0");
ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none", ctxt->myDoc->extSubset = xmlNewDtd(ctxt->myDoc, BAD_CAST "none",
BAD_CAST "none", BAD_CAST "none"); BAD_CAST "none", BAD_CAST "none");
if (enc == XML_CHAR_ENCODING_NONE) {
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
}
xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none"); xmlParseExternalSubset(ctxt, BAD_CAST "none", BAD_CAST "none");
if (ctxt->myDoc != NULL) { if (ctxt->myDoc != NULL) {
@ -8785,6 +8822,8 @@ xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx, const xmlChar *URL,
xmlDocPtr newDoc; xmlDocPtr newDoc;
xmlSAXHandlerPtr oldsax = NULL; xmlSAXHandlerPtr oldsax = NULL;
int ret = 0; int ret = 0;
xmlChar start[4];
xmlCharEncoding enc;
if (ctx->depth > 40) { if (ctx->depth > 40) {
return(XML_ERR_ENTITY_LOOP); return(XML_ERR_ENTITY_LOOP);
@ -8832,10 +8871,24 @@ xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx, const xmlChar *URL,
newDoc->children->doc = ctx->myDoc; newDoc->children->doc = ctx->myDoc;
} }
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
GROW
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
/* /*
* Parse a possible text declaration first * Parse a possible text declaration first
*/ */
GROW;
if ((RAW == '<') && (NXT(1) == '?') && if ((RAW == '<') && (NXT(1) == '?') &&
(NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) { (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {
@ -8946,6 +8999,8 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
xmlDocPtr newDoc; xmlDocPtr newDoc;
xmlSAXHandlerPtr oldsax = NULL; xmlSAXHandlerPtr oldsax = NULL;
int ret = 0; int ret = 0;
xmlChar start[4];
xmlCharEncoding enc;
if (depth > 40) { if (depth > 40) {
return(XML_ERR_ENTITY_LOOP); return(XML_ERR_ENTITY_LOOP);
@ -9015,10 +9070,24 @@ xmlParseExternalEntityPrivate(xmlDocPtr doc, xmlParserCtxtPtr oldctxt,
newDoc->children->doc = doc; newDoc->children->doc = doc;
} }
/*
* Get the 4 first bytes and decode the charset
* if enc != XML_CHAR_ENCODING_NONE
* plug some encoding conversion routines.
*/
GROW;
start[0] = RAW;
start[1] = NXT(1);
start[2] = NXT(2);
start[3] = NXT(3);
enc = xmlDetectCharEncoding(start, 4);
if (enc != XML_CHAR_ENCODING_NONE) {
xmlSwitchEncoding(ctxt, enc);
}
/* /*
* Parse a possible text declaration first * Parse a possible text declaration first
*/ */
GROW;
if ((RAW == '<') && (NXT(1) == '?') && if ((RAW == '<') && (NXT(1) == '?') &&
(NXT(2) == 'x') && (NXT(3) == 'm') && (NXT(2) == 'x') && (NXT(3) == 'm') &&
(NXT(4) == 'l') && (IS_BLANK(NXT(5)))) { (NXT(4) == 'l') && (IS_BLANK(NXT(5)))) {

View File

@ -1569,6 +1569,17 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
case XML_CHAR_ENCODING_UTF8: case XML_CHAR_ENCODING_UTF8:
/* default encoding, no conversion should be needed */ /* default encoding, no conversion should be needed */
ctxt->charset = XML_CHAR_ENCODING_UTF8; ctxt->charset = XML_CHAR_ENCODING_UTF8;
/*
* Errata on XML-1.0 June 20 2001
* Specific handling of the Byte Order Mark for
* UTF-8
*/
if ((ctxt->input->cur[0] == 0xEF) &&
(ctxt->input->cur[1] == 0xBB) &&
(ctxt->input->cur[2] == 0xBF)) {
ctxt->input->cur += 3;
}
return(0); return(0);
default: default:
break; break;
@ -1739,6 +1750,18 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
(ctxt->input->cur[1] == 0xFF)) { (ctxt->input->cur[1] == 0xFF)) {
ctxt->input->cur += 2; ctxt->input->cur += 2;
} }
/*
 * Errata on XML-1.0 June 20 2001
 * Specific handling of the Byte Order Mark for
 * UTF-8
 *
 * When the handler is the UTF-8 one and the input starts with
 * the three-byte BOM EF BB BF, step over it. Each byte is checked
 * at its own offset: cur[0], cur[1] and cur[2].
 */
if ((handler->name != NULL) &&
    (!strcmp(handler->name, "UTF-8")) &&
    (ctxt->input->cur[0] == 0xEF) &&
    (ctxt->input->cur[1] == 0xBB) &&
    (ctxt->input->cur[2] == 0xBF)) {
    ctxt->input->cur += 3;
}
/* /*
* Shrink the current input buffer. * Shrink the current input buffer.