1
0
mirror of https://gitlab.gnome.org/GNOME/libxml2.git synced 2026-01-26 21:41:34 +03:00

parser: Improve handling of NOBLANKS option

Don't change the SAX handler.

Use a helper function to invoke "characters" SAX callback.

The old code didn't advance the input pointer consistently before
invoking the callback. There was also some inconsistency with regard to
ctxt->space handling. I don't understand the ctxt->space thing, but
now we always behave like the non-complex case before.
This commit is contained in:
Nick Wellnhofer
2025-01-31 16:21:20 +01:00
parent 7a8722f557
commit 1f5b5371cf
2 changed files with 39 additions and 97 deletions

2
SAX2.c
View File

@@ -2733,7 +2733,7 @@ xmlSAXVersion(xmlSAXHandler *hdlr, int version)
hdlr->reference = xmlSAX2Reference; hdlr->reference = xmlSAX2Reference;
hdlr->characters = xmlSAX2Characters; hdlr->characters = xmlSAX2Characters;
hdlr->cdataBlock = xmlSAX2CDataBlock; hdlr->cdataBlock = xmlSAX2CDataBlock;
hdlr->ignorableWhitespace = xmlSAX2Characters; hdlr->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
hdlr->processingInstruction = xmlSAX2ProcessingInstruction; hdlr->processingInstruction = xmlSAX2ProcessingInstruction;
hdlr->comment = xmlSAX2Comment; hdlr->comment = xmlSAX2Comment;
hdlr->warning = xmlParserWarning; hdlr->warning = xmlParserWarning;

134
parser.c
View File

@@ -2978,13 +2978,6 @@ static int areBlanks(xmlParserCtxtPtr ctxt, const xmlChar *str, int len,
int i; int i;
xmlNodePtr lastChild; xmlNodePtr lastChild;
/*
* Don't spend time trying to differentiate them, the same callback is
* used !
*/
if (ctxt->sax->ignorableWhitespace == ctxt->sax->characters)
return(0);
/* /*
* Check for xml:space value. * Check for xml:space value.
*/ */
@@ -4865,6 +4858,34 @@ static const unsigned char test_char_data[256] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
}; };
/*
 * xmlCharacters:
 * @ctxt:  an XML parser context
 * @buf:  the character data to report
 * @size:  number of bytes in @buf
 *
 * Hand a chunk of character data to the SAX layer, choosing between the
 * "ignorableWhitespace" and the "characters" callback.
 *
 * Does nothing when the context has no SAX handler or SAX is disabled.
 */
static void
xmlCharacters(xmlParserCtxtPtr ctxt, const xmlChar *buf, int size) {
    int ignorable = 0;

    if (ctxt->sax == NULL)
        return;
    if (ctxt->disableSAX)
        return;

    /*
     * areBlanks is only consulted when blanks are not kept and the two
     * callbacks actually differ.
     *
     * Note that calling areBlanks with only parts of a text node is
     * fundamentally broken, making the NOBLANKS option essentially
     * unusable.
     */
    if (!ctxt->keepBlanks) {
        if (ctxt->sax->ignorableWhitespace != ctxt->sax->characters)
            ignorable = (areBlanks(ctxt, buf, size, 1) != 0);
    }

    if (ignorable) {
        if (ctxt->sax->ignorableWhitespace != NULL)
            ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size);
        return;
    }

    if (ctxt->sax->characters != NULL)
        ctxt->sax->characters(ctxt->userData, buf, size);

    /*
     * The old code used to update this value for "complex" data even
     * if keepBlanks was true. This was probably a bug.
     */
    if (ctxt->keepBlanks)
        return;
    if (*ctxt->space == -1)
        *ctxt->space = -2;
}
/** /**
* xmlParseCharDataInternal: * xmlParseCharDataInternal:
* @ctxt: an XML parser context * @ctxt: an XML parser context
@@ -4910,32 +4931,7 @@ get_more_space:
const xmlChar *tmp = ctxt->input->cur; const xmlChar *tmp = ctxt->input->cur;
ctxt->input->cur = in; ctxt->input->cur = in;
if ((ctxt->sax != NULL) && xmlCharacters(ctxt, tmp, nbchar);
(ctxt->disableSAX == 0) &&
(ctxt->sax->ignorableWhitespace !=
ctxt->sax->characters)) {
/*
* Calling areBlanks with only parts of a text node
* is fundamentally broken, making the NOBLANKS option
* essentially unusable.
*/
if (areBlanks(ctxt, tmp, nbchar, 1)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
tmp, nbchar);
} else {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
tmp, nbchar);
if (*ctxt->space == -1)
*ctxt->space = -2;
}
} else if ((ctxt->sax != NULL) &&
(ctxt->disableSAX == 0) &&
(ctxt->sax->characters != NULL)) {
ctxt->sax->characters(ctxt->userData,
tmp, nbchar);
}
} }
return; return;
} }
@@ -4968,35 +4964,13 @@ get_more:
} }
nbchar = in - ctxt->input->cur; nbchar = in - ctxt->input->cur;
if (nbchar > 0) { if (nbchar > 0) {
if ((ctxt->sax != NULL) && const xmlChar *tmp = ctxt->input->cur;
(ctxt->disableSAX == 0) && ctxt->input->cur = in;
(ctxt->sax->ignorableWhitespace !=
ctxt->sax->characters) &&
(IS_BLANK_CH(*ctxt->input->cur))) {
const xmlChar *tmp = ctxt->input->cur;
ctxt->input->cur = in;
if (areBlanks(ctxt, tmp, nbchar, 0)) { xmlCharacters(ctxt, tmp, nbchar);
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData, line = ctxt->input->line;
tmp, nbchar); col = ctxt->input->col;
} else {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
tmp, nbchar);
if (*ctxt->space == -1)
*ctxt->space = -2;
}
line = ctxt->input->line;
col = ctxt->input->col;
} else if ((ctxt->sax != NULL) &&
(ctxt->disableSAX == 0)) {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData,
ctxt->input->cur, nbchar);
line = ctxt->input->line;
col = ctxt->input->col;
}
} }
ctxt->input->cur = in; ctxt->input->cur = in;
if (*in == 0xD) { if (*in == 0xD) {
@@ -5060,23 +5034,7 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
if (nbchar >= XML_PARSER_BIG_BUFFER_SIZE) { if (nbchar >= XML_PARSER_BIG_BUFFER_SIZE) {
buf[nbchar] = 0; buf[nbchar] = 0;
/* xmlCharacters(ctxt, buf, nbchar);
* OK the segment is to be consumed as chars.
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar, 0)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData,
buf, nbchar);
} else {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, nbchar);
if ((ctxt->sax->characters !=
ctxt->sax->ignorableWhitespace) &&
(*ctxt->space == -1))
*ctxt->space = -2;
}
}
nbchar = 0; nbchar = 0;
SHRINK; SHRINK;
} }
@@ -5084,21 +5042,8 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) {
} }
if (nbchar != 0) { if (nbchar != 0) {
buf[nbchar] = 0; buf[nbchar] = 0;
/*
* OK the segment is to be consumed as chars. xmlCharacters(ctxt, buf, nbchar);
*/
if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
if (areBlanks(ctxt, buf, nbchar, 0)) {
if (ctxt->sax->ignorableWhitespace != NULL)
ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
} else {
if (ctxt->sax->characters != NULL)
ctxt->sax->characters(ctxt->userData, buf, nbchar);
if ((ctxt->sax->characters != ctxt->sax->ignorableWhitespace) &&
(*ctxt->space == -1))
*ctxt->space = -2;
}
}
} }
/* /*
* cur == 0 can mean * cur == 0 can mean
@@ -13633,9 +13578,6 @@ xmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
/* /*
* Changing SAX callbacks is a bad idea. This should be fixed. * Changing SAX callbacks is a bad idea. This should be fixed.
*/ */
if (options & XML_PARSE_NOBLANKS) {
ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
}
if (options & XML_PARSE_NOCDATA) { if (options & XML_PARSE_NOCDATA) {
ctxt->sax->cdataBlock = NULL; ctxt->sax->cdataBlock = NULL;
} }