From 1f5b5371cfbdd6eb0b0c10a59608e9bb2cb64b90 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Fri, 31 Jan 2025 16:21:20 +0100 Subject: [PATCH] parser: Improve handling of NOBLANKS option Don't change the SAX handler. Use a helper function to invoke "characters" SAX callback. The old code didn't advance the input pointer consistently before invoking the callback. There was also some inconsistency wrt to ctxt->space handling. I don't understand the ctxt->space thing, but now we always behave like the non-complex case before. --- SAX2.c | 2 +- parser.c | 134 ++++++++++++++++--------------------------------------- 2 files changed, 39 insertions(+), 97 deletions(-) diff --git a/SAX2.c b/SAX2.c index 021c8e904..702041f97 100644 --- a/SAX2.c +++ b/SAX2.c @@ -2733,7 +2733,7 @@ xmlSAXVersion(xmlSAXHandler *hdlr, int version) hdlr->reference = xmlSAX2Reference; hdlr->characters = xmlSAX2Characters; hdlr->cdataBlock = xmlSAX2CDataBlock; - hdlr->ignorableWhitespace = xmlSAX2Characters; + hdlr->ignorableWhitespace = xmlSAX2IgnorableWhitespace; hdlr->processingInstruction = xmlSAX2ProcessingInstruction; hdlr->comment = xmlSAX2Comment; hdlr->warning = xmlParserWarning; diff --git a/parser.c b/parser.c index 3e034d47f..08d914ee9 100644 --- a/parser.c +++ b/parser.c @@ -2978,13 +2978,6 @@ static int areBlanks(xmlParserCtxtPtr ctxt, const xmlChar *str, int len, int i; xmlNodePtr lastChild; - /* - * Don't spend time trying to differentiate them, the same callback is - * used ! - */ - if (ctxt->sax->ignorableWhitespace == ctxt->sax->characters) - return(0); - /* * Check for xml:space value. */ @@ -4865,6 +4858,34 @@ static const unsigned char test_char_data[256] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; +static void +xmlCharacters(xmlParserCtxtPtr ctxt, const xmlChar *buf, int size) { + if ((ctxt->sax == NULL) || (ctxt->disableSAX)) + return; + + /* + * Calling areBlanks with only parts of a text node + * is fundamentally broken, making the NOBLANKS option + * essentially unusable. + */ + if ((!ctxt->keepBlanks) && + (ctxt->sax->ignorableWhitespace != ctxt->sax->characters) && + (areBlanks(ctxt, buf, size, 1))) { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size); + } else { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, size); + + /* + * The old code used to update this value for "complex" data + * even if keepBlanks was true. This was probably a bug. + */ + if ((!ctxt->keepBlanks) && (*ctxt->space == -1)) + *ctxt->space = -2; + } +} + /** * xmlParseCharDataInternal: * @ctxt: an XML parser context @@ -4910,32 +4931,7 @@ get_more_space: const xmlChar *tmp = ctxt->input->cur; ctxt->input->cur = in; - if ((ctxt->sax != NULL) && - (ctxt->disableSAX == 0) && - (ctxt->sax->ignorableWhitespace != - ctxt->sax->characters)) { - /* - * Calling areBlanks with only parts of a text node - * is fundamentally broken, making the NOBLANKS option - * essentially unusable. - */ - if (areBlanks(ctxt, tmp, nbchar, 1)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, - tmp, nbchar); - } else { - if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, - tmp, nbchar); - if (*ctxt->space == -1) - *ctxt->space = -2; - } - } else if ((ctxt->sax != NULL) && - (ctxt->disableSAX == 0) && - (ctxt->sax->characters != NULL)) { - ctxt->sax->characters(ctxt->userData, - tmp, nbchar); - } + xmlCharacters(ctxt, tmp, nbchar); } return; } @@ -4968,35 +4964,13 @@ get_more: } nbchar = in - ctxt->input->cur; if (nbchar > 0) { - if ((ctxt->sax != NULL) && - (ctxt->disableSAX == 0) && - (ctxt->sax->ignorableWhitespace != - ctxt->sax->characters) && - (IS_BLANK_CH(*ctxt->input->cur))) { - const xmlChar *tmp = ctxt->input->cur; - ctxt->input->cur = in; + const xmlChar *tmp = ctxt->input->cur; + ctxt->input->cur = in; - if (areBlanks(ctxt, tmp, nbchar, 0)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, - tmp, nbchar); - } else { - if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, - tmp, nbchar); - if (*ctxt->space == -1) - *ctxt->space = -2; - } - line = ctxt->input->line; - col = ctxt->input->col; - } else if ((ctxt->sax != NULL) && - (ctxt->disableSAX == 0)) { - if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, - ctxt->input->cur, nbchar); - line = ctxt->input->line; - col = ctxt->input->col; - } + xmlCharacters(ctxt, tmp, nbchar); + + line = ctxt->input->line; + col = ctxt->input->col; } ctxt->input->cur = in; if (*in == 0xD) { @@ -5060,23 +5034,7 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) { if (nbchar >= XML_PARSER_BIG_BUFFER_SIZE) { buf[nbchar] = 0; - /* - * OK the segment is to be consumed as chars. - */ - if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { - if (areBlanks(ctxt, buf, nbchar, 0)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, - buf, nbchar); - } else { - if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, buf, nbchar); - if ((ctxt->sax->characters != - ctxt->sax->ignorableWhitespace) && - (*ctxt->space == -1)) - *ctxt->space = -2; - } - } + xmlCharacters(ctxt, buf, nbchar); nbchar = 0; SHRINK; } @@ -5084,21 +5042,8 @@ xmlParseCharDataComplex(xmlParserCtxtPtr ctxt, int partial) { } if (nbchar != 0) { buf[nbchar] = 0; - /* - * OK the segment is to be consumed as chars. - */ - if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { - if (areBlanks(ctxt, buf, nbchar, 0)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); - } else { - if (ctxt->sax->characters != NULL) - ctxt->sax->characters(ctxt->userData, buf, nbchar); - if ((ctxt->sax->characters != ctxt->sax->ignorableWhitespace) && - (*ctxt->space == -1)) - *ctxt->space = -2; - } - } + + xmlCharacters(ctxt, buf, nbchar); } /* * cur == 0 can mean @@ -13633,9 +13578,6 @@ xmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask) /* * Changing SAX callbacks is a bad idea. This should be fixed. */ - if (options & XML_PARSE_NOBLANKS) { - ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace; - } if (options & XML_PARSE_NOCDATA) { ctxt->sax->cdataBlock = NULL; }