From fc011b7fb895ed3b43f6c2e5b7a6fbfc078da349 Mon Sep 17 00:00:00 2001 From: Daniel Veillard Date: Sun, 12 Feb 2006 19:14:15 +0000 Subject: [PATCH] bug fixes for #327167 as well as some cleanups and more thorough tests on * xmlregexp.c: bug fixes for #327167 as well as some cleanups and more thorough tests on atoms comparisons. Daniel --- ChangeLog | 5 + xmlregexp.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 276 insertions(+), 20 deletions(-) diff --git a/ChangeLog b/ChangeLog index 2e2d6440..b1f9e2c0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +Sun Feb 12 20:12:22 CET 2006 Daniel Veillard + + * xmlregexp.c: bug fixes for #327167 as well as some cleanups + and more thorough tests on atoms comparisons. + Thu Feb 9 10:07:20 CET 2006 Daniel Veillard * include/wsockcompat.h: patch from Eric Zurcher to compile with diff --git a/xmlregexp.c b/xmlregexp.c index de581f0b..21091f39 100644 --- a/xmlregexp.c +++ b/xmlregexp.c @@ -71,6 +71,9 @@ * * ************************************************************************/ +/* + * Note: the order of the enums below is significant, do not shuffle + */ typedef enum { XML_REGEXP_EPSILON = 1, XML_REGEXP_CHARVAL, @@ -2054,34 +2057,281 @@ xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) { } /** - * xmlFACompareAtoms: + * xmlFACompareAtomTypes: + * @type1: an atom type + * @type2: an atom type + * + * Compares two atoms type to check whether they intersect in some ways, + * this is used by xmlFACompareAtoms only + * + * Returns 1 if they may intersect and 0 otherwise + */ +static int +xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) { + if ((type1 == XML_REGEXP_EPSILON) || + (type1 == XML_REGEXP_CHARVAL) || + (type1 == XML_REGEXP_RANGES) || + (type1 == XML_REGEXP_SUBREG) || + (type1 == XML_REGEXP_STRING) || + (type1 == XML_REGEXP_ANYCHAR)) + return(1); + if ((type2 == XML_REGEXP_EPSILON) || + (type2 == XML_REGEXP_CHARVAL) || + (type2 == XML_REGEXP_RANGES) || + (type2 == XML_REGEXP_SUBREG) || + (type2 == XML_REGEXP_STRING) || + (type2 == XML_REGEXP_ANYCHAR)) + return(1); + + if (type1 == type2) return(1); + + /* simplify subsequent compares by making sure type1 < type2 */ + if (type1 > type2) { + xmlRegAtomType tmp = type1; + type1 = type2; + type2 = tmp; + } + switch (type1) { + case XML_REGEXP_ANYSPACE: /* \s */ + /* can't be a letter, number, mark, pontuation, symbol */ + if ((type2 == XML_REGEXP_NOTSPACE) || + ((type2 >= XML_REGEXP_LETTER) && + (type2 <= XML_REGEXP_LETTER_OTHERS)) || + ((type2 >= XML_REGEXP_NUMBER) && + (type2 <= XML_REGEXP_NUMBER_OTHERS)) || + ((type2 >= XML_REGEXP_MARK) && + (type2 <= XML_REGEXP_MARK_ENCLOSING)) || + ((type2 >= XML_REGEXP_PUNCT) && + (type2 <= XML_REGEXP_PUNCT_OTHERS)) || + ((type2 >= XML_REGEXP_SYMBOL) && + (type2 <= XML_REGEXP_SYMBOL_OTHERS)) + ) return(0); + break; + case XML_REGEXP_NOTSPACE: /* \S */ + break; + case XML_REGEXP_INITNAME: /* \l */ + /* can't be a number, mark, separator, pontuation, symbol or other */ + if ((type2 == XML_REGEXP_NOTINITNAME) || + ((type2 >= XML_REGEXP_NUMBER) && + (type2 <= XML_REGEXP_NUMBER_OTHERS)) || + ((type2 >= XML_REGEXP_MARK) && + (type2 <= XML_REGEXP_MARK_ENCLOSING)) || + ((type2 >= XML_REGEXP_SEPAR) && + (type2 <= XML_REGEXP_SEPAR_PARA)) || + ((type2 >= XML_REGEXP_PUNCT) && + (type2 <= XML_REGEXP_PUNCT_OTHERS)) || + ((type2 >= XML_REGEXP_SYMBOL) && + (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || + ((type2 >= XML_REGEXP_OTHER) && + (type2 <= XML_REGEXP_OTHER_NA)) + ) return(0); + break; + case XML_REGEXP_NOTINITNAME: /* \L */ + break; + case XML_REGEXP_NAMECHAR: /* \c */ + /* can't be a mark, separator, pontuation, symbol or other */ + if ((type2 == XML_REGEXP_NOTNAMECHAR) || + ((type2 >= XML_REGEXP_MARK) && + (type2 <= XML_REGEXP_MARK_ENCLOSING)) || + ((type2 >= XML_REGEXP_PUNCT) && + (type2 <= XML_REGEXP_PUNCT_OTHERS)) || + ((type2 >= XML_REGEXP_SEPAR) && + (type2 <= XML_REGEXP_SEPAR_PARA)) || + ((type2 >= XML_REGEXP_SYMBOL) && + (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || + ((type2 >= XML_REGEXP_OTHER) && + (type2 <= XML_REGEXP_OTHER_NA)) + ) return(0); + break; + case XML_REGEXP_NOTNAMECHAR: /* \C */ + break; + case XML_REGEXP_DECIMAL: /* \d */ + /* can't be a letter, mark, separator, pontuation, symbol or other */ + if ((type2 == XML_REGEXP_NOTDECIMAL) || + (type2 == XML_REGEXP_REALCHAR) || + ((type2 >= XML_REGEXP_LETTER) && + (type2 <= XML_REGEXP_LETTER_OTHERS)) || + ((type2 >= XML_REGEXP_MARK) && + (type2 <= XML_REGEXP_MARK_ENCLOSING)) || + ((type2 >= XML_REGEXP_PUNCT) && + (type2 <= XML_REGEXP_PUNCT_OTHERS)) || + ((type2 >= XML_REGEXP_SEPAR) && + (type2 <= XML_REGEXP_SEPAR_PARA)) || + ((type2 >= XML_REGEXP_SYMBOL) && + (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || + ((type2 >= XML_REGEXP_OTHER) && + (type2 <= XML_REGEXP_OTHER_NA)) + )return(0); + break; + case XML_REGEXP_NOTDECIMAL: /* \D */ + break; + case XML_REGEXP_REALCHAR: /* \w */ + /* can't be a mark, separator, pontuation, symbol or other */ + if ((type2 == XML_REGEXP_NOTDECIMAL) || + ((type2 >= XML_REGEXP_MARK) && + (type2 <= XML_REGEXP_MARK_ENCLOSING)) || + ((type2 >= XML_REGEXP_PUNCT) && + (type2 <= XML_REGEXP_PUNCT_OTHERS)) || + ((type2 >= XML_REGEXP_SEPAR) && + (type2 <= XML_REGEXP_SEPAR_PARA)) || + ((type2 >= XML_REGEXP_SYMBOL) && + (type2 <= XML_REGEXP_SYMBOL_OTHERS)) || + ((type2 >= XML_REGEXP_OTHER) && + (type2 <= XML_REGEXP_OTHER_NA)) + )return(0); + break; + case XML_REGEXP_NOTREALCHAR: /* \W */ + break; + /* + * at that point we know both type 1 and type2 are from + * character categories are ordered and are different, + * it becomes simple because this is a partition + */ + case XML_REGEXP_LETTER: + if (type2 <= XML_REGEXP_LETTER_OTHERS) + return(1); + return(0); + case XML_REGEXP_LETTER_UPPERCASE: + case XML_REGEXP_LETTER_LOWERCASE: + case XML_REGEXP_LETTER_TITLECASE: + case XML_REGEXP_LETTER_MODIFIER: + case XML_REGEXP_LETTER_OTHERS: + return(0); + case XML_REGEXP_MARK: + if (type2 <= XML_REGEXP_MARK_ENCLOSING) + return(1); + return(0); + case XML_REGEXP_MARK_NONSPACING: + case XML_REGEXP_MARK_SPACECOMBINING: + case XML_REGEXP_MARK_ENCLOSING: + return(0); + case XML_REGEXP_NUMBER: + if (type2 <= XML_REGEXP_NUMBER_OTHERS) + return(1); + return(0); + case XML_REGEXP_NUMBER_DECIMAL: + case XML_REGEXP_NUMBER_LETTER: + case XML_REGEXP_NUMBER_OTHERS: + return(0); + case XML_REGEXP_PUNCT: + if (type2 <= XML_REGEXP_PUNCT_OTHERS) + return(1); + return(0); + case XML_REGEXP_PUNCT_CONNECTOR: + case XML_REGEXP_PUNCT_DASH: + case XML_REGEXP_PUNCT_OPEN: + case XML_REGEXP_PUNCT_CLOSE: + case XML_REGEXP_PUNCT_INITQUOTE: + case XML_REGEXP_PUNCT_FINQUOTE: + case XML_REGEXP_PUNCT_OTHERS: + return(0); + case XML_REGEXP_SEPAR: + if (type2 <= XML_REGEXP_SEPAR_PARA) + return(1); + return(0); + case XML_REGEXP_SEPAR_SPACE: + case XML_REGEXP_SEPAR_LINE: + case XML_REGEXP_SEPAR_PARA: + return(0); + case XML_REGEXP_SYMBOL: + if (type2 <= XML_REGEXP_SYMBOL_OTHERS) + return(1); + return(0); + case XML_REGEXP_SYMBOL_MATH: + case XML_REGEXP_SYMBOL_CURRENCY: + case XML_REGEXP_SYMBOL_MODIFIER: + case XML_REGEXP_SYMBOL_OTHERS: + return(0); + case XML_REGEXP_OTHER: + if (type2 <= XML_REGEXP_OTHER_NA) + return(1); + return(0); + case XML_REGEXP_OTHER_CONTROL: + case XML_REGEXP_OTHER_FORMAT: + case XML_REGEXP_OTHER_PRIVATE: + case XML_REGEXP_OTHER_NA: + return(0); + default: + break; + } + return(1); +} + +/** + * xmlFAEqualAtoms: * @atom1: an atom * @atom2: an atom * - * Compares two atoms to check whether they intersect in some ways, - * this is used by xmlFAComputesDeterminism only + * Compares two atoms to check whether they are the same exactly + * this is used to remove equivalent transitions * - * Returns 1 if yes and 0 otherwise + * Returns 1 if same and 0 otherwise */ static int -xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { - int ret; +xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { + int ret = 0; if (atom1 == atom2) return(1); if ((atom1 == NULL) || (atom2 == NULL)) return(0); - if ((atom1->type == XML_REGEXP_RANGES) && - (atom2->type == XML_REGEXP_CHARVAL)) { - } else if ((atom1->type == XML_REGEXP_CHARVAL) && - (atom2->type == XML_REGEXP_RANGES)) { + if (atom1->type != atom2->type) + return(0); + switch (atom1->type) { + case XML_REGEXP_EPSILON: + ret = 0; + break; + case XML_REGEXP_STRING: + ret = xmlStrEqual((xmlChar *)atom1->valuep, + (xmlChar *)atom2->valuep); + break; + case XML_REGEXP_CHARVAL: + ret = (atom1->codepoint == atom2->codepoint); + break; + case XML_REGEXP_RANGES: + /* too hard to do in the general case */ + ret = 0; + default: + break; + } + return(ret); +} + +/** + * xmlFACompareAtoms: + * @atom1: an atom + * @atom2: an atom + * + * Compares two atoms to check whether they intersect in some ways, + * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only + * + * Returns 1 if yes and 0 otherwise + */ +static int +xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { + int ret = 1; + + if (atom1 == atom2) + return(1); + if ((atom1 == NULL) || (atom2 == NULL)) + return(0); + + if ((atom1->type == XML_REGEXP_ANYCHAR) || + (atom2->type == XML_REGEXP_ANYCHAR)) + return(1); + + if (atom1->type > atom2->type) { xmlRegAtomPtr tmp; tmp = atom1; atom1 = atom2; atom2 = tmp; - } else if (atom1->type != atom2->type) { - return(0); + } + if (atom1->type != atom2->type) { + ret = xmlFACompareAtomTypes(atom1->type, atom2->type); + /* if they can't intersect at the type level break now */ + if (ret == 0) + return(0); } switch (atom1->type) { case XML_REGEXP_STRING: @@ -2091,15 +2341,16 @@ xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2) { case XML_REGEXP_EPSILON: goto not_determinist; case XML_REGEXP_CHARVAL: - ret = (atom1->codepoint == atom2->codepoint); + if (atom2->type == XML_REGEXP_CHARVAL) { + ret = (atom1->codepoint == atom2->codepoint); + } else { + ret = xmlRegCheckCharacter(atom2, atom1->codepoint); + if (ret < 0) + ret = 1; + } break; case XML_REGEXP_RANGES: - if (atom2->type == XML_REGEXP_CHARVAL) { - ret = xmlRegCheckCharacter(atom1, atom2->codepoint); - if (ret < 0) - return(-1); - break; - } else { + if (atom2->type == XML_REGEXP_RANGES) { int i, j, res; xmlRegRangePtr r1, r2; @@ -2233,7 +2484,7 @@ xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) { continue; if (t2->atom != NULL) { if (t1->to == t2->to) { - if (xmlFACompareAtoms(t1->atom, t2->atom)) + if (xmlFAEqualAtoms(t1->atom, t2->atom)) t2->to = -1; /* eliminated */ } }