mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-25 13:17:41 +03:00 
			
		
		
		
	Re-implement extraction of fixed prefixes from regular expressions.
To generate btree-indexable conditions from regex WHERE conditions (such as WHERE indexed_col ~ '^foo'), we need to be able to identify any fixed prefix that a regex might have; that is, find any string that must be a prefix of all strings satisfying the regex. We used to do that with entirely ad-hoc code that looked at the source text of the regex. It didn't know very much about regex syntax, which mostly meant that it would fail to identify some optimizable cases; but Viktor Rosenfeld reported that it would produce actively wrong answers for quantified parenthesized subexpressions, such as '^(foo)?bar'. Rather than trying to extend the ad-hoc code to cover this, let's get rid of it altogether in favor of identifying prefixes by examining the compiled form of a regex. To do this, I've added a new entry point "pg_regprefix" to the regex library; hopefully it is defined in a sufficiently general fashion that it can remain in the library when/if that code gets split out as a standalone project. Since this bug has been there for a very long time, this fix needs to get back-patched. However it depends on some other recent commits (particularly the addition of wchar-to-database-encoding conversion), so I'll commit this separately and then go to work on back-porting the necessary fixes.
This commit is contained in:
		| @@ -12,7 +12,7 @@ subdir = src/backend/regex | ||||
| top_builddir = ../../.. | ||||
| include $(top_builddir)/src/Makefile.global | ||||
|  | ||||
| OBJS = regcomp.o regerror.o regexec.o regfree.o | ||||
| OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o | ||||
|  | ||||
| include $(top_srcdir)/src/backend/common.mk | ||||
|  | ||||
|   | ||||
| @@ -7,12 +7,13 @@ So this file is an attempt to reverse-engineer some docs. | ||||
| General source-file layout | ||||
| -------------------------- | ||||
|  | ||||
| There are four separately-compilable source files, each exposing exactly | ||||
| There are five separately-compilable source files, each exposing exactly | ||||
| one exported function: | ||||
| 	regcomp.c: pg_regcomp | ||||
| 	regexec.c: pg_regexec | ||||
| 	regerror.c: pg_regerror | ||||
| 	regfree.c: pg_regfree | ||||
| 	regprefix.c: pg_regprefix | ||||
| (The pg_ prefixes were added by the Postgres project to distinguish this | ||||
| library version from any similar one that might be present on a particular | ||||
| system.  They'd need to be removed or replaced in any standalone version | ||||
| @@ -44,6 +45,7 @@ regexec.c		Top-level regex execution code | ||||
| rege_dfa.c		DFA creation and execution | ||||
| regerror.c		pg_regerror: generate text for a regex error code | ||||
| regfree.c		pg_regfree: API to free a no-longer-needed regex_t | ||||
| regprefix.c		Code for extracting a common prefix from a regex_t | ||||
|  | ||||
| The locale-specific code is concerned primarily with case-folding and with | ||||
| expanding locale-specific character classes, such as [[:alnum:]].  It | ||||
|   | ||||
| @@ -66,8 +66,9 @@ initcm(struct vars * v, | ||||
| 	cd = cm->cd;				/* cm->cd[WHITE] */ | ||||
| 	cd->sub = NOSUB; | ||||
| 	cd->arcs = NULL; | ||||
| 	cd->flags = 0; | ||||
| 	cd->firstchr = CHR_MIN; | ||||
| 	cd->nchrs = CHR_MAX - CHR_MIN + 1; | ||||
| 	cd->flags = 0; | ||||
|  | ||||
| 	/* upper levels of tree */ | ||||
| 	for (t = &cm->tree[0], j = NBYTS - 1; j > 0; t = nextt, j--) | ||||
| @@ -272,6 +273,7 @@ newcolor(struct colormap * cm) | ||||
| 	cd->nchrs = 0; | ||||
| 	cd->sub = NOSUB; | ||||
| 	cd->arcs = NULL; | ||||
| 	cd->firstchr = CHR_MIN;		/* in case never set otherwise */ | ||||
| 	cd->flags = 0; | ||||
| 	cd->block = NULL; | ||||
|  | ||||
| @@ -371,6 +373,8 @@ subcolor(struct colormap * cm, chr c) | ||||
| 	if (co == sco)				/* already in an open subcolor */ | ||||
| 		return co;				/* rest is redundant */ | ||||
| 	cm->cd[co].nchrs--; | ||||
| 	if (cm->cd[sco].nchrs == 0) | ||||
| 		cm->cd[sco].firstchr = c; | ||||
| 	cm->cd[sco].nchrs++; | ||||
| 	setcolor(cm, c, sco); | ||||
| 	return sco; | ||||
| @@ -438,6 +442,11 @@ subrange(struct vars * v, | ||||
|  | ||||
| /* | ||||
|  * subblock - allocate new subcolors for one tree block of chrs, fill in arcs | ||||
|  * | ||||
|  * Note: subcolors that are created during execution of this function | ||||
|  * will not be given a useful value of firstchr; it'll be left as CHR_MIN. | ||||
|  * For the current usage of firstchr in pg_regprefix, this does not matter | ||||
|  * because such subcolors won't occur in the common prefix of a regex. | ||||
|  */ | ||||
| static void | ||||
| subblock(struct vars * v, | ||||
|   | ||||
							
								
								
									
										259
									
								
								src/backend/regex/regprefix.c
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										259
									
								
								src/backend/regex/regprefix.c
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,259 @@ | ||||
| /*------------------------------------------------------------------------- | ||||
|  * | ||||
|  * regprefix.c | ||||
|  *	  Extract a common prefix, if any, from a compiled regex. | ||||
|  * | ||||
|  * | ||||
|  * Portions Copyright (c) 2012, PostgreSQL Global Development Group | ||||
|  * Portions Copyright (c) 1998, 1999 Henry Spencer | ||||
|  * | ||||
|  * IDENTIFICATION | ||||
|  *	  src/backend/regex/regprefix.c | ||||
|  * | ||||
|  *------------------------------------------------------------------------- | ||||
|  */ | ||||
|  | ||||
| #include "regex/regguts.h" | ||||
|  | ||||
|  | ||||
| /* | ||||
|  * forward declarations | ||||
|  */ | ||||
| static int findprefix(struct cnfa * cnfa, struct colormap * cm, | ||||
| 					  chr *string, size_t *slength); | ||||
|  | ||||
|  | ||||
| /* | ||||
|  * pg_regprefix - get common prefix for regular expression | ||||
|  * | ||||
|  * Returns one of: | ||||
|  *	REG_NOMATCH: there is no common prefix of strings matching the regex | ||||
|  *	REG_PREFIX: there is a common prefix of strings matching the regex | ||||
|  *	REG_EXACT: all strings satisfying the regex must match the same string | ||||
|  *	or a REG_XXX error code | ||||
|  * | ||||
|  * In the non-failure cases, *string is set to a malloc'd string containing | ||||
|  * the common prefix or exact value, of length *slength (measured in chrs | ||||
|  * not bytes!). | ||||
|  * | ||||
|  * This function does not analyze all complex cases (such as lookahead | ||||
|  * constraints) exactly.  Therefore it is possible that some strings matching | ||||
|  * the reported prefix or exact-match string do not satisfy the regex.  But | ||||
|  * it should never be the case that a string satisfying the regex does not | ||||
|  * match the reported prefix or exact-match string. | ||||
|  */ | ||||
| int | ||||
| pg_regprefix(regex_t *re, | ||||
| 			 chr **string, | ||||
| 			 size_t *slength) | ||||
| { | ||||
| 	struct guts *g; | ||||
| 	struct cnfa *cnfa; | ||||
| 	int			st; | ||||
|  | ||||
| 	/* sanity checks */ | ||||
| 	if (string == NULL || slength == NULL) | ||||
| 		return REG_INVARG; | ||||
| 	*string = NULL;				/* initialize for failure cases */ | ||||
| 	*slength = 0; | ||||
| 	if (re == NULL || re->re_magic != REMAGIC) | ||||
| 		return REG_INVARG; | ||||
| 	if (re->re_csize != sizeof(chr)) | ||||
| 		return REG_MIXED; | ||||
|  | ||||
| 	/* Initialize locale-dependent support */ | ||||
| 	pg_set_regex_collation(re->re_collation); | ||||
|  | ||||
| 	/* setup */ | ||||
| 	g = (struct guts *) re->re_guts; | ||||
| 	if (g->info & REG_UIMPOSSIBLE) | ||||
| 		return REG_NOMATCH; | ||||
|  | ||||
| 	/* | ||||
| 	 * This implementation considers only the search NFA for the topmost regex | ||||
| 	 * tree node.  Therefore, constraints such as backrefs are not fully | ||||
| 	 * applied, which is allowed per the function's API spec. | ||||
| 	 */ | ||||
| 	assert(g->tree != NULL); | ||||
| 	cnfa = &g->tree->cnfa; | ||||
|  | ||||
| 	/* | ||||
| 	 * Since a correct NFA should never contain any exit-free loops, it should | ||||
| 	 * not be possible for our traversal to return to a previously visited | ||||
| 	 * NFA state.  Hence we need at most nstates chrs in the output string. | ||||
| 	 */ | ||||
| 	*string = (chr *) MALLOC(cnfa->nstates * sizeof(chr)); | ||||
| 	if (*string == NULL) | ||||
| 		return REG_ESPACE; | ||||
|  | ||||
| 	/* do it */ | ||||
| 	st = findprefix(cnfa, &g->cmap, *string, slength); | ||||
|  | ||||
| 	assert(*slength <= cnfa->nstates); | ||||
|  | ||||
| 	/* clean up */ | ||||
| 	if (st != REG_PREFIX && st != REG_EXACT) | ||||
| 	{ | ||||
| 		FREE(*string); | ||||
| 		*string = NULL; | ||||
| 		*slength = 0; | ||||
| 	} | ||||
|  | ||||
| 	return st; | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * findprefix - extract common prefix from cNFA | ||||
|  * | ||||
|  * Results are returned into the preallocated chr array string[], with | ||||
|  * *slength (which must be preset to zero) incremented for each chr. | ||||
|  */ | ||||
| static int						/* regprefix return code */ | ||||
| findprefix(struct cnfa * cnfa, | ||||
| 		   struct colormap * cm, | ||||
| 		   chr *string, | ||||
| 		   size_t *slength) | ||||
| { | ||||
| 	int			st; | ||||
| 	int			nextst; | ||||
| 	color		thiscolor; | ||||
| 	chr			c; | ||||
| 	struct carc *ca; | ||||
|  | ||||
| 	/* | ||||
| 	 * The "pre" state must have only BOS/BOL outarcs, else pattern isn't | ||||
| 	 * anchored left.  If we have both BOS and BOL, they must go to the | ||||
| 	 * same next state. | ||||
| 	 */ | ||||
| 	st = cnfa->pre; | ||||
| 	nextst = -1; | ||||
| 	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) | ||||
| 	{ | ||||
| 		if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) | ||||
| 		{ | ||||
| 			if (nextst == -1) | ||||
| 				nextst = ca->to; | ||||
| 			else if (nextst != ca->to) | ||||
| 				return REG_NOMATCH; | ||||
| 		} | ||||
| 		else | ||||
| 			return REG_NOMATCH; | ||||
| 	} | ||||
| 	if (nextst == -1) | ||||
| 		return REG_NOMATCH; | ||||
|  | ||||
| 	/* | ||||
| 	 * Scan through successive states, stopping as soon as we find one with | ||||
| 	 * more than one acceptable transition character (either multiple colors | ||||
| 	 * on out-arcs, or a color with more than one member chr). | ||||
| 	 * | ||||
| 	 * We could find a state with multiple out-arcs that are all labeled with | ||||
| 	 * the same singleton color; this comes from patterns like "^ab(cde|cxy)". | ||||
| 	 * In that case we add the chr "c" to the output string but then exit the | ||||
| 	 * loop with nextst == -1.  This leaves a little bit on the table: if the | ||||
| 	 * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added | ||||
| 	 * to the prefix.  But chasing multiple parallel state chains doesn't seem | ||||
| 	 * worth the trouble. | ||||
| 	 */ | ||||
| 	do | ||||
| 	{ | ||||
| 		st = nextst; | ||||
| 		nextst = -1; | ||||
| 		thiscolor = COLORLESS; | ||||
| 		for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) | ||||
| 		{ | ||||
| 			/* We ignore lookahead constraints */ | ||||
| 			if (ca->co >= cnfa->ncolors) | ||||
| 				continue; | ||||
| 			/* We can also ignore BOS/BOL arcs */ | ||||
| 			if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1]) | ||||
| 				continue; | ||||
| 			/* ... but EOS/EOL arcs terminate the search */ | ||||
| 			if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) | ||||
| 			{ | ||||
| 				thiscolor = COLORLESS; | ||||
| 				break; | ||||
| 			} | ||||
| 			if (thiscolor == COLORLESS) | ||||
| 			{ | ||||
| 				/* First plain outarc */ | ||||
| 				thiscolor = ca->co; | ||||
| 				nextst = ca->to; | ||||
| 			} | ||||
| 			else if (thiscolor == ca->co) | ||||
| 			{ | ||||
| 				/* Another plain outarc for same color */ | ||||
| 				nextst = -1; | ||||
| 			} | ||||
| 			else | ||||
| 			{ | ||||
| 				/* More than one plain outarc color terminates the search */ | ||||
| 				thiscolor = COLORLESS; | ||||
| 				break; | ||||
| 			} | ||||
| 		} | ||||
| 		/* Done if we didn't find exactly one color on plain outarcs */ | ||||
| 		if (thiscolor == COLORLESS) | ||||
| 			break; | ||||
| 		/* The color must be a singleton */ | ||||
| 		if (cm->cd[thiscolor].nchrs != 1) | ||||
| 			break; | ||||
|  | ||||
| 		/* | ||||
| 		 * Identify the color's sole member chr and add it to the prefix | ||||
| 		 * string.  In general the colormap data structure doesn't provide a | ||||
| 		 * way to find color member chrs, except by trying GETCOLOR() on each | ||||
| 		 * possible chr value, which won't do at all.  However, for the cases | ||||
| 		 * we care about it should be sufficient to test the "firstchr" value, | ||||
| 		 * that is the first chr ever added to the color.  There are cases | ||||
| 		 * where this might no longer be a member of the color (so we do need | ||||
| 		 * to test), but none of them are likely to arise for a character that | ||||
| 		 * is a member of a common prefix.  If we do hit such a corner case, | ||||
| 		 * we just fall out without adding anything to the prefix string. | ||||
| 		 */ | ||||
| 		c = cm->cd[thiscolor].firstchr; | ||||
| 		if (GETCOLOR(cm, c) != thiscolor) | ||||
| 			break; | ||||
|  | ||||
| 		string[(*slength)++] = c; | ||||
|  | ||||
| 		/* Advance to next state, but only if we have a unique next state */ | ||||
| 	} while (nextst != -1); | ||||
|  | ||||
| 	/* | ||||
| 	 * If we ended at a state that only has EOS/EOL outarcs leading to the | ||||
| 	 * "post" state, then we have an exact-match string.  Note this is true | ||||
| 	 * even if the string is of zero length. | ||||
| 	 */ | ||||
| 	nextst = -1; | ||||
| 	for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++) | ||||
| 	{ | ||||
| 		if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1]) | ||||
| 		{ | ||||
| 			if (nextst == -1) | ||||
| 				nextst = ca->to; | ||||
| 			else if (nextst != ca->to) | ||||
| 			{ | ||||
| 				nextst = -1; | ||||
| 				break; | ||||
| 			} | ||||
| 		} | ||||
| 		else | ||||
| 		{ | ||||
| 			nextst = -1; | ||||
| 			break; | ||||
| 		} | ||||
| 	} | ||||
| 	if (nextst == cnfa->post) | ||||
| 		return REG_EXACT; | ||||
|  | ||||
| 	/* | ||||
| 	 * Otherwise, if we were unable to identify any prefix characters, say | ||||
| 	 * NOMATCH --- the pattern is anchored left, but doesn't specify any | ||||
| 	 * particular first character. | ||||
| 	 */ | ||||
| 	if (*slength > 0) | ||||
| 		return REG_PREFIX; | ||||
|  | ||||
| 	return REG_NOMATCH; | ||||
| } | ||||
| @@ -1170,3 +1170,68 @@ build_regexp_split_result(regexp_matches_ctx *splitctx) | ||||
| 								   Int32GetDatum(startpos + 1)); | ||||
| 	} | ||||
| } | ||||
|  | ||||
| /* | ||||
|  * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp | ||||
|  * | ||||
|  * The result is NULL if there is no fixed prefix, else a palloc'd string. | ||||
|  * If it is an exact match, not just a prefix, *exact is returned as TRUE. | ||||
|  */ | ||||
| char * | ||||
| regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation, | ||||
| 					bool *exact) | ||||
| { | ||||
| 	char	   *result; | ||||
| 	regex_t    *re; | ||||
| 	int			cflags; | ||||
| 	int			re_result; | ||||
| 	pg_wchar   *str; | ||||
| 	size_t		slen; | ||||
| 	size_t		maxlen; | ||||
| 	char		errMsg[100]; | ||||
|  | ||||
| 	*exact = false;				/* default result */ | ||||
|  | ||||
| 	/* Compile RE */ | ||||
| 	cflags = REG_ADVANCED; | ||||
| 	if (case_insensitive) | ||||
| 		cflags |= REG_ICASE; | ||||
|  | ||||
| 	re = RE_compile_and_cache(text_re, cflags, collation); | ||||
|  | ||||
| 	/* Examine it to see if there's a fixed prefix */ | ||||
| 	re_result = pg_regprefix(re, &str, &slen); | ||||
|  | ||||
| 	switch (re_result) | ||||
| 	{ | ||||
| 		case REG_NOMATCH: | ||||
| 			return NULL; | ||||
|  | ||||
| 		case REG_PREFIX: | ||||
| 			/* continue with wchar conversion */ | ||||
| 			break; | ||||
|  | ||||
| 		case REG_EXACT: | ||||
| 			*exact = true; | ||||
| 			/* continue with wchar conversion */ | ||||
| 			break; | ||||
|  | ||||
| 		default: | ||||
| 			/* re failed??? */ | ||||
| 			pg_regerror(re_result, re, errMsg, sizeof(errMsg)); | ||||
| 			ereport(ERROR, | ||||
| 					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION), | ||||
| 					 errmsg("regular expression failed: %s", errMsg))); | ||||
| 			break; | ||||
| 	} | ||||
|  | ||||
| 	/* Convert pg_wchar result back to database encoding */ | ||||
| 	maxlen = pg_database_encoding_max_length() * slen + 1; | ||||
| 	result = (char *) palloc(maxlen); | ||||
| 	slen = pg_wchar2mb_with_len(str, result, slen); | ||||
| 	Assert(slen < maxlen); | ||||
|  | ||||
| 	free(str); | ||||
|  | ||||
| 	return result; | ||||
| } | ||||
|   | ||||
| @@ -195,7 +195,8 @@ static Selectivity prefix_selectivity(PlannerInfo *root, | ||||
| static Selectivity like_selectivity(const char *patt, int pattlen, | ||||
| 									bool case_insensitive); | ||||
| static Selectivity regex_selectivity(const char *patt, int pattlen, | ||||
| 									 bool case_insensitive); | ||||
| 									 bool case_insensitive, | ||||
| 									 int fixed_prefix_len); | ||||
| static Datum string_to_datum(const char *str, Oid datatype); | ||||
| static Const *string_to_const(const char *str, Oid datatype); | ||||
| static Const *string_to_bytea_const(const char *str, size_t str_len); | ||||
| @@ -5255,18 +5256,9 @@ static Pattern_Prefix_Status | ||||
| regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, | ||||
| 				   Const **prefix_const, Selectivity *rest_selec) | ||||
| { | ||||
| 	char	   *match; | ||||
| 	int			pos, | ||||
| 				match_pos, | ||||
| 				prev_pos, | ||||
| 				prev_match_pos; | ||||
| 	bool		have_leading_paren; | ||||
| 	char	   *patt; | ||||
| 	char	   *rest; | ||||
| 	Oid			typeid = patt_const->consttype; | ||||
| 	bool		is_multibyte = (pg_database_encoding_max_length() > 1); | ||||
| 	pg_locale_t locale = 0; | ||||
| 	bool		locale_is_c = false; | ||||
| 	char	   *prefix; | ||||
| 	bool		exact; | ||||
|  | ||||
| 	/* | ||||
| 	 * Should be unnecessary, there are no bytea regex operators defined. As | ||||
| @@ -5278,185 +5270,54 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation, | ||||
| 				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), | ||||
| 		 errmsg("regular-expression matching not supported on type bytea"))); | ||||
|  | ||||
| 	if (case_insensitive) | ||||
| 	{ | ||||
| 		/* If case-insensitive, we need locale info */ | ||||
| 		if (lc_ctype_is_c(collation)) | ||||
| 			locale_is_c = true; | ||||
| 		else if (collation != DEFAULT_COLLATION_OID) | ||||
| 		{ | ||||
| 			if (!OidIsValid(collation)) | ||||
| 			{ | ||||
| 				/* | ||||
| 				 * This typically means that the parser could not resolve a | ||||
| 				 * conflict of implicit collations, so report it that way. | ||||
| 				 */ | ||||
| 				ereport(ERROR, | ||||
| 						(errcode(ERRCODE_INDETERMINATE_COLLATION), | ||||
| 						 errmsg("could not determine which collation to use for regular expression"), | ||||
| 						 errhint("Use the COLLATE clause to set the collation explicitly."))); | ||||
| 			} | ||||
| 			locale = pg_newlocale_from_collation(collation); | ||||
| 		} | ||||
| 	} | ||||
| 	/* Use the regexp machinery to extract the prefix, if any */ | ||||
| 	prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue), | ||||
| 								 case_insensitive, collation, | ||||
| 								 &exact); | ||||
|  | ||||
| 	/* the right-hand const is type text for all of these */ | ||||
| 	patt = TextDatumGetCString(patt_const->constvalue); | ||||
|  | ||||
| 	/* | ||||
| 	 * Check for ARE director prefix.  It's worth our trouble to recognize | ||||
| 	 * this because similar_escape() used to use it, and some other code might | ||||
| 	 * still use it, to force ARE mode. | ||||
| 	 */ | ||||
| 	pos = 0; | ||||
| 	if (strncmp(patt, "***:", 4) == 0) | ||||
| 		pos = 4; | ||||
|  | ||||
| 	/* Pattern must be anchored left */ | ||||
| 	if (patt[pos] != '^') | ||||
| 	if (prefix == NULL) | ||||
| 	{ | ||||
| 		*prefix_const = NULL; | ||||
|  | ||||
| 		if (rest_selec != NULL) | ||||
| 		{ | ||||
| 			char   *patt = TextDatumGetCString(patt_const->constvalue); | ||||
|  | ||||
| 			*rest_selec = regex_selectivity(patt, strlen(patt), | ||||
| 											case_insensitive); | ||||
|  | ||||
| 		return Pattern_Prefix_None; | ||||
| 	} | ||||
| 	pos++; | ||||
|  | ||||
| 	/* | ||||
| 	 * If '|' is present in pattern, then there may be multiple alternatives | ||||
| 	 * for the start of the string.  (There are cases where this isn't so, for | ||||
| 	 * instance if the '|' is inside parens, but detecting that reliably is | ||||
| 	 * too hard.) | ||||
| 	 */ | ||||
| 	if (strchr(patt + pos, '|') != NULL) | ||||
| 	{ | ||||
| 		*prefix_const = NULL; | ||||
|  | ||||
| 		if (rest_selec != NULL) | ||||
| 			*rest_selec = regex_selectivity(patt, strlen(patt), | ||||
| 											case_insensitive); | ||||
| 											case_insensitive, | ||||
| 											0); | ||||
| 			pfree(patt); | ||||
| 		} | ||||
|  | ||||
| 		return Pattern_Prefix_None; | ||||
| 	} | ||||
|  | ||||
| 	/* OK, allocate space for pattern */ | ||||
| 	match = palloc(strlen(patt) + 1); | ||||
| 	prev_match_pos = match_pos = 0; | ||||
|  | ||||
| 	/* | ||||
| 	 * We special-case the syntax '^(...)$' because psql uses it.  But beware: | ||||
| 	 * sequences beginning "(?" are not what they seem, unless they're "(?:". | ||||
| 	 * (We must recognize that because of similar_escape().) | ||||
| 	 */ | ||||
| 	have_leading_paren = false; | ||||
| 	if (patt[pos] == '(' && | ||||
| 		(patt[pos + 1] != '?' || patt[pos + 2] == ':')) | ||||
| 	{ | ||||
| 		have_leading_paren = true; | ||||
| 		pos += (patt[pos + 1] != '?' ? 1 : 3); | ||||
| 	} | ||||
|  | ||||
| 	/* Scan remainder of pattern */ | ||||
| 	prev_pos = pos; | ||||
| 	while (patt[pos]) | ||||
| 	{ | ||||
| 		int			len; | ||||
|  | ||||
| 		/* | ||||
| 		 * Check for characters that indicate multiple possible matches here. | ||||
| 		 * Also, drop out at ')' or '$' so the termination test works right. | ||||
| 		 */ | ||||
| 		if (patt[pos] == '.' || | ||||
| 			patt[pos] == '(' || | ||||
| 			patt[pos] == ')' || | ||||
| 			patt[pos] == '[' || | ||||
| 			patt[pos] == '^' || | ||||
| 			patt[pos] == '$') | ||||
| 			break; | ||||
|  | ||||
| 		/* Stop if case-varying character (it's sort of a wildcard) */ | ||||
| 		if (case_insensitive && | ||||
| 		  pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c)) | ||||
| 			break; | ||||
|  | ||||
| 		/* | ||||
| 		 * Check for quantifiers.  Except for +, this means the preceding | ||||
| 		 * character is optional, so we must remove it from the prefix too! | ||||
| 		 */ | ||||
| 		if (patt[pos] == '*' || | ||||
| 			patt[pos] == '?' || | ||||
| 			patt[pos] == '{') | ||||
| 		{ | ||||
| 			match_pos = prev_match_pos; | ||||
| 			pos = prev_pos; | ||||
| 			break; | ||||
| 		} | ||||
| 		if (patt[pos] == '+') | ||||
| 		{ | ||||
| 			pos = prev_pos; | ||||
| 			break; | ||||
| 		} | ||||
|  | ||||
| 		/* | ||||
| 		 * Normally, backslash quotes the next character.  But in AREs, | ||||
| 		 * backslash followed by alphanumeric is an escape, not a quoted | ||||
| 		 * character.  Must treat it as having multiple possible matches. | ||||
| 		 * Note: since only ASCII alphanumerics are escapes, we don't have to | ||||
| 		 * be paranoid about multibyte or collations here. | ||||
| 		 */ | ||||
| 		if (patt[pos] == '\\') | ||||
| 		{ | ||||
| 			if (isalnum((unsigned char) patt[pos + 1])) | ||||
| 				break; | ||||
| 			pos++; | ||||
| 			if (patt[pos] == '\0') | ||||
| 				break; | ||||
| 		} | ||||
| 		/* save position in case we need to back up on next loop cycle */ | ||||
| 		prev_match_pos = match_pos; | ||||
| 		prev_pos = pos; | ||||
| 		/* must use encoding-aware processing here */ | ||||
| 		len = pg_mblen(&patt[pos]); | ||||
| 		memcpy(&match[match_pos], &patt[pos], len); | ||||
| 		match_pos += len; | ||||
| 		pos += len; | ||||
| 	} | ||||
|  | ||||
| 	match[match_pos] = '\0'; | ||||
| 	rest = &patt[pos]; | ||||
|  | ||||
| 	if (have_leading_paren && patt[pos] == ')') | ||||
| 		pos++; | ||||
|  | ||||
| 	if (patt[pos] == '$' && patt[pos + 1] == '\0') | ||||
| 	{ | ||||
| 		*prefix_const = string_to_const(match, typeid); | ||||
|  | ||||
| 		if (rest_selec != NULL) | ||||
| 			*rest_selec = 1.0; | ||||
|  | ||||
| 		pfree(patt); | ||||
| 		pfree(match); | ||||
|  | ||||
| 		return Pattern_Prefix_Exact;	/* pattern specifies exact match */ | ||||
| 	} | ||||
|  | ||||
| 	*prefix_const = string_to_const(match, typeid); | ||||
| 	*prefix_const = string_to_const(prefix, typeid); | ||||
|  | ||||
| 	if (rest_selec != NULL) | ||||
| 		*rest_selec = regex_selectivity(rest, strlen(rest), | ||||
| 										case_insensitive); | ||||
| 	{ | ||||
| 		if (exact) | ||||
| 		{ | ||||
| 			/* Exact match, so there's no additional selectivity */ | ||||
| 			*rest_selec = 1.0; | ||||
| 		} | ||||
| 		else | ||||
| 		{ | ||||
| 			char   *patt = TextDatumGetCString(patt_const->constvalue); | ||||
|  | ||||
| 	pfree(patt); | ||||
| 	pfree(match); | ||||
| 			*rest_selec = regex_selectivity(patt, strlen(patt), | ||||
| 											case_insensitive, | ||||
| 											strlen(prefix)); | ||||
| 			pfree(patt); | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	if (match_pos > 0) | ||||
| 	pfree(prefix); | ||||
|  | ||||
| 	if (exact) | ||||
| 		return Pattern_Prefix_Exact;	/* pattern specifies exact match */ | ||||
| 	else | ||||
| 		return Pattern_Prefix_Partial; | ||||
|  | ||||
| 	return Pattern_Prefix_None; | ||||
| } | ||||
|  | ||||
| Pattern_Prefix_Status | ||||
| @@ -5741,7 +5602,8 @@ regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive) | ||||
| } | ||||
|  | ||||
| static Selectivity | ||||
| regex_selectivity(const char *patt, int pattlen, bool case_insensitive) | ||||
| regex_selectivity(const char *patt, int pattlen, bool case_insensitive, | ||||
| 				  int fixed_prefix_len) | ||||
| { | ||||
| 	Selectivity sel; | ||||
|  | ||||
| @@ -5757,9 +5619,14 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive) | ||||
| 		/* no trailing $ */ | ||||
| 		sel = regex_selectivity_sub(patt, pattlen, case_insensitive); | ||||
| 		sel *= FULL_WILDCARD_SEL; | ||||
| 		if (sel > 1.0) | ||||
| 			sel = 1.0; | ||||
| 	} | ||||
|  | ||||
| 	/* If there's a fixed prefix, discount its selectivity */ | ||||
| 	if (fixed_prefix_len > 0) | ||||
| 		sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len); | ||||
|  | ||||
| 	/* Make sure result stays in range */ | ||||
| 	CLAMP_PROBABILITY(sel); | ||||
| 	return sel; | ||||
| } | ||||
|  | ||||
|   | ||||
| @@ -156,6 +156,9 @@ typedef struct | ||||
| /* two specials for debugging and testing */ | ||||
| #define REG_ATOI	101			/* convert error-code name to number */ | ||||
| #define REG_ITOA	102			/* convert error-code number to name */ | ||||
| /* non-error result codes for pg_regprefix */ | ||||
| #define REG_PREFIX	(-1)		/* identified a common prefix */ | ||||
| #define REG_EXACT	(-2)		/* identified an exact match */ | ||||
|  | ||||
|  | ||||
|  | ||||
| @@ -164,6 +167,7 @@ typedef struct | ||||
|  */ | ||||
| extern int	pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid); | ||||
| extern int	pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int); | ||||
| extern int	pg_regprefix(regex_t *, pg_wchar **, size_t *); | ||||
| extern void pg_regfree(regex_t *); | ||||
| extern size_t pg_regerror(int, const regex_t *, char *, size_t); | ||||
| extern void pg_set_regex_collation(Oid collation); | ||||
|   | ||||
| @@ -199,19 +199,21 @@ struct colordesc | ||||
| 	color		sub;			/* open subcolor, if any; or free-chain ptr */ | ||||
| #define  NOSUB	 COLORLESS		/* value of "sub" when no open subcolor */ | ||||
| 	struct arc *arcs;			/* chain of all arcs of this color */ | ||||
| 	chr			firstchr;		/* char first assigned to this color */ | ||||
| 	int			flags;			/* bit values defined next */ | ||||
| #define  FREECOL 01				/* currently free */ | ||||
| #define  PSEUDO  02				/* pseudocolor, no real chars */ | ||||
| #define  UNUSEDCOLOR(cd) ((cd)->flags&FREECOL) | ||||
| #define  UNUSEDCOLOR(cd) ((cd)->flags & FREECOL) | ||||
| 	union tree *block;			/* block of solid color, if any */ | ||||
| }; | ||||
|  | ||||
| /* | ||||
|  * The color map itself | ||||
|  * | ||||
|  * Only the "tree" part is used at execution time, and that only via the | ||||
|  * GETCOLOR() macro.  Possibly that should be separated from the compile-time | ||||
|  * data. | ||||
|  * Much of the data in the colormap struct is only used at compile time. | ||||
|  * However, the bulk of the space usage is in the "tree" structure, so it's | ||||
|  * not clear that there's much point in converting the rest to a more compact | ||||
|  * form when compilation is finished. | ||||
|  */ | ||||
| struct colormap | ||||
| { | ||||
|   | ||||
| @@ -568,6 +568,8 @@ extern Datum regexp_split_to_table(PG_FUNCTION_ARGS); | ||||
| extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS); | ||||
| extern Datum regexp_split_to_array(PG_FUNCTION_ARGS); | ||||
| extern Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS); | ||||
| extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive, | ||||
| 								 Oid collation, bool *exact); | ||||
|  | ||||
| /* regproc.c */ | ||||
| extern Datum regprocin(PG_FUNCTION_ARGS); | ||||
|   | ||||
| @@ -90,3 +90,66 @@ select substring('a' from '((a)+)'); | ||||
|  a | ||||
| (1 row) | ||||
|  | ||||
| -- Test conversion of regex patterns to indexable conditions | ||||
| explain (costs off) select * from pg_proc where proname ~ 'abc'; | ||||
|             QUERY PLAN              | ||||
| ----------------------------------- | ||||
|  Seq Scan on pg_proc | ||||
|    Filter: (proname ~ 'abc'::text) | ||||
| (2 rows) | ||||
|  | ||||
| explain (costs off) select * from pg_proc where proname ~ '^abc'; | ||||
|                               QUERY PLAN                               | ||||
| ---------------------------------------------------------------------- | ||||
|  Index Scan using pg_proc_proname_args_nsp_index on pg_proc | ||||
|    Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name)) | ||||
|    Filter: (proname ~ '^abc'::text) | ||||
| (3 rows) | ||||
|  | ||||
| explain (costs off) select * from pg_proc where proname ~ '^abc$'; | ||||
|                          QUERY PLAN                          | ||||
| ------------------------------------------------------------ | ||||
|  Index Scan using pg_proc_proname_args_nsp_index on pg_proc | ||||
|    Index Cond: (proname = 'abc'::name) | ||||
|    Filter: (proname ~ '^abc$'::text) | ||||
| (3 rows) | ||||
|  | ||||
| explain (costs off) select * from pg_proc where proname ~ '^abcd*e'; | ||||
|                               QUERY PLAN                               | ||||
| ---------------------------------------------------------------------- | ||||
|  Index Scan using pg_proc_proname_args_nsp_index on pg_proc | ||||
|    Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name)) | ||||
|    Filter: (proname ~ '^abcd*e'::text) | ||||
| (3 rows) | ||||
|  | ||||
| explain (costs off) select * from pg_proc where proname ~ '^abc+d'; | ||||
|                               QUERY PLAN                               | ||||
| ---------------------------------------------------------------------- | ||||
|  Index Scan using pg_proc_proname_args_nsp_index on pg_proc | ||||
|    Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name)) | ||||
|    Filter: (proname ~ '^abc+d'::text) | ||||
| (3 rows) | ||||
|  | ||||
| explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)'; | ||||
|                                  QUERY PLAN                                  | ||||
| ---------------------------------------------------------------------------- | ||||
|  Index Scan using pg_proc_proname_args_nsp_index on pg_proc | ||||
|    Index Cond: ((proname >= 'abcdef'::name) AND (proname < 'abcdeg'::name)) | ||||
|    Filter: (proname ~ '^(abc)(def)'::text) | ||||
| (3 rows) | ||||
|  | ||||
| explain (costs off) select * from pg_proc where proname ~ '^(abc)$'; | ||||
|                          QUERY PLAN                          | ||||
| ------------------------------------------------------------ | ||||
|  Index Scan using pg_proc_proname_args_nsp_index on pg_proc | ||||
|    Index Cond: (proname = 'abc'::name) | ||||
|    Filter: (proname ~ '^(abc)$'::text) | ||||
| (3 rows) | ||||
|  | ||||
| explain (costs off) select * from pg_proc where proname ~ '^(abc)?d'; | ||||
|                QUERY PLAN                | ||||
| ---------------------------------------- | ||||
|  Seq Scan on pg_proc | ||||
|    Filter: (proname ~ '^(abc)?d'::text) | ||||
| (2 rows) | ||||
|  | ||||
|   | ||||
| @@ -24,3 +24,13 @@ select 'abc abc abd' ~ '^(.+)( \1)+$' as f; | ||||
| select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)'); | ||||
| select substring('a' from '((a))+'); | ||||
| select substring('a' from '((a)+)'); | ||||
|  | ||||
| -- Test conversion of regex patterns to indexable conditions | ||||
| explain (costs off) select * from pg_proc where proname ~ 'abc'; | ||||
| explain (costs off) select * from pg_proc where proname ~ '^abc'; | ||||
| explain (costs off) select * from pg_proc where proname ~ '^abc$'; | ||||
| explain (costs off) select * from pg_proc where proname ~ '^abcd*e'; | ||||
| explain (costs off) select * from pg_proc where proname ~ '^abc+d'; | ||||
| explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)'; | ||||
| explain (costs off) select * from pg_proc where proname ~ '^(abc)$'; | ||||
| explain (costs off) select * from pg_proc where proname ~ '^(abc)?d'; | ||||
|   | ||||
		Reference in New Issue
	
	Block a user