mirror of
https://github.com/postgres/postgres.git
synced 2025-11-10 17:42:29 +03:00
I made the patch that implements regexp_replace again.
The specification of this function is as follows.
regexp_replace(source text, pattern text, replacement text, [flags
text])
returns text
Replace string that matches to regular expression in source text to
replacement text.
- pattern is regular expression pattern.
- replacement is replace string that can use '\1'-'\9', and '\&'.
'\1'-'\9': back reference to the n'th subexpression.
'\&' : entire matched string.
- flags can use the following values:
g: global (replace all)
i: ignore case
When the flags is not specified, case sensitive, replace the first
instance only.
Atsushi Ogawa
This commit is contained in:
@@ -27,7 +27,7 @@
|
||||
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*
|
||||
* $PostgreSQL: pgsql/src/backend/regex/regexec.c,v 1.24 2003/11/29 19:51:55 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/regex/regexec.c,v 1.25 2005/07/10 04:54:30 momjian Exp $
|
||||
*
|
||||
*/
|
||||
|
||||
@@ -110,6 +110,7 @@ struct vars
|
||||
regmatch_t *pmatch;
|
||||
rm_detail_t *details;
|
||||
chr *start; /* start of string */
|
||||
chr *search_start; /* search start of string */
|
||||
chr *stop; /* just past end of string */
|
||||
int err; /* error code if any (0 none) */
|
||||
regoff_t *mem; /* memory vector for backtracking */
|
||||
@@ -168,6 +169,7 @@ int
|
||||
pg_regexec(regex_t *re,
|
||||
const chr *string,
|
||||
size_t len,
|
||||
size_t search_start,
|
||||
rm_detail_t *details,
|
||||
size_t nmatch,
|
||||
regmatch_t pmatch[],
|
||||
@@ -219,6 +221,7 @@ pg_regexec(regex_t *re,
|
||||
v->pmatch = pmatch;
|
||||
v->details = details;
|
||||
v->start = (chr *) string;
|
||||
v->search_start = (chr *) string + search_start;
|
||||
v->stop = (chr *) string + len;
|
||||
v->err = 0;
|
||||
if (backref)
|
||||
@@ -288,7 +291,8 @@ find(struct vars * v,
|
||||
NOERR();
|
||||
MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
|
||||
cold = NULL;
|
||||
close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *) NULL);
|
||||
close = shortest(v, s, v->search_start, v->search_start, v->stop,
|
||||
&cold, (int *) NULL);
|
||||
freedfa(s);
|
||||
NOERR();
|
||||
if (v->g->cflags & REG_EXPECT)
|
||||
@@ -415,7 +419,7 @@ cfindloop(struct vars * v,
|
||||
|
||||
assert(d != NULL && s != NULL);
|
||||
cold = NULL;
|
||||
close = v->start;
|
||||
close = v->search_start;
|
||||
do
|
||||
{
|
||||
MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.56 2004/12/31 22:01:22 pgsql Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.57 2005/07/10 04:54:30 momjian Exp $
|
||||
*
|
||||
* Alistair Crooks added the code for the regex caching
|
||||
* agc - cached the regular expressions used - there's a good chance
|
||||
@@ -81,38 +81,27 @@ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
|
||||
|
||||
|
||||
/*
|
||||
* RE_compile_and_execute - compile and execute a RE, caching if possible
|
||||
* RE_compile_and_cache - compile a RE, caching if possible
|
||||
*
|
||||
* Returns TRUE on match, FALSE on no match
|
||||
* Returns regex_t
|
||||
*
|
||||
* text_re --- the pattern, expressed as an *untoasted* TEXT object
|
||||
* dat --- the data to match against (need not be null-terminated)
|
||||
* dat_len --- the length of the data string
|
||||
* cflags --- compile options for the pattern
|
||||
* nmatch, pmatch --- optional return area for match details
|
||||
* text_re --- the pattern, expressed as an *untoasted* TEXT object
|
||||
* cflags --- compile options for the pattern
|
||||
*
|
||||
* Both pattern and data are given in the database encoding. We internally
|
||||
* convert to array of pg_wchar which is what Spencer's regex package wants.
|
||||
* Pattern is given in the database encoding. We internally convert to
|
||||
* array of pg_wchar which is what Spencer's regex package wants.
|
||||
*/
|
||||
static bool
|
||||
RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
||||
int cflags, int nmatch, regmatch_t *pmatch)
|
||||
static regex_t
|
||||
RE_compile_and_cache(text *text_re, int cflags)
|
||||
{
|
||||
int text_re_len = VARSIZE(text_re);
|
||||
pg_wchar *data;
|
||||
size_t data_len;
|
||||
pg_wchar *pattern;
|
||||
size_t pattern_len;
|
||||
int i;
|
||||
int regcomp_result;
|
||||
int regexec_result;
|
||||
cached_re_str re_temp;
|
||||
char errMsg[100];
|
||||
|
||||
/* Convert data string to wide characters */
|
||||
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
|
||||
data_len = pg_mb2wchar_with_len(dat, data, dat_len);
|
||||
|
||||
/*
|
||||
* Look for a match among previously compiled REs. Since the data
|
||||
* structure is self-organizing with most-used entries at the front,
|
||||
@@ -134,28 +123,7 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
||||
re_array[0] = re_temp;
|
||||
}
|
||||
|
||||
/* Perform RE match and return result */
|
||||
regexec_result = pg_regexec(&re_array[0].cre_re,
|
||||
data,
|
||||
data_len,
|
||||
NULL, /* no details */
|
||||
nmatch,
|
||||
pmatch,
|
||||
0);
|
||||
|
||||
pfree(data);
|
||||
|
||||
if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
|
||||
{
|
||||
/* re failed??? */
|
||||
pg_regerror(regexec_result, &re_array[0].cre_re,
|
||||
errMsg, sizeof(errMsg));
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
|
||||
errmsg("regular expression failed: %s", errMsg)));
|
||||
}
|
||||
|
||||
return (regexec_result == REG_OKAY);
|
||||
return re_array[0].cre_re;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -220,10 +188,45 @@ RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
||||
re_array[0] = re_temp;
|
||||
num_res++;
|
||||
|
||||
return re_array[0].cre_re;
|
||||
}
|
||||
|
||||
/*
|
||||
* RE_compile_and_execute - compile and execute a RE
|
||||
*
|
||||
* Returns TRUE on match, FALSE on no match
|
||||
*
|
||||
* text_re --- the pattern, expressed as an *untoasted* TEXT object
|
||||
* dat --- the data to match against (need not be null-terminated)
|
||||
* dat_len --- the length of the data string
|
||||
* cflags --- compile options for the pattern
|
||||
* nmatch, pmatch --- optional return area for match details
|
||||
*
|
||||
* Both pattern and data are given in the database encoding. We internally
|
||||
* convert to array of pg_wchar which is what Spencer's regex package wants.
|
||||
*/
|
||||
static bool
|
||||
RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
|
||||
int cflags, int nmatch, regmatch_t *pmatch)
|
||||
{
|
||||
pg_wchar *data;
|
||||
size_t data_len;
|
||||
int regexec_result;
|
||||
regex_t re;
|
||||
char errMsg[100];
|
||||
|
||||
/* Convert data string to wide characters */
|
||||
data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
|
||||
data_len = pg_mb2wchar_with_len(dat, data, dat_len);
|
||||
|
||||
/* Compile RE */
|
||||
re = RE_compile_and_cache(text_re, cflags);
|
||||
|
||||
/* Perform RE match and return result */
|
||||
regexec_result = pg_regexec(&re_array[0].cre_re,
|
||||
data,
|
||||
data_len,
|
||||
0,
|
||||
NULL, /* no details */
|
||||
nmatch,
|
||||
pmatch,
|
||||
@@ -428,15 +431,89 @@ textregexsubstr(PG_FUNCTION_ARGS)
|
||||
eo = pmatch[0].rm_eo;
|
||||
}
|
||||
|
||||
return (DirectFunctionCall3(text_substr,
|
||||
return DirectFunctionCall3(text_substr,
|
||||
PointerGetDatum(s),
|
||||
Int32GetDatum(so + 1),
|
||||
Int32GetDatum(eo - so)));
|
||||
Int32GetDatum(eo - so));
|
||||
}
|
||||
|
||||
PG_RETURN_NULL();
|
||||
}
|
||||
|
||||
/*
|
||||
* textregexreplace_noopt()
|
||||
* Return a replace string matched by a regular expression.
|
||||
* This function is a version that doesn't specify the option of
|
||||
* textregexreplace. This is case sensitive, replace the first
|
||||
* instance only.
|
||||
*/
|
||||
Datum
|
||||
textregexreplace_noopt(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *s = PG_GETARG_TEXT_P(0);
|
||||
text *p = PG_GETARG_TEXT_P(1);
|
||||
text *r = PG_GETARG_TEXT_P(2);
|
||||
regex_t re;
|
||||
|
||||
re = RE_compile_and_cache(p, regex_flavor);
|
||||
|
||||
return DirectFunctionCall4(replace_text_regexp,
|
||||
PointerGetDatum(s),
|
||||
PointerGetDatum(&re),
|
||||
PointerGetDatum(r),
|
||||
BoolGetDatum(false));
|
||||
}
|
||||
|
||||
/*
|
||||
* textregexreplace()
|
||||
* Return a replace string matched by a regular expression.
|
||||
*/
|
||||
Datum
|
||||
textregexreplace(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *s = PG_GETARG_TEXT_P(0);
|
||||
text *p = PG_GETARG_TEXT_P(1);
|
||||
text *r = PG_GETARG_TEXT_P(2);
|
||||
text *opt = PG_GETARG_TEXT_P(3);
|
||||
char *opt_p = VARDATA(opt);
|
||||
int opt_len = (VARSIZE(opt) - VARHDRSZ);
|
||||
int i;
|
||||
bool global = false;
|
||||
bool ignorecase = false;
|
||||
regex_t re;
|
||||
|
||||
/* parse options */
|
||||
for (i = 0; i < opt_len; i++)
|
||||
{
|
||||
switch (opt_p[i])
|
||||
{
|
||||
case 'i':
|
||||
ignorecase = true;
|
||||
break;
|
||||
case 'g':
|
||||
global = true;
|
||||
break;
|
||||
default:
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
|
||||
errmsg("invalid option of regexp_replace: %c",
|
||||
opt_p[i])));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ignorecase)
|
||||
re = RE_compile_and_cache(p, regex_flavor | REG_ICASE);
|
||||
else
|
||||
re = RE_compile_and_cache(p, regex_flavor);
|
||||
|
||||
return DirectFunctionCall4(replace_text_regexp,
|
||||
PointerGetDatum(s),
|
||||
PointerGetDatum(&re),
|
||||
PointerGetDatum(r),
|
||||
BoolGetDatum(global));
|
||||
}
|
||||
|
||||
/* similar_escape()
|
||||
* Convert a SQL99 regexp pattern to POSIX style, so it can be used by
|
||||
* our regexp engine.
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
*
|
||||
*
|
||||
* IDENTIFICATION
|
||||
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.126 2005/07/07 04:36:08 momjian Exp $
|
||||
* $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.127 2005/07/10 04:54:30 momjian Exp $
|
||||
*
|
||||
*-------------------------------------------------------------------------
|
||||
*/
|
||||
@@ -28,6 +28,7 @@
|
||||
#include "utils/builtins.h"
|
||||
#include "utils/lsyscache.h"
|
||||
#include "utils/pg_locale.h"
|
||||
#include "regex/regex.h"
|
||||
|
||||
|
||||
typedef struct varlena unknown;
|
||||
@@ -1993,6 +1994,225 @@ replace_text(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_TEXT_P(ret_text);
|
||||
}
|
||||
|
||||
/*
|
||||
* check_replace_text_has_escape_char
|
||||
* check whether replace_text has escape char.
|
||||
*/
|
||||
static bool
|
||||
check_replace_text_has_escape_char(const text *replace_text)
|
||||
{
|
||||
const char *p = VARDATA(replace_text);
|
||||
const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
|
||||
|
||||
if (pg_database_encoding_max_length() == 1)
|
||||
{
|
||||
for (; p < p_end; p++)
|
||||
if (*p == '\\') return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (; p < p_end; p += pg_mblen(p))
|
||||
if (*p == '\\') return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* appendStringInfoRegexpSubstr
|
||||
* append string by using back references of regexp.
|
||||
*/
|
||||
static void
|
||||
appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
|
||||
regmatch_t *pmatch, text *src_text)
|
||||
{
|
||||
const char *p = VARDATA(replace_text);
|
||||
const char *p_end = p + (VARSIZE(replace_text) - VARHDRSZ);
|
||||
|
||||
int eml = pg_database_encoding_max_length();
|
||||
|
||||
int substr_start = 1;
|
||||
int ch_cnt;
|
||||
|
||||
int so;
|
||||
int eo;
|
||||
|
||||
while (1)
|
||||
{
|
||||
/* Find escape char. */
|
||||
ch_cnt = 0;
|
||||
if (eml == 1)
|
||||
{
|
||||
for (; p < p_end && *p != '\\'; p++)
|
||||
ch_cnt++;
|
||||
}
|
||||
else
|
||||
{
|
||||
for (; p < p_end && *p != '\\'; p += pg_mblen(p))
|
||||
ch_cnt++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the text when there is a text in the left of escape char
|
||||
* or escape char is not found.
|
||||
*/
|
||||
if (ch_cnt)
|
||||
{
|
||||
text *append_text = text_substring(PointerGetDatum(replace_text),
|
||||
substr_start, ch_cnt, false);
|
||||
appendStringInfoString(str, PG_TEXT_GET_STR(append_text));
|
||||
pfree(append_text);
|
||||
}
|
||||
substr_start += ch_cnt + 1;
|
||||
|
||||
if (p >= p_end) /* When escape char is not found. */
|
||||
break;
|
||||
|
||||
/* See the next character of escape char. */
|
||||
p++;
|
||||
so = eo = -1;
|
||||
|
||||
if (*p >= '1' && *p <= '9')
|
||||
{
|
||||
/* Use the back reference of regexp. */
|
||||
int idx = *p - '0';
|
||||
so = pmatch[idx].rm_so;
|
||||
eo = pmatch[idx].rm_eo;
|
||||
p++;
|
||||
substr_start++;
|
||||
}
|
||||
else if (*p == '&')
|
||||
{
|
||||
/* Use the entire matched string. */
|
||||
so = pmatch[0].rm_so;
|
||||
eo = pmatch[0].rm_eo;
|
||||
p++;
|
||||
substr_start++;
|
||||
}
|
||||
|
||||
if (so != -1 && eo != -1)
|
||||
{
|
||||
/* Copy the text that is back reference of regexp. */
|
||||
text *append_text = text_substring(PointerGetDatum(src_text),
|
||||
so + 1, (eo - so), false);
|
||||
appendStringInfoString(str, PG_TEXT_GET_STR(append_text));
|
||||
pfree(append_text);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define REGEXP_REPLACE_BACKREF_CNT 10
|
||||
|
||||
/*
|
||||
* replace_text_regexp
|
||||
* replace text that matches to regexp in src_text to replace_text.
|
||||
*/
|
||||
Datum
|
||||
replace_text_regexp(PG_FUNCTION_ARGS)
|
||||
{
|
||||
text *ret_text;
|
||||
text *src_text = PG_GETARG_TEXT_P(0);
|
||||
int src_text_len = VARSIZE(src_text) - VARHDRSZ;
|
||||
regex_t *re = (regex_t *)PG_GETARG_POINTER(1);
|
||||
text *replace_text = PG_GETARG_TEXT_P(2);
|
||||
bool global = PG_GETARG_BOOL(3);
|
||||
StringInfo str = makeStringInfo();
|
||||
int regexec_result;
|
||||
regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT];
|
||||
pg_wchar *data;
|
||||
size_t data_len;
|
||||
int search_start;
|
||||
int data_pos;
|
||||
bool have_escape;
|
||||
|
||||
/* Convert data string to wide characters. */
|
||||
data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
|
||||
data_len = pg_mb2wchar_with_len(VARDATA(src_text), data, src_text_len);
|
||||
|
||||
/* Check whether replace_text has escape char. */
|
||||
have_escape = check_replace_text_has_escape_char(replace_text);
|
||||
|
||||
for (search_start = data_pos = 0; search_start <= data_len;)
|
||||
{
|
||||
regexec_result = pg_regexec(re,
|
||||
data,
|
||||
data_len,
|
||||
search_start,
|
||||
NULL, /* no details */
|
||||
REGEXP_REPLACE_BACKREF_CNT,
|
||||
pmatch,
|
||||
0);
|
||||
|
||||
if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
|
||||
{
|
||||
char errMsg[100];
|
||||
|
||||
/* re failed??? */
|
||||
pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
|
||||
ereport(ERROR,
|
||||
(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
|
||||
errmsg("regular expression failed: %s", errMsg)));
|
||||
}
|
||||
|
||||
if (regexec_result == REG_NOMATCH)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Copy the text when there is a text in the left of matched position.
|
||||
*/
|
||||
if (pmatch[0].rm_so - data_pos > 0)
|
||||
{
|
||||
text *left_text = text_substring(PointerGetDatum(src_text),
|
||||
data_pos + 1,
|
||||
pmatch[0].rm_so - data_pos, false);
|
||||
appendStringInfoString(str, PG_TEXT_GET_STR(left_text));
|
||||
pfree(left_text);
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the replace_text. Process back references when the
|
||||
* replace_text has escape characters.
|
||||
*/
|
||||
if (have_escape)
|
||||
appendStringInfoRegexpSubstr(str, replace_text, pmatch, src_text);
|
||||
else
|
||||
appendStringInfoString(str, PG_TEXT_GET_STR(replace_text));
|
||||
|
||||
search_start = data_pos = pmatch[0].rm_eo;
|
||||
|
||||
/*
|
||||
* When global option is off, replace the first instance only.
|
||||
*/
|
||||
if (!global)
|
||||
break;
|
||||
|
||||
/*
|
||||
* Search from next character when the matching text is zero width.
|
||||
*/
|
||||
if (pmatch[0].rm_so == pmatch[0].rm_eo)
|
||||
search_start++;
|
||||
}
|
||||
|
||||
/*
|
||||
* Copy the text when there is a text at the right of last matched
|
||||
* or regexp is not matched.
|
||||
*/
|
||||
if (data_pos < data_len)
|
||||
{
|
||||
text *right_text = text_substring(PointerGetDatum(src_text),
|
||||
data_pos + 1, -1, true);
|
||||
appendStringInfoString(str, PG_TEXT_GET_STR(right_text));
|
||||
pfree(right_text);
|
||||
}
|
||||
|
||||
ret_text = PG_STR_GET_TEXT(str->data);
|
||||
pfree(str->data);
|
||||
pfree(str);
|
||||
pfree(data);
|
||||
|
||||
PG_RETURN_TEXT_P(ret_text);
|
||||
}
|
||||
|
||||
/*
|
||||
* split_text
|
||||
* parse input string
|
||||
|
||||
Reference in New Issue
Block a user