mirror of
https://github.com/postgres/postgres.git
synced 2025-12-19 17:02:53 +03:00
Use multibyte-aware extraction of pattern prefixes.
Previously, like_fixed_prefix() used char-at-a-time logic, which
forced it to be too conservative for case-insensitive matching.
Introduce like_fixed_prefix_ci(), and use that for case-insensitive
pattern prefixes. It uses multibyte and locale-aware logic, along with
the new pg_iswcased() API introduced in 630706ced0.
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
Reviewed-by: Peter Eisentraut <peter@eisentraut.org>
Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com
This commit is contained in:
@@ -99,8 +99,6 @@ static Selectivity like_selectivity(const char *patt, int pattlen,
|
|||||||
static Selectivity regex_selectivity(const char *patt, int pattlen,
|
static Selectivity regex_selectivity(const char *patt, int pattlen,
|
||||||
bool case_insensitive,
|
bool case_insensitive,
|
||||||
int fixed_prefix_len);
|
int fixed_prefix_len);
|
||||||
static int pattern_char_isalpha(char c, bool is_multibyte,
|
|
||||||
pg_locale_t locale);
|
|
||||||
static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
|
static Const *make_greater_string(const Const *str_const, FmgrInfo *ltproc,
|
||||||
Oid collation);
|
Oid collation);
|
||||||
static Datum string_to_datum(const char *str, Oid datatype);
|
static Datum string_to_datum(const char *str, Oid datatype);
|
||||||
@@ -986,8 +984,8 @@ icnlikejoinsel(PG_FUNCTION_ARGS)
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
static Pattern_Prefix_Status
|
static Pattern_Prefix_Status
|
||||||
like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
|
like_fixed_prefix(Const *patt_const, Const **prefix_const,
|
||||||
Const **prefix_const, Selectivity *rest_selec)
|
Selectivity *rest_selec)
|
||||||
{
|
{
|
||||||
char *match;
|
char *match;
|
||||||
char *patt;
|
char *patt;
|
||||||
@@ -995,34 +993,10 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
|
|||||||
Oid typeid = patt_const->consttype;
|
Oid typeid = patt_const->consttype;
|
||||||
int pos,
|
int pos,
|
||||||
match_pos;
|
match_pos;
|
||||||
bool is_multibyte = (pg_database_encoding_max_length() > 1);
|
|
||||||
pg_locale_t locale = 0;
|
|
||||||
|
|
||||||
/* the right-hand const is type text or bytea */
|
/* the right-hand const is type text or bytea */
|
||||||
Assert(typeid == BYTEAOID || typeid == TEXTOID);
|
Assert(typeid == BYTEAOID || typeid == TEXTOID);
|
||||||
|
|
||||||
if (case_insensitive)
|
|
||||||
{
|
|
||||||
if (typeid == BYTEAOID)
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
|
||||||
errmsg("case insensitive matching not supported on type bytea")));
|
|
||||||
|
|
||||||
if (!OidIsValid(collation))
|
|
||||||
{
|
|
||||||
/*
|
|
||||||
* This typically means that the parser could not resolve a
|
|
||||||
* conflict of implicit collations, so report it that way.
|
|
||||||
*/
|
|
||||||
ereport(ERROR,
|
|
||||||
(errcode(ERRCODE_INDETERMINATE_COLLATION),
|
|
||||||
errmsg("could not determine which collation to use for ILIKE"),
|
|
||||||
errhint("Use the COLLATE clause to set the collation explicitly.")));
|
|
||||||
}
|
|
||||||
|
|
||||||
locale = pg_newlocale_from_collation(collation);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (typeid != BYTEAOID)
|
if (typeid != BYTEAOID)
|
||||||
{
|
{
|
||||||
patt = TextDatumGetCString(patt_const->constvalue);
|
patt = TextDatumGetCString(patt_const->constvalue);
|
||||||
@@ -1055,11 +1029,6 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Stop if case-varying character (it's sort of a wildcard) */
|
|
||||||
if (case_insensitive &&
|
|
||||||
pattern_char_isalpha(patt[pos], is_multibyte, locale))
|
|
||||||
break;
|
|
||||||
|
|
||||||
match[match_pos++] = patt[pos];
|
match[match_pos++] = patt[pos];
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1071,8 +1040,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
|
|||||||
*prefix_const = string_to_bytea_const(match, match_pos);
|
*prefix_const = string_to_bytea_const(match, match_pos);
|
||||||
|
|
||||||
if (rest_selec != NULL)
|
if (rest_selec != NULL)
|
||||||
*rest_selec = like_selectivity(&patt[pos], pattlen - pos,
|
*rest_selec = like_selectivity(&patt[pos], pattlen - pos, false);
|
||||||
case_insensitive);
|
|
||||||
|
|
||||||
pfree(patt);
|
pfree(patt);
|
||||||
pfree(match);
|
pfree(match);
|
||||||
@@ -1087,6 +1055,112 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
|
|||||||
return Pattern_Prefix_None;
|
return Pattern_Prefix_None;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Case-insensitive variant of like_fixed_prefix(). Multibyte and
|
||||||
|
* locale-aware for detecting cased characters.
|
||||||
|
*/
|
||||||
|
static Pattern_Prefix_Status
|
||||||
|
like_fixed_prefix_ci(Const *patt_const, Oid collation, Const **prefix_const,
|
||||||
|
Selectivity *rest_selec)
|
||||||
|
{
|
||||||
|
text *val = DatumGetTextPP(patt_const->constvalue);
|
||||||
|
Oid typeid = patt_const->consttype;
|
||||||
|
int nbytes = VARSIZE_ANY_EXHDR(val);
|
||||||
|
int wpos;
|
||||||
|
pg_wchar *wpatt;
|
||||||
|
int wpattlen;
|
||||||
|
pg_wchar *wmatch;
|
||||||
|
int wmatch_pos = 0;
|
||||||
|
char *match;
|
||||||
|
int match_mblen;
|
||||||
|
pg_locale_t locale = 0;
|
||||||
|
|
||||||
|
/* the right-hand const is type text or bytea */
|
||||||
|
Assert(typeid == BYTEAOID || typeid == TEXTOID);
|
||||||
|
|
||||||
|
if (typeid == BYTEAOID)
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
|
||||||
|
errmsg("case insensitive matching not supported on type bytea")));
|
||||||
|
|
||||||
|
if (!OidIsValid(collation))
|
||||||
|
{
|
||||||
|
/*
|
||||||
|
* This typically means that the parser could not resolve a conflict
|
||||||
|
* of implicit collations, so report it that way.
|
||||||
|
*/
|
||||||
|
ereport(ERROR,
|
||||||
|
(errcode(ERRCODE_INDETERMINATE_COLLATION),
|
||||||
|
errmsg("could not determine which collation to use for ILIKE"),
|
||||||
|
errhint("Use the COLLATE clause to set the collation explicitly.")));
|
||||||
|
}
|
||||||
|
|
||||||
|
locale = pg_newlocale_from_collation(collation);
|
||||||
|
|
||||||
|
wpatt = palloc((nbytes + 1) * sizeof(pg_wchar));
|
||||||
|
wpattlen = pg_mb2wchar_with_len(VARDATA_ANY(val), wpatt, nbytes);
|
||||||
|
|
||||||
|
wmatch = palloc((nbytes + 1) * sizeof(pg_wchar));
|
||||||
|
for (wpos = 0; wpos < wpattlen; wpos++)
|
||||||
|
{
|
||||||
|
/* % and _ are wildcard characters in LIKE */
|
||||||
|
if (wpatt[wpos] == '%' ||
|
||||||
|
wpatt[wpos] == '_')
|
||||||
|
break;
|
||||||
|
|
||||||
|
/* Backslash escapes the next character */
|
||||||
|
if (wpatt[wpos] == '\\')
|
||||||
|
{
|
||||||
|
wpos++;
|
||||||
|
if (wpos >= wpattlen)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* For ILIKE, stop if it's a case-varying character (it's sort of a
|
||||||
|
* wildcard).
|
||||||
|
*/
|
||||||
|
if (pg_iswcased(wpatt[wpos], locale))
|
||||||
|
break;
|
||||||
|
|
||||||
|
wmatch[wmatch_pos++] = wpatt[wpos];
|
||||||
|
}
|
||||||
|
|
||||||
|
wmatch[wmatch_pos] = '\0';
|
||||||
|
|
||||||
|
match = palloc(pg_database_encoding_max_length() * wmatch_pos + 1);
|
||||||
|
match_mblen = pg_wchar2mb_with_len(wmatch, match, wmatch_pos);
|
||||||
|
match[match_mblen] = '\0';
|
||||||
|
pfree(wmatch);
|
||||||
|
|
||||||
|
*prefix_const = string_to_const(match, TEXTOID);
|
||||||
|
pfree(match);
|
||||||
|
|
||||||
|
if (rest_selec != NULL)
|
||||||
|
{
|
||||||
|
int wrestlen = wpattlen - wmatch_pos;
|
||||||
|
char *rest;
|
||||||
|
int rest_mblen;
|
||||||
|
|
||||||
|
rest = palloc(pg_database_encoding_max_length() * wrestlen + 1);
|
||||||
|
rest_mblen = pg_wchar2mb_with_len(&wpatt[wmatch_pos], rest, wrestlen);
|
||||||
|
|
||||||
|
*rest_selec = like_selectivity(rest, rest_mblen, true);
|
||||||
|
pfree(rest);
|
||||||
|
}
|
||||||
|
|
||||||
|
pfree(wpatt);
|
||||||
|
|
||||||
|
/* in LIKE, an empty pattern is an exact match! */
|
||||||
|
if (wpos == wpattlen)
|
||||||
|
return Pattern_Prefix_Exact; /* reached end of pattern, so exact */
|
||||||
|
|
||||||
|
if (wmatch_pos > 0)
|
||||||
|
return Pattern_Prefix_Partial;
|
||||||
|
|
||||||
|
return Pattern_Prefix_None;
|
||||||
|
}
|
||||||
|
|
||||||
static Pattern_Prefix_Status
|
static Pattern_Prefix_Status
|
||||||
regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
|
regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
|
||||||
Const **prefix_const, Selectivity *rest_selec)
|
Const **prefix_const, Selectivity *rest_selec)
|
||||||
@@ -1164,12 +1238,11 @@ pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
|
|||||||
switch (ptype)
|
switch (ptype)
|
||||||
{
|
{
|
||||||
case Pattern_Type_Like:
|
case Pattern_Type_Like:
|
||||||
result = like_fixed_prefix(patt, false, collation,
|
result = like_fixed_prefix(patt, prefix, rest_selec);
|
||||||
prefix, rest_selec);
|
|
||||||
break;
|
break;
|
||||||
case Pattern_Type_Like_IC:
|
case Pattern_Type_Like_IC:
|
||||||
result = like_fixed_prefix(patt, true, collation,
|
result = like_fixed_prefix_ci(patt, collation, prefix,
|
||||||
prefix, rest_selec);
|
rest_selec);
|
||||||
break;
|
break;
|
||||||
case Pattern_Type_Regex:
|
case Pattern_Type_Regex:
|
||||||
result = regex_fixed_prefix(patt, false, collation,
|
result = regex_fixed_prefix(patt, false, collation,
|
||||||
@@ -1481,24 +1554,6 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
|
|||||||
return sel;
|
return sel;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Check whether char is a letter (and, hence, subject to case-folding)
|
|
||||||
*
|
|
||||||
* In multibyte character sets or with ICU, we can't use isalpha, and it does
|
|
||||||
* not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
|
|
||||||
* Instead, just assume any non-ASCII char is potentially case-varying, and
|
|
||||||
* hard-wire knowledge of which ASCII chars are letters.
|
|
||||||
*/
|
|
||||||
static int
|
|
||||||
pattern_char_isalpha(char c, bool is_multibyte,
|
|
||||||
pg_locale_t locale)
|
|
||||||
{
|
|
||||||
if (locale->ctype_is_c)
|
|
||||||
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
|
|
||||||
else
|
|
||||||
return char_is_cased(c, locale);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* For bytea, the increment function need only increment the current byte
|
* For bytea, the increment function need only increment the current byte
|
||||||
|
|||||||
Reference in New Issue
Block a user