1
0
mirror of https://github.com/postgres/postgres.git synced 2025-05-02 11:44:50 +03:00
Peter Eisentraut 5e1963fb76 Collations with nondeterministic comparison
This adds a flag "deterministic" to collations.  If that is false,
such a collation disables various optimizations that assume that
strings are equal only if they are byte-wise equal.  That then allows
use cases such as case-insensitive or accent-insensitive comparisons
or handling of strings with different Unicode normal forms.

This functionality is only supported with the ICU provider.  At least
glibc doesn't appear to have any locales that work in a
nondeterministic way, so it's not worth supporting this for the libc
provider.

The term "deterministic comparison" in this context is from Unicode
Technical Standard #10
(https://unicode.org/reports/tr10/#Deterministic_Comparison).

This patch makes changes in three areas:

- CREATE COLLATION DDL changes and system catalog changes to support
  this new flag.

- Many executor nodes and auxiliary code are extended to track
  collations.  Previously, this code would just throw away collation
  information, because the eventually-called user-defined functions
  didn't use it since they only cared about equality, which didn't
  need collation information.

- String data type functions that do equality comparisons and hashing
  are changed to take the (non-)deterministic flag into account.  For
  comparison, this just means skipping various shortcuts and tie
  breakers that use byte-wise comparison.  For hashing, we first need
  to convert the input string to a canonical "sort key" using the ICU
  analogue of strxfrm().

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Peter Geoghegan <pg@bowt.ie>
Discussion: https://www.postgresql.org/message-id/flat/1ccc668f-4cbc-0bef-af67-450b47cdfee7@2ndquadrant.com
2019-03-22 12:12:43 +01:00

400 lines
9.0 KiB
C

/*-------------------------------------------------------------------------
*
* name.c
* Functions for the built-in type "name".
*
* name replaces char16 and is carefully implemented so that it
* is a string of physical length NAMEDATALEN.
* DO NOT use hard-coded constants anywhere
* always use NAMEDATALEN as the symbolic constant! - jolly 8/21/95
*
*
* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/utils/adt/name.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include "catalog/namespace.h"
#include "catalog/pg_collation.h"
#include "catalog/pg_type.h"
#include "libpq/pqformat.h"
#include "mb/pg_wchar.h"
#include "miscadmin.h"
#include "utils/array.h"
#include "utils/builtins.h"
#include "utils/lsyscache.h"
#include "utils/varlena.h"
/*****************************************************************************
* USER I/O ROUTINES (none) *
*****************************************************************************/
/*
* namein - converts "..." to internal representation
*
* Note:
* [Old] Currently if strlen(s) < NAMEDATALEN, the extra chars are nulls
* Now, always NULL terminated
*/
Datum
namein(PG_FUNCTION_ARGS)
{
char *s = PG_GETARG_CSTRING(0);
Name result;
int len;
len = strlen(s);
/* Truncate oversize input */
if (len >= NAMEDATALEN)
len = pg_mbcliplen(s, len, NAMEDATALEN - 1);
/* We use palloc0 here to ensure result is zero-padded */
result = (Name) palloc0(NAMEDATALEN);
memcpy(NameStr(*result), s, len);
PG_RETURN_NAME(result);
}
/*
* nameout - converts internal representation to "..."
*/
Datum
nameout(PG_FUNCTION_ARGS)
{
Name s = PG_GETARG_NAME(0);
PG_RETURN_CSTRING(pstrdup(NameStr(*s)));
}
/*
* namerecv - converts external binary format to name
*/
Datum
namerecv(PG_FUNCTION_ARGS)
{
StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
Name result;
char *str;
int nbytes;
str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
if (nbytes >= NAMEDATALEN)
ereport(ERROR,
(errcode(ERRCODE_NAME_TOO_LONG),
errmsg("identifier too long"),
errdetail("Identifier must be less than %d characters.",
NAMEDATALEN)));
result = (NameData *) palloc0(NAMEDATALEN);
memcpy(result, str, nbytes);
pfree(str);
PG_RETURN_NAME(result);
}
/*
* namesend - converts name to binary format
*/
Datum
namesend(PG_FUNCTION_ARGS)
{
Name s = PG_GETARG_NAME(0);
StringInfoData buf;
pq_begintypsend(&buf);
pq_sendtext(&buf, NameStr(*s), strlen(NameStr(*s)));
PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}
/*****************************************************************************
* COMPARISON/SORTING ROUTINES *
*****************************************************************************/
/*
* nameeq - returns 1 iff arguments are equal
* namene - returns 1 iff arguments are not equal
* namelt - returns 1 iff a < b
* namele - returns 1 iff a <= b
* namegt - returns 1 iff a > b
* namege - returns 1 iff a >= b
*
* Note that the use of strncmp with NAMEDATALEN limit is mostly historical;
* strcmp would do as well, because we do not allow NAME values that don't
* have a '\0' terminator. Whatever might be past the terminator is not
* considered relevant to comparisons.
*/
static int
namecmp(Name arg1, Name arg2, Oid collid)
{
/* Fast path for common case used in system catalogs */
if (collid == C_COLLATION_OID)
return strncmp(NameStr(*arg1), NameStr(*arg2), NAMEDATALEN);
/* Else rely on the varstr infrastructure */
return varstr_cmp(NameStr(*arg1), strlen(NameStr(*arg1)),
NameStr(*arg2), strlen(NameStr(*arg2)),
collid);
}
Datum
nameeq(PG_FUNCTION_ARGS)
{
Name arg1 = PG_GETARG_NAME(0);
Name arg2 = PG_GETARG_NAME(1);
PG_RETURN_BOOL(namecmp(arg1, arg2, PG_GET_COLLATION()) == 0);
}
Datum
namene(PG_FUNCTION_ARGS)
{
Name arg1 = PG_GETARG_NAME(0);
Name arg2 = PG_GETARG_NAME(1);
PG_RETURN_BOOL(namecmp(arg1, arg2, PG_GET_COLLATION()) != 0);
}
Datum
namelt(PG_FUNCTION_ARGS)
{
Name arg1 = PG_GETARG_NAME(0);
Name arg2 = PG_GETARG_NAME(1);
PG_RETURN_BOOL(namecmp(arg1, arg2, PG_GET_COLLATION()) < 0);
}
Datum
namele(PG_FUNCTION_ARGS)
{
Name arg1 = PG_GETARG_NAME(0);
Name arg2 = PG_GETARG_NAME(1);
PG_RETURN_BOOL(namecmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
}
Datum
namegt(PG_FUNCTION_ARGS)
{
Name arg1 = PG_GETARG_NAME(0);
Name arg2 = PG_GETARG_NAME(1);
PG_RETURN_BOOL(namecmp(arg1, arg2, PG_GET_COLLATION()) > 0);
}
Datum
namege(PG_FUNCTION_ARGS)
{
Name arg1 = PG_GETARG_NAME(0);
Name arg2 = PG_GETARG_NAME(1);
PG_RETURN_BOOL(namecmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
}
Datum
btnamecmp(PG_FUNCTION_ARGS)
{
Name arg1 = PG_GETARG_NAME(0);
Name arg2 = PG_GETARG_NAME(1);
PG_RETURN_INT32(namecmp(arg1, arg2, PG_GET_COLLATION()));
}
Datum
btnamesortsupport(PG_FUNCTION_ARGS)
{
SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
Oid collid = ssup->ssup_collation;
MemoryContext oldcontext;
oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
/* Use generic string SortSupport */
varstr_sortsupport(ssup, NAMEOID, collid);
MemoryContextSwitchTo(oldcontext);
PG_RETURN_VOID();
}
/*****************************************************************************
* MISCELLANEOUS PUBLIC ROUTINES *
*****************************************************************************/
int
namecpy(Name n1, const NameData *n2)
{
if (!n1 || !n2)
return -1;
StrNCpy(NameStr(*n1), NameStr(*n2), NAMEDATALEN);
return 0;
}
#ifdef NOT_USED
int
namecat(Name n1, Name n2)
{
return namestrcat(n1, NameStr(*n2)); /* n2 can't be any longer than n1 */
}
#endif
int
namestrcpy(Name name, const char *str)
{
if (!name || !str)
return -1;
StrNCpy(NameStr(*name), str, NAMEDATALEN);
return 0;
}
#ifdef NOT_USED
int
namestrcat(Name name, const char *str)
{
int i;
char *p,
*q;
if (!name || !str)
return -1;
for (i = 0, p = NameStr(*name); i < NAMEDATALEN && *p; ++i, ++p)
;
for (q = str; i < NAMEDATALEN; ++i, ++p, ++q)
{
*p = *q;
if (!*q)
break;
}
return 0;
}
#endif
/*
* Compare a NAME to a C string
*
* Assumes C collation always; be careful when using this for
* anything but equality checks!
*/
int
namestrcmp(Name name, const char *str)
{
if (!name && !str)
return 0;
if (!name)
return -1; /* NULL < anything */
if (!str)
return 1; /* NULL < anything */
return strncmp(NameStr(*name), str, NAMEDATALEN);
}
/*
* SQL-functions CURRENT_USER, SESSION_USER
*/
Datum
current_user(PG_FUNCTION_ARGS)
{
PG_RETURN_DATUM(DirectFunctionCall1(namein, CStringGetDatum(GetUserNameFromId(GetUserId(), false))));
}
Datum
session_user(PG_FUNCTION_ARGS)
{
PG_RETURN_DATUM(DirectFunctionCall1(namein, CStringGetDatum(GetUserNameFromId(GetSessionUserId(), false))));
}
/*
* SQL-functions CURRENT_SCHEMA, CURRENT_SCHEMAS
*/
Datum
current_schema(PG_FUNCTION_ARGS)
{
List *search_path = fetch_search_path(false);
char *nspname;
if (search_path == NIL)
PG_RETURN_NULL();
nspname = get_namespace_name(linitial_oid(search_path));
list_free(search_path);
if (!nspname)
PG_RETURN_NULL(); /* recently-deleted namespace? */
PG_RETURN_DATUM(DirectFunctionCall1(namein, CStringGetDatum(nspname)));
}
Datum
current_schemas(PG_FUNCTION_ARGS)
{
List *search_path = fetch_search_path(PG_GETARG_BOOL(0));
ListCell *l;
Datum *names;
int i;
ArrayType *array;
names = (Datum *) palloc(list_length(search_path) * sizeof(Datum));
i = 0;
foreach(l, search_path)
{
char *nspname;
nspname = get_namespace_name(lfirst_oid(l));
if (nspname) /* watch out for deleted namespace */
{
names[i] = DirectFunctionCall1(namein, CStringGetDatum(nspname));
i++;
}
}
list_free(search_path);
array = construct_array(names, i,
NAMEOID,
NAMEDATALEN, /* sizeof(Name) */
false, /* Name is not by-val */
'c'); /* alignment of Name */
PG_RETURN_POINTER(array);
}
/*
* SQL-function nameconcatoid(name, oid) returns name
*
* This is used in the information_schema to produce specific_name columns,
* which are supposed to be unique per schema. We achieve that (in an ugly
* way) by appending the object's OID. The result is the same as
* ($1::text || '_' || $2::text)::name
* except that, if it would not fit in NAMEDATALEN, we make it do so by
* truncating the name input (not the oid).
*/
Datum
nameconcatoid(PG_FUNCTION_ARGS)
{
Name nam = PG_GETARG_NAME(0);
Oid oid = PG_GETARG_OID(1);
Name result;
char suffix[20];
int suflen;
int namlen;
suflen = snprintf(suffix, sizeof(suffix), "_%u", oid);
namlen = strlen(NameStr(*nam));
/* Truncate oversize input by truncating name part, not suffix */
if (namlen + suflen >= NAMEDATALEN)
namlen = pg_mbcliplen(NameStr(*nam), namlen, NAMEDATALEN - 1 - suflen);
/* We use palloc0 here to ensure result is zero-padded */
result = (Name) palloc0(NAMEDATALEN);
memcpy(NameStr(*result), NameStr(*nam), namlen);
memcpy(NameStr(*result) + namlen, suffix, suflen);
PG_RETURN_NAME(result);
}