1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-26 01:22:12 +03:00

Solve the 'Turkish problem' with undesirable locale behavior for case

conversion of basic ASCII letters.  Remove all uses of strcasecmp and
strncasecmp in favor of new functions pg_strcasecmp and pg_strncasecmp;
remove most but not all direct uses of toupper and tolower in favor of
pg_toupper and pg_tolower.  These functions use the same notions of
case folding already developed for identifier case conversion.  I left
the straight locale-based folding in place for situations where we are
just manipulating user data and not trying to match it to built-in
strings --- for example, the SQL upper() function is still locale
dependent.  Perhaps this will prove not to be what's wanted, but at
the moment we can initdb and pass regression tests in Turkish locale.
This commit is contained in:
Tom Lane
2004-05-07 00:24:59 +00:00
parent 4d46274b33
commit 0bd61548ab
67 changed files with 820 additions and 923 deletions

125
src/port/pgstrcasecmp.c Normal file

@ -0,0 +1,125 @@
/*-------------------------------------------------------------------------
*
* pgstrcasecmp.c
* Portable SQL-like case-independent comparisons and conversions.
*
* SQL99 specifies Unicode-aware case normalization, which we don't yet
* have the infrastructure for. Instead we use tolower() to provide a
* locale-aware translation. However, there are some locales where this
* is not right either (eg, Turkish may do strange things with 'i' and
* 'I'). Our current compromise is to use tolower() for characters with
* the high bit set, and use an ASCII-only downcasing for 7-bit
* characters.
*
* NB: this code should match downcase_truncate_identifier() in scansup.c.
*
*
* Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
*
* $PostgreSQL: pgsql/src/port/pgstrcasecmp.c,v 1.1 2004/05/07 00:24:59 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
#include <ctype.h>
/*
* Case-independent comparison of two null-terminated strings.
*/
int
pg_strcasecmp(const char *s1, const char *s2)
{
for (;;)
{
unsigned char ch1 = (unsigned char) *s1++;
unsigned char ch2 = (unsigned char) *s2++;
if (ch1 != ch2)
{
if (ch1 >= 'A' && ch1 <= 'Z')
ch1 += 'a' - 'A';
else if (ch1 >= 0x80 && isupper(ch1))
ch1 = tolower(ch1);
if (ch2 >= 'A' && ch2 <= 'Z')
ch2 += 'a' - 'A';
else if (ch2 >= 0x80 && isupper(ch2))
ch2 = tolower(ch2);
if (ch1 != ch2)
return (int) ch1 - (int) ch2;
}
if (ch1 == 0)
break;
}
return 0;
}
/*
* Case-independent comparison of two not-necessarily-null-terminated strings.
* At most n bytes will be examined from each string.
*/
int
pg_strncasecmp(const char *s1, const char *s2, size_t n)
{
while (n-- > 0)
{
unsigned char ch1 = (unsigned char) *s1++;
unsigned char ch2 = (unsigned char) *s2++;
if (ch1 != ch2)
{
if (ch1 >= 'A' && ch1 <= 'Z')
ch1 += 'a' - 'A';
else if (ch1 >= 0x80 && isupper(ch1))
ch1 = tolower(ch1);
if (ch2 >= 'A' && ch2 <= 'Z')
ch2 += 'a' - 'A';
else if (ch2 >= 0x80 && isupper(ch2))
ch2 = tolower(ch2);
if (ch1 != ch2)
return (int) ch1 - (int) ch2;
}
if (ch1 == 0)
break;
}
return 0;
}
/*
* Fold a character to upper case.
*
* Unlike some versions of toupper(), this is safe to apply to characters
* that aren't upper case letters. Note however that the whole thing is
* a bit bogus for multibyte character sets.
*/
unsigned char
pg_toupper(unsigned char ch)
{
if (ch >= 'a' && ch <= 'z')
ch += 'A' - 'a';
else if (ch >= 0x80 && islower(ch))
ch = toupper(ch);
return ch;
}
/*
* Fold a character to lower case.
*
* Unlike some versions of tolower(), this is safe to apply to characters
* that aren't lower case letters. Note however that the whole thing is
* a bit bogus for multibyte character sets.
*/
unsigned char
pg_tolower(unsigned char ch)
{
if (ch >= 'A' && ch <= 'Z')
ch += 'a' - 'A';
else if (ch >= 0x80 && isupper(ch))
ch = tolower(ch);
return ch;
}