Solve the 'Turkish problem' with undesirable locale behavior for case

conversion of basic ASCII letters. Remove all uses of strcasecmp and strncasecmp in favor of new functions pg_strcasecmp and pg_strncasecmp; remove most but not all direct uses of toupper and tolower in favor of pg_toupper and pg_tolower. These functions use the same notions of case folding already developed for identifier case conversion. I left the straight locale-based folding in place for situations where we are just manipulating user data and not trying to match it to built-in strings --- for example, the SQL upper() function is still locale dependent. Perhaps this will prove not to be what's wanted, but at the moment we can initdb and pass regression tests in Turkish locale.
2025-09-11 00:12:06 +03:00 · 2004-05-07 00:24:59 +00:00
parent 4d46274b33
commit 0bd61548ab
67 changed files with 820 additions and 923 deletions
--- a/src/port/pgstrcasecmp.c
+++ b/src/port/pgstrcasecmp.c
@@ -0,0 +1,125 @@
+/*-------------------------------------------------------------------------
+ *
+ * pgstrcasecmp.c
+ *	   Portable SQL-like case-independent comparisons and conversions.
+ *
+ * SQL99 specifies Unicode-aware case normalization, which we don't yet
+ * have the infrastructure for.  Instead we use tolower() to provide a
+ * locale-aware translation.  However, there are some locales where this
+ * is not right either (eg, Turkish may do strange things with 'i' and
+ * 'I').  Our current compromise is to use tolower() for characters with
+ * the high bit set, and use an ASCII-only downcasing for 7-bit
+ * characters.
+ *
+ * NB: this code should match downcase_truncate_identifier() in scansup.c.
+ *
+ *
+ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
+ *
+ * $PostgreSQL: pgsql/src/port/pgstrcasecmp.c,v 1.1 2004/05/07 00:24:59 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <ctype.h>
+
+
+/*
+ * Case-independent comparison of two null-terminated strings.
+ */
+int
+pg_strcasecmp(const char *s1, const char *s2)
+{
+	for (;;)
+	{
+		unsigned char	ch1 = (unsigned char) *s1++;
+		unsigned char	ch2 = (unsigned char) *s2++;
+
+		if (ch1 != ch2)
+		{
+			if (ch1 >= 'A' && ch1 <= 'Z')
+				ch1 += 'a' - 'A';
+			else if (ch1 >= 0x80 && isupper(ch1))
+				ch1 = tolower(ch1);
+
+			if (ch2 >= 'A' && ch2 <= 'Z')
+				ch2 += 'a' - 'A';
+			else if (ch2 >= 0x80 && isupper(ch2))
+				ch2 = tolower(ch2);
+
+			if (ch1 != ch2)
+				return (int) ch1 - (int) ch2;
+		}
+		if (ch1 == 0)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Case-independent comparison of two not-necessarily-null-terminated strings.
+ * At most n bytes will be examined from each string.
+ */
+int
+pg_strncasecmp(const char *s1, const char *s2, size_t n)
+{
+	while (n-- > 0)
+	{
+		unsigned char	ch1 = (unsigned char) *s1++;
+		unsigned char	ch2 = (unsigned char) *s2++;
+
+		if (ch1 != ch2)
+		{
+			if (ch1 >= 'A' && ch1 <= 'Z')
+				ch1 += 'a' - 'A';
+			else if (ch1 >= 0x80 && isupper(ch1))
+				ch1 = tolower(ch1);
+
+			if (ch2 >= 'A' && ch2 <= 'Z')
+				ch2 += 'a' - 'A';
+			else if (ch2 >= 0x80 && isupper(ch2))
+				ch2 = tolower(ch2);
+
+			if (ch1 != ch2)
+				return (int) ch1 - (int) ch2;
+		}
+		if (ch1 == 0)
+			break;
+	}
+	return 0;
+}
+
+/*
+ * Fold a character to upper case.
+ *
+ * Unlike some versions of toupper(), this is safe to apply to characters
+ * that aren't upper case letters.  Note however that the whole thing is
+ * a bit bogus for multibyte character sets.
+ */
+unsigned char
+pg_toupper(unsigned char ch)
+{
+	if (ch >= 'a' && ch <= 'z')
+		ch += 'A' - 'a';
+	else if (ch >= 0x80 && islower(ch))
+		ch = toupper(ch);
+	return ch;
+}
+
+/*
+ * Fold a character to lower case.
+ *
+ * Unlike some versions of tolower(), this is safe to apply to characters
+ * that aren't lower case letters.  Note however that the whole thing is
+ * a bit bogus for multibyte character sets.
+ */
+unsigned char
+pg_tolower(unsigned char ch)
+{
+	if (ch >= 'A' && ch <= 'Z')
+		ch += 'a' - 'A';
+	else if (ch >= 0x80 && isupper(ch))
+		ch = tolower(ch);
+	return ch;
+}