From e1528933058db8db8e7398c044b1e9fee82cddcf Mon Sep 17 00:00:00 2001
From: Andrew Dunstan <andrew@dunslane.net>
Date: Sat, 22 Sep 2007 03:58:34 +0000
Subject: [PATCH] Go back to using a separate method for doing ILIKE for single
 byte character encodings that doesn't involve calling lower(). This should
 cure the performance regression in this case complained of by Guillaume Smet.
 It still leaves the horrid performance for multi-byte encodings introduced in
 8.2, but there's no obvious solution for that in sight.

---
 src/backend/utils/adt/like.c       | 45 +++++++++++++++++++++++-------
 src/backend/utils/adt/like_match.c | 27 +++++++++++++-----
 2 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c
index 1603a4e61ae..4c4ca2c1936 100644
--- a/src/backend/utils/adt/like.c
+++ b/src/backend/utils/adt/like.c
@@ -11,7 +11,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	$PostgreSQL: pgsql/src/backend/utils/adt/like.c,v 1.70 2007/09/21 22:52:52 tgl Exp $
+ *	$PostgreSQL: pgsql/src/backend/utils/adt/like.c,v 1.71 2007/09/22 03:58:34 adunstan Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -36,6 +36,8 @@ static text *MB_do_like_escape(text *, text *);
 
 static int	UTF8_MatchText(char *t, int tlen, char *p, int plen);
 
+static int	SB_IMatchText(char *t, int tlen, char *p, int plen);
+
 static int	GenericMatchText(char *s, int slen, char* p, int plen);
 static int	Generic_Text_IC_like(text *str, text *pat);
 
@@ -104,6 +106,12 @@ wchareq(char *p1, char *p2)
 
 #include "like_match.c"
 
+/* setup to compile like_match.c for single byte case insensitive matches */
+#define MATCH_LOWER
+#define NextChar(p, plen) NextByte((p), (plen))
+#define MatchText SB_IMatchText
+
+#include "like_match.c"
 
 /* setup to compile like_match.c for UTF8 encoding, using fast NextChar */
 
@@ -132,16 +140,33 @@ Generic_Text_IC_like(text *str, text *pat)
 	int			slen,
 				plen;
 
-	/* Force inputs to lower case to achieve case insensitivity */
-	str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str)));
-	pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat)));
-	/* lower's result is never packed, so OK to use old macros here */
-	s = VARDATA(str);
-	slen = (VARSIZE(str) - VARHDRSZ);
-	p = VARDATA(pat);
-	plen = (VARSIZE(pat) - VARHDRSZ);
+	/* For efficiency reasons, in the single byte case we don't call
+	 * lower() on the pattern and text, but instead call tolower() on each
+	 * character.  In the multi-byte case we don't have much choice :-(
+	 */
 
-	return GenericMatchText(s, slen, p, plen);
+	if (pg_database_encoding_max_length() > 1)
+	{
+		/* lower's result is never packed, so OK to use old macros here */
+		pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat)));
+		p = VARDATA(pat);
+		plen = (VARSIZE(pat) - VARHDRSZ);
+		str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str)));
+		s = VARDATA(str);
+		slen = (VARSIZE(str) - VARHDRSZ);
+		if (GetDatabaseEncoding() == PG_UTF8)
+			return UTF8_MatchText(s, slen, p, plen);
+		else
+			return MB_MatchText(s, slen, p, plen);
+	}
+	else
+	{
+		p = VARDATA_ANY(pat);
+		plen = VARSIZE_ANY_EXHDR(pat);
+		s = VARDATA_ANY(str);
+		slen = VARSIZE_ANY_EXHDR(str);
+		return SB_IMatchText(s, slen, p, plen);
+	}
 }
 
 /*
diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c
index 7ab29623f3c..f2ee0bae0ec 100644
--- a/src/backend/utils/adt/like_match.c
+++ b/src/backend/utils/adt/like_match.c
@@ -3,8 +3,9 @@
  * like_match.c
  *	  like expression handling internal code.
  *
- * This file is included by like.c three times, to provide natching code for
- * single-byte encodings, UTF8, and for other multi-byte encodings.
+ * This file is included by like.c four times, to provide matching code for
+ * single-byte encodings, UTF8, and for other multi-byte encodings,
+ * and case insensitive matches for single byte encodings.
  * UTF8 is a special case because we can use a much more efficient version
  * of NextChar than can be used for other multi-byte encodings.
  *
@@ -13,11 +14,12 @@
  * NextChar 
  * MatchText - to name of function wanted
  * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
+ * MATCH_LOWER - define iff using tolower() on text and pattern chars
  *
  * Copyright (c) 1996-2007, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	$PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.17 2007/09/21 22:52:52 tgl Exp $
+ *	$PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.18 2007/09/22 03:58:34 adunstan Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -68,6 +70,12 @@
  *--------------------
  */
 
+#ifdef MATCH_LOWER		/* case-insensitive build: fold each byte before comparing */
+#define TCHAR(t) ((char) tolower((unsigned char) (t)))	/* unsigned char cast: negative char is UB for tolower(); char cast keeps compares with char operands consistent */
+#else					/* case-sensitive build: compare bytes unchanged */
+#define TCHAR(t) (t)
+#endif
+
 static int
 MatchText(char *t, int tlen, char *p, int plen)
 {
@@ -143,13 +151,13 @@ MatchText(char *t, int tlen, char *p, int plen)
 			else
 			{
 
-				char firstpat = *p ;
+				char firstpat = TCHAR(*p) ;
 
 				if (*p == '\\')
 				{
 					if (plen < 2)
 						return LIKE_FALSE;
-					firstpat = p[1];
+					firstpat = TCHAR(p[1]);
 				}
 
 				while (tlen > 0)
@@ -158,7 +166,7 @@ MatchText(char *t, int tlen, char *p, int plen)
 					 * Optimization to prevent most recursion: don't recurse
 					 * unless first pattern byte matches first text byte.
 					 */
-					if (*t == firstpat)
+					if (TCHAR(*t) == firstpat)
 					{
 						int			matched = MatchText(t, tlen, p, plen);
 						
@@ -183,7 +191,7 @@ MatchText(char *t, int tlen, char *p, int plen)
 			NextByte(p, plen);
 			continue;
 		}
-		else if (*t != *p)
+		else if (TCHAR(*t) != TCHAR(*p))
 		{
 			/*
 			 * Not the single-character wildcard and no explicit match? Then
@@ -338,3 +346,8 @@ do_like_escape(text *pat, text *esc)
 #undef do_like_escape
 #endif
 
+#undef TCHAR
+
+#ifdef MATCH_LOWER
+#undef MATCH_LOWER
+#endif