postgres/src/common/unicode_case.c

/*-------------------------------------------------------------------------
 * unicode_case.c
 *		Unicode case mapping and case conversion.
 *
 * Portions Copyright (c) 2017-2025, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode_case.c
 *
 *-------------------------------------------------------------------------
 */
#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#include "common/unicode_case.h"
#include "common/unicode_case_table.h"
#include "common/unicode_category.h"
#include "mb/pg_wchar.h"

enum CaseMapResult
{
	CASEMAP_SELF,
	CASEMAP_SIMPLE,
	CASEMAP_SPECIAL,
};

/*
 * Map for each case kind.
 */
static const pg_wchar *const casekind_map[NCaseKind] =
{
	[CaseLower] = case_map_lower,
	[CaseTitle] = case_map_title,
	[CaseUpper] = case_map_upper,
	[CaseFold] = case_map_fold,
};

static pg_wchar find_case_map(pg_wchar ucs, const pg_wchar *map);
static size_t convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
						   CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
						   void *wbstate);
static enum CaseMapResult casemap(pg_wchar u1, CaseKind casekind, bool full,
								  const char *src, size_t srclen, size_t srcoff,
								  pg_wchar *simple, const pg_wchar **special);

pg_wchar
unicode_lowercase_simple(pg_wchar code)
{
	pg_wchar	cp = find_case_map(code, case_map_lower);

	return cp != 0 ? cp : code;
}

pg_wchar
unicode_titlecase_simple(pg_wchar code)
{
	pg_wchar	cp = find_case_map(code, case_map_title);

	return cp != 0 ? cp : code;
}

pg_wchar
unicode_uppercase_simple(pg_wchar code)
{
	pg_wchar	cp = find_case_map(code, case_map_upper);

	return cp != 0 ? cp : code;
}

pg_wchar
unicode_casefold_simple(pg_wchar code)
{
	pg_wchar	cp = find_case_map(code, case_map_fold);

	return cp != 0 ? cp : code;
}

/*
 * unicode_strlower()
 *
 * Convert src to lowercase, and return the result length (not including
 * terminating NUL).
 *
 * String src must be encoded in UTF-8. If srclen < 0, src must be
 * NUL-terminated.
 *
 * Result string is stored in dst, truncating if larger than dstsize. If
 * dstsize is greater than the result length, dst will be NUL-terminated;
 * otherwise not.
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
 *
 * If full is true, use special case mappings if available and if the
 * conditions are satisfied.
 */
size_t
unicode_strlower(char *dst, size_t dstsize, const char *src, ssize_t srclen,
				 bool full)
{
	return convert_case(dst, dstsize, src, srclen, CaseLower, full, NULL,
						NULL);
}

/*
 * unicode_strtitle()
 *
 * Convert src to titlecase, and return the result length (not including
 * terminating NUL).
 *
 * String src must be encoded in UTF-8. If srclen < 0, src must be
 * NUL-terminated.
 *
 * Result string is stored in dst, truncating if larger than dstsize. If
 * dstsize is greater than the result length, dst will be NUL-terminated;
 * otherwise not.
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
 *
 * If full is true, use special case mappings if available and if the
 * conditions are satisfied. Otherwise, use only simple mappings and use
 * uppercase instead of titlecase.
 *
 * Titlecasing requires knowledge about word boundaries, which is provided by
 * the callback wbnext. A word boundary is the offset of the start of a word
 * or the offset of the character immediately following a word.
 *
 * The caller is expected to initialize and free the callback state
 * wbstate. The callback should first return offset 0 for the first boundary;
 * then the offset of each subsequent word boundary; then the total length of
 * the string to indicate the final boundary.
 */
size_t
unicode_strtitle(char *dst, size_t dstsize, const char *src, ssize_t srclen,
				 bool full, WordBoundaryNext wbnext, void *wbstate)
{
	return convert_case(dst, dstsize, src, srclen, CaseTitle, full, wbnext,
						wbstate);
}

/*
 * unicode_strupper()
 *
 * Convert src to uppercase, and return the result length (not including
 * terminating NUL).
 *
 * String src must be encoded in UTF-8. If srclen < 0, src must be
 * NUL-terminated.
 *
 * Result string is stored in dst, truncating if larger than dstsize. If
 * dstsize is greater than the result length, dst will be NUL-terminated;
 * otherwise not.
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
 *
 * If full is true, use special case mappings if available and if the
 * conditions are satisfied.
 */
size_t
unicode_strupper(char *dst, size_t dstsize, const char *src, ssize_t srclen,
				 bool full)
{
	return convert_case(dst, dstsize, src, srclen, CaseUpper, full, NULL,
						NULL);
}

/*
 * unicode_strfold()
 *
 * Case fold src, and return the result length (not including terminating
 * NUL).
 *
 * String src must be encoded in UTF-8. If srclen < 0, src must be
 * NUL-terminated.
 *
 * Result string is stored in dst, truncating if larger than dstsize. If
 * dstsize is greater than the result length, dst will be NUL-terminated;
 * otherwise not.
 *
 * If dstsize is zero, dst may be NULL. This is useful for calculating the
 * required buffer size before allocating.
 */
size_t
unicode_strfold(char *dst, size_t dstsize, const char *src, ssize_t srclen,
				bool full)
{
	return convert_case(dst, dstsize, src, srclen, CaseFold, full, NULL,
						NULL);
}

/*
 * Implement Unicode Default Case Conversion algorithm.
 *
 * If str_casekind is CaseLower or CaseUpper, map each character in the string
 * for which a mapping is available.
 *
 * If str_casekind is CaseTitle, maps characters found on a word boundary to
 * titlecase (or uppercase if full is false) and other characters to
 * lowercase. NB: does not currently implement the Unicode behavior in which
 * the word boundary is adjusted to the next Cased character. That behavior
 * could be implemented as an option, but it doesn't match the default
 * behavior of ICU, nor does it match the documented behavior of INITCAP().
 *
 * If full is true, use special mappings for relevant characters, which can
 * map a single codepoint to multiple codepoints, or depend on conditions.
 */
static size_t
convert_case(char *dst, size_t dstsize, const char *src, ssize_t srclen,
			 CaseKind str_casekind, bool full, WordBoundaryNext wbnext,
			 void *wbstate)
{
	/* character CaseKind varies while titlecasing */
	CaseKind	chr_casekind = str_casekind;
	size_t		srcoff = 0;
	size_t		result_len = 0;
	size_t		boundary = 0;

	Assert((str_casekind == CaseTitle && wbnext && wbstate) ||
		   (str_casekind != CaseTitle && !wbnext && !wbstate));

	if (str_casekind == CaseTitle)
	{
		boundary = wbnext(wbstate);
		Assert(boundary == 0);	/* start of text is always a boundary */
	}

	while ((srclen < 0 || srcoff < srclen) && src[srcoff] != '\0')
	{
		pg_wchar	u1 = utf8_to_unicode((unsigned char *) src + srcoff);
		int			u1len = unicode_utf8len(u1);
		pg_wchar	simple = 0;
		const pg_wchar *special = NULL;
		enum CaseMapResult casemap_result;

		if (str_casekind == CaseTitle)
		{
			if (srcoff == boundary)
			{
				chr_casekind = full ? CaseTitle : CaseUpper;
				boundary = wbnext(wbstate);
			}
			else
				chr_casekind = CaseLower;
		}

		casemap_result = casemap(u1, chr_casekind, full, src, srclen, srcoff,
								 &simple, &special);

		switch (casemap_result)
		{
			case CASEMAP_SELF:
				/* no mapping; copy bytes from src */
				Assert(simple == 0);
				Assert(special == NULL);
				if (result_len + u1len <= dstsize)
					memcpy(dst + result_len, src + srcoff, u1len);

				result_len += u1len;
				break;
			case CASEMAP_SIMPLE:
				{
					/* replace with single character */
					pg_wchar	u2 = simple;
					pg_wchar	u2len = unicode_utf8len(u2);

					Assert(special == NULL);
					if (result_len + u2len <= dstsize)
						unicode_to_utf8(u2, (unsigned char *) dst + result_len);

					result_len += u2len;
				}
				break;
			case CASEMAP_SPECIAL:
				/* replace with up to MAX_CASE_EXPANSION characters */
				Assert(simple == 0);
				for (int i = 0; i < MAX_CASE_EXPANSION && special[i]; i++)
				{
					pg_wchar	u2 = special[i];
					size_t		u2len = unicode_utf8len(u2);

					if (result_len + u2len <= dstsize)
						unicode_to_utf8(u2, (unsigned char *) dst + result_len);

					result_len += u2len;
				}
				break;
		}

		srcoff += u1len;
	}

	if (result_len < dstsize)
		dst[result_len] = '\0';

	return result_len;
}

/*
 * Check that the condition matches Final_Sigma, described in Unicode Table
 * 3-17. The character at the given offset must be directly preceded by a
 * Cased character, and must not be directly followed by a Cased character.
 *
 * Case_Ignorable characters are ignored. NB: some characters may be both
 * Cased and Case_Ignorable, in which case they are ignored.
 */
static bool
check_final_sigma(const unsigned char *str, size_t len, size_t offset)
{
	/* the start of the string is not preceded by a Cased character */
	if (offset == 0)
		return false;

	/* iterate backwards, looking for Cased character */
	for (int i = offset - 1; i >= 0; i--)
	{
		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
		{
			pg_wchar	curr = utf8_to_unicode(str + i);

			if (pg_u_prop_case_ignorable(curr))
				continue;
			else if (pg_u_prop_cased(curr))
				break;
			else
				return false;
		}
		else if ((str[i] & 0xC0) == 0x80)
			continue;

		Assert(false);			/* invalid UTF-8 */
	}

	/* end of string is not followed by a Cased character */
	if (offset == len)
		return true;

	/* iterate forwards, looking for Cased character */
	for (int i = offset + 1; i < len && str[i] != '\0'; i++)
	{
		if ((str[i] & 0x80) == 0 || (str[i] & 0xC0) == 0xC0)
		{
			pg_wchar	curr = utf8_to_unicode(str + i);

			if (pg_u_prop_case_ignorable(curr))
				continue;
			else if (pg_u_prop_cased(curr))
				return false;
			else
				break;
		}
		else if ((str[i] & 0xC0) == 0x80)
			continue;

		Assert(false);			/* invalid UTF-8 */
	}

	return true;
}

/*
 * Unicode allows for special casing to be applied only under certain
 * circumstances. The only currently-supported condition is Final_Sigma.
 */
static bool
check_special_conditions(int conditions, const char *str, size_t len,
						 size_t offset)
{
	if (conditions == 0)
		return true;
	else if (conditions == PG_U_FINAL_SIGMA)
		return check_final_sigma((unsigned char *) str, len, offset);

	/* no other conditions supported */
	Assert(false);
	return false;
}

/*
 * Map the given character to the requested case.
 *
 * If full is true, and a special case mapping is found and the conditions are
 * met, 'special' is set to the mapping result (which is an array of up to
 * MAX_CASE_EXPANSION characters) and CASEMAP_SPECIAL is returned.
 *
 * Otherwise, search for a simple mapping, and if found, set 'simple' to the
 * result and return CASEMAP_SIMPLE.
 *
 * If no mapping is found, return CASEMAP_SELF, and the caller should copy the
 * character without modification.
 */
static enum CaseMapResult
casemap(pg_wchar u1, CaseKind casekind, bool full,
		const char *src, size_t srclen, size_t srcoff,
		pg_wchar *simple, const pg_wchar **special)
{
	uint16		idx;

	/* Fast path for codepoints < 0x80 */
	if (u1 < 0x80)
	{
		/*
		 * The first elements in all tables are reserved as 0 (as NULL). The
		 * data starts at index 1, not 0.
		 */
		*simple = casekind_map[casekind][u1 + 1];

		return CASEMAP_SIMPLE;
	}

	idx = case_index(u1);

	if (idx == 0)
		return CASEMAP_SELF;

	if (full && case_map_special[idx] &&
		check_special_conditions(special_case[case_map_special[idx]].conditions,
								 src, srclen, srcoff))
	{
		*special = special_case[case_map_special[idx]].map[casekind];
		return CASEMAP_SPECIAL;
	}

	*simple = casekind_map[casekind][idx];

	return CASEMAP_SIMPLE;
}

/*
 * Find entry in simple case map.
 * If the entry does not exist, 0 will be returned.
 */
static pg_wchar
find_case_map(pg_wchar ucs, const pg_wchar *map)
{
	/* Fast path for codepoints < 0x80 */
	if (ucs < 0x80)
		/* The first elements in all tables are reserved as 0 (as NULL). */
		return map[ucs + 1];
	return map[case_index(ucs)];
}