postgres/src/common/unicode_category.c

/*-------------------------------------------------------------------------
 * unicode_category.c
 *		Determine general category and character properties of Unicode
 *		characters. Encoding must be UTF8, where we assume that the pg_wchar
 *		representation is a code point.
 *
 * Portions Copyright (c) 2017-2024, PostgreSQL Global Development Group
 *
 * IDENTIFICATION
 *	  src/common/unicode_category.c
 *
 *-------------------------------------------------------------------------
 */
#ifndef FRONTEND
#include "postgres.h"
#else
#include "postgres_fe.h"
#endif

#include "common/unicode_category.h"
#include "common/unicode_category_table.h"

/*
 * Create bitmasks from pg_unicode_category values for efficient comparison of
 * multiple categories. For instance, PG_U_MN_MASK is a bitmask representing
 * the general category Mn; and PG_U_M_MASK represents general categories Mn,
 * Me, and Mc.
 *
 * The number of Unicode General Categories should never grow, so a 32-bit
 * mask is fine.
 *
 * The constant 1 is converted to uint32 *before* shifting, so the shift is
 * carried out in an unsigned type.  Shifting a signed int 1 into the sign bit
 * (X == 31) would be undefined behavior; with an unsigned operand every bit
 * position 0..31 of the mask is usable.  X must be in the range 0..31.
 */
#define PG_U_CATEGORY_MASK(X) ((uint32) 1 << (X))

/*
 * Per-category masks, plus the union of each group.  The two-letter part of
 * each name is the general category abbreviation (see
 * unicode_category_abbrev()).
 */

/* Letters: Lu, Ll, Lt, Lm, Lo.  LC is the union Lu|Ll|Lt; L is all five. */
#define PG_U_LU_MASK PG_U_CATEGORY_MASK(PG_U_UPPERCASE_LETTER)
#define PG_U_LL_MASK PG_U_CATEGORY_MASK(PG_U_LOWERCASE_LETTER)
#define PG_U_LT_MASK PG_U_CATEGORY_MASK(PG_U_TITLECASE_LETTER)
#define PG_U_LC_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK)
#define PG_U_LM_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_LETTER)
#define PG_U_LO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_LETTER)
#define PG_U_L_MASK (PG_U_LU_MASK|PG_U_LL_MASK|PG_U_LT_MASK|PG_U_LM_MASK|\
					 PG_U_LO_MASK)
/* Marks: Mn, Me, Mc; M is their union. */
#define PG_U_MN_MASK PG_U_CATEGORY_MASK(PG_U_NONSPACING_MARK)
#define PG_U_ME_MASK PG_U_CATEGORY_MASK(PG_U_ENCLOSING_MARK)
#define PG_U_MC_MASK PG_U_CATEGORY_MASK(PG_U_SPACING_MARK)
#define PG_U_M_MASK (PG_U_MN_MASK|PG_U_MC_MASK|PG_U_ME_MASK)
/* Numbers: Nd, Nl, No; N is their union. */
#define PG_U_ND_MASK PG_U_CATEGORY_MASK(PG_U_DECIMAL_NUMBER)
#define PG_U_NL_MASK PG_U_CATEGORY_MASK(PG_U_LETTER_NUMBER)
#define PG_U_NO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_NUMBER)
#define PG_U_N_MASK (PG_U_ND_MASK|PG_U_NL_MASK|PG_U_NO_MASK)
/* Punctuation: Pc, Pd, Ps, Pe, Pi, Pf, Po; P is their union. */
#define PG_U_PC_MASK PG_U_CATEGORY_MASK(PG_U_CONNECTOR_PUNCTUATION)
#define PG_U_PD_MASK PG_U_CATEGORY_MASK(PG_U_DASH_PUNCTUATION)
#define PG_U_PS_MASK PG_U_CATEGORY_MASK(PG_U_OPEN_PUNCTUATION)
#define PG_U_PE_MASK PG_U_CATEGORY_MASK(PG_U_CLOSE_PUNCTUATION)
#define PG_U_PI_MASK PG_U_CATEGORY_MASK(PG_U_INITIAL_PUNCTUATION)
#define PG_U_PF_MASK PG_U_CATEGORY_MASK(PG_U_FINAL_PUNCTUATION)
#define PG_U_PO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_PUNCTUATION)
#define PG_U_P_MASK (PG_U_PC_MASK|PG_U_PD_MASK|PG_U_PS_MASK|PG_U_PE_MASK|\
					 PG_U_PI_MASK|PG_U_PF_MASK|PG_U_PO_MASK)
/* Symbols: Sm, Sc, Sk, So; S is their union. */
#define PG_U_SM_MASK PG_U_CATEGORY_MASK(PG_U_MATH_SYMBOL)
#define PG_U_SC_MASK PG_U_CATEGORY_MASK(PG_U_CURRENCY_SYMBOL)
#define PG_U_SK_MASK PG_U_CATEGORY_MASK(PG_U_MODIFIER_SYMBOL)
#define PG_U_SO_MASK PG_U_CATEGORY_MASK(PG_U_OTHER_SYMBOL)
#define PG_U_S_MASK (PG_U_SM_MASK|PG_U_SC_MASK|PG_U_SK_MASK|PG_U_SO_MASK)
/* Separators: Zs, Zl, Zp; Z is their union. */
#define PG_U_ZS_MASK PG_U_CATEGORY_MASK(PG_U_SPACE_SEPARATOR)
#define PG_U_ZL_MASK PG_U_CATEGORY_MASK(PG_U_LINE_SEPARATOR)
#define PG_U_ZP_MASK PG_U_CATEGORY_MASK(PG_U_PARAGRAPH_SEPARATOR)
#define PG_U_Z_MASK (PG_U_ZS_MASK|PG_U_ZL_MASK|PG_U_ZP_MASK)
/* Other: Cc, Cf, Cs, Co, Cn; C is their union. */
#define PG_U_CC_MASK PG_U_CATEGORY_MASK(PG_U_CONTROL)
#define PG_U_CF_MASK PG_U_CATEGORY_MASK(PG_U_FORMAT)
#define PG_U_CS_MASK PG_U_CATEGORY_MASK(PG_U_SURROGATE)
#define PG_U_CO_MASK PG_U_CATEGORY_MASK(PG_U_PRIVATE_USE)
#define PG_U_CN_MASK PG_U_CATEGORY_MASK(PG_U_UNASSIGNED)
#define PG_U_C_MASK (PG_U_CC_MASK|PG_U_CF_MASK|PG_U_CS_MASK|PG_U_CO_MASK|\
					 PG_U_CN_MASK)

/* Code point 0x09 (horizontal tab); pg_u_isblank() accepts it explicitly. */
#define PG_U_CHARACTER_TAB	0x09

/*
 * Forward declaration: range_search() is defined later in this file but is
 * called by the property functions that precede its definition.
 */
static bool range_search(const pg_unicode_range * tbl, size_t size,
						 pg_wchar code);

/*
 * Look up the Unicode general category of a code point.
 *
 * Code points below 0x80 are answered by indexing unicode_opt_ascii[]
 * directly.  Everything else is located by binary search over
 * unicode_categories[], comparing against each entry's inclusive
 * [first, last] range; the search relies on that table being ordered by code
 * point.  A code point not covered by any entry is reported as
 * PG_U_UNASSIGNED.
 */
pg_unicode_category
unicode_category(pg_wchar code)
{
	int			lo;
	int			hi;

	Assert(code <= 0x10ffff);

	/* ASCII fast path: no search needed */
	if (code < 0x80)
		return unicode_opt_ascii[code].category;

	lo = 0;
	hi = lengthof(unicode_categories) - 1;

	while (lo <= hi)
	{
		int			probe = (lo + hi) / 2;

		if (code > unicode_categories[probe].last)
		{
			/* target lies above the probed range */
			lo = probe + 1;
		}
		else if (code < unicode_categories[probe].first)
		{
			/* target lies below the probed range */
			hi = probe - 1;
		}
		else
			return unicode_categories[probe].category;
	}

	/* not inside any listed range */
	return PG_U_UNASSIGNED;
}

/*
 * Test the Alphabetic property of a code point.
 *
 * Code points at or above 0x80 are searched for in the unicode_alphabetic
 * range table; ASCII is answered from the property bits stored in
 * unicode_opt_ascii[].
 */
bool
pg_u_prop_alphabetic(pg_wchar code)
{
	if (code >= 0x80)
		return range_search(unicode_alphabetic,
							lengthof(unicode_alphabetic),
							code);

	return unicode_opt_ascii[code].properties & PG_U_PROP_ALPHABETIC;
}

/*
 * Test the Lowercase property of a code point.
 *
 * Code points at or above 0x80 are searched for in the unicode_lowercase
 * range table; ASCII is answered from the property bits stored in
 * unicode_opt_ascii[].
 */
bool
pg_u_prop_lowercase(pg_wchar code)
{
	if (code >= 0x80)
		return range_search(unicode_lowercase,
							lengthof(unicode_lowercase),
							code);

	return unicode_opt_ascii[code].properties & PG_U_PROP_LOWERCASE;
}

/*
 * Test the Uppercase property of a code point.
 *
 * Code points at or above 0x80 are searched for in the unicode_uppercase
 * range table; ASCII is answered from the property bits stored in
 * unicode_opt_ascii[].
 */
bool
pg_u_prop_uppercase(pg_wchar code)
{
	if (code >= 0x80)
		return range_search(unicode_uppercase,
							lengthof(unicode_uppercase),
							code);

	return unicode_opt_ascii[code].properties & PG_U_PROP_UPPERCASE;
}

/*
 * Test the Cased property of a code point.
 *
 * ASCII is answered from the property bits in unicode_opt_ascii[].  For any
 * other code point the result is true if its general category is
 * Titlecase_Letter (Lt), or if it has the Lowercase or the Uppercase
 * property; the checks are made in that order and stop at the first hit.
 */
bool
pg_u_prop_cased(pg_wchar code)
{
	if (code < 0x80)
		return unicode_opt_ascii[code].properties & PG_U_PROP_CASED;

	/* titlecase letters qualify by general category alone */
	if (PG_U_CATEGORY_MASK(unicode_category(code)) & PG_U_LT_MASK)
		return true;

	if (pg_u_prop_lowercase(code) || pg_u_prop_uppercase(code))
		return true;

	return false;
}

/*
 * Test the Case_Ignorable property of a code point.
 *
 * Code points at or above 0x80 are searched for in the
 * unicode_case_ignorable range table; ASCII is answered from the property
 * bits stored in unicode_opt_ascii[].
 */
bool
pg_u_prop_case_ignorable(pg_wchar code)
{
	if (code >= 0x80)
		return range_search(unicode_case_ignorable,
							lengthof(unicode_case_ignorable),
							code);

	return unicode_opt_ascii[code].properties & PG_U_PROP_CASE_IGNORABLE;
}

/*
 * Test the White_Space property of a code point.
 *
 * Code points at or above 0x80 are searched for in the unicode_white_space
 * range table; ASCII is answered from the property bits stored in
 * unicode_opt_ascii[].
 */
bool
pg_u_prop_white_space(pg_wchar code)
{
	if (code >= 0x80)
		return range_search(unicode_white_space,
							lengthof(unicode_white_space),
							code);

	return unicode_opt_ascii[code].properties & PG_U_PROP_WHITE_SPACE;
}

/*
 * Test the Hex_Digit property of a code point.
 *
 * Code points at or above 0x80 are searched for in the unicode_hex_digit
 * range table; ASCII is answered from the property bits stored in
 * unicode_opt_ascii[].
 */
bool
pg_u_prop_hex_digit(pg_wchar code)
{
	if (code >= 0x80)
		return range_search(unicode_hex_digit,
							lengthof(unicode_hex_digit),
							code);

	return unicode_opt_ascii[code].properties & PG_U_PROP_HEX_DIGIT;
}

/*
 * Test the Join_Control property of a code point.
 *
 * Code points at or above 0x80 are searched for in the unicode_join_control
 * range table; ASCII is answered from the property bits stored in
 * unicode_opt_ascii[].
 */
bool
pg_u_prop_join_control(pg_wchar code)
{
	if (code >= 0x80)
		return range_search(unicode_join_control,
							lengthof(unicode_join_control),
							code);

	return unicode_opt_ascii[code].properties & PG_U_PROP_JOIN_CONTROL;
}

/*
 * The following functions implement the Compatibility Properties described
 * at: https://www.unicode.org/reports/tr18/#Compatibility_Properties
 *
 * For the functions that take a 'posix' argument: if it is true, the "POSIX
 * Compatible" variant is implemented, otherwise the "Standard" variant.
 * Functions without that argument behave the same way for both variants.
 */

/*
 * Compatibility property "digit".
 *
 * Standard variant (posix = false): general category is Decimal_Number.
 * POSIX Compatible variant (posix = true): only the ASCII digits '0'..'9'.
 */
bool
pg_u_isdigit(pg_wchar code, bool posix)
{
	if (!posix)
		return unicode_category(code) == PG_U_DECIMAL_NUMBER;

	return (code >= '0' && code <= '9');
}

/*
 * Compatibility property "alpha": true if the code point has the Alphabetic
 * property, as reported by pg_u_prop_alphabetic().  Takes no 'posix' flag.
 */
bool
pg_u_isalpha(pg_wchar code)
{
	return pg_u_prop_alphabetic(code);
}

/*
 * Compatibility property "alnum": true if the code point is alphabetic
 * (pg_u_isalpha) or a digit (pg_u_isdigit, honoring the 'posix' flag).
 */
bool
pg_u_isalnum(pg_wchar code, bool posix)
{
	if (pg_u_isalpha(code))
		return true;

	if (pg_u_isdigit(code, posix))
		return true;

	return false;
}

/*
 * Compatibility property "word".
 *
 * True if the general category is a mark (Mn, Me, Mc), Decimal_Number (Nd)
 * or Connector_Punctuation (Pc), or if the code point is alphabetic, or if
 * it has the Join_Control property.  Checks are made in that order and stop
 * at the first hit.
 */
bool
pg_u_isword(pg_wchar code)
{
	const uint32 word_categories = PG_U_M_MASK | PG_U_ND_MASK | PG_U_PC_MASK;

	if (PG_U_CATEGORY_MASK(unicode_category(code)) & word_categories)
		return true;

	if (pg_u_isalpha(code) || pg_u_prop_join_control(code))
		return true;

	return false;
}

/*
 * Compatibility property "upper": true if the code point has the Uppercase
 * property, as reported by pg_u_prop_uppercase().  Takes no 'posix' flag.
 */
bool
pg_u_isupper(pg_wchar code)
{
	return pg_u_prop_uppercase(code);
}

/*
 * Compatibility property "lower": true if the code point has the Lowercase
 * property, as reported by pg_u_prop_lowercase().  Takes no 'posix' flag.
 */
bool
pg_u_islower(pg_wchar code)
{
	return pg_u_prop_lowercase(code);
}

/*
 * Compatibility property "blank": true for the tab character
 * (PG_U_CHARACTER_TAB) and for any code point whose general category is
 * Space_Separator.
 */
bool
pg_u_isblank(pg_wchar code)
{
	/* tab is accepted explicitly, before any category lookup */
	if (code == PG_U_CHARACTER_TAB)
		return true;

	return unicode_category(code) == PG_U_SPACE_SEPARATOR;
}

/*
 * Compatibility property "cntrl": true if the code point's general category
 * is Control (Cc).  Takes no 'posix' flag.
 */
bool
pg_u_iscntrl(pg_wchar code)
{
	return unicode_category(code) == PG_U_CONTROL;
}

/*
 * Compatibility property "graph": true for every code point except those
 * whose general category is Control (Cc), Surrogate (Cs) or Unassigned (Cn),
 * and except those for which pg_u_isspace() is true.
 */
bool
pg_u_isgraph(pg_wchar code)
{
	const uint32 excluded = PG_U_CC_MASK | PG_U_CS_MASK | PG_U_CN_MASK;

	/* the category test comes first; the space test only runs if it passes */
	if (PG_U_CATEGORY_MASK(unicode_category(code)) & excluded)
		return false;

	return !pg_u_isspace(code);
}

/*
 * Compatibility property "print": true if the code point is graphic
 * (pg_u_isgraph) or blank (pg_u_isblank), but never for a code point whose
 * general category is Control.
 */
bool
pg_u_isprint(pg_wchar code)
{
	/* control characters are rejected before the graph/blank tests */
	if (unicode_category(code) == PG_U_CONTROL)
		return false;

	if (pg_u_isgraph(code))
		return true;

	return pg_u_isblank(code);
}

/*
 * Compatibility property "punct".
 *
 * Standard variant (posix = false): true if the general category is one of
 * the punctuation categories (PG_U_P_MASK).
 *
 * POSIX Compatible variant (posix = true): true if the general category is a
 * punctuation or symbol category (PG_U_P_MASK | PG_U_S_MASK) and the code
 * point is not alphabetic.
 *
 * The mask test is compared against zero explicitly instead of returning the
 * uint32 value as bool.  With C99 _Bool the two are equivalent, but if bool
 * is a narrow integer typedef (NOTE(review): c.h is believed to fall back to
 * such a typedef on some platforms -- confirm), a mask with only high bits
 * set would be truncated to zero by the implicit conversion and the function
 * would wrongly return false.
 */
bool
pg_u_ispunct(pg_wchar code, bool posix)
{
	uint32		category_mask;
	uint32		punct_mask;

	/* the POSIX variant never classifies an alphabetic character as punct */
	if (posix && pg_u_isalpha(code))
		return false;

	category_mask = PG_U_CATEGORY_MASK(unicode_category(code));
	punct_mask = posix ? (PG_U_P_MASK | PG_U_S_MASK) : PG_U_P_MASK;

	return (category_mask & punct_mask) != 0;
}

/*
 * Compatibility property "space": true if the code point has the White_Space
 * property, as reported by pg_u_prop_white_space().  Takes no 'posix' flag.
 */
bool
pg_u_isspace(pg_wchar code)
{
	return pg_u_prop_white_space(code);
}

/*
 * Compatibility property "xdigit".
 *
 * Standard variant (posix = false): general category is Decimal_Number, or
 * the code point has the Hex_Digit property.
 * POSIX Compatible variant (posix = true): only the ASCII characters
 * '0'..'9', 'A'..'F' and 'a'..'f'.
 */
bool
pg_u_isxdigit(pg_wchar code, bool posix)
{
	if (!posix)
		return unicode_category(code) == PG_U_DECIMAL_NUMBER ||
			pg_u_prop_hex_digit(code);

	if (code >= '0' && code <= '9')
		return true;
	if (code >= 'A' && code <= 'F')
		return true;

	return (code >= 'a' && code <= 'f');
}

/*
 * Long descriptive name of a Unicode general category, e.g.
 * "Uppercase_Letter".
 *
 * The returned pointer refers to a string literal.  A value matching none of
 * the cases trips Assert(false); when assertions are compiled out,
 * "Unrecognized" is returned instead.
 */
const char *
unicode_category_string(pg_unicode_category category)
{
	const char *name = NULL;

	/* no default label: an unmatched value leaves name as NULL */
	switch (category)
	{
		case PG_U_UNASSIGNED:
			name = "Unassigned";
			break;
		case PG_U_UPPERCASE_LETTER:
			name = "Uppercase_Letter";
			break;
		case PG_U_LOWERCASE_LETTER:
			name = "Lowercase_Letter";
			break;
		case PG_U_TITLECASE_LETTER:
			name = "Titlecase_Letter";
			break;
		case PG_U_MODIFIER_LETTER:
			name = "Modifier_Letter";
			break;
		case PG_U_OTHER_LETTER:
			name = "Other_Letter";
			break;
		case PG_U_NONSPACING_MARK:
			name = "Nonspacing_Mark";
			break;
		case PG_U_ENCLOSING_MARK:
			name = "Enclosing_Mark";
			break;
		case PG_U_SPACING_MARK:
			name = "Spacing_Mark";
			break;
		case PG_U_DECIMAL_NUMBER:
			name = "Decimal_Number";
			break;
		case PG_U_LETTER_NUMBER:
			name = "Letter_Number";
			break;
		case PG_U_OTHER_NUMBER:
			name = "Other_Number";
			break;
		case PG_U_SPACE_SEPARATOR:
			name = "Space_Separator";
			break;
		case PG_U_LINE_SEPARATOR:
			name = "Line_Separator";
			break;
		case PG_U_PARAGRAPH_SEPARATOR:
			name = "Paragraph_Separator";
			break;
		case PG_U_CONTROL:
			name = "Control";
			break;
		case PG_U_FORMAT:
			name = "Format";
			break;
		case PG_U_PRIVATE_USE:
			name = "Private_Use";
			break;
		case PG_U_SURROGATE:
			name = "Surrogate";
			break;
		case PG_U_DASH_PUNCTUATION:
			name = "Dash_Punctuation";
			break;
		case PG_U_OPEN_PUNCTUATION:
			name = "Open_Punctuation";
			break;
		case PG_U_CLOSE_PUNCTUATION:
			name = "Close_Punctuation";
			break;
		case PG_U_CONNECTOR_PUNCTUATION:
			name = "Connector_Punctuation";
			break;
		case PG_U_OTHER_PUNCTUATION:
			name = "Other_Punctuation";
			break;
		case PG_U_MATH_SYMBOL:
			name = "Math_Symbol";
			break;
		case PG_U_CURRENCY_SYMBOL:
			name = "Currency_Symbol";
			break;
		case PG_U_MODIFIER_SYMBOL:
			name = "Modifier_Symbol";
			break;
		case PG_U_OTHER_SYMBOL:
			name = "Other_Symbol";
			break;
		case PG_U_INITIAL_PUNCTUATION:
			name = "Initial_Punctuation";
			break;
		case PG_U_FINAL_PUNCTUATION:
			name = "Final_Punctuation";
			break;
	}

	if (name == NULL)
	{
		Assert(false);
		name = "Unrecognized";	/* keep compiler quiet */
	}

	return name;
}

/*
 * Two-letter abbreviation of a Unicode general category, e.g. "Lu".
 *
 * The returned pointer refers to a string literal.  A value matching none of
 * the cases trips Assert(false); when assertions are compiled out, "??" is
 * returned instead.
 */
const char *
unicode_category_abbrev(pg_unicode_category category)
{
	const char *abbrev = NULL;

	/* no default label: an unmatched value leaves abbrev as NULL */
	switch (category)
	{
		case PG_U_UNASSIGNED:
			abbrev = "Cn";
			break;
		case PG_U_UPPERCASE_LETTER:
			abbrev = "Lu";
			break;
		case PG_U_LOWERCASE_LETTER:
			abbrev = "Ll";
			break;
		case PG_U_TITLECASE_LETTER:
			abbrev = "Lt";
			break;
		case PG_U_MODIFIER_LETTER:
			abbrev = "Lm";
			break;
		case PG_U_OTHER_LETTER:
			abbrev = "Lo";
			break;
		case PG_U_NONSPACING_MARK:
			abbrev = "Mn";
			break;
		case PG_U_ENCLOSING_MARK:
			abbrev = "Me";
			break;
		case PG_U_SPACING_MARK:
			abbrev = "Mc";
			break;
		case PG_U_DECIMAL_NUMBER:
			abbrev = "Nd";
			break;
		case PG_U_LETTER_NUMBER:
			abbrev = "Nl";
			break;
		case PG_U_OTHER_NUMBER:
			abbrev = "No";
			break;
		case PG_U_SPACE_SEPARATOR:
			abbrev = "Zs";
			break;
		case PG_U_LINE_SEPARATOR:
			abbrev = "Zl";
			break;
		case PG_U_PARAGRAPH_SEPARATOR:
			abbrev = "Zp";
			break;
		case PG_U_CONTROL:
			abbrev = "Cc";
			break;
		case PG_U_FORMAT:
			abbrev = "Cf";
			break;
		case PG_U_PRIVATE_USE:
			abbrev = "Co";
			break;
		case PG_U_SURROGATE:
			abbrev = "Cs";
			break;
		case PG_U_DASH_PUNCTUATION:
			abbrev = "Pd";
			break;
		case PG_U_OPEN_PUNCTUATION:
			abbrev = "Ps";
			break;
		case PG_U_CLOSE_PUNCTUATION:
			abbrev = "Pe";
			break;
		case PG_U_CONNECTOR_PUNCTUATION:
			abbrev = "Pc";
			break;
		case PG_U_OTHER_PUNCTUATION:
			abbrev = "Po";
			break;
		case PG_U_MATH_SYMBOL:
			abbrev = "Sm";
			break;
		case PG_U_CURRENCY_SYMBOL:
			abbrev = "Sc";
			break;
		case PG_U_MODIFIER_SYMBOL:
			abbrev = "Sk";
			break;
		case PG_U_OTHER_SYMBOL:
			abbrev = "So";
			break;
		case PG_U_INITIAL_PUNCTUATION:
			abbrev = "Pi";
			break;
		case PG_U_FINAL_PUNCTUATION:
			abbrev = "Pf";
			break;
	}

	if (abbrev == NULL)
	{
		Assert(false);
		abbrev = "??";			/* keep compiler quiet */
	}

	return abbrev;
}

/*
 * Binary search to test if given codepoint exists in one of the ranges in the
 * given table.
 *
 * tbl:  array of inclusive [first, last] ranges.  The search relies on the
 *       entries being in ascending code point order.
 * size: number of entries in tbl; zero is allowed and yields false.
 * code: code point to look for.
 *
 * Returns true if some entry satisfies first <= code <= last, else false.
 *
 * The search window is the half-open index interval [lo, hi) held in size_t.
 * That avoids narrowing 'size' to int, whose result is implementation-defined
 * when size is zero (size - 1 wraps to SIZE_MAX) or exceeds INT_MAX.
 */
static bool
range_search(const pg_unicode_range * tbl, size_t size, pg_wchar code)
{
	size_t		lo = 0;
	size_t		hi = size;

	Assert(code <= 0x10ffff);

	while (lo < hi)
	{
		/*
		 * Midpoint of the closed interval [lo, hi - 1], rounded down.  lo < hi
		 * guarantees hi - lo - 1 cannot underflow.
		 */
		size_t		mid = lo + (hi - lo - 1) / 2;

		if (code > tbl[mid].last)
			lo = mid + 1;		/* continue in the upper part */
		else if (code < tbl[mid].first)
			hi = mid;			/* continue in the lower part, excluding mid */
		else
			return true;
	}

	return false;
}