mirror of
				https://github.com/postgres/postgres.git
				synced 2025-11-03 09:13:20 +03:00 
			
		
		
		
	Add support for other normal forms to Unicode normalization API
It previously only supported NFKC, for use by SASLprep.  This expands
the API to offer the choice of all four normalization forms.  Right
now, there are no internal users of the forms other than NFKC.

Reviewed-by: Daniel Verite <daniel@manitou-mail.org>
Reviewed-by: Andreas Karlsson <andreas@proxel.se>
Discussion: https://www.postgresql.org/message-id/flat/c1909f27-c269-2ed9-12f8-3ab72c8caf7a@2ndquadrant.com
This commit is contained in:
		@@ -1156,7 +1156,7 @@ pg_saslprep(const char *input, char **output)
 | 
				
			|||||||
	 * 2) Normalize -- Normalize the result of step 1 using Unicode
 | 
						 * 2) Normalize -- Normalize the result of step 1 using Unicode
 | 
				
			||||||
	 * normalization.
 | 
						 * normalization.
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	output_chars = unicode_normalize_kc(input_chars);
 | 
						output_chars = unicode_normalize(UNICODE_NFKC, input_chars);
 | 
				
			||||||
	if (!output_chars)
 | 
						if (!output_chars)
 | 
				
			||||||
		goto oom;
 | 
							goto oom;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -48,7 +48,7 @@ typedef struct
 | 
				
			|||||||
{
 | 
					{
 | 
				
			||||||
	int			linenum;
 | 
						int			linenum;
 | 
				
			||||||
	pg_wchar	input[50];
 | 
						pg_wchar	input[50];
 | 
				
			||||||
	pg_wchar	output[50];
 | 
						pg_wchar	output[4][50];
 | 
				
			||||||
} pg_unicode_test;
 | 
					} pg_unicode_test;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* test table */
 | 
					/* test table */
 | 
				
			||||||
@@ -89,13 +89,16 @@ while (my $line = <$INPUT>)
 | 
				
			|||||||
	my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
 | 
						my ($source, $nfc, $nfd, $nfkc, $nfkd) = split(';', $line);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	my $source_utf8 = codepoint_string_to_hex($source);
 | 
						my $source_utf8 = codepoint_string_to_hex($source);
 | 
				
			||||||
 | 
						my $nfc_utf8    = codepoint_string_to_hex($nfc);
 | 
				
			||||||
 | 
						my $nfd_utf8    = codepoint_string_to_hex($nfd);
 | 
				
			||||||
	my $nfkc_utf8   = codepoint_string_to_hex($nfkc);
 | 
						my $nfkc_utf8   = codepoint_string_to_hex($nfkc);
 | 
				
			||||||
 | 
						my $nfkd_utf8   = codepoint_string_to_hex($nfkd);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	print $OUTPUT "\t{ $linenum, { $source_utf8 }, { $nfkc_utf8 } },\n";
 | 
						print $OUTPUT "\t{ $linenum, { $source_utf8 }, { { $nfc_utf8 }, { $nfd_utf8 }, { $nfkc_utf8 }, { $nfkd_utf8 } } },\n";
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
# Output terminator entry
 | 
					# Output terminator entry
 | 
				
			||||||
print $OUTPUT "\t{ 0, { 0 }, { 0 } }";
 | 
					print $OUTPUT "\t{ 0, { 0 }, { { 0 }, { 0 }, { 0 }, { 0 } } }";
 | 
				
			||||||
print $OUTPUT "\n};\n";
 | 
					print $OUTPUT "\n};\n";
 | 
				
			||||||
 | 
					
 | 
				
			||||||
close $OUTPUT;
 | 
					close $OUTPUT;
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -99,10 +99,12 @@ typedef struct
 | 
				
			|||||||
#define DECOMP_NO_COMPOSE	0x80	/* don't use for re-composition */
 | 
					#define DECOMP_NO_COMPOSE	0x80	/* don't use for re-composition */
 | 
				
			||||||
#define DECOMP_INLINE		0x40	/* decomposition is stored inline in
 | 
					#define DECOMP_INLINE		0x40	/* decomposition is stored inline in
 | 
				
			||||||
									 * dec_index */
 | 
														 * dec_index */
 | 
				
			||||||
 | 
					#define DECOMP_COMPAT		0x20	/* compatibility mapping */
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x3F)
 | 
					#define DECOMPOSITION_SIZE(x) ((x)->dec_size_flags & 0x1F)
 | 
				
			||||||
#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & DECOMP_NO_COMPOSE) != 0)
 | 
					#define DECOMPOSITION_NO_COMPOSE(x) (((x)->dec_size_flags & (DECOMP_NO_COMPOSE | DECOMP_COMPAT)) != 0)
 | 
				
			||||||
#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
 | 
					#define DECOMPOSITION_IS_INLINE(x) (((x)->dec_size_flags & DECOMP_INLINE) != 0)
 | 
				
			||||||
 | 
					#define DECOMPOSITION_IS_COMPAT(x) (((x)->dec_size_flags & DECOMP_COMPAT) != 0)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/* Table of Unicode codepoints and their decompositions */
 | 
					/* Table of Unicode codepoints and their decompositions */
 | 
				
			||||||
static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
 | 
					static const pg_unicode_decomposition UnicodeDecompMain[$num_characters] =
 | 
				
			||||||
@@ -136,22 +138,22 @@ foreach my $char (@characters)
 | 
				
			|||||||
	# Decomposition size
 | 
						# Decomposition size
 | 
				
			||||||
	# Print size of decomposition
 | 
						# Print size of decomposition
 | 
				
			||||||
	my $decomp_size = scalar(@decomp_elts);
 | 
						my $decomp_size = scalar(@decomp_elts);
 | 
				
			||||||
 | 
						die if $decomp_size > 0x1F;		# to not overrun bitmask
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	my $first_decomp = shift @decomp_elts;
 | 
						my $first_decomp = shift @decomp_elts;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	my $flags   = "";
 | 
						my $flags   = "";
 | 
				
			||||||
	my $comment = "";
 | 
						my $comment = "";
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if ($compat)
 | 
				
			||||||
 | 
						{
 | 
				
			||||||
 | 
							$flags .= " | DECOMP_COMPAT";
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if ($decomp_size == 2)
 | 
						if ($decomp_size == 2)
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
 | 
					 | 
				
			||||||
		# Should this be used for recomposition?
 | 
							# Should this be used for recomposition?
 | 
				
			||||||
		if ($compat)
 | 
							if ($character_hash{$first_decomp}
 | 
				
			||||||
		{
 | 
					 | 
				
			||||||
			$flags .= " | DECOMP_NO_COMPOSE";
 | 
					 | 
				
			||||||
			$comment = "compatibility mapping";
 | 
					 | 
				
			||||||
		}
 | 
					 | 
				
			||||||
		elsif ($character_hash{$first_decomp}
 | 
					 | 
				
			||||||
			&& $character_hash{$first_decomp}->{class} != 0)
 | 
								&& $character_hash{$first_decomp}->{class} != 0)
 | 
				
			||||||
		{
 | 
							{
 | 
				
			||||||
			$flags .= " | DECOMP_NO_COMPOSE";
 | 
								$flags .= " | DECOMP_NO_COMPOSE";
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -63,18 +63,21 @@ main(int argc, char **argv)
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
	for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
 | 
						for (test = UnicodeNormalizationTests; test->input[0] != 0; test++)
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
		pg_wchar   *result;
 | 
							for (int form = 0; form < 4; form++)
 | 
				
			||||||
 | 
					 | 
				
			||||||
		result = unicode_normalize_kc(test->input);
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
		if (pg_wcscmp(test->output, result) != 0)
 | 
					 | 
				
			||||||
		{
 | 
							{
 | 
				
			||||||
			printf("FAILURE (NormalizationTest.txt line %d):\n", test->linenum);
 | 
								pg_wchar   *result;
 | 
				
			||||||
			printf("input:    %s\n", print_wchar_str(test->input));
 | 
					
 | 
				
			||||||
			printf("expected: %s\n", print_wchar_str(test->output));
 | 
								result = unicode_normalize(form, test->input);
 | 
				
			||||||
			printf("got:      %s\n", print_wchar_str(result));
 | 
					
 | 
				
			||||||
			printf("\n");
 | 
								if (pg_wcscmp(test->output[form], result) != 0)
 | 
				
			||||||
			exit(1);
 | 
								{
 | 
				
			||||||
 | 
									printf("FAILURE (NormalizationTest.txt line %d form %d):\n", test->linenum, form);
 | 
				
			||||||
 | 
									printf("input:    %s\n", print_wchar_str(test->input));
 | 
				
			||||||
 | 
									printf("expected: %s\n", print_wchar_str(test->output[form]));
 | 
				
			||||||
 | 
									printf("got:      %s\n", print_wchar_str(result));
 | 
				
			||||||
 | 
									printf("\n");
 | 
				
			||||||
 | 
									exit(1);
 | 
				
			||||||
 | 
								}
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -1,6 +1,6 @@
 | 
				
			|||||||
/*-------------------------------------------------------------------------
 | 
					/*-------------------------------------------------------------------------
 | 
				
			||||||
 * unicode_norm.c
 | 
					 * unicode_norm.c
 | 
				
			||||||
 *		Normalize a Unicode string to NFKC form
 | 
					 *		Normalize a Unicode string
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * This implements Unicode normalization, per the documentation at
 | 
					 * This implements Unicode normalization, per the documentation at
 | 
				
			||||||
 * https://www.unicode.org/reports/tr15/.
 | 
					 * https://www.unicode.org/reports/tr15/.
 | 
				
			||||||
@@ -98,7 +98,7 @@ get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
 | 
				
			|||||||
 * are, in turn, decomposable.
 | 
					 * are, in turn, decomposable.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
static int
 | 
					static int
 | 
				
			||||||
get_decomposed_size(pg_wchar code)
 | 
					get_decomposed_size(pg_wchar code, bool compat)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	pg_unicode_decomposition *entry;
 | 
						pg_unicode_decomposition *entry;
 | 
				
			||||||
	int			size = 0;
 | 
						int			size = 0;
 | 
				
			||||||
@@ -131,7 +131,8 @@ get_decomposed_size(pg_wchar code)
 | 
				
			|||||||
	 * Just count current code if no other decompositions.  A NULL entry is
 | 
						 * Just count current code if no other decompositions.  A NULL entry is
 | 
				
			||||||
	 * equivalent to a character with class 0 and no decompositions.
 | 
						 * equivalent to a character with class 0 and no decompositions.
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
 | 
						if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
 | 
				
			||||||
 | 
							(!compat && DECOMPOSITION_IS_COMPAT(entry)))
 | 
				
			||||||
		return 1;
 | 
							return 1;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
@@ -143,7 +144,7 @@ get_decomposed_size(pg_wchar code)
 | 
				
			|||||||
	{
 | 
						{
 | 
				
			||||||
		uint32		lcode = decomp[i];
 | 
							uint32		lcode = decomp[i];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		size += get_decomposed_size(lcode);
 | 
							size += get_decomposed_size(lcode, compat);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return size;
 | 
						return size;
 | 
				
			||||||
@@ -224,7 +225,7 @@ recompose_code(uint32 start, uint32 code, uint32 *result)
 | 
				
			|||||||
 * in the array result.
 | 
					 * in the array result.
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
static void
 | 
					static void
 | 
				
			||||||
decompose_code(pg_wchar code, pg_wchar **result, int *current)
 | 
					decompose_code(pg_wchar code, bool compat, pg_wchar **result, int *current)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	pg_unicode_decomposition *entry;
 | 
						pg_unicode_decomposition *entry;
 | 
				
			||||||
	int			i;
 | 
						int			i;
 | 
				
			||||||
@@ -272,7 +273,8 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 | 
				
			|||||||
	 * character with class 0 and no decompositions, so just leave also in
 | 
						 * character with class 0 and no decompositions, so just leave also in
 | 
				
			||||||
	 * this case.
 | 
						 * this case.
 | 
				
			||||||
	 */
 | 
						 */
 | 
				
			||||||
	if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
 | 
						if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0 ||
 | 
				
			||||||
 | 
							(!compat && DECOMPOSITION_IS_COMPAT(entry)))
 | 
				
			||||||
	{
 | 
						{
 | 
				
			||||||
		pg_wchar   *res = *result;
 | 
							pg_wchar   *res = *result;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -290,12 +292,12 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 | 
				
			|||||||
		pg_wchar	lcode = (pg_wchar) decomp[i];
 | 
							pg_wchar	lcode = (pg_wchar) decomp[i];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		/* Leave if no more decompositions */
 | 
							/* Leave if no more decompositions */
 | 
				
			||||||
		decompose_code(lcode, result, current);
 | 
							decompose_code(lcode, compat, result, current);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
 * unicode_normalize_kc - Normalize a Unicode string to NFKC form.
 | 
					 * unicode_normalize - Normalize a Unicode string to the specified form.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
 * The input is a 0-terminated array of codepoints.
 | 
					 * The input is a 0-terminated array of codepoints.
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
@@ -304,8 +306,10 @@ decompose_code(pg_wchar code, pg_wchar **result, int *current)
 | 
				
			|||||||
 * string is palloc'd instead, and OOM is reported with ereport().
 | 
					 * string is palloc'd instead, and OOM is reported with ereport().
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
pg_wchar *
 | 
					pg_wchar *
 | 
				
			||||||
unicode_normalize_kc(const pg_wchar *input)
 | 
					unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
 | 
						bool		compat = (form == UNICODE_NFKC || form == UNICODE_NFKD);
 | 
				
			||||||
 | 
						bool		recompose = (form == UNICODE_NFC || form == UNICODE_NFKC);
 | 
				
			||||||
	pg_wchar   *decomp_chars;
 | 
						pg_wchar   *decomp_chars;
 | 
				
			||||||
	pg_wchar   *recomp_chars;
 | 
						pg_wchar   *recomp_chars;
 | 
				
			||||||
	int			decomp_size,
 | 
						int			decomp_size,
 | 
				
			||||||
@@ -326,7 +330,7 @@ unicode_normalize_kc(const pg_wchar *input)
 | 
				
			|||||||
	 */
 | 
						 */
 | 
				
			||||||
	decomp_size = 0;
 | 
						decomp_size = 0;
 | 
				
			||||||
	for (p = input; *p; p++)
 | 
						for (p = input; *p; p++)
 | 
				
			||||||
		decomp_size += get_decomposed_size(*p);
 | 
							decomp_size += get_decomposed_size(*p, compat);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
 | 
						decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
 | 
				
			||||||
	if (decomp_chars == NULL)
 | 
						if (decomp_chars == NULL)
 | 
				
			||||||
@@ -338,7 +342,7 @@ unicode_normalize_kc(const pg_wchar *input)
 | 
				
			|||||||
	 */
 | 
						 */
 | 
				
			||||||
	current_size = 0;
 | 
						current_size = 0;
 | 
				
			||||||
	for (p = input; *p; p++)
 | 
						for (p = input; *p; p++)
 | 
				
			||||||
		decompose_code(*p, &decomp_chars, &current_size);
 | 
							decompose_code(*p, compat, &decomp_chars, &current_size);
 | 
				
			||||||
	decomp_chars[decomp_size] = '\0';
 | 
						decomp_chars[decomp_size] = '\0';
 | 
				
			||||||
	Assert(decomp_size == current_size);
 | 
						Assert(decomp_size == current_size);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@@ -385,8 +389,11 @@ unicode_normalize_kc(const pg_wchar *input)
 | 
				
			|||||||
			count -= 2;
 | 
								count -= 2;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (!recompose)
 | 
				
			||||||
 | 
							return decomp_chars;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/*
 | 
						/*
 | 
				
			||||||
	 * The last phase of NFKC is the recomposition of the reordered Unicode
 | 
						 * The last phase of NFC and NFKC is the recomposition of the reordered Unicode
 | 
				
			||||||
	 * string using combining classes. The recomposed string cannot be longer
 | 
						 * string using combining classes. The recomposed string cannot be longer
 | 
				
			||||||
	 * than the decomposed one, so make the allocation of the output string
 | 
						 * than the decomposed one, so make the allocation of the output string
 | 
				
			||||||
	 * based on that assumption.
 | 
						 * based on that assumption.
 | 
				
			||||||
 
 | 
				
			|||||||
@@ -16,6 +16,14 @@
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
#include "mb/pg_wchar.h"
 | 
					#include "mb/pg_wchar.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
extern pg_wchar *unicode_normalize_kc(const pg_wchar *input);
 | 
					typedef enum
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						UNICODE_NFC = 0,
 | 
				
			||||||
 | 
						UNICODE_NFD = 1,
 | 
				
			||||||
 | 
						UNICODE_NFKC = 2,
 | 
				
			||||||
 | 
						UNICODE_NFKD = 3,
 | 
				
			||||||
 | 
					} UnicodeNormalizationForm;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					extern pg_wchar *unicode_normalize(UnicodeNormalizationForm form, const pg_wchar *input);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif							/* UNICODE_NORM_H */
 | 
					#endif							/* UNICODE_NORM_H */
 | 
				
			||||||
 
 | 
				
			|||||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							
		Reference in New Issue
	
	Block a user